├── .gitignore ├── albert_config ├── bert_config.json ├── albert_config_base.json ├── albert_config_large.json ├── albert_config_tiny.json ├── albert_config_xlarge.json └── albert_config_xxlarge.json ├── LICENSE ├── README.md ├── test_changes.py ├── bert_utils.py ├── optimization_finetuning.py ├── tf_metrics.py ├── optimization.py ├── tokenization.py ├── run_pretraining.py ├── data └── test.txt ├── albert_ner.py └── create_pretraining_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | albert_base_ner_checkpoints 3 | albert_base_zh 4 | -------------------------------------------------------------------------------- /albert_config/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21128 19 | } 20 | -------------------------------------------------------------------------------- /albert_config/albert_config_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 768, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 3072 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 1024, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 4096, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 16, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 312, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 1248 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | 
"num_hidden_layers": 4, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 2048, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 8192, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 32, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 4096, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 16384, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 64, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"preln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 songheqi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # albert-chinese-ner
2 | 
3 | ## Preface
4 | 
5 | In some ways this ALBERT release may matter even more than BERT itself. Now that the Chinese pre-trained models are out, this project fine-tunes them for NER on the same data as before.
6 | 
7 | PS: For the traditional [**bert ner**](https://github.com/ProHiryu/bert-chinese-ner) model, see that repository.
8 | 
9 | ## Resources
10 | 
11 | - [Bert](https://github.com/google-research/bert)
12 | - [ALBert](https://github.com/google-research/albert)
13 | - [ALBert_zh](https://github.com/brightmart/albert_zh)
14 | 
15 | ## Papers
16 | 
17 | - [ALBERT](https://arxiv.org/pdf/1909.11942.pdf)
18 | 
19 | ## Setup
20 | 
21 | 1. Download the Chinese ALBERT model; the base version is used here
22 | 2. Rename the model folder to albert_base_zh and place it in the project directory
23 | 3. Run
24 | ```bash
25 | python albert_ner.py --task_name ner --do_train true --do_eval true --data_dir data --vocab_file ./albert_config/vocab.txt --bert_config_file ./albert_base_zh/albert_config_base.json --max_seq_length 128 --train_batch_size 64 --learning_rate 2e-5 --num_train_epochs 3 --output_dir albert_base_ner_checkpoints
26 | ```
27 | 4. TensorFlow > 1.13 is recommended; 1.15 was used here. TensorFlow 2.0 is not supported.
28 | 
29 | ## Results
30 | 
31 | After training the Base model for 3 epochs:
32 | 
33 | ```bash
34 | INFO:tensorflow: eval_f = 0.9280548
35 | INFO:tensorflow: eval_precision = 0.923054
36 | INFO:tensorflow: eval_recall = 0.9331808
37 | INFO:tensorflow: global_step = 2374
38 | INFO:tensorflow: loss = 13.210413
39 | ```
40 | 
41 | Test results are similar. Sample output:
42 | 
43 | ```
44 | [CLS]
45 | B-LOC
46 | I-LOC
47 | O
48 | B-LOC
49 | I-LOC
50 | I-PER
51 | O
52 | O
53 | O
54 | O
55 | O
56 | O
57 | O
58 | O
59 | O
60 | [SEP]
61 | [CLS]
62 | ```
63 | 
64 | ## Summary
65 | 
66 | Compared with BERT itself, the model really is much smaller, yet its performance is roughly on par with, and sometimes ahead of, BERT, and training time is greatly reduced. The "big-ship, big-gun" era of NLP may truly be coming to an end.
67 | 
--------------------------------------------------------------------------------
/test_changes.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import tensorflow as tf
3 | from modeling import embedding_lookup_factorized,transformer_model
4 | import os
5 | 
6 | """
7 | Test the main ALBERT improvements: factorized word embeddings, cross-layer parameter sharing, and inter-sentence coherence (sentence-order prediction)
8 | test main change of albert from bert
9 | """
10 | batch_size = 2048
11 | sequence_length = 512
12 | vocab_size = 30000
13 | hidden_size = 1024
14 | num_attention_heads = int(hidden_size / 64)
15 | 
16 | def get_total_parameters():
17 |     """
18 |     get total parameters of a graph
19 |     :return:
20 |     """
21 |     total_parameters = 0
22 |     for variable in tf.trainable_variables():
23 |         # shape is an array of tf.Dimension
24 |         shape = variable.get_shape()
25 |         # print(shape)
26 |         # print(len(shape))
27 |         variable_parameters = 1
28 |         for dim in shape:
29 |             # print(dim)
30 |             variable_parameters *= dim.value
31 |         # print(variable_parameters)
32 |         total_parameters += variable_parameters
33 |     return total_parameters
34 | 
35 | def test_factorized_embedding():
36 |     """
37 |     test of Factorized embedding parameterization
38 |     :return:
39 |     """
40 |     input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32)
41 |     output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size)
42 |     print("output:",output)
43 | 
44 | def test_share_parameters():
45 |     """
46 |     test of sharing parameters across all layers: how many parameters remain after sharing parameters across transformer layers.
47 |     :return:
48 |     """
49 |     def total_parameters_transformer(share_parameter_across_layers):
50 |         input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32)
51 |         print("transformer_model. input:",input_tensor)
52 |         transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers)
53 |         print("transformer_result:",transformer_result)
54 |         total_parameters=get_total_parameters()
55 |         print('total_parameters(not share):',total_parameters)
56 | 
57 |     share_parameter_across_layers=False
58 |     total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million
59 | 
60 |     tf.reset_default_graph() # Clears the default graph stack and resets the global default graph
61 |     share_parameter_across_layers=True
62 |     total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million
63 | 
64 | def test_sentence_order_prediction():
65 |     """
66 |     sentence order prediction.
67 | 
68 |     check method of create_instances_from_document_albert from create_pretraining_data.py
69 | 
70 |     :return:
71 |     """
72 |     # add execute permission to the data-creation script
73 |     os.system("chmod +x create_pretrain_data.sh")
74 | 
75 |     os.system("./create_pretrain_data.sh")
76 | 
77 | 
78 | # 1. test of Factorized embedding parameterization
79 | #test_factorized_embedding()
80 | 
81 | # 2. test of sharing parameters across all layers: how many parameters remain after sharing parameters across transformer layers.
82 | # before share parameter: 125,976,576; after share parameter:
83 | #test_share_parameters()
84 | 
85 | # 3. test of sentence order prediction(SOP)
86 | test_sentence_order_prediction()
87 | 
88 | 
--------------------------------------------------------------------------------
/bert_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | import collections
6 | import copy
7 | import json
8 | import math
9 | import re
10 | import six
11 | import tensorflow as tf
12 | 
13 | def get_shape_list(tensor, expected_rank=None, name=None):
14 |   """Returns a list of the shape of tensor, preferring static dimensions.
15 | 
16 |   Args:
17 |     tensor: A tf.Tensor object to find the shape of.
18 |     expected_rank: (optional) int. The expected rank of `tensor`. If this is
19 |       specified and the `tensor` has a different rank, an exception will be
20 |       thrown.
21 |     name: Optional name of the tensor for the error message.
22 | 
23 |   Returns:
24 |     A list of dimensions of the shape of tensor. All static dimensions will
25 |     be returned as python integers, and dynamic dimensions will be returned
26 |     as tf.Tensor scalars.
27 |   """
28 |   if name is None:
29 |     name = tensor.name
30 | 
31 |   if expected_rank is not None:
32 |     assert_rank(tensor, expected_rank, name)
33 | 
34 |   shape = tensor.shape.as_list()
35 | 
36 |   non_static_indexes = []
37 |   for (index, dim) in enumerate(shape):
38 |     if dim is None:
39 |       non_static_indexes.append(index)
40 | 
41 |   if not non_static_indexes:
42 |     return shape
43 | 
44 |   dyn_shape = tf.shape(tensor)
45 |   for index in non_static_indexes:
46 |     shape[index] = dyn_shape[index]
47 |   return shape
48 | 
49 | def reshape_to_matrix(input_tensor):
50 |   """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
51 |   ndims = input_tensor.shape.ndims
52 |   if ndims < 2:
53 |     raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % 54 | (input_tensor.shape)) 55 | if ndims == 2: 56 | return input_tensor 57 | 58 | width = input_tensor.shape[-1] 59 | output_tensor = tf.reshape(input_tensor, [-1, width]) 60 | return output_tensor 61 | 62 | def reshape_from_matrix(output_tensor, orig_shape_list): 63 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 64 | if len(orig_shape_list) == 2: 65 | return output_tensor 66 | 67 | output_shape = get_shape_list(output_tensor) 68 | 69 | orig_dims = orig_shape_list[0:-1] 70 | width = output_shape[-1] 71 | 72 | return tf.reshape(output_tensor, orig_dims + [width]) 73 | 74 | def assert_rank(tensor, expected_rank, name=None): 75 | """Raises an exception if the tensor rank is not of the expected rank. 76 | 77 | Args: 78 | tensor: A tf.Tensor to check the rank of. 79 | expected_rank: Python integer or list of integers, expected rank. 80 | name: Optional name of the tensor for the error message. 81 | 82 | Raises: 83 | ValueError: If the expected shape doesn't match the actual shape. 84 | """ 85 | if name is None: 86 | name = tensor.name 87 | 88 | expected_rank_dict = {} 89 | if isinstance(expected_rank, six.integer_types): 90 | expected_rank_dict[expected_rank] = True 91 | else: 92 | for x in expected_rank: 93 | expected_rank_dict[x] = True 94 | 95 | actual_rank = tensor.shape.ndims 96 | if actual_rank not in expected_rank_dict: 97 | scope_name = tf.get_variable_scope().name 98 | raise ValueError( 99 | "For the tensor `%s` in scope `%s`, the actual rank " 100 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 101 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 102 | 103 | def gather_indexes(sequence_tensor, positions): 104 | """Gathers the vectors at the specific positions over a minibatch.""" 105 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) 106 | batch_size = sequence_shape[0] 107 | seq_length = sequence_shape[1] 108 | width = sequence_shape[2] 109 | 110 | flat_offsets = tf.reshape( 111 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 112 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 113 | flat_sequence_tensor = tf.reshape(sequence_tensor, 114 | [batch_size * seq_length, width]) 115 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 116 | return output_tensor 117 | 118 | # add sequence mask for: 119 | # 1. random shuffle lm modeling---xlnet with random shuffled input 120 | # 2. left2right and right2left language modeling 121 | # 3. 
conditional generation 122 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): 123 | if seq_type == 'seq2seq': 124 | if mask_sequence is not None: 125 | seq_shape = get_shape_list(mask_sequence, expected_rank=2) 126 | seq_len = seq_shape[1] 127 | ones = tf.ones((1, seq_len, seq_len)) 128 | a_mask = tf.matrix_band_part(ones, -1, 0) 129 | s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) 130 | s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) 131 | a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask 132 | # generate mask of batch x seq_len x seq_len 133 | a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) 134 | out_mask = attention_mask * a_mask 135 | else: 136 | ones = tf.ones_like(attention_mask[:1]) 137 | mask = (tf.matrix_band_part(ones, -1, 0)) 138 | out_mask = attention_mask * mask 139 | else: 140 | out_mask = attention_mask 141 | 142 | return out_mask 143 | 144 | -------------------------------------------------------------------------------- /optimization_finetuning.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 
59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
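# [Editor's note -- illustrative comment, not part of the original file]
# Concretely, the decoupled scheme applied just below amounts to
#     w  <-  w - lr * ( m / (sqrt(v) + eps) + weight_decay_rate * w )
# i.e. the decay term is added to the Adam step only after m and v have been
# computed from the raw gradients, so it never contaminates the moments the
# way an L2 penalty added to the loss would.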
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /tf_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multiclass 3 | from: 4 | https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py 5 | 6 | """ 7 | 8 | __author__ = "Guillaume Genthial" 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix 13 | 14 | 15 | def precision(labels, predictions, num_classes, pos_indices=None, 16 | weights=None, average='micro'): 17 | """Multi-class precision metric for Tensorflow 18 | Parameters 19 | ---------- 20 | labels : Tensor of tf.int32 or tf.int64 21 | The true labels 22 | predictions : Tensor of tf.int32 or tf.int64 23 | The predictions, same shape as labels 24 | num_classes : int 25 | The number of classes 26 | pos_indices : list of int, optional 27 | The indices of the positive classes, default is all 28 | weights : Tensor of tf.int32, optional 29 | Mask, must be of compatible shape with labels 30 | average : str, optional 31 | 'micro': counts the total number of true positives, false 32 | positives, and false negatives for the classes in 33 | `pos_indices` and infer the metric from it. 34 | 'macro': will compute the metric separately for each class in 35 | `pos_indices` and average. Will not account for class 36 | imbalance. 37 | 'weighted': will compute the metric separately for each class in 38 | `pos_indices` and perform a weighted average by the total 39 | number of true labels for each class. 
40 | Returns 41 | ------- 42 | tuple of (scalar float Tensor, update_op) 43 | """ 44 | cm, op = _streaming_confusion_matrix( 45 | labels, predictions, num_classes, weights) 46 | pr, _, _ = metrics_from_confusion_matrix( 47 | cm, pos_indices, average=average) 48 | op, _, _ = metrics_from_confusion_matrix( 49 | op, pos_indices, average=average) 50 | return (pr, op) 51 | 52 | 53 | def recall(labels, predictions, num_classes, pos_indices=None, weights=None, 54 | average='micro'): 55 | """Multi-class recall metric for Tensorflow 56 | Parameters 57 | ---------- 58 | labels : Tensor of tf.int32 or tf.int64 59 | The true labels 60 | predictions : Tensor of tf.int32 or tf.int64 61 | The predictions, same shape as labels 62 | num_classes : int 63 | The number of classes 64 | pos_indices : list of int, optional 65 | The indices of the positive classes, default is all 66 | weights : Tensor of tf.int32, optional 67 | Mask, must be of compatible shape with labels 68 | average : str, optional 69 | 'micro': counts the total number of true positives, false 70 | positives, and false negatives for the classes in 71 | `pos_indices` and infer the metric from it. 72 | 'macro': will compute the metric separately for each class in 73 | `pos_indices` and average. Will not account for class 74 | imbalance. 75 | 'weighted': will compute the metric separately for each class in 76 | `pos_indices` and perform a weighted average by the total 77 | number of true labels for each class. 78 | Returns 79 | ------- 80 | tuple of (scalar float Tensor, update_op) 81 | """ 82 | cm, op = _streaming_confusion_matrix( 83 | labels, predictions, num_classes, weights) 84 | _, re, _ = metrics_from_confusion_matrix( 85 | cm, pos_indices, average=average) 86 | _, op, _ = metrics_from_confusion_matrix( 87 | op, pos_indices, average=average) 88 | return (re, op) 89 | 90 | 91 | def f1(labels, predictions, num_classes, pos_indices=None, weights=None, 92 | average='micro'): 93 | return fbeta(labels, predictions, num_classes, pos_indices, weights, 94 | average) 95 | 96 | 97 | def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, 98 | average='micro', beta=1): 99 | """Multi-class fbeta metric for Tensorflow 100 | Parameters 101 | ---------- 102 | labels : Tensor of tf.int32 or tf.int64 103 | The true labels 104 | predictions : Tensor of tf.int32 or tf.int64 105 | The predictions, same shape as labels 106 | num_classes : int 107 | The number of classes 108 | pos_indices : list of int, optional 109 | The indices of the positive classes, default is all 110 | weights : Tensor of tf.int32, optional 111 | Mask, must be of compatible shape with labels 112 | average : str, optional 113 | 'micro': counts the total number of true positives, false 114 | positives, and false negatives for the classes in 115 | `pos_indices` and infer the metric from it. 116 | 'macro': will compute the metric separately for each class in 117 | `pos_indices` and average. Will not account for class 118 | imbalance. 119 | 'weighted': will compute the metric separately for each class in 120 | `pos_indices` and perform a weighted average by the total 121 | number of true labels for each class. 
122 | beta : int, optional 123 | Weight of precision in harmonic mean 124 | Returns 125 | ------- 126 | tuple of (scalar float Tensor, update_op) 127 | """ 128 | cm, op = _streaming_confusion_matrix( 129 | labels, predictions, num_classes, weights) 130 | _, _, fbeta = metrics_from_confusion_matrix( 131 | cm, pos_indices, average=average, beta=beta) 132 | _, _, op = metrics_from_confusion_matrix( 133 | op, pos_indices, average=average, beta=beta) 134 | return (fbeta, op) 135 | 136 | 137 | def safe_div(numerator, denominator): 138 | """Safe division, return 0 if denominator is 0""" 139 | numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) 140 | zeros = tf.zeros_like(numerator, dtype=numerator.dtype) 141 | denominator_is_zero = tf.equal(denominator, zeros) 142 | return tf.where(denominator_is_zero, zeros, numerator / denominator) 143 | 144 | 145 | def pr_re_fbeta(cm, pos_indices, beta=1): 146 | """Uses a confusion matrix to compute precision, recall and fbeta""" 147 | num_classes = cm.shape[0] 148 | neg_indices = [i for i in range(num_classes) if i not in pos_indices] 149 | cm_mask = np.ones([num_classes, num_classes]) 150 | cm_mask[neg_indices, neg_indices] = 0 151 | diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) 152 | 153 | cm_mask = np.ones([num_classes, num_classes]) 154 | cm_mask[:, neg_indices] = 0 155 | tot_pred = tf.reduce_sum(cm * cm_mask) 156 | 157 | cm_mask = np.ones([num_classes, num_classes]) 158 | cm_mask[neg_indices, :] = 0 159 | tot_gold = tf.reduce_sum(cm * cm_mask) 160 | 161 | pr = safe_div(diag_sum, tot_pred) 162 | re = safe_div(diag_sum, tot_gold) 163 | fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) 164 | 165 | return pr, re, fbeta 166 | 167 | 168 | def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', 169 | beta=1): 170 | """Precision, Recall and F1 from the confusion matrix 171 | Parameters 172 | ---------- 173 | cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) 174 | The streaming confusion matrix. 
175 | pos_indices : list of int, optional 176 | The indices of the positive classes 177 | beta : int, optional 178 | Weight of precision in harmonic mean 179 | average : str, optional 180 | 'micro', 'macro' or 'weighted' 181 | """ 182 | num_classes = cm.shape[0] 183 | if pos_indices is None: 184 | pos_indices = [i for i in range(num_classes)] 185 | 186 | if average == 'micro': 187 | return pr_re_fbeta(cm, pos_indices, beta) 188 | elif average in {'macro', 'weighted'}: 189 | precisions, recalls, fbetas, n_golds = [], [], [], [] 190 | for idx in pos_indices: 191 | pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) 192 | precisions.append(pr) 193 | recalls.append(re) 194 | fbetas.append(fbeta) 195 | cm_mask = np.zeros([num_classes, num_classes]) 196 | cm_mask[idx, :] = 1 197 | n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) 198 | 199 | if average == 'macro': 200 | pr = tf.reduce_mean(precisions) 201 | re = tf.reduce_mean(recalls) 202 | fbeta = tf.reduce_mean(fbetas) 203 | return pr, re, fbeta 204 | if average == 'weighted': 205 | n_gold = tf.reduce_sum(n_golds) 206 | pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) 207 | pr = safe_div(pr_sum, n_gold) 208 | re_sum = sum(r * n for r, n in zip(recalls, n_golds)) 209 | re = safe_div(re_sum, n_gold) 210 | fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) 211 | fbeta = safe_div(fbeta_sum, n_gold) 212 | return pr, re, fbeta 213 | 214 | else: 215 | raise NotImplementedError() -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 
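# [Editor's note -- illustrative comment, not part of the original file]
# A worked example of the resulting schedule, assuming the run_pretraining.py
# defaults init_lr=5e-5, num_warmup_steps=10000, num_train_steps=100000:
#     step   1,000 (warmup):       lr = 1000/10000 * 5e-5 = 5e-6
#     step  10,000 (warmup ends):  lr = 5e-5 * (1 - 0.10)  = 4.5e-5
#     step  55,000 (linear decay): lr = 5e-5 * (1 - 0.55)  = 2.25e-5
#     step 100,000 (end):          lr = 0.0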
42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = LAMBOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 
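# [Editor's note -- descriptive comment, not part of the original file]
# The two statements below are the usual exponential moving averages
#     m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
#     v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
# and the raw step is m_t / (sqrt(v_t) + epsilon). As in the original BERT
# optimizer, no bias correction is applied to m and v.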
131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | 176 | 177 | # 178 | class LAMBOptimizer(tf.train.Optimizer): 179 | """ 180 | LAMBOptimizer optimizer. 181 | https://github.com/ymcui/LAMB_Optimizer_TF 182 | # IMPORTANT NOTE 183 | - This is NOT an official implementation. 184 | - LAMB optimizer is changed from arXiv v1 ~ v3. 185 | - We implement v3 version (which is the latest version on June, 2019.). 186 | - Our implementation is based on `AdamWeightDecayOptimizer` in BERT (provided by Google). 187 | 188 | # References 189 | - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962v3 190 | - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https://arxiv.org/abs/1810.04805 191 | # Parameters 192 | - There is nothing special, just the same as `AdamWeightDecayOptimizer`. 
193 | """ 194 | 195 | def __init__(self, 196 | learning_rate, 197 | weight_decay_rate=0.01, 198 | beta_1=0.9, 199 | beta_2=0.999, 200 | epsilon=1e-6, 201 | exclude_from_weight_decay=None, 202 | name="LAMBOptimizer"): 203 | """Constructs a LAMBOptimizer.""" 204 | super(LAMBOptimizer, self).__init__(False, name) 205 | 206 | self.learning_rate = learning_rate 207 | self.weight_decay_rate = weight_decay_rate 208 | self.beta_1 = beta_1 209 | self.beta_2 = beta_2 210 | self.epsilon = epsilon 211 | self.exclude_from_weight_decay = exclude_from_weight_decay 212 | 213 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 214 | """See base class.""" 215 | assignments = [] 216 | for (grad, param) in grads_and_vars: 217 | if grad is None or param is None: 218 | continue 219 | 220 | param_name = self._get_variable_name(param.name) 221 | 222 | m = tf.get_variable( 223 | name=param_name + "/lamb_m", 224 | shape=param.shape.as_list(), 225 | dtype=tf.float32, 226 | trainable=False, 227 | initializer=tf.zeros_initializer()) 228 | v = tf.get_variable( 229 | name=param_name + "/lamb_v", 230 | shape=param.shape.as_list(), 231 | dtype=tf.float32, 232 | trainable=False, 233 | initializer=tf.zeros_initializer()) 234 | 235 | # Standard Adam update. 236 | next_m = ( 237 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 238 | next_v = ( 239 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 240 | tf.square(grad))) 241 | 242 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 243 | 244 | # Just adding the square of the weights to the loss function is *not* 245 | # the correct way of using L2 regularization/weight decay with Adam, 246 | # since that will interact with the m and v parameters in strange ways. 247 | # 248 | # Instead we want ot decay the weights in a manner that doesn't interact 249 | # with the m/v parameters. This is equivalent to adding the square 250 | # of the weights to the loss with plain (non-momentum) SGD. 251 | if self._do_use_weight_decay(param_name): 252 | update += self.weight_decay_rate * param 253 | 254 | ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ############## 255 | 256 | # Note: Here are two choices for scaling function \phi(z) 257 | # minmax: \phi(z) = min(max(z, \gamma_l), \gamma_u) 258 | # identity: \phi(z) = z 259 | # The authors does not mention what is \gamma_l and \gamma_u 260 | # UPDATE: after asking authors, they provide me the code below. 
261 | # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( 262 | # math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) 263 | 264 | r1 = tf.sqrt(tf.reduce_sum(tf.square(param))) 265 | r2 = tf.sqrt(tf.reduce_sum(tf.square(update))) 266 | 267 | r = tf.where(tf.greater(r1, 0.0), 268 | tf.where(tf.greater(r2, 0.0), 269 | r1 / r2, 270 | 1.0), 271 | 1.0) 272 | 273 | eta = self.learning_rate * r 274 | 275 | update_with_lr = eta * update 276 | 277 | next_param = param - update_with_lr 278 | 279 | assignments.extend( 280 | [param.assign(next_param), 281 | m.assign(next_m), 282 | v.assign(next_v)]) 283 | return tf.group(*assignments, name=name) 284 | 285 | def _do_use_weight_decay(self, param_name): 286 | """Whether to use L2 weight decay for `param_name`.""" 287 | if not self.weight_decay_rate: 288 | return False 289 | if self.exclude_from_weight_decay: 290 | for r in self.exclude_from_weight_decay: 291 | if re.search(r, param_name) is not None: 292 | return False 293 | return True 294 | 295 | def _get_variable_name(self, param_name): 296 | """Get the variable name from the tensor name.""" 297 | m = re.match("^(.*):\\d+$", param_name) 298 | if m is not None: 299 | param_name = m.group(1) 300 | return param_name -------------------------------------------------------------------------------- /tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
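# [Editor's note -- illustrative comment, not part of the original file]
# Example of the heuristic: for a hypothetical path such as
# "/tmp/uncased_L-12_H-768_A-12/bert_model.ckpt", the regex below captures
# model_name = "uncased_L-12_H-768_A-12"; that name is in lower_models, so
# passing --do_lower_case=False would raise the ValueError at the end.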
35 | 36 | if not init_checkpoint: 37 | return 38 | 39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) 40 | if m is None: 41 | return 42 | 43 | model_name = m.group(1) 44 | 45 | lower_models = [ 46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", 47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" 48 | ] 49 | 50 | cased_models = [ 51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", 52 | "multi_cased_L-12_H-768_A-12" 53 | ] 54 | 55 | is_bad_config = False 56 | if model_name in lower_models and not do_lower_case: 57 | is_bad_config = True 58 | actual_flag = "False" 59 | case_name = "lowercased" 60 | opposite_flag = "True" 61 | 62 | if model_name in cased_models and do_lower_case: 63 | is_bad_config = True 64 | actual_flag = "True" 65 | case_name = "cased" 66 | opposite_flag = "False" 67 | 68 | if is_bad_config: 69 | raise ValueError( 70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " 71 | "However, `%s` seems to be a %s model, so you " 72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches " 73 | "how the model was pre-training. If this error is wrong, please " 74 | "just comment out this check." % (actual_flag, init_checkpoint, 75 | model_name, case_name, opposite_flag)) 76 | 77 | 78 | def convert_to_unicode(text): 79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 80 | if six.PY3: 81 | if isinstance(text, str): 82 | return text 83 | elif isinstance(text, bytes): 84 | return text.decode("utf-8", "ignore") 85 | else: 86 | raise ValueError("Unsupported string type: %s" % (type(text))) 87 | elif six.PY2: 88 | if isinstance(text, str): 89 | return text.decode("utf-8", "ignore") 90 | elif isinstance(text, unicode): 91 | return text 92 | else: 93 | raise ValueError("Unsupported string type: %s" % (type(text))) 94 | else: 95 | raise ValueError("Not running on Python2 or Python 3?") 96 | 97 | 98 | def printable_text(text): 99 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 100 | 101 | # These functions want `str` for both Python2 and Python3, but in one case 102 | # it's a Unicode string and in the other it's a byte string. 
103 | if six.PY3: 104 | if isinstance(text, str): 105 | return text 106 | elif isinstance(text, bytes): 107 | return text.decode("utf-8", "ignore") 108 | else: 109 | raise ValueError("Unsupported string type: %s" % (type(text))) 110 | elif six.PY2: 111 | if isinstance(text, str): 112 | return text 113 | elif isinstance(text, unicode): 114 | return text.encode("utf-8") 115 | else: 116 | raise ValueError("Unsupported string type: %s" % (type(text))) 117 | else: 118 | raise ValueError("Not running on Python2 or Python 3?") 119 | 120 | 121 | def load_vocab(vocab_file): 122 | """Loads a vocabulary file into a dictionary.""" 123 | vocab = collections.OrderedDict() 124 | index = 0 125 | with tf.gfile.GFile(vocab_file, "r") as reader: 126 | while True: 127 | token = convert_to_unicode(reader.readline()) 128 | if not token: 129 | break 130 | token = token.strip() 131 | vocab[token] = index 132 | index += 1 133 | return vocab 134 | 135 | 136 | def convert_by_vocab(vocab, items): 137 | """Converts a sequence of [tokens|ids] using the vocab.""" 138 | output = [] 139 | #print("items:",items) #['[CLS]', '日', '##期', ',', '但', '被', '##告', '金', '##东', '##福', '载', '##明', '[MASK]', 'U', '##N', '##K', ']', '保', '##证', '本', '##月', '1', '##4', '[MASK]', '到', '##位', ',', '2', '##0', '##1', '##5', '年', '6', '[MASK]', '1', '##1', '日', '[', 'U', '##N', '##K', ']', ',', '原', '##告', '[MASK]', '认', '##可', '于', '2', '##0', '##1', '##5', '[MASK]', '6', '月', '[MASK]', '[MASK]', '日', '##向', '被', '##告', '主', '##张', '权', '##利', '。', '而', '[MASK]', '[MASK]', '自', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '年', '6', '月', '1', '##1', '日', '[SEP]', '原', '##告', '于', '2', '##0', '##1', '##6', '[MASK]', '6', '[MASK]', '2', '##4', '日', '起', '##诉', ',', '主', '##张', '保', '##证', '责', '##任', ',', '已', '超', '##过', '保', '##证', '期', '##限', '[MASK]', '保', '##证', '人', '依', '##法', '不', '##再', '承', '##担', '保', '##证', '[MASK]', '[MASK]', '[MASK]', '[SEP]'] 140 | for i,item in enumerate(items): 141 | #print(i,"item:",item) # ##期 142 | output.append(vocab[item]) 143 | return output 144 | 145 | 146 | def convert_tokens_to_ids(vocab, tokens): 147 | return convert_by_vocab(vocab, tokens) 148 | 149 | 150 | def convert_ids_to_tokens(inv_vocab, ids): 151 | return convert_by_vocab(inv_vocab, ids) 152 | 153 | 154 | def whitespace_tokenize(text): 155 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 156 | text = text.strip() 157 | if not text: 158 | return [] 159 | tokens = text.split() 160 | return tokens 161 | 162 | 163 | class FullTokenizer(object): 164 | """Runs end-to-end tokenziation.""" 165 | 166 | def __init__(self, vocab_file, do_lower_case=True): 167 | self.vocab = load_vocab(vocab_file) 168 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 169 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 170 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 171 | 172 | def tokenize(self, text): 173 | split_tokens = [] 174 | for token in self.basic_tokenizer.tokenize(text): 175 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 176 | split_tokens.append(sub_token) 177 | 178 | return split_tokens 179 | 180 | def convert_tokens_to_ids(self, tokens): 181 | return convert_by_vocab(self.vocab, tokens) 182 | 183 | def convert_ids_to_tokens(self, ids): 184 | return convert_by_vocab(self.inv_vocab, ids) 185 | 186 | 187 | class BasicTokenizer(object): 188 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 189 | 190 | def __init__(self, do_lower_case=True): 191 | 
"""Constructs a BasicTokenizer. 192 | 193 | Args: 194 | do_lower_case: Whether to lower case the input. 195 | """ 196 | self.do_lower_case = do_lower_case 197 | 198 | def tokenize(self, text): 199 | """Tokenizes a piece of text.""" 200 | text = convert_to_unicode(text) 201 | text = self._clean_text(text) 202 | 203 | # This was added on November 1st, 2018 for the multilingual and Chinese 204 | # models. This is also applied to the English models now, but it doesn't 205 | # matter since the English models were not trained on any Chinese data 206 | # and generally don't have any Chinese data in them (there are Chinese 207 | # characters in the vocabulary because Wikipedia does have some Chinese 208 | # words in the English Wikipedia.). 209 | text = self._tokenize_chinese_chars(text) 210 | 211 | orig_tokens = whitespace_tokenize(text) 212 | split_tokens = [] 213 | for token in orig_tokens: 214 | if self.do_lower_case: 215 | token = token.lower() 216 | token = self._run_strip_accents(token) 217 | split_tokens.extend(self._run_split_on_punc(token)) 218 | 219 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 220 | return output_tokens 221 | 222 | def _run_strip_accents(self, text): 223 | """Strips accents from a piece of text.""" 224 | text = unicodedata.normalize("NFD", text) 225 | output = [] 226 | for char in text: 227 | cat = unicodedata.category(char) 228 | if cat == "Mn": 229 | continue 230 | output.append(char) 231 | return "".join(output) 232 | 233 | def _run_split_on_punc(self, text): 234 | """Splits punctuation on a piece of text.""" 235 | chars = list(text) 236 | i = 0 237 | start_new_word = True 238 | output = [] 239 | while i < len(chars): 240 | char = chars[i] 241 | if _is_punctuation(char): 242 | output.append([char]) 243 | start_new_word = True 244 | else: 245 | if start_new_word: 246 | output.append([]) 247 | start_new_word = False 248 | output[-1].append(char) 249 | i += 1 250 | 251 | return ["".join(x) for x in output] 252 | 253 | def _tokenize_chinese_chars(self, text): 254 | """Adds whitespace around any CJK character.""" 255 | output = [] 256 | for char in text: 257 | cp = ord(char) 258 | if self._is_chinese_char(cp): 259 | output.append(" ") 260 | output.append(char) 261 | output.append(" ") 262 | else: 263 | output.append(char) 264 | return "".join(output) 265 | 266 | def _is_chinese_char(self, cp): 267 | """Checks whether CP is the codepoint of a CJK character.""" 268 | # This defines a "chinese character" as anything in the CJK Unicode block: 269 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 270 | # 271 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 272 | # despite its name. The modern Korean Hangul alphabet is a different block, 273 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 274 | # space-separated words, so they are not treated specially and handled 275 | # like the all of the other languages. 
276 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 277 | (cp >= 0x3400 and cp <= 0x4DBF) or # 278 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 279 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 280 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 281 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 282 | (cp >= 0xF900 and cp <= 0xFAFF) or # 283 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 284 | return True 285 | 286 | return False 287 | 288 | def _clean_text(self, text): 289 | """Performs invalid character removal and whitespace cleanup on text.""" 290 | output = [] 291 | for char in text: 292 | cp = ord(char) 293 | if cp == 0 or cp == 0xfffd or _is_control(char): 294 | continue 295 | if _is_whitespace(char): 296 | output.append(" ") 297 | else: 298 | output.append(char) 299 | return "".join(output) 300 | 301 | 302 | class WordpieceTokenizer(object): 303 | """Runs WordPiece tokenziation.""" 304 | 305 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 306 | self.vocab = vocab 307 | self.unk_token = unk_token 308 | self.max_input_chars_per_word = max_input_chars_per_word 309 | 310 | def tokenize(self, text): 311 | """Tokenizes a piece of text into its word pieces. 312 | 313 | This uses a greedy longest-match-first algorithm to perform tokenization 314 | using the given vocabulary. 315 | 316 | For example: 317 | input = "unaffable" 318 | output = ["un", "##aff", "##able"] 319 | 320 | Args: 321 | text: A single token or whitespace separated tokens. This should have 322 | already been passed through `BasicTokenizer. 323 | 324 | Returns: 325 | A list of wordpiece tokens. 326 | """ 327 | 328 | text = convert_to_unicode(text) 329 | 330 | output_tokens = [] 331 | for token in whitespace_tokenize(text): 332 | chars = list(token) 333 | if len(chars) > self.max_input_chars_per_word: 334 | output_tokens.append(self.unk_token) 335 | continue 336 | 337 | is_bad = False 338 | start = 0 339 | sub_tokens = [] 340 | while start < len(chars): 341 | end = len(chars) 342 | cur_substr = None 343 | while start < end: 344 | substr = "".join(chars[start:end]) 345 | if start > 0: 346 | substr = "##" + substr 347 | if substr in self.vocab: 348 | cur_substr = substr 349 | break 350 | end -= 1 351 | if cur_substr is None: 352 | is_bad = True 353 | break 354 | sub_tokens.append(cur_substr) 355 | start = end 356 | 357 | if is_bad: 358 | output_tokens.append(self.unk_token) 359 | else: 360 | output_tokens.extend(sub_tokens) 361 | return output_tokens 362 | 363 | 364 | def _is_whitespace(char): 365 | """Checks whether `chars` is a whitespace character.""" 366 | # \t, \n, and \r are technically contorl characters but we treat them 367 | # as whitespace since they are generally considered as such. 368 | if char == " " or char == "\t" or char == "\n" or char == "\r": 369 | return True 370 | cat = unicodedata.category(char) 371 | if cat == "Zs": 372 | return True 373 | return False 374 | 375 | 376 | def _is_control(char): 377 | """Checks whether `chars` is a control character.""" 378 | # These are technically control characters but we count them as whitespace 379 | # characters. 380 | if char == "\t" or char == "\n" or char == "\r": 381 | return False 382 | cat = unicodedata.category(char) 383 | if cat in ("Cc", "Cf"): 384 | return True 385 | return False 386 | 387 | 388 | def _is_punctuation(char): 389 | """Checks whether `chars` is a punctuation character.""" 390 | cp = ord(char) 391 | # We treat all non-letter/number ASCII as punctuation. 
392 | # Characters such as "^", "$", and "`" are not in the Unicode 393 | # Punctuation class but we treat them as punctuation anyways, for 394 | # consistency. 395 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 396 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 397 | return True 398 | cat = unicodedata.category(char) 399 | if cat.startswith("P"): 400 | return True 401 | return False 402 | -------------------------------------------------------------------------------- /run_pretraining.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import modeling 23 | import optimization 24 | import tensorflow as tf 25 | 26 | flags = tf.flags 27 | 28 | FLAGS = flags.FLAGS 29 | 30 | ## Required parameters 31 | flags.DEFINE_string( 32 | "bert_config_file", None, 33 | "The config json file corresponding to the pre-trained BERT model. " 34 | "This specifies the model architecture.") 35 | 36 | flags.DEFINE_string( 37 | "input_file", None, 38 | "Input TF example files (can be a glob or comma separated).") 39 | 40 | flags.DEFINE_string( 41 | "output_dir", None, 42 | "The output directory where the model checkpoints will be written.") 43 | 44 | ## Other parameters 45 | flags.DEFINE_string( 46 | "init_checkpoint", None, 47 | "Initial checkpoint (usually from a pre-trained BERT model).") 48 | 49 | flags.DEFINE_integer( 50 | "max_seq_length", 128, 51 | "The maximum total input sequence length after WordPiece tokenization. " 52 | "Sequences longer than this will be truncated, and sequences shorter " 53 | "than this will be padded. Must match data generation.") 54 | 55 | flags.DEFINE_integer( 56 | "max_predictions_per_seq", 20, 57 | "Maximum number of masked LM predictions per sequence. 
" 58 | "Must match data generation.") 59 | 60 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 61 | 62 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 63 | 64 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 65 | 66 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 67 | 68 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 69 | 70 | flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") 71 | 72 | flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") 73 | 74 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 75 | "How often to save the model checkpoint.") 76 | 77 | flags.DEFINE_integer("iterations_per_loop", 1000, 78 | "How many steps to make in each estimator call.") 79 | 80 | flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") 81 | 82 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 83 | 84 | tf.flags.DEFINE_string( 85 | "tpu_name", None, 86 | "The Cloud TPU to use for training. This should be either the name " 87 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 88 | "url.") 89 | 90 | tf.flags.DEFINE_string( 91 | "tpu_zone", None, 92 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 93 | "specified, we will attempt to automatically detect the GCE project from " 94 | "metadata.") 95 | 96 | tf.flags.DEFINE_string( 97 | "gcp_project", None, 98 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 99 | "specified, we will attempt to automatically detect the GCE project from " 100 | "metadata.") 101 | 102 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 103 | 104 | flags.DEFINE_integer( 105 | "num_tpu_cores", 8, 106 | "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") 107 | 108 | 109 | def model_fn_builder(bert_config, init_checkpoint, learning_rate, 110 | num_train_steps, num_warmup_steps, use_tpu, 111 | use_one_hot_embeddings): 112 | """Returns `model_fn` closure for TPUEstimator.""" 113 | 114 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 115 | """The `model_fn` for TPUEstimator.""" 116 | 117 | tf.logging.info("*** Features ***") 118 | for name in sorted(features.keys()): 119 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 120 | 121 | input_ids = features["input_ids"] 122 | input_mask = features["input_mask"] 123 | segment_ids = features["segment_ids"] 124 | masked_lm_positions = features["masked_lm_positions"] 125 | masked_lm_ids = features["masked_lm_ids"] 126 | masked_lm_weights = features["masked_lm_weights"] 127 | next_sentence_labels = features["next_sentence_labels"] 128 | 129 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 130 | 131 | model = modeling.BertModel( 132 | config=bert_config, 133 | is_training=is_training, 134 | input_ids=input_ids, 135 | input_mask=input_mask, 136 | token_type_ids=segment_ids, 137 | use_one_hot_embeddings=use_one_hot_embeddings) 138 | 139 | (masked_lm_loss, 140 | masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( 141 | bert_config, model.get_sequence_output(), model.get_embedding_table(),model.get_embedding_table_2(), 142 | masked_lm_positions, masked_lm_ids, masked_lm_weights) 143 | 144 | (next_sentence_loss, next_sentence_example_loss, 145 | next_sentence_log_probs) = get_next_sentence_output( 146 | bert_config, model.get_pooled_output(), next_sentence_labels) 147 | 148 | total_loss = masked_lm_loss + next_sentence_loss 149 | 150 | tvars = tf.trainable_variables() 151 | 152 | initialized_variable_names = {} 153 | print("init_checkpoint:",init_checkpoint) 154 | scaffold_fn = None 155 | if init_checkpoint: 156 | (assignment_map, initialized_variable_names 157 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 158 | if use_tpu: 159 | 160 | def tpu_scaffold(): 161 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 162 | return tf.train.Scaffold() 163 | 164 | scaffold_fn = tpu_scaffold 165 | else: 166 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 167 | 168 | tf.logging.info("**** Trainable Variables ****") 169 | for var in tvars: 170 | init_string = "" 171 | if var.name in initialized_variable_names: 172 | init_string = ", *INIT_FROM_CKPT*" 173 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 174 | init_string) 175 | 176 | output_spec = None 177 | if mode == tf.estimator.ModeKeys.TRAIN: 178 | train_op = optimization.create_optimizer( 179 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 180 | 181 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 182 | mode=mode, 183 | loss=total_loss, 184 | train_op=train_op, 185 | scaffold_fn=scaffold_fn) 186 | elif mode == tf.estimator.ModeKeys.EVAL: 187 | 188 | def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 189 | masked_lm_weights, next_sentence_example_loss, 190 | next_sentence_log_probs, next_sentence_labels): 191 | """Computes the loss and accuracy of the model.""" 192 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) 193 | masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) 194 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 195 | 
masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 196 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 197 | masked_lm_accuracy = tf.metrics.accuracy( 198 | labels=masked_lm_ids, 199 | predictions=masked_lm_predictions, 200 | weights=masked_lm_weights) 201 | masked_lm_mean_loss = tf.metrics.mean( 202 | values=masked_lm_example_loss, weights=masked_lm_weights) 203 | 204 | next_sentence_log_probs = tf.reshape( 205 | next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) 206 | next_sentence_predictions = tf.argmax( 207 | next_sentence_log_probs, axis=-1, output_type=tf.int32) 208 | next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) 209 | next_sentence_accuracy = tf.metrics.accuracy( 210 | labels=next_sentence_labels, predictions=next_sentence_predictions) 211 | next_sentence_mean_loss = tf.metrics.mean( 212 | values=next_sentence_example_loss) 213 | 214 | return { 215 | "masked_lm_accuracy": masked_lm_accuracy, 216 | "masked_lm_loss": masked_lm_mean_loss, 217 | "next_sentence_accuracy": next_sentence_accuracy, 218 | "next_sentence_loss": next_sentence_mean_loss, 219 | } 220 | 221 | # next_sentence_example_loss=0.0 TODO 222 | # next_sentence_log_probs=0.0 # TODO 223 | eval_metrics = (metric_fn, [ 224 | masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 225 | masked_lm_weights, next_sentence_example_loss, 226 | next_sentence_log_probs, next_sentence_labels 227 | ]) 228 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 229 | mode=mode, 230 | loss=total_loss, 231 | eval_metrics=eval_metrics, 232 | scaffold_fn=scaffold_fn) 233 | else: 234 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 235 | 236 | return output_spec 237 | 238 | return model_fn 239 | 240 | 241 | def get_masked_lm_output(bert_config, input_tensor, output_weights,project_weights, positions, 242 | label_ids, label_weights): 243 | """Get loss and log probs for the masked LM.""" 244 | input_tensor = gather_indexes(input_tensor, positions) 245 | 246 | with tf.variable_scope("cls/predictions"): 247 | # We apply one more non-linear transformation before the output layer. 248 | # This matrix is not used after pre-training. 249 | with tf.variable_scope("transform"): 250 | input_tensor = tf.layers.dense( 251 | input_tensor, 252 | units=bert_config.hidden_size, 253 | activation=modeling.get_activation(bert_config.hidden_act), 254 | kernel_initializer=modeling.create_initializer( 255 | bert_config.initializer_range)) 256 | input_tensor = modeling.layer_norm(input_tensor) 257 | 258 | # The output weights are the same as the input embeddings, but there is 259 | # an output-only bias for each token. 
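# --- Illustrative sketch (plain Python, not part of the original file) of the
# weighting used in metric_fn above: tf.metrics.accuracy and tf.metrics.mean
# with `weights=masked_lm_weights` compute sum(w * value) / sum(w), so the
# zero-weighted padding slots among the masked_lm_positions do not dilute the
# masked-LM accuracy or mean loss. The numbers below are hypothetical.
label_ids_ex   = [5, 9, 0, 0]          # last two slots are padding
predictions_ex = [5, 2, 7, 0]
weights_ex     = [1.0, 1.0, 0.0, 0.0]
correct = [float(l == p) for l, p in zip(label_ids_ex, predictions_ex)]
weighted_accuracy = sum(w * c for w, c in zip(weights_ex, correct)) / sum(weights_ex)
print(weighted_accuracy)               # 0.5 -- only the two real predictions count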
260 | output_bias = tf.get_variable( 261 | "output_bias", 262 | shape=[bert_config.vocab_size], 263 | initializer=tf.zeros_initializer()) 264 | # logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 265 | # input_tensor=[-1,hidden_size], project_weights=[embedding_size, hidden_size], project_weights_transpose=[hidden_size, embedding_size]--->[-1, embedding_size] 266 | input_project = tf.matmul(input_tensor, project_weights, transpose_b=True) 267 | logits = tf.matmul(input_project, output_weights, transpose_b=True) 268 | # # input_project=[-1, embedding_size], output_weights=[vocab_size, embedding_size], output_weights_transpose=[embedding_size, vocab_size] ---> [-1, vocab_size] 269 | 270 | logits = tf.nn.bias_add(logits, output_bias) 271 | log_probs = tf.nn.log_softmax(logits, axis=-1) 272 | 273 | label_ids = tf.reshape(label_ids, [-1]) 274 | label_weights = tf.reshape(label_weights, [-1]) 275 | 276 | one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) 277 | 278 | # The `positions` tensor might be zero-padded (if the sequence is too 279 | # short to have the maximum number of predictions). The `label_weights` 280 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 281 | # padding predictions. 282 | per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) 283 | numerator = tf.reduce_sum(label_weights * per_example_loss) 284 | denominator = tf.reduce_sum(label_weights) + 1e-5 285 | loss = numerator / denominator 286 | 287 | return (loss, per_example_loss, log_probs) 288 | 289 | 290 | def get_next_sentence_output(bert_config, input_tensor, labels): 291 | """Get loss and log probs for the next sentence prediction.""" 292 | 293 | # Simple binary classification. Note that 0 is "next sentence" and 1 is 294 | # "random sentence". This weight matrix is not used after pre-training. 
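# --- Illustrative numpy sketch (not part of the original file) of the two
# matmuls above. The hidden states gathered at the masked positions are first
# projected back to the small embedding space through project_weights (passed
# in as model.get_embedding_table_2()), and only then multiplied by the tied
# token embedding table output_weights (model.get_embedding_table()) to get
# vocabulary logits. The sizes below are illustrative assumptions, not values
# read from any config file.
import numpy as np

n_preds, hidden_size, embedding_size, vocab_size = 20, 768, 128, 21128
input_tensor    = np.zeros((n_preds, hidden_size))         # gathered masked positions
project_weights = np.zeros((embedding_size, hidden_size))  # [embedding_size, hidden_size]
output_weights  = np.zeros((vocab_size, embedding_size))   # [vocab_size, embedding_size]

input_project = input_tensor @ project_weights.T           # -> (20, 128)
logits        = input_project @ output_weights.T           # -> (20, 21128)
print(input_project.shape, logits.shape)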
295 | with tf.variable_scope("cls/seq_relationship"): 296 | output_weights = tf.get_variable( 297 | "output_weights", 298 | shape=[2, bert_config.hidden_size], 299 | initializer=modeling.create_initializer(bert_config.initializer_range)) 300 | output_bias = tf.get_variable( 301 | "output_bias", shape=[2], initializer=tf.zeros_initializer()) 302 | 303 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 304 | logits = tf.nn.bias_add(logits, output_bias) 305 | log_probs = tf.nn.log_softmax(logits, axis=-1) 306 | labels = tf.reshape(labels, [-1]) 307 | one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) 308 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 309 | loss = tf.reduce_mean(per_example_loss) 310 | return (loss, per_example_loss, log_probs) 311 | 312 | 313 | def gather_indexes(sequence_tensor, positions): 314 | """Gathers the vectors at the specific positions over a minibatch.""" 315 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 316 | batch_size = sequence_shape[0] 317 | seq_length = sequence_shape[1] 318 | width = sequence_shape[2] 319 | 320 | flat_offsets = tf.reshape( 321 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 322 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 323 | flat_sequence_tensor = tf.reshape(sequence_tensor, 324 | [batch_size * seq_length, width]) 325 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 326 | return output_tensor 327 | 328 | 329 | def input_fn_builder(input_files, 330 | max_seq_length, 331 | max_predictions_per_seq, 332 | is_training, 333 | num_cpu_threads=16): 334 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 335 | 336 | def input_fn(params): 337 | """The actual input function.""" 338 | batch_size = params["batch_size"] 339 | 340 | name_to_features = { 341 | "input_ids": 342 | tf.FixedLenFeature([max_seq_length], tf.int64), 343 | "input_mask": 344 | tf.FixedLenFeature([max_seq_length], tf.int64), 345 | "segment_ids": 346 | tf.FixedLenFeature([max_seq_length], tf.int64), 347 | "masked_lm_positions": 348 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 349 | "masked_lm_ids": 350 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 351 | "masked_lm_weights": 352 | tf.FixedLenFeature([max_predictions_per_seq], tf.float32), 353 | "next_sentence_labels": 354 | tf.FixedLenFeature([1], tf.int64), 355 | } 356 | 357 | # For training, we want a lot of parallel reading and shuffling. 358 | # For eval, we want no shuffling and parallel reading doesn't matter. 359 | if is_training: 360 | d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) 361 | d = d.repeat() 362 | d = d.shuffle(buffer_size=len(input_files)) 363 | 364 | # `cycle_length` is the number of parallel files that get read. 365 | cycle_length = min(num_cpu_threads, len(input_files)) 366 | 367 | # `sloppy` mode means that the interleaving is not exact. This adds 368 | # even more randomness to the training pipeline. 369 | d = d.apply( 370 | tf.contrib.data.parallel_interleave( 371 | tf.data.TFRecordDataset, 372 | sloppy=is_training, 373 | cycle_length=cycle_length)) 374 | d = d.shuffle(buffer_size=100) 375 | else: 376 | d = tf.data.TFRecordDataset(input_files) 377 | # Since we evaluate for a fixed number of steps we don't want to encounter 378 | # out-of-range exceptions. 379 | d = d.repeat() 380 | 381 | # We must `drop_remainder` on training because the TPU requires fixed 382 | # size dimensions. 
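# --- Illustrative sketch (plain Python, not part of the original file) of the
# index arithmetic in gather_indexes above: per-example masked positions are
# shifted by i * seq_length so they index into the flattened
# [batch_size * seq_length, width] sequence tensor. Toy sizes below.
batch_size_ex, seq_length_ex = 2, 6
positions_ex = [[1, 4], [0, 3]]                       # masked positions per example
flat_offsets = [i * seq_length_ex for i in range(batch_size_ex)]           # [0, 6]
flat_positions = [p + off for off, row in zip(flat_offsets, positions_ex) for p in row]
print(flat_positions)                                 # [1, 4, 6, 9]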
For eval, we assume we are evaluating on the CPU or GPU 383 | # and we *don't* want to drop the remainder, otherwise we won't cover 384 | # every sample. 385 | d = d.apply( 386 | tf.contrib.data.map_and_batch( 387 | lambda record: _decode_record(record, name_to_features), 388 | batch_size=batch_size, 389 | num_parallel_batches=num_cpu_threads, 390 | drop_remainder=True)) 391 | return d 392 | 393 | return input_fn 394 | 395 | 396 | def _decode_record(record, name_to_features): 397 | """Decodes a record to a TensorFlow example.""" 398 | example = tf.parse_single_example(record, name_to_features) 399 | 400 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 401 | # So cast all int64 to int32. 402 | for name in list(example.keys()): 403 | t = example[name] 404 | if t.dtype == tf.int64: 405 | t = tf.to_int32(t) 406 | example[name] = t 407 | 408 | return example 409 | 410 | 411 | def main(_): 412 | tf.logging.set_verbosity(tf.logging.INFO) 413 | 414 | if not FLAGS.do_train and not FLAGS.do_eval: # must run in training and/or evaluation mode 415 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 416 | 417 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # load the model configuration from the json file 418 | 419 | tf.gfile.MakeDirs(FLAGS.output_dir) 420 | 421 | input_files = [] # the input can be several comma-separated files, or a glob pattern such as "input_x*" 422 | for input_pattern in FLAGS.input_file.split(","): 423 | input_files.extend(tf.gfile.Glob(input_pattern)) 424 | 425 | tf.logging.info("*** Input Files ***") 426 | for input_file in input_files: 427 | tf.logging.info(" %s" % input_file) 428 | 429 | tpu_cluster_resolver = None 430 | if FLAGS.use_tpu and FLAGS.tpu_name: 431 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( # TODO 432 | tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 433 | 434 | print("###tpu_cluster_resolver:",tpu_cluster_resolver,";FLAGS.use_tpu:",FLAGS.use_tpu,";FLAGS.tpu_name:",FLAGS.tpu_name,";FLAGS.tpu_zone:",FLAGS.tpu_zone) 435 | # ###tpu_cluster_resolver: ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470 436 | 437 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 438 | run_config = tf.contrib.tpu.RunConfig( 439 | keep_checkpoint_max=20, # 10 440 | cluster=tpu_cluster_resolver, 441 | master=FLAGS.master, 442 | model_dir=FLAGS.output_dir, 443 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 444 | tpu_config=tf.contrib.tpu.TPUConfig( 445 | iterations_per_loop=FLAGS.iterations_per_loop, 446 | num_shards=FLAGS.num_tpu_cores, 447 | per_host_input_for_training=is_per_host)) 448 | 449 | model_fn = model_fn_builder( 450 | bert_config=bert_config, 451 | init_checkpoint=FLAGS.init_checkpoint, 452 | learning_rate=FLAGS.learning_rate, 453 | num_train_steps=FLAGS.num_train_steps, 454 | num_warmup_steps=FLAGS.num_warmup_steps, 455 | use_tpu=FLAGS.use_tpu, 456 | use_one_hot_embeddings=FLAGS.use_tpu) 457 | 458 | # If TPU is not available, this will fall back to normal Estimator on CPU 459 | # or GPU.
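# --- Illustrative TF1-style sketch (not part of the original file; assumes the
# TensorFlow 1.x APIs already used throughout run_pretraining.py) of what
# _decode_record above does: features are stored in the TFRecords as int64,
# parsed with FixedLenFeature, and then cast to int32 because the TPU does not
# support int64. The feature spec and values below are hypothetical.
import tensorflow as tf

serialized = tf.train.Example(features=tf.train.Features(feature={
    "input_ids": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[101, 2769, 102, 0])),
})).SerializeToString()

parsed = tf.parse_single_example(
    serialized, {"input_ids": tf.FixedLenFeature([4], tf.int64)})
input_ids_int32 = tf.to_int32(parsed["input_ids"])    # int64 -> int32 for the TPU

with tf.Session() as sess:
    print(sess.run(input_ids_int32))                  # [101 2769 102 0]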
460 | estimator = tf.contrib.tpu.TPUEstimator( 461 | use_tpu=FLAGS.use_tpu, 462 | model_fn=model_fn, 463 | config=run_config, 464 | train_batch_size=FLAGS.train_batch_size, 465 | eval_batch_size=FLAGS.eval_batch_size) 466 | 467 | if FLAGS.do_train: 468 | tf.logging.info("***** Running training *****") 469 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 470 | train_input_fn = input_fn_builder( 471 | input_files=input_files, 472 | max_seq_length=FLAGS.max_seq_length, 473 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 474 | is_training=True) 475 | estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) 476 | 477 | if FLAGS.do_eval: 478 | tf.logging.info("***** Running evaluation *****") 479 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 480 | 481 | eval_input_fn = input_fn_builder( 482 | input_files=input_files, 483 | max_seq_length=FLAGS.max_seq_length, 484 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 485 | is_training=False) 486 | 487 | result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) 488 | 489 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 490 | with tf.gfile.GFile(output_eval_file, "w") as writer: 491 | tf.logging.info("***** Eval results *****") 492 | for key in sorted(result.keys()): 493 | tf.logging.info(" %s = %s", key, str(result[key])) 494 | writer.write("%s = %s\n" % (key, str(result[key]))) 495 | 496 | 497 | if __name__ == "__main__": 498 | flags.mark_flag_as_required("input_file") 499 | flags.mark_flag_as_required("bert_config_file") 500 | flags.mark_flag_as_required("output_dir") 501 | tf.app.run() 502 | -------------------------------------------------------------------------------- /data/test.txt: -------------------------------------------------------------------------------- 1 | 美 B-LOC 2 | 国 I-LOC 3 | 的 O 4 | 华 B-PER 5 | 莱 B-PER 6 | 士 B-PER 7 | , O 8 | 我 O 9 | 和 O 10 | 他 O 11 | 谈 O 12 | 笑 O 13 | 风 O 14 | 生 O 15 | 。 O 16 | 17 | 看 O 18 | 包 B-PER 19 | 公 I-PER 20 | 断 O 21 | 案 O 22 | 的 O 23 | 戏 O 24 | , O 25 | 看 O 26 | 他 O 27 | 威 O 28 | 风 O 29 | 凛 O 30 | 凛 O 31 | 坐 O 32 | 公 O 33 | 堂 O 34 | 拍 O 35 | 桌 O 36 | 子 O 37 | 动 O 38 | 刑 O 39 | 具 O 40 | , O 41 | 多 O 42 | 少 O 43 | 还 O 44 | 有 O 45 | 一 O 46 | 点 O 47 | 担 O 48 | 心 O 49 | , O 50 | 总 O 51 | 怕 O 52 | 靠 O 53 | 这 O 54 | 一 O 55 | 套 O 56 | 办 O 57 | 法 O 58 | 弄 O 59 | 出 O 60 | 错 O 61 | 案 O 62 | 来 O 63 | , O 64 | 放 O 65 | 过 O 66 | 了 O 67 | 真 O 68 | 正 O 69 | 的 O 70 | 坏 O 71 | 人 O 72 | ; O 73 | 74 | 可 O 75 | 看 O 76 | 《 O 77 | 包 B-PER 78 | 公 I-PER 79 | 赶 O 80 | 驴 O 81 | 》 O 82 | 这 O 83 | 出 O 84 | 戏 O 85 | , O 86 | 心 O 87 | 里 O 88 | 就 O 89 | 很 O 90 | 踏 O 91 | 实 O 92 | : O 93 | 这 O 94 | 样 O 95 | 是 O 96 | 一 O 97 | 断 O 98 | 一 O 99 | 个 O 100 | 准 O 101 | 的 O 102 | 。 O 103 | 104 | 譬 O 105 | 如 O 106 | 看 O 107 | 《 O 108 | 施 B-PER 109 | 公 O 110 | 案 O 111 | 》 O 112 | , O 113 | 施 B-PER 114 | 大 O 115 | 人 O 116 | 坐 O 117 | 公 O 118 | 堂 O 119 | 问 O 120 | 案 O 121 | 子 O 122 | 不 O 123 | 得 O 124 | 要 O 125 | 领 O 126 | , O 127 | 总 O 128 | 是 O 129 | 扮 O 130 | 成 O 131 | 普 O 132 | 通 O 133 | 百 O 134 | 姓 O 135 | 深 O 136 | 入 O 137 | 民 O 138 | 间 O 139 | 暗 O 140 | 中 O 141 | 查 O 142 | 访 O 143 | , O 144 | 结 O 145 | 果 O 146 | 就 O 147 | 屡 O 148 | 破 O 149 | 奇 O 150 | 案 O 151 | 了 O 152 | 。 O 153 | 154 | 如 O 155 | 果 O 156 | 有 O 157 | 人 O 158 | 问 O 159 | 我 O 160 | : O 161 | “ O 162 | 你 O 163 | 看 O 164 | 过 O 165 | 许 O 166 | 多 O 167 | 包 B-PER 168 | 公 I-PER 169 | 戏 O 170 | , O 171 | 哪 O 172 | 一 O 173 | 出 O 174 | 最 O 175 | 好 O 176 | ? 
O 177 | ” O 178 | 179 | 我 O 180 | 要 O 181 | 毫 O 182 | 不 O 183 | 犹 O 184 | 豫 O 185 | 地 O 186 | 回 O 187 | 答 O 188 | 道 O 189 | : O 190 | “ O 191 | 自 O 192 | 然 O 193 | 是 O 194 | 《 O 195 | 包 B-PER 196 | 公 I-PER 197 | 赶 O 198 | 驴 O 199 | 》 O 200 | 啦 O 201 | ! O 202 | 203 | 包 B-PER 204 | 公 I-PER 205 | 毕 O 206 | 竟 O 207 | 是 O 208 | 包 B-PER 209 | 公 I-PER 210 | , O 211 | 若 O 212 | 是 O 213 | 换 O 214 | 了 O 215 | 好 O 216 | 摆 O 217 | 身 O 218 | 份 O 219 | 的 O 220 | 什 O 221 | 么 O 222 | 公 O 223 | , O 224 | 便 O 225 | 要 O 226 | 先 O 227 | 派 O 228 | 人 O 229 | 通 O 230 | 报 O 231 | , O 232 | 然 O 233 | 后 O 234 | 由 O 235 | 卫 O 236 | 士 O 237 | 前 O 238 | 呼 O 239 | 后 O 240 | 拥 O 241 | 而 O 242 | 去 O 243 | , O 244 | 如 O 245 | 何 O 246 | 查 O 247 | 得 O 248 | 出 O 249 | 实 O 250 | 情 O 251 | ! O 252 | ” O 253 | ( O 254 | 马 B-PER 255 | 得 I-PER 256 | / O 257 | 画 O 258 | ) O 259 | 260 | 学 O 261 | 习 O 262 | 基 O 263 | 本 O 264 | 法 O 265 | 顺 O 266 | 利 O 267 | 迎 O 268 | 回 O 269 | 归 O 270 | 271 | 本 O 272 | 报 O 273 | 评 O 274 | 论 O 275 | 员 O 276 | 277 | 再 O 278 | 过 O 279 | 5 O 280 | 5 O 281 | 天 O 282 | , O 283 | 我 O 284 | 国 O 285 | 政 O 286 | 府 O 287 | 将 O 288 | 对 O 289 | 香 B-LOC 290 | 港 I-LOC 291 | 恢 O 292 | 复 O 293 | 行 O 294 | 使 O 295 | 主 O 296 | 权 O 297 | 。 O 298 | 299 | 在 O 300 | 香 B-LOC 301 | 港 I-LOC 302 | 回 O 303 | 归 O 304 | 前 O 305 | 的 O 306 | 最 O 307 | 后 O 308 | 阶 O 309 | 段 O 310 | , O 311 | 中 B-ORG 312 | 共 I-ORG 313 | 中 I-ORG 314 | 央 I-ORG 315 | 举 O 316 | 办 O 317 | 《 O 318 | “ O 319 | 一 O 320 | 国 O 321 | 两 O 322 | 制 O 323 | ” O 324 | 与 O 325 | 香 B-LOC 326 | 港 I-LOC 327 | 基 O 328 | 本 O 329 | 法 O 330 | 》 O 331 | 讲 O 332 | 座 O 333 | , O 334 | 中 O 335 | 央 O 336 | 领 O 337 | 导 O 338 | 同 O 339 | 志 O 340 | 认 O 341 | 真 O 342 | 听 O 343 | 讲 O 344 | , O 345 | 虚 O 346 | 心 O 347 | 学 O 348 | 习 O 349 | , O 350 | 很 O 351 | 有 O 352 | 意 O 353 | 义 O 354 | 。 O 355 | 356 | 这 O 357 | 表 O 358 | 明 O 359 | , O 360 | 以 O 361 | 江 B-PER 362 | 泽 I-PER 363 | 民 I-PER 364 | 同 O 365 | 志 O 366 | 为 O 367 | 核 O 368 | 心 O 369 | 的 O 370 | 党 B-ORG 371 | 中 I-ORG 372 | 央 I-ORG 373 | 坚 O 374 | 定 O 375 | 不 O 376 | 移 O 377 | 地 O 378 | 贯 O 379 | 彻 O 380 | 邓 B-PER 381 | 小 I-PER 382 | 平 I-PER 383 | 同 O 384 | 志 O 385 | “ O 386 | 一 O 387 | 国 O 388 | 两 O 389 | 制 O 390 | ” O 391 | 的 O 392 | 伟 O 393 | 大 O 394 | 构 O 395 | 想 O 396 | , O 397 | 不 O 398 | 折 O 399 | 不 O 400 | 扣 O 401 | 地 O 402 | 执 O 403 | 行 O 404 | 基 O 405 | 本 O 406 | 法 O 407 | 。 O 408 | 409 | “ O 410 | 一 O 411 | 国 O 412 | 两 O 413 | 制 O 414 | ” O 415 | 是 O 416 | 邓 B-PER 417 | 小 I-PER 418 | 平 I-PER 419 | 同 O 420 | 志 O 421 | 的 O 422 | 一 O 423 | 个 O 424 | 伟 O 425 | 大 O 426 | 构 O 427 | 想 O 428 | , O 429 | 《 O 430 | 中 B-LOC 431 | 华 I-LOC 432 | 人 I-LOC 433 | 民 I-LOC 434 | 共 I-LOC 435 | 和 I-LOC 436 | 国 I-LOC 437 | 香 B-LOC 438 | 港 I-LOC 439 | 特 I-LOC 440 | 别 I-LOC 441 | 行 I-LOC 442 | 政 I-LOC 443 | 区 I-LOC 444 | 基 O 445 | 本 O 446 | 法 O 447 | 》 O 448 | 是 O 449 | 贯 O 450 | 彻 O 451 | 落 O 452 | 实 O 453 | “ O 454 | 一 O 455 | 国 O 456 | 两 O 457 | 制 O 458 | ” O 459 | 伟 O 460 | 大 O 461 | 构 O 462 | 想 O 463 | 的 O 464 | 一 O 465 | 部 O 466 | 全 O 467 | 国 O 468 | 性 O 469 | 法 O 470 | 律 O 471 | , O 472 | 是 O 473 | 一 O 474 | 部 O 475 | 有 O 476 | 鲜 O 477 | 明 O 478 | 中 B-LOC 479 | 国 I-LOC 480 | 特 O 481 | 色 O 482 | 的 O 483 | 法 O 484 | 律 O 485 | 。 O 486 | 487 | 它 O 488 | 把 O 489 | 中 O 490 | 央 O 491 | 对 O 492 | 解 O 493 | 决 O 494 | 香 B-LOC 495 | 港 I-LOC 496 | 问 O 497 | 题 O 498 | 的 O 499 | 基 O 500 | 本 O 501 | 方 O 502 | 针 O 503 | 政 O 504 | 策 O 505 | 具 O 506 | 体 O 507 | 化 O 508 | 、 O 509 | 法 O 510 | 律 O 511 | 化 O 512 | , O 513 | 成 O 514 | 为 O 515 | 国 O 516 | 
家 O 517 | 意 O 518 | 志 O 519 | 。 O 520 | 521 | 学 O 522 | 习 O 523 | 基 O 524 | 本 O 525 | 法 O 526 | , O 527 | 顺 O 528 | 利 O 529 | 迎 O 530 | 回 O 531 | 归 O 532 | , O 533 | 是 O 534 | 一 O 535 | 项 O 536 | 迫 O 537 | 切 O 538 | 的 O 539 | 任 O 540 | 务 O 541 | 。 O 542 | 543 | 要 O 544 | 学 O 545 | 好 O 546 | 基 O 547 | 本 O 548 | 法 O 549 | , O 550 | 首 O 551 | 先 O 552 | 要 O 553 | 认 O 554 | 识 O 555 | 到 O 556 | 基 O 557 | 本 O 558 | 法 O 559 | 的 O 560 | 意 O 561 | 义 O 562 | 。 O 563 | 564 | 邓 B-PER 565 | 小 I-PER 566 | 平 I-PER 567 | 同 O 568 | 志 O 569 | 生 O 570 | 前 O 571 | 高 O 572 | 度 O 573 | 评 O 574 | 价 O 575 | 这 O 576 | 部 O 577 | 法 O 578 | 律 O 579 | , O 580 | 他 O 581 | 指 O 582 | 出 O 583 | : O 584 | “ O 585 | 说 O 586 | 它 O 587 | 具 O 588 | 有 O 589 | 历 O 590 | 史 O 591 | 意 O 592 | 义 O 593 | , O 594 | 不 O 595 | 只 O 596 | 对 O 597 | 过 O 598 | 去 O 599 | 、 O 600 | 现 O 601 | 在 O 602 | , O 603 | 而 O 604 | 且 O 605 | 包 O 606 | 括 O 607 | 将 O 608 | 来 O 609 | ; O 610 | 611 | 说 O 612 | 国 O 613 | 际 O 614 | 意 O 615 | 义 O 616 | , O 617 | 不 O 618 | 只 O 619 | 对 O 620 | 第 O 621 | 三 O 622 | 世 O 623 | 界 O 624 | , O 625 | 而 O 626 | 且 O 627 | 对 O 628 | 全 O 629 | 人 O 630 | 类 O 631 | 都 O 632 | 具 O 633 | 有 O 634 | 长 O 635 | 远 O 636 | 意 O 637 | 义 O 638 | 。 O 639 | 640 | 这 O 641 | 是 O 642 | 一 O 643 | 个 O 644 | 具 O 645 | 有 O 646 | 创 O 647 | 造 O 648 | 性 O 649 | 的 O 650 | 杰 O 651 | 作 O 652 | 。 O 653 | 654 | “ O 655 | 基 O 656 | 本 O 657 | 法 O 658 | 不 O 659 | 仅 O 660 | 为 O 661 | 确 O 662 | 保 O 663 | 香 B-LOC 664 | 港 I-LOC 665 | 平 O 666 | 稳 O 667 | 过 O 668 | 渡 O 669 | 发 O 670 | 挥 O 671 | 重 O 672 | 要 O 673 | 作 O 674 | 用 O 675 | , O 676 | 也 O 677 | 为 O 678 | 确 O 679 | 保 O 680 | 香 B-LOC 681 | 港 I-LOC 682 | 长 O 683 | 期 O 684 | 繁 O 685 | 荣 O 686 | 稳 O 687 | 定 O 688 | 发 O 689 | 挥 O 690 | 重 O 691 | 要 O 692 | 作 O 693 | 用 O 694 | ; O 695 | 696 | 不 O 697 | 仅 O 698 | 为 O 699 | 当 O 700 | 前 O 701 | 解 O 702 | 决 O 703 | 香 B-LOC 704 | 港 I-LOC 705 | 问 O 706 | 题 O 707 | 发 O 708 | 挥 O 709 | 作 O 710 | 用 O 711 | , O 712 | 也 O 713 | 为 O 714 | 在 O 715 | 不 O 716 | 远 O 717 | 的 O 718 | 将 O 719 | 来 O 720 | 解 O 721 | 决 O 722 | 澳 B-LOC 723 | 门 I-LOC 724 | 问 O 725 | 题 O 726 | 和 O 727 | 最 O 728 | 终 O 729 | 解 O 730 | 决 O 731 | 台 B-LOC 732 | 湾 I-LOC 733 | 问 O 734 | 题 O 735 | , O 736 | 实 O 737 | 现 O 738 | 祖 O 739 | 国 O 740 | 完 O 741 | 全 O 742 | 统 O 743 | 一 O 744 | 发 O 745 | 挥 O 746 | 重 O 747 | 要 O 748 | 作 O 749 | 用 O 750 | 。 O 751 | 752 | 基 O 753 | 本 O 754 | 法 O 755 | 的 O 756 | 主 O 757 | 要 O 758 | 特 O 759 | 征 O 760 | , O 761 | 是 O 762 | 把 O 763 | “ O 764 | 一 O 765 | 国 O 766 | ” O 767 | 与 O 768 | “ O 769 | 两 O 770 | 制 O 771 | ” O 772 | 紧 O 773 | 密 O 774 | 结 O 775 | 合 O 776 | , O 777 | 维 O 778 | 护 O 779 | 国 O 780 | 家 O 781 | 的 O 782 | 主 O 783 | 权 O 784 | 、 O 785 | 统 O 786 | 一 O 787 | 和 O 788 | 领 O 789 | 土 O 790 | 完 O 791 | 整 O 792 | 与 O 793 | 授 O 794 | 权 O 795 | 香 B-LOC 796 | 港 I-LOC 797 | 特 I-LOC 798 | 别 I-LOC 799 | 行 I-LOC 800 | 政 I-LOC 801 | 区 I-LOC 802 | 实 O 803 | 行 O 804 | 高 O 805 | 度 O 806 | 自 O 807 | 治 O 808 | 紧 O 809 | 密 O 810 | 结 O 811 | 合 O 812 | 。 O 813 | 814 | 在 O 815 | 一 O 816 | 个 O 817 | 统 O 818 | 一 O 819 | 的 O 820 | 中 B-LOC 821 | 华 I-LOC 822 | 人 I-LOC 823 | 民 I-LOC 824 | 共 I-LOC 825 | 和 I-LOC 826 | 国 I-LOC 827 | , O 828 | 可 O 829 | 以 O 830 | 实 O 831 | 行 O 832 | 社 O 833 | 会 O 834 | 主 O 835 | 义 O 836 | 和 O 837 | 资 O 838 | 本 O 839 | 主 O 840 | 义 O 841 | 两 O 842 | 种 O 843 | 制 O 844 | 度 O 845 | , O 846 | 这 O 847 | 是 O 848 | 为 O 849 | 了 O 850 | 民 O 851 | 族 O 852 | 、 O 853 | 国 O 854 | 家 O 855 | 的 O 856 | 根 O 857 | 本 O 858 | 利 O 859 | 益 O 860 | 。 O 861 | 862 | 只 O 863 | 有 O 864 | 认 O 
865 | 真 O 866 | 学 O 867 | 习 O 868 | , O 869 | 才 O 870 | 能 O 871 | 理 O 872 | 解 O 873 | 意 O 874 | 义 O 875 | , O 876 | 认 O 877 | 识 O 878 | 特 O 879 | 征 O 880 | 。 O 881 | 882 | 制 O 883 | 定 O 884 | 一 O 885 | 部 O 886 | 好 O 887 | 法 O 888 | 律 O 889 | , O 890 | 很 O 891 | 不 O 892 | 容 O 893 | 易 O 894 | ; O 895 | 896 | 遵 O 897 | 守 O 898 | 法 O 899 | 律 O 900 | , O 901 | 执 O 902 | 行 O 903 | 法 O 904 | 律 O 905 | , O 906 | 也 O 907 | 很 O 908 | 不 O 909 | 容 O 910 | 易 O 911 | 。 O 912 | 913 | 必 O 914 | 须 O 915 | 重 O 916 | 申 O 917 | , O 918 | 有 O 919 | 法 O 920 | 必 O 921 | 依 O 922 | , O 923 | 执 O 924 | 法 O 925 | 必 O 926 | 严 O 927 | , O 928 | 违 O 929 | 法 O 930 | 必 O 931 | 究 O 932 | 。 O 933 | 934 | 基 O 935 | 本 O 936 | 法 O 937 | 作 O 938 | 为 O 939 | 一 O 940 | 部 O 941 | 全 O 942 | 国 O 943 | 性 O 944 | 的 O 945 | 法 O 946 | 律 O 947 | , O 948 | 不 O 949 | 仅 O 950 | 香 B-LOC 951 | 港 I-LOC 952 | 要 O 953 | 严 O 954 | 格 O 955 | 遵 O 956 | 守 O 957 | , O 958 | 各 O 959 | 省 O 960 | 、 O 961 | 自 O 962 | 治 O 963 | 区 O 964 | 、 O 965 | 直 O 966 | 辖 O 967 | 市 O 968 | 都 O 969 | 要 O 970 | 严 O 971 | 格 O 972 | 遵 O 973 | 守 O 974 | 。 O 975 | 976 | 从 O 977 | 中 B-ORG 978 | 共 I-ORG 979 | 中 I-ORG 980 | 央 I-ORG 981 | 举 O 982 | 办 O 983 | 这 O 984 | 个 O 985 | 讲 O 986 | 座 O 987 | , O 988 | 可 O 989 | 以 O 990 | 看 O 991 | 出 O 992 | , O 993 | 党 O 994 | 和 O 995 | 政 O 996 | 府 O 997 | 正 O 998 | 在 O 999 | 努 O 1000 | 力 O 1001 | 加 O 1002 | 强 O 1003 | 法 O 1004 | 制 O 1005 | 建 O 1006 | 设 O 1007 | , O 1008 | 坚 O 1009 | 持 O 1010 | 依 O 1011 | 法 O 1012 | 治 O 1013 | 国 O 1014 | 。 O 1015 | 1016 | 有 O 1017 | 了 O 1018 | 法 O 1019 | 律 O 1020 | , O 1021 | 有 O 1022 | 了 O 1023 | 制 O 1024 | 度 O 1025 | , O 1026 | 就 O 1027 | 有 O 1028 | 了 O 1029 | 保 O 1030 | 证 O 1031 | , O 1032 | 就 O 1033 | 使 O 1034 | “ O 1035 | 一 O 1036 | 国 O 1037 | 两 O 1038 | 制 O 1039 | ” O 1040 | 的 O 1041 | 伟 O 1042 | 大 O 1043 | 构 O 1044 | 想 O 1045 | 以 O 1046 | 法 O 1047 | 律 O 1048 | 的 O 1049 | 形 O 1050 | 式 O 1051 | 固 O 1052 | 定 O 1053 | 下 O 1054 | 来 O 1055 | 。 O 1056 | 1057 | 全 O 1058 | 国 O 1059 | 人 O 1060 | 民 O 1061 | 特 O 1062 | 别 O 1063 | 是 O 1064 | 香 B-LOC 1065 | 港 I-LOC 1066 | 同 O 1067 | 胞 O 1068 | 也 O 1069 | 从 O 1070 | 中 O 1071 | 再 O 1072 | 一 O 1073 | 次 O 1074 | 看 O 1075 | 到 O 1076 | , O 1077 | 中 B-ORG 1078 | 国 I-ORG 1079 | 共 I-ORG 1080 | 产 I-ORG 1081 | 党 I-ORG 1082 | 和 O 1083 | 人 O 1084 | 民 O 1085 | 政 O 1086 | 府 O 1087 | 是 O 1088 | 高 O 1089 | 度 O 1090 | 负 O 1091 | 责 O 1092 | 任 O 1093 | 的 O 1094 | 党 O 1095 | 和 O 1096 | 政 O 1097 | 府 O 1098 | , O 1099 | 一 O 1100 | 切 O 1101 | 从 O 1102 | 人 O 1103 | 民 O 1104 | 的 O 1105 | 利 O 1106 | 益 O 1107 | 出 O 1108 | 发 O 1109 | , O 1110 | 一 O 1111 | 切 O 1112 | 为 O 1113 | 了 O 1114 | 祖 O 1115 | 国 O 1116 | 的 O 1117 | 繁 O 1118 | 荣 O 1119 | 富 O 1120 | 强 O 1121 | , O 1122 | 香 B-LOC 1123 | 港 I-LOC 1124 | 的 O 1125 | 明 O 1126 | 天 O 1127 | 将 O 1128 | 更 O 1129 | 美 O 1130 | 好 O 1131 | 。 O 1132 | 1133 | 学 O 1134 | 习 O 1135 | 基 O 1136 | 本 O 1137 | 法 O 1138 | , O 1139 | 中 O 1140 | 央 O 1141 | 领 O 1142 | 导 O 1143 | 带 O 1144 | 了 O 1145 | 个 O 1146 | 好 O 1147 | 头 O 1148 | 。 O 1149 | 1150 | 全 O 1151 | 党 O 1152 | 和 O 1153 | 全 O 1154 | 国 O 1155 | 人 O 1156 | 民 O 1157 | 特 O 1158 | 别 O 1159 | 是 O 1160 | 各 O 1161 | 级 O 1162 | 党 O 1163 | 政 O 1164 | 领 O 1165 | 导 O 1166 | 干 O 1167 | 部 O 1168 | , O 1169 | 都 O 1170 | 要 O 1171 | 重 O 1172 | 视 O 1173 | 学 O 1174 | 习 O 1175 | 。 O 1176 | 1177 | 只 O 1178 | 有 O 1179 | 学 O 1180 | 习 O 1181 | 好 O 1182 | , O 1183 | 才 O 1184 | 能 O 1185 | 贯 O 1186 | 彻 O 1187 | 好 O 1188 | 。 O 1189 | 1190 | 为 O 1191 | 了 O 1192 | 迎 O 1193 | 接 O 1194 | 香 B-LOC 1195 | 港 I-LOC 1196 | 顺 O 1197 | 利 O 
1198 | 回 O 1199 | 归 O 1200 | 祖 O 1201 | 国 O 1202 | 这 O 1203 | 一 O 1204 | 中 B-LOC 1205 | 华 I-LOC 1206 | 民 O 1207 | 族 O 1208 | 的 O 1209 | 盛 O 1210 | 事 O 1211 | , O 1212 | 首 O 1213 | 先 O 1214 | 要 O 1215 | 有 O 1216 | 一 O 1217 | 个 O 1218 | 扎 O 1219 | 实 O 1220 | 的 O 1221 | 思 O 1222 | 想 O 1223 | 准 O 1224 | 备 O 1225 | 和 O 1226 | 良 O 1227 | 好 O 1228 | 的 O 1229 | 精 O 1230 | 神 O 1231 | 状 O 1232 | 态 O 1233 | 。 O 1234 | 1235 | 基 O 1236 | 本 O 1237 | 法 O 1238 | 连 O 1239 | 着 O 1240 | 你 O 1241 | 我 O 1242 | 他 O 1243 | 1244 | 叶 B-PER 1245 | 秋 I-PER 1246 | 1247 | 赠 O 1248 | 书 O 1249 | 想 O 1250 | 来 O 1251 | 是 O 1252 | 香 B-LOC 1253 | 港 I-LOC 1254 | 同 O 1255 | 胞 O 1256 | 的 O 1257 | 一 O 1258 | 种 O 1259 | 文 O 1260 | 明 O 1261 | 礼 O 1262 | 仪 O 1263 | 。 O 1264 | 1265 | 抵 O 1266 | 港 B-LOC 1267 | 仅 O 1268 | 数 O 1269 | 日 O 1270 | , O 1271 | 就 O 1272 | 收 O 1273 | 到 O 1274 | 厚 O 1275 | 厚 O 1276 | 几 O 1277 | 摞 O 1278 | 书 O 1279 | 。 O 1280 | 1281 | 匆 O 1282 | 匆 O 1283 | 翻 O 1284 | 阅 O 1285 | 一 O 1286 | 遍 O 1287 | , O 1288 | 发 O 1289 | 现 O 1290 | 各 O 1291 | 种 O 1292 | 版 O 1293 | 本 O 1294 | 的 O 1295 | 《 O 1296 | 中 B-LOC 1297 | 华 I-LOC 1298 | 人 I-LOC 1299 | 民 I-LOC 1300 | 共 I-LOC 1301 | 和 I-LOC 1302 | 国 I-LOC 1303 | 香 B-LOC 1304 | 港 I-LOC 1305 | 特 I-LOC 1306 | 别 I-LOC 1307 | 行 I-LOC 1308 | 政 I-LOC 1309 | 区 I-LOC 1310 | 基 O 1311 | 本 O 1312 | 法 O 1313 | 》 O 1314 | 竟 O 1315 | 有 O 1316 | 六 O 1317 | 册 O 1318 | 之 O 1319 | 多 O 1320 | , O 1321 | 推 O 1322 | 介 O 1323 | 普 O 1324 | 及 O 1325 | 基 O 1326 | 本 O 1327 | 法 O 1328 | 的 O 1329 | 书 O 1330 | 籍 O 1331 | 还 O 1332 | 要 O 1333 | 多 O 1334 | 。 O 1335 | 1336 | 应 O 1337 | 约 O 1338 | 去 O 1339 | 湾 B-LOC 1340 | 仔 I-LOC 1341 | 道 I-LOC 1342 | 谈 O 1343 | 事 O 1344 | , O 1345 | 路 O 1346 | 过 O 1347 | 一 O 1348 | 个 O 1349 | 名 O 1350 | 为 O 1351 | “ O 1352 | 艺 O 1353 | 美 O 1354 | ” O 1355 | 的 O 1356 | 书 O 1357 | 店 O 1358 | , O 1359 | 看 O 1360 | 到 O 1361 | 摆 O 1362 | 放 O 1363 | 在 O 1364 | 最 O 1365 | 抢 O 1366 | 眼 O 1367 | 位 O 1368 | 置 O 1369 | 的 O 1370 | 也 O 1371 | 是 O 1372 | 基 O 1373 | 本 O 1374 | 法 O 1375 | 及 O 1376 | 其 O 1377 | 推 O 1378 | 介 O 1379 | 图 O 1380 | 书 O 1381 | 。 O 1382 | 1383 | 迎 O 1384 | 街 O 1385 | 介 O 1386 | 绍 O 1387 | 新 O 1388 | 书 O 1389 | 的 O 1390 | 告 O 1391 | 示 O 1392 | 上 O 1393 | 醒 O 1394 | 目 O 1395 | 地 O 1396 | 写 O 1397 | 着 O 1398 | : O 1399 | “ O 1400 | 基 O 1401 | 本 O 1402 | 法 O 1403 | 连 O 1404 | 着 O 1405 | 你 O 1406 | 、 O 1407 | 我 O 1408 | 、 O 1409 | 他 O 1410 | , O 1411 | 让 O 1412 | 我 O 1413 | 们 O 1414 | 都 O 1415 | 来 O 1416 | 认 O 1417 | 识 O 1418 | 基 O 1419 | 本 O 1420 | 法 O 1421 | 。 O 1422 | ” O 1423 | 1424 | 由 O 1425 | 此 O 1426 | 可 O 1427 | 见 O 1428 | , O 1429 | 在 O 1430 | 法 O 1431 | 制 O 1432 | 观 O 1433 | 念 O 1434 | 很 O 1435 | 强 O 1436 | 的 O 1437 | 港 B-LOC 1438 | 人 O 1439 | 心 O 1440 | 目 O 1441 | 中 O 1442 | , O 1443 | 基 O 1444 | 本 O 1445 | 法 O 1446 | 具 O 1447 | 有 O 1448 | 极 O 1449 | 大 O 1450 | 的 O 1451 | 权 O 1452 | 威 O 1453 | 和 O 1454 | 尊 O 1455 | 严 O 1456 | 。 O 1457 | 1458 | 香 B-LOC 1459 | 港 I-LOC 1460 | 各 O 1461 | 界 O 1462 | 人 O 1463 | 士 O 1464 | 从 O 1465 | 各 O 1466 | 自 O 1467 | 的 O 1468 | 角 O 1469 | 度 O 1470 | 去 O 1471 | 高 O 1472 | 度 O 1473 | 评 O 1474 | 价 O 1475 | 它 O 1476 | 。 O 1477 | 1478 | 行 O 1479 | 政 O 1480 | 官 O 1481 | 员 O 1482 | 表 O 1483 | 示 O 1484 | : O 1485 | “ O 1486 | 香 B-LOC 1487 | 港 I-LOC 1488 | 继 O 1489 | 续 O 1490 | 繁 O 1491 | 荣 O 1492 | 稳 O 1493 | 定 O 1494 | 、 O 1495 | 实 O 1496 | 现 O 1497 | 香 B-LOC 1498 | 港 I-LOC 1499 | 梦 O 1500 | 的 O 1501 | 成 O 1502 | 功 O 1503 | 要 O 1504 | 素 O 1505 | , O 1506 | 在 O 1507 | 基 O 1508 | 本 O 1509 | 法 O 1510 | 中 O 1511 | 得 O 1512 | 到 O 1513 
| 了 O 1514 | 充 O 1515 | 分 O 1516 | 保 O 1517 | 证 O 1518 | 。 O 1519 | ” O 1520 | 1521 | 法 O 1522 | 律 O 1523 | 界 O 1524 | 人 O 1525 | 士 O 1526 | 认 O 1527 | 为 O 1528 | : O 1529 | “ O 1530 | 法 O 1531 | 治 O 1532 | 精 O 1533 | 神 O 1534 | 能 O 1535 | 否 O 1536 | 继 O 1537 | 续 O 1538 | 保 O 1539 | 持 O 1540 | , O 1541 | 基 O 1542 | 本 O 1543 | 法 O 1544 | 已 O 1545 | 作 O 1546 | 了 O 1547 | 明 O 1548 | 确 O 1549 | 规 O 1550 | 定 O 1551 | 。 O 1552 | 1553 | 只 O 1554 | 要 O 1555 | 恪 O 1556 | 守 O 1557 | 广 O 1558 | 大 O 1559 | 港 B-LOC 1560 | 人 O 1561 | 认 O 1562 | 受 O 1563 | 的 O 1564 | 香 B-LOC 1565 | 港 I-LOC 1566 | 法 O 1567 | 律 O 1568 | 体 O 1569 | 系 O 1570 | 中 O 1571 | 的 O 1572 | 这 O 1573 | 个 O 1574 | 总 O 1575 | 纲 O 1576 | 纪 O 1577 | 、 O 1578 | 总 O 1579 | 章 O 1580 | 程 O 1581 | , O 1582 | 香 B-LOC 1583 | 港 I-LOC 1584 | 将 O 1585 | 健 O 1586 | 步 O 1587 | 迈 O 1588 | 向 O 1589 | 新 O 1590 | 世 O 1591 | 纪 O 1592 | 。 O 1593 | ” O 1594 | 1595 | 劳 O 1596 | 工 O 1597 | 界 O 1598 | 的 O 1599 | 成 O 1600 | 员 O 1601 | 说 O 1602 | , O 1603 | 涉 O 1604 | 及 O 1605 | 保 O 1606 | 障 O 1607 | 劳 O 1608 | 工 O 1609 | 合 O 1610 | 法 O 1611 | 权 O 1612 | 益 O 1613 | 的 O 1614 | 条 O 1615 | 款 O 1616 | , O 1617 | “ O 1618 | 香 B-LOC 1619 | 港 I-LOC 1620 | 现 O 1621 | 在 O 1622 | 有 O 1623 | 的 O 1624 | , O 1625 | 基 O 1626 | 本 O 1627 | 法 O 1628 | 都 O 1629 | 保 O 1630 | 持 O 1631 | 了 O 1632 | ; O 1633 | 1634 | 香 B-LOC 1635 | 港 I-LOC 1636 | 现 O 1637 | 在 O 1638 | 没 O 1639 | 有 O 1640 | 的 O 1641 | , O 1642 | 基 O 1643 | 本 O 1644 | 法 O 1645 | 里 O 1646 | 也 O 1647 | 有 O 1648 | 了 O 1649 | 。 O 1650 | 1651 | 大 O 1652 | 家 O 1653 | 因 O 1654 | 此 O 1655 | 吃 O 1656 | 了 O 1657 | 定 O 1658 | 心 O 1659 | 丸 O 1660 | 。 O 1661 | ” O 1662 | 1663 | 基 O 1664 | 本 O 1665 | 法 O 1666 | 受 O 1667 | 到 O 1668 | 港 B-LOC 1669 | 人 O 1670 | 的 O 1671 | 普 O 1672 | 遍 O 1673 | 欢 O 1674 | 迎 O 1675 | 和 O 1676 | 高 O 1677 | 度 O 1678 | 重 O 1679 | 视 O 1680 | 是 O 1681 | 势 O 1682 | 所 O 1683 | 必 O 1684 | 然 O 1685 | 。 O 1686 | 1687 | 历 O 1688 | 时 O 1689 | 四 O 1690 | 年 O 1691 | 零 O 1692 | 八 O 1693 | 个 O 1694 | 月 O 1695 | 、 O 1696 | 凝 O 1697 | 聚 O 1698 | 了 O 1699 | 香 B-LOC 1700 | 港 I-LOC 1701 | 和 O 1702 | 内 O 1703 | 地 O 1704 | 无 O 1705 | 数 O 1706 | 人 O 1707 | 的 O 1708 | 智 O 1709 | 慧 O 1710 | 而 O 1711 | 制 O 1712 | 定 O 1713 | 的 O 1714 | 基 O 1715 | 本 O 1716 | 法 O 1717 | , O 1718 | 将 O 1719 | 邓 B-PER 1720 | 小 I-PER 1721 | 平 I-PER 1722 | 同 O 1723 | 志 O 1724 | 倡 O 1725 | 导 O 1726 | 的 O 1727 | “ O 1728 | 一 O 1729 | 国 O 1730 | 两 O 1731 | 制 O 1732 | ” O 1733 | 伟 O 1734 | 大 O 1735 | 构 O 1736 | 想 O 1737 | 以 O 1738 | 法 O 1739 | 律 O 1740 | 形 O 1741 | 式 O 1742 | 固 O 1743 | 定 O 1744 | 下 O 1745 | 来 O 1746 | , O 1747 | 成 O 1748 | 为 O 1749 | 国 O 1750 | 家 O 1751 | 和 O 1752 | 人 O 1753 | 民 O 1754 | 的 O 1755 | 意 O 1756 | 志 O 1757 | 。 O 1758 | 1759 | 邓 B-PER 1760 | 小 I-PER 1761 | 平 I-PER 1762 | 赞 O 1763 | 许 O 1764 | : O 1765 | 基 O 1766 | 本 O 1767 | 法 O 1768 | 具 O 1769 | 有 O 1770 | 历 O 1771 | 史 O 1772 | 意 O 1773 | 义 O 1774 | 和 O 1775 | 国 O 1776 | 际 O 1777 | 意 O 1778 | 义 O 1779 | , O 1780 | 是 O 1781 | 一 O 1782 | 个 O 1783 | 具 O 1784 | 有 O 1785 | 创 O 1786 | 造 O 1787 | 性 O 1788 | 的 O 1789 | 杰 O 1790 | 作 O 1791 | 。 O 1792 | 1793 | 基 O 1794 | 本 O 1795 | 法 O 1796 | 既 O 1797 | 是 O 1798 | 香 B-LOC 1799 | 港 I-LOC 1800 | 回 O 1801 | 归 O 1802 | 后 O 1803 | 特 B-LOC 1804 | 区 I-LOC 1805 | 一 O 1806 | 切 O 1807 | 运 O 1808 | 作 O 1809 | 的 O 1810 | 法 O 1811 | 律 O 1812 | 基 O 1813 | 础 O 1814 | , O 1815 | 更 O 1816 | 是 O 1817 | 保 O 1818 | 持 O 1819 | 香 B-LOC 1820 | 港 I-LOC 1821 | 长 O 1822 | 期 O 1823 | 稳 O 1824 | 定 O 1825 | 繁 O 1826 | 荣 O 1827 | 的 O 1828 | 法 O 1829 | 律 O 1830 | 保 O 
1831 | 证 O 1832 | 。 O 1833 | 1834 | 实 O 1835 | 践 O 1836 | 已 O 1837 | 经 O 1838 | 并 O 1839 | 将 O 1840 | 继 O 1841 | 续 O 1842 | 证 O 1843 | 明 O 1844 | 这 O 1845 | 一 O 1846 | 点 O 1847 | 。 O 1848 | 1849 | 董 B-PER 1850 | 建 I-PER 1851 | 华 I-PER 1852 | 先 O 1853 | 生 O 1854 | 近 O 1855 | 日 O 1856 | 撰 O 1857 | 文 O 1858 | 称 O 1859 | “ O 1860 | 基 O 1861 | 本 O 1862 | 法 O 1863 | 是 O 1864 | ‘ O 1865 | 一 O 1866 | 国 O 1867 | 两 O 1868 | 制 O 1869 | ’ O 1870 | 的 O 1871 | 一 O 1872 | 次 O 1873 | 成 O 1874 | 功 O 1875 | 实 O 1876 | 践 O 1877 | 。 O 1878 | ” O 1879 | 1880 | 说 O 1881 | 来 O 1882 | 也 O 1883 | 巧 O 1884 | , O 1885 | 姬 B-PER 1886 | 鹏 I-PER 1887 | 飞 I-PER 1888 | 同 O 1889 | 志 O 1890 | 1 O 1891 | 9 O 1892 | 9 O 1893 | 0 O 1894 | 年 O 1895 | 4 O 1896 | 月 O 1897 | 在 O 1898 | 邓 B-PER 1899 | 小 I-PER 1900 | 平 I-PER 1901 | 同 O 1902 | 志 O 1903 | 题 O 1904 | 写 O 1905 | 书 O 1906 | 名 O 1907 | 的 O 1908 | 《 O 1909 | 基 O 1910 | 本 O 1911 | 法 O 1912 | 的 O 1913 | 诞 O 1914 | 生 O 1915 | 》 O 1916 | 一 O 1917 | 书 O 1918 | 序 O 1919 | 言 O 1920 | 中 O 1921 | 也 O 1922 | 写 O 1923 | 了 O 1924 | 同 O 1925 | 样 O 1926 | 的 O 1927 | 话 O 1928 | 。 O 1929 | 1930 | 真 O 1931 | 可 O 1932 | 谓 O 1933 | 仁 O 1934 | 者 O 1935 | 智 O 1936 | 者 O 1937 | 所 O 1938 | 见 O 1939 | 略 O 1940 | 同 O 1941 | 。 O 1942 | 1943 | 基 O 1944 | 本 O 1945 | 法 O 1946 | 是 O 1947 | 一 O 1948 | 部 O 1949 | 具 O 1950 | 有 O 1951 | 普 O 1952 | 遍 O 1953 | 约 O 1954 | 束 O 1955 | 力 O 1956 | 的 O 1957 | 重 O 1958 | 要 O 1959 | 法 O 1960 | 律 O 1961 | 。 O 1962 | 1963 | 7 O 1964 | 月 O 1965 | 1 O 1966 | 日 O 1967 | , O 1968 | 这 O 1969 | 部 O 1970 | 重 O 1971 | 要 O 1972 | 法 O 1973 | 律 O 1974 | 即 O 1975 | 开 O 1976 | 始 O 1977 | 正 O 1978 | 式 O 1979 | 实 O 1980 | 施 O 1981 | 。 O 1982 | 1983 | 基 O 1984 | 本 O 1985 | 法 O 1986 | 不 O 1987 | 仅 O 1988 | 体 O 1989 | 现 O 1990 | 了 O 1991 | 香 B-LOC 1992 | 港 I-LOC 1993 | 同 O 1994 | 胞 O 1995 | 的 O 1996 | 意 O 1997 | 志 O 1998 | 和 O 1999 | 利 O 2000 | 益 O 2001 | , O 2002 | 也 O 2003 | 体 O 2004 | 现 O 2005 | 了 O 2006 | 全 O 2007 | 国 O 2008 | 人 O 2009 | 民 O 2010 | 的 O 2011 | 意 O 2012 | 志 O 2013 | 和 O 2014 | 利 O 2015 | 益 O 2016 | 。 O 2017 | 2018 | 正 O 2019 | 因 O 2020 | 为 O 2021 | 如 O 2022 | 此 O 2023 | , O 2024 | 江 B-PER 2025 | 泽 I-PER 2026 | 民 I-PER 2027 | 同 O 2028 | 志 O 2029 | 强 O 2030 | 调 O 2031 | : O 2032 | 香 B-LOC 2033 | 港 I-LOC 2034 | 基 O 2035 | 本 O 2036 | 法 O 2037 | 是 O 2038 | 一 O 2039 | 部 O 2040 | 全 O 2041 | 国 O 2042 | 性 O 2043 | 的 O 2044 | 法 O 2045 | 律 O 2046 | , O 2047 | 不 O 2048 | 仅 O 2049 | 香 B-LOC 2050 | 港 I-LOC 2051 | 要 O 2052 | 严 O 2053 | 格 O 2054 | 遵 O 2055 | 守 O 2056 | , O 2057 | 各 O 2058 | 省 O 2059 | 、 O 2060 | 自 O 2061 | 治 O 2062 | 区 O 2063 | 、 O 2064 | 直 O 2065 | 辖 O 2066 | 市 O 2067 | 都 O 2068 | 要 O 2069 | 严 O 2070 | 格 O 2071 | 遵 O 2072 | 守 O 2073 | 。 O 2074 | 2075 | 还 O 2076 | 表 O 2077 | 示 O 2078 | , O 2079 | 不 O 2080 | 仅 O 2081 | 我 O 2082 | 要 O 2083 | 遵 O 2084 | 守 O 2085 | , O 2086 | 我 O 2087 | 希 O 2088 | 望 O 2089 | 香 B-LOC 2090 | 港 I-LOC 2091 | 同 O 2092 | 胞 O 2093 | 和 O 2094 | 全 O 2095 | 国 O 2096 | 1 O 2097 | 2 O 2098 | 亿 O 2099 | 人 O 2100 | 民 O 2101 | 也 O 2102 | 要 O 2103 | 遵 O 2104 | 守 O 2105 | 。 O 2106 | 2107 | 学 O 2108 | 习 O 2109 | 、 O 2110 | 贯 O 2111 | 彻 O 2112 | 基 O 2113 | 本 O 2114 | 法 O 2115 | 的 O 2116 | 过 O 2117 | 程 O 2118 | , O 2119 | 无 O 2120 | 疑 O 2121 | 是 O 2122 | 增 O 2123 | 强 O 2124 | 法 O 2125 | 制 O 2126 | 观 O 2127 | 念 O 2128 | 、 O 2129 | 推 O 2130 | 进 O 2131 | 法 O 2132 | 制 O 2133 | 建 O 2134 | 设 O 2135 | 的 O 2136 | 过 O 2137 | 程 O 2138 | , O 2139 | 无 O 2140 | 疑 O 2141 | 是 O 2142 | 内 O 2143 | 地 O 2144 | 和 O 2145 | 香 B-LOC 2146 | 港 I-LOC 2147 | 在 O 2148 | 新 O 2149 | 的 
O 2150 | 征 O 2151 | 途 O 2152 | 上 O 2153 | 并 O 2154 | 肩 O 2155 | 同 O 2156 | 行 O 2157 | 、 O 2158 | 共 O 2159 | 创 O 2160 | 辉 O 2161 | 煌 O 2162 | 的 O 2163 | 过 O 2164 | 程 O 2165 | 。 O 2166 | 2167 | 法 O 2168 | 律 O 2169 | 一 O 2170 | 旦 O 2171 | 为 O 2172 | 人 O 2173 | 民 O 2174 | 群 O 2175 | 众 O 2176 | 所 O 2177 | 掌 O 2178 | 握 O 2179 | , O 2180 | 就 O 2181 | 会 O 2182 | 变 O 2183 | 成 O 2184 | 伟 O 2185 | 大 O 2186 | 的 O 2187 | 力 O 2188 | 量 O 2189 | 。 O 2190 | 2191 | 行 O 2192 | 文 O 2193 | 至 O 2194 | 此 O 2195 | , O 2196 | 我 O 2197 | 对 O 2198 | “ O 2199 | 基 O 2200 | 本 O 2201 | 法 O 2202 | 连 O 2203 | 着 O 2204 | 你 O 2205 | 我 O 2206 | 他 O 2207 | ” O 2208 | 有 O 2209 | 了 O 2210 | 更 O 2211 | 深 O 2212 | 刻 O 2213 | 、 O 2214 | 更 O 2215 | 真 O 2216 | 切 O 2217 | 的 O 2218 | 理 O 2219 | 解 O 2220 | 。 O 2221 | 2222 | 任 B-PER 2223 | 建 I-PER 2224 | 新 I-PER 2225 | 在 O 2226 | 向 O 2227 | 八 B-ORG 2228 | 届 I-ORG 2229 | 全 I-ORG 2230 | 国 I-ORG 2231 | 人 I-ORG 2232 | 大 I-ORG 2233 | 五 O 2234 | 次 O 2235 | 会 O 2236 | 议 O 2237 | 的 O 2238 | 报 O 2239 | 告 O 2240 | 中 O 2241 | 说 O 2242 | 坚 O 2243 | 持 O 2244 | 严 O 2245 | 肃 O 2246 | 执 O 2247 | 法 O 2248 | 提 O 2249 | 高 O 2250 | 司 O 2251 | 法 O 2252 | 水 O 2253 | 平 O 2254 | 2255 | 新 B-ORG 2256 | 华 I-ORG 2257 | 社 I-ORG 2258 | 北 B-LOC 2259 | 京 I-LOC 2260 | 3 O 2261 | 月 O 2262 | 1 O 2263 | 1 O 2264 | 日 O 2265 | 电 O 2266 | 最 B-ORG 2267 | 高 I-ORG 2268 | 人 I-ORG 2269 | 民 I-ORG 2270 | 法 I-ORG 2271 | 院 I-ORG 2272 | 院 O 2273 | 长 O 2274 | 任 B-PER 2275 | 建 I-PER 2276 | 新 I-PER 2277 | 今 O 2278 | 天 O 2279 | 在 O 2280 | 八 B-ORG 2281 | 届 I-ORG 2282 | 全 I-ORG 2283 | 国 I-ORG 2284 | 人 I-ORG 2285 | 大 I-ORG 2286 | 五 O 2287 | 次 O 2288 | 会 O 2289 | 议 O 2290 | 第 O 2291 | 五 O 2292 | 次 O 2293 | 全 O 2294 | 体 O 2295 | 会 O 2296 | 议 O 2297 | 作 O 2298 | 报 O 2299 | 告 O 2300 | 时 O 2301 | 说 O 2302 | , O 2303 | 严 O 2304 | 肃 O 2305 | 执 O 2306 | 法 O 2307 | 是 O 2308 | 社 O 2309 | 会 O 2310 | 主 O 2311 | 义 O 2312 | 法 O 2313 | 制 O 2314 | 建 O 2315 | 设 O 2316 | 的 O 2317 | 重 O 2318 | 要 O 2319 | 内 O 2320 | 容 O 2321 | , O 2322 | 是 O 2323 | 党 O 2324 | 和 O 2325 | 国 O 2326 | 家 O 2327 | 对 O 2328 | 司 O 2329 | 法 O 2330 | 活 O 2331 | 动 O 2332 | 的 O 2333 | 根 O 2334 | 本 O 2335 | 要 O 2336 | 求 O 2337 | 。 O 2338 | 2339 | 一 O 2340 | 年 O 2341 | 来 O 2342 | , O 2343 | 全 O 2344 | 国 O 2345 | 法 O 2346 | 院 O 2347 | 坚 O 2348 | 持 O 2349 | 严 O 2350 | 肃 O 2351 | 执 O 2352 | 法 O 2353 | 、 O 2354 | 努 O 2355 | 力 O 2356 | 提 O 2357 | 高 O 2358 | 司 O 2359 | 法 O 2360 | 水 O 2361 | 平 O 2362 | 。 O -------------------------------------------------------------------------------- /albert_ner.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | import collections 23 | import csv 24 | import os 25 | import modeling 26 | import optimization_finetuning as optimization 27 | import tokenization 28 | import tensorflow as tf 29 | import pickle 30 | import tf_metrics 31 | 32 | # from loss import bi_tempered_logistic_loss 33 | 34 | flags = tf.flags 35 | 36 | FLAGS = flags.FLAGS 37 | 38 | ## Required parameters 39 | flags.DEFINE_string( 40 | "data_dir", None, 41 | "The input data dir. Should contain the .tsv files (or other data files) " 42 | "for the task.") 43 | 44 | flags.DEFINE_string( 45 | "bert_config_file", None, 46 | "The config json file corresponding to the pre-trained BERT model. " 47 | "This specifies the model architecture.") 48 | 49 | flags.DEFINE_string("task_name", "NER", "The name of the task to train.") 50 | 51 | flags.DEFINE_string("vocab_file", None, 52 | "The vocabulary file that the BERT model was trained on.") 53 | 54 | flags.DEFINE_string( 55 | "output_dir", None, 56 | "The output directory where the model checkpoints will be written.") 57 | 58 | ## Other parameters 59 | 60 | flags.DEFINE_string( 61 | "init_checkpoint", "albert_base_zh/albert_model.ckpt", 62 | "Initial checkpoint (usually from a pre-trained BERT model).") 63 | 64 | flags.DEFINE_bool( 65 | "do_lower_case", True, 66 | "Whether to lower case the input text. Should be True for uncased " 67 | "models and False for cased models.") 68 | 69 | flags.DEFINE_integer( 70 | "max_seq_length", 128, 71 | "The maximum total input sequence length after WordPiece tokenization. " 72 | "Sequences longer than this will be truncated, and sequences shorter " 73 | "than this will be padded.") 74 | 75 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 76 | 77 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 78 | 79 | flags.DEFINE_bool( 80 | "do_predict", False, 81 | "Whether to run the model in inference mode on the test set.") 82 | 83 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 84 | 85 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 86 | 87 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 88 | 89 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 90 | 91 | flags.DEFINE_float("num_train_epochs", 3.0, 92 | "Total number of training epochs to perform.") 93 | 94 | flags.DEFINE_float( 95 | "warmup_proportion", 0.1, 96 | "Proportion of training to perform linear learning rate warmup for. " 97 | "E.g., 0.1 = 10% of training.") 98 | 99 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 100 | "How often to save the model checkpoint.") 101 | 102 | flags.DEFINE_integer("iterations_per_loop", 1000, 103 | "How many steps to make in each estimator call.") 104 | 105 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 106 | 107 | tf.flags.DEFINE_string( 108 | "tpu_name", None, 109 | "The Cloud TPU to use for training. This should be either the name " 110 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 111 | "url.") 112 | 113 | tf.flags.DEFINE_string( 114 | "tpu_zone", None, 115 | "[Optional] GCE zone where the Cloud TPU is located in. 
If not " 116 | "specified, we will attempt to automatically detect the GCE project from " 117 | "metadata.") 118 | 119 | tf.flags.DEFINE_string( 120 | "gcp_project", None, 121 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 122 | "specified, we will attempt to automatically detect the GCE project from " 123 | "metadata.") 124 | 125 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 126 | 127 | flags.DEFINE_integer( 128 | "num_tpu_cores", 8, 129 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 130 | 131 | 132 | class InputExample(object): 133 | """A single training/test example for simple sequence classification.""" 134 | 135 | def __init__(self, guid, text, label=None): 136 | """Constructs an InputExample. 137 | Args: 138 | guid: Unique id for the example. 139 | text: string. The untokenized text of the sequence. For this NER task it is 140 | the whitespace-separated character sequence of one sentence. 141 | label: (Optional) string. The whitespace-separated BIO labels aligned with 142 | `text`. 143 | This should be specified for train and dev examples, but not for test 144 | examples. 145 | """ 146 | self.guid = guid 147 | self.text = text 148 | self.label = label 149 | 150 | 151 | class PaddingInputExample(object): 152 | """Fake example so that the number of input examples is a multiple of the batch size. 153 | When running eval/predict on the TPU, we need to pad the number of examples 154 | to be a multiple of the batch size, because the TPU requires a fixed batch 155 | size. The alternative is to drop the last batch, which is bad because it means 156 | the entire output data won't be generated. 157 | We use this class instead of `None` because treating `None` as padding 158 | batches could cause silent errors. 
159 | """ 160 | 161 | 162 | class InputFeatures(object): 163 | """A single set of features of data.""" 164 | 165 | def __init__(self, 166 | input_ids, 167 | input_mask, 168 | segment_ids, 169 | label_ids): 170 | self.input_ids = input_ids 171 | self.input_mask = input_mask 172 | self.segment_ids = segment_ids 173 | self.label_ids = label_ids 174 | # self.is_real_example = is_real_example 175 | 176 | 177 | class DataProcessor(object): 178 | """Base class for data converters for sequence classification data sets.""" 179 | 180 | def get_train_examples(self, data_dir): 181 | """Gets a collection of `InputExample`s for the train set.""" 182 | raise NotImplementedError() 183 | 184 | def get_dev_examples(self, data_dir): 185 | """Gets a collection of `InputExample`s for the dev set.""" 186 | raise NotImplementedError() 187 | 188 | def get_test_examples(self, data_dir): 189 | """Gets a collection of `InputExample`s for prediction.""" 190 | raise NotImplementedError() 191 | 192 | def get_labels(self): 193 | """Gets the list of labels for this data set.""" 194 | raise NotImplementedError() 195 | 196 | @classmethod 197 | def _read_data(cls, input_file): 198 | """Reads a BIO data.""" 199 | with open(input_file, encoding='utf-8') as f: 200 | lines = [] 201 | words = [] 202 | labels = [] 203 | for line in f: 204 | contends = line.strip() 205 | word = line.strip().split(' ')[0] 206 | label = line.strip().split(' ')[-1] 207 | if contends.startswith("-DOCSTART-"): 208 | words.append('') 209 | continue 210 | # if len(contends) == 0 and words[-1] == '。': 211 | if len(contends) == 0: 212 | l = ' '.join([label for label in labels if len(label) > 0]) 213 | w = ' '.join([word for word in words if len(word) > 0]) 214 | lines.append([l, w]) 215 | words = [] 216 | labels = [] 217 | continue 218 | words.append(word) 219 | labels.append(label) 220 | return lines 221 | 222 | class NerProcessor(DataProcessor): 223 | def get_train_examples(self, data_dir): 224 | return self._create_example( 225 | self._read_data(os.path.join(data_dir, "train.txt")), "train" 226 | ) 227 | 228 | def get_dev_examples(self, data_dir): 229 | return self._create_example( 230 | self._read_data(os.path.join(data_dir, "dev.txt")), "dev" 231 | ) 232 | 233 | def get_test_examples(self,data_dir): 234 | return self._create_example( 235 | self._read_data(os.path.join(data_dir, "test.txt")), "test") 236 | 237 | 238 | def get_labels(self): 239 | # return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]","[SEP]"] 240 | return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X","[CLS]","[SEP]"] 241 | 242 | def _create_example(self, lines, set_type): 243 | examples = [] 244 | for (i, line) in enumerate(lines): 245 | guid = "%s-%s" % (set_type, i) 246 | text = tokenization.convert_to_unicode(line[1]) 247 | label = tokenization.convert_to_unicode(line[0]) 248 | examples.append(InputExample(guid=guid, text=text, label=label)) 249 | return examples 250 | 251 | 252 | def write_tokens(tokens,mode): 253 | if mode=="test": 254 | path = os.path.join(FLAGS.output_dir, "token_"+mode+".txt") 255 | wf = open(path,'a') 256 | for token in tokens: 257 | if token!="**NULL**": 258 | wf.write(token+'\n') 259 | wf.close() 260 | 261 | def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer,mode): 262 | textlist = example.text.split(' ') 263 | labellist = example.label.split(' ') 264 | tokens = [] 265 | labels = [] 266 | # print(textlist) 267 | for i, word in enumerate(textlist): 268 | token = tokenizer.tokenize(word) 
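# --- Illustrative sketch (plain Python, not part of the original file) of what
# DataProcessor._read_data above produces from character-per-line BIO files
# such as data/test.txt: every blank-line-separated block becomes one
# [label-string, word-string] pair that _create_example later wraps in an
# InputExample. The mini sample below is hypothetical but uses the same format.
sample = ["美 B-LOC", "国 I-LOC", "的 O", "", "看 O", ""]
lines_out, words, labels = [], [], []
for raw in sample:
    contends = raw.strip()
    if len(contends) == 0:                      # blank line closes the sentence
        lines_out.append([" ".join(labels), " ".join(words)])
        words, labels = [], []
        continue
    words.append(contends.split(" ")[0])        # first column: the character
    labels.append(contends.split(" ")[-1])      # last column: the BIO tag
print(lines_out)   # [['B-LOC I-LOC O', '美 国 的'], ['O', '看']]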
269 | # print(token) 270 | tokens.extend(token) 271 | label_1 = labellist[i] 272 | # print(label_1) 273 | for m in range(len(token)): 274 | if m == 0: 275 | labels.append(label_1) 276 | # else: 277 | # labels.append("X") 278 | # print(tokens, labels) 279 | # tokens = tokenizer.tokenize(example.text) 280 | if len(tokens) >= max_seq_length - 1: 281 | tokens = tokens[0:(max_seq_length - 2)] 282 | labels = labels[0:(max_seq_length - 2)] 283 | ntokens = [] 284 | segment_ids = [] 285 | label_ids = [] 286 | ntokens.append("[CLS]") 287 | segment_ids.append(0) 288 | # append("O") or append("[CLS]") not sure! 289 | label_ids.append(label_map["[CLS]"]) 290 | for i, token in enumerate(tokens): 291 | ntokens.append(token) 292 | segment_ids.append(0) 293 | label_ids.append(label_map[labels[i]]) 294 | ntokens.append("[SEP]") 295 | segment_ids.append(0) 296 | # append("O") or append("[SEP]") not sure! 297 | label_ids.append(label_map["[SEP]"]) 298 | input_ids = tokenizer.convert_tokens_to_ids(ntokens) 299 | input_mask = [1] * len(input_ids) 300 | #label_mask = [1] * len(input_ids) 301 | while len(input_ids) < max_seq_length: 302 | input_ids.append(0) 303 | input_mask.append(0) 304 | segment_ids.append(0) 305 | # we don't concerned about it! 306 | label_ids.append(0) 307 | ntokens.append("**NULL**") 308 | #label_mask.append(0) 309 | # print(len(input_ids)) 310 | assert len(input_ids) == max_seq_length 311 | assert len(input_mask) == max_seq_length 312 | assert len(segment_ids) == max_seq_length 313 | assert len(label_ids) == max_seq_length 314 | #assert len(label_mask) == max_seq_length 315 | 316 | if ex_index < 5: 317 | tf.logging.info("*** Example ***") 318 | tf.logging.info("guid: %s" % (example.guid)) 319 | tf.logging.info("tokens: %s" % " ".join( 320 | [tokenization.printable_text(x) for x in tokens])) 321 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 322 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 323 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 324 | tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) 325 | #tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) 326 | 327 | feature = InputFeatures( 328 | input_ids=input_ids, 329 | input_mask=input_mask, 330 | segment_ids=segment_ids, 331 | label_ids=label_ids, 332 | #label_mask = label_mask 333 | ) 334 | write_tokens(ntokens,mode) 335 | return feature 336 | 337 | 338 | def file_based_convert_examples_to_features( 339 | examples, label_list, max_seq_length, tokenizer, output_file, mode=None): 340 | """Convert a set of `InputExample`s to a TFRecord file.""" 341 | label_map = {} 342 | for (i, label) in enumerate(label_list,1): 343 | label_map[label] = i 344 | with open('albert_base_ner_checkpoints/label2id.pkl','wb') as w: 345 | pickle.dump(label_map,w) 346 | 347 | writer = tf.python_io.TFRecordWriter(output_file) 348 | 349 | for (ex_index, example) in enumerate(examples): 350 | if ex_index % 10000 == 0: 351 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 352 | 353 | feature = convert_single_example(ex_index, example, label_map, 354 | max_seq_length, tokenizer, mode) 355 | 356 | def create_int_feature(values): 357 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 358 | return f 359 | 360 | features = collections.OrderedDict() 361 | features["input_ids"] = create_int_feature(feature.input_ids) 362 | features["input_mask"] = create_int_feature(feature.input_mask) 363 | 
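# --- Illustrative sketch (plain Python, not part of the original file) of the
# label_map built above with enumerate(label_list, 1): the ten NER labels get
# ids 1..10, and id 0 stays reserved for the padded positions that
# convert_single_example fills with label_ids.append(0). This appears to be why
# the classification head in create_model below works with 11 classes.
label_list_ex = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
                 "X", "[CLS]", "[SEP]"]
label_map_ex = {label: i for i, label in enumerate(label_list_ex, 1)}
print(label_map_ex["O"], label_map_ex["[SEP]"], len(label_map_ex) + 1)   # 1 10 11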
features["segment_ids"] = create_int_feature(feature.segment_ids) 364 | features["label_ids"] = create_int_feature(feature.label_ids) 365 | # features["is_real_example"] = create_int_feature( 366 | # [int(feature.is_real_example)]) 367 | 368 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 369 | writer.write(tf_example.SerializeToString()) 370 | writer.close() 371 | 372 | 373 | def file_based_input_fn_builder(input_file, seq_length, is_training, 374 | drop_remainder): 375 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 376 | 377 | name_to_features = { 378 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 379 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 380 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 381 | "label_ids": tf.FixedLenFeature([seq_length], tf.int64), 382 | # "is_real_example": tf.FixedLenFeature([], tf.int64), 383 | } 384 | 385 | def _decode_record(record, name_to_features): 386 | """Decodes a record to a TensorFlow example.""" 387 | example = tf.parse_single_example(record, name_to_features) 388 | 389 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 390 | # So cast all int64 to int32. 391 | for name in list(example.keys()): 392 | t = example[name] 393 | if t.dtype == tf.int64: 394 | t = tf.to_int32(t) 395 | example[name] = t 396 | 397 | return example 398 | 399 | def input_fn(params): 400 | """The actual input function.""" 401 | batch_size = params["batch_size"] 402 | 403 | # For training, we want a lot of parallel reading and shuffling. 404 | # For eval, we want no shuffling and parallel reading doesn't matter. 405 | d = tf.data.TFRecordDataset(input_file) 406 | if is_training: 407 | d = d.repeat() 408 | d = d.shuffle(buffer_size=100) 409 | 410 | d = d.apply( 411 | tf.contrib.data.map_and_batch( 412 | lambda record: _decode_record(record, name_to_features), 413 | batch_size=batch_size, 414 | drop_remainder=drop_remainder)) 415 | 416 | return d 417 | 418 | return input_fn 419 | 420 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 421 | labels, num_labels, use_one_hot_embeddings): 422 | """Creates a classification model.""" 423 | model = modeling.BertModel( 424 | config=bert_config, 425 | is_training=is_training, 426 | input_ids=input_ids, 427 | input_mask=input_mask, 428 | token_type_ids=segment_ids, 429 | use_one_hot_embeddings=use_one_hot_embeddings) 430 | 431 | output_layer = model.get_sequence_output() 432 | 433 | hidden_size = output_layer.shape[-1].value 434 | 435 | output_weight = tf.get_variable( 436 | "output_weights", [num_labels, hidden_size], 437 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 438 | 439 | output_bias = tf.get_variable( 440 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 441 | 442 | with tf.variable_scope("loss"): 443 | if is_training: 444 | # I.e., 0.1 dropout 445 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 446 | output_layer = tf.reshape(output_layer, [-1, hidden_size]) 447 | logits = tf.matmul(output_layer, output_weight, transpose_b=True) 448 | logits = tf.nn.bias_add(logits, output_bias) 449 | logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, 11]) 450 | 451 | log_probs = tf.nn.log_softmax(logits, axis=-1) 452 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 453 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 454 | loss = tf.reduce_sum(per_example_loss) 455 | probabilities = tf.nn.softmax(logits, 
axis=-1) 456 | predict = tf.argmax(probabilities,axis=-1) 457 | return (loss, per_example_loss, logits,predict) 458 | 459 | def layer_norm(input_tensor, name=None): 460 | """Run layer normalization on the last dimension of the tensor.""" 461 | return tf.contrib.layers.layer_norm( 462 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 463 | 464 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 465 | num_train_steps, num_warmup_steps, use_tpu, 466 | use_one_hot_embeddings): 467 | """Returns `model_fn` closure for TPUEstimator.""" 468 | 469 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 470 | """The `model_fn` for TPUEstimator.""" 471 | 472 | tf.logging.info("*** Features ***") 473 | for name in sorted(features.keys()): 474 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 475 | 476 | input_ids = features["input_ids"] 477 | input_mask = features["input_mask"] 478 | segment_ids = features["segment_ids"] 479 | label_ids = features["label_ids"] 480 | # is_real_example = None 481 | # if "is_real_example" in features: 482 | # is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) 483 | # else: 484 | # is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) 485 | 486 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 487 | 488 | (total_loss, per_example_loss, logits, predicts) = create_model( 489 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 490 | num_labels, use_one_hot_embeddings) 491 | 492 | tvars = tf.trainable_variables() 493 | initialized_variable_names = {} 494 | scaffold_fn = None 495 | if init_checkpoint: 496 | (assignment_map, initialized_variable_names 497 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 498 | if use_tpu: 499 | 500 | def tpu_scaffold(): 501 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 502 | return tf.train.Scaffold() 503 | 504 | scaffold_fn = tpu_scaffold 505 | else: 506 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 507 | 508 | tf.logging.info("**** Trainable Variables ****") 509 | for var in tvars: 510 | init_string = "" 511 | if var.name in initialized_variable_names: 512 | init_string = ", *INIT_FROM_CKPT*" 513 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 514 | init_string) 515 | 516 | output_spec = None 517 | if mode == tf.estimator.ModeKeys.TRAIN: 518 | 519 | train_op = optimization.create_optimizer( 520 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 521 | 522 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 523 | mode=mode, 524 | loss=total_loss, 525 | train_op=train_op, 526 | scaffold_fn=scaffold_fn) 527 | elif mode == tf.estimator.ModeKeys.EVAL: 528 | 529 | def metric_fn(per_example_loss, label_ids, logits): 530 | # def metric_fn(label_ids, logits): 531 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 532 | precision = tf_metrics.precision(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 533 | recall = tf_metrics.recall(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 534 | f = tf_metrics.f1(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 535 | # 536 | return { 537 | "eval_precision":precision, 538 | "eval_recall":recall, 539 | "eval_f": f, 540 | #"eval_loss": loss, 541 | } 542 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 543 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 544 | mode=mode, 545 | loss=total_loss, 546 | 
eval_metrics=eval_metrics, 547 | scaffold_fn=scaffold_fn) 548 | else: 549 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 550 | mode=mode, 551 | predictions=predicts, 552 | scaffold_fn=scaffold_fn) 553 | return output_spec 554 | 555 | return model_fn 556 | 557 | 558 | # This function is not used by this file but is still used by the Colab and 559 | # people who depend on it. 560 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 561 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 562 | 563 | all_input_ids = [] 564 | all_input_mask = [] 565 | all_segment_ids = [] 566 | all_label_ids = [] 567 | 568 | for feature in features: 569 | all_input_ids.append(feature.input_ids) 570 | all_input_mask.append(feature.input_mask) 571 | all_segment_ids.append(feature.segment_ids) 572 | all_label_ids.append(feature.label_ids) 573 | 574 | def input_fn(params): 575 | """The actual input function.""" 576 | batch_size = params["batch_size"] 577 | 578 | num_examples = len(features) 579 | 580 | # This is for demo purposes and does NOT scale to large data sets. We do 581 | # not use Dataset.from_generator() because that uses tf.py_func which is 582 | # not TPU compatible. The right way to load data is with TFRecordReader. 583 | d = tf.data.Dataset.from_tensor_slices({ 584 | "input_ids": 585 | tf.constant( 586 | all_input_ids, shape=[num_examples, seq_length], 587 | dtype=tf.int32), 588 | "input_mask": 589 | tf.constant( 590 | all_input_mask, 591 | shape=[num_examples, seq_length], 592 | dtype=tf.int32), 593 | "segment_ids": 594 | tf.constant( 595 | all_segment_ids, 596 | shape=[num_examples, seq_length], 597 | dtype=tf.int32), 598 | "label_ids": # NER label_ids are per-token sequences, one id per position 599 | tf.constant(all_label_ids, shape=[num_examples, seq_length], dtype=tf.int32), 600 | }) 601 | 602 | if is_training: 603 | d = d.repeat() 604 | d = d.shuffle(buffer_size=100) 605 | 606 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 607 | return d 608 | 609 | return input_fn 610 | 611 | # This function is not used by this file but is still used by the Colab and 612 | # people who depend on it.
613 | def convert_examples_to_features(examples, label_list, max_seq_length, 614 | tokenizer): 615 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 616 | label_map = {label: i for (i, label) in enumerate(label_list, 1)} # same 1-based mapping as file_based_convert_examples_to_features 617 | features = [] 618 | for (ex_index, example) in enumerate(examples): 619 | if ex_index % 10000 == 0: 620 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 621 | 622 | feature = convert_single_example(ex_index, example, label_map, 623 | max_seq_length, tokenizer, None) 624 | 625 | features.append(feature) 626 | return features 627 | 628 | 629 | def main(_): 630 | tf.logging.set_verbosity(tf.logging.INFO) 631 | 632 | processors = { 633 | # TODO change processors 634 | "ner": NerProcessor 635 | } 636 | 637 | tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, 638 | FLAGS.init_checkpoint) 639 | 640 | if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: 641 | raise ValueError( 642 | "At least one of `do_train`, `do_eval` or `do_predict` must be True.") 643 | 644 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 645 | 646 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 647 | raise ValueError( 648 | "Cannot use sequence length %d because the BERT model " 649 | "was only trained up to sequence length %d" % 650 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 651 | 652 | tf.gfile.MakeDirs(FLAGS.output_dir) 653 | 654 | task_name = FLAGS.task_name.lower() 655 | 656 | if task_name not in processors: 657 | raise ValueError("Task not found: %s" % (task_name)) 658 | 659 | processor = processors[task_name]() 660 | 661 | label_list = processor.get_labels() 662 | 663 | tokenizer = tokenization.FullTokenizer( 664 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 665 | 666 | tpu_cluster_resolver = None 667 | if FLAGS.use_tpu and FLAGS.tpu_name: 668 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 669 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 670 | 671 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 672 | # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. 673 | print("###tpu_cluster_resolver:", tpu_cluster_resolver) 674 | run_config = tf.contrib.tpu.RunConfig( 675 | cluster=tpu_cluster_resolver, 676 | master=FLAGS.master, 677 | model_dir=FLAGS.output_dir, 678 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 679 | tpu_config=tf.contrib.tpu.TPUConfig( 680 | iterations_per_loop=FLAGS.iterations_per_loop, 681 | num_shards=FLAGS.num_tpu_cores, 682 | per_host_input_for_training=is_per_host)) 683 | 684 | train_examples = None 685 | num_train_steps = None 686 | num_warmup_steps = None 687 | if FLAGS.do_train: 688 | train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO 689 | print("###length of total train_examples:", len(train_examples)) 690 | num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 691 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 692 | 693 | model_fn = model_fn_builder( 694 | bert_config=bert_config, 695 | num_labels=len(label_list) + 1, 696 | init_checkpoint=FLAGS.init_checkpoint, 697 | learning_rate=FLAGS.learning_rate, 698 | num_train_steps=num_train_steps, 699 | num_warmup_steps=num_warmup_steps, 700 | use_tpu=FLAGS.use_tpu, 701 | use_one_hot_embeddings=FLAGS.use_tpu) 702 | 703 | # If TPU is not available, this will fall back to normal Estimator on CPU 704 | # or GPU.
705 | estimator = tf.contrib.tpu.TPUEstimator( 706 | use_tpu=FLAGS.use_tpu, 707 | model_fn=model_fn, 708 | config=run_config, 709 | train_batch_size=FLAGS.train_batch_size, 710 | eval_batch_size=FLAGS.eval_batch_size, 711 | predict_batch_size=FLAGS.predict_batch_size) 712 | 713 | if FLAGS.do_train: 714 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 715 | train_file_exists=os.path.exists(train_file) 716 | print("###train_file_exists:", train_file_exists," ;train_file:",train_file) 717 | if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO 718 | file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 719 | tf.logging.info("***** Running training *****") 720 | tf.logging.info(" Num examples = %d", len(train_examples)) 721 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 722 | tf.logging.info(" Num steps = %d", num_train_steps) 723 | train_input_fn = file_based_input_fn_builder( 724 | input_file=train_file, 725 | seq_length=FLAGS.max_seq_length, 726 | is_training=True, 727 | drop_remainder=True) 728 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 729 | 730 | if FLAGS.do_eval: 731 | eval_examples = processor.get_dev_examples(FLAGS.data_dir) 732 | num_actual_eval_examples = len(eval_examples) 733 | if FLAGS.use_tpu: 734 | # TPU requires a fixed batch size for all batches, therefore the number 735 | # of examples must be a multiple of the batch size, or else examples 736 | # will get dropped. So we pad with fake examples which are ignored 737 | # later on. These do NOT count towards the metric (all tf.metrics 738 | # support a per-instance weight, and these get a weight of 0.0). 739 | while len(eval_examples) % FLAGS.eval_batch_size != 0: 740 | eval_examples.append(PaddingInputExample()) 741 | 742 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 743 | file_based_convert_examples_to_features( 744 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 745 | 746 | tf.logging.info("***** Running evaluation *****") 747 | tf.logging.info(" Num examples = %d (%d actual, %d padding)", 748 | len(eval_examples), num_actual_eval_examples, 749 | len(eval_examples) - num_actual_eval_examples) 750 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 751 | 752 | # This tells the estimator to run through the entire set. 753 | eval_steps = None 754 | # However, if running eval on the TPU, you will need to specify the 755 | # number of steps. 
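    # For example, with 1,000 eval examples after padding and eval_batch_size=8,
    # eval_steps works out to 1000 // 8 = 125.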
756 | if FLAGS.use_tpu: 757 | assert len(eval_examples) % FLAGS.eval_batch_size == 0 758 | eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) 759 | 760 | eval_drop_remainder = True if FLAGS.use_tpu else False 761 | eval_input_fn = file_based_input_fn_builder( 762 | input_file=eval_file, 763 | seq_length=FLAGS.max_seq_length, 764 | is_training=False, 765 | drop_remainder=eval_drop_remainder) 766 | 767 | ####################################################################################################################### 768 | # evaluate all checkpoints; you can use the checkpoint with the best dev accuarcy 769 | steps_and_files = [] 770 | filenames = tf.gfile.ListDirectory(FLAGS.output_dir) 771 | for filename in filenames: 772 | if filename.endswith(".index"): 773 | ckpt_name = filename[:-6] 774 | cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) 775 | global_step = int(cur_filename.split("-")[-1]) 776 | tf.logging.info("Add {} to eval list.".format(cur_filename)) 777 | steps_and_files.append([global_step, cur_filename]) 778 | steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) 779 | 780 | output_eval_file = os.path.join(FLAGS.data_dir, "eval_results_albert_zh.txt") 781 | print("output_eval_file:",output_eval_file) 782 | tf.logging.info("output_eval_file:"+output_eval_file) 783 | with tf.gfile.GFile(output_eval_file, "w") as writer: 784 | for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): 785 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename) 786 | 787 | tf.logging.info("***** Eval results %s *****" % (filename)) 788 | writer.write("***** Eval results %s *****\n" % (filename)) 789 | for key in sorted(result.keys()): 790 | tf.logging.info(" %s = %s", key, str(result[key])) 791 | writer.write("%s = %s\n" % (key, str(result[key]))) 792 | ####################################################################################################################### 793 | 794 | #result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 795 | # 796 | #output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 797 | #with tf.gfile.GFile(output_eval_file, "w") as writer: 798 | # tf.logging.info("***** Eval results *****") 799 | # for key in sorted(result.keys()): 800 | # tf.logging.info(" %s = %s", key, str(result[key])) 801 | # writer.write("%s = %s\n" % (key, str(result[key]))) 802 | 803 | if FLAGS.do_predict: 804 | token_path = os.path.join(FLAGS.output_dir, "token_test.txt") 805 | with open('albert_base_ner_checkpoints/label2id.pkl','rb') as rf: 806 | label2id = pickle.load(rf) 807 | id2label = {value:key for key,value in label2id.items()} 808 | if os.path.exists(token_path): 809 | os.remove(token_path) 810 | predict_examples = processor.get_test_examples(FLAGS.data_dir) 811 | 812 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 813 | file_based_convert_examples_to_features(predict_examples, label_list, 814 | FLAGS.max_seq_length, tokenizer, 815 | predict_file,mode="test") 816 | 817 | tf.logging.info("***** Running prediction*****") 818 | tf.logging.info(" Num examples = %d", len(predict_examples)) 819 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 820 | if FLAGS.use_tpu: 821 | # Warning: According to tpu_estimator.py Prediction on TPU is an 822 | # experimental feature and hence not supported here 823 | raise ValueError("Prediction in TPU not supported") 824 | predict_drop_remainder = True if FLAGS.use_tpu else False 825 | predict_input_fn = 
file_based_input_fn_builder( 826 | input_file=predict_file, 827 | seq_length=FLAGS.max_seq_length, 828 | is_training=False, 829 | drop_remainder=predict_drop_remainder) 830 | 831 | result = estimator.predict(input_fn=predict_input_fn) 832 | output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") 833 | with open(output_predict_file,'w') as writer: 834 | for prediction in result: 835 | output_line = "\n".join(id2label[id] for id in prediction if id!=0) + "\n" 836 | writer.write(output_line) 837 | 838 | 839 | if __name__ == "__main__": 840 | flags.mark_flag_as_required("data_dir") 841 | flags.mark_flag_as_required("task_name") 842 | flags.mark_flag_as_required("vocab_file") 843 | flags.mark_flag_as_required("bert_config_file") 844 | flags.mark_flag_as_required("output_dir") 845 | tf.app.run() 846 | -------------------------------------------------------------------------------- /create_pretraining_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Create masked LM/next sentence masked_lm TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import random 23 | import tokenization 24 | import tensorflow as tf 25 | import jieba 26 | import re 27 | flags = tf.flags 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | flags.DEFINE_string("input_file", None, 32 | "Input raw text file (or comma-separated list of files).") 33 | 34 | flags.DEFINE_string( 35 | "output_file", None, 36 | "Output TF example file (or comma-separated list of files).") 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_bool( 42 | "do_lower_case", True, 43 | "Whether to lower case the input text. 
Should be True for uncased " 44 | "models and False for cased models.") 45 | 46 | flags.DEFINE_bool( 47 | "do_whole_word_mask", False, 48 | "Whether to use whole word masking rather than per-WordPiece masking.") 49 | 50 | flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") 51 | 52 | flags.DEFINE_integer("max_predictions_per_seq", 20, 53 | "Maximum number of masked LM predictions per sequence.") 54 | 55 | flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") 56 | 57 | flags.DEFINE_integer( 58 | "dupe_factor", 10, 59 | "Number of times to duplicate the input data (with different masks).") 60 | 61 | flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") 62 | 63 | flags.DEFINE_float( 64 | "short_seq_prob", 0.1, 65 | "Probability of creating sequences which are shorter than the " 66 | "maximum length.") 67 | 68 | flags.DEFINE_bool("non_chinese", False,"manually set this to True if you are not doing chinese pre-train task.") 69 | 70 | 71 | class TrainingInstance(object): 72 | """A single training instance (sentence pair).""" 73 | 74 | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, 75 | is_random_next): 76 | self.tokens = tokens 77 | self.segment_ids = segment_ids 78 | self.is_random_next = is_random_next 79 | self.masked_lm_positions = masked_lm_positions 80 | self.masked_lm_labels = masked_lm_labels 81 | 82 | def __str__(self): 83 | s = "" 84 | s += "tokens: %s\n" % (" ".join( 85 | [tokenization.printable_text(x) for x in self.tokens])) 86 | s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) 87 | s += "is_random_next: %s\n" % self.is_random_next 88 | s += "masked_lm_positions: %s\n" % (" ".join( 89 | [str(x) for x in self.masked_lm_positions])) 90 | s += "masked_lm_labels: %s\n" % (" ".join( 91 | [tokenization.printable_text(x) for x in self.masked_lm_labels])) 92 | s += "\n" 93 | return s 94 | 95 | def __repr__(self): 96 | return self.__str__() 97 | 98 | 99 | def write_instance_to_example_files(instances, tokenizer, max_seq_length, 100 | max_predictions_per_seq, output_files): 101 | """Create TF example files from `TrainingInstance`s.""" 102 | writers = [] 103 | for output_file in output_files: 104 | writers.append(tf.python_io.TFRecordWriter(output_file)) 105 | 106 | writer_index = 0 107 | 108 | total_written = 0 109 | for (inst_index, instance) in enumerate(instances): 110 | input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) 111 | input_mask = [1] * len(input_ids) 112 | segment_ids = list(instance.segment_ids) 113 | assert len(input_ids) <= max_seq_length 114 | 115 | while len(input_ids) < max_seq_length: 116 | input_ids.append(0) 117 | input_mask.append(0) 118 | segment_ids.append(0) 119 | 120 | assert len(input_ids) == max_seq_length 121 | assert len(input_mask) == max_seq_length 122 | assert len(segment_ids) == max_seq_length 123 | 124 | masked_lm_positions = list(instance.masked_lm_positions) 125 | masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) 126 | masked_lm_weights = [1.0] * len(masked_lm_ids) 127 | 128 | while len(masked_lm_positions) < max_predictions_per_seq: 129 | masked_lm_positions.append(0) 130 | masked_lm_ids.append(0) 131 | masked_lm_weights.append(0.0) 132 | 133 | next_sentence_label = 1 if instance.is_random_next else 0 134 | 135 | features = collections.OrderedDict() 136 | features["input_ids"] = create_int_feature(input_ids) 137 | features["input_mask"] = create_int_feature(input_mask) 138 | features["segment_ids"] = 
create_int_feature(segment_ids) 139 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 140 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 141 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 142 | features["next_sentence_labels"] = create_int_feature([next_sentence_label]) 143 | 144 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 145 | 146 | writers[writer_index].write(tf_example.SerializeToString()) 147 | writer_index = (writer_index + 1) % len(writers) 148 | 149 | total_written += 1 150 | 151 | if inst_index < 20: 152 | tf.logging.info("*** Example ***") 153 | tf.logging.info("tokens: %s" % " ".join( 154 | [tokenization.printable_text(x) for x in instance.tokens])) 155 | 156 | for feature_name in features.keys(): 157 | feature = features[feature_name] 158 | values = [] 159 | if feature.int64_list.value: 160 | values = feature.int64_list.value 161 | elif feature.float_list.value: 162 | values = feature.float_list.value 163 | tf.logging.info( 164 | "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) 165 | 166 | for writer in writers: 167 | writer.close() 168 | 169 | tf.logging.info("Wrote %d total instances", total_written) 170 | 171 | 172 | def create_int_feature(values): 173 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 174 | return feature 175 | 176 | 177 | def create_float_feature(values): 178 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) 179 | return feature 180 | 181 | 182 | def create_training_instances(input_files, tokenizer, max_seq_length, 183 | dupe_factor, short_seq_prob, masked_lm_prob, 184 | max_predictions_per_seq, rng): 185 | """Create `TrainingInstance`s from raw text.""" 186 | all_documents = [[]] 187 | 188 | # Input file format: 189 | # (1) One sentence per line. These should ideally be actual sentences, not 190 | # entire paragraphs or arbitrary spans of text. (Because we use the 191 | # sentence boundaries for the "next sentence prediction" task). 192 | # (2) Blank lines between documents. Document boundaries are needed so 193 | # that the "next sentence prediction" task doesn't span between documents. 
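  # An input file that satisfies (1) and (2) might look like this (illustrative
  # example only, not taken from the repository's data/ directory):
  #
  #     今天天气很好。
  #     我们一起去公园散步。
  #
  #     白云机场位于广州。
  #
  # i.e. one sentence per line, with a single blank line marking a document boundary.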
194 | for input_file in input_files: 195 | with tf.gfile.GFile(input_file, "r") as reader: 196 | while True: 197 | strings=reader.readline() 198 | strings=strings.replace(" "," ").replace(" "," ") # 如果有两个或三个空格,替换为一个空格 199 | line = tokenization.convert_to_unicode(strings) 200 | if not line: 201 | break 202 | line = line.strip() 203 | 204 | # Empty lines are used as document delimiters 205 | if not line: 206 | all_documents.append([]) 207 | tokens = tokenizer.tokenize(line) 208 | if tokens: 209 | all_documents[-1].append(tokens) 210 | 211 | # Remove empty documents 212 | all_documents = [x for x in all_documents if x] 213 | rng.shuffle(all_documents) 214 | 215 | vocab_words = list(tokenizer.vocab.keys()) 216 | instances = [] 217 | for _ in range(dupe_factor): 218 | for document_index in range(len(all_documents)): 219 | instances.extend( 220 | create_instances_from_document_albert( # change to albert style for sentence order prediction(SOP), 2019-08-28, brightmart 221 | all_documents, document_index, max_seq_length, short_seq_prob, 222 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) 223 | 224 | rng.shuffle(instances) 225 | return instances 226 | 227 | def get_new_segment(segment): # 新增的方法 #### 228 | """ 229 | 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 230 | :param segment: 一句话. e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!'] 231 | :return: 一句处理过的话 e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!'] 232 | """ 233 | seq_cws = jieba.lcut("".join(segment)) # 分词 234 | seq_cws_dict = {x: 1 for x in seq_cws} # 分词后的词加入到词典dict 235 | new_segment = [] 236 | i = 0 237 | while i < len(segment): # 从句子的第一个字开始处理,知道处理完整个句子 238 | if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0: # 如果找不到中文的,原文加进去即不用特殊处理。 239 | new_segment.append(segment[i]) 240 | i += 1 241 | continue 242 | 243 | has_add = False 244 | for length in range(3, 0, -1): 245 | if i + length > len(segment): 246 | continue 247 | if ''.join(segment[i:i + length]) in seq_cws_dict: 248 | new_segment.append(segment[i]) 249 | for l in range(1, length): 250 | new_segment.append('##' + segment[i + l]) 251 | i += length 252 | has_add = True 253 | break 254 | if not has_add: 255 | new_segment.append(segment[i]) 256 | i += 1 257 | # print("get_new_segment.wwm.get_new_segment:",new_segment) 258 | return new_segment 259 | 260 | def create_instances_from_document_albert( 261 | all_documents, document_index, max_seq_length, short_seq_prob, 262 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 263 | """Creates `TrainingInstance`s for a single document. 264 | This method is changed to create sentence-order prediction (SOP) followed by idea from paper of ALBERT, 2019-08-28, brightmart 265 | """ 266 | document = all_documents[document_index] # 得到一个文档 267 | 268 | # Account for [CLS], [SEP], [SEP] 269 | max_num_tokens = max_seq_length - 3 270 | 271 | # We *usually* want to fill up the entire sequence since we are padding 272 | # to `max_seq_length` anyways, so short sequences are generally wasted 273 | # computation. However, we *sometimes* 274 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 275 | # sequences to minimize the mismatch between pre-training and fine-tuning. 276 | # The `target_seq_length` is just a rough target however, whereas 277 | # `max_seq_length` is a hard limit. 
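  # For example, with max_seq_length=128, max_num_tokens is 125; roughly
  # short_seq_prob (10% by default) of the time a random target length in
  # [2, 125] is drawn instead.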
278 | target_seq_length = max_num_tokens 279 | if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 280 | target_seq_length = rng.randint(2, max_num_tokens) 281 | 282 | # We DON'T just concatenate all of the tokens from a document into a long 283 | # sequence and choose an arbitrary split point because this would make the 284 | # next sentence prediction task too easy. Instead, we split the input into 285 | # segments "A" and "B" based on the actual "sentences" provided by the user 286 | # input. 287 | # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 288 | instances = [] 289 | current_chunk = [] # 当前处理的文本段,包含多个句子 290 | current_length = 0 291 | i = 0 292 | # print("###document:",document) # 一个document可以是一整篇文章、新闻、词条等. document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', 
'世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', '关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] 293 | while i < len(document): # 从文档的第一个位置开始,按个往下看 294 | segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] 295 | if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then do something to make chinese whole word mask works. 296 | segment = get_new_segment(segment) # whole word mask for chinese: 结合分词的中文的whole mask设置即在需要的地方加上“##” 297 | 298 | current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 299 | current_length += len(segment) # 累计到为止位置接触到句子的总长度 300 | if i == len(document) - 1 or current_length >= target_seq_length: 301 | # 如果累计的序列长度达到了目标的长度,或当前走到了文档结尾==>构造并添加到“A[SEP]B“中的A和B中; 302 | if current_chunk: # 如果当前块不为空 303 | # `a_end` is how many segments from `current_chunk` go into the `A` 304 | # (first) sentence. 
305 | a_end = 1 306 | if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,取当前块的一部分作为“A[SEP]B“中的A部分 307 | a_end = rng.randint(1, len(current_chunk) - 1) 308 | # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a 309 | tokens_a = [] 310 | for j in range(a_end): 311 | tokens_a.extend(current_chunk[j]) 312 | 313 | # 构造“A[SEP]B“中的B部分(有一部分是正常的当前文档中的后半部;在原BERT的实现中一部分是随机的从另一个文档中选取的,) 314 | tokens_b = [] 315 | for j in range(a_end, len(current_chunk)): 316 | tokens_b.extend(current_chunk[j]) 317 | 318 | # 有百分之50%的概率交换一下tokens_a和tokens_b的位置 319 | # print("tokens_a length1:",len(tokens_a)) 320 | # print("tokens_b length1:",len(tokens_b)) # len(tokens_b) = 0 321 | 322 | if len(tokens_a)==0 or len(tokens_b)==0: continue 323 | if rng.random() < 0.5: # 交换一下tokens_a和tokens_b 324 | is_random_next=True 325 | temp=tokens_a 326 | tokens_a=tokens_b 327 | tokens_b=temp 328 | else: 329 | is_random_next=False 330 | 331 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 332 | 333 | assert len(tokens_a) >= 1 334 | assert len(tokens_b) >= 1 335 | 336 | # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 337 | tokens = [] 338 | segment_ids = [] 339 | tokens.append("[CLS]") 340 | segment_ids.append(0) 341 | for token in tokens_a: 342 | tokens.append(token) 343 | segment_ids.append(0) 344 | 345 | tokens.append("[SEP]") 346 | segment_ids.append(0) 347 | 348 | for token in tokens_b: 349 | tokens.append(token) 350 | segment_ids.append(1) 351 | tokens.append("[SEP]") 352 | segment_ids.append(1) 353 | 354 | # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective 355 | (tokens, masked_lm_positions, 356 | masked_lm_labels) = create_masked_lm_predictions( 357 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 358 | instance = TrainingInstance( # 创建训练实例的对象 359 | tokens=tokens, 360 | segment_ids=segment_ids, 361 | is_random_next=is_random_next, 362 | masked_lm_positions=masked_lm_positions, 363 | masked_lm_labels=masked_lm_labels) 364 | instances.append(instance) 365 | current_chunk = [] # 清空当前块 366 | current_length = 0 # 重置当前文本块的长度 367 | i += 1 # 接着文档中的内容往后看 368 | 369 | return instances 370 | 371 | 372 | def create_instances_from_document_original( # THIS IS ORIGINAL BERT STYLE FOR CREATE DATA OF MLM AND NEXT SENTENCE PREDICTION TASK 373 | all_documents, document_index, max_seq_length, short_seq_prob, 374 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 375 | """Creates `TrainingInstance`s for a single document.""" 376 | document = all_documents[document_index] # 得到一个文档 377 | 378 | # Account for [CLS], [SEP], [SEP] 379 | max_num_tokens = max_seq_length - 3 380 | 381 | # We *usually* want to fill up the entire sequence since we are padding 382 | # to `max_seq_length` anyways, so short sequences are generally wasted 383 | # computation. However, we *sometimes* 384 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 385 | # sequences to minimize the mismatch between pre-training and fine-tuning. 386 | # The `target_seq_length` is just a rough target however, whereas 387 | # `max_seq_length` is a hard limit. 
388 | target_seq_length = max_num_tokens 389 | if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 390 | target_seq_length = rng.randint(2, max_num_tokens) 391 | 392 | # We DON'T just concatenate all of the tokens from a document into a long 393 | # sequence and choose an arbitrary split point because this would make the 394 | # next sentence prediction task too easy. Instead, we split the input into 395 | # segments "A" and "B" based on the actual "sentences" provided by the user 396 | # input. 397 | # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 398 | instances = [] 399 | current_chunk = [] # 当前处理的文本段,包含多个句子 400 | current_length = 0 401 | i = 0 402 | # print("###document:",document) # 一个document可以是一整篇文章、新闻、一个词条等. document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', 
'全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', '关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] 403 | while i < len(document): # 从文档的第一个位置开始,按个往下看 404 | segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] 405 | # print("###i:",i,";segment:",segment) 406 | current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 407 | current_length += len(segment) # 累计到为止位置接触到句子的总长度 408 | if i == len(document) - 1 or current_length >= target_seq_length: # 如果累计的序列长度达到了目标的长度==>构造并添加到“A[SEP]B“中的A和B中。 409 | if current_chunk: # 如果当前块不为空 410 | # `a_end` is how many segments from `current_chunk` go into the `A` 411 | # (first) sentence. 412 | a_end = 1 413 | if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,怎取当前块的一部分作为“A[SEP]B“中的A部分 414 | a_end = rng.randint(1, len(current_chunk) - 1) 415 | # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a 416 | tokens_a = [] 417 | for j in range(a_end): 418 | tokens_a.extend(current_chunk[j]) 419 | 420 | # 构造“A[SEP]B“中的B部分(原本的B有一部分是随机的从另一个文档中选取的,有一部分是正常的当前文档中的后半部) 421 | tokens_b = [] 422 | # Random next 423 | is_random_next = False 424 | if len(current_chunk) == 1 or rng.random() < 0.5: # 有50%的概率,是从其他文档中随机的选取一个文档,并得到这个文档的后半版本作为B即tokens_b 425 | is_random_next = True 426 | target_b_length = target_seq_length - len(tokens_a) 427 | 428 | # This should rarely go for more than one iteration for large 429 | # corpora. 
However, just to be careful, we try to make sure that 430 | # the random document is not the same as the document 431 | # we're processing. 432 | random_document_index=0 433 | for _ in range(10): # 随机的选出一个与当前的文档不一样的文档的索引 434 | random_document_index = rng.randint(0, len(all_documents) - 1) 435 | if random_document_index != document_index: 436 | break 437 | 438 | random_document = all_documents[random_document_index] # 选出这个文档 439 | random_start = rng.randint(0, len(random_document) - 1) # 从这个文档选出一个段落的开始位置 440 | for j in range(random_start, len(random_document)): # 从这个文档的开始位置到结束,作为我们的“A[SEP]B“中的B即tokens_b 441 | tokens_b.extend(random_document[j]) 442 | if len(tokens_b) >= target_b_length: 443 | break 444 | # We didn't actually use these segments so we "put them back" so 445 | # they don't go to waste. 这里是为了防止文本的浪费的一个小技巧 446 | num_unused_segments = len(current_chunk) - a_end # e.g. 550-200=350 447 | i -= num_unused_segments # i=i-num_unused_segments, e.g. i=400, num_unused_segments=350, 那么 i=i-num_unused_segments=400-350=50 448 | # Actual next 449 | else: # 有另外50%的几乎,从当前文本块(长度为max_sequence_length)中的后段中填充到tokens_b即“A[SEP]B“中的B。 450 | is_random_next = False 451 | for j in range(a_end, len(current_chunk)): 452 | tokens_b.extend(current_chunk[j]) 453 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 454 | 455 | assert len(tokens_a) >= 1 456 | assert len(tokens_b) >= 1 457 | 458 | # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 459 | tokens = [] 460 | segment_ids = [] 461 | tokens.append("[CLS]") 462 | segment_ids.append(0) 463 | for token in tokens_a: 464 | tokens.append(token) 465 | segment_ids.append(0) 466 | 467 | tokens.append("[SEP]") 468 | segment_ids.append(0) 469 | 470 | for token in tokens_b: 471 | tokens.append(token) 472 | segment_ids.append(1) 473 | tokens.append("[SEP]") 474 | segment_ids.append(1) 475 | 476 | # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective 477 | (tokens, masked_lm_positions, 478 | masked_lm_labels) = create_masked_lm_predictions( 479 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 480 | instance = TrainingInstance( # 创建训练实例的对象 481 | tokens=tokens, 482 | segment_ids=segment_ids, 483 | is_random_next=is_random_next, 484 | masked_lm_positions=masked_lm_positions, 485 | masked_lm_labels=masked_lm_labels) 486 | instances.append(instance) 487 | current_chunk = [] # 清空当前块 488 | current_length = 0 # 重置当前文本块的长度 489 | i += 1 # 接着文档中的内容往后看 490 | 491 | return instances 492 | 493 | 494 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 495 | ["index", "label"]) 496 | 497 | 498 | def create_masked_lm_predictions(tokens, masked_lm_prob, 499 | max_predictions_per_seq, vocab_words, rng): 500 | """Creates the predictions for the masked LM objective.""" 501 | 502 | cand_indexes = [] 503 | for (i, token) in enumerate(tokens): 504 | if token == "[CLS]" or token == "[SEP]": 505 | continue 506 | # Whole Word Masking means that if we mask all of the wordpieces 507 | # corresponding to an original word. When a word has been split into 508 | # WordPieces, the first token does not have any marker and any subsequence 509 | # tokens are prefixed with ##. So whenever we see the ## token, we 510 | # append it to the previous set of word indexes. 511 | # 512 | # Note that Whole Word Masking does *not* change the training code 513 | # at all -- we still predict each WordPiece independently, softmaxed 514 | # over the entire vocabulary. 
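    # For example, with do_whole_word_mask enabled, the pieces ["今", "##天"] at
    # positions 5 and 6 end up in a single cand_indexes entry [5, 6], so the whole
    # word is either masked together or skipped together further below.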
515 | if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and 516 | token.startswith("##")): 517 | cand_indexes[-1].append(i) 518 | else: 519 | cand_indexes.append([i]) 520 | 521 | rng.shuffle(cand_indexes) 522 | 523 | if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously 524 | output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens] # 去掉"##" 525 | else: # english and other language, which is not chinese 526 | output_tokens = list(tokens) 527 | 528 | num_to_predict = min(max_predictions_per_seq, 529 | max(1, int(round(len(tokens) * masked_lm_prob)))) 530 | 531 | masked_lms = [] 532 | covered_indexes = set() 533 | for index_set in cand_indexes: 534 | if len(masked_lms) >= num_to_predict: 535 | break 536 | # If adding a whole-word mask would exceed the maximum number of 537 | # predictions, then just skip this candidate. 538 | if len(masked_lms) + len(index_set) > num_to_predict: 539 | continue 540 | is_any_index_covered = False 541 | for index in index_set: 542 | if index in covered_indexes: 543 | is_any_index_covered = True 544 | break 545 | if is_any_index_covered: 546 | continue 547 | for index in index_set: 548 | covered_indexes.add(index) 549 | 550 | masked_token = None 551 | # 80% of the time, replace with [MASK] 552 | if rng.random() < 0.8: 553 | masked_token = "[MASK]" 554 | else: 555 | # 10% of the time, keep original 556 | if rng.random() < 0.5: 557 | if FLAGS.non_chinese == False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously 558 | masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0 else tokens[index] # 去掉"##" 559 | else: 560 | masked_token = tokens[index] 561 | # 10% of the time, replace with random word 562 | else: 563 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 564 | 565 | output_tokens[index] = masked_token 566 | 567 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) 568 | assert len(masked_lms) <= num_to_predict 569 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 570 | 571 | masked_lm_positions = [] 572 | masked_lm_labels = [] 573 | for p in masked_lms: 574 | masked_lm_positions.append(p.index) 575 | masked_lm_labels.append(p.label) 576 | 577 | # tf.logging.info('%s' % (tokens)) 578 | # tf.logging.info('%s' % (output_tokens)) 579 | return (output_tokens, masked_lm_positions, masked_lm_labels) 580 | 581 | def create_masked_lm_predictions_original(tokens, masked_lm_prob, 582 | max_predictions_per_seq, vocab_words, rng): 583 | """Creates the predictions for the masked LM objective.""" 584 | 585 | cand_indexes = [] 586 | for (i, token) in enumerate(tokens): 587 | if token == "[CLS]" or token == "[SEP]": 588 | continue 589 | # Whole Word Masking means that if we mask all of the wordpieces 590 | # corresponding to an original word. When a word has been split into 591 | # WordPieces, the first token does not have any marker and any subsequence 592 | # tokens are prefixed with ##. So whenever we see the ## token, we 593 | # append it to the previous set of word indexes. 594 | # 595 | # Note that Whole Word Masking does *not* change the training code 596 | # at all -- we still predict each WordPiece independently, softmaxed 597 | # over the entire vocabulary. 
598 | if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and 599 | token.startswith("##")): 600 | cand_indexes[-1].append(i) 601 | else: 602 | cand_indexes.append([i]) 603 | 604 | rng.shuffle(cand_indexes) 605 | 606 | output_tokens = list(tokens) 607 | 608 | num_to_predict = min(max_predictions_per_seq, 609 | max(1, int(round(len(tokens) * masked_lm_prob)))) 610 | 611 | masked_lms = [] 612 | covered_indexes = set() 613 | for index_set in cand_indexes: 614 | if len(masked_lms) >= num_to_predict: 615 | break 616 | # If adding a whole-word mask would exceed the maximum number of 617 | # predictions, then just skip this candidate. 618 | if len(masked_lms) + len(index_set) > num_to_predict: 619 | continue 620 | is_any_index_covered = False 621 | for index in index_set: 622 | if index in covered_indexes: 623 | is_any_index_covered = True 624 | break 625 | if is_any_index_covered: 626 | continue 627 | for index in index_set: 628 | covered_indexes.add(index) 629 | 630 | masked_token = None 631 | # 80% of the time, replace with [MASK] 632 | if rng.random() < 0.8: 633 | masked_token = "[MASK]" 634 | else: 635 | # 10% of the time, keep original 636 | if rng.random() < 0.5: 637 | masked_token = tokens[index] 638 | # 10% of the time, replace with random word 639 | else: 640 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 641 | 642 | output_tokens[index] = masked_token 643 | 644 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) 645 | assert len(masked_lms) <= num_to_predict 646 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 647 | 648 | masked_lm_positions = [] 649 | masked_lm_labels = [] 650 | for p in masked_lms: 651 | masked_lm_positions.append(p.index) 652 | masked_lm_labels.append(p.label) 653 | 654 | return (output_tokens, masked_lm_positions, masked_lm_labels) 655 | 656 | 657 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): 658 | """Truncates a pair of sequences to a maximum sequence length.""" 659 | while True: 660 | total_length = len(tokens_a) + len(tokens_b) 661 | if total_length <= max_num_tokens: 662 | break 663 | 664 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 665 | assert len(trunc_tokens) >= 1 666 | 667 | # We want to sometimes truncate from the front and sometimes from the 668 | # back to add more randomness and avoid biases. 
669 | if rng.random() < 0.5: 670 | del trunc_tokens[0] 671 | else: 672 | trunc_tokens.pop() 673 | 674 | 675 | def main(_): 676 | tf.logging.set_verbosity(tf.logging.INFO) 677 | 678 | tokenizer = tokenization.FullTokenizer( 679 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 680 | 681 | input_files = [] 682 | for input_pattern in FLAGS.input_file.split(","): 683 | input_files.extend(tf.gfile.Glob(input_pattern)) 684 | 685 | tf.logging.info("*** Reading from input files ***") 686 | for input_file in input_files: 687 | tf.logging.info(" %s", input_file) 688 | 689 | rng = random.Random(FLAGS.random_seed) 690 | instances = create_training_instances( 691 | input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, 692 | FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, 693 | rng) 694 | 695 | output_files = FLAGS.output_file.split(",") 696 | tf.logging.info("*** Writing to output files ***") 697 | for output_file in output_files: 698 | tf.logging.info(" %s", output_file) 699 | 700 | write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, 701 | FLAGS.max_predictions_per_seq, output_files) 702 | 703 | 704 | if __name__ == "__main__": 705 | flags.mark_flag_as_required("input_file") 706 | flags.mark_flag_as_required("output_file") 707 | flags.mark_flag_as_required("vocab_file") 708 | tf.app.run() --------------------------------------------------------------------------------
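A quick way to see the Chinese whole-word-mask preprocessing in isolation is to call get_new_segment() directly. A minimal sketch (assumptions: jieba is installed and this is run from the repository root, next to create_pretraining_data.py):

    from create_pretraining_data import get_new_segment

    # Character-split input sentence, as produced upstream by the tokenizer.
    chars = list("悬灸技术培训专家教你艾灸降血糖")
    print(get_new_segment(chars))
    # Expected to print something like
    # ['悬', '##灸', '技', '##术', '培', '##训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖'],
    # with the exact "##" placement depending on jieba's segmentation.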