├── .gitignore ├── albert_config ├── bert_config.json ├── albert_config_base.json ├── albert_config_large.json ├── albert_config_tiny.json ├── albert_config_xlarge.json └── albert_config_xxlarge.json ├── LICENSE ├── README.md ├── test_changes.py ├── bert_utils.py ├── optimization_finetuning.py ├── tf_metrics.py ├── optimization.py ├── tokenization.py ├── run_pretraining.py ├── data └── test.txt ├── albert_ner.py └── create_pretraining_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | albert_base_ner_checkpoints 3 | albert_base_zh 4 | -------------------------------------------------------------------------------- /albert_config/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21128 19 | } 20 | -------------------------------------------------------------------------------- /albert_config/albert_config_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 768, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 3072 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 1024, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 4096, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 16, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 312, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 1248 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | 
"num_hidden_layers": 4, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 2048, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 8192, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 32, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_config/albert_config_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 4096, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 16384, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 64, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"preln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 songheqi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # albert-chinese-ner
2 | 
3 | ## Preface
4 | 
5 | In some ways this ALBERT release may matter even more than BERT itself. Now that the Chinese pre-trained models are out, this project fine-tunes them for NER on the same data as before.
6 | 
7 | PS: For the traditional [**bert ner**](https://github.com/ProHiryu/bert-chinese-ner) model, see that repository.
8 | 
9 | ## Resources
10 | 
11 | - [Bert](https://github.com/google-research/bert)
12 | - [ALBert](https://github.com/google-research/albert)
13 | - [ALBert_zh](https://github.com/brightmart/albert_zh)
14 | 
15 | ## Papers
16 | 
17 | - [ALBERT](https://arxiv.org/pdf/1909.11942.pdf)
18 | 
19 | ## Setup
20 | 
21 | 1. Download the Chinese ALBERT model; the base version is used here
22 | 2. Rename the model folder to albert_base_zh and place it in the project directory
23 | 3. Run
24 | ```bash
25 | python albert_ner.py --task_name ner --do_train true --do_eval true --data_dir data --vocab_file ./albert_config/vocab.txt --bert_config_file ./albert_base_zh/albert_config_base.json --max_seq_length 128 --train_batch_size 64 --learning_rate 2e-5 --num_train_epochs 3 --output_dir albert_base_ner_checkpoints
26 | ```
27 | 4. TensorFlow > 1.13 is recommended; 1.15 was used here. TensorFlow 2.0 is not supported.
28 | 
29 | ## Results
30 | 
31 | After training the Base model for 3 epochs:
32 | 
33 | ```bash
34 | INFO:tensorflow: eval_f = 0.9280548
35 | INFO:tensorflow: eval_precision = 0.923054
36 | INFO:tensorflow: eval_recall = 0.9331808
37 | INFO:tensorflow: global_step = 2374
38 | INFO:tensorflow: loss = 13.210413
39 | ```
40 | 
41 | Test results are similar. Sample output:
42 | 
43 | ```
44 | [CLS]
45 | B-LOC
46 | I-LOC
47 | O
48 | B-LOC
49 | I-LOC
50 | I-PER
51 | O
52 | O
53 | O
54 | O
55 | O
56 | O
57 | O
58 | O
59 | O
60 | [SEP]
61 | [CLS]
62 | ```
63 | 
64 | ## Summary
65 | 
66 | Compared with BERT itself, the model really is much smaller, yet its performance is roughly on par with, and sometimes ahead of, BERT, and training time is greatly reduced. The "big-ship, big-gun" era of NLP may truly be coming to an end.
67 | 
--------------------------------------------------------------------------------
/test_changes.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import tensorflow as tf
3 | from modeling import embedding_lookup_factorized,transformer_model
4 | import os
5 | 
6 | """
7 | Test the main ALBERT improvements: factorized word embeddings, cross-layer parameter sharing, and inter-sentence coherence (sentence-order prediction)
8 | test main change of albert from bert
9 | """
10 | batch_size = 2048
11 | sequence_length = 512
12 | vocab_size = 30000
13 | hidden_size = 1024
14 | num_attention_heads = int(hidden_size / 64)
15 | 
16 | def get_total_parameters():
17 |     """
18 |     get total parameters of a graph
19 |     :return:
20 |     """
21 |     total_parameters = 0
22 |     for variable in tf.trainable_variables():
23 |         # shape is an array of tf.Dimension
24 |         shape = variable.get_shape()
25 |         # print(shape)
26 |         # print(len(shape))
27 |         variable_parameters = 1
28 |         for dim in shape:
29 |             # print(dim)
30 |             variable_parameters *= dim.value
31 |         # print(variable_parameters)
32 |         total_parameters += variable_parameters
33 |     return total_parameters
34 | 
35 | def test_factorized_embedding():
36 |     """
37 |     test of Factorized embedding parameterization
38 |     :return:
39 |     """
40 |     input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32)
41 |     output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size)
42 |     print("output:",output)
43 | 
44 | def test_share_parameters():
45 |     """
46 |     test of sharing parameters across all layers: how many parameters remain after sharing parameters across transformer layers.
47 |     :return:
48 |     """
49 |     def total_parameters_transformer(share_parameter_across_layers):
50 |         input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32)
51 |         print("transformer_model. input:",input_tensor)
52 |         transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers)
53 |         print("transformer_result:",transformer_result)
54 |         total_parameters=get_total_parameters()
55 |         print('total_parameters(not share):',total_parameters)
56 | 
57 |     share_parameter_across_layers=False
58 |     total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million
59 | 
60 |     tf.reset_default_graph() # Clears the default graph stack and resets the global default graph
61 |     share_parameter_across_layers=True
62 |     total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million
63 | 
64 | def test_sentence_order_prediction():
65 |     """
66 |     sentence order prediction.
67 | 
68 |     check method of create_instances_from_document_albert from create_pretraining_data.py
69 | 
70 |     :return:
71 |     """
72 |     # add execute permission to the data-creation script
73 |     os.system("chmod +x create_pretrain_data.sh")
74 | 
75 |     os.system("./create_pretrain_data.sh")
76 | 
77 | 
78 | # 1. test of Factorized embedding parameterization
79 | #test_factorized_embedding()
80 | 
81 | # 2. test of sharing parameters across all layers: how many parameters remain after sharing parameters across transformer layers.
82 | # before share parameter: 125,976,576; after share parameter:
83 | #test_share_parameters()
84 | 
85 | # 3. test of sentence order prediction(SOP)
86 | test_sentence_order_prediction()
87 | 
88 | 
--------------------------------------------------------------------------------
/bert_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | import collections
6 | import copy
7 | import json
8 | import math
9 | import re
10 | import six
11 | import tensorflow as tf
12 | 
13 | def get_shape_list(tensor, expected_rank=None, name=None):
14 |   """Returns a list of the shape of tensor, preferring static dimensions.
15 | 
16 |   Args:
17 |     tensor: A tf.Tensor object to find the shape of.
18 |     expected_rank: (optional) int. The expected rank of `tensor`. If this is
19 |       specified and the `tensor` has a different rank, an exception will be
20 |       thrown.
21 |     name: Optional name of the tensor for the error message.
22 | 
23 |   Returns:
24 |     A list of dimensions of the shape of tensor. All static dimensions will
25 |     be returned as python integers, and dynamic dimensions will be returned
26 |     as tf.Tensor scalars.
27 |   """
28 |   if name is None:
29 |     name = tensor.name
30 | 
31 |   if expected_rank is not None:
32 |     assert_rank(tensor, expected_rank, name)
33 | 
34 |   shape = tensor.shape.as_list()
35 | 
36 |   non_static_indexes = []
37 |   for (index, dim) in enumerate(shape):
38 |     if dim is None:
39 |       non_static_indexes.append(index)
40 | 
41 |   if not non_static_indexes:
42 |     return shape
43 | 
44 |   dyn_shape = tf.shape(tensor)
45 |   for index in non_static_indexes:
46 |     shape[index] = dyn_shape[index]
47 |   return shape
48 | 
49 | def reshape_to_matrix(input_tensor):
50 |   """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
51 |   ndims = input_tensor.shape.ndims
52 |   if ndims < 2:
53 |     raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % 54 | (input_tensor.shape)) 55 | if ndims == 2: 56 | return input_tensor 57 | 58 | width = input_tensor.shape[-1] 59 | output_tensor = tf.reshape(input_tensor, [-1, width]) 60 | return output_tensor 61 | 62 | def reshape_from_matrix(output_tensor, orig_shape_list): 63 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 64 | if len(orig_shape_list) == 2: 65 | return output_tensor 66 | 67 | output_shape = get_shape_list(output_tensor) 68 | 69 | orig_dims = orig_shape_list[0:-1] 70 | width = output_shape[-1] 71 | 72 | return tf.reshape(output_tensor, orig_dims + [width]) 73 | 74 | def assert_rank(tensor, expected_rank, name=None): 75 | """Raises an exception if the tensor rank is not of the expected rank. 76 | 77 | Args: 78 | tensor: A tf.Tensor to check the rank of. 79 | expected_rank: Python integer or list of integers, expected rank. 80 | name: Optional name of the tensor for the error message. 81 | 82 | Raises: 83 | ValueError: If the expected shape doesn't match the actual shape. 84 | """ 85 | if name is None: 86 | name = tensor.name 87 | 88 | expected_rank_dict = {} 89 | if isinstance(expected_rank, six.integer_types): 90 | expected_rank_dict[expected_rank] = True 91 | else: 92 | for x in expected_rank: 93 | expected_rank_dict[x] = True 94 | 95 | actual_rank = tensor.shape.ndims 96 | if actual_rank not in expected_rank_dict: 97 | scope_name = tf.get_variable_scope().name 98 | raise ValueError( 99 | "For the tensor `%s` in scope `%s`, the actual rank " 100 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 101 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 102 | 103 | def gather_indexes(sequence_tensor, positions): 104 | """Gathers the vectors at the specific positions over a minibatch.""" 105 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) 106 | batch_size = sequence_shape[0] 107 | seq_length = sequence_shape[1] 108 | width = sequence_shape[2] 109 | 110 | flat_offsets = tf.reshape( 111 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 112 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 113 | flat_sequence_tensor = tf.reshape(sequence_tensor, 114 | [batch_size * seq_length, width]) 115 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 116 | return output_tensor 117 | 118 | # add sequence mask for: 119 | # 1. random shuffle lm modeling---xlnet with random shuffled input 120 | # 2. left2right and right2left language modeling 121 | # 3. 
conditional generation 122 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): 123 | if seq_type == 'seq2seq': 124 | if mask_sequence is not None: 125 | seq_shape = get_shape_list(mask_sequence, expected_rank=2) 126 | seq_len = seq_shape[1] 127 | ones = tf.ones((1, seq_len, seq_len)) 128 | a_mask = tf.matrix_band_part(ones, -1, 0) 129 | s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) 130 | s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) 131 | a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask 132 | # generate mask of batch x seq_len x seq_len 133 | a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) 134 | out_mask = attention_mask * a_mask 135 | else: 136 | ones = tf.ones_like(attention_mask[:1]) 137 | mask = (tf.matrix_band_part(ones, -1, 0)) 138 | out_mask = attention_mask * mask 139 | else: 140 | out_mask = attention_mask 141 | 142 | return out_mask 143 | 144 | -------------------------------------------------------------------------------- /optimization_finetuning.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 
59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
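# [Editor's note -- illustrative comment, not part of the original file]
# Concretely, the decoupled scheme applied just below amounts to
#     w  <-  w - lr * ( m / (sqrt(v) + eps) + weight_decay_rate * w )
# i.e. the decay term is added to the Adam step only after m and v have been
# computed from the raw gradients, so it never contaminates the moments the
# way an L2 penalty added to the loss would.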
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /tf_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multiclass 3 | from: 4 | https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py 5 | 6 | """ 7 | 8 | __author__ = "Guillaume Genthial" 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix 13 | 14 | 15 | def precision(labels, predictions, num_classes, pos_indices=None, 16 | weights=None, average='micro'): 17 | """Multi-class precision metric for Tensorflow 18 | Parameters 19 | ---------- 20 | labels : Tensor of tf.int32 or tf.int64 21 | The true labels 22 | predictions : Tensor of tf.int32 or tf.int64 23 | The predictions, same shape as labels 24 | num_classes : int 25 | The number of classes 26 | pos_indices : list of int, optional 27 | The indices of the positive classes, default is all 28 | weights : Tensor of tf.int32, optional 29 | Mask, must be of compatible shape with labels 30 | average : str, optional 31 | 'micro': counts the total number of true positives, false 32 | positives, and false negatives for the classes in 33 | `pos_indices` and infer the metric from it. 34 | 'macro': will compute the metric separately for each class in 35 | `pos_indices` and average. Will not account for class 36 | imbalance. 37 | 'weighted': will compute the metric separately for each class in 38 | `pos_indices` and perform a weighted average by the total 39 | number of true labels for each class. 
40 | Returns 41 | ------- 42 | tuple of (scalar float Tensor, update_op) 43 | """ 44 | cm, op = _streaming_confusion_matrix( 45 | labels, predictions, num_classes, weights) 46 | pr, _, _ = metrics_from_confusion_matrix( 47 | cm, pos_indices, average=average) 48 | op, _, _ = metrics_from_confusion_matrix( 49 | op, pos_indices, average=average) 50 | return (pr, op) 51 | 52 | 53 | def recall(labels, predictions, num_classes, pos_indices=None, weights=None, 54 | average='micro'): 55 | """Multi-class recall metric for Tensorflow 56 | Parameters 57 | ---------- 58 | labels : Tensor of tf.int32 or tf.int64 59 | The true labels 60 | predictions : Tensor of tf.int32 or tf.int64 61 | The predictions, same shape as labels 62 | num_classes : int 63 | The number of classes 64 | pos_indices : list of int, optional 65 | The indices of the positive classes, default is all 66 | weights : Tensor of tf.int32, optional 67 | Mask, must be of compatible shape with labels 68 | average : str, optional 69 | 'micro': counts the total number of true positives, false 70 | positives, and false negatives for the classes in 71 | `pos_indices` and infer the metric from it. 72 | 'macro': will compute the metric separately for each class in 73 | `pos_indices` and average. Will not account for class 74 | imbalance. 75 | 'weighted': will compute the metric separately for each class in 76 | `pos_indices` and perform a weighted average by the total 77 | number of true labels for each class. 78 | Returns 79 | ------- 80 | tuple of (scalar float Tensor, update_op) 81 | """ 82 | cm, op = _streaming_confusion_matrix( 83 | labels, predictions, num_classes, weights) 84 | _, re, _ = metrics_from_confusion_matrix( 85 | cm, pos_indices, average=average) 86 | _, op, _ = metrics_from_confusion_matrix( 87 | op, pos_indices, average=average) 88 | return (re, op) 89 | 90 | 91 | def f1(labels, predictions, num_classes, pos_indices=None, weights=None, 92 | average='micro'): 93 | return fbeta(labels, predictions, num_classes, pos_indices, weights, 94 | average) 95 | 96 | 97 | def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, 98 | average='micro', beta=1): 99 | """Multi-class fbeta metric for Tensorflow 100 | Parameters 101 | ---------- 102 | labels : Tensor of tf.int32 or tf.int64 103 | The true labels 104 | predictions : Tensor of tf.int32 or tf.int64 105 | The predictions, same shape as labels 106 | num_classes : int 107 | The number of classes 108 | pos_indices : list of int, optional 109 | The indices of the positive classes, default is all 110 | weights : Tensor of tf.int32, optional 111 | Mask, must be of compatible shape with labels 112 | average : str, optional 113 | 'micro': counts the total number of true positives, false 114 | positives, and false negatives for the classes in 115 | `pos_indices` and infer the metric from it. 116 | 'macro': will compute the metric separately for each class in 117 | `pos_indices` and average. Will not account for class 118 | imbalance. 119 | 'weighted': will compute the metric separately for each class in 120 | `pos_indices` and perform a weighted average by the total 121 | number of true labels for each class. 
122 | beta : int, optional 123 | Weight of precision in harmonic mean 124 | Returns 125 | ------- 126 | tuple of (scalar float Tensor, update_op) 127 | """ 128 | cm, op = _streaming_confusion_matrix( 129 | labels, predictions, num_classes, weights) 130 | _, _, fbeta = metrics_from_confusion_matrix( 131 | cm, pos_indices, average=average, beta=beta) 132 | _, _, op = metrics_from_confusion_matrix( 133 | op, pos_indices, average=average, beta=beta) 134 | return (fbeta, op) 135 | 136 | 137 | def safe_div(numerator, denominator): 138 | """Safe division, return 0 if denominator is 0""" 139 | numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) 140 | zeros = tf.zeros_like(numerator, dtype=numerator.dtype) 141 | denominator_is_zero = tf.equal(denominator, zeros) 142 | return tf.where(denominator_is_zero, zeros, numerator / denominator) 143 | 144 | 145 | def pr_re_fbeta(cm, pos_indices, beta=1): 146 | """Uses a confusion matrix to compute precision, recall and fbeta""" 147 | num_classes = cm.shape[0] 148 | neg_indices = [i for i in range(num_classes) if i not in pos_indices] 149 | cm_mask = np.ones([num_classes, num_classes]) 150 | cm_mask[neg_indices, neg_indices] = 0 151 | diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) 152 | 153 | cm_mask = np.ones([num_classes, num_classes]) 154 | cm_mask[:, neg_indices] = 0 155 | tot_pred = tf.reduce_sum(cm * cm_mask) 156 | 157 | cm_mask = np.ones([num_classes, num_classes]) 158 | cm_mask[neg_indices, :] = 0 159 | tot_gold = tf.reduce_sum(cm * cm_mask) 160 | 161 | pr = safe_div(diag_sum, tot_pred) 162 | re = safe_div(diag_sum, tot_gold) 163 | fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) 164 | 165 | return pr, re, fbeta 166 | 167 | 168 | def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', 169 | beta=1): 170 | """Precision, Recall and F1 from the confusion matrix 171 | Parameters 172 | ---------- 173 | cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) 174 | The streaming confusion matrix. 
175 | pos_indices : list of int, optional 176 | The indices of the positive classes 177 | beta : int, optional 178 | Weight of precision in harmonic mean 179 | average : str, optional 180 | 'micro', 'macro' or 'weighted' 181 | """ 182 | num_classes = cm.shape[0] 183 | if pos_indices is None: 184 | pos_indices = [i for i in range(num_classes)] 185 | 186 | if average == 'micro': 187 | return pr_re_fbeta(cm, pos_indices, beta) 188 | elif average in {'macro', 'weighted'}: 189 | precisions, recalls, fbetas, n_golds = [], [], [], [] 190 | for idx in pos_indices: 191 | pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) 192 | precisions.append(pr) 193 | recalls.append(re) 194 | fbetas.append(fbeta) 195 | cm_mask = np.zeros([num_classes, num_classes]) 196 | cm_mask[idx, :] = 1 197 | n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) 198 | 199 | if average == 'macro': 200 | pr = tf.reduce_mean(precisions) 201 | re = tf.reduce_mean(recalls) 202 | fbeta = tf.reduce_mean(fbetas) 203 | return pr, re, fbeta 204 | if average == 'weighted': 205 | n_gold = tf.reduce_sum(n_golds) 206 | pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) 207 | pr = safe_div(pr_sum, n_gold) 208 | re_sum = sum(r * n for r, n in zip(recalls, n_golds)) 209 | re = safe_div(re_sum, n_gold) 210 | fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) 211 | fbeta = safe_div(fbeta_sum, n_gold) 212 | return pr, re, fbeta 213 | 214 | else: 215 | raise NotImplementedError() -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 
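# [Editor's note -- illustrative comment, not part of the original file]
# A worked example of the resulting schedule, assuming the run_pretraining.py
# defaults init_lr=5e-5, num_warmup_steps=10000, num_train_steps=100000:
#     step   1,000 (warmup):       lr = 1000/10000 * 5e-5 = 5e-6
#     step  10,000 (warmup ends):  lr = 5e-5 * (1 - 0.10)  = 4.5e-5
#     step  55,000 (linear decay): lr = 5e-5 * (1 - 0.55)  = 2.25e-5
#     step 100,000 (end):          lr = 0.0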
42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = LAMBOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 
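# [Editor's note -- descriptive comment, not part of the original file]
# The two statements below are the usual exponential moving averages
#     m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
#     v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
# and the raw step is m_t / (sqrt(v_t) + epsilon). As in the original BERT
# optimizer, no bias correction is applied to m and v.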
131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | 176 | 177 | # 178 | class LAMBOptimizer(tf.train.Optimizer): 179 | """ 180 | LAMBOptimizer optimizer. 181 | https://github.com/ymcui/LAMB_Optimizer_TF 182 | # IMPORTANT NOTE 183 | - This is NOT an official implementation. 184 | - LAMB optimizer is changed from arXiv v1 ~ v3. 185 | - We implement v3 version (which is the latest version on June, 2019.). 186 | - Our implementation is based on `AdamWeightDecayOptimizer` in BERT (provided by Google). 187 | 188 | # References 189 | - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962v3 190 | - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https://arxiv.org/abs/1810.04805 191 | # Parameters 192 | - There is nothing special, just the same as `AdamWeightDecayOptimizer`. 
193 | """ 194 | 195 | def __init__(self, 196 | learning_rate, 197 | weight_decay_rate=0.01, 198 | beta_1=0.9, 199 | beta_2=0.999, 200 | epsilon=1e-6, 201 | exclude_from_weight_decay=None, 202 | name="LAMBOptimizer"): 203 | """Constructs a LAMBOptimizer.""" 204 | super(LAMBOptimizer, self).__init__(False, name) 205 | 206 | self.learning_rate = learning_rate 207 | self.weight_decay_rate = weight_decay_rate 208 | self.beta_1 = beta_1 209 | self.beta_2 = beta_2 210 | self.epsilon = epsilon 211 | self.exclude_from_weight_decay = exclude_from_weight_decay 212 | 213 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 214 | """See base class.""" 215 | assignments = [] 216 | for (grad, param) in grads_and_vars: 217 | if grad is None or param is None: 218 | continue 219 | 220 | param_name = self._get_variable_name(param.name) 221 | 222 | m = tf.get_variable( 223 | name=param_name + "/lamb_m", 224 | shape=param.shape.as_list(), 225 | dtype=tf.float32, 226 | trainable=False, 227 | initializer=tf.zeros_initializer()) 228 | v = tf.get_variable( 229 | name=param_name + "/lamb_v", 230 | shape=param.shape.as_list(), 231 | dtype=tf.float32, 232 | trainable=False, 233 | initializer=tf.zeros_initializer()) 234 | 235 | # Standard Adam update. 236 | next_m = ( 237 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 238 | next_v = ( 239 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 240 | tf.square(grad))) 241 | 242 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 243 | 244 | # Just adding the square of the weights to the loss function is *not* 245 | # the correct way of using L2 regularization/weight decay with Adam, 246 | # since that will interact with the m and v parameters in strange ways. 247 | # 248 | # Instead we want ot decay the weights in a manner that doesn't interact 249 | # with the m/v parameters. This is equivalent to adding the square 250 | # of the weights to the loss with plain (non-momentum) SGD. 251 | if self._do_use_weight_decay(param_name): 252 | update += self.weight_decay_rate * param 253 | 254 | ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ############## 255 | 256 | # Note: Here are two choices for scaling function \phi(z) 257 | # minmax: \phi(z) = min(max(z, \gamma_l), \gamma_u) 258 | # identity: \phi(z) = z 259 | # The authors does not mention what is \gamma_l and \gamma_u 260 | # UPDATE: after asking authors, they provide me the code below. 
261 | # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( 262 | # math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) 263 | 264 | r1 = tf.sqrt(tf.reduce_sum(tf.square(param))) 265 | r2 = tf.sqrt(tf.reduce_sum(tf.square(update))) 266 | 267 | r = tf.where(tf.greater(r1, 0.0), 268 | tf.where(tf.greater(r2, 0.0), 269 | r1 / r2, 270 | 1.0), 271 | 1.0) 272 | 273 | eta = self.learning_rate * r 274 | 275 | update_with_lr = eta * update 276 | 277 | next_param = param - update_with_lr 278 | 279 | assignments.extend( 280 | [param.assign(next_param), 281 | m.assign(next_m), 282 | v.assign(next_v)]) 283 | return tf.group(*assignments, name=name) 284 | 285 | def _do_use_weight_decay(self, param_name): 286 | """Whether to use L2 weight decay for `param_name`.""" 287 | if not self.weight_decay_rate: 288 | return False 289 | if self.exclude_from_weight_decay: 290 | for r in self.exclude_from_weight_decay: 291 | if re.search(r, param_name) is not None: 292 | return False 293 | return True 294 | 295 | def _get_variable_name(self, param_name): 296 | """Get the variable name from the tensor name.""" 297 | m = re.match("^(.*):\\d+$", param_name) 298 | if m is not None: 299 | param_name = m.group(1) 300 | return param_name -------------------------------------------------------------------------------- /tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
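# [Editor's note -- illustrative comment, not part of the original file]
# Example of the heuristic: for a hypothetical path such as
# "/tmp/uncased_L-12_H-768_A-12/bert_model.ckpt", the regex below captures
# model_name = "uncased_L-12_H-768_A-12"; that name is in lower_models, so
# passing --do_lower_case=False would raise the ValueError at the end.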
35 | 36 | if not init_checkpoint: 37 | return 38 | 39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) 40 | if m is None: 41 | return 42 | 43 | model_name = m.group(1) 44 | 45 | lower_models = [ 46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", 47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" 48 | ] 49 | 50 | cased_models = [ 51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", 52 | "multi_cased_L-12_H-768_A-12" 53 | ] 54 | 55 | is_bad_config = False 56 | if model_name in lower_models and not do_lower_case: 57 | is_bad_config = True 58 | actual_flag = "False" 59 | case_name = "lowercased" 60 | opposite_flag = "True" 61 | 62 | if model_name in cased_models and do_lower_case: 63 | is_bad_config = True 64 | actual_flag = "True" 65 | case_name = "cased" 66 | opposite_flag = "False" 67 | 68 | if is_bad_config: 69 | raise ValueError( 70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " 71 | "However, `%s` seems to be a %s model, so you " 72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches " 73 | "how the model was pre-training. If this error is wrong, please " 74 | "just comment out this check." % (actual_flag, init_checkpoint, 75 | model_name, case_name, opposite_flag)) 76 | 77 | 78 | def convert_to_unicode(text): 79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 80 | if six.PY3: 81 | if isinstance(text, str): 82 | return text 83 | elif isinstance(text, bytes): 84 | return text.decode("utf-8", "ignore") 85 | else: 86 | raise ValueError("Unsupported string type: %s" % (type(text))) 87 | elif six.PY2: 88 | if isinstance(text, str): 89 | return text.decode("utf-8", "ignore") 90 | elif isinstance(text, unicode): 91 | return text 92 | else: 93 | raise ValueError("Unsupported string type: %s" % (type(text))) 94 | else: 95 | raise ValueError("Not running on Python2 or Python 3?") 96 | 97 | 98 | def printable_text(text): 99 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 100 | 101 | # These functions want `str` for both Python2 and Python3, but in one case 102 | # it's a Unicode string and in the other it's a byte string. 
103 | if six.PY3: 104 | if isinstance(text, str): 105 | return text 106 | elif isinstance(text, bytes): 107 | return text.decode("utf-8", "ignore") 108 | else: 109 | raise ValueError("Unsupported string type: %s" % (type(text))) 110 | elif six.PY2: 111 | if isinstance(text, str): 112 | return text 113 | elif isinstance(text, unicode): 114 | return text.encode("utf-8") 115 | else: 116 | raise ValueError("Unsupported string type: %s" % (type(text))) 117 | else: 118 | raise ValueError("Not running on Python2 or Python 3?") 119 | 120 | 121 | def load_vocab(vocab_file): 122 | """Loads a vocabulary file into a dictionary.""" 123 | vocab = collections.OrderedDict() 124 | index = 0 125 | with tf.gfile.GFile(vocab_file, "r") as reader: 126 | while True: 127 | token = convert_to_unicode(reader.readline()) 128 | if not token: 129 | break 130 | token = token.strip() 131 | vocab[token] = index 132 | index += 1 133 | return vocab 134 | 135 | 136 | def convert_by_vocab(vocab, items): 137 | """Converts a sequence of [tokens|ids] using the vocab.""" 138 | output = [] 139 | #print("items:",items) #['[CLS]', '日', '##期', ',', '但', '被', '##告', '金', '##东', '##福', '载', '##明', '[MASK]', 'U', '##N', '##K', ']', '保', '##证', '本', '##月', '1', '##4', '[MASK]', '到', '##位', ',', '2', '##0', '##1', '##5', '年', '6', '[MASK]', '1', '##1', '日', '[', 'U', '##N', '##K', ']', ',', '原', '##告', '[MASK]', '认', '##可', '于', '2', '##0', '##1', '##5', '[MASK]', '6', '月', '[MASK]', '[MASK]', '日', '##向', '被', '##告', '主', '##张', '权', '##利', '。', '而', '[MASK]', '[MASK]', '自', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '年', '6', '月', '1', '##1', '日', '[SEP]', '原', '##告', '于', '2', '##0', '##1', '##6', '[MASK]', '6', '[MASK]', '2', '##4', '日', '起', '##诉', ',', '主', '##张', '保', '##证', '责', '##任', ',', '已', '超', '##过', '保', '##证', '期', '##限', '[MASK]', '保', '##证', '人', '依', '##法', '不', '##再', '承', '##担', '保', '##证', '[MASK]', '[MASK]', '[MASK]', '[SEP]'] 140 | for i,item in enumerate(items): 141 | #print(i,"item:",item) # ##期 142 | output.append(vocab[item]) 143 | return output 144 | 145 | 146 | def convert_tokens_to_ids(vocab, tokens): 147 | return convert_by_vocab(vocab, tokens) 148 | 149 | 150 | def convert_ids_to_tokens(inv_vocab, ids): 151 | return convert_by_vocab(inv_vocab, ids) 152 | 153 | 154 | def whitespace_tokenize(text): 155 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 156 | text = text.strip() 157 | if not text: 158 | return [] 159 | tokens = text.split() 160 | return tokens 161 | 162 | 163 | class FullTokenizer(object): 164 | """Runs end-to-end tokenziation.""" 165 | 166 | def __init__(self, vocab_file, do_lower_case=True): 167 | self.vocab = load_vocab(vocab_file) 168 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 169 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 170 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 171 | 172 | def tokenize(self, text): 173 | split_tokens = [] 174 | for token in self.basic_tokenizer.tokenize(text): 175 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 176 | split_tokens.append(sub_token) 177 | 178 | return split_tokens 179 | 180 | def convert_tokens_to_ids(self, tokens): 181 | return convert_by_vocab(self.vocab, tokens) 182 | 183 | def convert_ids_to_tokens(self, ids): 184 | return convert_by_vocab(self.inv_vocab, ids) 185 | 186 | 187 | class BasicTokenizer(object): 188 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 189 | 190 | def __init__(self, do_lower_case=True): 191 | 
"""Constructs a BasicTokenizer. 192 | 193 | Args: 194 | do_lower_case: Whether to lower case the input. 195 | """ 196 | self.do_lower_case = do_lower_case 197 | 198 | def tokenize(self, text): 199 | """Tokenizes a piece of text.""" 200 | text = convert_to_unicode(text) 201 | text = self._clean_text(text) 202 | 203 | # This was added on November 1st, 2018 for the multilingual and Chinese 204 | # models. This is also applied to the English models now, but it doesn't 205 | # matter since the English models were not trained on any Chinese data 206 | # and generally don't have any Chinese data in them (there are Chinese 207 | # characters in the vocabulary because Wikipedia does have some Chinese 208 | # words in the English Wikipedia.). 209 | text = self._tokenize_chinese_chars(text) 210 | 211 | orig_tokens = whitespace_tokenize(text) 212 | split_tokens = [] 213 | for token in orig_tokens: 214 | if self.do_lower_case: 215 | token = token.lower() 216 | token = self._run_strip_accents(token) 217 | split_tokens.extend(self._run_split_on_punc(token)) 218 | 219 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 220 | return output_tokens 221 | 222 | def _run_strip_accents(self, text): 223 | """Strips accents from a piece of text.""" 224 | text = unicodedata.normalize("NFD", text) 225 | output = [] 226 | for char in text: 227 | cat = unicodedata.category(char) 228 | if cat == "Mn": 229 | continue 230 | output.append(char) 231 | return "".join(output) 232 | 233 | def _run_split_on_punc(self, text): 234 | """Splits punctuation on a piece of text.""" 235 | chars = list(text) 236 | i = 0 237 | start_new_word = True 238 | output = [] 239 | while i < len(chars): 240 | char = chars[i] 241 | if _is_punctuation(char): 242 | output.append([char]) 243 | start_new_word = True 244 | else: 245 | if start_new_word: 246 | output.append([]) 247 | start_new_word = False 248 | output[-1].append(char) 249 | i += 1 250 | 251 | return ["".join(x) for x in output] 252 | 253 | def _tokenize_chinese_chars(self, text): 254 | """Adds whitespace around any CJK character.""" 255 | output = [] 256 | for char in text: 257 | cp = ord(char) 258 | if self._is_chinese_char(cp): 259 | output.append(" ") 260 | output.append(char) 261 | output.append(" ") 262 | else: 263 | output.append(char) 264 | return "".join(output) 265 | 266 | def _is_chinese_char(self, cp): 267 | """Checks whether CP is the codepoint of a CJK character.""" 268 | # This defines a "chinese character" as anything in the CJK Unicode block: 269 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 270 | # 271 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 272 | # despite its name. The modern Korean Hangul alphabet is a different block, 273 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 274 | # space-separated words, so they are not treated specially and handled 275 | # like the all of the other languages. 
276 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 277 | (cp >= 0x3400 and cp <= 0x4DBF) or # 278 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 279 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 280 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 281 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 282 | (cp >= 0xF900 and cp <= 0xFAFF) or # 283 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 284 | return True 285 | 286 | return False 287 | 288 | def _clean_text(self, text): 289 | """Performs invalid character removal and whitespace cleanup on text.""" 290 | output = [] 291 | for char in text: 292 | cp = ord(char) 293 | if cp == 0 or cp == 0xfffd or _is_control(char): 294 | continue 295 | if _is_whitespace(char): 296 | output.append(" ") 297 | else: 298 | output.append(char) 299 | return "".join(output) 300 | 301 | 302 | class WordpieceTokenizer(object): 303 | """Runs WordPiece tokenziation.""" 304 | 305 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 306 | self.vocab = vocab 307 | self.unk_token = unk_token 308 | self.max_input_chars_per_word = max_input_chars_per_word 309 | 310 | def tokenize(self, text): 311 | """Tokenizes a piece of text into its word pieces. 312 | 313 | This uses a greedy longest-match-first algorithm to perform tokenization 314 | using the given vocabulary. 315 | 316 | For example: 317 | input = "unaffable" 318 | output = ["un", "##aff", "##able"] 319 | 320 | Args: 321 | text: A single token or whitespace separated tokens. This should have 322 | already been passed through `BasicTokenizer. 323 | 324 | Returns: 325 | A list of wordpiece tokens. 326 | """ 327 | 328 | text = convert_to_unicode(text) 329 | 330 | output_tokens = [] 331 | for token in whitespace_tokenize(text): 332 | chars = list(token) 333 | if len(chars) > self.max_input_chars_per_word: 334 | output_tokens.append(self.unk_token) 335 | continue 336 | 337 | is_bad = False 338 | start = 0 339 | sub_tokens = [] 340 | while start < len(chars): 341 | end = len(chars) 342 | cur_substr = None 343 | while start < end: 344 | substr = "".join(chars[start:end]) 345 | if start > 0: 346 | substr = "##" + substr 347 | if substr in self.vocab: 348 | cur_substr = substr 349 | break 350 | end -= 1 351 | if cur_substr is None: 352 | is_bad = True 353 | break 354 | sub_tokens.append(cur_substr) 355 | start = end 356 | 357 | if is_bad: 358 | output_tokens.append(self.unk_token) 359 | else: 360 | output_tokens.extend(sub_tokens) 361 | return output_tokens 362 | 363 | 364 | def _is_whitespace(char): 365 | """Checks whether `chars` is a whitespace character.""" 366 | # \t, \n, and \r are technically contorl characters but we treat them 367 | # as whitespace since they are generally considered as such. 368 | if char == " " or char == "\t" or char == "\n" or char == "\r": 369 | return True 370 | cat = unicodedata.category(char) 371 | if cat == "Zs": 372 | return True 373 | return False 374 | 375 | 376 | def _is_control(char): 377 | """Checks whether `chars` is a control character.""" 378 | # These are technically control characters but we count them as whitespace 379 | # characters. 380 | if char == "\t" or char == "\n" or char == "\r": 381 | return False 382 | cat = unicodedata.category(char) 383 | if cat in ("Cc", "Cf"): 384 | return True 385 | return False 386 | 387 | 388 | def _is_punctuation(char): 389 | """Checks whether `chars` is a punctuation character.""" 390 | cp = ord(char) 391 | # We treat all non-letter/number ASCII as punctuation. 
392 | # Characters such as "^", "$", and "`" are not in the Unicode 393 | # Punctuation class but we treat them as punctuation anyways, for 394 | # consistency. 395 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 396 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 397 | return True 398 | cat = unicodedata.category(char) 399 | if cat.startswith("P"): 400 | return True 401 | return False 402 | -------------------------------------------------------------------------------- /run_pretraining.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import modeling 23 | import optimization 24 | import tensorflow as tf 25 | 26 | flags = tf.flags 27 | 28 | FLAGS = flags.FLAGS 29 | 30 | ## Required parameters 31 | flags.DEFINE_string( 32 | "bert_config_file", None, 33 | "The config json file corresponding to the pre-trained BERT model. " 34 | "This specifies the model architecture.") 35 | 36 | flags.DEFINE_string( 37 | "input_file", None, 38 | "Input TF example files (can be a glob or comma separated).") 39 | 40 | flags.DEFINE_string( 41 | "output_dir", None, 42 | "The output directory where the model checkpoints will be written.") 43 | 44 | ## Other parameters 45 | flags.DEFINE_string( 46 | "init_checkpoint", None, 47 | "Initial checkpoint (usually from a pre-trained BERT model).") 48 | 49 | flags.DEFINE_integer( 50 | "max_seq_length", 128, 51 | "The maximum total input sequence length after WordPiece tokenization. " 52 | "Sequences longer than this will be truncated, and sequences shorter " 53 | "than this will be padded. Must match data generation.") 54 | 55 | flags.DEFINE_integer( 56 | "max_predictions_per_seq", 20, 57 | "Maximum number of masked LM predictions per sequence. 
" 58 | "Must match data generation.") 59 | 60 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 61 | 62 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 63 | 64 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 65 | 66 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 67 | 68 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 69 | 70 | flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") 71 | 72 | flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") 73 | 74 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 75 | "How often to save the model checkpoint.") 76 | 77 | flags.DEFINE_integer("iterations_per_loop", 1000, 78 | "How many steps to make in each estimator call.") 79 | 80 | flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") 81 | 82 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 83 | 84 | tf.flags.DEFINE_string( 85 | "tpu_name", None, 86 | "The Cloud TPU to use for training. This should be either the name " 87 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 88 | "url.") 89 | 90 | tf.flags.DEFINE_string( 91 | "tpu_zone", None, 92 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 93 | "specified, we will attempt to automatically detect the GCE project from " 94 | "metadata.") 95 | 96 | tf.flags.DEFINE_string( 97 | "gcp_project", None, 98 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 99 | "specified, we will attempt to automatically detect the GCE project from " 100 | "metadata.") 101 | 102 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 103 | 104 | flags.DEFINE_integer( 105 | "num_tpu_cores", 8, 106 | "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") 107 | 108 | 109 | def model_fn_builder(bert_config, init_checkpoint, learning_rate, 110 | num_train_steps, num_warmup_steps, use_tpu, 111 | use_one_hot_embeddings): 112 | """Returns `model_fn` closure for TPUEstimator.""" 113 | 114 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 115 | """The `model_fn` for TPUEstimator.""" 116 | 117 | tf.logging.info("*** Features ***") 118 | for name in sorted(features.keys()): 119 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 120 | 121 | input_ids = features["input_ids"] 122 | input_mask = features["input_mask"] 123 | segment_ids = features["segment_ids"] 124 | masked_lm_positions = features["masked_lm_positions"] 125 | masked_lm_ids = features["masked_lm_ids"] 126 | masked_lm_weights = features["masked_lm_weights"] 127 | next_sentence_labels = features["next_sentence_labels"] 128 | 129 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 130 | 131 | model = modeling.BertModel( 132 | config=bert_config, 133 | is_training=is_training, 134 | input_ids=input_ids, 135 | input_mask=input_mask, 136 | token_type_ids=segment_ids, 137 | use_one_hot_embeddings=use_one_hot_embeddings) 138 | 139 | (masked_lm_loss, 140 | masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( 141 | bert_config, model.get_sequence_output(), model.get_embedding_table(),model.get_embedding_table_2(), 142 | masked_lm_positions, masked_lm_ids, masked_lm_weights) 143 | 144 | (next_sentence_loss, next_sentence_example_loss, 145 | next_sentence_log_probs) = get_next_sentence_output( 146 | bert_config, model.get_pooled_output(), next_sentence_labels) 147 | 148 | total_loss = masked_lm_loss + next_sentence_loss 149 | 150 | tvars = tf.trainable_variables() 151 | 152 | initialized_variable_names = {} 153 | print("init_checkpoint:",init_checkpoint) 154 | scaffold_fn = None 155 | if init_checkpoint: 156 | (assignment_map, initialized_variable_names 157 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 158 | if use_tpu: 159 | 160 | def tpu_scaffold(): 161 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 162 | return tf.train.Scaffold() 163 | 164 | scaffold_fn = tpu_scaffold 165 | else: 166 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 167 | 168 | tf.logging.info("**** Trainable Variables ****") 169 | for var in tvars: 170 | init_string = "" 171 | if var.name in initialized_variable_names: 172 | init_string = ", *INIT_FROM_CKPT*" 173 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 174 | init_string) 175 | 176 | output_spec = None 177 | if mode == tf.estimator.ModeKeys.TRAIN: 178 | train_op = optimization.create_optimizer( 179 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 180 | 181 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 182 | mode=mode, 183 | loss=total_loss, 184 | train_op=train_op, 185 | scaffold_fn=scaffold_fn) 186 | elif mode == tf.estimator.ModeKeys.EVAL: 187 | 188 | def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 189 | masked_lm_weights, next_sentence_example_loss, 190 | next_sentence_log_probs, next_sentence_labels): 191 | """Computes the loss and accuracy of the model.""" 192 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) 193 | masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) 194 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 195 | 
masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 196 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 197 | masked_lm_accuracy = tf.metrics.accuracy( 198 | labels=masked_lm_ids, 199 | predictions=masked_lm_predictions, 200 | weights=masked_lm_weights) 201 | masked_lm_mean_loss = tf.metrics.mean( 202 | values=masked_lm_example_loss, weights=masked_lm_weights) 203 | 204 | next_sentence_log_probs = tf.reshape( 205 | next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) 206 | next_sentence_predictions = tf.argmax( 207 | next_sentence_log_probs, axis=-1, output_type=tf.int32) 208 | next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) 209 | next_sentence_accuracy = tf.metrics.accuracy( 210 | labels=next_sentence_labels, predictions=next_sentence_predictions) 211 | next_sentence_mean_loss = tf.metrics.mean( 212 | values=next_sentence_example_loss) 213 | 214 | return { 215 | "masked_lm_accuracy": masked_lm_accuracy, 216 | "masked_lm_loss": masked_lm_mean_loss, 217 | "next_sentence_accuracy": next_sentence_accuracy, 218 | "next_sentence_loss": next_sentence_mean_loss, 219 | } 220 | 221 | # next_sentence_example_loss=0.0 TODO 222 | # next_sentence_log_probs=0.0 # TODO 223 | eval_metrics = (metric_fn, [ 224 | masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 225 | masked_lm_weights, next_sentence_example_loss, 226 | next_sentence_log_probs, next_sentence_labels 227 | ]) 228 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 229 | mode=mode, 230 | loss=total_loss, 231 | eval_metrics=eval_metrics, 232 | scaffold_fn=scaffold_fn) 233 | else: 234 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 235 | 236 | return output_spec 237 | 238 | return model_fn 239 | 240 | 241 | def get_masked_lm_output(bert_config, input_tensor, output_weights,project_weights, positions, 242 | label_ids, label_weights): 243 | """Get loss and log probs for the masked LM.""" 244 | input_tensor = gather_indexes(input_tensor, positions) 245 | 246 | with tf.variable_scope("cls/predictions"): 247 | # We apply one more non-linear transformation before the output layer. 248 | # This matrix is not used after pre-training. 249 | with tf.variable_scope("transform"): 250 | input_tensor = tf.layers.dense( 251 | input_tensor, 252 | units=bert_config.hidden_size, 253 | activation=modeling.get_activation(bert_config.hidden_act), 254 | kernel_initializer=modeling.create_initializer( 255 | bert_config.initializer_range)) 256 | input_tensor = modeling.layer_norm(input_tensor) 257 | 258 | # The output weights are the same as the input embeddings, but there is 259 | # an output-only bias for each token. 
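# --- Illustrative sketch (plain Python, not part of the original file) of the
# weighting used in metric_fn above: tf.metrics.accuracy and tf.metrics.mean
# with `weights=masked_lm_weights` compute sum(w * value) / sum(w), so the
# zero-weighted padding slots among the masked_lm_positions do not dilute the
# masked-LM accuracy or mean loss. The numbers below are hypothetical.
label_ids_ex   = [5, 9, 0, 0]          # last two slots are padding
predictions_ex = [5, 2, 7, 0]
weights_ex     = [1.0, 1.0, 0.0, 0.0]
correct = [float(l == p) for l, p in zip(label_ids_ex, predictions_ex)]
weighted_accuracy = sum(w * c for w, c in zip(weights_ex, correct)) / sum(weights_ex)
print(weighted_accuracy)               # 0.5 -- only the two real predictions count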
260 | output_bias = tf.get_variable( 261 | "output_bias", 262 | shape=[bert_config.vocab_size], 263 | initializer=tf.zeros_initializer()) 264 | # logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 265 | # input_tensor=[-1,hidden_size], project_weights=[embedding_size, hidden_size], project_weights_transpose=[hidden_size, embedding_size]--->[-1, embedding_size] 266 | input_project = tf.matmul(input_tensor, project_weights, transpose_b=True) 267 | logits = tf.matmul(input_project, output_weights, transpose_b=True) 268 | # # input_project=[-1, embedding_size], output_weights=[vocab_size, embedding_size], output_weights_transpose=[embedding_size, vocab_size] ---> [-1, vocab_size] 269 | 270 | logits = tf.nn.bias_add(logits, output_bias) 271 | log_probs = tf.nn.log_softmax(logits, axis=-1) 272 | 273 | label_ids = tf.reshape(label_ids, [-1]) 274 | label_weights = tf.reshape(label_weights, [-1]) 275 | 276 | one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) 277 | 278 | # The `positions` tensor might be zero-padded (if the sequence is too 279 | # short to have the maximum number of predictions). The `label_weights` 280 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 281 | # padding predictions. 282 | per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) 283 | numerator = tf.reduce_sum(label_weights * per_example_loss) 284 | denominator = tf.reduce_sum(label_weights) + 1e-5 285 | loss = numerator / denominator 286 | 287 | return (loss, per_example_loss, log_probs) 288 | 289 | 290 | def get_next_sentence_output(bert_config, input_tensor, labels): 291 | """Get loss and log probs for the next sentence prediction.""" 292 | 293 | # Simple binary classification. Note that 0 is "next sentence" and 1 is 294 | # "random sentence". This weight matrix is not used after pre-training. 
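# --- Illustrative numpy sketch (not part of the original file) of the two
# matmuls above. The hidden states gathered at the masked positions are first
# projected back to the small embedding space through project_weights (passed
# in as model.get_embedding_table_2()), and only then multiplied by the tied
# token embedding table output_weights (model.get_embedding_table()) to get
# vocabulary logits. The sizes below are illustrative assumptions, not values
# read from any config file.
import numpy as np

n_preds, hidden_size, embedding_size, vocab_size = 20, 768, 128, 21128
input_tensor    = np.zeros((n_preds, hidden_size))         # gathered masked positions
project_weights = np.zeros((embedding_size, hidden_size))  # [embedding_size, hidden_size]
output_weights  = np.zeros((vocab_size, embedding_size))   # [vocab_size, embedding_size]

input_project = input_tensor @ project_weights.T           # -> (20, 128)
logits        = input_project @ output_weights.T           # -> (20, 21128)
print(input_project.shape, logits.shape)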
295 | with tf.variable_scope("cls/seq_relationship"): 296 | output_weights = tf.get_variable( 297 | "output_weights", 298 | shape=[2, bert_config.hidden_size], 299 | initializer=modeling.create_initializer(bert_config.initializer_range)) 300 | output_bias = tf.get_variable( 301 | "output_bias", shape=[2], initializer=tf.zeros_initializer()) 302 | 303 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 304 | logits = tf.nn.bias_add(logits, output_bias) 305 | log_probs = tf.nn.log_softmax(logits, axis=-1) 306 | labels = tf.reshape(labels, [-1]) 307 | one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) 308 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 309 | loss = tf.reduce_mean(per_example_loss) 310 | return (loss, per_example_loss, log_probs) 311 | 312 | 313 | def gather_indexes(sequence_tensor, positions): 314 | """Gathers the vectors at the specific positions over a minibatch.""" 315 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 316 | batch_size = sequence_shape[0] 317 | seq_length = sequence_shape[1] 318 | width = sequence_shape[2] 319 | 320 | flat_offsets = tf.reshape( 321 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 322 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 323 | flat_sequence_tensor = tf.reshape(sequence_tensor, 324 | [batch_size * seq_length, width]) 325 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 326 | return output_tensor 327 | 328 | 329 | def input_fn_builder(input_files, 330 | max_seq_length, 331 | max_predictions_per_seq, 332 | is_training, 333 | num_cpu_threads=16): 334 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 335 | 336 | def input_fn(params): 337 | """The actual input function.""" 338 | batch_size = params["batch_size"] 339 | 340 | name_to_features = { 341 | "input_ids": 342 | tf.FixedLenFeature([max_seq_length], tf.int64), 343 | "input_mask": 344 | tf.FixedLenFeature([max_seq_length], tf.int64), 345 | "segment_ids": 346 | tf.FixedLenFeature([max_seq_length], tf.int64), 347 | "masked_lm_positions": 348 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 349 | "masked_lm_ids": 350 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 351 | "masked_lm_weights": 352 | tf.FixedLenFeature([max_predictions_per_seq], tf.float32), 353 | "next_sentence_labels": 354 | tf.FixedLenFeature([1], tf.int64), 355 | } 356 | 357 | # For training, we want a lot of parallel reading and shuffling. 358 | # For eval, we want no shuffling and parallel reading doesn't matter. 359 | if is_training: 360 | d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) 361 | d = d.repeat() 362 | d = d.shuffle(buffer_size=len(input_files)) 363 | 364 | # `cycle_length` is the number of parallel files that get read. 365 | cycle_length = min(num_cpu_threads, len(input_files)) 366 | 367 | # `sloppy` mode means that the interleaving is not exact. This adds 368 | # even more randomness to the training pipeline. 369 | d = d.apply( 370 | tf.contrib.data.parallel_interleave( 371 | tf.data.TFRecordDataset, 372 | sloppy=is_training, 373 | cycle_length=cycle_length)) 374 | d = d.shuffle(buffer_size=100) 375 | else: 376 | d = tf.data.TFRecordDataset(input_files) 377 | # Since we evaluate for a fixed number of steps we don't want to encounter 378 | # out-of-range exceptions. 379 | d = d.repeat() 380 | 381 | # We must `drop_remainder` on training because the TPU requires fixed 382 | # size dimensions. 
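# --- Illustrative sketch (plain Python, not part of the original file) of the
# index arithmetic in gather_indexes above: per-example masked positions are
# shifted by i * seq_length so they index into the flattened
# [batch_size * seq_length, width] sequence tensor. Toy sizes below.
batch_size_ex, seq_length_ex = 2, 6
positions_ex = [[1, 4], [0, 3]]                       # masked positions per example
flat_offsets = [i * seq_length_ex for i in range(batch_size_ex)]           # [0, 6]
flat_positions = [p + off for off, row in zip(flat_offsets, positions_ex) for p in row]
print(flat_positions)                                 # [1, 4, 6, 9]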
For eval, we assume we are evaluating on the CPU or GPU 383 | # and we *don't* want to drop the remainder, otherwise we won't cover 384 | # every sample. 385 | d = d.apply( 386 | tf.contrib.data.map_and_batch( 387 | lambda record: _decode_record(record, name_to_features), 388 | batch_size=batch_size, 389 | num_parallel_batches=num_cpu_threads, 390 | drop_remainder=True)) 391 | return d 392 | 393 | return input_fn 394 | 395 | 396 | def _decode_record(record, name_to_features): 397 | """Decodes a record to a TensorFlow example.""" 398 | example = tf.parse_single_example(record, name_to_features) 399 | 400 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 401 | # So cast all int64 to int32. 402 | for name in list(example.keys()): 403 | t = example[name] 404 | if t.dtype == tf.int64: 405 | t = tf.to_int32(t) 406 | example[name] = t 407 | 408 | return example 409 | 410 | 411 | def main(_): 412 | tf.logging.set_verbosity(tf.logging.INFO) 413 | 414 | if not FLAGS.do_train and not FLAGS.do_eval: # must run in training and/or evaluation mode 415 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 416 | 417 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # load the model configuration from the json file 418 | 419 | tf.gfile.MakeDirs(FLAGS.output_dir) 420 | 421 | input_files = [] # the input can be several comma-separated files, or a glob pattern such as "input_x*" 422 | for input_pattern in FLAGS.input_file.split(","): 423 | input_files.extend(tf.gfile.Glob(input_pattern)) 424 | 425 | tf.logging.info("*** Input Files ***") 426 | for input_file in input_files: 427 | tf.logging.info(" %s" % input_file) 428 | 429 | tpu_cluster_resolver = None 430 | if FLAGS.use_tpu and FLAGS.tpu_name: 431 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( # TODO 432 | tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 433 | 434 | print("###tpu_cluster_resolver:",tpu_cluster_resolver,";FLAGS.use_tpu:",FLAGS.use_tpu,";FLAGS.tpu_name:",FLAGS.tpu_name,";FLAGS.tpu_zone:",FLAGS.tpu_zone) 435 | # ###tpu_cluster_resolver: ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470 436 | 437 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 438 | run_config = tf.contrib.tpu.RunConfig( 439 | keep_checkpoint_max=20, # 10 440 | cluster=tpu_cluster_resolver, 441 | master=FLAGS.master, 442 | model_dir=FLAGS.output_dir, 443 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 444 | tpu_config=tf.contrib.tpu.TPUConfig( 445 | iterations_per_loop=FLAGS.iterations_per_loop, 446 | num_shards=FLAGS.num_tpu_cores, 447 | per_host_input_for_training=is_per_host)) 448 | 449 | model_fn = model_fn_builder( 450 | bert_config=bert_config, 451 | init_checkpoint=FLAGS.init_checkpoint, 452 | learning_rate=FLAGS.learning_rate, 453 | num_train_steps=FLAGS.num_train_steps, 454 | num_warmup_steps=FLAGS.num_warmup_steps, 455 | use_tpu=FLAGS.use_tpu, 456 | use_one_hot_embeddings=FLAGS.use_tpu) 457 | 458 | # If TPU is not available, this will fall back to normal Estimator on CPU 459 | # or GPU.
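# --- Illustrative TF1-style sketch (not part of the original file; assumes the
# TensorFlow 1.x APIs already used throughout run_pretraining.py) of what
# _decode_record above does: features are stored in the TFRecords as int64,
# parsed with FixedLenFeature, and then cast to int32 because the TPU does not
# support int64. The feature spec and values below are hypothetical.
import tensorflow as tf

serialized = tf.train.Example(features=tf.train.Features(feature={
    "input_ids": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[101, 2769, 102, 0])),
})).SerializeToString()

parsed = tf.parse_single_example(
    serialized, {"input_ids": tf.FixedLenFeature([4], tf.int64)})
input_ids_int32 = tf.to_int32(parsed["input_ids"])    # int64 -> int32 for the TPU

with tf.Session() as sess:
    print(sess.run(input_ids_int32))                  # [101 2769 102 0]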
460 | estimator = tf.contrib.tpu.TPUEstimator( 461 | use_tpu=FLAGS.use_tpu, 462 | model_fn=model_fn, 463 | config=run_config, 464 | train_batch_size=FLAGS.train_batch_size, 465 | eval_batch_size=FLAGS.eval_batch_size) 466 | 467 | if FLAGS.do_train: 468 | tf.logging.info("***** Running training *****") 469 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 470 | train_input_fn = input_fn_builder( 471 | input_files=input_files, 472 | max_seq_length=FLAGS.max_seq_length, 473 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 474 | is_training=True) 475 | estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) 476 | 477 | if FLAGS.do_eval: 478 | tf.logging.info("***** Running evaluation *****") 479 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 480 | 481 | eval_input_fn = input_fn_builder( 482 | input_files=input_files, 483 | max_seq_length=FLAGS.max_seq_length, 484 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 485 | is_training=False) 486 | 487 | result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) 488 | 489 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 490 | with tf.gfile.GFile(output_eval_file, "w") as writer: 491 | tf.logging.info("***** Eval results *****") 492 | for key in sorted(result.keys()): 493 | tf.logging.info(" %s = %s", key, str(result[key])) 494 | writer.write("%s = %s\n" % (key, str(result[key]))) 495 | 496 | 497 | if __name__ == "__main__": 498 | flags.mark_flag_as_required("input_file") 499 | flags.mark_flag_as_required("bert_config_file") 500 | flags.mark_flag_as_required("output_dir") 501 | tf.app.run() 502 | -------------------------------------------------------------------------------- /data/test.txt: -------------------------------------------------------------------------------- 1 | 美 B-LOC 2 | 国 I-LOC 3 | 的 O 4 | 华 B-PER 5 | 莱 B-PER 6 | 士 B-PER 7 | , O 8 | 我 O 9 | 和 O 10 | 他 O 11 | 谈 O 12 | 笑 O 13 | 风 O 14 | 生 O 15 | 。 O 16 | 17 | 看 O 18 | 包 B-PER 19 | 公 I-PER 20 | 断 O 21 | 案 O 22 | 的 O 23 | 戏 O 24 | , O 25 | 看 O 26 | 他 O 27 | 威 O 28 | 风 O 29 | 凛 O 30 | 凛 O 31 | 坐 O 32 | 公 O 33 | 堂 O 34 | 拍 O 35 | 桌 O 36 | 子 O 37 | 动 O 38 | 刑 O 39 | 具 O 40 | , O 41 | 多 O 42 | 少 O 43 | 还 O 44 | 有 O 45 | 一 O 46 | 点 O 47 | 担 O 48 | 心 O 49 | , O 50 | 总 O 51 | 怕 O 52 | 靠 O 53 | 这 O 54 | 一 O 55 | 套 O 56 | 办 O 57 | 法 O 58 | 弄 O 59 | 出 O 60 | 错 O 61 | 案 O 62 | 来 O 63 | , O 64 | 放 O 65 | 过 O 66 | 了 O 67 | 真 O 68 | 正 O 69 | 的 O 70 | 坏 O 71 | 人 O 72 | ; O 73 | 74 | 可 O 75 | 看 O 76 | 《 O 77 | 包 B-PER 78 | 公 I-PER 79 | 赶 O 80 | 驴 O 81 | 》 O 82 | 这 O 83 | 出 O 84 | 戏 O 85 | , O 86 | 心 O 87 | 里 O 88 | 就 O 89 | 很 O 90 | 踏 O 91 | 实 O 92 | : O 93 | 这 O 94 | 样 O 95 | 是 O 96 | 一 O 97 | 断 O 98 | 一 O 99 | 个 O 100 | 准 O 101 | 的 O 102 | 。 O 103 | 104 | 譬 O 105 | 如 O 106 | 看 O 107 | 《 O 108 | 施 B-PER 109 | 公 O 110 | 案 O 111 | 》 O 112 | , O 113 | 施 B-PER 114 | 大 O 115 | 人 O 116 | 坐 O 117 | 公 O 118 | 堂 O 119 | 问 O 120 | 案 O 121 | 子 O 122 | 不 O 123 | 得 O 124 | 要 O 125 | 领 O 126 | , O 127 | 总 O 128 | 是 O 129 | 扮 O 130 | 成 O 131 | 普 O 132 | 通 O 133 | 百 O 134 | 姓 O 135 | 深 O 136 | 入 O 137 | 民 O 138 | 间 O 139 | 暗 O 140 | 中 O 141 | 查 O 142 | 访 O 143 | , O 144 | 结 O 145 | 果 O 146 | 就 O 147 | 屡 O 148 | 破 O 149 | 奇 O 150 | 案 O 151 | 了 O 152 | 。 O 153 | 154 | 如 O 155 | 果 O 156 | 有 O 157 | 人 O 158 | 问 O 159 | 我 O 160 | : O 161 | “ O 162 | 你 O 163 | 看 O 164 | 过 O 165 | 许 O 166 | 多 O 167 | 包 B-PER 168 | 公 I-PER 169 | 戏 O 170 | , O 171 | 哪 O 172 | 一 O 173 | 出 O 174 | 最 O 175 | 好 O 176 | ? 
O 177 | ” O 178 | 179 | 我 O 180 | 要 O 181 | 毫 O 182 | 不 O 183 | 犹 O 184 | 豫 O 185 | 地 O 186 | 回 O 187 | 答 O 188 | 道 O 189 | : O 190 | “ O 191 | 自 O 192 | 然 O 193 | 是 O 194 | 《 O 195 | 包 B-PER 196 | 公 I-PER 197 | 赶 O 198 | 驴 O 199 | 》 O 200 | 啦 O 201 | ! O 202 | 203 | 包 B-PER 204 | 公 I-PER 205 | 毕 O 206 | 竟 O 207 | 是 O 208 | 包 B-PER 209 | 公 I-PER 210 | , O 211 | 若 O 212 | 是 O 213 | 换 O 214 | 了 O 215 | 好 O 216 | 摆 O 217 | 身 O 218 | 份 O 219 | 的 O 220 | 什 O 221 | 么 O 222 | 公 O 223 | , O 224 | 便 O 225 | 要 O 226 | 先 O 227 | 派 O 228 | 人 O 229 | 通 O 230 | 报 O 231 | , O 232 | 然 O 233 | 后 O 234 | 由 O 235 | 卫 O 236 | 士 O 237 | 前 O 238 | 呼 O 239 | 后 O 240 | 拥 O 241 | 而 O 242 | 去 O 243 | , O 244 | 如 O 245 | 何 O 246 | 查 O 247 | 得 O 248 | 出 O 249 | 实 O 250 | 情 O 251 | ! O 252 | ” O 253 | ( O 254 | 马 B-PER 255 | 得 I-PER 256 | / O 257 | 画 O 258 | ) O 259 | 260 | 学 O 261 | 习 O 262 | 基 O 263 | 本 O 264 | 法 O 265 | 顺 O 266 | 利 O 267 | 迎 O 268 | 回 O 269 | 归 O 270 | 271 | 本 O 272 | 报 O 273 | 评 O 274 | 论 O 275 | 员 O 276 | 277 | 再 O 278 | 过 O 279 | 5 O 280 | 5 O 281 | 天 O 282 | , O 283 | 我 O 284 | 国 O 285 | 政 O 286 | 府 O 287 | 将 O 288 | 对 O 289 | 香 B-LOC 290 | 港 I-LOC 291 | 恢 O 292 | 复 O 293 | 行 O 294 | 使 O 295 | 主 O 296 | 权 O 297 | 。 O 298 | 299 | 在 O 300 | 香 B-LOC 301 | 港 I-LOC 302 | 回 O 303 | 归 O 304 | 前 O 305 | 的 O 306 | 最 O 307 | 后 O 308 | 阶 O 309 | 段 O 310 | , O 311 | 中 B-ORG 312 | 共 I-ORG 313 | 中 I-ORG 314 | 央 I-ORG 315 | 举 O 316 | 办 O 317 | 《 O 318 | “ O 319 | 一 O 320 | 国 O 321 | 两 O 322 | 制 O 323 | ” O 324 | 与 O 325 | 香 B-LOC 326 | 港 I-LOC 327 | 基 O 328 | 本 O 329 | 法 O 330 | 》 O 331 | 讲 O 332 | 座 O 333 | , O 334 | 中 O 335 | 央 O 336 | 领 O 337 | 导 O 338 | 同 O 339 | 志 O 340 | 认 O 341 | 真 O 342 | 听 O 343 | 讲 O 344 | , O 345 | 虚 O 346 | 心 O 347 | 学 O 348 | 习 O 349 | , O 350 | 很 O 351 | 有 O 352 | 意 O 353 | 义 O 354 | 。 O 355 | 356 | 这 O 357 | 表 O 358 | 明 O 359 | , O 360 | 以 O 361 | 江 B-PER 362 | 泽 I-PER 363 | 民 I-PER 364 | 同 O 365 | 志 O 366 | 为 O 367 | 核 O 368 | 心 O 369 | 的 O 370 | 党 B-ORG 371 | 中 I-ORG 372 | 央 I-ORG 373 | 坚 O 374 | 定 O 375 | 不 O 376 | 移 O 377 | 地 O 378 | 贯 O 379 | 彻 O 380 | 邓 B-PER 381 | 小 I-PER 382 | 平 I-PER 383 | 同 O 384 | 志 O 385 | “ O 386 | 一 O 387 | 国 O 388 | 两 O 389 | 制 O 390 | ” O 391 | 的 O 392 | 伟 O 393 | 大 O 394 | 构 O 395 | 想 O 396 | , O 397 | 不 O 398 | 折 O 399 | 不 O 400 | 扣 O 401 | 地 O 402 | 执 O 403 | 行 O 404 | 基 O 405 | 本 O 406 | 法 O 407 | 。 O 408 | 409 | “ O 410 | 一 O 411 | 国 O 412 | 两 O 413 | 制 O 414 | ” O 415 | 是 O 416 | 邓 B-PER 417 | 小 I-PER 418 | 平 I-PER 419 | 同 O 420 | 志 O 421 | 的 O 422 | 一 O 423 | 个 O 424 | 伟 O 425 | 大 O 426 | 构 O 427 | 想 O 428 | , O 429 | 《 O 430 | 中 B-LOC 431 | 华 I-LOC 432 | 人 I-LOC 433 | 民 I-LOC 434 | 共 I-LOC 435 | 和 I-LOC 436 | 国 I-LOC 437 | 香 B-LOC 438 | 港 I-LOC 439 | 特 I-LOC 440 | 别 I-LOC 441 | 行 I-LOC 442 | 政 I-LOC 443 | 区 I-LOC 444 | 基 O 445 | 本 O 446 | 法 O 447 | 》 O 448 | 是 O 449 | 贯 O 450 | 彻 O 451 | 落 O 452 | 实 O 453 | “ O 454 | 一 O 455 | 国 O 456 | 两 O 457 | 制 O 458 | ” O 459 | 伟 O 460 | 大 O 461 | 构 O 462 | 想 O 463 | 的 O 464 | 一 O 465 | 部 O 466 | 全 O 467 | 国 O 468 | 性 O 469 | 法 O 470 | 律 O 471 | , O 472 | 是 O 473 | 一 O 474 | 部 O 475 | 有 O 476 | 鲜 O 477 | 明 O 478 | 中 B-LOC 479 | 国 I-LOC 480 | 特 O 481 | 色 O 482 | 的 O 483 | 法 O 484 | 律 O 485 | 。 O 486 | 487 | 它 O 488 | 把 O 489 | 中 O 490 | 央 O 491 | 对 O 492 | 解 O 493 | 决 O 494 | 香 B-LOC 495 | 港 I-LOC 496 | 问 O 497 | 题 O 498 | 的 O 499 | 基 O 500 | 本 O 501 | 方 O 502 | 针 O 503 | 政 O 504 | 策 O 505 | 具 O 506 | 体 O 507 | 化 O 508 | 、 O 509 | 法 O 510 | 律 O 511 | 化 O 512 | , O 513 | 成 O 514 | 为 O 515 | 国 O 516 | 
家 O 517 | 意 O 518 | 志 O 519 | 。 O 520 | 521 | 学 O 522 | 习 O 523 | 基 O 524 | 本 O 525 | 法 O 526 | , O 527 | 顺 O 528 | 利 O 529 | 迎 O 530 | 回 O 531 | 归 O 532 | , O 533 | 是 O 534 | 一 O 535 | 项 O 536 | 迫 O 537 | 切 O 538 | 的 O 539 | 任 O 540 | 务 O 541 | 。 O 542 | 543 | 要 O 544 | 学 O 545 | 好 O 546 | 基 O 547 | 本 O 548 | 法 O 549 | , O 550 | 首 O 551 | 先 O 552 | 要 O 553 | 认 O 554 | 识 O 555 | 到 O 556 | 基 O 557 | 本 O 558 | 法 O 559 | 的 O 560 | 意 O 561 | 义 O 562 | 。 O 563 | 564 | 邓 B-PER 565 | 小 I-PER 566 | 平 I-PER 567 | 同 O 568 | 志 O 569 | 生 O 570 | 前 O 571 | 高 O 572 | 度 O 573 | 评 O 574 | 价 O 575 | 这 O 576 | 部 O 577 | 法 O 578 | 律 O 579 | , O 580 | 他 O 581 | 指 O 582 | 出 O 583 | : O 584 | “ O 585 | 说 O 586 | 它 O 587 | 具 O 588 | 有 O 589 | 历 O 590 | 史 O 591 | 意 O 592 | 义 O 593 | , O 594 | 不 O 595 | 只 O 596 | 对 O 597 | 过 O 598 | 去 O 599 | 、 O 600 | 现 O 601 | 在 O 602 | , O 603 | 而 O 604 | 且 O 605 | 包 O 606 | 括 O 607 | 将 O 608 | 来 O 609 | ; O 610 | 611 | 说 O 612 | 国 O 613 | 际 O 614 | 意 O 615 | 义 O 616 | , O 617 | 不 O 618 | 只 O 619 | 对 O 620 | 第 O 621 | 三 O 622 | 世 O 623 | 界 O 624 | , O 625 | 而 O 626 | 且 O 627 | 对 O 628 | 全 O 629 | 人 O 630 | 类 O 631 | 都 O 632 | 具 O 633 | 有 O 634 | 长 O 635 | 远 O 636 | 意 O 637 | 义 O 638 | 。 O 639 | 640 | 这 O 641 | 是 O 642 | 一 O 643 | 个 O 644 | 具 O 645 | 有 O 646 | 创 O 647 | 造 O 648 | 性 O 649 | 的 O 650 | 杰 O 651 | 作 O 652 | 。 O 653 | 654 | “ O 655 | 基 O 656 | 本 O 657 | 法 O 658 | 不 O 659 | 仅 O 660 | 为 O 661 | 确 O 662 | 保 O 663 | 香 B-LOC 664 | 港 I-LOC 665 | 平 O 666 | 稳 O 667 | 过 O 668 | 渡 O 669 | 发 O 670 | 挥 O 671 | 重 O 672 | 要 O 673 | 作 O 674 | 用 O 675 | , O 676 | 也 O 677 | 为 O 678 | 确 O 679 | 保 O 680 | 香 B-LOC 681 | 港 I-LOC 682 | 长 O 683 | 期 O 684 | 繁 O 685 | 荣 O 686 | 稳 O 687 | 定 O 688 | 发 O 689 | 挥 O 690 | 重 O 691 | 要 O 692 | 作 O 693 | 用 O 694 | ; O 695 | 696 | 不 O 697 | 仅 O 698 | 为 O 699 | 当 O 700 | 前 O 701 | 解 O 702 | 决 O 703 | 香 B-LOC 704 | 港 I-LOC 705 | 问 O 706 | 题 O 707 | 发 O 708 | 挥 O 709 | 作 O 710 | 用 O 711 | , O 712 | 也 O 713 | 为 O 714 | 在 O 715 | 不 O 716 | 远 O 717 | 的 O 718 | 将 O 719 | 来 O 720 | 解 O 721 | 决 O 722 | 澳 B-LOC 723 | 门 I-LOC 724 | 问 O 725 | 题 O 726 | 和 O 727 | 最 O 728 | 终 O 729 | 解 O 730 | 决 O 731 | 台 B-LOC 732 | 湾 I-LOC 733 | 问 O 734 | 题 O 735 | , O 736 | 实 O 737 | 现 O 738 | 祖 O 739 | 国 O 740 | 完 O 741 | 全 O 742 | 统 O 743 | 一 O 744 | 发 O 745 | 挥 O 746 | 重 O 747 | 要 O 748 | 作 O 749 | 用 O 750 | 。 O 751 | 752 | 基 O 753 | 本 O 754 | 法 O 755 | 的 O 756 | 主 O 757 | 要 O 758 | 特 O 759 | 征 O 760 | , O 761 | 是 O 762 | 把 O 763 | “ O 764 | 一 O 765 | 国 O 766 | ” O 767 | 与 O 768 | “ O 769 | 两 O 770 | 制 O 771 | ” O 772 | 紧 O 773 | 密 O 774 | 结 O 775 | 合 O 776 | , O 777 | 维 O 778 | 护 O 779 | 国 O 780 | 家 O 781 | 的 O 782 | 主 O 783 | 权 O 784 | 、 O 785 | 统 O 786 | 一 O 787 | 和 O 788 | 领 O 789 | 土 O 790 | 完 O 791 | 整 O 792 | 与 O 793 | 授 O 794 | 权 O 795 | 香 B-LOC 796 | 港 I-LOC 797 | 特 I-LOC 798 | 别 I-LOC 799 | 行 I-LOC 800 | 政 I-LOC 801 | 区 I-LOC 802 | 实 O 803 | 行 O 804 | 高 O 805 | 度 O 806 | 自 O 807 | 治 O 808 | 紧 O 809 | 密 O 810 | 结 O 811 | 合 O 812 | 。 O 813 | 814 | 在 O 815 | 一 O 816 | 个 O 817 | 统 O 818 | 一 O 819 | 的 O 820 | 中 B-LOC 821 | 华 I-LOC 822 | 人 I-LOC 823 | 民 I-LOC 824 | 共 I-LOC 825 | 和 I-LOC 826 | 国 I-LOC 827 | , O 828 | 可 O 829 | 以 O 830 | 实 O 831 | 行 O 832 | 社 O 833 | 会 O 834 | 主 O 835 | 义 O 836 | 和 O 837 | 资 O 838 | 本 O 839 | 主 O 840 | 义 O 841 | 两 O 842 | 种 O 843 | 制 O 844 | 度 O 845 | , O 846 | 这 O 847 | 是 O 848 | 为 O 849 | 了 O 850 | 民 O 851 | 族 O 852 | 、 O 853 | 国 O 854 | 家 O 855 | 的 O 856 | 根 O 857 | 本 O 858 | 利 O 859 | 益 O 860 | 。 O 861 | 862 | 只 O 863 | 有 O 864 | 认 O 
865 | 真 O 866 | 学 O 867 | 习 O 868 | , O 869 | 才 O 870 | 能 O 871 | 理 O 872 | 解 O 873 | 意 O 874 | 义 O 875 | , O 876 | 认 O 877 | 识 O 878 | 特 O 879 | 征 O 880 | 。 O 881 | 882 | 制 O 883 | 定 O 884 | 一 O 885 | 部 O 886 | 好 O 887 | 法 O 888 | 律 O 889 | , O 890 | 很 O 891 | 不 O 892 | 容 O 893 | 易 O 894 | ; O 895 | 896 | 遵 O 897 | 守 O 898 | 法 O 899 | 律 O 900 | , O 901 | 执 O 902 | 行 O 903 | 法 O 904 | 律 O 905 | , O 906 | 也 O 907 | 很 O 908 | 不 O 909 | 容 O 910 | 易 O 911 | 。 O 912 | 913 | 必 O 914 | 须 O 915 | 重 O 916 | 申 O 917 | , O 918 | 有 O 919 | 法 O 920 | 必 O 921 | 依 O 922 | , O 923 | 执 O 924 | 法 O 925 | 必 O 926 | 严 O 927 | , O 928 | 违 O 929 | 法 O 930 | 必 O 931 | 究 O 932 | 。 O 933 | 934 | 基 O 935 | 本 O 936 | 法 O 937 | 作 O 938 | 为 O 939 | 一 O 940 | 部 O 941 | 全 O 942 | 国 O 943 | 性 O 944 | 的 O 945 | 法 O 946 | 律 O 947 | , O 948 | 不 O 949 | 仅 O 950 | 香 B-LOC 951 | 港 I-LOC 952 | 要 O 953 | 严 O 954 | 格 O 955 | 遵 O 956 | 守 O 957 | , O 958 | 各 O 959 | 省 O 960 | 、 O 961 | 自 O 962 | 治 O 963 | 区 O 964 | 、 O 965 | 直 O 966 | 辖 O 967 | 市 O 968 | 都 O 969 | 要 O 970 | 严 O 971 | 格 O 972 | 遵 O 973 | 守 O 974 | 。 O 975 | 976 | 从 O 977 | 中 B-ORG 978 | 共 I-ORG 979 | 中 I-ORG 980 | 央 I-ORG 981 | 举 O 982 | 办 O 983 | 这 O 984 | 个 O 985 | 讲 O 986 | 座 O 987 | , O 988 | 可 O 989 | 以 O 990 | 看 O 991 | 出 O 992 | , O 993 | 党 O 994 | 和 O 995 | 政 O 996 | 府 O 997 | 正 O 998 | 在 O 999 | 努 O 1000 | 力 O 1001 | 加 O 1002 | 强 O 1003 | 法 O 1004 | 制 O 1005 | 建 O 1006 | 设 O 1007 | , O 1008 | 坚 O 1009 | 持 O 1010 | 依 O 1011 | 法 O 1012 | 治 O 1013 | 国 O 1014 | 。 O 1015 | 1016 | 有 O 1017 | 了 O 1018 | 法 O 1019 | 律 O 1020 | , O 1021 | 有 O 1022 | 了 O 1023 | 制 O 1024 | 度 O 1025 | , O 1026 | 就 O 1027 | 有 O 1028 | 了 O 1029 | 保 O 1030 | 证 O 1031 | , O 1032 | 就 O 1033 | 使 O 1034 | “ O 1035 | 一 O 1036 | 国 O 1037 | 两 O 1038 | 制 O 1039 | ” O 1040 | 的 O 1041 | 伟 O 1042 | 大 O 1043 | 构 O 1044 | 想 O 1045 | 以 O 1046 | 法 O 1047 | 律 O 1048 | 的 O 1049 | 形 O 1050 | 式 O 1051 | 固 O 1052 | 定 O 1053 | 下 O 1054 | 来 O 1055 | 。 O 1056 | 1057 | 全 O 1058 | 国 O 1059 | 人 O 1060 | 民 O 1061 | 特 O 1062 | 别 O 1063 | 是 O 1064 | 香 B-LOC 1065 | 港 I-LOC 1066 | 同 O 1067 | 胞 O 1068 | 也 O 1069 | 从 O 1070 | 中 O 1071 | 再 O 1072 | 一 O 1073 | 次 O 1074 | 看 O 1075 | 到 O 1076 | , O 1077 | 中 B-ORG 1078 | 国 I-ORG 1079 | 共 I-ORG 1080 | 产 I-ORG 1081 | 党 I-ORG 1082 | 和 O 1083 | 人 O 1084 | 民 O 1085 | 政 O 1086 | 府 O 1087 | 是 O 1088 | 高 O 1089 | 度 O 1090 | 负 O 1091 | 责 O 1092 | 任 O 1093 | 的 O 1094 | 党 O 1095 | 和 O 1096 | 政 O 1097 | 府 O 1098 | , O 1099 | 一 O 1100 | 切 O 1101 | 从 O 1102 | 人 O 1103 | 民 O 1104 | 的 O 1105 | 利 O 1106 | 益 O 1107 | 出 O 1108 | 发 O 1109 | , O 1110 | 一 O 1111 | 切 O 1112 | 为 O 1113 | 了 O 1114 | 祖 O 1115 | 国 O 1116 | 的 O 1117 | 繁 O 1118 | 荣 O 1119 | 富 O 1120 | 强 O 1121 | , O 1122 | 香 B-LOC 1123 | 港 I-LOC 1124 | 的 O 1125 | 明 O 1126 | 天 O 1127 | 将 O 1128 | 更 O 1129 | 美 O 1130 | 好 O 1131 | 。 O 1132 | 1133 | 学 O 1134 | 习 O 1135 | 基 O 1136 | 本 O 1137 | 法 O 1138 | , O 1139 | 中 O 1140 | 央 O 1141 | 领 O 1142 | 导 O 1143 | 带 O 1144 | 了 O 1145 | 个 O 1146 | 好 O 1147 | 头 O 1148 | 。 O 1149 | 1150 | 全 O 1151 | 党 O 1152 | 和 O 1153 | 全 O 1154 | 国 O 1155 | 人 O 1156 | 民 O 1157 | 特 O 1158 | 别 O 1159 | 是 O 1160 | 各 O 1161 | 级 O 1162 | 党 O 1163 | 政 O 1164 | 领 O 1165 | 导 O 1166 | 干 O 1167 | 部 O 1168 | , O 1169 | 都 O 1170 | 要 O 1171 | 重 O 1172 | 视 O 1173 | 学 O 1174 | 习 O 1175 | 。 O 1176 | 1177 | 只 O 1178 | 有 O 1179 | 学 O 1180 | 习 O 1181 | 好 O 1182 | , O 1183 | 才 O 1184 | 能 O 1185 | 贯 O 1186 | 彻 O 1187 | 好 O 1188 | 。 O 1189 | 1190 | 为 O 1191 | 了 O 1192 | 迎 O 1193 | 接 O 1194 | 香 B-LOC 1195 | 港 I-LOC 1196 | 顺 O 1197 | 利 O 
1198 | 回 O 1199 | 归 O 1200 | 祖 O 1201 | 国 O 1202 | 这 O 1203 | 一 O 1204 | 中 B-LOC 1205 | 华 I-LOC 1206 | 民 O 1207 | 族 O 1208 | 的 O 1209 | 盛 O 1210 | 事 O 1211 | , O 1212 | 首 O 1213 | 先 O 1214 | 要 O 1215 | 有 O 1216 | 一 O 1217 | 个 O 1218 | 扎 O 1219 | 实 O 1220 | 的 O 1221 | 思 O 1222 | 想 O 1223 | 准 O 1224 | 备 O 1225 | 和 O 1226 | 良 O 1227 | 好 O 1228 | 的 O 1229 | 精 O 1230 | 神 O 1231 | 状 O 1232 | 态 O 1233 | 。 O 1234 | 1235 | 基 O 1236 | 本 O 1237 | 法 O 1238 | 连 O 1239 | 着 O 1240 | 你 O 1241 | 我 O 1242 | 他 O 1243 | 1244 | 叶 B-PER 1245 | 秋 I-PER 1246 | 1247 | 赠 O 1248 | 书 O 1249 | 想 O 1250 | 来 O 1251 | 是 O 1252 | 香 B-LOC 1253 | 港 I-LOC 1254 | 同 O 1255 | 胞 O 1256 | 的 O 1257 | 一 O 1258 | 种 O 1259 | 文 O 1260 | 明 O 1261 | 礼 O 1262 | 仪 O 1263 | 。 O 1264 | 1265 | 抵 O 1266 | 港 B-LOC 1267 | 仅 O 1268 | 数 O 1269 | 日 O 1270 | , O 1271 | 就 O 1272 | 收 O 1273 | 到 O 1274 | 厚 O 1275 | 厚 O 1276 | 几 O 1277 | 摞 O 1278 | 书 O 1279 | 。 O 1280 | 1281 | 匆 O 1282 | 匆 O 1283 | 翻 O 1284 | 阅 O 1285 | 一 O 1286 | 遍 O 1287 | , O 1288 | 发 O 1289 | 现 O 1290 | 各 O 1291 | 种 O 1292 | 版 O 1293 | 本 O 1294 | 的 O 1295 | 《 O 1296 | 中 B-LOC 1297 | 华 I-LOC 1298 | 人 I-LOC 1299 | 民 I-LOC 1300 | 共 I-LOC 1301 | 和 I-LOC 1302 | 国 I-LOC 1303 | 香 B-LOC 1304 | 港 I-LOC 1305 | 特 I-LOC 1306 | 别 I-LOC 1307 | 行 I-LOC 1308 | 政 I-LOC 1309 | 区 I-LOC 1310 | 基 O 1311 | 本 O 1312 | 法 O 1313 | 》 O 1314 | 竟 O 1315 | 有 O 1316 | 六 O 1317 | 册 O 1318 | 之 O 1319 | 多 O 1320 | , O 1321 | 推 O 1322 | 介 O 1323 | 普 O 1324 | 及 O 1325 | 基 O 1326 | 本 O 1327 | 法 O 1328 | 的 O 1329 | 书 O 1330 | 籍 O 1331 | 还 O 1332 | 要 O 1333 | 多 O 1334 | 。 O 1335 | 1336 | 应 O 1337 | 约 O 1338 | 去 O 1339 | 湾 B-LOC 1340 | 仔 I-LOC 1341 | 道 I-LOC 1342 | 谈 O 1343 | 事 O 1344 | , O 1345 | 路 O 1346 | 过 O 1347 | 一 O 1348 | 个 O 1349 | 名 O 1350 | 为 O 1351 | “ O 1352 | 艺 O 1353 | 美 O 1354 | ” O 1355 | 的 O 1356 | 书 O 1357 | 店 O 1358 | , O 1359 | 看 O 1360 | 到 O 1361 | 摆 O 1362 | 放 O 1363 | 在 O 1364 | 最 O 1365 | 抢 O 1366 | 眼 O 1367 | 位 O 1368 | 置 O 1369 | 的 O 1370 | 也 O 1371 | 是 O 1372 | 基 O 1373 | 本 O 1374 | 法 O 1375 | 及 O 1376 | 其 O 1377 | 推 O 1378 | 介 O 1379 | 图 O 1380 | 书 O 1381 | 。 O 1382 | 1383 | 迎 O 1384 | 街 O 1385 | 介 O 1386 | 绍 O 1387 | 新 O 1388 | 书 O 1389 | 的 O 1390 | 告 O 1391 | 示 O 1392 | 上 O 1393 | 醒 O 1394 | 目 O 1395 | 地 O 1396 | 写 O 1397 | 着 O 1398 | : O 1399 | “ O 1400 | 基 O 1401 | 本 O 1402 | 法 O 1403 | 连 O 1404 | 着 O 1405 | 你 O 1406 | 、 O 1407 | 我 O 1408 | 、 O 1409 | 他 O 1410 | , O 1411 | 让 O 1412 | 我 O 1413 | 们 O 1414 | 都 O 1415 | 来 O 1416 | 认 O 1417 | 识 O 1418 | 基 O 1419 | 本 O 1420 | 法 O 1421 | 。 O 1422 | ” O 1423 | 1424 | 由 O 1425 | 此 O 1426 | 可 O 1427 | 见 O 1428 | , O 1429 | 在 O 1430 | 法 O 1431 | 制 O 1432 | 观 O 1433 | 念 O 1434 | 很 O 1435 | 强 O 1436 | 的 O 1437 | 港 B-LOC 1438 | 人 O 1439 | 心 O 1440 | 目 O 1441 | 中 O 1442 | , O 1443 | 基 O 1444 | 本 O 1445 | 法 O 1446 | 具 O 1447 | 有 O 1448 | 极 O 1449 | 大 O 1450 | 的 O 1451 | 权 O 1452 | 威 O 1453 | 和 O 1454 | 尊 O 1455 | 严 O 1456 | 。 O 1457 | 1458 | 香 B-LOC 1459 | 港 I-LOC 1460 | 各 O 1461 | 界 O 1462 | 人 O 1463 | 士 O 1464 | 从 O 1465 | 各 O 1466 | 自 O 1467 | 的 O 1468 | 角 O 1469 | 度 O 1470 | 去 O 1471 | 高 O 1472 | 度 O 1473 | 评 O 1474 | 价 O 1475 | 它 O 1476 | 。 O 1477 | 1478 | 行 O 1479 | 政 O 1480 | 官 O 1481 | 员 O 1482 | 表 O 1483 | 示 O 1484 | : O 1485 | “ O 1486 | 香 B-LOC 1487 | 港 I-LOC 1488 | 继 O 1489 | 续 O 1490 | 繁 O 1491 | 荣 O 1492 | 稳 O 1493 | 定 O 1494 | 、 O 1495 | 实 O 1496 | 现 O 1497 | 香 B-LOC 1498 | 港 I-LOC 1499 | 梦 O 1500 | 的 O 1501 | 成 O 1502 | 功 O 1503 | 要 O 1504 | 素 O 1505 | , O 1506 | 在 O 1507 | 基 O 1508 | 本 O 1509 | 法 O 1510 | 中 O 1511 | 得 O 1512 | 到 O 1513 
| 了 O 1514 | 充 O 1515 | 分 O 1516 | 保 O 1517 | 证 O 1518 | 。 O 1519 | ” O 1520 | 1521 | 法 O 1522 | 律 O 1523 | 界 O 1524 | 人 O 1525 | 士 O 1526 | 认 O 1527 | 为 O 1528 | : O 1529 | “ O 1530 | 法 O 1531 | 治 O 1532 | 精 O 1533 | 神 O 1534 | 能 O 1535 | 否 O 1536 | 继 O 1537 | 续 O 1538 | 保 O 1539 | 持 O 1540 | , O 1541 | 基 O 1542 | 本 O 1543 | 法 O 1544 | 已 O 1545 | 作 O 1546 | 了 O 1547 | 明 O 1548 | 确 O 1549 | 规 O 1550 | 定 O 1551 | 。 O 1552 | 1553 | 只 O 1554 | 要 O 1555 | 恪 O 1556 | 守 O 1557 | 广 O 1558 | 大 O 1559 | 港 B-LOC 1560 | 人 O 1561 | 认 O 1562 | 受 O 1563 | 的 O 1564 | 香 B-LOC 1565 | 港 I-LOC 1566 | 法 O 1567 | 律 O 1568 | 体 O 1569 | 系 O 1570 | 中 O 1571 | 的 O 1572 | 这 O 1573 | 个 O 1574 | 总 O 1575 | 纲 O 1576 | 纪 O 1577 | 、 O 1578 | 总 O 1579 | 章 O 1580 | 程 O 1581 | , O 1582 | 香 B-LOC 1583 | 港 I-LOC 1584 | 将 O 1585 | 健 O 1586 | 步 O 1587 | 迈 O 1588 | 向 O 1589 | 新 O 1590 | 世 O 1591 | 纪 O 1592 | 。 O 1593 | ” O 1594 | 1595 | 劳 O 1596 | 工 O 1597 | 界 O 1598 | 的 O 1599 | 成 O 1600 | 员 O 1601 | 说 O 1602 | , O 1603 | 涉 O 1604 | 及 O 1605 | 保 O 1606 | 障 O 1607 | 劳 O 1608 | 工 O 1609 | 合 O 1610 | 法 O 1611 | 权 O 1612 | 益 O 1613 | 的 O 1614 | 条 O 1615 | 款 O 1616 | , O 1617 | “ O 1618 | 香 B-LOC 1619 | 港 I-LOC 1620 | 现 O 1621 | 在 O 1622 | 有 O 1623 | 的 O 1624 | , O 1625 | 基 O 1626 | 本 O 1627 | 法 O 1628 | 都 O 1629 | 保 O 1630 | 持 O 1631 | 了 O 1632 | ; O 1633 | 1634 | 香 B-LOC 1635 | 港 I-LOC 1636 | 现 O 1637 | 在 O 1638 | 没 O 1639 | 有 O 1640 | 的 O 1641 | , O 1642 | 基 O 1643 | 本 O 1644 | 法 O 1645 | 里 O 1646 | 也 O 1647 | 有 O 1648 | 了 O 1649 | 。 O 1650 | 1651 | 大 O 1652 | 家 O 1653 | 因 O 1654 | 此 O 1655 | 吃 O 1656 | 了 O 1657 | 定 O 1658 | 心 O 1659 | 丸 O 1660 | 。 O 1661 | ” O 1662 | 1663 | 基 O 1664 | 本 O 1665 | 法 O 1666 | 受 O 1667 | 到 O 1668 | 港 B-LOC 1669 | 人 O 1670 | 的 O 1671 | 普 O 1672 | 遍 O 1673 | 欢 O 1674 | 迎 O 1675 | 和 O 1676 | 高 O 1677 | 度 O 1678 | 重 O 1679 | 视 O 1680 | 是 O 1681 | 势 O 1682 | 所 O 1683 | 必 O 1684 | 然 O 1685 | 。 O 1686 | 1687 | 历 O 1688 | 时 O 1689 | 四 O 1690 | 年 O 1691 | 零 O 1692 | 八 O 1693 | 个 O 1694 | 月 O 1695 | 、 O 1696 | 凝 O 1697 | 聚 O 1698 | 了 O 1699 | 香 B-LOC 1700 | 港 I-LOC 1701 | 和 O 1702 | 内 O 1703 | 地 O 1704 | 无 O 1705 | 数 O 1706 | 人 O 1707 | 的 O 1708 | 智 O 1709 | 慧 O 1710 | 而 O 1711 | 制 O 1712 | 定 O 1713 | 的 O 1714 | 基 O 1715 | 本 O 1716 | 法 O 1717 | , O 1718 | 将 O 1719 | 邓 B-PER 1720 | 小 I-PER 1721 | 平 I-PER 1722 | 同 O 1723 | 志 O 1724 | 倡 O 1725 | 导 O 1726 | 的 O 1727 | “ O 1728 | 一 O 1729 | 国 O 1730 | 两 O 1731 | 制 O 1732 | ” O 1733 | 伟 O 1734 | 大 O 1735 | 构 O 1736 | 想 O 1737 | 以 O 1738 | 法 O 1739 | 律 O 1740 | 形 O 1741 | 式 O 1742 | 固 O 1743 | 定 O 1744 | 下 O 1745 | 来 O 1746 | , O 1747 | 成 O 1748 | 为 O 1749 | 国 O 1750 | 家 O 1751 | 和 O 1752 | 人 O 1753 | 民 O 1754 | 的 O 1755 | 意 O 1756 | 志 O 1757 | 。 O 1758 | 1759 | 邓 B-PER 1760 | 小 I-PER 1761 | 平 I-PER 1762 | 赞 O 1763 | 许 O 1764 | : O 1765 | 基 O 1766 | 本 O 1767 | 法 O 1768 | 具 O 1769 | 有 O 1770 | 历 O 1771 | 史 O 1772 | 意 O 1773 | 义 O 1774 | 和 O 1775 | 国 O 1776 | 际 O 1777 | 意 O 1778 | 义 O 1779 | , O 1780 | 是 O 1781 | 一 O 1782 | 个 O 1783 | 具 O 1784 | 有 O 1785 | 创 O 1786 | 造 O 1787 | 性 O 1788 | 的 O 1789 | 杰 O 1790 | 作 O 1791 | 。 O 1792 | 1793 | 基 O 1794 | 本 O 1795 | 法 O 1796 | 既 O 1797 | 是 O 1798 | 香 B-LOC 1799 | 港 I-LOC 1800 | 回 O 1801 | 归 O 1802 | 后 O 1803 | 特 B-LOC 1804 | 区 I-LOC 1805 | 一 O 1806 | 切 O 1807 | 运 O 1808 | 作 O 1809 | 的 O 1810 | 法 O 1811 | 律 O 1812 | 基 O 1813 | 础 O 1814 | , O 1815 | 更 O 1816 | 是 O 1817 | 保 O 1818 | 持 O 1819 | 香 B-LOC 1820 | 港 I-LOC 1821 | 长 O 1822 | 期 O 1823 | 稳 O 1824 | 定 O 1825 | 繁 O 1826 | 荣 O 1827 | 的 O 1828 | 法 O 1829 | 律 O 1830 | 保 O 
1831 | 证 O 1832 | 。 O 1833 | 1834 | 实 O 1835 | 践 O 1836 | 已 O 1837 | 经 O 1838 | 并 O 1839 | 将 O 1840 | 继 O 1841 | 续 O 1842 | 证 O 1843 | 明 O 1844 | 这 O 1845 | 一 O 1846 | 点 O 1847 | 。 O 1848 | 1849 | 董 B-PER 1850 | 建 I-PER 1851 | 华 I-PER 1852 | 先 O 1853 | 生 O 1854 | 近 O 1855 | 日 O 1856 | 撰 O 1857 | 文 O 1858 | 称 O 1859 | “ O 1860 | 基 O 1861 | 本 O 1862 | 法 O 1863 | 是 O 1864 | ‘ O 1865 | 一 O 1866 | 国 O 1867 | 两 O 1868 | 制 O 1869 | ’ O 1870 | 的 O 1871 | 一 O 1872 | 次 O 1873 | 成 O 1874 | 功 O 1875 | 实 O 1876 | 践 O 1877 | 。 O 1878 | ” O 1879 | 1880 | 说 O 1881 | 来 O 1882 | 也 O 1883 | 巧 O 1884 | , O 1885 | 姬 B-PER 1886 | 鹏 I-PER 1887 | 飞 I-PER 1888 | 同 O 1889 | 志 O 1890 | 1 O 1891 | 9 O 1892 | 9 O 1893 | 0 O 1894 | 年 O 1895 | 4 O 1896 | 月 O 1897 | 在 O 1898 | 邓 B-PER 1899 | 小 I-PER 1900 | 平 I-PER 1901 | 同 O 1902 | 志 O 1903 | 题 O 1904 | 写 O 1905 | 书 O 1906 | 名 O 1907 | 的 O 1908 | 《 O 1909 | 基 O 1910 | 本 O 1911 | 法 O 1912 | 的 O 1913 | 诞 O 1914 | 生 O 1915 | 》 O 1916 | 一 O 1917 | 书 O 1918 | 序 O 1919 | 言 O 1920 | 中 O 1921 | 也 O 1922 | 写 O 1923 | 了 O 1924 | 同 O 1925 | 样 O 1926 | 的 O 1927 | 话 O 1928 | 。 O 1929 | 1930 | 真 O 1931 | 可 O 1932 | 谓 O 1933 | 仁 O 1934 | 者 O 1935 | 智 O 1936 | 者 O 1937 | 所 O 1938 | 见 O 1939 | 略 O 1940 | 同 O 1941 | 。 O 1942 | 1943 | 基 O 1944 | 本 O 1945 | 法 O 1946 | 是 O 1947 | 一 O 1948 | 部 O 1949 | 具 O 1950 | 有 O 1951 | 普 O 1952 | 遍 O 1953 | 约 O 1954 | 束 O 1955 | 力 O 1956 | 的 O 1957 | 重 O 1958 | 要 O 1959 | 法 O 1960 | 律 O 1961 | 。 O 1962 | 1963 | 7 O 1964 | 月 O 1965 | 1 O 1966 | 日 O 1967 | , O 1968 | 这 O 1969 | 部 O 1970 | 重 O 1971 | 要 O 1972 | 法 O 1973 | 律 O 1974 | 即 O 1975 | 开 O 1976 | 始 O 1977 | 正 O 1978 | 式 O 1979 | 实 O 1980 | 施 O 1981 | 。 O 1982 | 1983 | 基 O 1984 | 本 O 1985 | 法 O 1986 | 不 O 1987 | 仅 O 1988 | 体 O 1989 | 现 O 1990 | 了 O 1991 | 香 B-LOC 1992 | 港 I-LOC 1993 | 同 O 1994 | 胞 O 1995 | 的 O 1996 | 意 O 1997 | 志 O 1998 | 和 O 1999 | 利 O 2000 | 益 O 2001 | , O 2002 | 也 O 2003 | 体 O 2004 | 现 O 2005 | 了 O 2006 | 全 O 2007 | 国 O 2008 | 人 O 2009 | 民 O 2010 | 的 O 2011 | 意 O 2012 | 志 O 2013 | 和 O 2014 | 利 O 2015 | 益 O 2016 | 。 O 2017 | 2018 | 正 O 2019 | 因 O 2020 | 为 O 2021 | 如 O 2022 | 此 O 2023 | , O 2024 | 江 B-PER 2025 | 泽 I-PER 2026 | 民 I-PER 2027 | 同 O 2028 | 志 O 2029 | 强 O 2030 | 调 O 2031 | : O 2032 | 香 B-LOC 2033 | 港 I-LOC 2034 | 基 O 2035 | 本 O 2036 | 法 O 2037 | 是 O 2038 | 一 O 2039 | 部 O 2040 | 全 O 2041 | 国 O 2042 | 性 O 2043 | 的 O 2044 | 法 O 2045 | 律 O 2046 | , O 2047 | 不 O 2048 | 仅 O 2049 | 香 B-LOC 2050 | 港 I-LOC 2051 | 要 O 2052 | 严 O 2053 | 格 O 2054 | 遵 O 2055 | 守 O 2056 | , O 2057 | 各 O 2058 | 省 O 2059 | 、 O 2060 | 自 O 2061 | 治 O 2062 | 区 O 2063 | 、 O 2064 | 直 O 2065 | 辖 O 2066 | 市 O 2067 | 都 O 2068 | 要 O 2069 | 严 O 2070 | 格 O 2071 | 遵 O 2072 | 守 O 2073 | 。 O 2074 | 2075 | 还 O 2076 | 表 O 2077 | 示 O 2078 | , O 2079 | 不 O 2080 | 仅 O 2081 | 我 O 2082 | 要 O 2083 | 遵 O 2084 | 守 O 2085 | , O 2086 | 我 O 2087 | 希 O 2088 | 望 O 2089 | 香 B-LOC 2090 | 港 I-LOC 2091 | 同 O 2092 | 胞 O 2093 | 和 O 2094 | 全 O 2095 | 国 O 2096 | 1 O 2097 | 2 O 2098 | 亿 O 2099 | 人 O 2100 | 民 O 2101 | 也 O 2102 | 要 O 2103 | 遵 O 2104 | 守 O 2105 | 。 O 2106 | 2107 | 学 O 2108 | 习 O 2109 | 、 O 2110 | 贯 O 2111 | 彻 O 2112 | 基 O 2113 | 本 O 2114 | 法 O 2115 | 的 O 2116 | 过 O 2117 | 程 O 2118 | , O 2119 | 无 O 2120 | 疑 O 2121 | 是 O 2122 | 增 O 2123 | 强 O 2124 | 法 O 2125 | 制 O 2126 | 观 O 2127 | 念 O 2128 | 、 O 2129 | 推 O 2130 | 进 O 2131 | 法 O 2132 | 制 O 2133 | 建 O 2134 | 设 O 2135 | 的 O 2136 | 过 O 2137 | 程 O 2138 | , O 2139 | 无 O 2140 | 疑 O 2141 | 是 O 2142 | 内 O 2143 | 地 O 2144 | 和 O 2145 | 香 B-LOC 2146 | 港 I-LOC 2147 | 在 O 2148 | 新 O 2149 | 的 
O 2150 | 征 O 2151 | 途 O 2152 | 上 O 2153 | 并 O 2154 | 肩 O 2155 | 同 O 2156 | 行 O 2157 | 、 O 2158 | 共 O 2159 | 创 O 2160 | 辉 O 2161 | 煌 O 2162 | 的 O 2163 | 过 O 2164 | 程 O 2165 | 。 O 2166 | 2167 | 法 O 2168 | 律 O 2169 | 一 O 2170 | 旦 O 2171 | 为 O 2172 | 人 O 2173 | 民 O 2174 | 群 O 2175 | 众 O 2176 | 所 O 2177 | 掌 O 2178 | 握 O 2179 | , O 2180 | 就 O 2181 | 会 O 2182 | 变 O 2183 | 成 O 2184 | 伟 O 2185 | 大 O 2186 | 的 O 2187 | 力 O 2188 | 量 O 2189 | 。 O 2190 | 2191 | 行 O 2192 | 文 O 2193 | 至 O 2194 | 此 O 2195 | , O 2196 | 我 O 2197 | 对 O 2198 | “ O 2199 | 基 O 2200 | 本 O 2201 | 法 O 2202 | 连 O 2203 | 着 O 2204 | 你 O 2205 | 我 O 2206 | 他 O 2207 | ” O 2208 | 有 O 2209 | 了 O 2210 | 更 O 2211 | 深 O 2212 | 刻 O 2213 | 、 O 2214 | 更 O 2215 | 真 O 2216 | 切 O 2217 | 的 O 2218 | 理 O 2219 | 解 O 2220 | 。 O 2221 | 2222 | 任 B-PER 2223 | 建 I-PER 2224 | 新 I-PER 2225 | 在 O 2226 | 向 O 2227 | 八 B-ORG 2228 | 届 I-ORG 2229 | 全 I-ORG 2230 | 国 I-ORG 2231 | 人 I-ORG 2232 | 大 I-ORG 2233 | 五 O 2234 | 次 O 2235 | 会 O 2236 | 议 O 2237 | 的 O 2238 | 报 O 2239 | 告 O 2240 | 中 O 2241 | 说 O 2242 | 坚 O 2243 | 持 O 2244 | 严 O 2245 | 肃 O 2246 | 执 O 2247 | 法 O 2248 | 提 O 2249 | 高 O 2250 | 司 O 2251 | 法 O 2252 | 水 O 2253 | 平 O 2254 | 2255 | 新 B-ORG 2256 | 华 I-ORG 2257 | 社 I-ORG 2258 | 北 B-LOC 2259 | 京 I-LOC 2260 | 3 O 2261 | 月 O 2262 | 1 O 2263 | 1 O 2264 | 日 O 2265 | 电 O 2266 | 最 B-ORG 2267 | 高 I-ORG 2268 | 人 I-ORG 2269 | 民 I-ORG 2270 | 法 I-ORG 2271 | 院 I-ORG 2272 | 院 O 2273 | 长 O 2274 | 任 B-PER 2275 | 建 I-PER 2276 | 新 I-PER 2277 | 今 O 2278 | 天 O 2279 | 在 O 2280 | 八 B-ORG 2281 | 届 I-ORG 2282 | 全 I-ORG 2283 | 国 I-ORG 2284 | 人 I-ORG 2285 | 大 I-ORG 2286 | 五 O 2287 | 次 O 2288 | 会 O 2289 | 议 O 2290 | 第 O 2291 | 五 O 2292 | 次 O 2293 | 全 O 2294 | 体 O 2295 | 会 O 2296 | 议 O 2297 | 作 O 2298 | 报 O 2299 | 告 O 2300 | 时 O 2301 | 说 O 2302 | , O 2303 | 严 O 2304 | 肃 O 2305 | 执 O 2306 | 法 O 2307 | 是 O 2308 | 社 O 2309 | 会 O 2310 | 主 O 2311 | 义 O 2312 | 法 O 2313 | 制 O 2314 | 建 O 2315 | 设 O 2316 | 的 O 2317 | 重 O 2318 | 要 O 2319 | 内 O 2320 | 容 O 2321 | , O 2322 | 是 O 2323 | 党 O 2324 | 和 O 2325 | 国 O 2326 | 家 O 2327 | 对 O 2328 | 司 O 2329 | 法 O 2330 | 活 O 2331 | 动 O 2332 | 的 O 2333 | 根 O 2334 | 本 O 2335 | 要 O 2336 | 求 O 2337 | 。 O 2338 | 2339 | 一 O 2340 | 年 O 2341 | 来 O 2342 | , O 2343 | 全 O 2344 | 国 O 2345 | 法 O 2346 | 院 O 2347 | 坚 O 2348 | 持 O 2349 | 严 O 2350 | 肃 O 2351 | 执 O 2352 | 法 O 2353 | 、 O 2354 | 努 O 2355 | 力 O 2356 | 提 O 2357 | 高 O 2358 | 司 O 2359 | 法 O 2360 | 水 O 2361 | 平 O 2362 | 。 O -------------------------------------------------------------------------------- /albert_ner.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | import collections 23 | import csv 24 | import os 25 | import modeling 26 | import optimization_finetuning as optimization 27 | import tokenization 28 | import tensorflow as tf 29 | import pickle 30 | import tf_metrics 31 | 32 | # from loss import bi_tempered_logistic_loss 33 | 34 | flags = tf.flags 35 | 36 | FLAGS = flags.FLAGS 37 | 38 | ## Required parameters 39 | flags.DEFINE_string( 40 | "data_dir", None, 41 | "The input data dir. Should contain the .tsv files (or other data files) " 42 | "for the task.") 43 | 44 | flags.DEFINE_string( 45 | "bert_config_file", None, 46 | "The config json file corresponding to the pre-trained BERT model. " 47 | "This specifies the model architecture.") 48 | 49 | flags.DEFINE_string("task_name", "NER", "The name of the task to train.") 50 | 51 | flags.DEFINE_string("vocab_file", None, 52 | "The vocabulary file that the BERT model was trained on.") 53 | 54 | flags.DEFINE_string( 55 | "output_dir", None, 56 | "The output directory where the model checkpoints will be written.") 57 | 58 | ## Other parameters 59 | 60 | flags.DEFINE_string( 61 | "init_checkpoint", "albert_base_zh/albert_model.ckpt", 62 | "Initial checkpoint (usually from a pre-trained BERT model).") 63 | 64 | flags.DEFINE_bool( 65 | "do_lower_case", True, 66 | "Whether to lower case the input text. Should be True for uncased " 67 | "models and False for cased models.") 68 | 69 | flags.DEFINE_integer( 70 | "max_seq_length", 128, 71 | "The maximum total input sequence length after WordPiece tokenization. " 72 | "Sequences longer than this will be truncated, and sequences shorter " 73 | "than this will be padded.") 74 | 75 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 76 | 77 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 78 | 79 | flags.DEFINE_bool( 80 | "do_predict", False, 81 | "Whether to run the model in inference mode on the test set.") 82 | 83 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 84 | 85 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 86 | 87 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 88 | 89 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 90 | 91 | flags.DEFINE_float("num_train_epochs", 3.0, 92 | "Total number of training epochs to perform.") 93 | 94 | flags.DEFINE_float( 95 | "warmup_proportion", 0.1, 96 | "Proportion of training to perform linear learning rate warmup for. " 97 | "E.g., 0.1 = 10% of training.") 98 | 99 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 100 | "How often to save the model checkpoint.") 101 | 102 | flags.DEFINE_integer("iterations_per_loop", 1000, 103 | "How many steps to make in each estimator call.") 104 | 105 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 106 | 107 | tf.flags.DEFINE_string( 108 | "tpu_name", None, 109 | "The Cloud TPU to use for training. This should be either the name " 110 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 111 | "url.") 112 | 113 | tf.flags.DEFINE_string( 114 | "tpu_zone", None, 115 | "[Optional] GCE zone where the Cloud TPU is located in. 
If not " 116 | "specified, we will attempt to automatically detect the GCE project from " 117 | "metadata.") 118 | 119 | tf.flags.DEFINE_string( 120 | "gcp_project", None, 121 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 122 | "specified, we will attempt to automatically detect the GCE project from " 123 | "metadata.") 124 | 125 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 126 | 127 | flags.DEFINE_integer( 128 | "num_tpu_cores", 8, 129 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 130 | 131 | 132 | class InputExample(object): 133 | """A single training/test example for simple sequence classification.""" 134 | 135 | def __init__(self, guid, text, label=None): 136 | """Constructs an InputExample. 137 | Args: 138 | guid: Unique id for the example. 139 | text: string. The untokenized text of the sequence. For this NER task it is 140 | the whitespace-separated character sequence of one sentence. 141 | label: (Optional) string. The whitespace-separated BIO labels aligned with 142 | `text`. 143 | This should be specified for train and dev examples, but not for test 144 | examples. 145 | """ 146 | self.guid = guid 147 | self.text = text 148 | self.label = label 149 | 150 | 151 | class PaddingInputExample(object): 152 | """Fake example so that the number of input examples is a multiple of the batch size. 153 | When running eval/predict on the TPU, we need to pad the number of examples 154 | to be a multiple of the batch size, because the TPU requires a fixed batch 155 | size. The alternative is to drop the last batch, which is bad because it means 156 | the entire output data won't be generated. 157 | We use this class instead of `None` because treating `None` as padding 158 | batches could cause silent errors. 
159 | """ 160 | 161 | 162 | class InputFeatures(object): 163 | """A single set of features of data.""" 164 | 165 | def __init__(self, 166 | input_ids, 167 | input_mask, 168 | segment_ids, 169 | label_ids): 170 | self.input_ids = input_ids 171 | self.input_mask = input_mask 172 | self.segment_ids = segment_ids 173 | self.label_ids = label_ids 174 | # self.is_real_example = is_real_example 175 | 176 | 177 | class DataProcessor(object): 178 | """Base class for data converters for sequence classification data sets.""" 179 | 180 | def get_train_examples(self, data_dir): 181 | """Gets a collection of `InputExample`s for the train set.""" 182 | raise NotImplementedError() 183 | 184 | def get_dev_examples(self, data_dir): 185 | """Gets a collection of `InputExample`s for the dev set.""" 186 | raise NotImplementedError() 187 | 188 | def get_test_examples(self, data_dir): 189 | """Gets a collection of `InputExample`s for prediction.""" 190 | raise NotImplementedError() 191 | 192 | def get_labels(self): 193 | """Gets the list of labels for this data set.""" 194 | raise NotImplementedError() 195 | 196 | @classmethod 197 | def _read_data(cls, input_file): 198 | """Reads a BIO data.""" 199 | with open(input_file, encoding='utf-8') as f: 200 | lines = [] 201 | words = [] 202 | labels = [] 203 | for line in f: 204 | contends = line.strip() 205 | word = line.strip().split(' ')[0] 206 | label = line.strip().split(' ')[-1] 207 | if contends.startswith("-DOCSTART-"): 208 | words.append('') 209 | continue 210 | # if len(contends) == 0 and words[-1] == '。': 211 | if len(contends) == 0: 212 | l = ' '.join([label for label in labels if len(label) > 0]) 213 | w = ' '.join([word for word in words if len(word) > 0]) 214 | lines.append([l, w]) 215 | words = [] 216 | labels = [] 217 | continue 218 | words.append(word) 219 | labels.append(label) 220 | return lines 221 | 222 | class NerProcessor(DataProcessor): 223 | def get_train_examples(self, data_dir): 224 | return self._create_example( 225 | self._read_data(os.path.join(data_dir, "train.txt")), "train" 226 | ) 227 | 228 | def get_dev_examples(self, data_dir): 229 | return self._create_example( 230 | self._read_data(os.path.join(data_dir, "dev.txt")), "dev" 231 | ) 232 | 233 | def get_test_examples(self,data_dir): 234 | return self._create_example( 235 | self._read_data(os.path.join(data_dir, "test.txt")), "test") 236 | 237 | 238 | def get_labels(self): 239 | # return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]","[SEP]"] 240 | return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X","[CLS]","[SEP]"] 241 | 242 | def _create_example(self, lines, set_type): 243 | examples = [] 244 | for (i, line) in enumerate(lines): 245 | guid = "%s-%s" % (set_type, i) 246 | text = tokenization.convert_to_unicode(line[1]) 247 | label = tokenization.convert_to_unicode(line[0]) 248 | examples.append(InputExample(guid=guid, text=text, label=label)) 249 | return examples 250 | 251 | 252 | def write_tokens(tokens,mode): 253 | if mode=="test": 254 | path = os.path.join(FLAGS.output_dir, "token_"+mode+".txt") 255 | wf = open(path,'a') 256 | for token in tokens: 257 | if token!="**NULL**": 258 | wf.write(token+'\n') 259 | wf.close() 260 | 261 | def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer,mode): 262 | textlist = example.text.split(' ') 263 | labellist = example.label.split(' ') 264 | tokens = [] 265 | labels = [] 266 | # print(textlist) 267 | for i, word in enumerate(textlist): 268 | token = tokenizer.tokenize(word) 
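# --- Illustrative sketch (plain Python, not part of the original file) of what
# DataProcessor._read_data above produces from character-per-line BIO files
# such as data/test.txt: every blank-line-separated block becomes one
# [label-string, word-string] pair that _create_example later wraps in an
# InputExample. The mini sample below is hypothetical but uses the same format.
sample = ["美 B-LOC", "国 I-LOC", "的 O", "", "看 O", ""]
lines_out, words, labels = [], [], []
for raw in sample:
    contends = raw.strip()
    if len(contends) == 0:                      # blank line closes the sentence
        lines_out.append([" ".join(labels), " ".join(words)])
        words, labels = [], []
        continue
    words.append(contends.split(" ")[0])        # first column: the character
    labels.append(contends.split(" ")[-1])      # last column: the BIO tag
print(lines_out)   # [['B-LOC I-LOC O', '美 国 的'], ['O', '看']]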
269 | # print(token) 270 | tokens.extend(token) 271 | label_1 = labellist[i] 272 | # print(label_1) 273 | for m in range(len(token)): 274 | if m == 0: 275 | labels.append(label_1) 276 | # else: 277 | # labels.append("X") 278 | # print(tokens, labels) 279 | # tokens = tokenizer.tokenize(example.text) 280 | if len(tokens) >= max_seq_length - 1: 281 | tokens = tokens[0:(max_seq_length - 2)] 282 | labels = labels[0:(max_seq_length - 2)] 283 | ntokens = [] 284 | segment_ids = [] 285 | label_ids = [] 286 | ntokens.append("[CLS]") 287 | segment_ids.append(0) 288 | # append("O") or append("[CLS]") not sure! 289 | label_ids.append(label_map["[CLS]"]) 290 | for i, token in enumerate(tokens): 291 | ntokens.append(token) 292 | segment_ids.append(0) 293 | label_ids.append(label_map[labels[i]]) 294 | ntokens.append("[SEP]") 295 | segment_ids.append(0) 296 | # append("O") or append("[SEP]") not sure! 297 | label_ids.append(label_map["[SEP]"]) 298 | input_ids = tokenizer.convert_tokens_to_ids(ntokens) 299 | input_mask = [1] * len(input_ids) 300 | #label_mask = [1] * len(input_ids) 301 | while len(input_ids) < max_seq_length: 302 | input_ids.append(0) 303 | input_mask.append(0) 304 | segment_ids.append(0) 305 | # we don't concerned about it! 306 | label_ids.append(0) 307 | ntokens.append("**NULL**") 308 | #label_mask.append(0) 309 | # print(len(input_ids)) 310 | assert len(input_ids) == max_seq_length 311 | assert len(input_mask) == max_seq_length 312 | assert len(segment_ids) == max_seq_length 313 | assert len(label_ids) == max_seq_length 314 | #assert len(label_mask) == max_seq_length 315 | 316 | if ex_index < 5: 317 | tf.logging.info("*** Example ***") 318 | tf.logging.info("guid: %s" % (example.guid)) 319 | tf.logging.info("tokens: %s" % " ".join( 320 | [tokenization.printable_text(x) for x in tokens])) 321 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 322 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 323 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 324 | tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) 325 | #tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) 326 | 327 | feature = InputFeatures( 328 | input_ids=input_ids, 329 | input_mask=input_mask, 330 | segment_ids=segment_ids, 331 | label_ids=label_ids, 332 | #label_mask = label_mask 333 | ) 334 | write_tokens(ntokens,mode) 335 | return feature 336 | 337 | 338 | def file_based_convert_examples_to_features( 339 | examples, label_list, max_seq_length, tokenizer, output_file, mode=None): 340 | """Convert a set of `InputExample`s to a TFRecord file.""" 341 | label_map = {} 342 | for (i, label) in enumerate(label_list,1): 343 | label_map[label] = i 344 | with open('albert_base_ner_checkpoints/label2id.pkl','wb') as w: 345 | pickle.dump(label_map,w) 346 | 347 | writer = tf.python_io.TFRecordWriter(output_file) 348 | 349 | for (ex_index, example) in enumerate(examples): 350 | if ex_index % 10000 == 0: 351 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 352 | 353 | feature = convert_single_example(ex_index, example, label_map, 354 | max_seq_length, tokenizer, mode) 355 | 356 | def create_int_feature(values): 357 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 358 | return f 359 | 360 | features = collections.OrderedDict() 361 | features["input_ids"] = create_int_feature(feature.input_ids) 362 | features["input_mask"] = create_int_feature(feature.input_mask) 363 | 
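# --- Illustrative sketch (plain Python, not part of the original file) of the
# label_map built above with enumerate(label_list, 1): the ten NER labels get
# ids 1..10, and id 0 stays reserved for the padded positions that
# convert_single_example fills with label_ids.append(0). This appears to be why
# the classification head in create_model below works with 11 classes.
label_list_ex = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
                 "X", "[CLS]", "[SEP]"]
label_map_ex = {label: i for i, label in enumerate(label_list_ex, 1)}
print(label_map_ex["O"], label_map_ex["[SEP]"], len(label_map_ex) + 1)   # 1 10 11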
features["segment_ids"] = create_int_feature(feature.segment_ids) 364 | features["label_ids"] = create_int_feature(feature.label_ids) 365 | # features["is_real_example"] = create_int_feature( 366 | # [int(feature.is_real_example)]) 367 | 368 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 369 | writer.write(tf_example.SerializeToString()) 370 | writer.close() 371 | 372 | 373 | def file_based_input_fn_builder(input_file, seq_length, is_training, 374 | drop_remainder): 375 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 376 | 377 | name_to_features = { 378 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 379 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 380 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 381 | "label_ids": tf.FixedLenFeature([seq_length], tf.int64), 382 | # "is_real_example": tf.FixedLenFeature([], tf.int64), 383 | } 384 | 385 | def _decode_record(record, name_to_features): 386 | """Decodes a record to a TensorFlow example.""" 387 | example = tf.parse_single_example(record, name_to_features) 388 | 389 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 390 | # So cast all int64 to int32. 391 | for name in list(example.keys()): 392 | t = example[name] 393 | if t.dtype == tf.int64: 394 | t = tf.to_int32(t) 395 | example[name] = t 396 | 397 | return example 398 | 399 | def input_fn(params): 400 | """The actual input function.""" 401 | batch_size = params["batch_size"] 402 | 403 | # For training, we want a lot of parallel reading and shuffling. 404 | # For eval, we want no shuffling and parallel reading doesn't matter. 405 | d = tf.data.TFRecordDataset(input_file) 406 | if is_training: 407 | d = d.repeat() 408 | d = d.shuffle(buffer_size=100) 409 | 410 | d = d.apply( 411 | tf.contrib.data.map_and_batch( 412 | lambda record: _decode_record(record, name_to_features), 413 | batch_size=batch_size, 414 | drop_remainder=drop_remainder)) 415 | 416 | return d 417 | 418 | return input_fn 419 | 420 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 421 | labels, num_labels, use_one_hot_embeddings): 422 | """Creates a classification model.""" 423 | model = modeling.BertModel( 424 | config=bert_config, 425 | is_training=is_training, 426 | input_ids=input_ids, 427 | input_mask=input_mask, 428 | token_type_ids=segment_ids, 429 | use_one_hot_embeddings=use_one_hot_embeddings) 430 | 431 | output_layer = model.get_sequence_output() 432 | 433 | hidden_size = output_layer.shape[-1].value 434 | 435 | output_weight = tf.get_variable( 436 | "output_weights", [num_labels, hidden_size], 437 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 438 | 439 | output_bias = tf.get_variable( 440 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 441 | 442 | with tf.variable_scope("loss"): 443 | if is_training: 444 | # I.e., 0.1 dropout 445 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 446 | output_layer = tf.reshape(output_layer, [-1, hidden_size]) 447 | logits = tf.matmul(output_layer, output_weight, transpose_b=True) 448 | logits = tf.nn.bias_add(logits, output_bias) 449 | logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, 11]) 450 | 451 | log_probs = tf.nn.log_softmax(logits, axis=-1) 452 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 453 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 454 | loss = tf.reduce_sum(per_example_loss) 455 | probabilities = tf.nn.softmax(logits, 
axis=-1) 456 | predict = tf.argmax(probabilities,axis=-1) 457 | return (loss, per_example_loss, logits,predict) 458 | 459 | def layer_norm(input_tensor, name=None): 460 | """Run layer normalization on the last dimension of the tensor.""" 461 | return tf.contrib.layers.layer_norm( 462 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 463 | 464 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 465 | num_train_steps, num_warmup_steps, use_tpu, 466 | use_one_hot_embeddings): 467 | """Returns `model_fn` closure for TPUEstimator.""" 468 | 469 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 470 | """The `model_fn` for TPUEstimator.""" 471 | 472 | tf.logging.info("*** Features ***") 473 | for name in sorted(features.keys()): 474 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 475 | 476 | input_ids = features["input_ids"] 477 | input_mask = features["input_mask"] 478 | segment_ids = features["segment_ids"] 479 | label_ids = features["label_ids"] 480 | # is_real_example = None 481 | # if "is_real_example" in features: 482 | # is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) 483 | # else: 484 | # is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) 485 | 486 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 487 | 488 | (total_loss, per_example_loss, logits, predicts) = create_model( 489 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 490 | num_labels, use_one_hot_embeddings) 491 | 492 | tvars = tf.trainable_variables() 493 | initialized_variable_names = {} 494 | scaffold_fn = None 495 | if init_checkpoint: 496 | (assignment_map, initialized_variable_names 497 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 498 | if use_tpu: 499 | 500 | def tpu_scaffold(): 501 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 502 | return tf.train.Scaffold() 503 | 504 | scaffold_fn = tpu_scaffold 505 | else: 506 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 507 | 508 | tf.logging.info("**** Trainable Variables ****") 509 | for var in tvars: 510 | init_string = "" 511 | if var.name in initialized_variable_names: 512 | init_string = ", *INIT_FROM_CKPT*" 513 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 514 | init_string) 515 | 516 | output_spec = None 517 | if mode == tf.estimator.ModeKeys.TRAIN: 518 | 519 | train_op = optimization.create_optimizer( 520 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 521 | 522 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 523 | mode=mode, 524 | loss=total_loss, 525 | train_op=train_op, 526 | scaffold_fn=scaffold_fn) 527 | elif mode == tf.estimator.ModeKeys.EVAL: 528 | 529 | def metric_fn(per_example_loss, label_ids, logits): 530 | # def metric_fn(label_ids, logits): 531 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 532 | precision = tf_metrics.precision(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 533 | recall = tf_metrics.recall(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 534 | f = tf_metrics.f1(label_ids,predictions,11,[2,3,4,5,6,7],average="macro") 535 | # 536 | return { 537 | "eval_precision":precision, 538 | "eval_recall":recall, 539 | "eval_f": f, 540 | #"eval_loss": loss, 541 | } 542 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 543 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 544 | mode=mode, 545 | loss=total_loss, 546 | 
eval_metrics=eval_metrics, 547 | scaffold_fn=scaffold_fn) 548 | else: 549 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 550 | mode=mode, 551 | predictions=predicts, 552 | scaffold_fn=scaffold_fn) 553 | return output_spec 554 | 555 | return model_fn 556 | 557 | 558 | # This function is not used by this file but is still used by the Colab and 559 | # people who depend on it. 560 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 561 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 562 | 563 | all_input_ids = [] 564 | all_input_mask = [] 565 | all_segment_ids = [] 566 | all_label_ids = [] 567 | 568 | for feature in features: 569 | all_input_ids.append(feature.input_ids) 570 | all_input_mask.append(feature.input_mask) 571 | all_segment_ids.append(feature.segment_ids) 572 | all_label_ids.append(feature.label_ids) 573 | 574 | def input_fn(params): 575 | """The actual input function.""" 576 | batch_size = params["batch_size"] 577 | 578 | num_examples = len(features) 579 | 580 | # This is for demo purposes and does NOT scale to large data sets. We do 581 | # not use Dataset.from_generator() because that uses tf.py_func which is 582 | # not TPU compatible. The right way to load data is with TFRecordReader. 583 | d = tf.data.Dataset.from_tensor_slices({ 584 | "input_ids": 585 | tf.constant( 586 | all_input_ids, shape=[num_examples, seq_length], 587 | dtype=tf.int32), 588 | "input_mask": 589 | tf.constant( 590 | all_input_mask, 591 | shape=[num_examples, seq_length], 592 | dtype=tf.int32), 593 | "segment_ids": 594 | tf.constant( 595 | all_segment_ids, 596 | shape=[num_examples, seq_length], 597 | dtype=tf.int32), 598 | "label_ids": # NER label_ids are per-token sequences, one id per position 599 | tf.constant(all_label_ids, shape=[num_examples, seq_length], dtype=tf.int32), 600 | }) 601 | 602 | if is_training: 603 | d = d.repeat() 604 | d = d.shuffle(buffer_size=100) 605 | 606 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 607 | return d 608 | 609 | return input_fn 610 | 611 | # This function is not used by this file but is still used by the Colab and 612 | # people who depend on it.
613 | def convert_examples_to_features(examples, label_list, max_seq_length, 614 | tokenizer): 615 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 616 | label_map = {label: i for (i, label) in enumerate(label_list, 1)} # same 1-based mapping as file_based_convert_examples_to_features 617 | features = [] 618 | for (ex_index, example) in enumerate(examples): 619 | if ex_index % 10000 == 0: 620 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 621 | 622 | feature = convert_single_example(ex_index, example, label_map, 623 | max_seq_length, tokenizer, None) 624 | 625 | features.append(feature) 626 | return features 627 | 628 | 629 | def main(_): 630 | tf.logging.set_verbosity(tf.logging.INFO) 631 | 632 | processors = { 633 | # TODO change processors 634 | "ner": NerProcessor 635 | } 636 | 637 | tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, 638 | FLAGS.init_checkpoint) 639 | 640 | if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: 641 | raise ValueError( 642 | "At least one of `do_train`, `do_eval` or `do_predict` must be True.") 643 | 644 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 645 | 646 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 647 | raise ValueError( 648 | "Cannot use sequence length %d because the BERT model " 649 | "was only trained up to sequence length %d" % 650 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 651 | 652 | tf.gfile.MakeDirs(FLAGS.output_dir) 653 | 654 | task_name = FLAGS.task_name.lower() 655 | 656 | if task_name not in processors: 657 | raise ValueError("Task not found: %s" % (task_name)) 658 | 659 | processor = processors[task_name]() 660 | 661 | label_list = processor.get_labels() 662 | 663 | tokenizer = tokenization.FullTokenizer( 664 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 665 | 666 | tpu_cluster_resolver = None 667 | if FLAGS.use_tpu and FLAGS.tpu_name: 668 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 669 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 670 | 671 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 672 | # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. 673 | print("###tpu_cluster_resolver:", tpu_cluster_resolver) 674 | run_config = tf.contrib.tpu.RunConfig( 675 | cluster=tpu_cluster_resolver, 676 | master=FLAGS.master, 677 | model_dir=FLAGS.output_dir, 678 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 679 | tpu_config=tf.contrib.tpu.TPUConfig( 680 | iterations_per_loop=FLAGS.iterations_per_loop, 681 | num_shards=FLAGS.num_tpu_cores, 682 | per_host_input_for_training=is_per_host)) 683 | 684 | train_examples = None 685 | num_train_steps = None 686 | num_warmup_steps = None 687 | if FLAGS.do_train: 688 | train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO 689 | print("###length of total train_examples:", len(train_examples)) 690 | num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 691 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 692 | 693 | model_fn = model_fn_builder( 694 | bert_config=bert_config, 695 | num_labels=len(label_list) + 1, 696 | init_checkpoint=FLAGS.init_checkpoint, 697 | learning_rate=FLAGS.learning_rate, 698 | num_train_steps=num_train_steps, 699 | num_warmup_steps=num_warmup_steps, 700 | use_tpu=FLAGS.use_tpu, 701 | use_one_hot_embeddings=FLAGS.use_tpu) 702 | 703 | # If TPU is not available, this will fall back to normal Estimator on CPU 704 | # or GPU.
705 | estimator = tf.contrib.tpu.TPUEstimator( 706 | use_tpu=FLAGS.use_tpu, 707 | model_fn=model_fn, 708 | config=run_config, 709 | train_batch_size=FLAGS.train_batch_size, 710 | eval_batch_size=FLAGS.eval_batch_size, 711 | predict_batch_size=FLAGS.predict_batch_size) 712 | 713 | if FLAGS.do_train: 714 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 715 | train_file_exists=os.path.exists(train_file) 716 | print("###train_file_exists:", train_file_exists," ;train_file:",train_file) 717 | if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO 718 | file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 719 | tf.logging.info("***** Running training *****") 720 | tf.logging.info(" Num examples = %d", len(train_examples)) 721 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 722 | tf.logging.info(" Num steps = %d", num_train_steps) 723 | train_input_fn = file_based_input_fn_builder( 724 | input_file=train_file, 725 | seq_length=FLAGS.max_seq_length, 726 | is_training=True, 727 | drop_remainder=True) 728 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 729 | 730 | if FLAGS.do_eval: 731 | eval_examples = processor.get_dev_examples(FLAGS.data_dir) 732 | num_actual_eval_examples = len(eval_examples) 733 | if FLAGS.use_tpu: 734 | # TPU requires a fixed batch size for all batches, therefore the number 735 | # of examples must be a multiple of the batch size, or else examples 736 | # will get dropped. So we pad with fake examples which are ignored 737 | # later on. These do NOT count towards the metric (all tf.metrics 738 | # support a per-instance weight, and these get a weight of 0.0). 739 | while len(eval_examples) % FLAGS.eval_batch_size != 0: 740 | eval_examples.append(PaddingInputExample()) 741 | 742 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 743 | file_based_convert_examples_to_features( 744 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 745 | 746 | tf.logging.info("***** Running evaluation *****") 747 | tf.logging.info(" Num examples = %d (%d actual, %d padding)", 748 | len(eval_examples), num_actual_eval_examples, 749 | len(eval_examples) - num_actual_eval_examples) 750 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 751 | 752 | # This tells the estimator to run through the entire set. 753 | eval_steps = None 754 | # However, if running eval on the TPU, you will need to specify the 755 | # number of steps. 
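    # For example, with 1,000 eval examples after padding and eval_batch_size=8,
    # eval_steps works out to 1000 // 8 = 125.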
756 | if FLAGS.use_tpu: 757 | assert len(eval_examples) % FLAGS.eval_batch_size == 0 758 | eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) 759 | 760 | eval_drop_remainder = True if FLAGS.use_tpu else False 761 | eval_input_fn = file_based_input_fn_builder( 762 | input_file=eval_file, 763 | seq_length=FLAGS.max_seq_length, 764 | is_training=False, 765 | drop_remainder=eval_drop_remainder) 766 | 767 | ####################################################################################################################### 768 | # evaluate all checkpoints; you can use the checkpoint with the best dev accuarcy 769 | steps_and_files = [] 770 | filenames = tf.gfile.ListDirectory(FLAGS.output_dir) 771 | for filename in filenames: 772 | if filename.endswith(".index"): 773 | ckpt_name = filename[:-6] 774 | cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) 775 | global_step = int(cur_filename.split("-")[-1]) 776 | tf.logging.info("Add {} to eval list.".format(cur_filename)) 777 | steps_and_files.append([global_step, cur_filename]) 778 | steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) 779 | 780 | output_eval_file = os.path.join(FLAGS.data_dir, "eval_results_albert_zh.txt") 781 | print("output_eval_file:",output_eval_file) 782 | tf.logging.info("output_eval_file:"+output_eval_file) 783 | with tf.gfile.GFile(output_eval_file, "w") as writer: 784 | for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): 785 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename) 786 | 787 | tf.logging.info("***** Eval results %s *****" % (filename)) 788 | writer.write("***** Eval results %s *****\n" % (filename)) 789 | for key in sorted(result.keys()): 790 | tf.logging.info(" %s = %s", key, str(result[key])) 791 | writer.write("%s = %s\n" % (key, str(result[key]))) 792 | ####################################################################################################################### 793 | 794 | #result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 795 | # 796 | #output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 797 | #with tf.gfile.GFile(output_eval_file, "w") as writer: 798 | # tf.logging.info("***** Eval results *****") 799 | # for key in sorted(result.keys()): 800 | # tf.logging.info(" %s = %s", key, str(result[key])) 801 | # writer.write("%s = %s\n" % (key, str(result[key]))) 802 | 803 | if FLAGS.do_predict: 804 | token_path = os.path.join(FLAGS.output_dir, "token_test.txt") 805 | with open('albert_base_ner_checkpoints/label2id.pkl','rb') as rf: 806 | label2id = pickle.load(rf) 807 | id2label = {value:key for key,value in label2id.items()} 808 | if os.path.exists(token_path): 809 | os.remove(token_path) 810 | predict_examples = processor.get_test_examples(FLAGS.data_dir) 811 | 812 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 813 | file_based_convert_examples_to_features(predict_examples, label_list, 814 | FLAGS.max_seq_length, tokenizer, 815 | predict_file,mode="test") 816 | 817 | tf.logging.info("***** Running prediction*****") 818 | tf.logging.info(" Num examples = %d", len(predict_examples)) 819 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 820 | if FLAGS.use_tpu: 821 | # Warning: According to tpu_estimator.py Prediction on TPU is an 822 | # experimental feature and hence not supported here 823 | raise ValueError("Prediction in TPU not supported") 824 | predict_drop_remainder = True if FLAGS.use_tpu else False 825 | predict_input_fn = 
file_based_input_fn_builder( 826 | input_file=predict_file, 827 | seq_length=FLAGS.max_seq_length, 828 | is_training=False, 829 | drop_remainder=predict_drop_remainder) 830 | 831 | result = estimator.predict(input_fn=predict_input_fn) 832 | output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") 833 | with open(output_predict_file,'w') as writer: 834 | for prediction in result: 835 | output_line = "\n".join(id2label[id] for id in prediction if id!=0) + "\n" 836 | writer.write(output_line) 837 | 838 | 839 | if __name__ == "__main__": 840 | flags.mark_flag_as_required("data_dir") 841 | flags.mark_flag_as_required("task_name") 842 | flags.mark_flag_as_required("vocab_file") 843 | flags.mark_flag_as_required("bert_config_file") 844 | flags.mark_flag_as_required("output_dir") 845 | tf.app.run() 846 | -------------------------------------------------------------------------------- /create_pretraining_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Create masked LM/next sentence masked_lm TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import random 23 | import tokenization 24 | import tensorflow as tf 25 | import jieba 26 | import re 27 | flags = tf.flags 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | flags.DEFINE_string("input_file", None, 32 | "Input raw text file (or comma-separated list of files).") 33 | 34 | flags.DEFINE_string( 35 | "output_file", None, 36 | "Output TF example file (or comma-separated list of files).") 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_bool( 42 | "do_lower_case", True, 43 | "Whether to lower case the input text. 
Should be True for uncased " 44 | "models and False for cased models.") 45 | 46 | flags.DEFINE_bool( 47 | "do_whole_word_mask", False, 48 | "Whether to use whole word masking rather than per-WordPiece masking.") 49 | 50 | flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") 51 | 52 | flags.DEFINE_integer("max_predictions_per_seq", 20, 53 | "Maximum number of masked LM predictions per sequence.") 54 | 55 | flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") 56 | 57 | flags.DEFINE_integer( 58 | "dupe_factor", 10, 59 | "Number of times to duplicate the input data (with different masks).") 60 | 61 | flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") 62 | 63 | flags.DEFINE_float( 64 | "short_seq_prob", 0.1, 65 | "Probability of creating sequences which are shorter than the " 66 | "maximum length.") 67 | 68 | flags.DEFINE_bool("non_chinese", False,"manually set this to True if you are not doing chinese pre-train task.") 69 | 70 | 71 | class TrainingInstance(object): 72 | """A single training instance (sentence pair).""" 73 | 74 | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, 75 | is_random_next): 76 | self.tokens = tokens 77 | self.segment_ids = segment_ids 78 | self.is_random_next = is_random_next 79 | self.masked_lm_positions = masked_lm_positions 80 | self.masked_lm_labels = masked_lm_labels 81 | 82 | def __str__(self): 83 | s = "" 84 | s += "tokens: %s\n" % (" ".join( 85 | [tokenization.printable_text(x) for x in self.tokens])) 86 | s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) 87 | s += "is_random_next: %s\n" % self.is_random_next 88 | s += "masked_lm_positions: %s\n" % (" ".join( 89 | [str(x) for x in self.masked_lm_positions])) 90 | s += "masked_lm_labels: %s\n" % (" ".join( 91 | [tokenization.printable_text(x) for x in self.masked_lm_labels])) 92 | s += "\n" 93 | return s 94 | 95 | def __repr__(self): 96 | return self.__str__() 97 | 98 | 99 | def write_instance_to_example_files(instances, tokenizer, max_seq_length, 100 | max_predictions_per_seq, output_files): 101 | """Create TF example files from `TrainingInstance`s.""" 102 | writers = [] 103 | for output_file in output_files: 104 | writers.append(tf.python_io.TFRecordWriter(output_file)) 105 | 106 | writer_index = 0 107 | 108 | total_written = 0 109 | for (inst_index, instance) in enumerate(instances): 110 | input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) 111 | input_mask = [1] * len(input_ids) 112 | segment_ids = list(instance.segment_ids) 113 | assert len(input_ids) <= max_seq_length 114 | 115 | while len(input_ids) < max_seq_length: 116 | input_ids.append(0) 117 | input_mask.append(0) 118 | segment_ids.append(0) 119 | 120 | assert len(input_ids) == max_seq_length 121 | assert len(input_mask) == max_seq_length 122 | assert len(segment_ids) == max_seq_length 123 | 124 | masked_lm_positions = list(instance.masked_lm_positions) 125 | masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) 126 | masked_lm_weights = [1.0] * len(masked_lm_ids) 127 | 128 | while len(masked_lm_positions) < max_predictions_per_seq: 129 | masked_lm_positions.append(0) 130 | masked_lm_ids.append(0) 131 | masked_lm_weights.append(0.0) 132 | 133 | next_sentence_label = 1 if instance.is_random_next else 0 134 | 135 | features = collections.OrderedDict() 136 | features["input_ids"] = create_int_feature(input_ids) 137 | features["input_mask"] = create_int_feature(input_mask) 138 | features["segment_ids"] = 
create_int_feature(segment_ids) 139 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 140 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 141 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 142 | features["next_sentence_labels"] = create_int_feature([next_sentence_label]) 143 | 144 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 145 | 146 | writers[writer_index].write(tf_example.SerializeToString()) 147 | writer_index = (writer_index + 1) % len(writers) 148 | 149 | total_written += 1 150 | 151 | if inst_index < 20: 152 | tf.logging.info("*** Example ***") 153 | tf.logging.info("tokens: %s" % " ".join( 154 | [tokenization.printable_text(x) for x in instance.tokens])) 155 | 156 | for feature_name in features.keys(): 157 | feature = features[feature_name] 158 | values = [] 159 | if feature.int64_list.value: 160 | values = feature.int64_list.value 161 | elif feature.float_list.value: 162 | values = feature.float_list.value 163 | tf.logging.info( 164 | "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) 165 | 166 | for writer in writers: 167 | writer.close() 168 | 169 | tf.logging.info("Wrote %d total instances", total_written) 170 | 171 | 172 | def create_int_feature(values): 173 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 174 | return feature 175 | 176 | 177 | def create_float_feature(values): 178 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) 179 | return feature 180 | 181 | 182 | def create_training_instances(input_files, tokenizer, max_seq_length, 183 | dupe_factor, short_seq_prob, masked_lm_prob, 184 | max_predictions_per_seq, rng): 185 | """Create `TrainingInstance`s from raw text.""" 186 | all_documents = [[]] 187 | 188 | # Input file format: 189 | # (1) One sentence per line. These should ideally be actual sentences, not 190 | # entire paragraphs or arbitrary spans of text. (Because we use the 191 | # sentence boundaries for the "next sentence prediction" task). 192 | # (2) Blank lines between documents. Document boundaries are needed so 193 | # that the "next sentence prediction" task doesn't span between documents. 
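  # An input file that satisfies (1) and (2) might look like this (illustrative
  # example only, not taken from the repository's data/ directory):
  #
  #     今天天气很好。
  #     我们一起去公园散步。
  #
  #     白云机场位于广州。
  #
  # i.e. one sentence per line, with a single blank line marking a document boundary.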
194 | for input_file in input_files: 195 | with tf.gfile.GFile(input_file, "r") as reader: 196 | while True: 197 | strings=reader.readline() 198 | strings=strings.replace(" "," ").replace(" "," ") # 如果有两个或三个空格,替换为一个空格 199 | line = tokenization.convert_to_unicode(strings) 200 | if not line: 201 | break 202 | line = line.strip() 203 | 204 | # Empty lines are used as document delimiters 205 | if not line: 206 | all_documents.append([]) 207 | tokens = tokenizer.tokenize(line) 208 | if tokens: 209 | all_documents[-1].append(tokens) 210 | 211 | # Remove empty documents 212 | all_documents = [x for x in all_documents if x] 213 | rng.shuffle(all_documents) 214 | 215 | vocab_words = list(tokenizer.vocab.keys()) 216 | instances = [] 217 | for _ in range(dupe_factor): 218 | for document_index in range(len(all_documents)): 219 | instances.extend( 220 | create_instances_from_document_albert( # change to albert style for sentence order prediction(SOP), 2019-08-28, brightmart 221 | all_documents, document_index, max_seq_length, short_seq_prob, 222 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) 223 | 224 | rng.shuffle(instances) 225 | return instances 226 | 227 | def get_new_segment(segment): # 新增的方法 #### 228 | """ 229 | 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 230 | :param segment: 一句话. e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!'] 231 | :return: 一句处理过的话 e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!'] 232 | """ 233 | seq_cws = jieba.lcut("".join(segment)) # 分词 234 | seq_cws_dict = {x: 1 for x in seq_cws} # 分词后的词加入到词典dict 235 | new_segment = [] 236 | i = 0 237 | while i < len(segment): # 从句子的第一个字开始处理,知道处理完整个句子 238 | if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0: # 如果找不到中文的,原文加进去即不用特殊处理。 239 | new_segment.append(segment[i]) 240 | i += 1 241 | continue 242 | 243 | has_add = False 244 | for length in range(3, 0, -1): 245 | if i + length > len(segment): 246 | continue 247 | if ''.join(segment[i:i + length]) in seq_cws_dict: 248 | new_segment.append(segment[i]) 249 | for l in range(1, length): 250 | new_segment.append('##' + segment[i + l]) 251 | i += length 252 | has_add = True 253 | break 254 | if not has_add: 255 | new_segment.append(segment[i]) 256 | i += 1 257 | # print("get_new_segment.wwm.get_new_segment:",new_segment) 258 | return new_segment 259 | 260 | def create_instances_from_document_albert( 261 | all_documents, document_index, max_seq_length, short_seq_prob, 262 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 263 | """Creates `TrainingInstance`s for a single document. 264 | This method is changed to create sentence-order prediction (SOP) followed by idea from paper of ALBERT, 2019-08-28, brightmart 265 | """ 266 | document = all_documents[document_index] # 得到一个文档 267 | 268 | # Account for [CLS], [SEP], [SEP] 269 | max_num_tokens = max_seq_length - 3 270 | 271 | # We *usually* want to fill up the entire sequence since we are padding 272 | # to `max_seq_length` anyways, so short sequences are generally wasted 273 | # computation. However, we *sometimes* 274 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 275 | # sequences to minimize the mismatch between pre-training and fine-tuning. 276 | # The `target_seq_length` is just a rough target however, whereas 277 | # `max_seq_length` is a hard limit. 
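  # For example, with max_seq_length=128, max_num_tokens is 125; roughly
  # short_seq_prob (10% by default) of the time a random target length in
  # [2, 125] is drawn instead.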
278 | target_seq_length = max_num_tokens 279 | if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 280 | target_seq_length = rng.randint(2, max_num_tokens) 281 | 282 | # We DON'T just concatenate all of the tokens from a document into a long 283 | # sequence and choose an arbitrary split point because this would make the 284 | # next sentence prediction task too easy. Instead, we split the input into 285 | # segments "A" and "B" based on the actual "sentences" provided by the user 286 | # input. 287 | # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 288 | instances = [] 289 | current_chunk = [] # 当前处理的文本段,包含多个句子 290 | current_length = 0 291 | i = 0 292 | # print("###document:",document) # 一个document可以是一整篇文章、新闻、词条等. document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', 
'世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', '关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] 293 | while i < len(document): # 从文档的第一个位置开始,按个往下看 294 | segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] 295 | if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then do something to make chinese whole word mask works. 296 | segment = get_new_segment(segment) # whole word mask for chinese: 结合分词的中文的whole mask设置即在需要的地方加上“##” 297 | 298 | current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 299 | current_length += len(segment) # 累计到为止位置接触到句子的总长度 300 | if i == len(document) - 1 or current_length >= target_seq_length: 301 | # 如果累计的序列长度达到了目标的长度,或当前走到了文档结尾==>构造并添加到“A[SEP]B“中的A和B中; 302 | if current_chunk: # 如果当前块不为空 303 | # `a_end` is how many segments from `current_chunk` go into the `A` 304 | # (first) sentence. 
305 | a_end = 1 306 | if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,取当前块的一部分作为“A[SEP]B“中的A部分 307 | a_end = rng.randint(1, len(current_chunk) - 1) 308 | # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a 309 | tokens_a = [] 310 | for j in range(a_end): 311 | tokens_a.extend(current_chunk[j]) 312 | 313 | # 构造“A[SEP]B“中的B部分(有一部分是正常的当前文档中的后半部;在原BERT的实现中一部分是随机的从另一个文档中选取的,) 314 | tokens_b = [] 315 | for j in range(a_end, len(current_chunk)): 316 | tokens_b.extend(current_chunk[j]) 317 | 318 | # 有百分之50%的概率交换一下tokens_a和tokens_b的位置 319 | # print("tokens_a length1:",len(tokens_a)) 320 | # print("tokens_b length1:",len(tokens_b)) # len(tokens_b) = 0 321 | 322 | if len(tokens_a)==0 or len(tokens_b)==0: continue 323 | if rng.random() < 0.5: # 交换一下tokens_a和tokens_b 324 | is_random_next=True 325 | temp=tokens_a 326 | tokens_a=tokens_b 327 | tokens_b=temp 328 | else: 329 | is_random_next=False 330 | 331 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 332 | 333 | assert len(tokens_a) >= 1 334 | assert len(tokens_b) >= 1 335 | 336 | # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 337 | tokens = [] 338 | segment_ids = [] 339 | tokens.append("[CLS]") 340 | segment_ids.append(0) 341 | for token in tokens_a: 342 | tokens.append(token) 343 | segment_ids.append(0) 344 | 345 | tokens.append("[SEP]") 346 | segment_ids.append(0) 347 | 348 | for token in tokens_b: 349 | tokens.append(token) 350 | segment_ids.append(1) 351 | tokens.append("[SEP]") 352 | segment_ids.append(1) 353 | 354 | # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective 355 | (tokens, masked_lm_positions, 356 | masked_lm_labels) = create_masked_lm_predictions( 357 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 358 | instance = TrainingInstance( # 创建训练实例的对象 359 | tokens=tokens, 360 | segment_ids=segment_ids, 361 | is_random_next=is_random_next, 362 | masked_lm_positions=masked_lm_positions, 363 | masked_lm_labels=masked_lm_labels) 364 | instances.append(instance) 365 | current_chunk = [] # 清空当前块 366 | current_length = 0 # 重置当前文本块的长度 367 | i += 1 # 接着文档中的内容往后看 368 | 369 | return instances 370 | 371 | 372 | def create_instances_from_document_original( # THIS IS ORIGINAL BERT STYLE FOR CREATE DATA OF MLM AND NEXT SENTENCE PREDICTION TASK 373 | all_documents, document_index, max_seq_length, short_seq_prob, 374 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 375 | """Creates `TrainingInstance`s for a single document.""" 376 | document = all_documents[document_index] # 得到一个文档 377 | 378 | # Account for [CLS], [SEP], [SEP] 379 | max_num_tokens = max_seq_length - 3 380 | 381 | # We *usually* want to fill up the entire sequence since we are padding 382 | # to `max_seq_length` anyways, so short sequences are generally wasted 383 | # computation. However, we *sometimes* 384 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 385 | # sequences to minimize the mismatch between pre-training and fine-tuning. 386 | # The `target_seq_length` is just a rough target however, whereas 387 | # `max_seq_length` is a hard limit. 
388 | target_seq_length = max_num_tokens 389 | if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 390 | target_seq_length = rng.randint(2, max_num_tokens) 391 | 392 | # We DON'T just concatenate all of the tokens from a document into a long 393 | # sequence and choose an arbitrary split point because this would make the 394 | # next sentence prediction task too easy. Instead, we split the input into 395 | # segments "A" and "B" based on the actual "sentences" provided by the user 396 | # input. 397 | # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 398 | instances = [] 399 | current_chunk = [] # 当前处理的文本段,包含多个句子 400 | current_length = 0 401 | i = 0 402 | # print("###document:",document) # 一个document可以是一整篇文章、新闻、一个词条等. document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', 
'全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', '关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] 403 | while i < len(document): # 从文档的第一个位置开始,按个往下看 404 | segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] 405 | # print("###i:",i,";segment:",segment) 406 | current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 407 | current_length += len(segment) # 累计到为止位置接触到句子的总长度 408 | if i == len(document) - 1 or current_length >= target_seq_length: # 如果累计的序列长度达到了目标的长度==>构造并添加到“A[SEP]B“中的A和B中。 409 | if current_chunk: # 如果当前块不为空 410 | # `a_end` is how many segments from `current_chunk` go into the `A` 411 | # (first) sentence. 412 | a_end = 1 413 | if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,怎取当前块的一部分作为“A[SEP]B“中的A部分 414 | a_end = rng.randint(1, len(current_chunk) - 1) 415 | # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a 416 | tokens_a = [] 417 | for j in range(a_end): 418 | tokens_a.extend(current_chunk[j]) 419 | 420 | # 构造“A[SEP]B“中的B部分(原本的B有一部分是随机的从另一个文档中选取的,有一部分是正常的当前文档中的后半部) 421 | tokens_b = [] 422 | # Random next 423 | is_random_next = False 424 | if len(current_chunk) == 1 or rng.random() < 0.5: # 有50%的概率,是从其他文档中随机的选取一个文档,并得到这个文档的后半版本作为B即tokens_b 425 | is_random_next = True 426 | target_b_length = target_seq_length - len(tokens_a) 427 | 428 | # This should rarely go for more than one iteration for large 429 | # corpora. 
However, just to be careful, we try to make sure that 430 | # the random document is not the same as the document 431 | # we're processing. 432 | random_document_index=0 433 | for _ in range(10): # 随机的选出一个与当前的文档不一样的文档的索引 434 | random_document_index = rng.randint(0, len(all_documents) - 1) 435 | if random_document_index != document_index: 436 | break 437 | 438 | random_document = all_documents[random_document_index] # 选出这个文档 439 | random_start = rng.randint(0, len(random_document) - 1) # 从这个文档选出一个段落的开始位置 440 | for j in range(random_start, len(random_document)): # 从这个文档的开始位置到结束,作为我们的“A[SEP]B“中的B即tokens_b 441 | tokens_b.extend(random_document[j]) 442 | if len(tokens_b) >= target_b_length: 443 | break 444 | # We didn't actually use these segments so we "put them back" so 445 | # they don't go to waste. 这里是为了防止文本的浪费的一个小技巧 446 | num_unused_segments = len(current_chunk) - a_end # e.g. 550-200=350 447 | i -= num_unused_segments # i=i-num_unused_segments, e.g. i=400, num_unused_segments=350, 那么 i=i-num_unused_segments=400-350=50 448 | # Actual next 449 | else: # 有另外50%的几乎,从当前文本块(长度为max_sequence_length)中的后段中填充到tokens_b即“A[SEP]B“中的B。 450 | is_random_next = False 451 | for j in range(a_end, len(current_chunk)): 452 | tokens_b.extend(current_chunk[j]) 453 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 454 | 455 | assert len(tokens_a) >= 1 456 | assert len(tokens_b) >= 1 457 | 458 | # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 459 | tokens = [] 460 | segment_ids = [] 461 | tokens.append("[CLS]") 462 | segment_ids.append(0) 463 | for token in tokens_a: 464 | tokens.append(token) 465 | segment_ids.append(0) 466 | 467 | tokens.append("[SEP]") 468 | segment_ids.append(0) 469 | 470 | for token in tokens_b: 471 | tokens.append(token) 472 | segment_ids.append(1) 473 | tokens.append("[SEP]") 474 | segment_ids.append(1) 475 | 476 | # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective 477 | (tokens, masked_lm_positions, 478 | masked_lm_labels) = create_masked_lm_predictions( 479 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 480 | instance = TrainingInstance( # 创建训练实例的对象 481 | tokens=tokens, 482 | segment_ids=segment_ids, 483 | is_random_next=is_random_next, 484 | masked_lm_positions=masked_lm_positions, 485 | masked_lm_labels=masked_lm_labels) 486 | instances.append(instance) 487 | current_chunk = [] # 清空当前块 488 | current_length = 0 # 重置当前文本块的长度 489 | i += 1 # 接着文档中的内容往后看 490 | 491 | return instances 492 | 493 | 494 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 495 | ["index", "label"]) 496 | 497 | 498 | def create_masked_lm_predictions(tokens, masked_lm_prob, 499 | max_predictions_per_seq, vocab_words, rng): 500 | """Creates the predictions for the masked LM objective.""" 501 | 502 | cand_indexes = [] 503 | for (i, token) in enumerate(tokens): 504 | if token == "[CLS]" or token == "[SEP]": 505 | continue 506 | # Whole Word Masking means that if we mask all of the wordpieces 507 | # corresponding to an original word. When a word has been split into 508 | # WordPieces, the first token does not have any marker and any subsequence 509 | # tokens are prefixed with ##. So whenever we see the ## token, we 510 | # append it to the previous set of word indexes. 511 | # 512 | # Note that Whole Word Masking does *not* change the training code 513 | # at all -- we still predict each WordPiece independently, softmaxed 514 | # over the entire vocabulary. 
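    # For example, with do_whole_word_mask enabled, the pieces ["今", "##天"] at
    # positions 5 and 6 end up in a single cand_indexes entry [5, 6], so the whole
    # word is either masked together or skipped together further below.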
515 | if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and 516 | token.startswith("##")): 517 | cand_indexes[-1].append(i) 518 | else: 519 | cand_indexes.append([i]) 520 | 521 | rng.shuffle(cand_indexes) 522 | 523 | if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously 524 | output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens] # 去掉"##" 525 | else: # english and other language, which is not chinese 526 | output_tokens = list(tokens) 527 | 528 | num_to_predict = min(max_predictions_per_seq, 529 | max(1, int(round(len(tokens) * masked_lm_prob)))) 530 | 531 | masked_lms = [] 532 | covered_indexes = set() 533 | for index_set in cand_indexes: 534 | if len(masked_lms) >= num_to_predict: 535 | break 536 | # If adding a whole-word mask would exceed the maximum number of 537 | # predictions, then just skip this candidate. 538 | if len(masked_lms) + len(index_set) > num_to_predict: 539 | continue 540 | is_any_index_covered = False 541 | for index in index_set: 542 | if index in covered_indexes: 543 | is_any_index_covered = True 544 | break 545 | if is_any_index_covered: 546 | continue 547 | for index in index_set: 548 | covered_indexes.add(index) 549 | 550 | masked_token = None 551 | # 80% of the time, replace with [MASK] 552 | if rng.random() < 0.8: 553 | masked_token = "[MASK]" 554 | else: 555 | # 10% of the time, keep original 556 | if rng.random() < 0.5: 557 | if FLAGS.non_chinese == False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously 558 | masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0 else tokens[index] # 去掉"##" 559 | else: 560 | masked_token = tokens[index] 561 | # 10% of the time, replace with random word 562 | else: 563 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 564 | 565 | output_tokens[index] = masked_token 566 | 567 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) 568 | assert len(masked_lms) <= num_to_predict 569 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 570 | 571 | masked_lm_positions = [] 572 | masked_lm_labels = [] 573 | for p in masked_lms: 574 | masked_lm_positions.append(p.index) 575 | masked_lm_labels.append(p.label) 576 | 577 | # tf.logging.info('%s' % (tokens)) 578 | # tf.logging.info('%s' % (output_tokens)) 579 | return (output_tokens, masked_lm_positions, masked_lm_labels) 580 | 581 | def create_masked_lm_predictions_original(tokens, masked_lm_prob, 582 | max_predictions_per_seq, vocab_words, rng): 583 | """Creates the predictions for the masked LM objective.""" 584 | 585 | cand_indexes = [] 586 | for (i, token) in enumerate(tokens): 587 | if token == "[CLS]" or token == "[SEP]": 588 | continue 589 | # Whole Word Masking means that if we mask all of the wordpieces 590 | # corresponding to an original word. When a word has been split into 591 | # WordPieces, the first token does not have any marker and any subsequence 592 | # tokens are prefixed with ##. So whenever we see the ## token, we 593 | # append it to the previous set of word indexes. 594 | # 595 | # Note that Whole Word Masking does *not* change the training code 596 | # at all -- we still predict each WordPiece independently, softmaxed 597 | # over the entire vocabulary. 
598 | if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and 599 | token.startswith("##")): 600 | cand_indexes[-1].append(i) 601 | else: 602 | cand_indexes.append([i]) 603 | 604 | rng.shuffle(cand_indexes) 605 | 606 | output_tokens = list(tokens) 607 | 608 | num_to_predict = min(max_predictions_per_seq, 609 | max(1, int(round(len(tokens) * masked_lm_prob)))) 610 | 611 | masked_lms = [] 612 | covered_indexes = set() 613 | for index_set in cand_indexes: 614 | if len(masked_lms) >= num_to_predict: 615 | break 616 | # If adding a whole-word mask would exceed the maximum number of 617 | # predictions, then just skip this candidate. 618 | if len(masked_lms) + len(index_set) > num_to_predict: 619 | continue 620 | is_any_index_covered = False 621 | for index in index_set: 622 | if index in covered_indexes: 623 | is_any_index_covered = True 624 | break 625 | if is_any_index_covered: 626 | continue 627 | for index in index_set: 628 | covered_indexes.add(index) 629 | 630 | masked_token = None 631 | # 80% of the time, replace with [MASK] 632 | if rng.random() < 0.8: 633 | masked_token = "[MASK]" 634 | else: 635 | # 10% of the time, keep original 636 | if rng.random() < 0.5: 637 | masked_token = tokens[index] 638 | # 10% of the time, replace with random word 639 | else: 640 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 641 | 642 | output_tokens[index] = masked_token 643 | 644 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) 645 | assert len(masked_lms) <= num_to_predict 646 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 647 | 648 | masked_lm_positions = [] 649 | masked_lm_labels = [] 650 | for p in masked_lms: 651 | masked_lm_positions.append(p.index) 652 | masked_lm_labels.append(p.label) 653 | 654 | return (output_tokens, masked_lm_positions, masked_lm_labels) 655 | 656 | 657 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): 658 | """Truncates a pair of sequences to a maximum sequence length.""" 659 | while True: 660 | total_length = len(tokens_a) + len(tokens_b) 661 | if total_length <= max_num_tokens: 662 | break 663 | 664 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 665 | assert len(trunc_tokens) >= 1 666 | 667 | # We want to sometimes truncate from the front and sometimes from the 668 | # back to add more randomness and avoid biases. 
669 | if rng.random() < 0.5: 670 | del trunc_tokens[0] 671 | else: 672 | trunc_tokens.pop() 673 | 674 | 675 | def main(_): 676 | tf.logging.set_verbosity(tf.logging.INFO) 677 | 678 | tokenizer = tokenization.FullTokenizer( 679 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 680 | 681 | input_files = [] 682 | for input_pattern in FLAGS.input_file.split(","): 683 | input_files.extend(tf.gfile.Glob(input_pattern)) 684 | 685 | tf.logging.info("*** Reading from input files ***") 686 | for input_file in input_files: 687 | tf.logging.info(" %s", input_file) 688 | 689 | rng = random.Random(FLAGS.random_seed) 690 | instances = create_training_instances( 691 | input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, 692 | FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, 693 | rng) 694 | 695 | output_files = FLAGS.output_file.split(",") 696 | tf.logging.info("*** Writing to output files ***") 697 | for output_file in output_files: 698 | tf.logging.info(" %s", output_file) 699 | 700 | write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, 701 | FLAGS.max_predictions_per_seq, output_files) 702 | 703 | 704 | if __name__ == "__main__": 705 | flags.mark_flag_as_required("input_file") 706 | flags.mark_flag_as_required("output_file") 707 | flags.mark_flag_as_required("vocab_file") 708 | tf.app.run() --------------------------------------------------------------------------------
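A quick way to see the Chinese whole-word-mask preprocessing in isolation is to call get_new_segment() directly. A minimal sketch (assumptions: jieba is installed and this is run from the repository root, next to create_pretraining_data.py):

    from create_pretraining_data import get_new_segment

    # Character-split input sentence, as produced upstream by the tokenizer.
    chars = list("悬灸技术培训专家教你艾灸降血糖")
    print(get_new_segment(chars))
    # Expected to print something like
    # ['悬', '##灸', '技', '##术', '培', '##训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖'],
    # with the exact "##" placement depending on jieba's segmentation.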