├── .gitignore
├── README.md
├── __init__.py
├── modeling.py
├── optimization.py
├── results.py
├── run_chinese_classification.py
└── tokenization.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bert-chinese classification
2 |
3 | Use Google BERT for multi-class classification of Chinese sentences.
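Each of train.tsv, dev.tsv and test.tsv is read by `read_data()` in `run_chinese_classification.py`, which expects one example per line with no header row: an integer class label, a tab, then the raw sentence. The two rows below are purely illustrative; assuming the label indices follow the order of `target_names` in `results.py`, label 0 is 体育 (sports) and label 9 is 娱乐 (entertainment):

0	国安队主场两球取胜迎来联赛开门红
9	新专辑上线首日播放量突破千万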
4 |
5 | ## Train
6 |
7 | export BERT_BASE_DIR=/path/to/model/chinese_L-12_H-768_A-12
8 |
9 | export DATA_DIR=/path/to/data
10 |
11 | python run_chinese_classification.py \
12 | --do_train=true \
13 | --do_eval=true \
14 | --train_dir=$DATA_DIR/Chinesedata/train.tsv \
15 | --dev_dir=$DATA_DIR/Chinesedata/dev.tsv \
16 | --test_dir=$DATA_DIR/Chinesedata/test.tsv \
17 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
18 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
19 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
20 | --max_seq_length=512 \
21 | --train_batch_size=6 \
22 | --learning_rate=1e-6 \
23 | --num_train_epochs=10.0 \
24 | --output_dir=/path/to/chinese_model
25 |
26 | ## Test
27 |
28 | export BERT_BASE_DIR=/path/to/model/chinese_L-12_H-768_A-12
29 |
30 | export DATA_DIR=/path/to/model/data
31 |
32 | export TRAINED_CLASSIFIER=/path/to/model/chinese_model
33 |
34 |
35 | python run_chinese_classification.py \
36 | --train_dir=$DATA_DIR/Chinesedata/train.tsv \
37 | --dev_dir=$DATA_DIR/Chinesedata/dev.tsv \
38 | --test_dir=$DATA_DIR/Chinesedata/test.tsv \
39 | --do_predict=true \
40 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
41 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
42 | --init_checkpoint=$TRAINED_CLASSIFIER \
43 | --max_seq_length=512 \
44 | --output_dir=/path/to/chinese_result
45 |
46 | ## Results
47 |
48 | python results.py
49 |
50 | ## Data
51 |
52 | url: https://pan.baidu.com/s/1qDngiTq1FyNxb5GX-zDiCg
53 | password: 9dk8
54 |
55 | ## References
56 |
57 | https://github.com/google-research/bert
58 |
59 | https://arxiv.org/abs/1810.04805
60 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The main BERT model and related functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import copy 23 | import json 24 | import math 25 | import re 26 | import six 27 | import tensorflow as tf 28 | 29 | 30 | class BertConfig(object): 31 | """Configuration for `BertModel`.""" 32 | 33 | def __init__(self, 34 | vocab_size, 35 | hidden_size=768, 36 | num_hidden_layers=12, 37 | num_attention_heads=12, 38 | intermediate_size=3072, 39 | hidden_act="gelu", 40 | hidden_dropout_prob=0.1, 41 | attention_probs_dropout_prob=0.1, 42 | max_position_embeddings=512, 43 | type_vocab_size=16, 44 | initializer_range=0.02): 45 | """Constructs BertConfig. 46 | 47 | Args: 48 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 49 | hidden_size: Size of the encoder layers and the pooler layer. 50 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 51 | num_attention_heads: Number of attention heads for each attention layer in 52 | the Transformer encoder. 53 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 54 | layer in the Transformer encoder. 55 | hidden_act: The non-linear activation function (function or string) in the 56 | encoder and pooler. 57 | hidden_dropout_prob: The dropout probability for all fully connected 58 | layers in the embeddings, encoder, and pooler. 59 | attention_probs_dropout_prob: The dropout ratio for the attention 60 | probabilities. 61 | max_position_embeddings: The maximum sequence length that this model might 62 | ever be used with. Typically set this to something large just in case 63 | (e.g., 512 or 1024 or 2048). 64 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 65 | `BertModel`. 66 | initializer_range: The stdev of the truncated_normal_initializer for 67 | initializing all weight matrices. 68 | """ 69 | self.vocab_size = vocab_size 70 | self.hidden_size = hidden_size 71 | self.num_hidden_layers = num_hidden_layers 72 | self.num_attention_heads = num_attention_heads 73 | self.hidden_act = hidden_act 74 | self.intermediate_size = intermediate_size 75 | self.hidden_dropout_prob = hidden_dropout_prob 76 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 77 | self.max_position_embeddings = max_position_embeddings 78 | self.type_vocab_size = type_vocab_size 79 | self.initializer_range = initializer_range 80 | 81 | @classmethod 82 | def from_dict(cls, json_object): 83 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 84 | config = BertConfig(vocab_size=None) 85 | for (key, value) in six.iteritems(json_object): 86 | config.__dict__[key] = value 87 | return config 88 | 89 | @classmethod 90 | def from_json_file(cls, json_file): 91 | """Constructs a `BertConfig` from a json file of parameters.""" 92 | with tf.gfile.GFile(json_file, "r") as reader: 93 | text = reader.read() 94 | return cls.from_dict(json.loads(text)) 95 | 96 | def to_dict(self): 97 | """Serializes this instance to a Python dictionary.""" 98 | output = copy.deepcopy(self.__dict__) 99 | return output 100 | 101 | def to_json_string(self): 102 | """Serializes this instance to a JSON string.""" 103 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 104 | 105 | 106 | class BertModel(object): 107 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 
108 | 109 | Example usage: 110 | 111 | ```python 112 | # Already been converted into WordPiece token ids 113 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 114 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 115 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 116 | 117 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 118 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 119 | 120 | model = modeling.BertModel(config=config, is_training=True, 121 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 122 | 123 | label_embeddings = tf.get_variable(...) 124 | pooled_output = model.get_pooled_output() 125 | logits = tf.matmul(pooled_output, label_embeddings) 126 | ... 127 | ``` 128 | """ 129 | 130 | def __init__(self, 131 | config, 132 | is_training, 133 | input_ids, 134 | input_mask=None, 135 | token_type_ids=None, 136 | use_one_hot_embeddings=True, 137 | scope=None): 138 | """Constructor for BertModel. 139 | 140 | Args: 141 | config: `BertConfig` instance. 142 | is_training: bool. true for training model, false for eval model. Controls 143 | whether dropout will be applied. 144 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 145 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 146 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 147 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 148 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 149 | it is must faster if this is True, on the CPU or GPU, it is faster if 150 | this is False. 151 | scope: (optional) variable scope. Defaults to "bert". 152 | 153 | Raises: 154 | ValueError: The config is invalid or one of the input tensor shapes 155 | is invalid. 156 | """ 157 | config = copy.deepcopy(config) 158 | if not is_training: 159 | config.hidden_dropout_prob = 0.0 160 | config.attention_probs_dropout_prob = 0.0 161 | 162 | input_shape = get_shape_list(input_ids, expected_rank=2) 163 | batch_size = input_shape[0] 164 | seq_length = input_shape[1] 165 | 166 | if input_mask is None: 167 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 168 | 169 | if token_type_ids is None: 170 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 171 | 172 | with tf.variable_scope(scope, default_name="bert"): 173 | with tf.variable_scope("embeddings"): 174 | # Perform embedding lookup on the word ids. 175 | (self.embedding_output, self.embedding_table) = embedding_lookup( 176 | input_ids=input_ids, 177 | vocab_size=config.vocab_size, 178 | embedding_size=config.hidden_size, 179 | initializer_range=config.initializer_range, 180 | word_embedding_name="word_embeddings", 181 | use_one_hot_embeddings=use_one_hot_embeddings) 182 | 183 | # Add positional embeddings and token type embeddings, then layer 184 | # normalize and perform dropout. 
185 | self.embedding_output = embedding_postprocessor( 186 | input_tensor=self.embedding_output, 187 | use_token_type=True, 188 | token_type_ids=token_type_ids, 189 | token_type_vocab_size=config.type_vocab_size, 190 | token_type_embedding_name="token_type_embeddings", 191 | use_position_embeddings=True, 192 | position_embedding_name="position_embeddings", 193 | initializer_range=config.initializer_range, 194 | max_position_embeddings=config.max_position_embeddings, 195 | dropout_prob=config.hidden_dropout_prob) 196 | 197 | with tf.variable_scope("encoder"): 198 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 199 | # mask of shape [batch_size, seq_length, seq_length] which is used 200 | # for the attention scores. 201 | attention_mask = create_attention_mask_from_input_mask( 202 | input_ids, input_mask) 203 | 204 | # Run the stacked transformer. 205 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 206 | self.all_encoder_layers = transformer_model( 207 | input_tensor=self.embedding_output, 208 | attention_mask=attention_mask, 209 | hidden_size=config.hidden_size, 210 | num_hidden_layers=config.num_hidden_layers, 211 | num_attention_heads=config.num_attention_heads, 212 | intermediate_size=config.intermediate_size, 213 | intermediate_act_fn=get_activation(config.hidden_act), 214 | hidden_dropout_prob=config.hidden_dropout_prob, 215 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 216 | initializer_range=config.initializer_range, 217 | do_return_all_layers=True) 218 | 219 | self.sequence_output = self.all_encoder_layers[-1] 220 | # The "pooler" converts the encoded sequence tensor of shape 221 | # [batch_size, seq_length, hidden_size] to a tensor of shape 222 | # [batch_size, hidden_size]. This is necessary for segment-level 223 | # (or segment-pair-level) classification tasks where we need a fixed 224 | # dimensional representation of the segment. 225 | with tf.variable_scope("pooler"): 226 | # We "pool" the model by simply taking the hidden state corresponding 227 | # to the first token. We assume that this has been pre-trained 228 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 229 | self.pooled_output = tf.layers.dense( 230 | first_token_tensor, 231 | config.hidden_size, 232 | activation=tf.tanh, 233 | kernel_initializer=create_initializer(config.initializer_range)) 234 | 235 | def get_pooled_output(self): 236 | return self.pooled_output 237 | 238 | def get_sequence_output(self): 239 | """Gets final hidden layer of encoder. 240 | 241 | Returns: 242 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 243 | to the final hidden of the transformer encoder. 244 | """ 245 | return self.sequence_output 246 | 247 | def get_all_encoder_layers(self): 248 | return self.all_encoder_layers 249 | 250 | def get_embedding_output(self): 251 | """Gets output of the embedding lookup (i.e., input to the transformer). 252 | 253 | Returns: 254 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 255 | to the output of the embedding layer, after summing the word 256 | embeddings with the positional embeddings and the token type embeddings, 257 | then performing layer normalization. This is the input to the transformer. 258 | """ 259 | return self.embedding_output 260 | 261 | def get_embedding_table(self): 262 | return self.embedding_table 263 | 264 | 265 | def gelu(input_tensor): 266 | """Gaussian Error Linear Unit. 267 | 268 | This is a smoother version of the RELU. 
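Concretely it computes x * Φ(x) = 0.5 * x * (1.0 + erf(x / sqrt(2.0))), which is exactly what the function body below evaluates.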
269 | Original paper: https://arxiv.org/abs/1606.08415 270 | 271 | Args: 272 | input_tensor: float Tensor to perform activation. 273 | 274 | Returns: 275 | `input_tensor` with the GELU activation applied. 276 | """ 277 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 278 | return input_tensor * cdf 279 | 280 | 281 | def get_activation(activation_string): 282 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 283 | 284 | Args: 285 | activation_string: String name of the activation function. 286 | 287 | Returns: 288 | A Python function corresponding to the activation function. If 289 | `activation_string` is None, empty, or "linear", this will return None. 290 | If `activation_string` is not a string, it will return `activation_string`. 291 | 292 | Raises: 293 | ValueError: The `activation_string` does not correspond to a known 294 | activation. 295 | """ 296 | 297 | # We assume that anything that"s not a string is already an activation 298 | # function, so we just return it. 299 | if not isinstance(activation_string, six.string_types): 300 | return activation_string 301 | 302 | if not activation_string: 303 | return None 304 | 305 | act = activation_string.lower() 306 | if act == "linear": 307 | return None 308 | elif act == "relu": 309 | return tf.nn.relu 310 | elif act == "gelu": 311 | return gelu 312 | elif act == "tanh": 313 | return tf.tanh 314 | else: 315 | raise ValueError("Unsupported activation: %s" % act) 316 | 317 | 318 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 319 | """Compute the union of the current variables and checkpoint variables.""" 320 | assignment_map = {} 321 | initialized_variable_names = {} 322 | 323 | name_to_variable = collections.OrderedDict() 324 | for var in tvars: 325 | name = var.name 326 | m = re.match("^(.*):\\d+$", name) 327 | if m is not None: 328 | name = m.group(1) 329 | name_to_variable[name] = var 330 | 331 | init_vars = tf.train.list_variables(init_checkpoint) 332 | 333 | assignment_map = collections.OrderedDict() 334 | for x in init_vars: 335 | (name, var) = (x[0], x[1]) 336 | if name not in name_to_variable: 337 | continue 338 | assignment_map[name] = name 339 | initialized_variable_names[name] = 1 340 | initialized_variable_names[name + ":0"] = 1 341 | 342 | return (assignment_map, initialized_variable_names) 343 | 344 | 345 | def dropout(input_tensor, dropout_prob): 346 | """Perform dropout. 347 | 348 | Args: 349 | input_tensor: float Tensor. 350 | dropout_prob: Python float. The probability of dropping out a value (NOT of 351 | *keeping* a dimension as in `tf.nn.dropout`). 352 | 353 | Returns: 354 | A version of `input_tensor` with dropout applied. 
355 | """ 356 | if dropout_prob is None or dropout_prob == 0.0: 357 | return input_tensor 358 | 359 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 360 | return output 361 | 362 | 363 | def layer_norm(input_tensor, name=None): 364 | """Run layer normalization on the last dimension of the tensor.""" 365 | return tf.contrib.layers.layer_norm( 366 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 367 | 368 | 369 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 370 | """Runs layer normalization followed by dropout.""" 371 | output_tensor = layer_norm(input_tensor, name) 372 | output_tensor = dropout(output_tensor, dropout_prob) 373 | return output_tensor 374 | 375 | 376 | def create_initializer(initializer_range=0.02): 377 | """Creates a `truncated_normal_initializer` with the given range.""" 378 | return tf.truncated_normal_initializer(stddev=initializer_range) 379 | 380 | 381 | def embedding_lookup(input_ids, 382 | vocab_size, 383 | embedding_size=128, 384 | initializer_range=0.02, 385 | word_embedding_name="word_embeddings", 386 | use_one_hot_embeddings=False): 387 | """Looks up words embeddings for id tensor. 388 | 389 | Args: 390 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 391 | ids. 392 | vocab_size: int. Size of the embedding vocabulary. 393 | embedding_size: int. Width of the word embeddings. 394 | initializer_range: float. Embedding initialization range. 395 | word_embedding_name: string. Name of the embedding table. 396 | use_one_hot_embeddings: bool. If True, use one-hot method for word 397 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 398 | for TPUs. 399 | 400 | Returns: 401 | float Tensor of shape [batch_size, seq_length, embedding_size]. 402 | """ 403 | # This function assumes that the input is of shape [batch_size, seq_length, 404 | # num_inputs]. 405 | # 406 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 407 | # reshape to [batch_size, seq_length, 1]. 408 | if input_ids.shape.ndims == 2: 409 | input_ids = tf.expand_dims(input_ids, axis=[-1]) 410 | 411 | embedding_table = tf.get_variable( 412 | name=word_embedding_name, 413 | shape=[vocab_size, embedding_size], 414 | initializer=create_initializer(initializer_range)) 415 | 416 | if use_one_hot_embeddings: 417 | flat_input_ids = tf.reshape(input_ids, [-1]) 418 | one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) 419 | output = tf.matmul(one_hot_input_ids, embedding_table) 420 | else: 421 | output = tf.nn.embedding_lookup(embedding_table, input_ids) 422 | 423 | input_shape = get_shape_list(input_ids) 424 | 425 | output = tf.reshape(output, 426 | input_shape[0:-1] + [input_shape[-1] * embedding_size]) 427 | return (output, embedding_table) 428 | 429 | 430 | def embedding_postprocessor(input_tensor, 431 | use_token_type=False, 432 | token_type_ids=None, 433 | token_type_vocab_size=16, 434 | token_type_embedding_name="token_type_embeddings", 435 | use_position_embeddings=True, 436 | position_embedding_name="position_embeddings", 437 | initializer_range=0.02, 438 | max_position_embeddings=512, 439 | dropout_prob=0.1): 440 | """Performs various post-processing on a word embedding tensor. 441 | 442 | Args: 443 | input_tensor: float Tensor of shape [batch_size, seq_length, 444 | embedding_size]. 445 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 446 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 
447 | Must be specified if `use_token_type` is True. 448 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 449 | token_type_embedding_name: string. The name of the embedding table variable 450 | for token type ids. 451 | use_position_embeddings: bool. Whether to add position embeddings for the 452 | position of each token in the sequence. 453 | position_embedding_name: string. The name of the embedding table variable 454 | for positional embeddings. 455 | initializer_range: float. Range of the weight initialization. 456 | max_position_embeddings: int. Maximum sequence length that might ever be 457 | used with this model. This can be longer than the sequence length of 458 | input_tensor, but cannot be shorter. 459 | dropout_prob: float. Dropout probability applied to the final output tensor. 460 | 461 | Returns: 462 | float tensor with same shape as `input_tensor`. 463 | 464 | Raises: 465 | ValueError: One of the tensor shapes or input values is invalid. 466 | """ 467 | input_shape = get_shape_list(input_tensor, expected_rank=3) 468 | batch_size = input_shape[0] 469 | seq_length = input_shape[1] 470 | width = input_shape[2] 471 | 472 | if seq_length > max_position_embeddings: 473 | raise ValueError("The seq length (%d) cannot be greater than " 474 | "`max_position_embeddings` (%d)" % 475 | (seq_length, max_position_embeddings)) 476 | 477 | output = input_tensor 478 | 479 | if use_token_type: 480 | if token_type_ids is None: 481 | raise ValueError("`token_type_ids` must be specified if" 482 | "`use_token_type` is True.") 483 | token_type_table = tf.get_variable( 484 | name=token_type_embedding_name, 485 | shape=[token_type_vocab_size, width], 486 | initializer=create_initializer(initializer_range)) 487 | # This vocab will be small so we always do one-hot here, since it is always 488 | # faster for a small vocabulary. 489 | flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 490 | one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) 491 | token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) 492 | token_type_embeddings = tf.reshape(token_type_embeddings, 493 | [batch_size, seq_length, width]) 494 | output += token_type_embeddings 495 | 496 | if use_position_embeddings: 497 | full_position_embeddings = tf.get_variable( 498 | name=position_embedding_name, 499 | shape=[max_position_embeddings, width], 500 | initializer=create_initializer(initializer_range)) 501 | # Since the position embedding table is a learned variable, we create it 502 | # using a (long) sequence length `max_position_embeddings`. The actual 503 | # sequence length might be shorter than this, for faster training of 504 | # tasks that do not have long sequences. 505 | # 506 | # So `full_position_embeddings` is effectively an embedding table 507 | # for position [0, 1, 2, ..., max_position_embeddings-1], and the current 508 | # sequence has positions [0, 1, 2, ... seq_length-1], so we can just 509 | # perform a slice. 510 | if seq_length < max_position_embeddings: 511 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], 512 | [seq_length, -1]) 513 | else: 514 | position_embeddings = full_position_embeddings 515 | 516 | num_dims = len(output.shape.as_list()) 517 | 518 | # Only the last two dimensions are relevant (`seq_length` and `width`), so 519 | # we broadcast among the first dimensions, which is typically just 520 | # the batch size. 
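# For example, if `output` has shape [batch_size, seq_length, width] = [8, 128, 768],
# `position_embeddings` is reshaped below to [1, 128, 768] and broadcast-added to `output`.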
521 | position_broadcast_shape = [] 522 | for _ in range(num_dims - 2): 523 | position_broadcast_shape.append(1) 524 | position_broadcast_shape.extend([seq_length, width]) 525 | position_embeddings = tf.reshape(position_embeddings, 526 | position_broadcast_shape) 527 | output += position_embeddings 528 | 529 | output = layer_norm_and_dropout(output, dropout_prob) 530 | return output 531 | 532 | 533 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 534 | """Create 3D attention mask from a 2D tensor mask. 535 | 536 | Args: 537 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 538 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 539 | 540 | Returns: 541 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 542 | """ 543 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 544 | batch_size = from_shape[0] 545 | from_seq_length = from_shape[1] 546 | 547 | to_shape = get_shape_list(to_mask, expected_rank=2) 548 | to_seq_length = to_shape[1] 549 | 550 | to_mask = tf.cast( 551 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 552 | 553 | # We don't assume that `from_tensor` is a mask (although it could be). We 554 | # don't actually care if we attend *from* padding tokens (only *to* padding) 555 | # tokens so we create a tensor of all ones. 556 | # 557 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 558 | broadcast_ones = tf.ones( 559 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 560 | 561 | # Here we broadcast along two dimensions to create the mask. 562 | mask = broadcast_ones * to_mask 563 | 564 | return mask 565 | 566 | 567 | def attention_layer(from_tensor, 568 | to_tensor, 569 | attention_mask=None, 570 | num_attention_heads=1, 571 | size_per_head=512, 572 | query_act=None, 573 | key_act=None, 574 | value_act=None, 575 | attention_probs_dropout_prob=0.0, 576 | initializer_range=0.02, 577 | do_return_2d_tensor=False, 578 | batch_size=None, 579 | from_seq_length=None, 580 | to_seq_length=None): 581 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 582 | 583 | This is an implementation of multi-headed attention based on "Attention 584 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 585 | this is self-attention. Each timestep in `from_tensor` attends to the 586 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 587 | 588 | This function first projects `from_tensor` into a "query" tensor and 589 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 590 | of tensors of length `num_attention_heads`, where each tensor is of shape 591 | [batch_size, seq_length, size_per_head]. 592 | 593 | Then, the query and key tensors are dot-producted and scaled. These are 594 | softmaxed to obtain attention probabilities. The value tensors are then 595 | interpolated by these probabilities, then concatenated back to a single 596 | tensor and returned. 597 | 598 | In practice, the multi-headed attention are done with transposes and 599 | reshapes rather than actual separate tensors. 600 | 601 | Args: 602 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 603 | from_width]. 604 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 605 | attention_mask: (optional) int32 Tensor of shape [batch_size, 606 | from_seq_length, to_seq_length]. The values should be 1 or 0. 
The 607 | attention scores will effectively be set to -infinity for any positions in 608 | the mask that are 0, and will be unchanged for positions that are 1. 609 | num_attention_heads: int. Number of attention heads. 610 | size_per_head: int. Size of each attention head. 611 | query_act: (optional) Activation function for the query transform. 612 | key_act: (optional) Activation function for the key transform. 613 | value_act: (optional) Activation function for the value transform. 614 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 615 | attention probabilities. 616 | initializer_range: float. Range of the weight initializer. 617 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 618 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 619 | output will be of shape [batch_size, from_seq_length, num_attention_heads 620 | * size_per_head]. 621 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 622 | of the 3D version of the `from_tensor` and `to_tensor`. 623 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 624 | of the 3D version of the `from_tensor`. 625 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 626 | of the 3D version of the `to_tensor`. 627 | 628 | Returns: 629 | float Tensor of shape [batch_size, from_seq_length, 630 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 631 | true, this will be of shape [batch_size * from_seq_length, 632 | num_attention_heads * size_per_head]). 633 | 634 | Raises: 635 | ValueError: Any of the arguments or tensor shapes are invalid. 636 | """ 637 | 638 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 639 | seq_length, width): 640 | output_tensor = tf.reshape( 641 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 642 | 643 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 644 | return output_tensor 645 | 646 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 647 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 648 | 649 | if len(from_shape) != len(to_shape): 650 | raise ValueError( 651 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 652 | 653 | if len(from_shape) == 3: 654 | batch_size = from_shape[0] 655 | from_seq_length = from_shape[1] 656 | to_seq_length = to_shape[1] 657 | elif len(from_shape) == 2: 658 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 659 | raise ValueError( 660 | "When passing in rank 2 tensors to attention_layer, the values " 661 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 662 | "must all be specified.") 663 | 664 | # Scalar dimensions referenced here: 665 | # B = batch size (number of sequences) 666 | # F = `from_tensor` sequence length 667 | # T = `to_tensor` sequence length 668 | # N = `num_attention_heads` 669 | # H = `size_per_head` 670 | 671 | from_tensor_2d = reshape_to_matrix(from_tensor) 672 | to_tensor_2d = reshape_to_matrix(to_tensor) 673 | 674 | # `query_layer` = [B*F, N*H] 675 | query_layer = tf.layers.dense( 676 | from_tensor_2d, 677 | num_attention_heads * size_per_head, 678 | activation=query_act, 679 | name="query", 680 | kernel_initializer=create_initializer(initializer_range)) 681 | 682 | # `key_layer` = [B*T, N*H] 683 | key_layer = tf.layers.dense( 684 | to_tensor_2d, 685 | num_attention_heads * size_per_head, 686 | activation=key_act, 687 | name="key", 688 | 
kernel_initializer=create_initializer(initializer_range)) 689 | 690 | # `value_layer` = [B*T, N*H] 691 | value_layer = tf.layers.dense( 692 | to_tensor_2d, 693 | num_attention_heads * size_per_head, 694 | activation=value_act, 695 | name="value", 696 | kernel_initializer=create_initializer(initializer_range)) 697 | 698 | # `query_layer` = [B, N, F, H] 699 | query_layer = transpose_for_scores(query_layer, batch_size, 700 | num_attention_heads, from_seq_length, 701 | size_per_head) 702 | 703 | # `key_layer` = [B, N, T, H] 704 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 705 | to_seq_length, size_per_head) 706 | 707 | # Take the dot product between "query" and "key" to get the raw 708 | # attention scores. 709 | # `attention_scores` = [B, N, F, T] 710 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 711 | attention_scores = tf.multiply(attention_scores, 712 | 1.0 / math.sqrt(float(size_per_head))) 713 | 714 | if attention_mask is not None: 715 | # `attention_mask` = [B, 1, F, T] 716 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 717 | 718 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 719 | # masked positions, this operation will create a tensor which is 0.0 for 720 | # positions we want to attend and -10000.0 for masked positions. 721 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 722 | 723 | # Since we are adding it to the raw scores before the softmax, this is 724 | # effectively the same as removing these entirely. 725 | attention_scores += adder 726 | 727 | # Normalize the attention scores to probabilities. 728 | # `attention_probs` = [B, N, F, T] 729 | attention_probs = tf.nn.softmax(attention_scores) 730 | 731 | # This is actually dropping out entire tokens to attend to, which might 732 | # seem a bit unusual, but is taken from the original Transformer paper. 733 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 734 | 735 | # `value_layer` = [B, T, N, H] 736 | value_layer = tf.reshape( 737 | value_layer, 738 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 739 | 740 | # `value_layer` = [B, N, T, H] 741 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 742 | 743 | # `context_layer` = [B, N, F, H] 744 | context_layer = tf.matmul(attention_probs, value_layer) 745 | 746 | # `context_layer` = [B, F, N, H] 747 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 748 | 749 | if do_return_2d_tensor: 750 | # `context_layer` = [B*F, N*V] 751 | context_layer = tf.reshape( 752 | context_layer, 753 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 754 | else: 755 | # `context_layer` = [B, F, N*V] 756 | context_layer = tf.reshape( 757 | context_layer, 758 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 759 | 760 | return context_layer 761 | 762 | 763 | def transformer_model(input_tensor, 764 | attention_mask=None, 765 | hidden_size=768, 766 | num_hidden_layers=12, 767 | num_attention_heads=12, 768 | intermediate_size=3072, 769 | intermediate_act_fn=gelu, 770 | hidden_dropout_prob=0.1, 771 | attention_probs_dropout_prob=0.1, 772 | initializer_range=0.02, 773 | do_return_all_layers=False): 774 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 775 | 776 | This is almost an exact implementation of the original Transformer encoder. 
777 | 778 | See the original paper: 779 | https://arxiv.org/abs/1706.03762 780 | 781 | Also see: 782 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 783 | 784 | Args: 785 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 786 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 787 | seq_length], with 1 for positions that can be attended to and 0 in 788 | positions that should not be. 789 | hidden_size: int. Hidden size of the Transformer. 790 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 791 | num_attention_heads: int. Number of attention heads in the Transformer. 792 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 793 | forward) layer. 794 | intermediate_act_fn: function. The non-linear activation function to apply 795 | to the output of the intermediate/feed-forward layer. 796 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 797 | attention_probs_dropout_prob: float. Dropout probability of the attention 798 | probabilities. 799 | initializer_range: float. Range of the initializer (stddev of truncated 800 | normal). 801 | do_return_all_layers: Whether to also return all layers or just the final 802 | layer. 803 | 804 | Returns: 805 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 806 | hidden layer of the Transformer. 807 | 808 | Raises: 809 | ValueError: A Tensor shape or parameter is invalid. 810 | """ 811 | if hidden_size % num_attention_heads != 0: 812 | raise ValueError( 813 | "The hidden size (%d) is not a multiple of the number of attention " 814 | "heads (%d)" % (hidden_size, num_attention_heads)) 815 | 816 | attention_head_size = int(hidden_size / num_attention_heads) 817 | input_shape = get_shape_list(input_tensor, expected_rank=3) 818 | batch_size = input_shape[0] 819 | seq_length = input_shape[1] 820 | input_width = input_shape[2] 821 | 822 | # The Transformer performs sum residuals on all layers so the input needs 823 | # to be the same as the hidden size. 824 | if input_width != hidden_size: 825 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 826 | (input_width, hidden_size)) 827 | 828 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 829 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 830 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 831 | # help the optimizer. 
832 | prev_output = reshape_to_matrix(input_tensor) 833 | 834 | all_layer_outputs = [] 835 | for layer_idx in range(num_hidden_layers): 836 | with tf.variable_scope("layer_%d" % layer_idx): 837 | layer_input = prev_output 838 | 839 | with tf.variable_scope("attention"): 840 | attention_heads = [] 841 | with tf.variable_scope("self"): 842 | attention_head = attention_layer( 843 | from_tensor=layer_input, 844 | to_tensor=layer_input, 845 | attention_mask=attention_mask, 846 | num_attention_heads=num_attention_heads, 847 | size_per_head=attention_head_size, 848 | attention_probs_dropout_prob=attention_probs_dropout_prob, 849 | initializer_range=initializer_range, 850 | do_return_2d_tensor=True, 851 | batch_size=batch_size, 852 | from_seq_length=seq_length, 853 | to_seq_length=seq_length) 854 | attention_heads.append(attention_head) 855 | 856 | attention_output = None 857 | if len(attention_heads) == 1: 858 | attention_output = attention_heads[0] 859 | else: 860 | # In the case where we have other sequences, we just concatenate 861 | # them to the self-attention head before the projection. 862 | attention_output = tf.concat(attention_heads, axis=-1) 863 | 864 | # Run a linear projection of `hidden_size` then add a residual 865 | # with `layer_input`. 866 | with tf.variable_scope("output"): 867 | attention_output = tf.layers.dense( 868 | attention_output, 869 | hidden_size, 870 | kernel_initializer=create_initializer(initializer_range)) 871 | attention_output = dropout(attention_output, hidden_dropout_prob) 872 | attention_output = layer_norm(attention_output + layer_input) 873 | 874 | # The activation is only applied to the "intermediate" hidden layer. 875 | with tf.variable_scope("intermediate"): 876 | intermediate_output = tf.layers.dense( 877 | attention_output, 878 | intermediate_size, 879 | activation=intermediate_act_fn, 880 | kernel_initializer=create_initializer(initializer_range)) 881 | 882 | # Down-project back to `hidden_size` then add the residual. 883 | with tf.variable_scope("output"): 884 | layer_output = tf.layers.dense( 885 | intermediate_output, 886 | hidden_size, 887 | kernel_initializer=create_initializer(initializer_range)) 888 | layer_output = dropout(layer_output, hidden_dropout_prob) 889 | layer_output = layer_norm(layer_output + attention_output) 890 | prev_output = layer_output 891 | all_layer_outputs.append(layer_output) 892 | 893 | if do_return_all_layers: 894 | final_outputs = [] 895 | for layer_output in all_layer_outputs: 896 | final_output = reshape_from_matrix(layer_output, input_shape) 897 | final_outputs.append(final_output) 898 | return final_outputs 899 | else: 900 | final_output = reshape_from_matrix(prev_output, input_shape) 901 | return final_output 902 | 903 | 904 | def get_shape_list(tensor, expected_rank=None, name=None): 905 | """Returns a list of the shape of tensor, preferring static dimensions. 906 | 907 | Args: 908 | tensor: A tf.Tensor object to find the shape of. 909 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 910 | specified and the `tensor` has a different rank, and exception will be 911 | thrown. 912 | name: Optional name of the tensor for the error message. 913 | 914 | Returns: 915 | A list of dimensions of the shape of tensor. All static dimensions will 916 | be returned as python integers, and dynamic dimensions will be returned 917 | as tf.Tensor scalars. 
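For example, a tensor with static shape [None, 128] yields [<scalar int32 Tensor>, 128].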
918 | """ 919 | if name is None: 920 | name = tensor.name 921 | 922 | if expected_rank is not None: 923 | assert_rank(tensor, expected_rank, name) 924 | 925 | shape = tensor.shape.as_list() 926 | 927 | non_static_indexes = [] 928 | for (index, dim) in enumerate(shape): 929 | if dim is None: 930 | non_static_indexes.append(index) 931 | 932 | if not non_static_indexes: 933 | return shape 934 | 935 | dyn_shape = tf.shape(tensor) 936 | for index in non_static_indexes: 937 | shape[index] = dyn_shape[index] 938 | return shape 939 | 940 | 941 | def reshape_to_matrix(input_tensor): 942 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 943 | ndims = input_tensor.shape.ndims 944 | if ndims < 2: 945 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 946 | (input_tensor.shape)) 947 | if ndims == 2: 948 | return input_tensor 949 | 950 | width = input_tensor.shape[-1] 951 | output_tensor = tf.reshape(input_tensor, [-1, width]) 952 | return output_tensor 953 | 954 | 955 | def reshape_from_matrix(output_tensor, orig_shape_list): 956 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 957 | if len(orig_shape_list) == 2: 958 | return output_tensor 959 | 960 | output_shape = get_shape_list(output_tensor) 961 | 962 | orig_dims = orig_shape_list[0:-1] 963 | width = output_shape[-1] 964 | 965 | return tf.reshape(output_tensor, orig_dims + [width]) 966 | 967 | 968 | def assert_rank(tensor, expected_rank, name=None): 969 | """Raises an exception if the tensor rank is not of the expected rank. 970 | 971 | Args: 972 | tensor: A tf.Tensor to check the rank of. 973 | expected_rank: Python integer or list of integers, expected rank. 974 | name: Optional name of the tensor for the error message. 975 | 976 | Raises: 977 | ValueError: If the expected shape doesn't match the actual shape. 978 | """ 979 | if name is None: 980 | name = tensor.name 981 | 982 | expected_rank_dict = {} 983 | if isinstance(expected_rank, six.integer_types): 984 | expected_rank_dict[expected_rank] = True 985 | else: 986 | for x in expected_rank: 987 | expected_rank_dict[x] = True 988 | 989 | actual_rank = tensor.shape.ndims 990 | if actual_rank not in expected_rank_dict: 991 | scope_name = tf.get_variable_scope().name 992 | raise ValueError( 993 | "For the tensor `%s` in scope `%s`, the actual rank " 994 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 995 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 996 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
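# Concretely, the update applied below is m / (sqrt(v) + epsilon) + weight_decay_rate * param,
# scaled by the learning rate and subtracted from `param`; the decay term never touches m or v.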
143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | data_dir1="./test_results.tsv" 3 | data_dir="./test.tsv" 4 | with open(file=data_dir,mode="r",encoding="utf-8") as f: 5 | text=f.readlines() 6 | y_true=[] 7 | for t in text: 8 | if t.split("\t")!=0 and t!="\n": 9 | y_true.append(int(t.split("\t")[0])) 10 | 11 | with open(file=data_dir1,mode="r",encoding="utf-8") as f: 12 | result=f.readlines() 13 | y_pred=[] 14 | for l in result: 15 | l=list(map(float,l.split("\t"))) 16 | y_pred.append(np.argmax(l)) 17 | 18 | from sklearn import metrics 19 | # 混淆矩阵 20 | print("Confusion Matrix...") 21 | cm = metrics.confusion_matrix(y_true, y_pred) 22 | from sklearn.metrics import classification_report 23 | target_names = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'] 24 | print(classification_report(y_true, y_pred, target_names=target_names)) -------------------------------------------------------------------------------- /run_chinese_classification.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import collections 8 | import csv 9 | import os 10 | import modeling 11 | import optimization 12 | import tokenization 13 | import tensorflow as tf 14 | flags = tf.flags 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | ## Required parameters 19 | flags.DEFINE_string( 20 | "train_dir", None, 21 | "The input train data dir. Should contain the .tsv files (or other data files) " 22 | "for the task.") 23 | flags.DEFINE_string( 24 | "dev_dir", None, 25 | "The input dev data dir. Should contain the .tsv files (or other data files) " 26 | "for the task.") 27 | flags.DEFINE_string( 28 | "test_dir", None, 29 | "The input test dir. Should contain the .tsv files (or other data files) " 30 | "for the task.") 31 | 32 | flags.DEFINE_string( 33 | "bert_config_file", None, 34 | "The config json file corresponding to the pre-trained BERT model. 
" 35 | "This specifies the model architecture.") 36 | 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_string( 42 | "output_dir", None, 43 | "The output directory where the model checkpoints will be written.") 44 | 45 | ## Other parameters 46 | 47 | flags.DEFINE_string( 48 | "init_checkpoint", None, 49 | "Initial checkpoint (usually from a pre-trained BERT model).") 50 | 51 | flags.DEFINE_bool( 52 | "do_lower_case", True, 53 | "Whether to lower case the input text. Should be True for uncased " 54 | "models and False for cased models.") 55 | 56 | flags.DEFINE_integer( 57 | "max_seq_length", 256, 58 | "The maximum total input sequence length after WordPiece tokenization. " 59 | "Sequences longer than this will be truncated, and sequences shorter " 60 | "than this will be padded.") 61 | 62 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 63 | 64 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 65 | 66 | flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") 67 | 68 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 69 | 70 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 71 | 72 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 73 | 74 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 75 | 76 | flags.DEFINE_float("num_train_epochs", 3.0, 77 | "Total number of training epochs to perform.") 78 | 79 | flags.DEFINE_float( 80 | "warmup_proportion", 0.1, 81 | "Proportion of training to perform linear learning rate warmup for. " 82 | "E.g., 0.1 = 10% of training.") 83 | 84 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 85 | "How often to save the model checkpoint.") 86 | 87 | flags.DEFINE_integer("iterations_per_loop", 1000, 88 | "How many steps to make in each estimator call.") 89 | 90 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 91 | 92 | tf.flags.DEFINE_string( 93 | "tpu_name", None, 94 | "The Cloud TPU to use for training. This should be either the name " 95 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 96 | "url.") 97 | 98 | tf.flags.DEFINE_string( 99 | "tpu_zone", None, 100 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 101 | "specified, we will attempt to automatically detect the GCE project from " 102 | "metadata.") 103 | 104 | tf.flags.DEFINE_string( 105 | "gcp_project", None, 106 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 107 | "specified, we will attempt to automatically detect the GCE project from " 108 | "metadata.") 109 | 110 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 111 | 112 | flags.DEFINE_integer( 113 | "num_tpu_cores", 8, 114 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 115 | 116 | class InputExample(object): 117 | """A single training/test example for simple sequence classification.""" 118 | 119 | def __init__(self, guid, text_a, text_b=None, label=None): 120 | """Constructs a InputExample. 121 | 122 | Args: 123 | guid: Unique id for the example. 124 | text_a: string. The untokenized text of the first sequence. For single 125 | sequence tasks, only this sequence must be specified. 126 | text_b: (Optional) string. The untokenized text of the second sequence. 127 | Only must be specified for sequence pair tasks. 
128 | label: (Optional) string. The label of the example. This should be 129 | specified for train and dev examples, but not for test examples. 130 | """ 131 | self.guid = guid 132 | self.text_a = text_a 133 | self.text_b = text_b 134 | self.label = label 135 | class InputFeatures(object): 136 | """A single set of features of data.""" 137 | 138 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 139 | self.input_ids = input_ids 140 | self.input_mask = input_mask 141 | self.segment_ids = segment_ids 142 | self.label_id = label_id 143 | 144 | def read_data(input_file): 145 | examples=[] 146 | with open(file=input_file,mode= "r",encoding="utf-8") as f: 147 | reader = f.read().strip().split('\n') 148 | for i ,data in enumerate(reader): 149 | if data!="" and data!=" ": 150 | text=data.split("\t")[1] 151 | label= data.split("\t")[0] 152 | guid=i 153 | example=InputExample(guid=guid,text_a=text,text_b=None,label=label) 154 | examples.append(example) 155 | return examples 156 | 157 | def convert_single_example(example, max_seq_length, tokenizer): 158 | """Converts a single `InputExample` into a single `InputFeatures`.""" 159 | 160 | tokens_a = tokenizer.tokenize(example.text_a) 161 | # Account for [CLS] and [SEP] with "- 2" 162 | if len(tokens_a) > max_seq_length - 2: 163 | tokens_a = tokens_a[0:(max_seq_length - 2)] 164 | 165 | # The convention in BERT is: 166 | # (a) For sequence pairs: 167 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 168 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 169 | # (b) For single sequences: 170 | # tokens: [CLS] the dog is hairy . [SEP] 171 | # type_ids: 0 0 0 0 0 0 0 172 | # 173 | # Where "type_ids" are used to indicate whether this is the first 174 | # sequence or the second sequence. The embedding vectors for `type=0` and 175 | # `type=1` were learned during pre-training and are added to the wordpiece 176 | # embedding vector (and position vector). This is not *strictly* necessary 177 | # since the [SEP] token unambiguously separates the sequences, but it makes 178 | # it easier for the model to learn the concept of sequences. 179 | # 180 | # For classification tasks, the first vector (corresponding to [CLS]) is 181 | # used as as the "sentence vector". Note that this only makes sense because 182 | # the entire model is fine-tuned. 183 | tokens = [] 184 | segment_ids = [] 185 | tokens.append("[CLS]") 186 | segment_ids.append(0) 187 | for token in tokens_a: 188 | tokens.append(token) 189 | segment_ids.append(0) 190 | tokens.append("[SEP]") 191 | segment_ids.append(0) 192 | 193 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 194 | 195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 196 | # tokens are attended to. 197 | input_mask = [1] * len(input_ids) 198 | 199 | # Zero-pad up to the sequence length. 
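# For example (token ids are illustrative), with max_seq_length=8 and tokens
# ["[CLS]", "北", "京", "[SEP]"], this yields input_ids [101, 1266, 776, 102, 0, 0, 0, 0],
# input_mask [1, 1, 1, 1, 0, 0, 0, 0] and segment_ids [0, 0, 0, 0, 0, 0, 0, 0].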
200 | while len(input_ids) < max_seq_length: 201 | input_ids.append(0) 202 | input_mask.append(0) 203 | segment_ids.append(0) 204 | 205 | assert len(input_ids) == max_seq_length 206 | assert len(input_mask) == max_seq_length 207 | assert len(segment_ids) == max_seq_length 208 | 209 | label_id = int(example.label) 210 | 211 | feature = InputFeatures( 212 | input_ids=input_ids, 213 | input_mask=input_mask, 214 | segment_ids=segment_ids, 215 | label_id=label_id) 216 | return feature 217 | 218 | def convert_examples_to_features(examples, max_seq_length, 219 | tokenizer): 220 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 221 | features = [] 222 | for (ex_index, example) in enumerate(examples): 223 | feature = convert_single_example(example, max_seq_length, tokenizer) 224 | features.append(feature) 225 | return features 226 | def file_based_convert_examples_to_features( 227 | examples, max_seq_length, tokenizer, output_file): 228 | """Convert a set of `InputExample`s to a TFRecord file.""" 229 | 230 | writer = tf.python_io.TFRecordWriter(output_file) 231 | 232 | for (ex_index, example) in enumerate(examples): 233 | if ex_index % 10000 == 0: 234 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 235 | 236 | feature = convert_single_example(example, max_seq_length, tokenizer) 237 | 238 | def create_int_feature(values): 239 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 240 | return f 241 | 242 | features = collections.OrderedDict() 243 | features["input_ids"] = create_int_feature(feature.input_ids) 244 | features["input_mask"] = create_int_feature(feature.input_mask) 245 | features["segment_ids"] = create_int_feature(feature.segment_ids) 246 | features["label_ids"] = create_int_feature([feature.label_id]) 247 | 248 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 249 | writer.write(tf_example.SerializeToString()) 250 | writer.close() 251 | def file_based_input_fn_builder(input_file, seq_length, is_training, 252 | drop_remainder): 253 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 254 | 255 | name_to_features = { 256 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 257 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 258 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 259 | "label_ids": tf.FixedLenFeature([], tf.int64), 260 | } 261 | 262 | def _decode_record(record, name_to_features): 263 | """Decodes a record to a TensorFlow example.""" 264 | example = tf.parse_single_example(record, name_to_features) 265 | 266 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 267 | # So cast all int64 to int32. 268 | for name in list(example.keys()): 269 | t = example[name] 270 | if t.dtype == tf.int64: 271 | t = tf.to_int32(t) 272 | example[name] = t 273 | 274 | return example 275 | 276 | def input_fn(params): 277 | """The actual input function.""" 278 | batch_size = params["batch_size"] 279 | 280 | # For training, we want a lot of parallel reading and shuffling. 281 | # For eval, we want no shuffling and parallel reading doesn't matter.
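    # The dataset below re-reads the TFRecord file written by
    # file_based_convert_examples_to_features, then decodes and batches the
    # records in one step via map_and_batch. drop_remainder is passed as True
    # for training (and for eval/predict on TPU), where every batch must have
    # the same static shape.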
282 | d = tf.data.TFRecordDataset(input_file) 283 | if is_training: 284 | d = d.repeat() 285 | d = d.shuffle(buffer_size=100) 286 | 287 | d = d.apply( 288 | tf.contrib.data.map_and_batch( 289 | lambda record: _decode_record(record, name_to_features), 290 | batch_size=batch_size, 291 | drop_remainder=drop_remainder)) 292 | 293 | return d 294 | 295 | return input_fn 296 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 297 | labels, num_labels, use_one_hot_embeddings): 298 | """Creates a classification model.""" 299 | model = modeling.BertModel( 300 | config=bert_config, 301 | is_training=is_training, 302 | input_ids=input_ids, 303 | input_mask=input_mask, 304 | token_type_ids=segment_ids, 305 | use_one_hot_embeddings=use_one_hot_embeddings) 306 | 307 | # In the demo, we are doing a simple classification task on the entire 308 | # segment. 309 | # 310 | # If you want to use the token-level output, use model.get_sequence_output() 311 | # instead. 312 | output_layer = model.get_pooled_output() 313 | 314 | hidden_size = output_layer.shape[-1].value 315 | 316 | output_weights = tf.get_variable( 317 | "output_weights", [num_labels, hidden_size], 318 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 319 | 320 | output_bias = tf.get_variable( 321 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 322 | 323 | with tf.variable_scope("loss"): 324 | if is_training: 325 | # I.e., 0.1 dropout 326 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 327 | 328 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 329 | logits = tf.nn.bias_add(logits, output_bias) 330 | probabilities = tf.nn.softmax(logits, axis=-1) 331 | log_probs = tf.nn.log_softmax(logits, axis=-1) 332 | 333 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 334 | 335 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 336 | loss = tf.reduce_mean(per_example_loss) 337 | 338 | return (loss, per_example_loss, logits, probabilities) 339 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 340 | num_train_steps, num_warmup_steps, use_tpu, 341 | use_one_hot_embeddings): 342 | """Returns `model_fn` closure for TPUEstimator.""" 343 | 344 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 345 | """The `model_fn` for TPUEstimator.""" 346 | 347 | tf.logging.info("*** Features ***") 348 | for name in sorted(features.keys()): 349 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 350 | 351 | input_ids = features["input_ids"] 352 | input_mask = features["input_mask"] 353 | segment_ids = features["segment_ids"] 354 | label_ids = features["label_ids"] 355 | 356 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 357 | 358 | (total_loss, per_example_loss, logits, probabilities) = create_model( 359 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 360 | num_labels, use_one_hot_embeddings) 361 | 362 | tvars = tf.trainable_variables() 363 | initialized_variable_names = {} 364 | scaffold_fn = None 365 | if init_checkpoint: 366 | (assignment_map, initialized_variable_names 367 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 368 | if use_tpu: 369 | 370 | def tpu_scaffold(): 371 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 372 | return tf.train.Scaffold() 373 | 374 | scaffold_fn = tpu_scaffold 375 | else: 376 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 377 | 378 | tf.logging.info("**** Trainable
Variables ****") 379 | for var in tvars: 380 | init_string = "" 381 | if var.name in initialized_variable_names: 382 | init_string = ", *INIT_FROM_CKPT*" 383 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 384 | init_string) 385 | 386 | output_spec = None 387 | if mode == tf.estimator.ModeKeys.TRAIN: 388 | 389 | train_op = optimization.create_optimizer( 390 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 391 | 392 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 393 | mode=mode, 394 | loss=total_loss, 395 | train_op=train_op, 396 | scaffold_fn=scaffold_fn) 397 | elif mode == tf.estimator.ModeKeys.EVAL: 398 | 399 | def metric_fn(per_example_loss, label_ids, logits): 400 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 401 | accuracy = tf.metrics.accuracy(label_ids, predictions) 402 | loss = tf.metrics.mean(per_example_loss) 403 | return { 404 | "eval_accuracy": accuracy, 405 | "eval_loss": loss, 406 | } 407 | 408 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 409 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 410 | mode=mode, 411 | loss=total_loss, 412 | eval_metrics=eval_metrics, 413 | scaffold_fn=scaffold_fn) 414 | else: 415 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 416 | mode=mode, 417 | predictions=probabilities, 418 | scaffold_fn=scaffold_fn) 419 | return output_spec 420 | 421 | return model_fn 422 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 423 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 424 | 425 | all_input_ids = [] 426 | all_input_mask = [] 427 | all_segment_ids = [] 428 | all_label_ids = [] 429 | 430 | for feature in features: 431 | all_input_ids.append(feature.input_ids) 432 | all_input_mask.append(feature.input_mask) 433 | all_segment_ids.append(feature.segment_ids) 434 | all_label_ids.append(feature.label_id) 435 | 436 | def input_fn(params): 437 | """The actual input function.""" 438 | batch_size = params["batch_size"] 439 | 440 | num_examples = len(features) 441 | 442 | # This is for demo purposes and does NOT scale to large data sets. We do 443 | # not use Dataset.from_generator() because that uses tf.py_func which is 444 | # not TPU compatible. The right way to load data is with TFRecordReader. 
445 | d = tf.data.Dataset.from_tensor_slices({ 446 | "input_ids": 447 | tf.constant( 448 | all_input_ids, shape=[num_examples, seq_length], 449 | dtype=tf.int32), 450 | "input_mask": 451 | tf.constant( 452 | all_input_mask, 453 | shape=[num_examples, seq_length], 454 | dtype=tf.int32), 455 | "segment_ids": 456 | tf.constant( 457 | all_segment_ids, 458 | shape=[num_examples, seq_length], 459 | dtype=tf.int32), 460 | "label_ids": 461 | tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), 462 | }) 463 | 464 | if is_training: 465 | d = d.repeat() 466 | d = d.shuffle(buffer_size=100) 467 | 468 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 469 | return d 470 | 471 | return input_fn 472 | def main(_): 473 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 474 | tf.gfile.MakeDirs(FLAGS.output_dir) 475 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 476 | tpu_cluster_resolver = None 477 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 478 | run_config = tf.contrib.tpu.RunConfig( 479 | cluster=tpu_cluster_resolver, 480 | master=FLAGS.master, 481 | model_dir=FLAGS.output_dir, 482 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 483 | tpu_config=tf.contrib.tpu.TPUConfig( 484 | iterations_per_loop=FLAGS.iterations_per_loop, 485 | num_shards=FLAGS.num_tpu_cores, 486 | per_host_input_for_training=is_per_host)) 487 | 488 | train_examples = None 489 | num_train_steps = None 490 | num_warmup_steps = None 491 | if FLAGS.do_train: 492 | train_examples = read_data(FLAGS.train_dir) 493 | num_train_steps = int( 494 | len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 495 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 496 | 497 | model_fn = model_fn_builder( 498 | bert_config=bert_config, 499 | num_labels=10, 500 | init_checkpoint=FLAGS.init_checkpoint, 501 | learning_rate=FLAGS.learning_rate, 502 | num_train_steps=num_train_steps, 503 | num_warmup_steps=num_warmup_steps, 504 | use_tpu=FLAGS.use_tpu, 505 | use_one_hot_embeddings=FLAGS.use_tpu) 506 | 507 | # If TPU is not available, this will fall back to normal Estimator on CPU 508 | # or GPU. 
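  # num_labels is hard-coded to 10 in the model_fn_builder call above, so the
  # TSV data must use integer labels 0-9 (read_data expects "<label>\t<text>"
  # lines). As a worked example with hypothetical numbers (50,000 training
  # examples, the README's train_batch_size=6 and num_train_epochs=10.0):
  #   num_train_steps  = int(50000 / 6 * 10.0) = 83333
  #   num_warmup_steps = int(83333 * 0.1)      = 8333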
509 | estimator = tf.contrib.tpu.TPUEstimator( 510 | use_tpu=FLAGS.use_tpu, 511 | model_fn=model_fn, 512 | config=run_config, 513 | train_batch_size=FLAGS.train_batch_size, 514 | eval_batch_size=FLAGS.eval_batch_size, 515 | predict_batch_size=FLAGS.predict_batch_size) 516 | 517 | if FLAGS.do_train: 518 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 519 | file_based_convert_examples_to_features( 520 | train_examples, FLAGS.max_seq_length, tokenizer, train_file) 521 | tf.logging.info("***** Running training *****") 522 | tf.logging.info(" Num examples = %d", len(train_examples)) 523 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 524 | tf.logging.info(" Num steps = %d", num_train_steps) 525 | train_input_fn = file_based_input_fn_builder( 526 | input_file=train_file, 527 | seq_length=FLAGS.max_seq_length, 528 | is_training=True, 529 | drop_remainder=True) 530 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 531 | 532 | if FLAGS.do_eval: 533 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 534 | eval_examples = read_data(FLAGS.dev_dir) 535 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 536 | file_based_convert_examples_to_features( 537 | examples=eval_examples, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer, output_file=eval_file) 538 | 539 | tf.logging.info("***** Running evaluation *****") 540 | tf.logging.info(" Num examples = %d", len(eval_examples)) 541 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 542 | 543 | # This tells the estimator to run through the entire set. 544 | eval_steps = None 545 | # However, if running eval on the TPU, you will need to specify the 546 | # number of steps. 547 | if FLAGS.use_tpu: 548 | # Eval will be slightly WRONG on the TPU because it will truncate 549 | # the last batch. 
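      # For example, with a hypothetical 1,003 dev examples and the default
      # eval_batch_size=8, eval_steps = int(1003 / 8) = 125, so the last 3
      # examples are silently skipped on TPU; off TPU, eval_steps stays None
      # and the whole dev set is evaluated.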
550 | eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) 551 | 552 | eval_drop_remainder = True if FLAGS.use_tpu else False 553 | eval_input_fn = file_based_input_fn_builder( 554 | input_file=eval_file, 555 | seq_length=FLAGS.max_seq_length, 556 | is_training=False, 557 | drop_remainder=eval_drop_remainder) 558 | 559 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 560 | 561 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 562 | with tf.gfile.GFile(output_eval_file, "w") as writer: 563 | tf.logging.info("***** Eval results *****") 564 | for key in sorted(result.keys()): 565 | tf.logging.info(" %s = %s", key, str(result[key])) 566 | writer.write("%s = %s\n" % (key, str(result[key]))) 567 | 568 | if FLAGS.do_predict: 569 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 570 | predict_examples = read_data(FLAGS.test_dir) 571 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 572 | file_based_convert_examples_to_features(predict_examples,FLAGS.max_seq_length, tokenizer=tokenizer,output_file=predict_file) 573 | 574 | tf.logging.info("***** Running prediction*****") 575 | tf.logging.info(" Num examples = %d", len(predict_examples)) 576 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 577 | 578 | if FLAGS.use_tpu: 579 | # Warning: According to tpu_estimator.py Prediction on TPU is an 580 | # experimental feature and hence not supported here 581 | raise ValueError("Prediction in TPU not supported") 582 | 583 | predict_drop_remainder = True if FLAGS.use_tpu else False 584 | predict_input_fn = file_based_input_fn_builder( 585 | input_file=predict_file, 586 | seq_length=FLAGS.max_seq_length, 587 | is_training=False, 588 | drop_remainder=predict_drop_remainder) 589 | 590 | result = estimator.predict(input_fn=predict_input_fn) 591 | 592 | output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") 593 | with tf.gfile.GFile(output_predict_file, "w") as writer: 594 | tf.logging.info("***** Predict results *****") 595 | for prediction in result: 596 | output_line = "\t".join(str(pre) for pre in prediction) + "\n" 597 | writer.write(output_line) 598 | 599 | 600 | if __name__ == "__main__": 601 | flags.mark_flag_as_required("train_dir") 602 | flags.mark_flag_as_required("dev_dir") 603 | flags.mark_flag_as_required("test_dir") 604 | flags.mark_flag_as_required("vocab_file") 605 | flags.mark_flag_as_required("bert_config_file") 606 | flags.mark_flag_as_required("output_dir") 607 | tf.app.run() 608 | -------------------------------------------------------------------------------- /tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
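# Overview: the classes below implement BERT's two-stage tokenization.
# BasicTokenizer cleans the text, optionally lower-cases it, puts spaces
# around every CJK character and splits on punctuation; WordpieceTokenizer
# then greedily splits each token into the longest matching vocabulary
# subwords; FullTokenizer chains the two. For Chinese input such as
# "我爱北京" this effectively yields one token per character, e.g.
# ['我', '爱', '北', '京'] (assuming each character is in the vocab,
# otherwise it falls back to [UNK]).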
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_tokens_to_ids(vocab, tokens): 86 | """Converts a sequence of tokens into ids using the vocab.""" 87 | ids = [] 88 | for token in tokens: 89 | ids.append(vocab[token]) 90 | return ids 91 | 92 | 93 | def whitespace_tokenize(text): 94 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 95 | text = text.strip() 96 | if not text: 97 | return [] 98 | tokens = text.split() 99 | return tokens 100 | 101 | 102 | class FullTokenizer(object): 103 | """Runs end-to-end tokenziation.""" 104 | 105 | def __init__(self, vocab_file, do_lower_case=True): 106 | self.vocab = load_vocab(vocab_file) 107 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 108 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 109 | 110 | def tokenize(self, text): 111 | split_tokens = [] 112 | for token in self.basic_tokenizer.tokenize(text): 113 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 114 | split_tokens.append(sub_token) 115 | 116 | return split_tokens 117 | 118 | def convert_tokens_to_ids(self, tokens): 119 | return convert_tokens_to_ids(self.vocab, tokens) 120 | 121 | 122 | class BasicTokenizer(object): 123 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 124 | 125 | def __init__(self, do_lower_case=True): 126 | """Constructs a BasicTokenizer. 127 | 128 | Args: 129 | do_lower_case: Whether to lower case the input. 
130 | """ 131 | self.do_lower_case = do_lower_case 132 | 133 | def tokenize(self, text): 134 | """Tokenizes a piece of text.""" 135 | text = convert_to_unicode(text) 136 | text = self._clean_text(text) 137 | 138 | # This was added on November 1st, 2018 for the multilingual and Chinese 139 | # models. This is also applied to the English models now, but it doesn't 140 | # matter since the English models were not trained on any Chinese data 141 | # and generally don't have any Chinese data in them (there are Chinese 142 | # characters in the vocabulary because Wikipedia does have some Chinese 143 | # words in the English Wikipedia.). 144 | text = self._tokenize_chinese_chars(text) 145 | 146 | orig_tokens = whitespace_tokenize(text) 147 | split_tokens = [] 148 | for token in orig_tokens: 149 | if self.do_lower_case: 150 | token = token.lower() 151 | token = self._run_strip_accents(token) 152 | split_tokens.extend(self._run_split_on_punc(token)) 153 | 154 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 155 | return output_tokens 156 | 157 | def _run_strip_accents(self, text): 158 | """Strips accents from a piece of text.""" 159 | text = unicodedata.normalize("NFD", text) 160 | output = [] 161 | for char in text: 162 | cat = unicodedata.category(char) 163 | if cat == "Mn": 164 | continue 165 | output.append(char) 166 | return "".join(output) 167 | 168 | def _run_split_on_punc(self, text): 169 | """Splits punctuation on a piece of text.""" 170 | chars = list(text) 171 | i = 0 172 | start_new_word = True 173 | output = [] 174 | while i < len(chars): 175 | char = chars[i] 176 | if _is_punctuation(char): 177 | output.append([char]) 178 | start_new_word = True 179 | else: 180 | if start_new_word: 181 | output.append([]) 182 | start_new_word = False 183 | output[-1].append(char) 184 | i += 1 185 | 186 | return ["".join(x) for x in output] 187 | 188 | def _tokenize_chinese_chars(self, text): 189 | """Adds whitespace around any CJK character.""" 190 | output = [] 191 | for char in text: 192 | cp = ord(char) 193 | if self._is_chinese_char(cp): 194 | output.append(" ") 195 | output.append(char) 196 | output.append(" ") 197 | else: 198 | output.append(char) 199 | return "".join(output) 200 | 201 | def _is_chinese_char(self, cp): 202 | """Checks whether CP is the codepoint of a CJK character.""" 203 | # This defines a "chinese character" as anything in the CJK Unicode block: 204 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 205 | # 206 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 207 | # despite its name. The modern Korean Hangul alphabet is a different block, 208 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 209 | # space-separated words, so they are not treated specially and handled 210 | # like the all of the other languages. 
211 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 212 | (cp >= 0x3400 and cp <= 0x4DBF) or # 213 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 214 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 215 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 216 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 217 | (cp >= 0xF900 and cp <= 0xFAFF) or # 218 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 219 | return True 220 | 221 | return False 222 | 223 | def _clean_text(self, text): 224 | """Performs invalid character removal and whitespace cleanup on text.""" 225 | output = [] 226 | for char in text: 227 | cp = ord(char) 228 | if cp == 0 or cp == 0xfffd or _is_control(char): 229 | continue 230 | if _is_whitespace(char): 231 | output.append(" ") 232 | else: 233 | output.append(char) 234 | return "".join(output) 235 | 236 | 237 | class WordpieceTokenizer(object): 238 | """Runs WordPiece tokenziation.""" 239 | 240 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 241 | self.vocab = vocab 242 | self.unk_token = unk_token 243 | self.max_input_chars_per_word = max_input_chars_per_word 244 | 245 | def tokenize(self, text): 246 | """Tokenizes a piece of text into its word pieces. 247 | 248 | This uses a greedy longest-match-first algorithm to perform tokenization 249 | using the given vocabulary. 250 | 251 | For example: 252 | input = "unaffable" 253 | output = ["un", "##aff", "##able"] 254 | 255 | Args: 256 | text: A single token or whitespace separated tokens. This should have 257 | already been passed through `BasicTokenizer. 258 | 259 | Returns: 260 | A list of wordpiece tokens. 261 | """ 262 | 263 | text = convert_to_unicode(text) 264 | 265 | output_tokens = [] 266 | for token in whitespace_tokenize(text): 267 | chars = list(token) 268 | if len(chars) > self.max_input_chars_per_word: 269 | output_tokens.append(self.unk_token) 270 | continue 271 | 272 | is_bad = False 273 | start = 0 274 | sub_tokens = [] 275 | while start < len(chars): 276 | end = len(chars) 277 | cur_substr = None 278 | while start < end: 279 | substr = "".join(chars[start:end]) 280 | if start > 0: 281 | substr = "##" + substr 282 | if substr in self.vocab: 283 | cur_substr = substr 284 | break 285 | end -= 1 286 | if cur_substr is None: 287 | is_bad = True 288 | break 289 | sub_tokens.append(cur_substr) 290 | start = end 291 | 292 | if is_bad: 293 | output_tokens.append(self.unk_token) 294 | else: 295 | output_tokens.extend(sub_tokens) 296 | return output_tokens 297 | 298 | 299 | def _is_whitespace(char): 300 | """Checks whether `chars` is a whitespace character.""" 301 | # \t, \n, and \r are technically contorl characters but we treat them 302 | # as whitespace since they are generally considered as such. 303 | if char == " " or char == "\t" or char == "\n" or char == "\r": 304 | return True 305 | cat = unicodedata.category(char) 306 | if cat == "Zs": 307 | return True 308 | return False 309 | 310 | 311 | def _is_control(char): 312 | """Checks whether `chars` is a control character.""" 313 | # These are technically control characters but we count them as whitespace 314 | # characters. 315 | if char == "\t" or char == "\n" or char == "\r": 316 | return False 317 | cat = unicodedata.category(char) 318 | if cat.startswith("C"): 319 | return True 320 | return False 321 | 322 | 323 | def _is_punctuation(char): 324 | """Checks whether `chars` is a punctuation character.""" 325 | cp = ord(char) 326 | # We treat all non-letter/number ASCII as punctuation. 
327 | # Characters such as "^", "$", and "`" are not in the Unicode 328 | # Punctuation class but we treat them as punctuation anyways, for 329 | # consistency. 330 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 331 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 332 | return True 333 | cat = unicodedata.category(char) 334 | if cat.startswith("P"): 335 | return True 336 | return False 337 | --------------------------------------------------------------------------------