├── .gitignore
├── README.md
├── __init__.py
├── modeling.py
├── optimization.py
├── results.py
├── run_chinese_classification.py
└── tokenization.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bert-chinese classification
2 |
3 | Use Google BERT for multi-class classification of Chinese sentences.
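Each of train.tsv, dev.tsv and test.tsv is read by `read_data()` in `run_chinese_classification.py`, which expects one example per line with no header row: an integer class label, a tab, then the raw sentence. The two rows below are purely illustrative; assuming the label indices follow the order of `target_names` in `results.py`, label 0 is 体育 (sports) and label 9 is 娱乐 (entertainment):

0	国安队主场两球取胜迎来联赛开门红
9	新专辑上线首日播放量突破千万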
4 |
5 | ## Train
6 |
7 | export BERT_BASE_DIR=/path/to/model/chinese_L-12_H-768_A-12
8 |
9 | export DATA_DIR=/path/to/data
10 |
11 | python run_chinese_classification.py \
12 | --do_train=true \
13 | --do_eval=true \
14 | --train_dir=$DATA_DIR/Chinesedata/train.tsv \
15 | --dev_dir=$DATA_DIR/Chinesedata/dev.tsv \
16 | --test_dir=$DATA_DIR/Chinesedata/test.tsv \
17 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
18 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
19 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
20 | --max_seq_length=512 \
21 | --train_batch_size=6 \
22 | --learning_rate=1e-6 \
23 | --num_train_epochs=10.0 \
24 | --output_dir=/path/to/chinese_model
25 |
26 | ## Test
27 |
28 | export BERT_BASE_DIR=/path/to/model/chinese_L-12_H-768_A-12
29 |
30 | export DATA_DIR=/path/to/model/data
31 |
32 | export TRAINED_CLASSIFIER=/path/to/model/chinese_model
33 |
34 |
35 | python run_chinese_classification.py \
36 | --train_dir=$DATA_DIR/Chinesedata/train.tsv \
37 | --dev_dir=$DATA_DIR/Chinesedata/dev.tsv \
38 | --test_dir=$DATA_DIR/Chinesedata/test.tsv \
39 | --do_predict=true \
40 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
41 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
42 | --init_checkpoint=$TRAINED_CLASSIFIER \
43 | --max_seq_length=512 \
44 | --output_dir=/path/to/chinese_result
45 |
46 | ## Results
47 |
48 | python results.py
49 |
50 | ## Data
51 |
52 | url: https://pan.baidu.com/s/1qDngiTq1FyNxb5GX-zDiCg
53 | password: 9dk8
54 |
55 | ## References
56 |
57 | https://github.com/google-research/bert
58 |
59 | https://arxiv.org/abs/1810.04805
60 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The main BERT model and related functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import copy 23 | import json 24 | import math 25 | import re 26 | import six 27 | import tensorflow as tf 28 | 29 | 30 | class BertConfig(object): 31 | """Configuration for `BertModel`.""" 32 | 33 | def __init__(self, 34 | vocab_size, 35 | hidden_size=768, 36 | num_hidden_layers=12, 37 | num_attention_heads=12, 38 | intermediate_size=3072, 39 | hidden_act="gelu", 40 | hidden_dropout_prob=0.1, 41 | attention_probs_dropout_prob=0.1, 42 | max_position_embeddings=512, 43 | type_vocab_size=16, 44 | initializer_range=0.02): 45 | """Constructs BertConfig. 46 | 47 | Args: 48 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 49 | hidden_size: Size of the encoder layers and the pooler layer. 50 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 51 | num_attention_heads: Number of attention heads for each attention layer in 52 | the Transformer encoder. 53 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 54 | layer in the Transformer encoder. 55 | hidden_act: The non-linear activation function (function or string) in the 56 | encoder and pooler. 57 | hidden_dropout_prob: The dropout probability for all fully connected 58 | layers in the embeddings, encoder, and pooler. 59 | attention_probs_dropout_prob: The dropout ratio for the attention 60 | probabilities. 61 | max_position_embeddings: The maximum sequence length that this model might 62 | ever be used with. Typically set this to something large just in case 63 | (e.g., 512 or 1024 or 2048). 64 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 65 | `BertModel`. 66 | initializer_range: The stdev of the truncated_normal_initializer for 67 | initializing all weight matrices. 68 | """ 69 | self.vocab_size = vocab_size 70 | self.hidden_size = hidden_size 71 | self.num_hidden_layers = num_hidden_layers 72 | self.num_attention_heads = num_attention_heads 73 | self.hidden_act = hidden_act 74 | self.intermediate_size = intermediate_size 75 | self.hidden_dropout_prob = hidden_dropout_prob 76 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 77 | self.max_position_embeddings = max_position_embeddings 78 | self.type_vocab_size = type_vocab_size 79 | self.initializer_range = initializer_range 80 | 81 | @classmethod 82 | def from_dict(cls, json_object): 83 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 84 | config = BertConfig(vocab_size=None) 85 | for (key, value) in six.iteritems(json_object): 86 | config.__dict__[key] = value 87 | return config 88 | 89 | @classmethod 90 | def from_json_file(cls, json_file): 91 | """Constructs a `BertConfig` from a json file of parameters.""" 92 | with tf.gfile.GFile(json_file, "r") as reader: 93 | text = reader.read() 94 | return cls.from_dict(json.loads(text)) 95 | 96 | def to_dict(self): 97 | """Serializes this instance to a Python dictionary.""" 98 | output = copy.deepcopy(self.__dict__) 99 | return output 100 | 101 | def to_json_string(self): 102 | """Serializes this instance to a JSON string.""" 103 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 104 | 105 | 106 | class BertModel(object): 107 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 
108 | 109 | Example usage: 110 | 111 | ```python 112 | # Already been converted into WordPiece token ids 113 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 114 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 115 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 116 | 117 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 118 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 119 | 120 | model = modeling.BertModel(config=config, is_training=True, 121 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 122 | 123 | label_embeddings = tf.get_variable(...) 124 | pooled_output = model.get_pooled_output() 125 | logits = tf.matmul(pooled_output, label_embeddings) 126 | ... 127 | ``` 128 | """ 129 | 130 | def __init__(self, 131 | config, 132 | is_training, 133 | input_ids, 134 | input_mask=None, 135 | token_type_ids=None, 136 | use_one_hot_embeddings=True, 137 | scope=None): 138 | """Constructor for BertModel. 139 | 140 | Args: 141 | config: `BertConfig` instance. 142 | is_training: bool. true for training model, false for eval model. Controls 143 | whether dropout will be applied. 144 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 145 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 146 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 147 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 148 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 149 | it is must faster if this is True, on the CPU or GPU, it is faster if 150 | this is False. 151 | scope: (optional) variable scope. Defaults to "bert". 152 | 153 | Raises: 154 | ValueError: The config is invalid or one of the input tensor shapes 155 | is invalid. 156 | """ 157 | config = copy.deepcopy(config) 158 | if not is_training: 159 | config.hidden_dropout_prob = 0.0 160 | config.attention_probs_dropout_prob = 0.0 161 | 162 | input_shape = get_shape_list(input_ids, expected_rank=2) 163 | batch_size = input_shape[0] 164 | seq_length = input_shape[1] 165 | 166 | if input_mask is None: 167 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 168 | 169 | if token_type_ids is None: 170 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 171 | 172 | with tf.variable_scope(scope, default_name="bert"): 173 | with tf.variable_scope("embeddings"): 174 | # Perform embedding lookup on the word ids. 175 | (self.embedding_output, self.embedding_table) = embedding_lookup( 176 | input_ids=input_ids, 177 | vocab_size=config.vocab_size, 178 | embedding_size=config.hidden_size, 179 | initializer_range=config.initializer_range, 180 | word_embedding_name="word_embeddings", 181 | use_one_hot_embeddings=use_one_hot_embeddings) 182 | 183 | # Add positional embeddings and token type embeddings, then layer 184 | # normalize and perform dropout. 
185 | self.embedding_output = embedding_postprocessor( 186 | input_tensor=self.embedding_output, 187 | use_token_type=True, 188 | token_type_ids=token_type_ids, 189 | token_type_vocab_size=config.type_vocab_size, 190 | token_type_embedding_name="token_type_embeddings", 191 | use_position_embeddings=True, 192 | position_embedding_name="position_embeddings", 193 | initializer_range=config.initializer_range, 194 | max_position_embeddings=config.max_position_embeddings, 195 | dropout_prob=config.hidden_dropout_prob) 196 | 197 | with tf.variable_scope("encoder"): 198 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 199 | # mask of shape [batch_size, seq_length, seq_length] which is used 200 | # for the attention scores. 201 | attention_mask = create_attention_mask_from_input_mask( 202 | input_ids, input_mask) 203 | 204 | # Run the stacked transformer. 205 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 206 | self.all_encoder_layers = transformer_model( 207 | input_tensor=self.embedding_output, 208 | attention_mask=attention_mask, 209 | hidden_size=config.hidden_size, 210 | num_hidden_layers=config.num_hidden_layers, 211 | num_attention_heads=config.num_attention_heads, 212 | intermediate_size=config.intermediate_size, 213 | intermediate_act_fn=get_activation(config.hidden_act), 214 | hidden_dropout_prob=config.hidden_dropout_prob, 215 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 216 | initializer_range=config.initializer_range, 217 | do_return_all_layers=True) 218 | 219 | self.sequence_output = self.all_encoder_layers[-1] 220 | # The "pooler" converts the encoded sequence tensor of shape 221 | # [batch_size, seq_length, hidden_size] to a tensor of shape 222 | # [batch_size, hidden_size]. This is necessary for segment-level 223 | # (or segment-pair-level) classification tasks where we need a fixed 224 | # dimensional representation of the segment. 225 | with tf.variable_scope("pooler"): 226 | # We "pool" the model by simply taking the hidden state corresponding 227 | # to the first token. We assume that this has been pre-trained 228 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 229 | self.pooled_output = tf.layers.dense( 230 | first_token_tensor, 231 | config.hidden_size, 232 | activation=tf.tanh, 233 | kernel_initializer=create_initializer(config.initializer_range)) 234 | 235 | def get_pooled_output(self): 236 | return self.pooled_output 237 | 238 | def get_sequence_output(self): 239 | """Gets final hidden layer of encoder. 240 | 241 | Returns: 242 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 243 | to the final hidden of the transformer encoder. 244 | """ 245 | return self.sequence_output 246 | 247 | def get_all_encoder_layers(self): 248 | return self.all_encoder_layers 249 | 250 | def get_embedding_output(self): 251 | """Gets output of the embedding lookup (i.e., input to the transformer). 252 | 253 | Returns: 254 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 255 | to the output of the embedding layer, after summing the word 256 | embeddings with the positional embeddings and the token type embeddings, 257 | then performing layer normalization. This is the input to the transformer. 258 | """ 259 | return self.embedding_output 260 | 261 | def get_embedding_table(self): 262 | return self.embedding_table 263 | 264 | 265 | def gelu(input_tensor): 266 | """Gaussian Error Linear Unit. 267 | 268 | This is a smoother version of the RELU. 
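Concretely it computes x * Φ(x) = 0.5 * x * (1.0 + erf(x / sqrt(2.0))), which is exactly what the function body below evaluates.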
269 | Original paper: https://arxiv.org/abs/1606.08415 270 | 271 | Args: 272 | input_tensor: float Tensor to perform activation. 273 | 274 | Returns: 275 | `input_tensor` with the GELU activation applied. 276 | """ 277 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 278 | return input_tensor * cdf 279 | 280 | 281 | def get_activation(activation_string): 282 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 283 | 284 | Args: 285 | activation_string: String name of the activation function. 286 | 287 | Returns: 288 | A Python function corresponding to the activation function. If 289 | `activation_string` is None, empty, or "linear", this will return None. 290 | If `activation_string` is not a string, it will return `activation_string`. 291 | 292 | Raises: 293 | ValueError: The `activation_string` does not correspond to a known 294 | activation. 295 | """ 296 | 297 | # We assume that anything that"s not a string is already an activation 298 | # function, so we just return it. 299 | if not isinstance(activation_string, six.string_types): 300 | return activation_string 301 | 302 | if not activation_string: 303 | return None 304 | 305 | act = activation_string.lower() 306 | if act == "linear": 307 | return None 308 | elif act == "relu": 309 | return tf.nn.relu 310 | elif act == "gelu": 311 | return gelu 312 | elif act == "tanh": 313 | return tf.tanh 314 | else: 315 | raise ValueError("Unsupported activation: %s" % act) 316 | 317 | 318 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 319 | """Compute the union of the current variables and checkpoint variables.""" 320 | assignment_map = {} 321 | initialized_variable_names = {} 322 | 323 | name_to_variable = collections.OrderedDict() 324 | for var in tvars: 325 | name = var.name 326 | m = re.match("^(.*):\\d+$", name) 327 | if m is not None: 328 | name = m.group(1) 329 | name_to_variable[name] = var 330 | 331 | init_vars = tf.train.list_variables(init_checkpoint) 332 | 333 | assignment_map = collections.OrderedDict() 334 | for x in init_vars: 335 | (name, var) = (x[0], x[1]) 336 | if name not in name_to_variable: 337 | continue 338 | assignment_map[name] = name 339 | initialized_variable_names[name] = 1 340 | initialized_variable_names[name + ":0"] = 1 341 | 342 | return (assignment_map, initialized_variable_names) 343 | 344 | 345 | def dropout(input_tensor, dropout_prob): 346 | """Perform dropout. 347 | 348 | Args: 349 | input_tensor: float Tensor. 350 | dropout_prob: Python float. The probability of dropping out a value (NOT of 351 | *keeping* a dimension as in `tf.nn.dropout`). 352 | 353 | Returns: 354 | A version of `input_tensor` with dropout applied. 
355 | """ 356 | if dropout_prob is None or dropout_prob == 0.0: 357 | return input_tensor 358 | 359 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 360 | return output 361 | 362 | 363 | def layer_norm(input_tensor, name=None): 364 | """Run layer normalization on the last dimension of the tensor.""" 365 | return tf.contrib.layers.layer_norm( 366 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 367 | 368 | 369 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 370 | """Runs layer normalization followed by dropout.""" 371 | output_tensor = layer_norm(input_tensor, name) 372 | output_tensor = dropout(output_tensor, dropout_prob) 373 | return output_tensor 374 | 375 | 376 | def create_initializer(initializer_range=0.02): 377 | """Creates a `truncated_normal_initializer` with the given range.""" 378 | return tf.truncated_normal_initializer(stddev=initializer_range) 379 | 380 | 381 | def embedding_lookup(input_ids, 382 | vocab_size, 383 | embedding_size=128, 384 | initializer_range=0.02, 385 | word_embedding_name="word_embeddings", 386 | use_one_hot_embeddings=False): 387 | """Looks up words embeddings for id tensor. 388 | 389 | Args: 390 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 391 | ids. 392 | vocab_size: int. Size of the embedding vocabulary. 393 | embedding_size: int. Width of the word embeddings. 394 | initializer_range: float. Embedding initialization range. 395 | word_embedding_name: string. Name of the embedding table. 396 | use_one_hot_embeddings: bool. If True, use one-hot method for word 397 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 398 | for TPUs. 399 | 400 | Returns: 401 | float Tensor of shape [batch_size, seq_length, embedding_size]. 402 | """ 403 | # This function assumes that the input is of shape [batch_size, seq_length, 404 | # num_inputs]. 405 | # 406 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 407 | # reshape to [batch_size, seq_length, 1]. 408 | if input_ids.shape.ndims == 2: 409 | input_ids = tf.expand_dims(input_ids, axis=[-1]) 410 | 411 | embedding_table = tf.get_variable( 412 | name=word_embedding_name, 413 | shape=[vocab_size, embedding_size], 414 | initializer=create_initializer(initializer_range)) 415 | 416 | if use_one_hot_embeddings: 417 | flat_input_ids = tf.reshape(input_ids, [-1]) 418 | one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) 419 | output = tf.matmul(one_hot_input_ids, embedding_table) 420 | else: 421 | output = tf.nn.embedding_lookup(embedding_table, input_ids) 422 | 423 | input_shape = get_shape_list(input_ids) 424 | 425 | output = tf.reshape(output, 426 | input_shape[0:-1] + [input_shape[-1] * embedding_size]) 427 | return (output, embedding_table) 428 | 429 | 430 | def embedding_postprocessor(input_tensor, 431 | use_token_type=False, 432 | token_type_ids=None, 433 | token_type_vocab_size=16, 434 | token_type_embedding_name="token_type_embeddings", 435 | use_position_embeddings=True, 436 | position_embedding_name="position_embeddings", 437 | initializer_range=0.02, 438 | max_position_embeddings=512, 439 | dropout_prob=0.1): 440 | """Performs various post-processing on a word embedding tensor. 441 | 442 | Args: 443 | input_tensor: float Tensor of shape [batch_size, seq_length, 444 | embedding_size]. 445 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 446 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 
447 | Must be specified if `use_token_type` is True. 448 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 449 | token_type_embedding_name: string. The name of the embedding table variable 450 | for token type ids. 451 | use_position_embeddings: bool. Whether to add position embeddings for the 452 | position of each token in the sequence. 453 | position_embedding_name: string. The name of the embedding table variable 454 | for positional embeddings. 455 | initializer_range: float. Range of the weight initialization. 456 | max_position_embeddings: int. Maximum sequence length that might ever be 457 | used with this model. This can be longer than the sequence length of 458 | input_tensor, but cannot be shorter. 459 | dropout_prob: float. Dropout probability applied to the final output tensor. 460 | 461 | Returns: 462 | float tensor with same shape as `input_tensor`. 463 | 464 | Raises: 465 | ValueError: One of the tensor shapes or input values is invalid. 466 | """ 467 | input_shape = get_shape_list(input_tensor, expected_rank=3) 468 | batch_size = input_shape[0] 469 | seq_length = input_shape[1] 470 | width = input_shape[2] 471 | 472 | if seq_length > max_position_embeddings: 473 | raise ValueError("The seq length (%d) cannot be greater than " 474 | "`max_position_embeddings` (%d)" % 475 | (seq_length, max_position_embeddings)) 476 | 477 | output = input_tensor 478 | 479 | if use_token_type: 480 | if token_type_ids is None: 481 | raise ValueError("`token_type_ids` must be specified if" 482 | "`use_token_type` is True.") 483 | token_type_table = tf.get_variable( 484 | name=token_type_embedding_name, 485 | shape=[token_type_vocab_size, width], 486 | initializer=create_initializer(initializer_range)) 487 | # This vocab will be small so we always do one-hot here, since it is always 488 | # faster for a small vocabulary. 489 | flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 490 | one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) 491 | token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) 492 | token_type_embeddings = tf.reshape(token_type_embeddings, 493 | [batch_size, seq_length, width]) 494 | output += token_type_embeddings 495 | 496 | if use_position_embeddings: 497 | full_position_embeddings = tf.get_variable( 498 | name=position_embedding_name, 499 | shape=[max_position_embeddings, width], 500 | initializer=create_initializer(initializer_range)) 501 | # Since the position embedding table is a learned variable, we create it 502 | # using a (long) sequence length `max_position_embeddings`. The actual 503 | # sequence length might be shorter than this, for faster training of 504 | # tasks that do not have long sequences. 505 | # 506 | # So `full_position_embeddings` is effectively an embedding table 507 | # for position [0, 1, 2, ..., max_position_embeddings-1], and the current 508 | # sequence has positions [0, 1, 2, ... seq_length-1], so we can just 509 | # perform a slice. 510 | if seq_length < max_position_embeddings: 511 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], 512 | [seq_length, -1]) 513 | else: 514 | position_embeddings = full_position_embeddings 515 | 516 | num_dims = len(output.shape.as_list()) 517 | 518 | # Only the last two dimensions are relevant (`seq_length` and `width`), so 519 | # we broadcast among the first dimensions, which is typically just 520 | # the batch size. 
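# For example, if `output` has shape [batch_size, seq_length, width] = [8, 128, 768],
# `position_embeddings` is reshaped below to [1, 128, 768] and broadcast-added to `output`.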
521 | position_broadcast_shape = [] 522 | for _ in range(num_dims - 2): 523 | position_broadcast_shape.append(1) 524 | position_broadcast_shape.extend([seq_length, width]) 525 | position_embeddings = tf.reshape(position_embeddings, 526 | position_broadcast_shape) 527 | output += position_embeddings 528 | 529 | output = layer_norm_and_dropout(output, dropout_prob) 530 | return output 531 | 532 | 533 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 534 | """Create 3D attention mask from a 2D tensor mask. 535 | 536 | Args: 537 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 538 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 539 | 540 | Returns: 541 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 542 | """ 543 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 544 | batch_size = from_shape[0] 545 | from_seq_length = from_shape[1] 546 | 547 | to_shape = get_shape_list(to_mask, expected_rank=2) 548 | to_seq_length = to_shape[1] 549 | 550 | to_mask = tf.cast( 551 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 552 | 553 | # We don't assume that `from_tensor` is a mask (although it could be). We 554 | # don't actually care if we attend *from* padding tokens (only *to* padding) 555 | # tokens so we create a tensor of all ones. 556 | # 557 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 558 | broadcast_ones = tf.ones( 559 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 560 | 561 | # Here we broadcast along two dimensions to create the mask. 562 | mask = broadcast_ones * to_mask 563 | 564 | return mask 565 | 566 | 567 | def attention_layer(from_tensor, 568 | to_tensor, 569 | attention_mask=None, 570 | num_attention_heads=1, 571 | size_per_head=512, 572 | query_act=None, 573 | key_act=None, 574 | value_act=None, 575 | attention_probs_dropout_prob=0.0, 576 | initializer_range=0.02, 577 | do_return_2d_tensor=False, 578 | batch_size=None, 579 | from_seq_length=None, 580 | to_seq_length=None): 581 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 582 | 583 | This is an implementation of multi-headed attention based on "Attention 584 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 585 | this is self-attention. Each timestep in `from_tensor` attends to the 586 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 587 | 588 | This function first projects `from_tensor` into a "query" tensor and 589 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 590 | of tensors of length `num_attention_heads`, where each tensor is of shape 591 | [batch_size, seq_length, size_per_head]. 592 | 593 | Then, the query and key tensors are dot-producted and scaled. These are 594 | softmaxed to obtain attention probabilities. The value tensors are then 595 | interpolated by these probabilities, then concatenated back to a single 596 | tensor and returned. 597 | 598 | In practice, the multi-headed attention are done with transposes and 599 | reshapes rather than actual separate tensors. 600 | 601 | Args: 602 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 603 | from_width]. 604 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 605 | attention_mask: (optional) int32 Tensor of shape [batch_size, 606 | from_seq_length, to_seq_length]. The values should be 1 or 0. 
The 607 | attention scores will effectively be set to -infinity for any positions in 608 | the mask that are 0, and will be unchanged for positions that are 1. 609 | num_attention_heads: int. Number of attention heads. 610 | size_per_head: int. Size of each attention head. 611 | query_act: (optional) Activation function for the query transform. 612 | key_act: (optional) Activation function for the key transform. 613 | value_act: (optional) Activation function for the value transform. 614 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 615 | attention probabilities. 616 | initializer_range: float. Range of the weight initializer. 617 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 618 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 619 | output will be of shape [batch_size, from_seq_length, num_attention_heads 620 | * size_per_head]. 621 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 622 | of the 3D version of the `from_tensor` and `to_tensor`. 623 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 624 | of the 3D version of the `from_tensor`. 625 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 626 | of the 3D version of the `to_tensor`. 627 | 628 | Returns: 629 | float Tensor of shape [batch_size, from_seq_length, 630 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 631 | true, this will be of shape [batch_size * from_seq_length, 632 | num_attention_heads * size_per_head]). 633 | 634 | Raises: 635 | ValueError: Any of the arguments or tensor shapes are invalid. 636 | """ 637 | 638 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 639 | seq_length, width): 640 | output_tensor = tf.reshape( 641 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 642 | 643 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 644 | return output_tensor 645 | 646 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 647 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 648 | 649 | if len(from_shape) != len(to_shape): 650 | raise ValueError( 651 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 652 | 653 | if len(from_shape) == 3: 654 | batch_size = from_shape[0] 655 | from_seq_length = from_shape[1] 656 | to_seq_length = to_shape[1] 657 | elif len(from_shape) == 2: 658 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 659 | raise ValueError( 660 | "When passing in rank 2 tensors to attention_layer, the values " 661 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 662 | "must all be specified.") 663 | 664 | # Scalar dimensions referenced here: 665 | # B = batch size (number of sequences) 666 | # F = `from_tensor` sequence length 667 | # T = `to_tensor` sequence length 668 | # N = `num_attention_heads` 669 | # H = `size_per_head` 670 | 671 | from_tensor_2d = reshape_to_matrix(from_tensor) 672 | to_tensor_2d = reshape_to_matrix(to_tensor) 673 | 674 | # `query_layer` = [B*F, N*H] 675 | query_layer = tf.layers.dense( 676 | from_tensor_2d, 677 | num_attention_heads * size_per_head, 678 | activation=query_act, 679 | name="query", 680 | kernel_initializer=create_initializer(initializer_range)) 681 | 682 | # `key_layer` = [B*T, N*H] 683 | key_layer = tf.layers.dense( 684 | to_tensor_2d, 685 | num_attention_heads * size_per_head, 686 | activation=key_act, 687 | name="key", 688 | 
kernel_initializer=create_initializer(initializer_range)) 689 | 690 | # `value_layer` = [B*T, N*H] 691 | value_layer = tf.layers.dense( 692 | to_tensor_2d, 693 | num_attention_heads * size_per_head, 694 | activation=value_act, 695 | name="value", 696 | kernel_initializer=create_initializer(initializer_range)) 697 | 698 | # `query_layer` = [B, N, F, H] 699 | query_layer = transpose_for_scores(query_layer, batch_size, 700 | num_attention_heads, from_seq_length, 701 | size_per_head) 702 | 703 | # `key_layer` = [B, N, T, H] 704 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 705 | to_seq_length, size_per_head) 706 | 707 | # Take the dot product between "query" and "key" to get the raw 708 | # attention scores. 709 | # `attention_scores` = [B, N, F, T] 710 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 711 | attention_scores = tf.multiply(attention_scores, 712 | 1.0 / math.sqrt(float(size_per_head))) 713 | 714 | if attention_mask is not None: 715 | # `attention_mask` = [B, 1, F, T] 716 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 717 | 718 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 719 | # masked positions, this operation will create a tensor which is 0.0 for 720 | # positions we want to attend and -10000.0 for masked positions. 721 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 722 | 723 | # Since we are adding it to the raw scores before the softmax, this is 724 | # effectively the same as removing these entirely. 725 | attention_scores += adder 726 | 727 | # Normalize the attention scores to probabilities. 728 | # `attention_probs` = [B, N, F, T] 729 | attention_probs = tf.nn.softmax(attention_scores) 730 | 731 | # This is actually dropping out entire tokens to attend to, which might 732 | # seem a bit unusual, but is taken from the original Transformer paper. 733 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 734 | 735 | # `value_layer` = [B, T, N, H] 736 | value_layer = tf.reshape( 737 | value_layer, 738 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 739 | 740 | # `value_layer` = [B, N, T, H] 741 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 742 | 743 | # `context_layer` = [B, N, F, H] 744 | context_layer = tf.matmul(attention_probs, value_layer) 745 | 746 | # `context_layer` = [B, F, N, H] 747 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 748 | 749 | if do_return_2d_tensor: 750 | # `context_layer` = [B*F, N*V] 751 | context_layer = tf.reshape( 752 | context_layer, 753 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 754 | else: 755 | # `context_layer` = [B, F, N*V] 756 | context_layer = tf.reshape( 757 | context_layer, 758 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 759 | 760 | return context_layer 761 | 762 | 763 | def transformer_model(input_tensor, 764 | attention_mask=None, 765 | hidden_size=768, 766 | num_hidden_layers=12, 767 | num_attention_heads=12, 768 | intermediate_size=3072, 769 | intermediate_act_fn=gelu, 770 | hidden_dropout_prob=0.1, 771 | attention_probs_dropout_prob=0.1, 772 | initializer_range=0.02, 773 | do_return_all_layers=False): 774 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 775 | 776 | This is almost an exact implementation of the original Transformer encoder. 
777 | 778 | See the original paper: 779 | https://arxiv.org/abs/1706.03762 780 | 781 | Also see: 782 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 783 | 784 | Args: 785 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 786 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 787 | seq_length], with 1 for positions that can be attended to and 0 in 788 | positions that should not be. 789 | hidden_size: int. Hidden size of the Transformer. 790 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 791 | num_attention_heads: int. Number of attention heads in the Transformer. 792 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 793 | forward) layer. 794 | intermediate_act_fn: function. The non-linear activation function to apply 795 | to the output of the intermediate/feed-forward layer. 796 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 797 | attention_probs_dropout_prob: float. Dropout probability of the attention 798 | probabilities. 799 | initializer_range: float. Range of the initializer (stddev of truncated 800 | normal). 801 | do_return_all_layers: Whether to also return all layers or just the final 802 | layer. 803 | 804 | Returns: 805 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 806 | hidden layer of the Transformer. 807 | 808 | Raises: 809 | ValueError: A Tensor shape or parameter is invalid. 810 | """ 811 | if hidden_size % num_attention_heads != 0: 812 | raise ValueError( 813 | "The hidden size (%d) is not a multiple of the number of attention " 814 | "heads (%d)" % (hidden_size, num_attention_heads)) 815 | 816 | attention_head_size = int(hidden_size / num_attention_heads) 817 | input_shape = get_shape_list(input_tensor, expected_rank=3) 818 | batch_size = input_shape[0] 819 | seq_length = input_shape[1] 820 | input_width = input_shape[2] 821 | 822 | # The Transformer performs sum residuals on all layers so the input needs 823 | # to be the same as the hidden size. 824 | if input_width != hidden_size: 825 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 826 | (input_width, hidden_size)) 827 | 828 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 829 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 830 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 831 | # help the optimizer. 
832 | prev_output = reshape_to_matrix(input_tensor) 833 | 834 | all_layer_outputs = [] 835 | for layer_idx in range(num_hidden_layers): 836 | with tf.variable_scope("layer_%d" % layer_idx): 837 | layer_input = prev_output 838 | 839 | with tf.variable_scope("attention"): 840 | attention_heads = [] 841 | with tf.variable_scope("self"): 842 | attention_head = attention_layer( 843 | from_tensor=layer_input, 844 | to_tensor=layer_input, 845 | attention_mask=attention_mask, 846 | num_attention_heads=num_attention_heads, 847 | size_per_head=attention_head_size, 848 | attention_probs_dropout_prob=attention_probs_dropout_prob, 849 | initializer_range=initializer_range, 850 | do_return_2d_tensor=True, 851 | batch_size=batch_size, 852 | from_seq_length=seq_length, 853 | to_seq_length=seq_length) 854 | attention_heads.append(attention_head) 855 | 856 | attention_output = None 857 | if len(attention_heads) == 1: 858 | attention_output = attention_heads[0] 859 | else: 860 | # In the case where we have other sequences, we just concatenate 861 | # them to the self-attention head before the projection. 862 | attention_output = tf.concat(attention_heads, axis=-1) 863 | 864 | # Run a linear projection of `hidden_size` then add a residual 865 | # with `layer_input`. 866 | with tf.variable_scope("output"): 867 | attention_output = tf.layers.dense( 868 | attention_output, 869 | hidden_size, 870 | kernel_initializer=create_initializer(initializer_range)) 871 | attention_output = dropout(attention_output, hidden_dropout_prob) 872 | attention_output = layer_norm(attention_output + layer_input) 873 | 874 | # The activation is only applied to the "intermediate" hidden layer. 875 | with tf.variable_scope("intermediate"): 876 | intermediate_output = tf.layers.dense( 877 | attention_output, 878 | intermediate_size, 879 | activation=intermediate_act_fn, 880 | kernel_initializer=create_initializer(initializer_range)) 881 | 882 | # Down-project back to `hidden_size` then add the residual. 883 | with tf.variable_scope("output"): 884 | layer_output = tf.layers.dense( 885 | intermediate_output, 886 | hidden_size, 887 | kernel_initializer=create_initializer(initializer_range)) 888 | layer_output = dropout(layer_output, hidden_dropout_prob) 889 | layer_output = layer_norm(layer_output + attention_output) 890 | prev_output = layer_output 891 | all_layer_outputs.append(layer_output) 892 | 893 | if do_return_all_layers: 894 | final_outputs = [] 895 | for layer_output in all_layer_outputs: 896 | final_output = reshape_from_matrix(layer_output, input_shape) 897 | final_outputs.append(final_output) 898 | return final_outputs 899 | else: 900 | final_output = reshape_from_matrix(prev_output, input_shape) 901 | return final_output 902 | 903 | 904 | def get_shape_list(tensor, expected_rank=None, name=None): 905 | """Returns a list of the shape of tensor, preferring static dimensions. 906 | 907 | Args: 908 | tensor: A tf.Tensor object to find the shape of. 909 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 910 | specified and the `tensor` has a different rank, and exception will be 911 | thrown. 912 | name: Optional name of the tensor for the error message. 913 | 914 | Returns: 915 | A list of dimensions of the shape of tensor. All static dimensions will 916 | be returned as python integers, and dynamic dimensions will be returned 917 | as tf.Tensor scalars. 
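For example, a tensor with static shape [None, 128] yields [<scalar int32 Tensor>, 128].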
918 | """ 919 | if name is None: 920 | name = tensor.name 921 | 922 | if expected_rank is not None: 923 | assert_rank(tensor, expected_rank, name) 924 | 925 | shape = tensor.shape.as_list() 926 | 927 | non_static_indexes = [] 928 | for (index, dim) in enumerate(shape): 929 | if dim is None: 930 | non_static_indexes.append(index) 931 | 932 | if not non_static_indexes: 933 | return shape 934 | 935 | dyn_shape = tf.shape(tensor) 936 | for index in non_static_indexes: 937 | shape[index] = dyn_shape[index] 938 | return shape 939 | 940 | 941 | def reshape_to_matrix(input_tensor): 942 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 943 | ndims = input_tensor.shape.ndims 944 | if ndims < 2: 945 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 946 | (input_tensor.shape)) 947 | if ndims == 2: 948 | return input_tensor 949 | 950 | width = input_tensor.shape[-1] 951 | output_tensor = tf.reshape(input_tensor, [-1, width]) 952 | return output_tensor 953 | 954 | 955 | def reshape_from_matrix(output_tensor, orig_shape_list): 956 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 957 | if len(orig_shape_list) == 2: 958 | return output_tensor 959 | 960 | output_shape = get_shape_list(output_tensor) 961 | 962 | orig_dims = orig_shape_list[0:-1] 963 | width = output_shape[-1] 964 | 965 | return tf.reshape(output_tensor, orig_dims + [width]) 966 | 967 | 968 | def assert_rank(tensor, expected_rank, name=None): 969 | """Raises an exception if the tensor rank is not of the expected rank. 970 | 971 | Args: 972 | tensor: A tf.Tensor to check the rank of. 973 | expected_rank: Python integer or list of integers, expected rank. 974 | name: Optional name of the tensor for the error message. 975 | 976 | Raises: 977 | ValueError: If the expected shape doesn't match the actual shape. 978 | """ 979 | if name is None: 980 | name = tensor.name 981 | 982 | expected_rank_dict = {} 983 | if isinstance(expected_rank, six.integer_types): 984 | expected_rank_dict[expected_rank] = True 985 | else: 986 | for x in expected_rank: 987 | expected_rank_dict[x] = True 988 | 989 | actual_rank = tensor.shape.ndims 990 | if actual_rank not in expected_rank_dict: 991 | scope_name = tf.get_variable_scope().name 992 | raise ValueError( 993 | "For the tensor `%s` in scope `%s`, the actual rank " 994 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 995 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 996 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
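# Concretely, the update applied below is m / (sqrt(v) + epsilon) + weight_decay_rate * param,
# scaled by the learning rate and subtracted from `param`; the decay term never touches m or v.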
143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | data_dir1="./test_results.tsv" 3 | data_dir="./test.tsv" 4 | with open(file=data_dir,mode="r",encoding="utf-8") as f: 5 | text=f.readlines() 6 | y_true=[] 7 | for t in text: 8 | if t.split("\t")!=0 and t!="\n": 9 | y_true.append(int(t.split("\t")[0])) 10 | 11 | with open(file=data_dir1,mode="r",encoding="utf-8") as f: 12 | result=f.readlines() 13 | y_pred=[] 14 | for l in result: 15 | l=list(map(float,l.split("\t"))) 16 | y_pred.append(np.argmax(l)) 17 | 18 | from sklearn import metrics 19 | # 混淆矩阵 20 | print("Confusion Matrix...") 21 | cm = metrics.confusion_matrix(y_true, y_pred) 22 | from sklearn.metrics import classification_report 23 | target_names = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'] 24 | print(classification_report(y_true, y_pred, target_names=target_names)) -------------------------------------------------------------------------------- /run_chinese_classification.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import collections 8 | import csv 9 | import os 10 | import modeling 11 | import optimization 12 | import tokenization 13 | import tensorflow as tf 14 | flags = tf.flags 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | ## Required parameters 19 | flags.DEFINE_string( 20 | "train_dir", None, 21 | "The input train data dir. Should contain the .tsv files (or other data files) " 22 | "for the task.") 23 | flags.DEFINE_string( 24 | "dev_dir", None, 25 | "The input dev data dir. Should contain the .tsv files (or other data files) " 26 | "for the task.") 27 | flags.DEFINE_string( 28 | "test_dir", None, 29 | "The input test dir. Should contain the .tsv files (or other data files) " 30 | "for the task.") 31 | 32 | flags.DEFINE_string( 33 | "bert_config_file", None, 34 | "The config json file corresponding to the pre-trained BERT model. 
" 35 | "This specifies the model architecture.") 36 | 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_string( 42 | "output_dir", None, 43 | "The output directory where the model checkpoints will be written.") 44 | 45 | ## Other parameters 46 | 47 | flags.DEFINE_string( 48 | "init_checkpoint", None, 49 | "Initial checkpoint (usually from a pre-trained BERT model).") 50 | 51 | flags.DEFINE_bool( 52 | "do_lower_case", True, 53 | "Whether to lower case the input text. Should be True for uncased " 54 | "models and False for cased models.") 55 | 56 | flags.DEFINE_integer( 57 | "max_seq_length", 256, 58 | "The maximum total input sequence length after WordPiece tokenization. " 59 | "Sequences longer than this will be truncated, and sequences shorter " 60 | "than this will be padded.") 61 | 62 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 63 | 64 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 65 | 66 | flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") 67 | 68 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 69 | 70 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 71 | 72 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 73 | 74 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 75 | 76 | flags.DEFINE_float("num_train_epochs", 3.0, 77 | "Total number of training epochs to perform.") 78 | 79 | flags.DEFINE_float( 80 | "warmup_proportion", 0.1, 81 | "Proportion of training to perform linear learning rate warmup for. " 82 | "E.g., 0.1 = 10% of training.") 83 | 84 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 85 | "How often to save the model checkpoint.") 86 | 87 | flags.DEFINE_integer("iterations_per_loop", 1000, 88 | "How many steps to make in each estimator call.") 89 | 90 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 91 | 92 | tf.flags.DEFINE_string( 93 | "tpu_name", None, 94 | "The Cloud TPU to use for training. This should be either the name " 95 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 96 | "url.") 97 | 98 | tf.flags.DEFINE_string( 99 | "tpu_zone", None, 100 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 101 | "specified, we will attempt to automatically detect the GCE project from " 102 | "metadata.") 103 | 104 | tf.flags.DEFINE_string( 105 | "gcp_project", None, 106 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 107 | "specified, we will attempt to automatically detect the GCE project from " 108 | "metadata.") 109 | 110 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 111 | 112 | flags.DEFINE_integer( 113 | "num_tpu_cores", 8, 114 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 115 | 116 | class InputExample(object): 117 | """A single training/test example for simple sequence classification.""" 118 | 119 | def __init__(self, guid, text_a, text_b=None, label=None): 120 | """Constructs a InputExample. 121 | 122 | Args: 123 | guid: Unique id for the example. 124 | text_a: string. The untokenized text of the first sequence. For single 125 | sequence tasks, only this sequence must be specified. 126 | text_b: (Optional) string. The untokenized text of the second sequence. 127 | Only must be specified for sequence pair tasks. 
128 | label: (Optional) string. The label of the example. This should be 129 | specified for train and dev examples, but not for test examples. 130 | """ 131 | self.guid = guid 132 | self.text_a = text_a 133 | self.text_b = text_b 134 | self.label = label 135 | class InputFeatures(object): 136 | """A single set of features of data.""" 137 | 138 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 139 | self.input_ids = input_ids 140 | self.input_mask = input_mask 141 | self.segment_ids = segment_ids 142 | self.label_id = label_id 143 | 144 | def read_data(input_file): 145 | examples=[] 146 | with open(file=input_file,mode= "r",encoding="utf-8") as f: 147 | reader = f.read().strip().split('\n') 148 | for i ,data in enumerate(reader): 149 | if data!="" and data!=" ": 150 | text=data.split("\t")[1] 151 | label= data.split("\t")[0] 152 | guid=i 153 | example=InputExample(guid=guid,text_a=text,text_b=None,label=label) 154 | examples.append(example) 155 | return examples 156 | 157 | def convert_single_example(example, max_seq_length, tokenizer): 158 | """Converts a single `InputExample` into a single `InputFeatures`.""" 159 | 160 | tokens_a = tokenizer.tokenize(example.text_a) 161 | # Account for [CLS] and [SEP] with "- 2" 162 | if len(tokens_a) > max_seq_length - 2: 163 | tokens_a = tokens_a[0:(max_seq_length - 2)] 164 | 165 | # The convention in BERT is: 166 | # (a) For sequence pairs: 167 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 168 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 169 | # (b) For single sequences: 170 | # tokens: [CLS] the dog is hairy . [SEP] 171 | # type_ids: 0 0 0 0 0 0 0 172 | # 173 | # Where "type_ids" are used to indicate whether this is the first 174 | # sequence or the second sequence. The embedding vectors for `type=0` and 175 | # `type=1` were learned during pre-training and are added to the wordpiece 176 | # embedding vector (and position vector). This is not *strictly* necessary 177 | # since the [SEP] token unambiguously separates the sequences, but it makes 178 | # it easier for the model to learn the concept of sequences. 179 | # 180 | # For classification tasks, the first vector (corresponding to [CLS]) is 181 | # used as as the "sentence vector". Note that this only makes sense because 182 | # the entire model is fine-tuned. 183 | tokens = [] 184 | segment_ids = [] 185 | tokens.append("[CLS]") 186 | segment_ids.append(0) 187 | for token in tokens_a: 188 | tokens.append(token) 189 | segment_ids.append(0) 190 | tokens.append("[SEP]") 191 | segment_ids.append(0) 192 | 193 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 194 | 195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 196 | # tokens are attended to. 197 | input_mask = [1] * len(input_ids) 198 | 199 | # Zero-pad up to the sequence length. 
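# For example (token ids are illustrative), with max_seq_length=8 and tokens
# ["[CLS]", "北", "京", "[SEP]"], this yields input_ids [101, 1266, 776, 102, 0, 0, 0, 0],
# input_mask [1, 1, 1, 1, 0, 0, 0, 0] and segment_ids [0, 0, 0, 0, 0, 0, 0, 0].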
200 | while len(input_ids) < max_seq_length: 201 | input_ids.append(0) 202 | input_mask.append(0) 203 | segment_ids.append(0) 204 | 205 | assert len(input_ids) == max_seq_length 206 | assert len(input_mask) == max_seq_length 207 | assert len(segment_ids) == max_seq_length 208 | 209 | label_id = int(example.label) 210 | 211 | feature = InputFeatures( 212 | input_ids=input_ids, 213 | input_mask=input_mask, 214 | segment_ids=segment_ids, 215 | label_id=label_id) 216 | return feature 217 | 218 | def convert_examples_to_features(examples, max_seq_length, 219 | tokenizer): 220 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 221 | features = [] 222 | for (ex_index, example) in enumerate(examples): 223 | feature = convert_single_example(example, max_seq_length, tokenizer) 224 | features.append(feature) 225 | return features 226 | def file_based_convert_examples_to_features( 227 | examples, max_seq_length, tokenizer, output_file): 228 | """Convert a set of `InputExample`s to a TFRecord file.""" 229 | 230 | writer = tf.python_io.TFRecordWriter(output_file) 231 | 232 | for (ex_index, example) in enumerate(examples): 233 | if ex_index % 10000 == 0: 234 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 235 | 236 | feature = convert_single_example(example, max_seq_length, tokenizer) 237 | 238 | def create_int_feature(values): 239 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 240 | return f 241 | 242 | features = collections.OrderedDict() 243 | features["input_ids"] = create_int_feature(feature.input_ids) 244 | features["input_mask"] = create_int_feature(feature.input_mask) 245 | features["segment_ids"] = create_int_feature(feature.segment_ids) 246 | features["label_ids"] = create_int_feature([feature.label_id]) 247 | 248 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 249 | writer.write(tf_example.SerializeToString()) 250 | writer.close() 251 | def file_based_input_fn_builder(input_file, seq_length, is_training, 252 | drop_remainder): 253 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 254 | 255 | name_to_features = { 256 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 257 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 258 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 259 | "label_ids": tf.FixedLenFeature([], tf.int64), 260 | } 261 | 262 | def _decode_record(record, name_to_features): 263 | """Decodes a record to a TensorFlow example.""" 264 | example = tf.parse_single_example(record, name_to_features) 265 | 266 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 267 | # So cast all int64 to int32. 268 | for name in list(example.keys()): 269 | t = example[name] 270 | if t.dtype == tf.int64: 271 | t = tf.to_int32(t) 272 | example[name] = t 273 | 274 | return example 275 | 276 | def input_fn(params): 277 | """The actual input function.""" 278 | batch_size = params["batch_size"] 279 | 280 | # For training, we want a lot of parallel reading and shuffling. 281 | # For eval, we want no shuffling and parallel reading doesn't matter.
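    # The dataset below re-reads the TFRecord file written by
    # file_based_convert_examples_to_features, then decodes and batches the
    # records in one step via map_and_batch. drop_remainder is passed as True
    # for training (and for eval/predict on TPU), where every batch must have
    # the same static shape.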
282 | d = tf.data.TFRecordDataset(input_file) 283 | if is_training: 284 | d = d.repeat() 285 | d = d.shuffle(buffer_size=100) 286 | 287 | d = d.apply( 288 | tf.contrib.data.map_and_batch( 289 | lambda record: _decode_record(record, name_to_features), 290 | batch_size=batch_size, 291 | drop_remainder=drop_remainder)) 292 | 293 | return d 294 | 295 | return input_fn 296 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 297 | labels, num_labels, use_one_hot_embeddings): 298 | """Creates a classification model.""" 299 | model = modeling.BertModel( 300 | config=bert_config, 301 | is_training=is_training, 302 | input_ids=input_ids, 303 | input_mask=input_mask, 304 | token_type_ids=segment_ids, 305 | use_one_hot_embeddings=use_one_hot_embeddings) 306 | 307 | # In the demo, we are doing a simple classification task on the entire 308 | # segment. 309 | # 310 | # If you want to use the token-level output, use model.get_sequence_output() 311 | # instead. 312 | output_layer = model.get_pooled_output() 313 | 314 | hidden_size = output_layer.shape[-1].value 315 | 316 | output_weights = tf.get_variable( 317 | "output_weights", [num_labels, hidden_size], 318 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 319 | 320 | output_bias = tf.get_variable( 321 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 322 | 323 | with tf.variable_scope("loss"): 324 | if is_training: 325 | # I.e., 0.1 dropout 326 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 327 | 328 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 329 | logits = tf.nn.bias_add(logits, output_bias) 330 | probabilities = tf.nn.softmax(logits, axis=-1) 331 | log_probs = tf.nn.log_softmax(logits, axis=-1) 332 | 333 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 334 | 335 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 336 | loss = tf.reduce_mean(per_example_loss) 337 | 338 | return (loss, per_example_loss, logits, probabilities) 339 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 340 | num_train_steps, num_warmup_steps, use_tpu, 341 | use_one_hot_embeddings): 342 | """Returns `model_fn` closure for TPUEstimator.""" 343 | 344 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 345 | """The `model_fn` for TPUEstimator.""" 346 | 347 | tf.logging.info("*** Features ***") 348 | for name in sorted(features.keys()): 349 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 350 | 351 | input_ids = features["input_ids"] 352 | input_mask = features["input_mask"] 353 | segment_ids = features["segment_ids"] 354 | label_ids = features["label_ids"] 355 | 356 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 357 | 358 | (total_loss, per_example_loss, logits, probabilities) = create_model( 359 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 360 | num_labels, use_one_hot_embeddings) 361 | 362 | tvars = tf.trainable_variables() 363 | initialized_variable_names = {} 364 | scaffold_fn = None 365 | if init_checkpoint: 366 | (assignment_map, initialized_variable_names 367 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 368 | if use_tpu: 369 | 370 | def tpu_scaffold(): 371 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 372 | return tf.train.Scaffold() 373 | 374 | scaffold_fn = tpu_scaffold 375 | else: 376 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 377 | 378 | tf.logging.info("**** Trainable
Variables ****") 379 | for var in tvars: 380 | init_string = "" 381 | if var.name in initialized_variable_names: 382 | init_string = ", *INIT_FROM_CKPT*" 383 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 384 | init_string) 385 | 386 | output_spec = None 387 | if mode == tf.estimator.ModeKeys.TRAIN: 388 | 389 | train_op = optimization.create_optimizer( 390 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 391 | 392 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 393 | mode=mode, 394 | loss=total_loss, 395 | train_op=train_op, 396 | scaffold_fn=scaffold_fn) 397 | elif mode == tf.estimator.ModeKeys.EVAL: 398 | 399 | def metric_fn(per_example_loss, label_ids, logits): 400 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 401 | accuracy = tf.metrics.accuracy(label_ids, predictions) 402 | loss = tf.metrics.mean(per_example_loss) 403 | return { 404 | "eval_accuracy": accuracy, 405 | "eval_loss": loss, 406 | } 407 | 408 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 409 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 410 | mode=mode, 411 | loss=total_loss, 412 | eval_metrics=eval_metrics, 413 | scaffold_fn=scaffold_fn) 414 | else: 415 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 416 | mode=mode, 417 | predictions=probabilities, 418 | scaffold_fn=scaffold_fn) 419 | return output_spec 420 | 421 | return model_fn 422 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 423 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 424 | 425 | all_input_ids = [] 426 | all_input_mask = [] 427 | all_segment_ids = [] 428 | all_label_ids = [] 429 | 430 | for feature in features: 431 | all_input_ids.append(feature.input_ids) 432 | all_input_mask.append(feature.input_mask) 433 | all_segment_ids.append(feature.segment_ids) 434 | all_label_ids.append(feature.label_id) 435 | 436 | def input_fn(params): 437 | """The actual input function.""" 438 | batch_size = params["batch_size"] 439 | 440 | num_examples = len(features) 441 | 442 | # This is for demo purposes and does NOT scale to large data sets. We do 443 | # not use Dataset.from_generator() because that uses tf.py_func which is 444 | # not TPU compatible. The right way to load data is with TFRecordReader. 
445 | d = tf.data.Dataset.from_tensor_slices({ 446 | "input_ids": 447 | tf.constant( 448 | all_input_ids, shape=[num_examples, seq_length], 449 | dtype=tf.int32), 450 | "input_mask": 451 | tf.constant( 452 | all_input_mask, 453 | shape=[num_examples, seq_length], 454 | dtype=tf.int32), 455 | "segment_ids": 456 | tf.constant( 457 | all_segment_ids, 458 | shape=[num_examples, seq_length], 459 | dtype=tf.int32), 460 | "label_ids": 461 | tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), 462 | }) 463 | 464 | if is_training: 465 | d = d.repeat() 466 | d = d.shuffle(buffer_size=100) 467 | 468 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 469 | return d 470 | 471 | return input_fn 472 | def main(_): 473 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 474 | tf.gfile.MakeDirs(FLAGS.output_dir) 475 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 476 | tpu_cluster_resolver = None 477 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 478 | run_config = tf.contrib.tpu.RunConfig( 479 | cluster=tpu_cluster_resolver, 480 | master=FLAGS.master, 481 | model_dir=FLAGS.output_dir, 482 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 483 | tpu_config=tf.contrib.tpu.TPUConfig( 484 | iterations_per_loop=FLAGS.iterations_per_loop, 485 | num_shards=FLAGS.num_tpu_cores, 486 | per_host_input_for_training=is_per_host)) 487 | 488 | train_examples = None 489 | num_train_steps = None 490 | num_warmup_steps = None 491 | if FLAGS.do_train: 492 | train_examples = read_data(FLAGS.train_dir) 493 | num_train_steps = int( 494 | len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 495 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 496 | 497 | model_fn = model_fn_builder( 498 | bert_config=bert_config, 499 | num_labels=10, 500 | init_checkpoint=FLAGS.init_checkpoint, 501 | learning_rate=FLAGS.learning_rate, 502 | num_train_steps=num_train_steps, 503 | num_warmup_steps=num_warmup_steps, 504 | use_tpu=FLAGS.use_tpu, 505 | use_one_hot_embeddings=FLAGS.use_tpu) 506 | 507 | # If TPU is not available, this will fall back to normal Estimator on CPU 508 | # or GPU. 
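  # num_labels is hard-coded to 10 in the model_fn_builder call above, so the
  # TSV data must use integer labels 0-9 (read_data expects "<label>\t<text>"
  # lines). As a worked example with hypothetical numbers (50,000 training
  # examples, the README's train_batch_size=6 and num_train_epochs=10.0):
  #   num_train_steps  = int(50000 / 6 * 10.0) = 83333
  #   num_warmup_steps = int(83333 * 0.1)      = 8333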
509 | estimator = tf.contrib.tpu.TPUEstimator( 510 | use_tpu=FLAGS.use_tpu, 511 | model_fn=model_fn, 512 | config=run_config, 513 | train_batch_size=FLAGS.train_batch_size, 514 | eval_batch_size=FLAGS.eval_batch_size, 515 | predict_batch_size=FLAGS.predict_batch_size) 516 | 517 | if FLAGS.do_train: 518 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 519 | file_based_convert_examples_to_features( 520 | train_examples, FLAGS.max_seq_length, tokenizer, train_file) 521 | tf.logging.info("***** Running training *****") 522 | tf.logging.info(" Num examples = %d", len(train_examples)) 523 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 524 | tf.logging.info(" Num steps = %d", num_train_steps) 525 | train_input_fn = file_based_input_fn_builder( 526 | input_file=train_file, 527 | seq_length=FLAGS.max_seq_length, 528 | is_training=True, 529 | drop_remainder=True) 530 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 531 | 532 | if FLAGS.do_eval: 533 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 534 | eval_examples = read_data(FLAGS.dev_dir) 535 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 536 | file_based_convert_examples_to_features( 537 | examples=eval_examples, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer, output_file=eval_file) 538 | 539 | tf.logging.info("***** Running evaluation *****") 540 | tf.logging.info(" Num examples = %d", len(eval_examples)) 541 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 542 | 543 | # This tells the estimator to run through the entire set. 544 | eval_steps = None 545 | # However, if running eval on the TPU, you will need to specify the 546 | # number of steps. 547 | if FLAGS.use_tpu: 548 | # Eval will be slightly WRONG on the TPU because it will truncate 549 | # the last batch. 
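      # For example, with a hypothetical 1,003 dev examples and the default
      # eval_batch_size=8, eval_steps = int(1003 / 8) = 125, so the last 3
      # examples are silently skipped on TPU; off TPU, eval_steps stays None
      # and the whole dev set is evaluated.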
550 | eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) 551 | 552 | eval_drop_remainder = True if FLAGS.use_tpu else False 553 | eval_input_fn = file_based_input_fn_builder( 554 | input_file=eval_file, 555 | seq_length=FLAGS.max_seq_length, 556 | is_training=False, 557 | drop_remainder=eval_drop_remainder) 558 | 559 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 560 | 561 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 562 | with tf.gfile.GFile(output_eval_file, "w") as writer: 563 | tf.logging.info("***** Eval results *****") 564 | for key in sorted(result.keys()): 565 | tf.logging.info(" %s = %s", key, str(result[key])) 566 | writer.write("%s = %s\n" % (key, str(result[key]))) 567 | 568 | if FLAGS.do_predict: 569 | tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 570 | predict_examples = read_data(FLAGS.test_dir) 571 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 572 | file_based_convert_examples_to_features(predict_examples,FLAGS.max_seq_length, tokenizer=tokenizer,output_file=predict_file) 573 | 574 | tf.logging.info("***** Running prediction*****") 575 | tf.logging.info(" Num examples = %d", len(predict_examples)) 576 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 577 | 578 | if FLAGS.use_tpu: 579 | # Warning: According to tpu_estimator.py Prediction on TPU is an 580 | # experimental feature and hence not supported here 581 | raise ValueError("Prediction in TPU not supported") 582 | 583 | predict_drop_remainder = True if FLAGS.use_tpu else False 584 | predict_input_fn = file_based_input_fn_builder( 585 | input_file=predict_file, 586 | seq_length=FLAGS.max_seq_length, 587 | is_training=False, 588 | drop_remainder=predict_drop_remainder) 589 | 590 | result = estimator.predict(input_fn=predict_input_fn) 591 | 592 | output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") 593 | with tf.gfile.GFile(output_predict_file, "w") as writer: 594 | tf.logging.info("***** Predict results *****") 595 | for prediction in result: 596 | output_line = "\t".join(str(pre) for pre in prediction) + "\n" 597 | writer.write(output_line) 598 | 599 | 600 | if __name__ == "__main__": 601 | flags.mark_flag_as_required("train_dir") 602 | flags.mark_flag_as_required("dev_dir") 603 | flags.mark_flag_as_required("test_dir") 604 | flags.mark_flag_as_required("vocab_file") 605 | flags.mark_flag_as_required("bert_config_file") 606 | flags.mark_flag_as_required("output_dir") 607 | tf.app.run() 608 | -------------------------------------------------------------------------------- /tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
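# Overview: the classes below implement BERT's two-stage tokenization.
# BasicTokenizer cleans the text, optionally lower-cases it, puts spaces
# around every CJK character and splits on punctuation; WordpieceTokenizer
# then greedily splits each token into the longest matching vocabulary
# subwords; FullTokenizer chains the two. For Chinese input such as
# "我爱北京" this effectively yields one token per character, e.g.
# ['我', '爱', '北', '京'] (assuming each character is in the vocab,
# otherwise it falls back to [UNK]).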
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_tokens_to_ids(vocab, tokens): 86 | """Converts a sequence of tokens into ids using the vocab.""" 87 | ids = [] 88 | for token in tokens: 89 | ids.append(vocab[token]) 90 | return ids 91 | 92 | 93 | def whitespace_tokenize(text): 94 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 95 | text = text.strip() 96 | if not text: 97 | return [] 98 | tokens = text.split() 99 | return tokens 100 | 101 | 102 | class FullTokenizer(object): 103 | """Runs end-to-end tokenziation.""" 104 | 105 | def __init__(self, vocab_file, do_lower_case=True): 106 | self.vocab = load_vocab(vocab_file) 107 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 108 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 109 | 110 | def tokenize(self, text): 111 | split_tokens = [] 112 | for token in self.basic_tokenizer.tokenize(text): 113 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 114 | split_tokens.append(sub_token) 115 | 116 | return split_tokens 117 | 118 | def convert_tokens_to_ids(self, tokens): 119 | return convert_tokens_to_ids(self.vocab, tokens) 120 | 121 | 122 | class BasicTokenizer(object): 123 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 124 | 125 | def __init__(self, do_lower_case=True): 126 | """Constructs a BasicTokenizer. 127 | 128 | Args: 129 | do_lower_case: Whether to lower case the input. 
130 | """ 131 | self.do_lower_case = do_lower_case 132 | 133 | def tokenize(self, text): 134 | """Tokenizes a piece of text.""" 135 | text = convert_to_unicode(text) 136 | text = self._clean_text(text) 137 | 138 | # This was added on November 1st, 2018 for the multilingual and Chinese 139 | # models. This is also applied to the English models now, but it doesn't 140 | # matter since the English models were not trained on any Chinese data 141 | # and generally don't have any Chinese data in them (there are Chinese 142 | # characters in the vocabulary because Wikipedia does have some Chinese 143 | # words in the English Wikipedia.). 144 | text = self._tokenize_chinese_chars(text) 145 | 146 | orig_tokens = whitespace_tokenize(text) 147 | split_tokens = [] 148 | for token in orig_tokens: 149 | if self.do_lower_case: 150 | token = token.lower() 151 | token = self._run_strip_accents(token) 152 | split_tokens.extend(self._run_split_on_punc(token)) 153 | 154 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 155 | return output_tokens 156 | 157 | def _run_strip_accents(self, text): 158 | """Strips accents from a piece of text.""" 159 | text = unicodedata.normalize("NFD", text) 160 | output = [] 161 | for char in text: 162 | cat = unicodedata.category(char) 163 | if cat == "Mn": 164 | continue 165 | output.append(char) 166 | return "".join(output) 167 | 168 | def _run_split_on_punc(self, text): 169 | """Splits punctuation on a piece of text.""" 170 | chars = list(text) 171 | i = 0 172 | start_new_word = True 173 | output = [] 174 | while i < len(chars): 175 | char = chars[i] 176 | if _is_punctuation(char): 177 | output.append([char]) 178 | start_new_word = True 179 | else: 180 | if start_new_word: 181 | output.append([]) 182 | start_new_word = False 183 | output[-1].append(char) 184 | i += 1 185 | 186 | return ["".join(x) for x in output] 187 | 188 | def _tokenize_chinese_chars(self, text): 189 | """Adds whitespace around any CJK character.""" 190 | output = [] 191 | for char in text: 192 | cp = ord(char) 193 | if self._is_chinese_char(cp): 194 | output.append(" ") 195 | output.append(char) 196 | output.append(" ") 197 | else: 198 | output.append(char) 199 | return "".join(output) 200 | 201 | def _is_chinese_char(self, cp): 202 | """Checks whether CP is the codepoint of a CJK character.""" 203 | # This defines a "chinese character" as anything in the CJK Unicode block: 204 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 205 | # 206 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 207 | # despite its name. The modern Korean Hangul alphabet is a different block, 208 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 209 | # space-separated words, so they are not treated specially and handled 210 | # like the all of the other languages. 
211 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 212 | (cp >= 0x3400 and cp <= 0x4DBF) or # 213 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 214 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 215 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 216 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 217 | (cp >= 0xF900 and cp <= 0xFAFF) or # 218 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 219 | return True 220 | 221 | return False 222 | 223 | def _clean_text(self, text): 224 | """Performs invalid character removal and whitespace cleanup on text.""" 225 | output = [] 226 | for char in text: 227 | cp = ord(char) 228 | if cp == 0 or cp == 0xfffd or _is_control(char): 229 | continue 230 | if _is_whitespace(char): 231 | output.append(" ") 232 | else: 233 | output.append(char) 234 | return "".join(output) 235 | 236 | 237 | class WordpieceTokenizer(object): 238 | """Runs WordPiece tokenziation.""" 239 | 240 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 241 | self.vocab = vocab 242 | self.unk_token = unk_token 243 | self.max_input_chars_per_word = max_input_chars_per_word 244 | 245 | def tokenize(self, text): 246 | """Tokenizes a piece of text into its word pieces. 247 | 248 | This uses a greedy longest-match-first algorithm to perform tokenization 249 | using the given vocabulary. 250 | 251 | For example: 252 | input = "unaffable" 253 | output = ["un", "##aff", "##able"] 254 | 255 | Args: 256 | text: A single token or whitespace separated tokens. This should have 257 | already been passed through `BasicTokenizer. 258 | 259 | Returns: 260 | A list of wordpiece tokens. 261 | """ 262 | 263 | text = convert_to_unicode(text) 264 | 265 | output_tokens = [] 266 | for token in whitespace_tokenize(text): 267 | chars = list(token) 268 | if len(chars) > self.max_input_chars_per_word: 269 | output_tokens.append(self.unk_token) 270 | continue 271 | 272 | is_bad = False 273 | start = 0 274 | sub_tokens = [] 275 | while start < len(chars): 276 | end = len(chars) 277 | cur_substr = None 278 | while start < end: 279 | substr = "".join(chars[start:end]) 280 | if start > 0: 281 | substr = "##" + substr 282 | if substr in self.vocab: 283 | cur_substr = substr 284 | break 285 | end -= 1 286 | if cur_substr is None: 287 | is_bad = True 288 | break 289 | sub_tokens.append(cur_substr) 290 | start = end 291 | 292 | if is_bad: 293 | output_tokens.append(self.unk_token) 294 | else: 295 | output_tokens.extend(sub_tokens) 296 | return output_tokens 297 | 298 | 299 | def _is_whitespace(char): 300 | """Checks whether `chars` is a whitespace character.""" 301 | # \t, \n, and \r are technically contorl characters but we treat them 302 | # as whitespace since they are generally considered as such. 303 | if char == " " or char == "\t" or char == "\n" or char == "\r": 304 | return True 305 | cat = unicodedata.category(char) 306 | if cat == "Zs": 307 | return True 308 | return False 309 | 310 | 311 | def _is_control(char): 312 | """Checks whether `chars` is a control character.""" 313 | # These are technically control characters but we count them as whitespace 314 | # characters. 315 | if char == "\t" or char == "\n" or char == "\r": 316 | return False 317 | cat = unicodedata.category(char) 318 | if cat.startswith("C"): 319 | return True 320 | return False 321 | 322 | 323 | def _is_punctuation(char): 324 | """Checks whether `chars` is a punctuation character.""" 325 | cp = ord(char) 326 | # We treat all non-letter/number ASCII as punctuation. 
327 | # Characters such as "^", "$", and "`" are not in the Unicode 328 | # Punctuation class but we treat them as punctuation anyways, for 329 | # consistency. 330 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 331 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 332 | return True 333 | cat = unicodedata.category(char) 334 | if cat.startswith("P"): 335 | return True 336 | return False 337 | --------------------------------------------------------------------------------