├── image ├── 01.png ├── 02.png ├── 03.png ├── 04.png ├── 05.png ├── 06.png ├── 07.png ├── 08.png ├── 09.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 20.png ├── 21.png ├── 22.png ├── 23.png ├── 24.png └── equation_1.png ├── README.md ├── Bert模型源码解析.md └── modeling.py /image/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/01.png -------------------------------------------------------------------------------- /image/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/02.png -------------------------------------------------------------------------------- /image/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/03.png -------------------------------------------------------------------------------- /image/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/04.png -------------------------------------------------------------------------------- /image/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/05.png -------------------------------------------------------------------------------- /image/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/06.png -------------------------------------------------------------------------------- /image/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/07.png -------------------------------------------------------------------------------- /image/08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/08.png -------------------------------------------------------------------------------- /image/09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/09.png -------------------------------------------------------------------------------- /image/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/10.png -------------------------------------------------------------------------------- /image/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/11.png -------------------------------------------------------------------------------- /image/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/12.png 
-------------------------------------------------------------------------------- /image/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/13.png -------------------------------------------------------------------------------- /image/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/14.png -------------------------------------------------------------------------------- /image/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/15.png -------------------------------------------------------------------------------- /image/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/16.png -------------------------------------------------------------------------------- /image/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/17.png -------------------------------------------------------------------------------- /image/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/18.png -------------------------------------------------------------------------------- /image/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/19.png -------------------------------------------------------------------------------- /image/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/20.png -------------------------------------------------------------------------------- /image/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/21.png -------------------------------------------------------------------------------- /image/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/22.png -------------------------------------------------------------------------------- /image/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/23.png -------------------------------------------------------------------------------- /image/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/24.png -------------------------------------------------------------------------------- /image/equation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/equation_1.png 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Bert-model-code-interpretation
A walkthrough of the data flow in modeling.py from the TensorFlow implementation of BERT
* [BERT model source code walkthrough](./Bert模型源码解析.md)
* [Article on Jianshu](https://www.jianshu.com/p/2a3872148766)

--------------------------------------------------------------------------------
/Bert模型源码解析.md:
--------------------------------------------------------------------------------
## BERT TensorFlow source code walkthrough (Transformer Encoder computations in detail)

### Contents

* [Preface](#preface)
* [Model input](#model-input)
* [Padding_Mask](#padding_mask)
* [attention_layer](#attention_layer)
* [transformer_model](#transformer_model)
* [Bert_model class](#bert_model-class)
* [Afterword](#afterword)


### Preface

The basics of the BERT model are not covered here; please refer to other articles. This page collects many articles that explain BERT:

[http://www.52nlp.cn/bert-paper-论文-文章-代码资源汇总](http://www.52nlp.cn/bert-paper-%E8%AE%BA%E6%96%87-%E6%96%87%E7%AB%A0-%E4%BB%A3%E7%A0%81%E8%B5%84%E6%BA%90%E6%B1%87%E6%80%BB)

Unlike most articles, this one reads through the model portion of the BERT source code in detail and traces every change the data undergoes from the model's input to its output. This is extremely helpful for understanding BERT, and especially for modifying it. **Please note: before reading on, you should already have a rough understanding of the Transformer and of BERT. This article goes straight into the computational details of the source code and does not cover the basics.** Let us first review the overall model structure:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/01.png?raw=true)

BERT uses the encoder part of the Transformer (see the figure above); the differences are that BERT adds a segment_embedding to the input and that a few model details differ slightly. With that, let's go straight into the source. The model portion of the BERT source code is at:

[https://github.com/google-research/bert/blob/master/modeling.py](https://github.com/google-research/bert/blob/master/modeling.py).



### Model input

BERT's input is the sum of three parts: token_embedding, segment_embedding and position_embedding. They encode, respectively, the token's vector representation, which sentence the token belongs to, and the token's position:

![BERT input](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/02.png?raw=true)

The input is produced by the following two functions:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/03.png?raw=true)

embedding_lookup produces the token_embedding, and embedding_postprocessor adds the three input embeddings together. Note that embedding_postprocessor applies layer normalization and dropout before returning its result:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/04.png?raw=true)



### Padding_Mask

Because input sentences have different lengths, BERT pads them to a fixed length, marking the padded positions with 0 and the real tokens with 1. When attention is computed, the padded positions then receive attention weights close to zero, so that the padding influences the model as little as possible:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/05.png?raw=true)

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/06.png?raw=true)
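As a quick illustration, here is a minimal NumPy sketch of how this 2D padding mask becomes the 3D attention_mask used later. The real code is create_attention_mask_from_input_mask further down in modeling.py; the NumPy version below, with made-up toy values, is only meant to make the shapes concrete:

```python
import numpy as np

# Toy batch: 2 sequences padded to length 4; 1 = real token, 0 = padding.
input_mask = np.array([[1, 1, 1, 0],
                       [1, 1, 0, 0]], dtype=np.float32)                  # [B, T]
batch_size, seq_length = input_mask.shape

# Same idea as create_attention_mask_from_input_mask:
# broadcast a [B, F, 1] tensor of ones against the [B, 1, T] mask.
broadcast_ones = np.ones((batch_size, seq_length, 1), dtype=np.float32)  # [B, F, 1]
attention_mask = broadcast_ones * input_mask[:, None, :]                 # [B, F, T]

# Later, attention_layer turns this into an additive mask on the logits:
# 0 where a position may be attended to, -10000 where it is padding.
adder = (1.0 - attention_mask) * -10000.0

print(attention_mask.shape)  # (2, 4, 4)
print(adder[0, 0])           # last position of sequence 0 gets -10000
```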
### attention_layer

To make the data flow easier to analyze, abbreviate the tensor dimensions as follows:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/07.png?raw=true)

With this notation, the tensor that enters BERT after the embedding layer has shape [B, F, embedding_size], and attention_mask has shape [B, F, T]. Since BERT uses self-attention, F and T are equal. Next I walk through the attention_layer function in detail; it implements BERT's multi-head attention and is the most involved part of the model. You can follow along in the source code for the full details. Before diving in, it also helps to understand the Transformer model proposed by Google in 2017; Jay Alammar's blog post The Illustrated Transformer visualizes it and makes the whole mechanism very easy to grasp. BERT uses the encoder part of the Transformer, and its attention is purely self-attention, which can be viewed as the special case Q = K. That is why attention_layer takes the two arguments from_tensor and to_tensor: one provides Q, the other provides K and V (the meaning of Q, K and V is not explained here; see any article on the Transformer).

attention_layer first defines the helper function transpose_for_scores:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/08.png?raw=true)

Its job is to split the attention inputs (Q, K, V) into tensors of shape [B, N, F or T, H]. Recall from the Transformer that Q, K and V are each obtained from the input word vectors by a linear transformation. Before that linear (MLP) layer, input_tensor (shape [B, F, embedding_size]) is reshaped to 2D (in fact, the caller transformer_model already passes in a 2D tensor, as the next section shows):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/09.png?raw=true)

Next comes the MLP layer: three different linear transformations of input_tensor produce Q, K and V, whose dimensions are then rearranged to give the final Q, K and V:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/10.png?raw=true)

The MLP layer maps [B * F, embedding_size] to [B * F, N * H]. As the later code (transformer_model) shows, embedding_size equals hidden_size equals N * H, so this layer does not actually change the width, which can be confusing at first:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/11.png?raw=true)

The code then uses transpose_for_scores to obtain Q, K and V with shapes [B, N, F, H], [B, N, T, H] and [B, N, T, H]. Oddly, V is not produced by calling transpose_for_scores; the body of that function is simply written out again inline (:joy:).

Now that we have Q, K and V, recall the attention formula from "Attention is all you need":

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/equation_1.png?raw=true)

The next block computes attention_scores, which is the quantity inside the softmax. A quick note on tf.matmul: it performs an ordinary matrix multiplication over the last two dimensions and treats all leading dimensions as a batch, so the two operands must agree on the leading dimensions while the last two only need to satisfy the usual matrix-multiplication rule. If you think through the attention computation, this is exactly what is needed. The resulting attention_scores has shape [B, N, F, T]. Looking only at the last two dimensions (i.e., a single example and a single head), attention_scores holds the weight coefficients produced by Q acting on K (before softmax); since Q and K have lengths F and T, there are F * T such coefficients:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/12.png?raw=true)

Now comes a key step: masking out the padded positions (**this is completely different from the masking used in BERT's masked-token pre-training task; see other articles for that — here we only discuss the model architecture**):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/13.png?raw=true)

The attention_mask obtained earlier has shape [B, F, T]. To make the addition work, a dimension is first inserted at axis 1 (the second dimension; the first is axis=0), giving [B, 1, F, T]. Thanks to **broadcasting**, this can then be added to attention_scores: masked positions get -10000.0 added, unmasked positions get 0. The mask is applied before the softmax on purpose: adding -10000.0 to a logit drives its softmax probability to essentially zero, which is exactly how the padded positions are removed from the attention distribution. After the mask comes the softmax, followed by dropout:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/14.png?raw=true)

The post-softmax weights are then multiplied by V, giving a tensor of shape [B, N, F, H], which is transposed at dimensions 1 and 2 into [B, F, N, H]. The function can return this in either of two shapes:

1. [B * F, N * H] (*the comment in the source writes `N*V` here; the `V` appears to be a typo for `H`, i.e. size_per_head*)
2. [B, F, N * H]

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/15.png?raw=true)

That completes the walk through multi-head attention, the most complex data transformation in BERT. The next function, transformer_model, assembles the full model.
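Before moving on, here is a small, self-contained NumPy sketch that retraces the shape flow of this section (toy sizes, random weights, and a stand-in for tf.layers.dense; it mirrors the steps of attention_layer but is not the TensorFlow code itself):

```python
import numpy as np

# Toy sizes, chosen only to trace the shapes: B=2, F=T=4, N=3, H=5.
B, F, T, N, H = 2, 4, 4, 3, 5
rng = np.random.default_rng(0)

from_tensor_2d = rng.normal(size=(B * F, N * H)).astype(np.float32)   # [B*F, N*H]
attention_mask = np.ones((B, F, T), dtype=np.float32)                  # [B, F, T]
attention_mask[:, :, -1] = 0.0                                         # pretend the last position is padding

def dense(x, out_dim):
    """Stand-in for tf.layers.dense: a random linear projection."""
    w = rng.normal(size=(x.shape[-1], out_dim)).astype(np.float32)
    return x @ w

def transpose_for_scores(x, batch, heads, seq, width):
    # [B*seq, N*H] -> [B, seq, N, H] -> [B, N, seq, H]
    return x.reshape(batch, seq, heads, width).transpose(0, 2, 1, 3)

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Self-attention: Q, K and V all come from the same 2D input.
query = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, F, H)  # [B, N, F, H]
key   = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, T, H)  # [B, N, T, H]
value = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, T, H)  # [B, N, T, H]

scores = query @ key.transpose(0, 1, 3, 2) / np.sqrt(H)                 # [B, N, F, T]
scores += (1.0 - attention_mask[:, None, :, :]) * -10000.0              # broadcast the [B, 1, F, T] mask
probs = softmax(scores)                                                 # [B, N, F, T]

context = probs @ value                                                 # [B, N, F, H]
context = context.transpose(0, 2, 1, 3).reshape(B * F, N * H)           # [B*F, N*H]
print(context.shape)   # (8, 15)
```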
### transformer_model

Next I go through transformer_model, the function that ties all the Transformer Encoder components together. A picture helps a lot here; let's call the structure in the figure below a transformer block:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/16.png?raw=true)

The whole BERT model is simply num_hidden_layers of these structures chained in series, i.e. num_hidden_layers transformer blocks. The self-attention part was covered in the previous function, and the remaining pieces are familiar components (residual connections, MLP, layer norm). transformer_model first prepares the input word vectors and then enters a loop, where each iteration is one block:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/17.png?raw=true)

The screenshot above does not contain the full loop body; we will walk through it step by step. Clearly, the output of one transformer block is fed in as the input of the next. What, then, is the input of the first block? It is input_tensor, the sum of the three input embeddings described earlier. Whether the dimensions line up at every block, and whether the computation is correct, becomes clear from the code that follows. The variable all_layer_outputs collects the output of every block, and the parameter do_return_all_layers selects whether to return every block's output or only the last one. transformer_model calls attention_layer with 2D inputs of shape [B * F or B * T, hidden_size]; as we saw when reading attention_layer, it accepts 2D tensors:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/18.png?raw=true)

As for why the code below keeps a list called attention_heads, I do not know the reason; it looks redundant here. The explanation given in the source comments is:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/19.png?raw=true)

Recall that attention_layer returns a tensor of shape [B * F, N * H] or [B, F, N * H]. Here it is called with do_return_2d_tensor set to True, so attention_output has shape [B * F, N * H]. It then goes through another MLP layer (which does not change the width, because hidden_size = N * H), dropout, and layer_norm:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/20.png?raw=true)

At this point attention_output still has shape [B * F, N * H (= hidden_size)]. As the figure above shows, what follows is another MLP + dropout + layer_norm, except that the width of this MLP, intermediate_size, is a hyperparameter that can be chosen freely:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/21.png?raw=true)

As the code in the screenshot shows, two MLP layers are applied: the shape goes from [B * F, hidden_size] to [B * F, intermediate_size] and back to [B * F, hidden_size], and the following dropout and layer_norm leave it unchanged. That completes one transformer block. Its output, layer_output, becomes the input of the next block, with the same shape as the input of the model's first block; the loop repeats num_hidden_layers times, and the final block's output layer_output still has shape [B * F, hidden_size].

When returning, reshape_from_matrix restores each block's output to the same shape as input_shape, i.e. the shape of the original word-vector input input_tensor ([batch_size, seq_length, hidden_size]):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/22.png?raw=true)



### Bert_model class

For convenience of training, the whole pipeline is wrapped in the BertModel class, and the intermediate results can be accessed through its instances; see the code for the details. After working through the functions above there is nothing complicated left — the class simply puts the pieces together. self.all_encoder_layers holds the per-block outputs returned by transformer_model, and self.sequence_output is the output of the last block; from the analysis above its shape is [batch_size, seq_length, hidden_size], the same as the word-vector input at the start, except that it has now passed through the Transformer Encoder and carries the extracted features — the representation BERT is after:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/23.png?raw=true)

After this step, the member self.pooled_output stores the output of the first position passed through one more MLP layer. If you know the input format, you know this position is the [CLS] token, whose output is used during BERT pre-training to judge the sentence-pair (next-sentence) relationship:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/24.png?raw=true)

Besides pre-training, this result can also be used when fine-tuning BERT for classification tasks; see:

[https://www.jianshu.com/p/22e462f01d8c](https://www.jianshu.com/p/22e462f01d8c)
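For quick orientation, here is a minimal usage sketch in the spirit of the example in BertModel's docstring. It assumes a TensorFlow 1.x environment in which this modeling.py is importable as `modeling`, the input ids are made-up toy values, and the config is deliberately tiny (note that hidden_size must be divisible by num_attention_heads):

```python
import tensorflow as tf
import modeling

# Toy inputs that have already been converted to WordPiece ids (hypothetical values).
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 0, 0]])

# A deliberately small config so the example is cheap to build.
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)

model = modeling.BertModel(config=config, is_training=False,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

sequence_output = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]
pooled_output = model.get_pooled_output()      # [batch_size, hidden_size], from the [CLS] position
```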
### Afterword

There are bound to be typos, misunderstandings, or unclear explanations in this article; please bear with them, and you are very welcome to point them out so that we can learn from each other.

--------------------------------------------------------------------------------
/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The main BERT model and related functions."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import copy
23 | import json
24 | import math
25 | import re
26 | import six
27 | import tensorflow as tf
28 | 
29 | 
30 | class BertConfig(object):
31 |   """Configuration for `BertModel`."""
32 | 
33 |   def __init__(self,
34 |                vocab_size,
35 |                hidden_size=768,
36 |                num_hidden_layers=12,
37 |                num_attention_heads=12,
38 |                intermediate_size=3072,
39 |                hidden_act="gelu",
40 |                hidden_dropout_prob=0.1,
41 |                attention_probs_dropout_prob=0.1,
42 |                max_position_embeddings=512,
43 |                type_vocab_size=16,
44 |                initializer_range=0.02):
45 |     """Constructs BertConfig.
46 | 
47 |     Args:
48 |       vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
49 |       hidden_size: Size of the encoder layers and the pooler layer.
50 |       num_hidden_layers: Number of hidden layers in the Transformer encoder.
51 |       num_attention_heads: Number of attention heads for each attention layer in
52 |         the Transformer encoder.
53 |       intermediate_size: The size of the "intermediate" (i.e., feed-forward)
54 |         layer in the Transformer encoder.
55 |       hidden_act: The non-linear activation function (function or string) in the
56 |         encoder and pooler.
57 |       hidden_dropout_prob: The dropout probability for all fully connected
58 |         layers in the embeddings, encoder, and pooler.
59 |       attention_probs_dropout_prob: The dropout ratio for the attention
60 |         probabilities.
61 |       max_position_embeddings: The maximum sequence length that this model might
62 |         ever be used with. Typically set this to something large just in case
63 |         (e.g., 512 or 1024 or 2048).
64 |       type_vocab_size: The vocabulary size of the `token_type_ids` passed into
65 |         `BertModel`.
66 |       initializer_range: The stdev of the truncated_normal_initializer for
67 |         initializing all weight matrices.
68 | """ 69 | self.vocab_size = vocab_size 70 | self.hidden_size = hidden_size 71 | self.num_hidden_layers = num_hidden_layers 72 | self.num_attention_heads = num_attention_heads 73 | self.hidden_act = hidden_act 74 | self.intermediate_size = intermediate_size 75 | self.hidden_dropout_prob = hidden_dropout_prob 76 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 77 | self.max_position_embeddings = max_position_embeddings 78 | self.type_vocab_size = type_vocab_size 79 | self.initializer_range = initializer_range 80 | 81 | @classmethod 82 | def from_dict(cls, json_object): 83 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 84 | config = BertConfig(vocab_size=None) 85 | for (key, value) in six.iteritems(json_object): 86 | config.__dict__[key] = value 87 | return config 88 | 89 | @classmethod 90 | def from_json_file(cls, json_file): 91 | """Constructs a `BertConfig` from a json file of parameters.""" 92 | with tf.gfile.GFile(json_file, "r") as reader: 93 | text = reader.read() 94 | return cls.from_dict(json.loads(text)) 95 | 96 | def to_dict(self): 97 | """Serializes this instance to a Python dictionary.""" 98 | output = copy.deepcopy(self.__dict__) 99 | return output 100 | 101 | def to_json_string(self): 102 | """Serializes this instance to a JSON string.""" 103 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 104 | 105 | 106 | class BertModel(object): 107 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 108 | 109 | Example usage: 110 | 111 | ```python 112 | # Already been converted into WordPiece token ids 113 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 114 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 115 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 116 | 117 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 118 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 119 | 120 | model = modeling.BertModel(config=config, is_training=True, 121 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 122 | 123 | label_embeddings = tf.get_variable(...) 124 | pooled_output = model.get_pooled_output() 125 | logits = tf.matmul(pooled_output, label_embeddings) 126 | ... 127 | ``` 128 | """ 129 | 130 | def __init__(self, 131 | config, 132 | is_training, 133 | input_ids, 134 | input_mask=None, 135 | token_type_ids=None, 136 | use_one_hot_embeddings=True, 137 | scope=None): 138 | """Constructor for BertModel. 139 | 140 | Args: 141 | config: `BertConfig` instance. 142 | is_training: bool. rue for training model, false for eval model. Controls 143 | whether dropout will be applied. 144 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 145 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 146 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 147 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 148 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 149 | it is must faster if this is True, on the CPU or GPU, it is faster if 150 | this is False. 151 | scope: (optional) variable scope. Defaults to "bert". 152 | 153 | Raises: 154 | ValueError: The config is invalid or one of the input tensor shapes 155 | is invalid. 
156 | """ 157 | config = copy.deepcopy(config) 158 | if not is_training: 159 | config.hidden_dropout_prob = 0.0 160 | config.attention_probs_dropout_prob = 0.0 161 | 162 | input_shape = get_shape_list(input_ids, expected_rank=2) 163 | batch_size = input_shape[0] 164 | seq_length = input_shape[1] 165 | 166 | if input_mask is None: 167 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 168 | 169 | if token_type_ids is None: 170 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 171 | 172 | with tf.variable_scope(scope, default_name="bert"): 173 | with tf.variable_scope("embeddings"): 174 | # Perform embedding lookup on the word ids. 175 | (self.embedding_output, self.embedding_table) = embedding_lookup( 176 | input_ids=input_ids, 177 | vocab_size=config.vocab_size, 178 | embedding_size=config.hidden_size, 179 | initializer_range=config.initializer_range, 180 | word_embedding_name="word_embeddings", 181 | use_one_hot_embeddings=use_one_hot_embeddings) 182 | 183 | # Add positional embeddings and token type embeddings, then layer 184 | # normalize and perform dropout. 185 | self.embedding_output = embedding_postprocessor( 186 | input_tensor=self.embedding_output, 187 | use_token_type=True, 188 | token_type_ids=token_type_ids, 189 | token_type_vocab_size=config.type_vocab_size, 190 | token_type_embedding_name="token_type_embeddings", 191 | use_position_embeddings=True, 192 | position_embedding_name="position_embeddings", 193 | initializer_range=config.initializer_range, 194 | max_position_embeddings=config.max_position_embeddings, 195 | dropout_prob=config.hidden_dropout_prob) 196 | 197 | with tf.variable_scope("encoder"): 198 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 199 | # mask of shape [batch_size, seq_length, seq_length] which is used 200 | # for the attention scores. 201 | # return元素都为1维度为[batch_szie, seq_length, seq_length]的矩阵 202 | attention_mask = create_attention_mask_from_input_mask( 203 | input_ids, input_mask) 204 | 205 | # Run the stacked transformer. 206 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 207 | self.all_encoder_layers = transformer_model( 208 | input_tensor=self.embedding_output, 209 | attention_mask=attention_mask, 210 | hidden_size=config.hidden_size, 211 | num_hidden_layers=config.num_hidden_layers, 212 | num_attention_heads=config.num_attention_heads, 213 | intermediate_size=config.intermediate_size, 214 | intermediate_act_fn=get_activation(config.hidden_act), 215 | hidden_dropout_prob=config.hidden_dropout_prob, 216 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 217 | initializer_range=config.initializer_range, 218 | do_return_all_layers=True) 219 | 220 | self.sequence_output = self.all_encoder_layers[-1] 221 | # The "pooler" converts the encoded sequence tensor of shape 222 | # [batch_size, seq_length, hidden_size] to a tensor of shape 223 | # [batch_size, hidden_size]. This is necessary for segment-level 224 | # (or segment-pair-level) classification tasks where we need a fixed 225 | # dimensional representation of the segment. 226 | with tf.variable_scope("pooler"): 227 | # We "pool" the model by simply taking the hidden state corresponding 228 | # to the first token. 
We assume that this has been pre-trained 229 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 230 | self.pooled_output = tf.layers.dense( 231 | first_token_tensor, 232 | config.hidden_size, 233 | activation=tf.tanh, 234 | kernel_initializer=create_initializer(config.initializer_range)) 235 | 236 | def get_pooled_output(self): 237 | return self.pooled_output 238 | 239 | def get_sequence_output(self): 240 | """Gets final hidden layer of encoder. 241 | 242 | Returns: 243 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 244 | to the final hidden of the transformer encoder. 245 | """ 246 | return self.sequence_output 247 | 248 | def get_all_encoder_layers(self): 249 | return self.all_encoder_layers 250 | 251 | def get_embedding_output(self): 252 | """Gets output of the embedding lookup (i.e., input to the transformer). 253 | 254 | Returns: 255 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 256 | to the output of the embedding layer, after summing the word 257 | embeddings with the positional embeddings and the token type embeddings, 258 | then performing layer normalization. This is the input to the transformer. 259 | """ 260 | return self.embedding_output 261 | 262 | def get_embedding_table(self): 263 | return self.embedding_table 264 | 265 | 266 | def gelu(input_tensor): 267 | """Gaussian Error Linear Unit. 268 | 269 | This is a smoother version of the RELU. 270 | Original paper: https://arxiv.org/abs/1606.08415 271 | 272 | Args: 273 | input_tensor: float Tensor to perform activation. 274 | 275 | Returns: 276 | `input_tensor` with the GELU activation applied. 277 | """ 278 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 279 | return input_tensor * cdf 280 | 281 | 282 | def get_activation(activation_string): 283 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 284 | 285 | Args: 286 | activation_string: String name of the activation function. 287 | 288 | Returns: 289 | A Python function corresponding to the activation function. If 290 | `activation_string` is None, empty, or "linear", this will return None. 291 | If `activation_string` is not a string, it will return `activation_string`. 292 | 293 | Raises: 294 | ValueError: The `activation_string` does not correspond to a known 295 | activation. 296 | """ 297 | 298 | # We assume that anything that"s not a string is already an activation 299 | # function, so we just return it. 
300 | if not isinstance(activation_string, six.string_types): 301 | return activation_string 302 | 303 | if not activation_string: 304 | return None 305 | 306 | act = activation_string.lower() 307 | if act == "linear": 308 | return None 309 | elif act == "relu": 310 | return tf.nn.relu 311 | elif act == "gelu": 312 | return gelu 313 | elif act == "tanh": 314 | return tf.tanh 315 | else: 316 | raise ValueError("Unsupported activation: %s" % act) 317 | 318 | 319 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 320 | """Compute the union of the current variables and checkpoint variables.""" 321 | assignment_map = {} 322 | initialized_variable_names = {} 323 | 324 | name_to_variable = collections.OrderedDict() 325 | for var in tvars: 326 | name = var.name 327 | m = re.match("^(.*):\\d+$", name) 328 | if m is not None: 329 | name = m.group(1) 330 | name_to_variable[name] = var 331 | 332 | init_vars = tf.train.list_variables(init_checkpoint) 333 | 334 | assignment_map = collections.OrderedDict() 335 | for x in init_vars: 336 | (name, var) = (x[0], x[1]) 337 | if name not in name_to_variable: 338 | continue 339 | assignment_map[name] = name 340 | initialized_variable_names[name] = 1 341 | initialized_variable_names[name + ":0"] = 1 342 | 343 | return (assignment_map, initialized_variable_names) 344 | 345 | 346 | def dropout(input_tensor, dropout_prob): 347 | """Perform dropout. 348 | 349 | Args: 350 | input_tensor: float Tensor. 351 | dropout_prob: Python float. The probability of dropping out a value (NOT of 352 | *keeping* a dimension as in `tf.nn.dropout`). 353 | 354 | Returns: 355 | A version of `input_tensor` with dropout applied. 356 | """ 357 | if dropout_prob is None or dropout_prob == 0.0: 358 | return input_tensor 359 | 360 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 361 | return output 362 | 363 | 364 | def layer_norm(input_tensor, name=None): 365 | """Run layer normalization on the last dimension of the tensor.""" 366 | return tf.contrib.layers.layer_norm( 367 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 368 | 369 | 370 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 371 | """Runs layer normalization followed by dropout.""" 372 | output_tensor = layer_norm(input_tensor, name) 373 | output_tensor = dropout(output_tensor, dropout_prob) 374 | return output_tensor 375 | 376 | 377 | def create_initializer(initializer_range=0.02): 378 | """Creates a `truncated_normal_initializer` with the given range.""" 379 | return tf.truncated_normal_initializer(stddev=initializer_range) 380 | 381 | 382 | def embedding_lookup(input_ids, 383 | vocab_size, 384 | embedding_size=128, 385 | initializer_range=0.02, 386 | word_embedding_name="word_embeddings", 387 | use_one_hot_embeddings=False): 388 | """Looks up words embeddings for id tensor. 389 | 390 | Args: 391 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 392 | ids. 393 | vocab_size: int. Size of the embedding vocabulary. 394 | embedding_size: int. Width of the word embeddings. 395 | initializer_range: float. Embedding initialization range. 396 | word_embedding_name: string. Name of the embedding table. 397 | use_one_hot_embeddings: bool. If True, use one-hot method for word 398 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 399 | for TPUs. 400 | 401 | Returns: 402 | float Tensor of shape [batch_size, seq_length, embedding_size]. 
403 | """ 404 | # This function assumes that the input is of shape [batch_size, seq_length, 405 | # num_inputs]. 406 | # 407 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 408 | # reshape to [batch_size, seq_length, 1]. 409 | if input_ids.shape.ndims == 2: 410 | input_ids = tf.expand_dims(input_ids, axis=[-1]) 411 | 412 | embedding_table = tf.get_variable( 413 | name=word_embedding_name, 414 | shape=[vocab_size, embedding_size], 415 | initializer=create_initializer(initializer_range)) 416 | 417 | if use_one_hot_embeddings: 418 | flat_input_ids = tf.reshape(input_ids, [-1]) 419 | one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) 420 | output = tf.matmul(one_hot_input_ids, embedding_table) 421 | else: 422 | output = tf.nn.embedding_lookup(embedding_table, input_ids) 423 | # 此时output的维度为[batch_size, seq_length, 1, embedding_size] 424 | 425 | input_shape = get_shape_list(input_ids) 426 | 427 | # 将output变成[batch_size, seq_length, embedding_size]的维度 428 | output = tf.reshape(output, 429 | input_shape[0:-1] + [input_shape[-1] * embedding_size]) 430 | return (output, embedding_table) 431 | 432 | 433 | def embedding_postprocessor(input_tensor, 434 | use_token_type=False, 435 | token_type_ids=None, 436 | token_type_vocab_size=16, 437 | token_type_embedding_name="token_type_embeddings", 438 | use_position_embeddings=True, 439 | position_embedding_name="position_embeddings", 440 | initializer_range=0.02, 441 | max_position_embeddings=512, 442 | dropout_prob=0.1): 443 | """Performs various post-processing on a word embedding tensor. 444 | 445 | Args: 446 | input_tensor: float Tensor of shape [batch_size, seq_length, 447 | embedding_size]. 448 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 449 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 450 | Must be specified if `use_token_type` is True. 451 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 452 | token_type_embedding_name: string. The name of the embedding table variable 453 | for token type ids. 454 | use_position_embeddings: bool. Whether to add position embeddings for the 455 | position of each token in the sequence. 456 | position_embedding_name: string. The name of the embedding table variable 457 | for positional embeddings. 458 | initializer_range: float. Range of the weight initialization. 459 | max_position_embeddings: int. Maximum sequence length that might ever be 460 | used with this model. This can be longer than the sequence length of 461 | input_tensor, but cannot be shorter. 462 | dropout_prob: float. Dropout probability applied to the final output tensor. 463 | 464 | Returns: 465 | float tensor with same shape as `input_tensor`. 466 | 467 | Raises: 468 | ValueError: One of the tensor shapes or input values is invalid. 469 | """ 470 | input_shape = get_shape_list(input_tensor, expected_rank=3) 471 | batch_size = input_shape[0] 472 | seq_length = input_shape[1] 473 | width = input_shape[2] 474 | 475 | output = input_tensor 476 | 477 | if use_token_type: 478 | if token_type_ids is None: 479 | raise ValueError("`token_type_ids` must be specified if" 480 | "`use_token_type` is True.") 481 | token_type_table = tf.get_variable( 482 | name=token_type_embedding_name, 483 | shape=[token_type_vocab_size, width], 484 | initializer=create_initializer(initializer_range)) 485 | # This vocab will be small so we always do one-hot here, since it is always 486 | # faster for a small vocabulary. 
487 | flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 488 | one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) 489 | token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) 490 | token_type_embeddings = tf.reshape(token_type_embeddings, 491 | [batch_size, seq_length, width]) 492 | output += token_type_embeddings 493 | 494 | if use_position_embeddings: 495 | assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) 496 | with tf.control_dependencies([assert_op]): 497 | full_position_embeddings = tf.get_variable( 498 | name=position_embedding_name, 499 | shape=[max_position_embeddings, width], 500 | initializer=create_initializer(initializer_range)) 501 | # Since the position embedding table is a learned variable, we create it 502 | # using a (long) sequence length `max_position_embeddings`. The actual 503 | # sequence length might be shorter than this, for faster training of 504 | # tasks that do not have long sequences. 505 | # 506 | # So `full_position_embeddings` is effectively an embedding table 507 | # for position [0, 1, 2, ..., max_position_embeddings-1], and the current 508 | # sequence has positions [0, 1, 2, ... seq_length-1], so we can just 509 | # perform a slice. 510 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], 511 | [seq_length, -1]) 512 | num_dims = len(output.shape.as_list()) 513 | 514 | # Only the last two dimensions are relevant (`seq_length` and `width`), so 515 | # we broadcast among the first dimensions, which is typically just 516 | # the batch size. 517 | position_broadcast_shape = [] 518 | for _ in range(num_dims - 2): 519 | position_broadcast_shape.append(1) 520 | position_broadcast_shape.extend([seq_length, width]) 521 | # 将position_embeddings变成和output相同的维度以便相加 522 | position_embeddings = tf.reshape(position_embeddings, 523 | position_broadcast_shape) 524 | output += position_embeddings 525 | 526 | # LN和droupout处理,这里的droupout为1-droupout_prob 527 | output = layer_norm_and_dropout(output, dropout_prob) 528 | return output 529 | 530 | 531 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 532 | """Create 3D attention mask from a 2D tensor mask. 533 | 534 | Args: 535 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 536 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 537 | 538 | Returns: 539 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 540 | """ 541 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 542 | batch_size = from_shape[0] 543 | from_seq_length = from_shape[1] 544 | 545 | to_shape = get_shape_list(to_mask, expected_rank=2) 546 | to_seq_length = to_shape[1] 547 | 548 | to_mask = tf.cast( 549 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 550 | 551 | # We don't assume that `from_tensor` is a mask (although it could be). We 552 | # don't actually care if we attend *from* padding tokens (only *to* padding) 553 | # tokens so we create a tensor of all ones. 554 | # 555 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 556 | broadcast_ones = tf.ones( 557 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 558 | 559 | # Here we broadcast along two dimensions to create the mask. 
560 | mask = broadcast_ones * to_mask 561 | 562 | return mask 563 | 564 | 565 | def attention_layer(from_tensor, 566 | to_tensor, 567 | attention_mask=None, 568 | num_attention_heads=1, 569 | size_per_head=512, 570 | query_act=None, 571 | key_act=None, 572 | value_act=None, 573 | attention_probs_dropout_prob=0.0, 574 | initializer_range=0.02, 575 | do_return_2d_tensor=False, 576 | batch_size=None, 577 | from_seq_length=None, 578 | to_seq_length=None): 579 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 580 | 581 | This is an implementation of multi-headed attention based on "Attention 582 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 583 | this is self-attention. Each timestep in `from_tensor` attends to the 584 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 585 | 586 | This function first projects `from_tensor` into a "query" tensor and 587 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 588 | of tensors of length `num_attention_heads`, where each tensor is of shape 589 | [batch_size, seq_length, size_per_head]. 590 | 591 | Then, the query and key tensors are dot-producted and scaled. These are 592 | softmaxed to obtain attention probabilities. The value tensors are then 593 | interpolated by these probabilities, then concatenated back to a single 594 | tensor and returned. 595 | 596 | In practice, the multi-headed attention are done with transposes and 597 | reshapes rather than actual separate tensors. 598 | 599 | Args: 600 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 601 | from_width]. 602 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 603 | attention_mask: (optional) int32 Tensor of shape [batch_size, 604 | from_seq_length, to_seq_length]. The values should be 1 or 0. The 605 | attention scores will effectively be set to -infinity for any positions in 606 | the mask that are 0, and will be unchanged for positions that are 1. 607 | num_attention_heads: int. Number of attention heads. 608 | size_per_head: int. Size of each attention head. 609 | query_act: (optional) Activation function for the query transform. 610 | key_act: (optional) Activation function for the key transform. 611 | value_act: (optional) Activation function for the value transform. 612 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 613 | attention probabilities. 614 | initializer_range: float. Range of the weight initializer. 615 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 616 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 617 | output will be of shape [batch_size, from_seq_length, num_attention_heads 618 | * size_per_head]. 619 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 620 | of the 3D version of the `from_tensor` and `to_tensor`. 621 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 622 | of the 3D version of the `from_tensor`. 623 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 624 | of the 3D version of the `to_tensor`. 625 | 626 | Returns: 627 | float Tensor of shape [batch_size, from_seq_length, 628 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 629 | true, this will be of shape [batch_size * from_seq_length, 630 | num_attention_heads * size_per_head]). 631 | 632 | Raises: 633 | ValueError: Any of the arguments or tensor shapes are invalid. 
634 | """ 635 | 636 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 637 | seq_length, width): 638 | output_tensor = tf.reshape( 639 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 640 | 641 | # 第二个维度变成num_attention_heads,相当于切割 642 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 643 | return output_tensor 644 | 645 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 646 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 647 | 648 | if len(from_shape) != len(to_shape): 649 | raise ValueError( 650 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 651 | 652 | if len(from_shape) == 3: 653 | batch_size = from_shape[0] 654 | from_seq_length = from_shape[1] 655 | to_seq_length = to_shape[1] 656 | elif len(from_shape) == 2: 657 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 658 | raise ValueError( 659 | "When passing in rank 2 tensors to attention_layer, the values " 660 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 661 | "must all be specified.") 662 | 663 | # Scalar dimensions referenced here: 664 | # B = batch size (number of sequences) 665 | # F = `from_tensor` sequence length 666 | # T = `to_tensor` sequence length 667 | # N = `num_attention_heads` 668 | # H = `size_per_head` 669 | 670 | # 转化成二维向量,第二个维度为width,第一个维度变成batch_size * from_seq_length 671 | from_tensor_2d = reshape_to_matrix(from_tensor) 672 | to_tensor_2d = reshape_to_matrix(to_tensor) 673 | 674 | # 对Q做线性映射,映射到-1维度为num_attention_heads * size_per_head 675 | # `query_layer` = [B*F, N*H] 676 | query_layer = tf.layers.dense( 677 | from_tensor_2d, 678 | num_attention_heads * size_per_head, 679 | activation=query_act, 680 | name="query", 681 | kernel_initializer=create_initializer(initializer_range)) 682 | 683 | # 对K做线性映射,映射到-1维度为num_attention_heads * size_per_head 684 | # `key_layer` = [B*T, N*H] 685 | key_layer = tf.layers.dense( 686 | to_tensor_2d, 687 | num_attention_heads * size_per_head, 688 | activation=key_act, 689 | name="key", 690 | kernel_initializer=create_initializer(initializer_range)) 691 | 692 | # 对V做线性映射,映射到-1维度为num_attention_heads * size_per_head 693 | # `value_layer` = [B*T, N*H] 694 | value_layer = tf.layers.dense( 695 | to_tensor_2d, 696 | num_attention_heads * size_per_head, 697 | activation=value_act, 698 | name="value", 699 | kernel_initializer=create_initializer(initializer_range)) 700 | 701 | # 此步从-1维度再经1,2为互换将Q或K或V变成 702 | # 维度batch_size * num_attention_heads * from_seq_length * size_per_head 703 | # `query_layer` = [B, N, F, H] 704 | query_layer = transpose_for_scores(query_layer, batch_size, 705 | num_attention_heads, from_seq_length, 706 | size_per_head) 707 | 708 | # `key_layer` = [B, N, T, H] 709 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 710 | to_seq_length, size_per_head) 711 | 712 | # Take the dot product between "query" and "key" to get the raw 713 | # attention scores. 
714 | # `attention_scores` = [B, N, F, T] 715 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 716 | attention_scores = tf.multiply(attention_scores, 717 | 1.0 / math.sqrt(float(size_per_head))) 718 | 719 | if attention_mask is not None: 720 | # `attention_mask` = [B, 1, F, T] 721 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 722 | 723 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 724 | # masked positions, this operation will create a tensor which is 0.0 for 725 | # positions we want to attend and -10000.0 for masked positions. 726 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 727 | 728 | # Since we are adding it to the raw scores before the softmax, this is 729 | # effectively the same as removing these entirely. 730 | # 这里维度为什么可以直接相加?????????????????广播机制!!!! 731 | attention_scores += adder 732 | 733 | # Normalize the attention scores to probabilities. 734 | # `attention_probs` = [B, N, F, T] 735 | attention_probs = tf.nn.softmax(attention_scores) 736 | 737 | # This is actually dropping out entire tokens to attend to, which might 738 | # seem a bit unusual, but is taken from the original Transformer paper. 739 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 740 | 741 | # `value_layer` = [B, T, N, H] 742 | value_layer = tf.reshape( 743 | value_layer, 744 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 745 | 746 | # `value_layer` = [B, N, T, H] 747 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 748 | 749 | # `context_layer` = [B, N, F, H] 750 | context_layer = tf.matmul(attention_probs, value_layer) 751 | 752 | # `context_layer` = [B, F, N, H] 753 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 754 | 755 | if do_return_2d_tensor: 756 | # `context_layer` = [B*F, N*V] 757 | context_layer = tf.reshape( 758 | context_layer, 759 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 760 | else: 761 | # `context_layer` = [B, F, N*V] 762 | context_layer = tf.reshape( 763 | context_layer, 764 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 765 | 766 | return context_layer 767 | 768 | 769 | def transformer_model(input_tensor, 770 | attention_mask=None, 771 | hidden_size=768, 772 | num_hidden_layers=12, 773 | num_attention_heads=12, 774 | intermediate_size=3072, 775 | intermediate_act_fn=gelu, 776 | hidden_dropout_prob=0.1, 777 | attention_probs_dropout_prob=0.1, 778 | initializer_range=0.02, 779 | do_return_all_layers=False): 780 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 781 | 782 | This is almost an exact implementation of the original Transformer encoder. 783 | 784 | See the original paper: 785 | https://arxiv.org/abs/1706.03762 786 | 787 | Also see: 788 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 789 | 790 | Args: 791 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 792 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 793 | seq_length], with 1 for positions that can be attended to and 0 in 794 | positions that should not be. 795 | hidden_size: int. Hidden size of the Transformer. 796 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 797 | num_attention_heads: int. Number of attention heads in the Transformer. 798 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 799 | forward) layer. 800 | intermediate_act_fn: function. 
The non-linear activation function to apply 801 | to the output of the intermediate/feed-forward layer. 802 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 803 | attention_probs_dropout_prob: float. Dropout probability of the attention 804 | probabilities. 805 | initializer_range: float. Range of the initializer (stddev of truncated 806 | normal). 807 | do_return_all_layers: Whether to also return all layers or just the final 808 | layer. 809 | 810 | Returns: 811 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 812 | hidden layer of the Transformer. 813 | 814 | Raises: 815 | ValueError: A Tensor shape or parameter is invalid. 816 | """ 817 | if hidden_size % num_attention_heads != 0: 818 | raise ValueError( 819 | "The hidden size (%d) is not a multiple of the number of attention " 820 | "heads (%d)" % (hidden_size, num_attention_heads)) 821 | 822 | attention_head_size = int(hidden_size / num_attention_heads) 823 | input_shape = get_shape_list(input_tensor, expected_rank=3) 824 | batch_size = input_shape[0] 825 | seq_length = input_shape[1] 826 | input_width = input_shape[2] 827 | 828 | # The Transformer performs sum residuals on all layers so the input needs 829 | # to be the same as the hidden size. 830 | if input_width != hidden_size: 831 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 832 | (input_width, hidden_size)) 833 | 834 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 835 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 836 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 837 | # help the optimizer. 838 | prev_output = reshape_to_matrix(input_tensor) 839 | 840 | all_layer_outputs = [] 841 | for layer_idx in range(num_hidden_layers): 842 | with tf.variable_scope("layer_%d" % layer_idx): 843 | layer_input = prev_output 844 | 845 | with tf.variable_scope("attention"): 846 | attention_heads = [] 847 | with tf.variable_scope("self"): 848 | attention_head = attention_layer( 849 | from_tensor=layer_input, 850 | to_tensor=layer_input, 851 | attention_mask=attention_mask, 852 | num_attention_heads=num_attention_heads, 853 | size_per_head=attention_head_size, 854 | attention_probs_dropout_prob=attention_probs_dropout_prob, 855 | initializer_range=initializer_range, 856 | do_return_2d_tensor=True, 857 | batch_size=batch_size, 858 | from_seq_length=seq_length, 859 | to_seq_length=seq_length) 860 | attention_heads.append(attention_head) 861 | 862 | attention_output = None 863 | if len(attention_heads) == 1: 864 | attention_output = attention_heads[0] 865 | else: 866 | # In the case where we have other sequences, we just concatenate 867 | # them to the self-attention head before the projection. 868 | attention_output = tf.concat(attention_heads, axis=-1) 869 | 870 | # Run a linear projection of `hidden_size` then add a residual 871 | # with `layer_input`. 872 | with tf.variable_scope("output"): 873 | attention_output = tf.layers.dense( 874 | attention_output, 875 | hidden_size, 876 | kernel_initializer=create_initializer(initializer_range)) 877 | attention_output = dropout(attention_output, hidden_dropout_prob) 878 | attention_output = layer_norm(attention_output + layer_input) 879 | 880 | # The activation is only applied to the "intermediate" hidden layer. 
881 | with tf.variable_scope("intermediate"): 882 | intermediate_output = tf.layers.dense( 883 | attention_output, 884 | intermediate_size, 885 | activation=intermediate_act_fn, 886 | kernel_initializer=create_initializer(initializer_range)) 887 | 888 | # Down-project back to `hidden_size` then add the residual. 889 | with tf.variable_scope("output"): 890 | layer_output = tf.layers.dense( 891 | intermediate_output, 892 | hidden_size, 893 | kernel_initializer=create_initializer(initializer_range)) 894 | layer_output = dropout(layer_output, hidden_dropout_prob) 895 | layer_output = layer_norm(layer_output + attention_output) 896 | prev_output = layer_output 897 | all_layer_outputs.append(layer_output) 898 | 899 | # 是否是返回所有多头注意力的结果还是最后一层 900 | if do_return_all_layers: 901 | final_outputs = [] 902 | for layer_output in all_layer_outputs: 903 | final_output = reshape_from_matrix(layer_output, input_shape) 904 | final_outputs.append(final_output) 905 | return final_outputs 906 | else: 907 | final_output = reshape_from_matrix(prev_output, input_shape) 908 | return final_output 909 | 910 | 911 | def get_shape_list(tensor, expected_rank=None, name=None): 912 | """Returns a list of the shape of tensor, preferring static dimensions. 913 | 914 | Args: 915 | tensor: A tf.Tensor object to find the shape of. 916 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 917 | specified and the `tensor` has a different rank, and exception will be 918 | thrown. 919 | name: Optional name of the tensor for the error message. 920 | 921 | Returns: 922 | A list of dimensions of the shape of tensor. All static dimensions will 923 | be returned as python integers, and dynamic dimensions will be returned 924 | as tf.Tensor scalars. 925 | """ 926 | if name is None: 927 | name = tensor.name 928 | 929 | if expected_rank is not None: 930 | assert_rank(tensor, expected_rank, name) 931 | 932 | shape = tensor.shape.as_list() 933 | 934 | non_static_indexes = [] 935 | for (index, dim) in enumerate(shape): 936 | if dim is None: 937 | non_static_indexes.append(index) 938 | 939 | if not non_static_indexes: 940 | return shape 941 | 942 | dyn_shape = tf.shape(tensor) # tf.shape输出的tensor维度的张量 943 | for index in non_static_indexes: 944 | shape[index] = dyn_shape[index] 945 | return shape 946 | 947 | 948 | def reshape_to_matrix(input_tensor): 949 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 950 | ndims = input_tensor.shape.ndims 951 | if ndims < 2: 952 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 953 | (input_tensor.shape)) 954 | if ndims == 2: 955 | return input_tensor 956 | 957 | width = input_tensor.shape[-1] 958 | output_tensor = tf.reshape(input_tensor, [-1, width]) 959 | return output_tensor 960 | 961 | 962 | def reshape_from_matrix(output_tensor, orig_shape_list): 963 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 964 | if len(orig_shape_list) == 2: 965 | return output_tensor 966 | 967 | output_shape = get_shape_list(output_tensor) 968 | 969 | orig_dims = orig_shape_list[0:-1] 970 | width = output_shape[-1] 971 | 972 | return tf.reshape(output_tensor, orig_dims + [width]) 973 | 974 | 975 | def assert_rank(tensor, expected_rank, name=None): 976 | """Raises an exception if the tensor rank is not of the expected rank. 977 | 978 | Args: 979 | tensor: A tf.Tensor to check the rank of. 980 | expected_rank: Python integer or list of integers, expected rank. 981 | name: Optional name of the tensor for the error message. 
982 | 983 | Raises: 984 | ValueError: If the expected shape doesn't match the actual shape. 985 | """ 986 | if name is None: 987 | name = tensor.name 988 | 989 | expected_rank_dict = {} 990 | if isinstance(expected_rank, six.integer_types): 991 | expected_rank_dict[expected_rank] = True 992 | else: 993 | for x in expected_rank: 994 | expected_rank_dict[x] = True 995 | 996 | actual_rank = tensor.shape.ndims 997 | if actual_rank not in expected_rank_dict: 998 | scope_name = tf.get_variable_scope().name 999 | raise ValueError( 1000 | "For the tensor `%s` in scope `%s`, the actual rank " 1001 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 1002 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 1003 | --------------------------------------------------------------------------------