├── image ├── 01.png ├── 02.png ├── 03.png ├── 04.png ├── 05.png ├── 06.png ├── 07.png ├── 08.png ├── 09.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 20.png ├── 21.png ├── 22.png ├── 23.png ├── 24.png └── equation_1.png ├── README.md ├── Bert模型源码解析.md └── modeling.py /image/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/01.png -------------------------------------------------------------------------------- /image/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/02.png -------------------------------------------------------------------------------- /image/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/03.png -------------------------------------------------------------------------------- /image/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/04.png -------------------------------------------------------------------------------- /image/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/05.png -------------------------------------------------------------------------------- /image/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/06.png -------------------------------------------------------------------------------- /image/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/07.png -------------------------------------------------------------------------------- /image/08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/08.png -------------------------------------------------------------------------------- /image/09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/09.png -------------------------------------------------------------------------------- /image/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/10.png -------------------------------------------------------------------------------- /image/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/11.png -------------------------------------------------------------------------------- /image/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/12.png 
-------------------------------------------------------------------------------- /image/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/13.png -------------------------------------------------------------------------------- /image/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/14.png -------------------------------------------------------------------------------- /image/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/15.png -------------------------------------------------------------------------------- /image/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/16.png -------------------------------------------------------------------------------- /image/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/17.png -------------------------------------------------------------------------------- /image/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/18.png -------------------------------------------------------------------------------- /image/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/19.png -------------------------------------------------------------------------------- /image/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/20.png -------------------------------------------------------------------------------- /image/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/21.png -------------------------------------------------------------------------------- /image/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/22.png -------------------------------------------------------------------------------- /image/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/23.png -------------------------------------------------------------------------------- /image/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/24.png -------------------------------------------------------------------------------- /image/equation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1234560o/Bert-model-code-interpretation/HEAD/image/equation_1.png 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Bert-model-code-interpretation
A walkthrough of the data flow in modeling.py from the TensorFlow implementation of BERT
* [BERT model source code walkthrough](./Bert模型源码解析.md)
* [Article on Jianshu](https://www.jianshu.com/p/2a3872148766)

--------------------------------------------------------------------------------
/Bert模型源码解析.md:
--------------------------------------------------------------------------------
## BERT TensorFlow source code walkthrough (Transformer Encoder computations in detail)

### Contents

* [Preface](#preface)
* [Model input](#model-input)
* [Padding_Mask](#padding_mask)
* [attention_layer](#attention_layer)
* [transformer_model](#transformer_model)
* [Bert_model class](#bert_model-class)
* [Afterword](#afterword)


### Preface

The basics of the BERT model are not covered here; please refer to other articles. This page collects many articles that explain BERT:

[http://www.52nlp.cn/bert-paper-论文-文章-代码资源汇总](http://www.52nlp.cn/bert-paper-%E8%AE%BA%E6%96%87-%E6%96%87%E7%AB%A0-%E4%BB%A3%E7%A0%81%E8%B5%84%E6%BA%90%E6%B1%87%E6%80%BB)

Unlike most articles, this one reads through the model portion of the BERT source code in detail and traces every change the data undergoes from the model's input to its output. This is extremely helpful for understanding BERT, and especially for modifying it. **Please note: before reading on, you should already have a rough understanding of the Transformer and of BERT. This article goes straight into the computational details of the source code and does not cover the basics.** Let us first review the overall model structure:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/01.png?raw=true)

BERT uses the encoder part of the Transformer (see the figure above); the differences are that BERT adds a segment_embedding to the input and that a few model details differ slightly. With that, let's go straight into the source. The model portion of the BERT source code is at:

[https://github.com/google-research/bert/blob/master/modeling.py](https://github.com/google-research/bert/blob/master/modeling.py).



### Model input

BERT's input is the sum of three parts: token_embedding, segment_embedding and position_embedding. They encode, respectively, the token's vector representation, which sentence the token belongs to, and the token's position:

![BERT input](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/02.png?raw=true)

The input is produced by the following two functions:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/03.png?raw=true)

embedding_lookup produces the token_embedding, and embedding_postprocessor adds the three input embeddings together. Note that embedding_postprocessor applies layer normalization and dropout before returning its result:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/04.png?raw=true)



### Padding_Mask

Because input sentences have different lengths, BERT pads them to a fixed length, marking the padded positions with 0 and the real tokens with 1. When attention is computed, the padded positions then receive attention weights close to zero, so that the padding influences the model as little as possible:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/05.png?raw=true)

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/06.png?raw=true)
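As a quick illustration, here is a minimal NumPy sketch of how this 2D padding mask becomes the 3D attention_mask used later. The real code is create_attention_mask_from_input_mask further down in modeling.py; the NumPy version below, with made-up toy values, is only meant to make the shapes concrete:

```python
import numpy as np

# Toy batch: 2 sequences padded to length 4; 1 = real token, 0 = padding.
input_mask = np.array([[1, 1, 1, 0],
                       [1, 1, 0, 0]], dtype=np.float32)                  # [B, T]
batch_size, seq_length = input_mask.shape

# Same idea as create_attention_mask_from_input_mask:
# broadcast a [B, F, 1] tensor of ones against the [B, 1, T] mask.
broadcast_ones = np.ones((batch_size, seq_length, 1), dtype=np.float32)  # [B, F, 1]
attention_mask = broadcast_ones * input_mask[:, None, :]                 # [B, F, T]

# Later, attention_layer turns this into an additive mask on the logits:
# 0 where a position may be attended to, -10000 where it is padding.
adder = (1.0 - attention_mask) * -10000.0

print(attention_mask.shape)  # (2, 4, 4)
print(adder[0, 0])           # last position of sequence 0 gets -10000
```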
### attention_layer

To make the data flow easier to analyze, abbreviate the tensor dimensions as follows:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/07.png?raw=true)

With this notation, the tensor that enters BERT after the embedding layer has shape [B, F, embedding_size], and attention_mask has shape [B, F, T]. Since BERT uses self-attention, F and T are equal. Next I walk through the attention_layer function in detail; it implements BERT's multi-head attention and is the most involved part of the model. You can follow along in the source code for the full details. Before diving in, it also helps to understand the Transformer model proposed by Google in 2017; Jay Alammar's blog post The Illustrated Transformer visualizes it and makes the whole mechanism very easy to grasp. BERT uses the encoder part of the Transformer, and its attention is purely self-attention, which can be viewed as the special case Q = K. That is why attention_layer takes the two arguments from_tensor and to_tensor: one provides Q, the other provides K and V (the meaning of Q, K and V is not explained here; see any article on the Transformer).

attention_layer first defines the helper function transpose_for_scores:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/08.png?raw=true)

Its job is to split the attention inputs (Q, K, V) into tensors of shape [B, N, F or T, H]. Recall from the Transformer that Q, K and V are each obtained from the input word vectors by a linear transformation. Before that linear (MLP) layer, input_tensor (shape [B, F, embedding_size]) is reshaped to 2D (in fact, the caller transformer_model already passes in a 2D tensor, as the next section shows):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/09.png?raw=true)

Next comes the MLP layer: three different linear transformations of input_tensor produce Q, K and V, whose dimensions are then rearranged to give the final Q, K and V:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/10.png?raw=true)

The MLP layer maps [B * F, embedding_size] to [B * F, N * H]. As the later code (transformer_model) shows, embedding_size equals hidden_size equals N * H, so this layer does not actually change the width, which can be confusing at first:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/11.png?raw=true)

The code then uses transpose_for_scores to obtain Q, K and V with shapes [B, N, F, H], [B, N, T, H] and [B, N, T, H]. Oddly, V is not produced by calling transpose_for_scores; the body of that function is simply written out again inline (:joy:).

Now that we have Q, K and V, recall the attention formula from "Attention is all you need":

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/equation_1.png?raw=true)

The next block computes attention_scores, which is the quantity inside the softmax. A quick note on tf.matmul: it performs an ordinary matrix multiplication over the last two dimensions and treats all leading dimensions as a batch, so the two operands must agree on the leading dimensions while the last two only need to satisfy the usual matrix-multiplication rule. If you think through the attention computation, this is exactly what is needed. The resulting attention_scores has shape [B, N, F, T]. Looking only at the last two dimensions (i.e., a single example and a single head), attention_scores holds the weight coefficients produced by Q acting on K (before softmax); since Q and K have lengths F and T, there are F * T such coefficients:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/12.png?raw=true)

Now comes a key step: masking out the padded positions (**this is completely different from the masking used in BERT's masked-token pre-training task; see other articles for that — here we only discuss the model architecture**):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/13.png?raw=true)

The attention_mask obtained earlier has shape [B, F, T]. To make the addition work, a dimension is first inserted at axis 1 (the second dimension; the first is axis=0), giving [B, 1, F, T]. Thanks to **broadcasting**, this can then be added to attention_scores: masked positions get -10000.0 added, unmasked positions get 0. The mask is applied before the softmax on purpose: adding -10000.0 to a logit drives its softmax probability to essentially zero, which is exactly how the padded positions are removed from the attention distribution. After the mask comes the softmax, followed by dropout:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/14.png?raw=true)

The post-softmax weights are then multiplied by V, giving a tensor of shape [B, N, F, H], which is transposed at dimensions 1 and 2 into [B, F, N, H]. The function can return this in either of two shapes:

1. [B * F, N * H] (*the comment in the source writes `N*V` here; the `V` appears to be a typo for `H`, i.e. size_per_head*)
2. [B, F, N * H]

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/15.png?raw=true)

That completes the walk through multi-head attention, the most complex data transformation in BERT. The next function, transformer_model, assembles the full model.
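Before moving on, here is a small, self-contained NumPy sketch that retraces the shape flow of this section (toy sizes, random weights, and a stand-in for tf.layers.dense; it mirrors the steps of attention_layer but is not the TensorFlow code itself):

```python
import numpy as np

# Toy sizes, chosen only to trace the shapes: B=2, F=T=4, N=3, H=5.
B, F, T, N, H = 2, 4, 4, 3, 5
rng = np.random.default_rng(0)

from_tensor_2d = rng.normal(size=(B * F, N * H)).astype(np.float32)   # [B*F, N*H]
attention_mask = np.ones((B, F, T), dtype=np.float32)                  # [B, F, T]
attention_mask[:, :, -1] = 0.0                                         # pretend the last position is padding

def dense(x, out_dim):
    """Stand-in for tf.layers.dense: a random linear projection."""
    w = rng.normal(size=(x.shape[-1], out_dim)).astype(np.float32)
    return x @ w

def transpose_for_scores(x, batch, heads, seq, width):
    # [B*seq, N*H] -> [B, seq, N, H] -> [B, N, seq, H]
    return x.reshape(batch, seq, heads, width).transpose(0, 2, 1, 3)

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Self-attention: Q, K and V all come from the same 2D input.
query = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, F, H)  # [B, N, F, H]
key   = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, T, H)  # [B, N, T, H]
value = transpose_for_scores(dense(from_tensor_2d, N * H), B, N, T, H)  # [B, N, T, H]

scores = query @ key.transpose(0, 1, 3, 2) / np.sqrt(H)                 # [B, N, F, T]
scores += (1.0 - attention_mask[:, None, :, :]) * -10000.0              # broadcast the [B, 1, F, T] mask
probs = softmax(scores)                                                 # [B, N, F, T]

context = probs @ value                                                 # [B, N, F, H]
context = context.transpose(0, 2, 1, 3).reshape(B * F, N * H)           # [B*F, N*H]
print(context.shape)   # (8, 15)
```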
### transformer_model

Next I go through transformer_model, the function that ties all the Transformer Encoder components together. A picture helps a lot here; let's call the structure in the figure below a transformer block:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/16.png?raw=true)

The whole BERT model is simply num_hidden_layers of these structures chained in series, i.e. num_hidden_layers transformer blocks. The self-attention part was covered in the previous function, and the remaining pieces are familiar components (residual connections, MLP, layer norm). transformer_model first prepares the input word vectors and then enters a loop, where each iteration is one block:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/17.png?raw=true)

The screenshot above does not contain the full loop body; we will walk through it step by step. Clearly, the output of one transformer block is fed in as the input of the next. What, then, is the input of the first block? It is input_tensor, the sum of the three input embeddings described earlier. Whether the dimensions line up at every block, and whether the computation is correct, becomes clear from the code that follows. The variable all_layer_outputs collects the output of every block, and the parameter do_return_all_layers selects whether to return every block's output or only the last one. transformer_model calls attention_layer with 2D inputs of shape [B * F or B * T, hidden_size]; as we saw when reading attention_layer, it accepts 2D tensors:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/18.png?raw=true)

As for why the code below keeps a list called attention_heads, I do not know the reason; it looks redundant here. The explanation given in the source comments is:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/19.png?raw=true)

Recall that attention_layer returns a tensor of shape [B * F, N * H] or [B, F, N * H]. Here it is called with do_return_2d_tensor set to True, so attention_output has shape [B * F, N * H]. It then goes through another MLP layer (which does not change the width, because hidden_size = N * H), dropout, and layer_norm:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/20.png?raw=true)

At this point attention_output still has shape [B * F, N * H (= hidden_size)]. As the figure above shows, what follows is another MLP + dropout + layer_norm, except that the width of this MLP, intermediate_size, is a hyperparameter that can be chosen freely:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/21.png?raw=true)

As the code in the screenshot shows, two MLP layers are applied: the shape goes from [B * F, hidden_size] to [B * F, intermediate_size] and back to [B * F, hidden_size], and the following dropout and layer_norm leave it unchanged. That completes one transformer block. Its output, layer_output, becomes the input of the next block, with the same shape as the input of the model's first block; the loop repeats num_hidden_layers times, and the final block's output layer_output still has shape [B * F, hidden_size].

When returning, reshape_from_matrix restores each block's output to the same shape as input_shape, i.e. the shape of the original word-vector input input_tensor ([batch_size, seq_length, hidden_size]):

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/22.png?raw=true)



### Bert_model class

For convenience of training, the whole pipeline is wrapped in the BertModel class, and the intermediate results can be accessed through its instances; see the code for the details. After working through the functions above there is nothing complicated left — the class simply puts the pieces together. self.all_encoder_layers holds the per-block outputs returned by transformer_model, and self.sequence_output is the output of the last block; from the analysis above its shape is [batch_size, seq_length, hidden_size], the same as the word-vector input at the start, except that it has now passed through the Transformer Encoder and carries the extracted features — the representation BERT is after:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/23.png?raw=true)

After this step, the member self.pooled_output stores the output of the first position passed through one more MLP layer. If you know the input format, you know this position is the [CLS] token, whose output is used during BERT pre-training to judge the sentence-pair (next-sentence) relationship:

![](https://github.com/1234560o/Bert-model-code-interpretation/blob/master/image/24.png?raw=true)

Besides pre-training, this result can also be used when fine-tuning BERT for classification tasks; see:

[https://www.jianshu.com/p/22e462f01d8c](https://www.jianshu.com/p/22e462f01d8c)
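For quick orientation, here is a minimal usage sketch in the spirit of the example in BertModel's docstring. It assumes a TensorFlow 1.x environment in which this modeling.py is importable as `modeling`, the input ids are made-up toy values, and the config is deliberately tiny (note that hidden_size must be divisible by num_attention_heads):

```python
import tensorflow as tf
import modeling

# Toy inputs that have already been converted to WordPiece ids (hypothetical values).
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 0, 0]])

# A deliberately small config so the example is cheap to build.
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)

model = modeling.BertModel(config=config, is_training=False,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

sequence_output = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]
pooled_output = model.get_pooled_output()      # [batch_size, hidden_size], from the [CLS] position
```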
### Afterword

There are bound to be typos, misunderstandings, or unclear explanations in this article; please bear with them, and you are very welcome to point them out so that we can learn from each other.

--------------------------------------------------------------------------------
/modeling.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The main BERT model and related functions."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import copy
23 | import json
24 | import math
25 | import re
26 | import six
27 | import tensorflow as tf
28 | 
29 | 
30 | class BertConfig(object):
31 |   """Configuration for `BertModel`."""
32 | 
33 |   def __init__(self,
34 |                vocab_size,
35 |                hidden_size=768,
36 |                num_hidden_layers=12,
37 |                num_attention_heads=12,
38 |                intermediate_size=3072,
39 |                hidden_act="gelu",
40 |                hidden_dropout_prob=0.1,
41 |                attention_probs_dropout_prob=0.1,
42 |                max_position_embeddings=512,
43 |                type_vocab_size=16,
44 |                initializer_range=0.02):
45 |     """Constructs BertConfig.
46 | 
47 |     Args:
48 |       vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
49 |       hidden_size: Size of the encoder layers and the pooler layer.
50 |       num_hidden_layers: Number of hidden layers in the Transformer encoder.
51 |       num_attention_heads: Number of attention heads for each attention layer in
52 |         the Transformer encoder.
53 |       intermediate_size: The size of the "intermediate" (i.e., feed-forward)
54 |         layer in the Transformer encoder.
55 |       hidden_act: The non-linear activation function (function or string) in the
56 |         encoder and pooler.
57 |       hidden_dropout_prob: The dropout probability for all fully connected
58 |         layers in the embeddings, encoder, and pooler.
59 |       attention_probs_dropout_prob: The dropout ratio for the attention
60 |         probabilities.
61 |       max_position_embeddings: The maximum sequence length that this model might
62 |         ever be used with. Typically set this to something large just in case
63 |         (e.g., 512 or 1024 or 2048).
64 |       type_vocab_size: The vocabulary size of the `token_type_ids` passed into
65 |         `BertModel`.
66 |       initializer_range: The stdev of the truncated_normal_initializer for
67 |         initializing all weight matrices.
68 | """ 69 | self.vocab_size = vocab_size 70 | self.hidden_size = hidden_size 71 | self.num_hidden_layers = num_hidden_layers 72 | self.num_attention_heads = num_attention_heads 73 | self.hidden_act = hidden_act 74 | self.intermediate_size = intermediate_size 75 | self.hidden_dropout_prob = hidden_dropout_prob 76 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 77 | self.max_position_embeddings = max_position_embeddings 78 | self.type_vocab_size = type_vocab_size 79 | self.initializer_range = initializer_range 80 | 81 | @classmethod 82 | def from_dict(cls, json_object): 83 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 84 | config = BertConfig(vocab_size=None) 85 | for (key, value) in six.iteritems(json_object): 86 | config.__dict__[key] = value 87 | return config 88 | 89 | @classmethod 90 | def from_json_file(cls, json_file): 91 | """Constructs a `BertConfig` from a json file of parameters.""" 92 | with tf.gfile.GFile(json_file, "r") as reader: 93 | text = reader.read() 94 | return cls.from_dict(json.loads(text)) 95 | 96 | def to_dict(self): 97 | """Serializes this instance to a Python dictionary.""" 98 | output = copy.deepcopy(self.__dict__) 99 | return output 100 | 101 | def to_json_string(self): 102 | """Serializes this instance to a JSON string.""" 103 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 104 | 105 | 106 | class BertModel(object): 107 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 108 | 109 | Example usage: 110 | 111 | ```python 112 | # Already been converted into WordPiece token ids 113 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 114 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 115 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 116 | 117 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 118 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 119 | 120 | model = modeling.BertModel(config=config, is_training=True, 121 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 122 | 123 | label_embeddings = tf.get_variable(...) 124 | pooled_output = model.get_pooled_output() 125 | logits = tf.matmul(pooled_output, label_embeddings) 126 | ... 127 | ``` 128 | """ 129 | 130 | def __init__(self, 131 | config, 132 | is_training, 133 | input_ids, 134 | input_mask=None, 135 | token_type_ids=None, 136 | use_one_hot_embeddings=True, 137 | scope=None): 138 | """Constructor for BertModel. 139 | 140 | Args: 141 | config: `BertConfig` instance. 142 | is_training: bool. rue for training model, false for eval model. Controls 143 | whether dropout will be applied. 144 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 145 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 146 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 147 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 148 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 149 | it is must faster if this is True, on the CPU or GPU, it is faster if 150 | this is False. 151 | scope: (optional) variable scope. Defaults to "bert". 152 | 153 | Raises: 154 | ValueError: The config is invalid or one of the input tensor shapes 155 | is invalid. 
156 | """ 157 | config = copy.deepcopy(config) 158 | if not is_training: 159 | config.hidden_dropout_prob = 0.0 160 | config.attention_probs_dropout_prob = 0.0 161 | 162 | input_shape = get_shape_list(input_ids, expected_rank=2) 163 | batch_size = input_shape[0] 164 | seq_length = input_shape[1] 165 | 166 | if input_mask is None: 167 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 168 | 169 | if token_type_ids is None: 170 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 171 | 172 | with tf.variable_scope(scope, default_name="bert"): 173 | with tf.variable_scope("embeddings"): 174 | # Perform embedding lookup on the word ids. 175 | (self.embedding_output, self.embedding_table) = embedding_lookup( 176 | input_ids=input_ids, 177 | vocab_size=config.vocab_size, 178 | embedding_size=config.hidden_size, 179 | initializer_range=config.initializer_range, 180 | word_embedding_name="word_embeddings", 181 | use_one_hot_embeddings=use_one_hot_embeddings) 182 | 183 | # Add positional embeddings and token type embeddings, then layer 184 | # normalize and perform dropout. 185 | self.embedding_output = embedding_postprocessor( 186 | input_tensor=self.embedding_output, 187 | use_token_type=True, 188 | token_type_ids=token_type_ids, 189 | token_type_vocab_size=config.type_vocab_size, 190 | token_type_embedding_name="token_type_embeddings", 191 | use_position_embeddings=True, 192 | position_embedding_name="position_embeddings", 193 | initializer_range=config.initializer_range, 194 | max_position_embeddings=config.max_position_embeddings, 195 | dropout_prob=config.hidden_dropout_prob) 196 | 197 | with tf.variable_scope("encoder"): 198 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 199 | # mask of shape [batch_size, seq_length, seq_length] which is used 200 | # for the attention scores. 201 | # return元素都为1维度为[batch_szie, seq_length, seq_length]的矩阵 202 | attention_mask = create_attention_mask_from_input_mask( 203 | input_ids, input_mask) 204 | 205 | # Run the stacked transformer. 206 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 207 | self.all_encoder_layers = transformer_model( 208 | input_tensor=self.embedding_output, 209 | attention_mask=attention_mask, 210 | hidden_size=config.hidden_size, 211 | num_hidden_layers=config.num_hidden_layers, 212 | num_attention_heads=config.num_attention_heads, 213 | intermediate_size=config.intermediate_size, 214 | intermediate_act_fn=get_activation(config.hidden_act), 215 | hidden_dropout_prob=config.hidden_dropout_prob, 216 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 217 | initializer_range=config.initializer_range, 218 | do_return_all_layers=True) 219 | 220 | self.sequence_output = self.all_encoder_layers[-1] 221 | # The "pooler" converts the encoded sequence tensor of shape 222 | # [batch_size, seq_length, hidden_size] to a tensor of shape 223 | # [batch_size, hidden_size]. This is necessary for segment-level 224 | # (or segment-pair-level) classification tasks where we need a fixed 225 | # dimensional representation of the segment. 226 | with tf.variable_scope("pooler"): 227 | # We "pool" the model by simply taking the hidden state corresponding 228 | # to the first token. 
We assume that this has been pre-trained 229 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 230 | self.pooled_output = tf.layers.dense( 231 | first_token_tensor, 232 | config.hidden_size, 233 | activation=tf.tanh, 234 | kernel_initializer=create_initializer(config.initializer_range)) 235 | 236 | def get_pooled_output(self): 237 | return self.pooled_output 238 | 239 | def get_sequence_output(self): 240 | """Gets final hidden layer of encoder. 241 | 242 | Returns: 243 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 244 | to the final hidden of the transformer encoder. 245 | """ 246 | return self.sequence_output 247 | 248 | def get_all_encoder_layers(self): 249 | return self.all_encoder_layers 250 | 251 | def get_embedding_output(self): 252 | """Gets output of the embedding lookup (i.e., input to the transformer). 253 | 254 | Returns: 255 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 256 | to the output of the embedding layer, after summing the word 257 | embeddings with the positional embeddings and the token type embeddings, 258 | then performing layer normalization. This is the input to the transformer. 259 | """ 260 | return self.embedding_output 261 | 262 | def get_embedding_table(self): 263 | return self.embedding_table 264 | 265 | 266 | def gelu(input_tensor): 267 | """Gaussian Error Linear Unit. 268 | 269 | This is a smoother version of the RELU. 270 | Original paper: https://arxiv.org/abs/1606.08415 271 | 272 | Args: 273 | input_tensor: float Tensor to perform activation. 274 | 275 | Returns: 276 | `input_tensor` with the GELU activation applied. 277 | """ 278 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 279 | return input_tensor * cdf 280 | 281 | 282 | def get_activation(activation_string): 283 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 284 | 285 | Args: 286 | activation_string: String name of the activation function. 287 | 288 | Returns: 289 | A Python function corresponding to the activation function. If 290 | `activation_string` is None, empty, or "linear", this will return None. 291 | If `activation_string` is not a string, it will return `activation_string`. 292 | 293 | Raises: 294 | ValueError: The `activation_string` does not correspond to a known 295 | activation. 296 | """ 297 | 298 | # We assume that anything that"s not a string is already an activation 299 | # function, so we just return it. 
300 | if not isinstance(activation_string, six.string_types): 301 | return activation_string 302 | 303 | if not activation_string: 304 | return None 305 | 306 | act = activation_string.lower() 307 | if act == "linear": 308 | return None 309 | elif act == "relu": 310 | return tf.nn.relu 311 | elif act == "gelu": 312 | return gelu 313 | elif act == "tanh": 314 | return tf.tanh 315 | else: 316 | raise ValueError("Unsupported activation: %s" % act) 317 | 318 | 319 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 320 | """Compute the union of the current variables and checkpoint variables.""" 321 | assignment_map = {} 322 | initialized_variable_names = {} 323 | 324 | name_to_variable = collections.OrderedDict() 325 | for var in tvars: 326 | name = var.name 327 | m = re.match("^(.*):\\d+$", name) 328 | if m is not None: 329 | name = m.group(1) 330 | name_to_variable[name] = var 331 | 332 | init_vars = tf.train.list_variables(init_checkpoint) 333 | 334 | assignment_map = collections.OrderedDict() 335 | for x in init_vars: 336 | (name, var) = (x[0], x[1]) 337 | if name not in name_to_variable: 338 | continue 339 | assignment_map[name] = name 340 | initialized_variable_names[name] = 1 341 | initialized_variable_names[name + ":0"] = 1 342 | 343 | return (assignment_map, initialized_variable_names) 344 | 345 | 346 | def dropout(input_tensor, dropout_prob): 347 | """Perform dropout. 348 | 349 | Args: 350 | input_tensor: float Tensor. 351 | dropout_prob: Python float. The probability of dropping out a value (NOT of 352 | *keeping* a dimension as in `tf.nn.dropout`). 353 | 354 | Returns: 355 | A version of `input_tensor` with dropout applied. 356 | """ 357 | if dropout_prob is None or dropout_prob == 0.0: 358 | return input_tensor 359 | 360 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 361 | return output 362 | 363 | 364 | def layer_norm(input_tensor, name=None): 365 | """Run layer normalization on the last dimension of the tensor.""" 366 | return tf.contrib.layers.layer_norm( 367 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 368 | 369 | 370 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 371 | """Runs layer normalization followed by dropout.""" 372 | output_tensor = layer_norm(input_tensor, name) 373 | output_tensor = dropout(output_tensor, dropout_prob) 374 | return output_tensor 375 | 376 | 377 | def create_initializer(initializer_range=0.02): 378 | """Creates a `truncated_normal_initializer` with the given range.""" 379 | return tf.truncated_normal_initializer(stddev=initializer_range) 380 | 381 | 382 | def embedding_lookup(input_ids, 383 | vocab_size, 384 | embedding_size=128, 385 | initializer_range=0.02, 386 | word_embedding_name="word_embeddings", 387 | use_one_hot_embeddings=False): 388 | """Looks up words embeddings for id tensor. 389 | 390 | Args: 391 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 392 | ids. 393 | vocab_size: int. Size of the embedding vocabulary. 394 | embedding_size: int. Width of the word embeddings. 395 | initializer_range: float. Embedding initialization range. 396 | word_embedding_name: string. Name of the embedding table. 397 | use_one_hot_embeddings: bool. If True, use one-hot method for word 398 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 399 | for TPUs. 400 | 401 | Returns: 402 | float Tensor of shape [batch_size, seq_length, embedding_size]. 
403 | """ 404 | # This function assumes that the input is of shape [batch_size, seq_length, 405 | # num_inputs]. 406 | # 407 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 408 | # reshape to [batch_size, seq_length, 1]. 409 | if input_ids.shape.ndims == 2: 410 | input_ids = tf.expand_dims(input_ids, axis=[-1]) 411 | 412 | embedding_table = tf.get_variable( 413 | name=word_embedding_name, 414 | shape=[vocab_size, embedding_size], 415 | initializer=create_initializer(initializer_range)) 416 | 417 | if use_one_hot_embeddings: 418 | flat_input_ids = tf.reshape(input_ids, [-1]) 419 | one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) 420 | output = tf.matmul(one_hot_input_ids, embedding_table) 421 | else: 422 | output = tf.nn.embedding_lookup(embedding_table, input_ids) 423 | # 此时output的维度为[batch_size, seq_length, 1, embedding_size] 424 | 425 | input_shape = get_shape_list(input_ids) 426 | 427 | # 将output变成[batch_size, seq_length, embedding_size]的维度 428 | output = tf.reshape(output, 429 | input_shape[0:-1] + [input_shape[-1] * embedding_size]) 430 | return (output, embedding_table) 431 | 432 | 433 | def embedding_postprocessor(input_tensor, 434 | use_token_type=False, 435 | token_type_ids=None, 436 | token_type_vocab_size=16, 437 | token_type_embedding_name="token_type_embeddings", 438 | use_position_embeddings=True, 439 | position_embedding_name="position_embeddings", 440 | initializer_range=0.02, 441 | max_position_embeddings=512, 442 | dropout_prob=0.1): 443 | """Performs various post-processing on a word embedding tensor. 444 | 445 | Args: 446 | input_tensor: float Tensor of shape [batch_size, seq_length, 447 | embedding_size]. 448 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 449 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 450 | Must be specified if `use_token_type` is True. 451 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 452 | token_type_embedding_name: string. The name of the embedding table variable 453 | for token type ids. 454 | use_position_embeddings: bool. Whether to add position embeddings for the 455 | position of each token in the sequence. 456 | position_embedding_name: string. The name of the embedding table variable 457 | for positional embeddings. 458 | initializer_range: float. Range of the weight initialization. 459 | max_position_embeddings: int. Maximum sequence length that might ever be 460 | used with this model. This can be longer than the sequence length of 461 | input_tensor, but cannot be shorter. 462 | dropout_prob: float. Dropout probability applied to the final output tensor. 463 | 464 | Returns: 465 | float tensor with same shape as `input_tensor`. 466 | 467 | Raises: 468 | ValueError: One of the tensor shapes or input values is invalid. 469 | """ 470 | input_shape = get_shape_list(input_tensor, expected_rank=3) 471 | batch_size = input_shape[0] 472 | seq_length = input_shape[1] 473 | width = input_shape[2] 474 | 475 | output = input_tensor 476 | 477 | if use_token_type: 478 | if token_type_ids is None: 479 | raise ValueError("`token_type_ids` must be specified if" 480 | "`use_token_type` is True.") 481 | token_type_table = tf.get_variable( 482 | name=token_type_embedding_name, 483 | shape=[token_type_vocab_size, width], 484 | initializer=create_initializer(initializer_range)) 485 | # This vocab will be small so we always do one-hot here, since it is always 486 | # faster for a small vocabulary. 
487 | flat_token_type_ids = tf.reshape(token_type_ids, [-1]) 488 | one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) 489 | token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) 490 | token_type_embeddings = tf.reshape(token_type_embeddings, 491 | [batch_size, seq_length, width]) 492 | output += token_type_embeddings 493 | 494 | if use_position_embeddings: 495 | assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) 496 | with tf.control_dependencies([assert_op]): 497 | full_position_embeddings = tf.get_variable( 498 | name=position_embedding_name, 499 | shape=[max_position_embeddings, width], 500 | initializer=create_initializer(initializer_range)) 501 | # Since the position embedding table is a learned variable, we create it 502 | # using a (long) sequence length `max_position_embeddings`. The actual 503 | # sequence length might be shorter than this, for faster training of 504 | # tasks that do not have long sequences. 505 | # 506 | # So `full_position_embeddings` is effectively an embedding table 507 | # for position [0, 1, 2, ..., max_position_embeddings-1], and the current 508 | # sequence has positions [0, 1, 2, ... seq_length-1], so we can just 509 | # perform a slice. 510 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], 511 | [seq_length, -1]) 512 | num_dims = len(output.shape.as_list()) 513 | 514 | # Only the last two dimensions are relevant (`seq_length` and `width`), so 515 | # we broadcast among the first dimensions, which is typically just 516 | # the batch size. 517 | position_broadcast_shape = [] 518 | for _ in range(num_dims - 2): 519 | position_broadcast_shape.append(1) 520 | position_broadcast_shape.extend([seq_length, width]) 521 | # 将position_embeddings变成和output相同的维度以便相加 522 | position_embeddings = tf.reshape(position_embeddings, 523 | position_broadcast_shape) 524 | output += position_embeddings 525 | 526 | # LN和droupout处理,这里的droupout为1-droupout_prob 527 | output = layer_norm_and_dropout(output, dropout_prob) 528 | return output 529 | 530 | 531 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 532 | """Create 3D attention mask from a 2D tensor mask. 533 | 534 | Args: 535 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 536 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 537 | 538 | Returns: 539 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 540 | """ 541 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 542 | batch_size = from_shape[0] 543 | from_seq_length = from_shape[1] 544 | 545 | to_shape = get_shape_list(to_mask, expected_rank=2) 546 | to_seq_length = to_shape[1] 547 | 548 | to_mask = tf.cast( 549 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 550 | 551 | # We don't assume that `from_tensor` is a mask (although it could be). We 552 | # don't actually care if we attend *from* padding tokens (only *to* padding) 553 | # tokens so we create a tensor of all ones. 554 | # 555 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 556 | broadcast_ones = tf.ones( 557 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 558 | 559 | # Here we broadcast along two dimensions to create the mask. 
560 | mask = broadcast_ones * to_mask 561 | 562 | return mask 563 | 564 | 565 | def attention_layer(from_tensor, 566 | to_tensor, 567 | attention_mask=None, 568 | num_attention_heads=1, 569 | size_per_head=512, 570 | query_act=None, 571 | key_act=None, 572 | value_act=None, 573 | attention_probs_dropout_prob=0.0, 574 | initializer_range=0.02, 575 | do_return_2d_tensor=False, 576 | batch_size=None, 577 | from_seq_length=None, 578 | to_seq_length=None): 579 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 580 | 581 | This is an implementation of multi-headed attention based on "Attention 582 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 583 | this is self-attention. Each timestep in `from_tensor` attends to the 584 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 585 | 586 | This function first projects `from_tensor` into a "query" tensor and 587 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 588 | of tensors of length `num_attention_heads`, where each tensor is of shape 589 | [batch_size, seq_length, size_per_head]. 590 | 591 | Then, the query and key tensors are dot-producted and scaled. These are 592 | softmaxed to obtain attention probabilities. The value tensors are then 593 | interpolated by these probabilities, then concatenated back to a single 594 | tensor and returned. 595 | 596 | In practice, the multi-headed attention are done with transposes and 597 | reshapes rather than actual separate tensors. 598 | 599 | Args: 600 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 601 | from_width]. 602 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 603 | attention_mask: (optional) int32 Tensor of shape [batch_size, 604 | from_seq_length, to_seq_length]. The values should be 1 or 0. The 605 | attention scores will effectively be set to -infinity for any positions in 606 | the mask that are 0, and will be unchanged for positions that are 1. 607 | num_attention_heads: int. Number of attention heads. 608 | size_per_head: int. Size of each attention head. 609 | query_act: (optional) Activation function for the query transform. 610 | key_act: (optional) Activation function for the key transform. 611 | value_act: (optional) Activation function for the value transform. 612 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 613 | attention probabilities. 614 | initializer_range: float. Range of the weight initializer. 615 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 616 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 617 | output will be of shape [batch_size, from_seq_length, num_attention_heads 618 | * size_per_head]. 619 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 620 | of the 3D version of the `from_tensor` and `to_tensor`. 621 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 622 | of the 3D version of the `from_tensor`. 623 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 624 | of the 3D version of the `to_tensor`. 625 | 626 | Returns: 627 | float Tensor of shape [batch_size, from_seq_length, 628 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 629 | true, this will be of shape [batch_size * from_seq_length, 630 | num_attention_heads * size_per_head]). 631 | 632 | Raises: 633 | ValueError: Any of the arguments or tensor shapes are invalid. 
634 | """ 635 | 636 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 637 | seq_length, width): 638 | output_tensor = tf.reshape( 639 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 640 | 641 | # 第二个维度变成num_attention_heads,相当于切割 642 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 643 | return output_tensor 644 | 645 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 646 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 647 | 648 | if len(from_shape) != len(to_shape): 649 | raise ValueError( 650 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 651 | 652 | if len(from_shape) == 3: 653 | batch_size = from_shape[0] 654 | from_seq_length = from_shape[1] 655 | to_seq_length = to_shape[1] 656 | elif len(from_shape) == 2: 657 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 658 | raise ValueError( 659 | "When passing in rank 2 tensors to attention_layer, the values " 660 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 661 | "must all be specified.") 662 | 663 | # Scalar dimensions referenced here: 664 | # B = batch size (number of sequences) 665 | # F = `from_tensor` sequence length 666 | # T = `to_tensor` sequence length 667 | # N = `num_attention_heads` 668 | # H = `size_per_head` 669 | 670 | # 转化成二维向量,第二个维度为width,第一个维度变成batch_size * from_seq_length 671 | from_tensor_2d = reshape_to_matrix(from_tensor) 672 | to_tensor_2d = reshape_to_matrix(to_tensor) 673 | 674 | # 对Q做线性映射,映射到-1维度为num_attention_heads * size_per_head 675 | # `query_layer` = [B*F, N*H] 676 | query_layer = tf.layers.dense( 677 | from_tensor_2d, 678 | num_attention_heads * size_per_head, 679 | activation=query_act, 680 | name="query", 681 | kernel_initializer=create_initializer(initializer_range)) 682 | 683 | # 对K做线性映射,映射到-1维度为num_attention_heads * size_per_head 684 | # `key_layer` = [B*T, N*H] 685 | key_layer = tf.layers.dense( 686 | to_tensor_2d, 687 | num_attention_heads * size_per_head, 688 | activation=key_act, 689 | name="key", 690 | kernel_initializer=create_initializer(initializer_range)) 691 | 692 | # 对V做线性映射,映射到-1维度为num_attention_heads * size_per_head 693 | # `value_layer` = [B*T, N*H] 694 | value_layer = tf.layers.dense( 695 | to_tensor_2d, 696 | num_attention_heads * size_per_head, 697 | activation=value_act, 698 | name="value", 699 | kernel_initializer=create_initializer(initializer_range)) 700 | 701 | # 此步从-1维度再经1,2为互换将Q或K或V变成 702 | # 维度batch_size * num_attention_heads * from_seq_length * size_per_head 703 | # `query_layer` = [B, N, F, H] 704 | query_layer = transpose_for_scores(query_layer, batch_size, 705 | num_attention_heads, from_seq_length, 706 | size_per_head) 707 | 708 | # `key_layer` = [B, N, T, H] 709 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 710 | to_seq_length, size_per_head) 711 | 712 | # Take the dot product between "query" and "key" to get the raw 713 | # attention scores. 
714 | # `attention_scores` = [B, N, F, T] 715 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 716 | attention_scores = tf.multiply(attention_scores, 717 | 1.0 / math.sqrt(float(size_per_head))) 718 | 719 | if attention_mask is not None: 720 | # `attention_mask` = [B, 1, F, T] 721 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 722 | 723 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 724 | # masked positions, this operation will create a tensor which is 0.0 for 725 | # positions we want to attend and -10000.0 for masked positions. 726 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 727 | 728 | # Since we are adding it to the raw scores before the softmax, this is 729 | # effectively the same as removing these entirely. 730 | # 这里维度为什么可以直接相加?????????????????广播机制!!!! 731 | attention_scores += adder 732 | 733 | # Normalize the attention scores to probabilities. 734 | # `attention_probs` = [B, N, F, T] 735 | attention_probs = tf.nn.softmax(attention_scores) 736 | 737 | # This is actually dropping out entire tokens to attend to, which might 738 | # seem a bit unusual, but is taken from the original Transformer paper. 739 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 740 | 741 | # `value_layer` = [B, T, N, H] 742 | value_layer = tf.reshape( 743 | value_layer, 744 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 745 | 746 | # `value_layer` = [B, N, T, H] 747 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 748 | 749 | # `context_layer` = [B, N, F, H] 750 | context_layer = tf.matmul(attention_probs, value_layer) 751 | 752 | # `context_layer` = [B, F, N, H] 753 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 754 | 755 | if do_return_2d_tensor: 756 | # `context_layer` = [B*F, N*V] 757 | context_layer = tf.reshape( 758 | context_layer, 759 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 760 | else: 761 | # `context_layer` = [B, F, N*V] 762 | context_layer = tf.reshape( 763 | context_layer, 764 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 765 | 766 | return context_layer 767 | 768 | 769 | def transformer_model(input_tensor, 770 | attention_mask=None, 771 | hidden_size=768, 772 | num_hidden_layers=12, 773 | num_attention_heads=12, 774 | intermediate_size=3072, 775 | intermediate_act_fn=gelu, 776 | hidden_dropout_prob=0.1, 777 | attention_probs_dropout_prob=0.1, 778 | initializer_range=0.02, 779 | do_return_all_layers=False): 780 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 781 | 782 | This is almost an exact implementation of the original Transformer encoder. 783 | 784 | See the original paper: 785 | https://arxiv.org/abs/1706.03762 786 | 787 | Also see: 788 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 789 | 790 | Args: 791 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 792 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 793 | seq_length], with 1 for positions that can be attended to and 0 in 794 | positions that should not be. 795 | hidden_size: int. Hidden size of the Transformer. 796 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 797 | num_attention_heads: int. Number of attention heads in the Transformer. 798 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 799 | forward) layer. 800 | intermediate_act_fn: function. 
The non-linear activation function to apply 801 | to the output of the intermediate/feed-forward layer. 802 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 803 | attention_probs_dropout_prob: float. Dropout probability of the attention 804 | probabilities. 805 | initializer_range: float. Range of the initializer (stddev of truncated 806 | normal). 807 | do_return_all_layers: Whether to also return all layers or just the final 808 | layer. 809 | 810 | Returns: 811 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 812 | hidden layer of the Transformer. 813 | 814 | Raises: 815 | ValueError: A Tensor shape or parameter is invalid. 816 | """ 817 | if hidden_size % num_attention_heads != 0: 818 | raise ValueError( 819 | "The hidden size (%d) is not a multiple of the number of attention " 820 | "heads (%d)" % (hidden_size, num_attention_heads)) 821 | 822 | attention_head_size = int(hidden_size / num_attention_heads) 823 | input_shape = get_shape_list(input_tensor, expected_rank=3) 824 | batch_size = input_shape[0] 825 | seq_length = input_shape[1] 826 | input_width = input_shape[2] 827 | 828 | # The Transformer performs sum residuals on all layers so the input needs 829 | # to be the same as the hidden size. 830 | if input_width != hidden_size: 831 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 832 | (input_width, hidden_size)) 833 | 834 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 835 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 836 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 837 | # help the optimizer. 838 | prev_output = reshape_to_matrix(input_tensor) 839 | 840 | all_layer_outputs = [] 841 | for layer_idx in range(num_hidden_layers): 842 | with tf.variable_scope("layer_%d" % layer_idx): 843 | layer_input = prev_output 844 | 845 | with tf.variable_scope("attention"): 846 | attention_heads = [] 847 | with tf.variable_scope("self"): 848 | attention_head = attention_layer( 849 | from_tensor=layer_input, 850 | to_tensor=layer_input, 851 | attention_mask=attention_mask, 852 | num_attention_heads=num_attention_heads, 853 | size_per_head=attention_head_size, 854 | attention_probs_dropout_prob=attention_probs_dropout_prob, 855 | initializer_range=initializer_range, 856 | do_return_2d_tensor=True, 857 | batch_size=batch_size, 858 | from_seq_length=seq_length, 859 | to_seq_length=seq_length) 860 | attention_heads.append(attention_head) 861 | 862 | attention_output = None 863 | if len(attention_heads) == 1: 864 | attention_output = attention_heads[0] 865 | else: 866 | # In the case where we have other sequences, we just concatenate 867 | # them to the self-attention head before the projection. 868 | attention_output = tf.concat(attention_heads, axis=-1) 869 | 870 | # Run a linear projection of `hidden_size` then add a residual 871 | # with `layer_input`. 872 | with tf.variable_scope("output"): 873 | attention_output = tf.layers.dense( 874 | attention_output, 875 | hidden_size, 876 | kernel_initializer=create_initializer(initializer_range)) 877 | attention_output = dropout(attention_output, hidden_dropout_prob) 878 | attention_output = layer_norm(attention_output + layer_input) 879 | 880 | # The activation is only applied to the "intermediate" hidden layer. 
881 | with tf.variable_scope("intermediate"): 882 | intermediate_output = tf.layers.dense( 883 | attention_output, 884 | intermediate_size, 885 | activation=intermediate_act_fn, 886 | kernel_initializer=create_initializer(initializer_range)) 887 | 888 | # Down-project back to `hidden_size` then add the residual. 889 | with tf.variable_scope("output"): 890 | layer_output = tf.layers.dense( 891 | intermediate_output, 892 | hidden_size, 893 | kernel_initializer=create_initializer(initializer_range)) 894 | layer_output = dropout(layer_output, hidden_dropout_prob) 895 | layer_output = layer_norm(layer_output + attention_output) 896 | prev_output = layer_output 897 | all_layer_outputs.append(layer_output) 898 | 899 | # 是否是返回所有多头注意力的结果还是最后一层 900 | if do_return_all_layers: 901 | final_outputs = [] 902 | for layer_output in all_layer_outputs: 903 | final_output = reshape_from_matrix(layer_output, input_shape) 904 | final_outputs.append(final_output) 905 | return final_outputs 906 | else: 907 | final_output = reshape_from_matrix(prev_output, input_shape) 908 | return final_output 909 | 910 | 911 | def get_shape_list(tensor, expected_rank=None, name=None): 912 | """Returns a list of the shape of tensor, preferring static dimensions. 913 | 914 | Args: 915 | tensor: A tf.Tensor object to find the shape of. 916 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 917 | specified and the `tensor` has a different rank, and exception will be 918 | thrown. 919 | name: Optional name of the tensor for the error message. 920 | 921 | Returns: 922 | A list of dimensions of the shape of tensor. All static dimensions will 923 | be returned as python integers, and dynamic dimensions will be returned 924 | as tf.Tensor scalars. 925 | """ 926 | if name is None: 927 | name = tensor.name 928 | 929 | if expected_rank is not None: 930 | assert_rank(tensor, expected_rank, name) 931 | 932 | shape = tensor.shape.as_list() 933 | 934 | non_static_indexes = [] 935 | for (index, dim) in enumerate(shape): 936 | if dim is None: 937 | non_static_indexes.append(index) 938 | 939 | if not non_static_indexes: 940 | return shape 941 | 942 | dyn_shape = tf.shape(tensor) # tf.shape输出的tensor维度的张量 943 | for index in non_static_indexes: 944 | shape[index] = dyn_shape[index] 945 | return shape 946 | 947 | 948 | def reshape_to_matrix(input_tensor): 949 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 950 | ndims = input_tensor.shape.ndims 951 | if ndims < 2: 952 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 953 | (input_tensor.shape)) 954 | if ndims == 2: 955 | return input_tensor 956 | 957 | width = input_tensor.shape[-1] 958 | output_tensor = tf.reshape(input_tensor, [-1, width]) 959 | return output_tensor 960 | 961 | 962 | def reshape_from_matrix(output_tensor, orig_shape_list): 963 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 964 | if len(orig_shape_list) == 2: 965 | return output_tensor 966 | 967 | output_shape = get_shape_list(output_tensor) 968 | 969 | orig_dims = orig_shape_list[0:-1] 970 | width = output_shape[-1] 971 | 972 | return tf.reshape(output_tensor, orig_dims + [width]) 973 | 974 | 975 | def assert_rank(tensor, expected_rank, name=None): 976 | """Raises an exception if the tensor rank is not of the expected rank. 977 | 978 | Args: 979 | tensor: A tf.Tensor to check the rank of. 980 | expected_rank: Python integer or list of integers, expected rank. 981 | name: Optional name of the tensor for the error message. 
982 | 983 | Raises: 984 | ValueError: If the expected shape doesn't match the actual shape. 985 | """ 986 | if name is None: 987 | name = tensor.name 988 | 989 | expected_rank_dict = {} 990 | if isinstance(expected_rank, six.integer_types): 991 | expected_rank_dict[expected_rank] = True 992 | else: 993 | for x in expected_rank: 994 | expected_rank_dict[x] = True 995 | 996 | actual_rank = tensor.shape.ndims 997 | if actual_rank not in expected_rank_dict: 998 | scope_name = tf.get_variable_scope().name 999 | raise ValueError( 1000 | "For the tensor `%s` in scope `%s`, the actual rank " 1001 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 1002 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 1003 | --------------------------------------------------------------------------------