├── conf.py
├── layers_keras.py
├── readme.md
└── extract_sen_vec.py

/conf.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/5/10 9:13
# @author   :Mo
# @function :paths of FeatureProject

import os

# path of the BERT model
file_path = os.path.dirname(__file__)
# file_path = file_path.replace('conf', '') + 'Data'
model_dir = os.path.join(file_path, 'chinese_L-12_H-768_A-12/')
config_name = os.path.join(model_dir, 'bert_config.json')
ckpt_name = os.path.join(model_dir, 'bert_model.ckpt')
vocab_file = os.path.join(model_dir, 'vocab.txt')
# fraction of GPU memory to use
gpu_memory_fraction = 0.2
# by default, take the output of the second-to-last layer as the sentence vector
layer_indexes = [-2]
# maximum sequence length; for short single texts, consider lowering this value
max_seq_len = 64
--------------------------------------------------------------------------------
/layers_keras.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/5/10 10:49
# @author   :Mo
# @function :Keras layer used with keras-bert to expose the [-2] layer output

from keras.engine import Layer


class NonMaskingLayer(Layer):
    """
    fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978
    thanks for https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(NonMaskingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        return x

    def compute_output_shape(self, input_shape):
        return input_shape
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# BERT pre-trained models

Google's BERT pre-trained models:

- [BERT-Large, Uncased (Whole Word Masking)](https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip): 24-layer, 1024-hidden, 16-heads, 340M parameters
- [BERT-Large, Cased (Whole Word Masking)](https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip): 24-layer, 1024-hidden, 16-heads, 340M parameters
- [BERT-Base, Uncased](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip): 12-layer, 768-hidden, 12-heads, 110M parameters
- [BERT-Large, Uncased](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip): 24-layer, 1024-hidden, 16-heads, 340M parameters
- [BERT-Base, Cased](https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip): 12-layer, 768-hidden, 12-heads, 110M parameters
- [BERT-Large, Cased](https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip): 24-layer, 1024-hidden, 16-heads, 340M parameters
- [BERT-Base, Multilingual Cased (New, recommended)](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip): 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
- [BERT-Base, Multilingual Uncased (Orig, not recommended; use Multilingual Cased instead)](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip): 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
- [BERT-Base, Chinese](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip): Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters

Usage:

1. Download the pre-trained model you need.
2. Set the paths in conf.py.
3. Use gen_sen_vec() in extract_sen_vec.py to generate sentence vectors and gen_words_vec() to generate word vectors (see the usage sketch below).
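
A minimal usage sketch (assuming the chosen checkpoint has been unpacked into the `chinese_L-12_H-768_A-12/` directory that conf.py points at, and that keras, keras-bert, and TensorFlow 1.x are installed):

```python
# Usage sketch for the two module-level helpers in extract_sen_vec.py.
from extract_sen_vec import gen_sen_vec, gen_words_vec

sentence = '今天天气不错'
sen_vec = gen_sen_vec(sentence)      # one 768-dim vector for the whole sentence
word_vecs = gen_words_vec(sentence)  # one 768-dim vector per token, without [CLS]/[SEP]

print(sen_vec.shape)   # (768,)
print(len(word_vecs))  # number of tokens in the sentence
```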
--------------------------------------------------------------------------------
/extract_sen_vec.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/5/8 20:04
# @author   :Mo
# @function :extract BERT features with Keras

import codecs
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import keras.backend.tensorflow_backend as ktf_keras
import numpy as np
import tensorflow as tf

from keras.layers import Add
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

from layers_keras import NonMaskingLayer
from conf import gpu_memory_fraction, config_name, ckpt_name, vocab_file, max_seq_len, layer_indexes

# module-level globals, so the model can be called from Django, Flask, Tornado, etc.
graph = None
model = None

# GPU configuration and memory usage
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
config = tf.ConfigProto(allow_soft_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
# Note: this uses the TF 1.x session API; under TensorFlow 2.x use the tf.compat.v1 equivalents.
sess = tf.Session(config=config)
ktf_keras.set_session(sess)


class KerasBertVector():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # globals, so the model can be called from Django, Flask, Tornado, etc.
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        # print(model.output)
        # print(len(model.layers))
        # lay = model.layers
        # The loaded model has 104 layers in total: the first 8 are the token/position/segment
        # embeddings etc., and each of the 12 transformer blocks contributes 4 named layers
        # (MultiHeadAttention, Dropout, Add, LayerNormalization).
        layer_dict = [7]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 4
            layer_dict.append(layer_0)
        # no index given: output the model as-is
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # a single index: take that block's output; fall back to the second-to-last block if the index is invalid
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(12)]:
                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0] - 1]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-2]).output
        # otherwise iterate over the requested layers, take each layer's output, and sum them
        else:
            # layer_indexes must be in [1, 2, 3, ..., 12]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(12)]
                          else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last block
                          for lay in layer_indexes]
            # print(layer_indexes)
            # print(all_layers)
            # note: the output of layer index 1 has a different format (its input is a list)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
        # print(encoder_layer.shape)
        # print("KerasBertEmbedding:")
        # print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # read the vocabulary and build the tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)

    def bert_encode_sen(self, texts):
        # text preprocessing
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            # print(text)
            tokens_text = self.tokenizer.tokenize(text)
            # print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # use the global graph so the model can be called from Django, Flask, Tornado, etc.
        with graph.as_default():
            predicts = model.predict([input_ids, input_type_ids], batch_size=1)
            # print(predicts.shape)
            # for i, token in enumerate(tokens_text):
            #     (token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # pooling: masked mean over the sequence, following https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)

        pools = []
        for i in range(len(predicts)):
            pred = predicts[i]
            masks = input_masks.tolist()
            mask_np = np.array([masks[i]])
            pooled = masked_reduce_mean(pred, mask_np)
            pooled = pooled.tolist()
            pools.append(pooled[0])
        # print('bert:', pools)
        return pools

    def bert_encode_word(self, texts):
        # text preprocessing
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            # print(text)
            tokens_text = self.tokenizer.tokenize(text)
            # print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # use the global graph so the model can be called from Django, Flask, Tornado, etc.
        with graph.as_default():
            predicts = model.predict([input_ids, input_type_ids], batch_size=1)
            # print(predicts.shape)
            # for i, token in enumerate(tokens_text):
            #     (token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())
        # drop the [CLS] and [SEP] positions; keep one vector per token
        words_vec = predicts[0][1:len(tokens_text) - 1]
        words_vec = np.array(words_vec)
        words_vec = words_vec.astype(np.float32)
        ret = []
        for i in words_vec:
            ret.append(i)
        return ret


bert_vector = KerasBertVector()
# if __name__ == "__main__":
def gen_sen_vec(sen):
    pooled = bert_vector.bert_encode_sen([sen])
    vec = pooled[0]
    vec = np.array(vec)
    return vec


def gen_words_vec(sen):
    pooled = bert_vector.bert_encode_word([sen])
    return pooled

#
# vec = gen_sen_vec('how are you')
# vec = np.array(vec)
# print(vec)
# vec = vec.astype(np.float64)
# vec.tolist()
# print(vec)
# print(pooled)
# while True:
#     print("input:")
#     ques = input()
#     print(bert_vector.bert_encode_sen([ques]))
--------------------------------------------------------------------------------
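
The globals `graph` and `model` exist so the extractor can be reused from a long-running web service (Django, Flask, Tornado), as the comments in extract_sen_vec.py note. A minimal Flask sketch of that pattern, assuming Flask is installed and extract_sen_vec.py can load the checkpoint at import time (the route name and port are illustrative, not part of the repository):

```python
# Illustrative serving sketch: wraps gen_sen_vec() in a Flask endpoint.
from flask import Flask, jsonify, request

from extract_sen_vec import gen_sen_vec

app = Flask(__name__)


@app.route('/sen_vec', methods=['POST'])
def sen_vec():
    # expects a JSON body like {"text": "今天天气不错"}
    text = request.json.get('text', '')
    vec = gen_sen_vec(text)  # numpy array of shape (768,)
    return jsonify({'vec': vec.tolist()})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
```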