├── README.md
├── margin_softmax.py
└── sent_sim.py

/README.md:
--------------------------------------------------------------------------------
# Keras sparse implementation of margin softmax

Keras implementations of AM-Softmax, AAM-Softmax, and related margin-based softmax losses.

A semantic similarity model built from a GRU encoder + AM-Softmax.

https://kexue.fm/archives/5743
--------------------------------------------------------------------------------
/margin_softmax.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import keras.backend as K


# plain sparse cross-entropy, taking logits as input
def sparse_logits_categorical_crossentropy(y_true, y_pred, scale=30):
    return K.sparse_categorical_crossentropy(y_true, scale * y_pred, from_logits=True)


# sparse AM-Softmax
def sparse_amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
    y_true = K.expand_dims(y_true[:, 0], 1)  # ensure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # ensure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target-class scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    y_true_pred_margin = y_true_pred - margin  # subtract the margin
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because the predictions are cosines in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # subtract exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ


# a simplified A-Softmax-style loss (m=4)
def sparse_simpler_asoftmax_loss(y_true, y_pred, scale=30):
    y_true = K.expand_dims(y_true[:, 0], 1)  # ensure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # ensure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target-class scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    # expand cos(4*theta) via the quadruple-angle formula
    y_true_pred_margin = 1 - 8 * K.square(y_true_pred) + 8 * K.square(K.square(y_true_pred))
    # the next line is equivalent to min(y_true_pred, y_true_pred_margin)
    y_true_pred_margin = y_true_pred_margin - K.relu(y_true_pred_margin - y_true_pred)
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because the predictions are cosines in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # subtract exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ
--------------------------------------------------------------------------------
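
For reference, the quantity `sparse_amsoftmax_loss` computes can be written in closed form. With $s$ = `scale`, $m$ = `margin`, $\cos\theta_j$ the $j$-th entry of `y_pred`, and $y$ the target class (notation mine, chosen to match the code above):

$$\mathcal{L} = -\log \frac{e^{s(\cos\theta_y - m)}}{e^{s(\cos\theta_y - m)} + \sum_{j \neq y} e^{s\cos\theta_j}} = -s(\cos\theta_y - m) + \log\Big(e^{s(\cos\theta_y - m)} + \sum_{j \neq y} e^{s\cos\theta_j}\Big)$$

The code reaches the right-hand form indirectly: `_Z` concatenates all scaled cosines with the margined target score, so its logsumexp equals $\log\big(\sum_j e^{s\cos\theta_j} + e^{s(\cos\theta_y - m)}\big)$, which counts the unmargined target term $e^{s\cos\theta_y}$ once too often; the `K.log(1 - K.exp(...))` line then removes that extra term in log space, which is numerically safer than assembling the sum directly.
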
/sent_sim.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import keras.backend as K
from keras.models import Model
from keras.layers import *
from keras.constraints import unit_norm
from margin_softmax import *
from keras.callbacks import Callback


num_train_groups = 90000  # the first 90,000 question groups are used for training
maxlen = 32
batch_size = 100
min_count = 5
word_size = 128
epochs = 30  # amsoftmax needs 25 epochs, the other losses need 20


data = pd.read_csv('tongyiju.csv', encoding='utf-8', header=None, delimiter='\t')


def strQ2B(ustring):  # convert full-width characters to half-width
    rstring = ''
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # a full-width space maps directly to an ASCII space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:  # other full-width characters map by a fixed offset
            inside_code -= 65248
        rstring += unichr(inside_code)
    return rstring


data[1] = data[1].apply(strQ2B)
data[1] = data[1].str.lower()

chars = {}
for s in tqdm(iter(data[1])):
    for c in s:
        if c not in chars:
            chars[c] = 0
        chars[c] += 1


# 0: padding marker
# 1: unknown-character (unk) marker
chars = {i: j for i, j in chars.items() if j >= min_count}
id2char = {i + 2: j for i, j in enumerate(chars)}
char2id = {j: i for i, j in id2char.items()}


def string2id(s):
    _ = [char2id.get(i, 1) for i in s[:maxlen]]
    _ = _ + [0] * (maxlen - len(_))
    return _


data[2] = data[1].apply(string2id)
train_data = data[data[0] < num_train_groups]
train_data = train_data.sample(frac=1)
x_train = np.array(list(train_data[2]))
y_train = np.array(list(train_data[0])).reshape((-1, 1))

valid_data = data[data[0] >= num_train_groups]


# the actual model: a GRU-based classifier
x_in = Input(shape=(maxlen,))
x_embedded = Embedding(len(chars) + 2,
                       word_size)(x_in)
x = CuDNNGRU(word_size)(x_embedded)
x = Lambda(lambda x: K.l2_normalize(x, 1))(x)

pred = Dense(num_train_groups,
             use_bias=False,
             kernel_constraint=unit_norm())(x)

encoder = Model(x_in, x)  # the real goal is this encoder
model = Model(x_in, pred)  # but it is trained as a classification problem

model.compile(loss=sparse_amsoftmax_loss,
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])


# set-up for ranking the validation set; this could equally be written in NumPy
# (see the reference sketch below), but the Keras version can use the GPU
x_in = Input(shape=(word_size,))
x = Dense(len(valid_data), use_bias=False)(x_in)  # compute similarities (dot products)
x = Lambda(lambda x: K.tf.nn.top_k(x, 11)[1])(x)  # indices of the top 11 (the sentence itself + 10 neighbours)
model_sort = Model(x_in, x)

# mapping from row index (shifted to start at 0) to group id
id2g = dict(zip(valid_data.index - valid_data.index[0], valid_data[0]))
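
# NumPy reference for what model_sort computes (a sketch of my own, defined for
# illustration only and never called; the Keras version above exists purely to
# push the similarity ranking onto the GPU):
def topk_indices_np(vecs, k=11):
    sims = vecs.dot(vecs.T)  # cosine similarities, since the vectors are L2-normalized
    return np.argsort(-sims, axis=1)[:, :k]  # indices of the k most similar rows, best first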


def evaluate():  # evaluation function
    print 'validating...'
    valid_vec = encoder.predict(np.array(list(valid_data[2])),
                                verbose=True,
                                batch_size=1000)  # compute sentence vectors with the encoder
    model_sort.set_weights([valid_vec.T])  # load the sentence vectors as the layer weights
    sorted_result = model_sort.predict(valid_vec,
                                       verbose=True,
                                       batch_size=1000)  # compute the top-k
    new_result = np.vectorize(lambda s: id2g[s])(sorted_result)
    _ = new_result[:, 0] != new_result[:, 0]  # build an all-False boolean vector
    for i in range(10):  # sorted by similarity, the first hit is the input sentence itself (exact match)
        _ = _ + (new_result[:, 0] == new_result[:, i + 1])
        if i + 1 == 1:
            top1_acc = 1. * _.sum() / len(_)
        elif i + 1 == 5:
            top5_acc = 1. * _.sum() / len(_)
        elif i + 1 == 10:
            top10_acc = 1. * _.sum() / len(_)
    return top1_acc, top5_acc, top10_acc


# define a Callback that computes validation accuracy and saves the best model
class Evaluate(Callback):
    def __init__(self):
        self.accs = {'top1': [], 'top5': [], 'top10': []}
        self.highest = 0.
    def on_epoch_end(self, epoch, logs=None):
        top1_acc, top5_acc, top10_acc = evaluate()
        self.accs['top1'].append(top1_acc)
        self.accs['top5'].append(top5_acc)
        self.accs['top10'].append(top10_acc)
        if top1_acc >= self.highest:  # save the best model weights
            self.highest = top1_acc
            model.save_weights('sent_sim_amsoftmax.model')
        json.dump({'accs': self.accs, 'highest_top1': self.highest},
                  open('valid_amsoftmax.log', 'w'), indent=4)
        print 'top1_acc: %s, top5_acc: %s, top10_acc: %s' % (top1_acc, top5_acc, top10_acc)


evaluator = Evaluate()

history = model.fit(x_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[evaluator])


valid_vec = encoder.predict(np.array(list(valid_data[2])),
                            verbose=True,
                            batch_size=1000)  # compute sentence vectors with the encoder


def most_similar(s):
    v = encoder.predict(np.array([string2id(s)]))[0]
    sims = np.dot(valid_vec, v)
    for i in sims.argsort()[-10:][::-1]:
        print valid_data.iloc[i][1], sims[i]


most_similar(u'ps格式可以转换成ai格式吗')
most_similar(u'广州的客运站的数目')
most_similar(u'沙发一般有多高')
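
# To reuse the best checkpoint later without retraining (a sketch of my own: it
# assumes a fresh session in which the graph above has been rebuilt but
# model.fit has not been run):
#
#   model.load_weights('sent_sim_amsoftmax.model')  # shared layers restore the encoder too
#   valid_vec = encoder.predict(np.array(list(valid_data[2])), batch_size=1000)
#   most_similar(u'ps格式可以转换成ai格式吗')
--------------------------------------------------------------------------------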