├── README.md
├── margin_softmax.py
└── sent_sim.py

/README.md:
--------------------------------------------------------------------------------
# Keras sparse implementation of margin softmax

Keras implementations of AM-Softmax, AAM-Softmax, and related margin-based softmax losses.

A semantic similarity model built from a GRU encoder + AM-Softmax.

https://kexue.fm/archives/5743
--------------------------------------------------------------------------------
/margin_softmax.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import keras.backend as K


# plain sparse cross-entropy, taking logits as input
def sparse_logits_categorical_crossentropy(y_true, y_pred, scale=30):
    return K.sparse_categorical_crossentropy(y_true, scale * y_pred, from_logits=True)


# sparse AM-Softmax
def sparse_amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
    y_true = K.expand_dims(y_true[:, 0], 1)  # ensure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # ensure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target-class scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    y_true_pred_margin = y_true_pred - margin  # subtract the margin
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because the predictions are cosines in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # subtract exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ


# a simplified A-Softmax-style loss (m=4)
def sparse_simpler_asoftmax_loss(y_true, y_pred, scale=30):
    y_true = K.expand_dims(y_true[:, 0], 1)  # ensure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # ensure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target-class scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    # expand cos(4*theta) via the quadruple-angle formula
    y_true_pred_margin = 1 - 8 * K.square(y_true_pred) + 8 * K.square(K.square(y_true_pred))
    # the next line is equivalent to min(y_true_pred, y_true_pred_margin)
    y_true_pred_margin = y_true_pred_margin - K.relu(y_true_pred_margin - y_true_pred)
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because the predictions are cosines in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # subtract exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ
--------------------------------------------------------------------------------
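
For reference, the quantity `sparse_amsoftmax_loss` computes can be written in closed form. With $s$ = `scale`, $m$ = `margin`, $\cos\theta_j$ the $j$-th entry of `y_pred`, and $y$ the target class (notation mine, chosen to match the code above):

$$\mathcal{L} = -\log \frac{e^{s(\cos\theta_y - m)}}{e^{s(\cos\theta_y - m)} + \sum_{j \neq y} e^{s\cos\theta_j}} = -s(\cos\theta_y - m) + \log\Big(e^{s(\cos\theta_y - m)} + \sum_{j \neq y} e^{s\cos\theta_j}\Big)$$

The code reaches the right-hand form indirectly: `_Z` concatenates all scaled cosines with the margined target score, so its logsumexp equals $\log\big(\sum_j e^{s\cos\theta_j} + e^{s(\cos\theta_y - m)}\big)$, which counts the unmargined target term $e^{s\cos\theta_y}$ once too often; the `K.log(1 - K.exp(...))` line then removes that extra term in log space, which is numerically safer than assembling the sum directly.
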
/sent_sim.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import keras.backend as K
from keras.models import Model
from keras.layers import *
from keras.constraints import unit_norm
from margin_softmax import *
from keras.callbacks import Callback


num_train_groups = 90000  # the first 90,000 question groups are used for training
maxlen = 32
batch_size = 100
min_count = 5
word_size = 128
epochs = 30  # amsoftmax needs 25 epochs, the other losses need 20


data = pd.read_csv('tongyiju.csv', encoding='utf-8', header=None, delimiter='\t')


def strQ2B(ustring):  # convert full-width characters to half-width
    rstring = ''
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # a full-width space maps directly to an ASCII space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:  # other full-width characters map by a fixed offset
            inside_code -= 65248
        rstring += unichr(inside_code)
    return rstring


data[1] = data[1].apply(strQ2B)
data[1] = data[1].str.lower()

chars = {}
for s in tqdm(iter(data[1])):
    for c in s:
        if c not in chars:
            chars[c] = 0
        chars[c] += 1


# 0: padding marker
# 1: unknown-character (unk) marker
chars = {i: j for i, j in chars.items() if j >= min_count}
id2char = {i + 2: j for i, j in enumerate(chars)}
char2id = {j: i for i, j in id2char.items()}


def string2id(s):
    _ = [char2id.get(i, 1) for i in s[:maxlen]]
    _ = _ + [0] * (maxlen - len(_))
    return _


data[2] = data[1].apply(string2id)
train_data = data[data[0] < num_train_groups]
train_data = train_data.sample(frac=1)
x_train = np.array(list(train_data[2]))
y_train = np.array(list(train_data[0])).reshape((-1, 1))

valid_data = data[data[0] >= num_train_groups]


# the actual model: a GRU-based classifier
x_in = Input(shape=(maxlen,))
x_embedded = Embedding(len(chars) + 2,
                       word_size)(x_in)
x = CuDNNGRU(word_size)(x_embedded)
x = Lambda(lambda x: K.l2_normalize(x, 1))(x)

pred = Dense(num_train_groups,
             use_bias=False,
             kernel_constraint=unit_norm())(x)

encoder = Model(x_in, x)  # the real goal is this encoder
model = Model(x_in, pred)  # but it is trained as a classification problem

model.compile(loss=sparse_amsoftmax_loss,
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])


# set-up for ranking the validation set; this could equally be written in NumPy
# (see the reference sketch below), but the Keras version can use the GPU
x_in = Input(shape=(word_size,))
x = Dense(len(valid_data), use_bias=False)(x_in)  # compute similarities (dot products)
x = Lambda(lambda x: K.tf.nn.top_k(x, 11)[1])(x)  # indices of the top 11 (the sentence itself + 10 neighbours)
model_sort = Model(x_in, x)

# mapping from row index (shifted to start at 0) to group id
id2g = dict(zip(valid_data.index - valid_data.index[0], valid_data[0]))
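
# NumPy reference for what model_sort computes (a sketch of my own, defined for
# illustration only and never called; the Keras version above exists purely to
# push the similarity ranking onto the GPU):
def topk_indices_np(vecs, k=11):
    sims = vecs.dot(vecs.T)  # cosine similarities, since the vectors are L2-normalized
    return np.argsort(-sims, axis=1)[:, :k]  # indices of the k most similar rows, best first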


def evaluate():  # evaluation function
    print 'validating...'
    valid_vec = encoder.predict(np.array(list(valid_data[2])),
                                verbose=True,
                                batch_size=1000)  # compute sentence vectors with the encoder
    model_sort.set_weights([valid_vec.T])  # load the sentence vectors as the layer weights
    sorted_result = model_sort.predict(valid_vec,
                                       verbose=True,
                                       batch_size=1000)  # compute the top-k
    new_result = np.vectorize(lambda s: id2g[s])(sorted_result)
    _ = new_result[:, 0] != new_result[:, 0]  # build an all-False boolean vector
    for i in range(10):  # sorted by similarity, the first hit is the input sentence itself (exact match)
        _ = _ + (new_result[:, 0] == new_result[:, i + 1])
        if i + 1 == 1:
            top1_acc = 1. * _.sum() / len(_)
        elif i + 1 == 5:
            top5_acc = 1. * _.sum() / len(_)
        elif i + 1 == 10:
            top10_acc = 1. * _.sum() / len(_)
    return top1_acc, top5_acc, top10_acc


# define a Callback that computes validation accuracy and saves the best model
class Evaluate(Callback):
    def __init__(self):
        self.accs = {'top1': [], 'top5': [], 'top10': []}
        self.highest = 0.
    def on_epoch_end(self, epoch, logs=None):
        top1_acc, top5_acc, top10_acc = evaluate()
        self.accs['top1'].append(top1_acc)
        self.accs['top5'].append(top5_acc)
        self.accs['top10'].append(top10_acc)
        if top1_acc >= self.highest:  # save the best model weights
            self.highest = top1_acc
            model.save_weights('sent_sim_amsoftmax.model')
        json.dump({'accs': self.accs, 'highest_top1': self.highest},
                  open('valid_amsoftmax.log', 'w'), indent=4)
        print 'top1_acc: %s, top5_acc: %s, top10_acc: %s' % (top1_acc, top5_acc, top10_acc)


evaluator = Evaluate()

history = model.fit(x_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[evaluator])


valid_vec = encoder.predict(np.array(list(valid_data[2])),
                            verbose=True,
                            batch_size=1000)  # compute sentence vectors with the encoder


def most_similar(s):
    v = encoder.predict(np.array([string2id(s)]))[0]
    sims = np.dot(valid_vec, v)
    for i in sims.argsort()[-10:][::-1]:
        print valid_data.iloc[i][1], sims[i]


most_similar(u'ps格式可以转换成ai格式吗')
most_similar(u'广州的客运站的数目')
most_similar(u'沙发一般有多高')
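
# To reuse the best checkpoint later without retraining (a sketch of my own: it
# assumes a fresh session in which the graph above has been rebuilt but
# model.fit has not been run):
#
#   model.load_weights('sent_sim_amsoftmax.model')  # shared layers restore the encoder too
#   valid_vec = encoder.predict(np.array(list(valid_data[2])), batch_size=1000)
#   most_similar(u'ps格式可以转换成ai格式吗')
--------------------------------------------------------------------------------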