├── README.md
├── data
│   ├── data_process.py
│   ├── main_example.py
│   ├── original_data
│   │   ├── keyphrase_dataset.tar.gz
│   │   └── readme
│   └── readme
├── load.py
├── main.py
├── models
│   ├── __init__.py
│   ├── bi_lstm_model.py
│   └── model.py
├── predict.py
└── tools.py

/README.md:
--------------------------------------------------------------------------------
1 | # Keyphrase Extraction
2 | Source code of our EMNLP 2016 paper [Keyphrase Extraction Using Deep Recurrent Neural Networks on Twitter](http://jkx.fudan.edu.cn/~qzhang/paper/keyphrase.emnlp2016.pdf)
3 | 
4 | ## Preparation
5 | You need the pre-trained word vectors.
6 | * Pre-trained word vectors. Download [GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)
7 | 
8 | 
9 | ## Details
10 | Joint RNN model
11 | 
12 | * The data folder stores the datasets
13 | 
14 | * The checkpoints folder stores the parameters learned during training
15 | 
16 | * main.py is the main program
17 | 
18 | * models/model.py defines the joint-RNN model
19 | 
20 | * models/bi_lstm_model.py replaces the first RNN with a bidirectional LSTM
21 | 
22 | * load.py loads the datasets
23 | 
24 | * tools.py defines some utility functions
25 | 
26 | ## Requirements
27 | TensorFlow 0.11 + TensorLayer
28 | 
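Note: `data/data_process.py` reads the vectors from a plain-text file `original_data/GoogleNews-vectors-negative300.txt`, while the official download is the binary `GoogleNews-vectors-negative300.bin.gz`. The sketch below (not part of the original repository) shows one possible way to produce the text file, assuming a recent version of gensim is available; any other bin-to-text converter works just as well.

```python
# Illustrative conversion sketch (not part of the original code base).
# Assumes gensim is installed; the output path follows the name used in data/readme.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
vectors.save_word2vec_format(
    'data/original_data/GoogleNews-vectors-negative300.txt', binary=False)
```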
--------------------------------------------------------------------------------
/data/data_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import re
4 | import cPickle
5 | from collections import Counter
6 | 
7 | 
8 | def getlist(filename):
9 | 
10 |     with open(filename) as f:
11 |         datalist,taglist=[],[]
12 |         for line in f:
13 |             line=line.strip()
14 |             datalist.append(line.split('\t')[0])
15 |             taglist.append(line.split('\t')[1])
16 | 
17 | 
18 | 
19 |     return datalist,taglist
20 | 
21 | # build vocabulary
22 | def get_dict(filenames):
23 |     trnTweet,testTweet=filenames
24 |     sentence_list=getlist(trnTweet)[0]+getlist(testTweet)[0]
25 | 
26 |     words2idx={}
27 |     words=[]
28 | 
29 |     for sentence in sentence_list:
30 |         word_list=sentence.split()
31 |         words.extend(word_list)
32 | 
33 |     word_counts=Counter(words)
34 |     words2idx={word[0]:i+1 for i,word in enumerate(word_counts.most_common())}
35 | 
36 |     labels2idx = {'O': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4}
37 |     dicts = {'words2idx': words2idx, 'labels2idx': labels2idx}
38 | 
39 |     return dicts
40 | 
41 | def get_train_test_dicts(filenames):
42 |     """
43 |     Args:
44 |         filenames: trnTweet, testTweet
45 | 
46 |     Returns:
47 |         dataset:train_set,test_set,dicts
48 | 
49 |         train_set=[train_lex,train_y,train_z]
50 |         test_set=[test_lex,test_y,test_z]
51 |         dicts = {'words2idx': words2idx, 'labels2idx': labels2idx}
52 | 
53 | 
54 |     """
55 |     trnTweetCnn, testTweetCnn= filenames
56 |     dicts=get_dict([trnTweetCnn,testTweetCnn])
57 | 
58 |     trn_data=getlist(trnTweetCnn)
59 |     test_data=getlist(testTweetCnn)
60 | 
61 |     trn_sentence_list,trn_tag_list=trn_data
62 |     test_sentence_list,test_tag_list=test_data
63 | 
64 |     words2idx=dicts['words2idx']
65 |     labels2idx=dicts['labels2idx']
66 | 
67 |     def get_lex_y(sentence_list,tag_list,words2idx):  # convert tweets/hashtags into id sequences with y/z labels
68 |         lex,y,z=[],[],[]
69 |         bad_cnt=0
70 |         for s,tag in zip(sentence_list,tag_list):
71 | 
72 | 
73 | 
74 |             word_list=s.split()
75 |             t_list=tag.split()
76 | 
77 |             emb=map(lambda x:words2idx[x],word_list)
78 | 
79 |             # find where the hashtag word sequence starts inside the tweet
80 |             begin=-1
81 |             for i in range(len(word_list)):
82 |                 ok=True
83 |                 for j in range(len(t_list)):
84 |                     if i+j>=len(word_list) or word_list[i+j]!=t_list[j]:
85 |                         ok=False
86 |                         break
87 |                 if ok==True:
88 |                     begin=i
89 |                     break
90 | 
91 |             if begin==-1:
92 |                 bad_cnt+=1
93 |                 continue
94 | 
95 |             lex.append(emb)
96 | 
97 |             labels_y=[0]*len(word_list)
98 |             for i in range(len(t_list)):
99 |                 labels_y[begin+i]=1
100 |             y.append(labels_y)
101 | 
102 |             labels_z=[0]*len(word_list)
103 |             if len(t_list)==1:
104 |                 labels_z[begin]=labels2idx['S']
105 |             elif len(t_list)>1:
106 |                 labels_z[begin]=labels2idx['B']
107 | 
108 |                 for i in range(len(t_list)-2):
109 |                     labels_z[begin+i+1]=labels2idx['I']
110 |                 labels_z[begin+len(t_list)-1]=labels2idx['E']
111 | 
112 |             z.append(labels_z)
113 |         return lex,y,z
114 | 
115 |     train_lex, train_y, train_z = get_lex_y(trn_sentence_list,trn_tag_list, words2idx)
116 |     test_lex, test_y, test_z = get_lex_y(test_sentence_list,test_tag_list,words2idx)
117 |     train_set = [train_lex, train_y, train_z]
118 |     test_set = [test_lex, test_y, test_z]
119 |     data_set = [train_set, test_set, dicts]
120 |     with open('data_set.pkl', 'w') as f:
121 |         cPickle.dump(data_set, f)
122 |     return data_set
123 | 
124 | 
125 | 
126 | def load_bin_vec(fname,vocab):  # load pre-trained vectors (text format) for words in vocab
127 |     k=0
128 |     word_vecs={}
129 |     with open(fname) as f:
130 |         for line in f:
131 |             word=line.strip().split(' ',1)[0]
132 |             embedding=line.strip().split(' ',1)[1].split()
133 |             if word in vocab:
134 |                 word_vecs[word]=np.asarray(embedding,dtype=np.float32)
135 |             k+=1
136 |             if k%10000==0:
137 |                 print "load_bin_vec %d" % k
138 | 
139 |     return word_vecs
140 | 
141 | def add_unknown_words(word_vecs, vocab, min_df=1, dim=300):
142 |     """
143 |     For words that occur in at least min_df documents, create a separate word vector.
144 |     0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
145 |     """
146 |     k=0
147 |     for w in vocab:
148 |         if w not in word_vecs:
149 |             word_vecs[w]=np.asarray(np.random.uniform(-0.25,0.25,dim),dtype=np.float32)
150 |             k+=1
151 |             if k % 10000==0:
152 |                 print "add_unknown_words %d" % k
153 |     return word_vecs
154 | 
155 | def get_embedding(w2v,words2idx,k=300):
156 |     embedding = np.zeros((len(w2v) + 2, k), dtype=np.float32)
157 |     for (w,idx) in words2idx.items():
158 |         embedding[idx]=w2v[w]
159 |     #embedding[0]=np.asarray(np.random.uniform(-0.25,0.25,k),dtype=np.float32)
160 |     with open('embedding.pkl','w') as f:
161 |         cPickle.dump(embedding,f)
162 |     return embedding
163 | 
164 | 
165 | if __name__ == '__main__':
166 |     data_folder = ["original_data/trnTweet","original_data/testTweet"]
167 |     data_set = get_train_test_dicts(data_folder)
168 |     print "data_set complete!"
169 |     dicts = data_set[2]
170 |     vocab = set(dicts['words2idx'].keys())
171 |     print "total num words: " + str(len(vocab))
172 |     print "dataset created!"
173 |     train_set, test_set, dicts=data_set
174 |     print len(train_set[0])
175 | 
176 |     # GoogleNews-vectors-negative300.txt holds the pre-trained word vectors
177 |     w2v_file='original_data/GoogleNews-vectors-negative300.txt'
178 |     w2v=load_bin_vec(w2v_file,vocab)
179 |     print "word2vec loaded"
180 |     add_unknown_words(w2v,vocab)
181 |     embedding=get_embedding(w2v,dicts['words2idx'])
182 |     print "embedding created"
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | 
213 | 
214 | 
--------------------------------------------------------------------------------
/data/main_example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import cPickle
3 | def main():
4 | 
5 |     f = open('data_set.pkl')
6 |     train_set, test_set, dicts = cPickle.load(f)
7 |     embedding = cPickle.load(open('embedding.pkl'))
8 | 
9 |     word2idx=dicts['words2idx']
10 |     labels2idx=dicts['labels2idx']
11 | 
12 |     train_lex, train_y, train_z = train_set
13 |     test_lex, test_y, test_z = test_set
14 |     # Split the test set into test and validation parts; the final train/valid/test ratio is 7:1:2
15 |     tr = int(len(test_lex)*0.67)
16 |     valid_lex, valid_y, valid_z = test_lex[tr:], test_y[tr:], test_z[tr:]
17 |     test_lex, test_y, test_z = test_lex[:tr],test_y[:tr],test_z[:tr]
18 | 
19 | 
20 |     print 'len(train_data) {}'.format(len(train_lex))
21 |     print 'len(valid_data) {}'.format(len(valid_lex))
22 |     print 'len(test_data) {}'.format(len(test_lex))
23 | 
24 |     vocab_size = len(word2idx)
25 |     print 'len(vocab) {}'.format(vocab_size)
26 |     print "Train started!"
27 | 
28 | main()
--------------------------------------------------------------------------------
/data/original_data/keyphrase_dataset.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fudannlp16/KeyPhrase-Extraction/507f36d2f03d6d89793859a50035391684ab5c52/data/original_data/keyphrase_dataset.tar.gz
--------------------------------------------------------------------------------
/data/original_data/readme:
--------------------------------------------------------------------------------
1 | trnTweet and testTweet are the raw data
2 | testTweet 33755
3 | trainTweet 78760
4 | Each line contains a tweet and its corresponding hashtag, separated by \t
5 | 
--------------------------------------------------------------------------------
/data/readme:
--------------------------------------------------------------------------------
1 | File structure
2 | original_data:
3 |     trnTweet
4 |     testTweet
5 |     GoogleNews-vectors-negative300.txt
6 | data_process.py
7 | data_set.pkl
8 | embedding.pkl
9 | main_example.py
10 | 
11 | 
12 | 
13 | 1. original_data is the raw data directory
14 | trnTweet and testTweet are the preprocessed raw tweets
15 | trainTweet 78760
16 | testTweet 33755
17 | Each line contains a tweet and its corresponding hashtag, separated by \t
18 | GoogleNews-vectors-negative300.txt contains the 300-dimensional word vectors pre-trained on Google News (available at https://code.google.com/archive/p/word2vec/)
19 | 
20 | 
21 | 2. data_process.py is the data-processing script
22 | Running python data_process.py generates the processed data files
23 | data_set.pkl and embedding.pkl
24 | 
25 | 
26 | 3. data_set.pkl is the data file
27 | data_set=[train_set,test_set,dicts]
28 | train_set=[train_lex,train_y,train_z]
29 | test_set=[test_lex,test_y,test_z]
30 | dicts = {'words2idx': words2idx, 'labels2idx': labels2idx} (labels2idx appears to be unused)
31 | 
32 | words2idx and labels2idx are dictionaries whose keys are words/labels and whose values are the corresponding integer ids
33 | train_lex,train_y,train_z and test_lex,test_y,test_z are the processed (id-converted) data
34 | train_lex holds the tweets. train_y marks each token of a tweet with 1 if it belongs to the hashtag and 0 otherwise. train_z carries positional tags: 0 means not part of the hashtag, 1 the beginning of the hashtag, 2 a middle position, 3 the end of the hashtag, and 4 a single-word hashtag, i.e. labels2idx = {'O': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4} (a toy example follows this readme)
35 | train_lex,train_y,train_z and test_lex,test_y,test_z are nested Python lists
36 | e.g. train_lex[0] is the first tweet, itself a list storing the ids of its words
37 | 
38 | 
39 | 4. embedding.pkl contains the 300-dimensional vectors pre-trained with GoogleNews-vectors-negative300 for every word in the dataset; the padding id is 0 and its vector is initialized to all zeros
40 | Usage
41 | embedding=cPickle.load(open('embedding.pkl'))
42 | 
43 | 
44 | 5. main_example.py is a usage example
45 | 
46 | 
47 | 
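The following toy illustration (not one of the repository files, written in the repository's Python 2 style) mirrors the y/z encoding that get_lex_y in data/data_process.py produces, for a made-up tweet/hashtag pair; the subsequence search is simplified to matching the first hashtag word.

```python
# Illustrative only: mirrors the y/z labels described in point 3 of data/readme.
tweet   = "i love machine learning conference".split()
hashtag = "machine learning conference".split()

begin = tweet.index(hashtag[0])          # keyphrase starts at token index 2

y = [0] * len(tweet)                     # 1 = token belongs to the hashtag
for i in range(len(hashtag)):
    y[begin + i] = 1                     # -> [0, 0, 1, 1, 1]

z = [0] * len(tweet)                     # labels2idx = {'O':0,'B':1,'I':2,'E':3,'S':4}
if len(hashtag) == 1:
    z[begin] = 4                         # S: single-word keyphrase
else:
    z[begin] = 1                         # B: first word of the keyphrase
    for i in range(1, len(hashtag) - 1):
        z[begin + i] = 2                 # I: middle words
    z[begin + len(hashtag) - 1] = 3      # E: last word
                                         # -> [0, 0, 1, 2, 3]
print y, z
```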
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import cPickle
4 | import random
5 | def atisfold():
6 |     f = open('data/data_set.pkl')
7 |     train_set, test_set, dicts = cPickle.load(f)
8 |     embedding = cPickle.load(open('data/embedding.pkl'))
9 |     return train_set, test_set,dicts,embedding
10 | 
11 | def pad_sentences(sentences, padding_word=0, forced_sequence_length=None):
12 |     if forced_sequence_length is None:
13 |         sequence_length=max(len(x) for x in sentences)
14 |     else:
15 |         sequence_length=forced_sequence_length
16 |     padded_sentences=[]
17 |     for i in xrange(len(sentences)):
18 |         sentence=sentences[i]
19 |         num_padding=sequence_length-len(sentence)
20 |         if num_padding<0:
21 |             padded_sentence=sentence[0:sequence_length]
22 |         else:
23 |             padded_sentence=sentence+[int(padding_word)]*num_padding
24 | 
25 |         padded_sentences.append(padded_sentence)
26 | 
27 |     return padded_sentences
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import tensorlayer as tl
4 | import numpy as np
5 | import time
6 | 
7 | import os
8 | import random
9 | import load
10 | import models.model as model
11 | import tools
12 | import sys
13 | 
14 | def main():
15 |     s={
16 |         'nh1':300,
17 |         'nh2':300,
18 |         'win':3,
19 |         'emb_dimension':300,
20 |         'lr':0.1,
21 |         'lr_decay':0.5,
22 |         'max_grad_norm':5,
23 |         'seed':345,
24 |         'nepochs':150,
25 |         'batch_size':16,
26 |         'keep_prob':0.5,
27 |         'check_dir':'./checkpoints',
28 |         'display_test_per':3,
29 |         'lr_decay_per':10
30 |     }
31 | 
32 |     train_set,test_set,dic,embedding=load.atisfold()
33 | 
34 | 
35 |     idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())
36 |     idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems())
37 | 
38 |     train_lex, train_y, train_z = train_set
39 | 
40 |     tr = int(len(train_lex)*0.9)
41 |     valid_lex, valid_y, valid_z = train_lex[tr:], train_y[tr:], train_z[tr:]
42 |     train_lex, train_y, train_z = train_lex[:tr], train_y[:tr], train_z[:tr]
43 |     test_lex, test_y, test_z = test_set
44 | 
45 |     print 'len(train_data) {}'.format(len(train_lex))
46 |     print 'len(valid_data) {}'.format(len(valid_lex))
47 |     print 'len(test_data) {}'.format(len(test_lex))
48 | 
49 |     vocab = set(dic['words2idx'].keys())
50 |     vocsize = len(vocab)
51 |     print 'len(vocab) {}'.format(vocsize)
52 |     print "Train started!"
53 | 54 | y_nclasses = 2 55 | z_nclasses = 5 56 | 57 | nsentences = len(train_lex) 58 | 59 | 60 | with tf.Session() as sess: 61 | 62 | rnn=model.Model( 63 | nh1=s['nh1'], 64 | nh2=s['nh2'], 65 | ny=y_nclasses, 66 | nz=z_nclasses, 67 | de=s['emb_dimension'], 68 | cs=s['win'], 69 | lr=s['lr'], 70 | lr_decay=s['lr_decay'], 71 | embedding=embedding, 72 | max_gradient_norm=s['max_grad_norm'], 73 | model_cell='lstm' 74 | ) 75 | 76 | checkpoint_dir=s['check_dir'] 77 | if not os.path.exists(checkpoint_dir): 78 | os.mkdir(checkpoint_dir) 79 | checkpoint_prefix=os.path.join(checkpoint_dir,'model') 80 | 81 | def train_step(cwords,label_y,label_z): 82 | feed={ 83 | rnn.input_x:cwords, 84 | rnn.input_y:label_y, 85 | rnn.input_z:label_z, 86 | rnn.keep_prob:s['keep_prob'], 87 | rnn.batch_size:s['batch_size'] 88 | } 89 | fetches=[rnn.loss,rnn.train_op] 90 | loss,_=sess.run(fetches=fetches,feed_dict=feed) 91 | return loss 92 | 93 | def dev_step(cwords): 94 | feed={ 95 | rnn.input_x:cwords, 96 | rnn.keep_prob:1.0, 97 | rnn.batch_size:s['batch_size'] 98 | } 99 | fetches=rnn.sz_pred 100 | sz_pred=sess.run(fetches=fetches,feed_dict=feed) 101 | return sz_pred 102 | 103 | saver=tf.train.Saver(tf.all_variables()) 104 | sess.run(tf.initialize_all_variables()) 105 | 106 | best_f=-1 107 | best_e=0 108 | test_best_f=-1 109 | test_best_e=0 110 | best_res=None 111 | test_best_res=None 112 | for e in xrange(s['nepochs']): 113 | tools.shuffle([train_lex,train_y,train_z],s['seed']) 114 | t_start=time.time() 115 | for step,batch in enumerate(tl.iterate.minibatches(train_lex,zip(train_y,train_z),batch_size=s['batch_size'])): 116 | input_x,target=batch 117 | label_y,label_z=zip(*target) 118 | input_x=load.pad_sentences(input_x) 119 | label_y=load.pad_sentences(label_y) 120 | label_z=load.pad_sentences(label_z) 121 | cwords=tools.contextwin_2(input_x,s['win']) 122 | loss=train_step(cwords,label_y,label_z) 123 | 124 | print 'loss %.2f' % loss,' [learning] epoch %i>> %2.2f%%' % (e,s['batch_size']*step*100./nsentences),'completed in %.2f (sec) <<\r' % (time.time()-t_start), 125 | 126 | sys.stdout.flush() 127 | 128 | #VALID 129 | 130 | predictions_valid=[] 131 | predictions_test=[] 132 | groundtruth_valid=[] 133 | groundtruth_test=[] 134 | for batch in tl.iterate.minibatches(valid_lex,valid_z,batch_size=s['batch_size']): 135 | x,z=batch 136 | x=load.pad_sentences(x) 137 | x=tools.contextwin_2(x,s['win']) 138 | predictions_valid.extend(dev_step(x)) 139 | groundtruth_valid.extend(z) 140 | 141 | res_valid=tools.conlleval(predictions_valid,groundtruth_valid,'') 142 | 143 | if res_valid['f']>best_f: 144 | best_f=res_valid['f'] 145 | best_e=e 146 | best_res=res_valid 147 | print '\nVALID new best:',res_valid 148 | path = saver.save(sess=sess, save_path=checkpoint_prefix, global_step=e) 149 | print "Save model checkpoint to {}".format(path) 150 | else: 151 | print '\nVALID new curr:',res_valid 152 | 153 | #TEST 154 | if e%s['display_test_per']==0: 155 | for batch in tl.iterate.minibatches(test_lex, test_z, batch_size=s['batch_size']): 156 | x,z = batch 157 | x = load.pad_sentences(x) 158 | x = tools.contextwin_2(x, s['win']) 159 | predictions_test.extend(dev_step(x)) 160 | groundtruth_test.extend(z) 161 | 162 | 163 | res_test = tools.conlleval(predictions_test, groundtruth_test, '') 164 | 165 | if res_test['f'] > test_best_f: 166 | test_best_f = res_test['f'] 167 | test_best_e=e 168 | test_best_res=res_test 169 | print 'TEST new best:',res_test 170 | else: 171 | print 'TEST new curr:',res_test 172 | 173 | # learning rate decay if no 
improvement in 10 epochs 174 | if e-best_e>s['lr_decay_per']: 175 | sess.run(fetches=rnn.learning_rate_decay_op) 176 | lr=sess.run(fetches=rnn.lr) 177 | print 'learning rate:%f' % lr 178 | if lr<1e-5:break 179 | print 180 | 181 | print "Train finished!" 182 | print 'Valid Best Result: epoch %d: ' % (best_e),best_res 183 | print 'Test Best Result: epoch %d: ' %(test_best_e),test_best_res 184 | 185 | if __name__ == '__main__': 186 | main() 187 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fudannlp16/KeyPhrase-Extraction/507f36d2f03d6d89793859a50035391684ab5c52/models/__init__.py -------------------------------------------------------------------------------- /models/bi_lstm_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | class Model(object): 7 | 8 | def __init__(self, 9 | nh1, 10 | nh2, 11 | ny, 12 | nz, 13 | de, 14 | cs, 15 | lr, 16 | lr_decay, 17 | embedding, 18 | max_gradient_norm, 19 | model_cell='rnn', 20 | model='basic_model', 21 | nonstatic=False): 22 | 23 | self.batch_size = tf.placeholder(dtype=tf.int32, shape=None) 24 | self.input_x=tf.placeholder(tf.int32,shape=[None,None,cs],name='input_x') 25 | self.input_y=tf.placeholder(tf.int32,shape=[None,None],name="input_y") 26 | self.input_z=tf.placeholder(tf.int32,shape=[None,None],name='input_z') 27 | self.keep_prob=tf.placeholder(dtype=tf.float32,name='keep_prob') 28 | 29 | self.lr=tf.Variable(lr,dtype=tf.float32) 30 | 31 | self.learning_rate_decay_op = self.lr.assign( 32 | self.lr * lr_decay) 33 | 34 | 35 | #Creating embedding input 36 | with tf.device("/cpu:0"),tf.name_scope('embedding'): 37 | if nonstatic: 38 | W=tf.constant(embedding,name='embW',dtype=tf.float32) 39 | else: 40 | W=tf.Variable(embedding,name='embW',dtype=tf.float32) 41 | inputs=tf.nn.embedding_lookup(W,self.input_x) 42 | inputs=tf.reshape(inputs,[self.batch_size,-1,cs*de]) 43 | 44 | #Droupout embedding input 45 | inputs=tf.nn.dropout(inputs,keep_prob=self.keep_prob,name='drop_inputs') 46 | 47 | #Create the internal multi-layer cell for rnn 48 | if model_cell=='rnn': 49 | single_cell0=tf.nn.rnn_cell.BasicRNNCell(nh1) 50 | single_cell1=tf.nn.rnn_cell.BasicRNNCell(nh1) 51 | single_cell2=tf.nn.rnn_cell.BasicRNNCell(nh2) 52 | elif model_cell=='lstm': 53 | single_cell0=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 54 | single_cell1=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 55 | single_cell2=tf.nn.rnn_cell.BasicLSTMCell(nh2,state_is_tuple=True) 56 | elif model_cell=='gru': 57 | single_cell0=tf.nn.rnn_cell.GRUCell(nh1) 58 | single_cell1=tf.nn.rnn_cell.GRUCell(nh1) 59 | single_cell2=tf.nn.rnn_cell.GRUCell(nh2) 60 | else: 61 | raise 'model_cell error!' 
62 | #DropoutWrapper rnn_cell 63 | single_cell0 = tf.nn.rnn_cell.DropoutWrapper(single_cell0,output_keep_prob=self.keep_prob) 64 | single_cell1 = tf.nn.rnn_cell.DropoutWrapper(single_cell1, output_keep_prob=self.keep_prob) 65 | single_cell2 = tf.nn.rnn_cell.DropoutWrapper(single_cell2, output_keep_prob=self.keep_prob) 66 | 67 | self.init_state=single_cell1.zero_state(self.batch_size,dtype=tf.float32) 68 | 69 | 70 | #Bi-RNN1 71 | 72 | x_len = tf.cast(tf.shape(inputs)[1], tf.int64) 73 | batch=2 74 | with tf.variable_scope('bi_rnn1'): 75 | self.outputs1,self.state1=tf.nn.bidirectional_dynamic_rnn( 76 | single_cell0, 77 | single_cell1, 78 | inputs, 79 | sequence_length=[x_len]*batch, 80 | dtype=tf.float32 81 | ) 82 | 83 | self.outputs1=tf.concat(2,self.outputs1) 84 | 85 | 86 | #RNN2 87 | with tf.variable_scope('rnn2'): 88 | self.outputs2,self.state2=tf.nn.dynamic_rnn( 89 | cell=single_cell2, 90 | inputs=self.outputs1, 91 | initial_state=self.init_state, 92 | dtype=tf.float32 93 | ) 94 | 95 | #outputs_y 96 | with tf.variable_scope('output_sy'): 97 | w_y=tf.get_variable("softmax_w_y",[2*nh1,ny]) 98 | b_y=tf.get_variable("softmax_b_y",[ny]) 99 | outputs1=tf.reshape(self.outputs1,[-1,2*nh1]) 100 | sy=tf.nn.xw_plus_b(outputs1,w_y,b_y) 101 | self.sy_pred = tf.reshape(tf.argmax(sy, 1), [self.batch_size, -1]) 102 | #outputs_z 103 | with tf.variable_scope('output_sz'): 104 | w_z = tf.get_variable("softmax_w_z", [nh2, nz]) 105 | b_z = tf.get_variable("softmax_b_z", [nz]) 106 | outputs2 = tf.reshape(self.outputs2, [-1, nh2]) 107 | sz = tf.nn.xw_plus_b(outputs2, w_z,b_z) 108 | self.sz_pred = tf.reshape(tf.argmax(sz, 1), [self.batch_size, -1]) 109 | #loss 110 | with tf.variable_scope('loss'): 111 | label_y = tf.reshape(self.input_y, [-1]) 112 | loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(sy, label_y) 113 | label_z = tf.reshape(self.input_z, [-1]) 114 | loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(sz, label_z) 115 | self.loss=tf.reduce_sum(0.5*loss1+0.5*loss2)/tf.cast(self.batch_size,tf.float32) 116 | 117 | tvars=tf.trainable_variables() 118 | grads,_=tf.clip_by_global_norm(tf.gradients(self.loss,tvars),max_gradient_norm) 119 | optimizer=tf.train.GradientDescentOptimizer(self.lr) 120 | self.train_op=optimizer.apply_gradients(zip(grads,tvars)) 121 | 122 | def cost(output, target): 123 | # Compute cross entropy for each frame. 124 | cross_entropy = target * tf.log(output) 125 | cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) 126 | mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2)) 127 | cross_entropy *= mask 128 | # Average over actual sequence lengths. 
129 | cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1) 130 | cross_entropy /= tf.reduce_sum(mask, reduction_indices=1) 131 | return tf.reduce_mean(cross_entropy) -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | class Model(object): 6 | 7 | def __init__(self, 8 | nh1, 9 | nh2, 10 | ny, 11 | nz, 12 | de, 13 | cs, 14 | lr, 15 | lr_decay, 16 | embedding, 17 | max_gradient_norm, 18 | model_cell='rnn', 19 | model='basic_model', 20 | nonstatic=False): 21 | 22 | self.batch_size = tf.placeholder(dtype=tf.int32, shape=None) 23 | self.input_x=tf.placeholder(tf.int32,shape=[None,None,cs],name='input_x') 24 | self.input_y=tf.placeholder(tf.int32,shape=[None,None],name="input_y") 25 | self.input_z=tf.placeholder(tf.int32,shape=[None,None],name='input_z') 26 | self.keep_prob=tf.placeholder(dtype=tf.float32,name='keep_prob') 27 | 28 | self.lr=tf.Variable(lr,dtype=tf.float32) 29 | 30 | self.learning_rate_decay_op = self.lr.assign( 31 | self.lr * lr_decay) 32 | 33 | 34 | #Creating embedding input 35 | with tf.device("/cpu:0"),tf.name_scope('embedding'): 36 | if nonstatic: 37 | W=tf.constant(embedding,name='embW',dtype=tf.float32) 38 | else: 39 | W=tf.Variable(embedding,name='embW',dtype=tf.float32) 40 | inputs=tf.nn.embedding_lookup(W,self.input_x) 41 | inputs=tf.reshape(inputs,[self.batch_size,-1,cs*de]) 42 | 43 | #Droupout embedding input 44 | inputs=tf.nn.dropout(inputs,keep_prob=self.keep_prob,name='drop_inputs') 45 | 46 | #Create the internal multi-layer cell for rnn 47 | if model_cell=='rnn': 48 | single_cell1=tf.nn.rnn_cell.BasicRNNCell(nh1) 49 | single_cell2=tf.nn.rnn_cell.BasicRNNCell(nh2) 50 | elif model_cell=='lstm': 51 | single_cell1=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 52 | single_cell2=tf.nn.rnn_cell.BasicLSTMCell(nh2,state_is_tuple=True) 53 | elif model_cell=='gru': 54 | single_cell1=tf.nn.rnn_cell.GRUCell(nh1) 55 | single_cell2=tf.nn.rnn_cell.GRUCell(nh2) 56 | else: 57 | raise 'model_cell error!' 
58 | #DropoutWrapper rnn_cell 59 | single_cell1 = tf.nn.rnn_cell.DropoutWrapper(single_cell1, output_keep_prob=self.keep_prob) 60 | single_cell2 = tf.nn.rnn_cell.DropoutWrapper(single_cell2, output_keep_prob=self.keep_prob) 61 | 62 | self.init_state=single_cell1.zero_state(self.batch_size,dtype=tf.float32) 63 | 64 | #RNN1 65 | with tf.variable_scope('rnn1'): 66 | self.outputs1,self.state1=tf.nn.dynamic_rnn( 67 | cell=single_cell1, 68 | inputs=inputs, 69 | initial_state=self.init_state, 70 | dtype=tf.float32 71 | ) 72 | 73 | #RNN2 74 | with tf.variable_scope('rnn2'): 75 | self.outputs2,self.state2=tf.nn.dynamic_rnn( 76 | cell=single_cell2, 77 | inputs=self.outputs1, 78 | initial_state=self.init_state, 79 | dtype=tf.float32 80 | ) 81 | 82 | #outputs_y 83 | with tf.variable_scope('output_sy'): 84 | w_y=tf.get_variable("softmax_w_y",[nh1,ny]) 85 | b_y=tf.get_variable("softmax_b_y",[ny]) 86 | outputs1=tf.reshape(self.outputs1,[-1,nh1]) 87 | sy=tf.nn.xw_plus_b(outputs1,w_y,b_y) 88 | self.sy_pred = tf.reshape(tf.argmax(sy, 1), [self.batch_size, -1]) 89 | #outputs_z 90 | with tf.variable_scope('output_sz'): 91 | w_z = tf.get_variable("softmax_w_z", [nh2, nz]) 92 | b_z = tf.get_variable("softmax_b_z", [nz]) 93 | outputs2 = tf.reshape(self.outputs2, [-1, nh2]) 94 | sz = tf.nn.xw_plus_b(outputs2, w_z,b_z) 95 | self.sz_pred = tf.reshape(tf.argmax(sz, 1), [self.batch_size, -1]) 96 | #loss 97 | with tf.variable_scope('loss'): 98 | label_y = tf.reshape(self.input_y, [-1]) 99 | loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(sy, label_y) 100 | label_z = tf.reshape(self.input_z, [-1]) 101 | loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(sz, label_z) 102 | self.loss=tf.reduce_sum(0.5*loss1+0.5*loss2)/tf.cast(self.batch_size,tf.float32) 103 | 104 | tvars=tf.trainable_variables() 105 | grads,_=tf.clip_by_global_norm(tf.gradients(self.loss,tvars),max_gradient_norm) 106 | optimizer=tf.train.GradientDescentOptimizer(self.lr) 107 | self.train_op=optimizer.apply_gradients(zip(grads,tvars)) 108 | 109 | def cost(output, target): 110 | # Compute cross entropy for each frame. 111 | cross_entropy = target * tf.log(output) 112 | cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) 113 | mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2)) 114 | cross_entropy *= mask 115 | # Average over actual sequence lengths. 
116 |         cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
117 |         cross_entropy /= tf.reduce_sum(mask, reduction_indices=1)
118 |         return tf.reduce_mean(cross_entropy)
119 | 
120 | 
121 | 
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import tensorlayer as tl
4 | import numpy as np
5 | import time
6 | 
7 | import os
8 | import random
9 | import load
10 | import models.model as model
11 | 
12 | import tools
13 | import sys
14 | 
15 | def main():
16 |     s={
17 |         'nh1':300,
18 |         'nh2':300,
19 |         'win':3,
20 |         'emb_dimension':300,
21 |         'lr':0.1,
22 |         'lr_decay':0.5,
23 |         'max_grad_norm':5,
24 |         'seed':345,
25 |         'nepochs':50,
26 |         'batch_size':16,
27 |         'keep_prob':1.0,
28 |         'check_dir':'./checkpoints',
29 |         'display_test_per':5,
30 |         'lr_decay_per':10
31 |     }
32 | 
33 | 
34 |     # load the dataset
35 |     train_set,test_set,dic,embedding=load.atisfold()
36 |     idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())
37 |     idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems())
38 | 
39 |     vocab = set(dic['words2idx'].keys())
40 |     vocsize = len(vocab)
41 | 
42 |     test_lex, test_y, test_z = test_set
43 | 
44 |     y_nclasses = 2
45 |     z_nclasses = 5
46 | 
47 | 
48 |     with tf.Session() as sess:
49 | 
50 |         rnn = model.Model(
51 |             nh1=s['nh1'],
52 |             nh2=s['nh2'],
53 |             ny=y_nclasses,
54 |             nz=z_nclasses,
55 |             de=s['emb_dimension'],
56 |             cs=s['win'],
57 |             lr=s['lr'],
58 |             lr_decay=s['lr_decay'],
59 |             embedding=embedding,
60 |             max_gradient_norm=s['max_grad_norm'],
61 |             model_cell='lstm'
62 |         )
63 | 
64 |         checkpoint_dir = s['check_dir']
65 |         saver = tf.train.Saver(tf.all_variables())
66 |         ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
67 |         if ckpt and ckpt.model_checkpoint_path:
68 |             saver.restore(sess, ckpt.model_checkpoint_path)
69 | 
70 |         def dev_step(cwords):
71 |             feed={
72 |                 rnn.input_x:cwords,
73 |                 rnn.keep_prob:1.0,
74 |                 rnn.batch_size:s['batch_size']
75 |             }
76 |             fetches=rnn.sz_pred
77 |             sz_pred=sess.run(fetches=fetches,feed_dict=feed)
78 |             return sz_pred
79 |         print "Test results:"
80 |         predictions_test=[]
81 |         groundtruth_test=[]
82 |         for batch in tl.iterate.minibatches(test_lex, test_z, batch_size=s['batch_size']):
83 |             x, z = batch
84 |             x = load.pad_sentences(x)
85 |             x = tools.contextwin_2(x, s['win'])
86 |             predictions_test.extend(dev_step(x))
87 |             groundtruth_test.extend(z)
88 | 
89 |         res_test = tools.conlleval(predictions_test, groundtruth_test, '')
90 | 
91 |         print res_test
92 | 
93 | if __name__ == '__main__':
94 |     main()
95 | 
96 | 
97 | 
98 | 
99 | 
--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | 
4 | def shuffle(lol,seed):
5 |     '''
6 |     lol :: list of list as input
7 |     seed :: seed the shuffling
8 | 
9 |     shuffle inplace each list in the same order
10 |     '''
11 |     for l in lol:
12 |         random.seed(seed)
13 |         random.shuffle(l)
14 | 
15 | def contextwin(l, win):
16 |     '''
17 |     win :: int corresponding to the size of the window
18 |     given a list of indexes composing a sentence
19 |     it will return a list of list of indexes corresponding
20 |     to context windows surrounding each word in the sentence
21 |     '''
22 |     assert (win % 2) == 1
23 |     assert win >= 1
24 |     l = list(l)
25 | 
26 |     lpadded = win/2 * [0] + l + win/2 * [0]
27 |     out = [ lpadded[i:i+win] for i in range(len(l)) ]
28 | 
29 |     assert len(out) == len(l)
30 |     return out
31 | 
32 | def contextwin_2(ls,win):
33 |     assert (win % 2) == 1
34 |     assert win >= 1
35 |     outs=[]
36 |     for l in ls:
37 |         outs.append(contextwin(l,win))
38 |     return outs
39 | 
40 | def getKeyphraseList(l):
41 |     res, now = [], []
42 |     for i in xrange(len(l)):
43 |         if l[i] != 0:
44 |             now.append(str(i))
45 |         if l[i] == 0 or i == len(l) - 1:
46 |             if len(now) != 0:
47 |                 res.append(' '.join(now))
48 |             now = []
49 |     return set(res)
50 | 
51 | def conlleval(predictions, groundtruth, file):
52 |     assert len(predictions) == len(groundtruth)
53 |     res = {}
54 |     all_cnt, good_cnt = len(predictions), 0
55 |     p_cnt, r_cnt, pr_cnt = 0, 0, 0
56 |     for i in range(all_cnt):
57 |         # print i
58 |         if all(predictions[i][0:len(groundtruth[i])] == groundtruth[i]):
59 |             good_cnt += 1
60 |         pKeyphraseList = getKeyphraseList(predictions[i][0:len(groundtruth[i])])
61 |         gKeyphraseList = getKeyphraseList(groundtruth[i])
62 |         p_cnt += len(pKeyphraseList)
63 |         r_cnt += len(gKeyphraseList)
64 |         pr_cnt += len(pKeyphraseList & gKeyphraseList)
65 |     res['a'] = 1.0*good_cnt/all_cnt
66 |     res['p'] = 1.0*pr_cnt/p_cnt
67 |     res['r'] = 1.0*pr_cnt/r_cnt
68 |     res['f'] = 2.0*res['p']*res['r']/(res['p']+res['r']) if (res['p']+res['r']) > 0 else 0.0
69 |     return res
70 | 
71 | 
72 | 
--------------------------------------------------------------------------------
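For reference, a minimal sketch (not one of the repository files) of how the helpers in load.py and tools.py fit together on toy data, written in the repository's Python 2 style; the toy ids and labels are made up for illustration, and it assumes both modules are importable from the repository root.

```python
# -*- coding: utf-8 -*-
# Illustrative sketch only: exercises pad_sentences, contextwin_2 and conlleval
# on toy data, without a trained model. Run from the repository root (Python 2).
import numpy as np
import load
import tools

# Two "tweets" of different lengths, already mapped to word ids.
batch_x = [[4, 5, 6], [7, 8]]
batch_x = load.pad_sentences(batch_x)        # -> [[4, 5, 6], [7, 8, 0]]
cwords = tools.contextwin_2(batch_x, 3)      # one window of size 3 per token
print cwords[0]                              # -> [[0, 4, 5], [4, 5, 6], [5, 6, 0]]

# conlleval compares predicted z-label sequences against the gold ones.
# Predictions come back from sess.run as numpy arrays; gold labels are plain lists.
pred = [np.array([1, 3, 0]), np.array([0, 4, 0])]   # B E O  /  O S O
gold = [[1, 3, 0], [0, 4, 0]]
print tools.conlleval(pred, gold, '')
```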