├── README.md
├── data
│   ├── data_process.py
│   ├── main_example.py
│   ├── original_data
│   │   ├── keyphrase_dataset.tar.gz
│   │   └── readme
│   └── readme
├── load.py
├── main.py
├── models
│   ├── __init__.py
│   ├── bi_lstm_model.py
│   └── model.py
├── predict.py
└── tools.py

/README.md:
--------------------------------------------------------------------------------
1 | # Keyphrase Extraction
2 | Source code of our EMNLP 2016 paper [Keyphrase Extraction Using Deep Recurrent Neural Networks on Twitter](http://jkx.fudan.edu.cn/~qzhang/paper/keyphrase.emnlp2016.pdf)
3 | 
4 | ## Preparation
5 | You need the pre-trained word vectors.
6 | * Pre-trained word vectors. Download [GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)
7 | 
8 | 
9 | ## Details
10 | Joint RNN model
11 | 
12 | * The data folder stores the datasets
13 | 
14 | * The checkpoints folder stores the parameters learned during training
15 | 
16 | * main.py is the main program
17 | 
18 | * models/model.py defines the joint-RNN model
19 | 
20 | * models/bi_lstm_model.py replaces the first RNN with a bidirectional LSTM
21 | 
22 | * load.py loads the datasets
23 | 
24 | * tools.py defines some utility functions
25 | 
26 | ## Requirements
27 | TensorFlow 0.11 + TensorLayer
28 | 
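Note: `data/data_process.py` reads the vectors from a plain-text file `original_data/GoogleNews-vectors-negative300.txt`, while the official download is the binary `GoogleNews-vectors-negative300.bin.gz`. The sketch below (not part of the original repository) shows one possible way to produce the text file, assuming a recent version of gensim is available; any other bin-to-text converter works just as well.

```python
# Illustrative conversion sketch (not part of the original code base).
# Assumes gensim is installed; the output path follows the name used in data/readme.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
vectors.save_word2vec_format(
    'data/original_data/GoogleNews-vectors-negative300.txt', binary=False)
```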
--------------------------------------------------------------------------------
/data/data_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import re
4 | import cPickle
5 | from collections import Counter
6 | 
7 | 
8 | def getlist(filename):
9 | 
10 |     with open(filename) as f:
11 |         datalist,taglist=[],[]
12 |         for line in f:
13 |             line=line.strip()
14 |             datalist.append(line.split('\t')[0])
15 |             taglist.append(line.split('\t')[1])
16 | 
17 | 
18 | 
19 |     return datalist,taglist
20 | 
21 | # build vocabulary
22 | def get_dict(filenames):
23 |     trnTweet,testTweet=filenames
24 |     sentence_list=getlist(trnTweet)[0]+getlist(testTweet)[0]
25 | 
26 |     words2idx={}
27 |     words=[]
28 | 
29 |     for sentence in sentence_list:
30 |         word_list=sentence.split()
31 |         words.extend(word_list)
32 | 
33 |     word_counts=Counter(words)
34 |     words2idx={word[0]:i+1 for i,word in enumerate(word_counts.most_common())}
35 | 
36 |     labels2idx = {'O': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4}
37 |     dicts = {'words2idx': words2idx, 'labels2idx': labels2idx}
38 | 
39 |     return dicts
40 | 
41 | def get_train_test_dicts(filenames):
42 |     """
43 |     Args:
44 |         filenames: trnTweet, testTweet
45 | 
46 |     Returns:
47 |         dataset:train_set,test_set,dicts
48 | 
49 |         train_set=[train_lex,train_y,train_z]
50 |         test_set=[test_lex,test_y,test_z]
51 |         dicts = {'words2idx': words2idx, 'labels2idx': labels2idx}
52 | 
53 | 
54 |     """
55 |     trnTweetCnn, testTweetCnn= filenames
56 |     dicts=get_dict([trnTweetCnn,testTweetCnn])
57 | 
58 |     trn_data=getlist(trnTweetCnn)
59 |     test_data=getlist(testTweetCnn)
60 | 
61 |     trn_sentence_list,trn_tag_list=trn_data
62 |     test_sentence_list,test_tag_list=test_data
63 | 
64 |     words2idx=dicts['words2idx']
65 |     labels2idx=dicts['labels2idx']
66 | 
67 |     def get_lex_y(sentence_list,tag_list,words2idx):  # convert tweets/hashtags into id sequences with y/z labels
68 |         lex,y,z=[],[],[]
69 |         bad_cnt=0
70 |         for s,tag in zip(sentence_list,tag_list):
71 | 
72 | 
73 | 
74 |             word_list=s.split()
75 |             t_list=tag.split()
76 | 
77 |             emb=map(lambda x:words2idx[x],word_list)
78 | 
79 |             # find where the hashtag word sequence starts inside the tweet
80 |             begin=-1
81 |             for i in range(len(word_list)):
82 |                 ok=True
83 |                 for j in range(len(t_list)):
84 |                     if i+j>=len(word_list) or word_list[i+j]!=t_list[j]:
85 |                         ok=False
86 |                         break
87 |                 if ok==True:
88 |                     begin=i
89 |                     break
90 | 
91 |             if begin==-1:
92 |                 bad_cnt+=1
93 |                 continue
94 | 
95 |             lex.append(emb)
96 | 
97 |             labels_y=[0]*len(word_list)
98 |             for i in range(len(t_list)):
99 |                 labels_y[begin+i]=1
100 |             y.append(labels_y)
101 | 
102 |             labels_z=[0]*len(word_list)
103 |             if len(t_list)==1:
104 |                 labels_z[begin]=labels2idx['S']
105 |             elif len(t_list)>1:
106 |                 labels_z[begin]=labels2idx['B']
107 | 
108 |                 for i in range(len(t_list)-2):
109 |                     labels_z[begin+i+1]=labels2idx['I']
110 |                 labels_z[begin+len(t_list)-1]=labels2idx['E']
111 | 
112 |             z.append(labels_z)
113 |         return lex,y,z
114 | 
115 |     train_lex, train_y, train_z = get_lex_y(trn_sentence_list,trn_tag_list, words2idx)
116 |     test_lex, test_y, test_z = get_lex_y(test_sentence_list,test_tag_list,words2idx)
117 |     train_set = [train_lex, train_y, train_z]
118 |     test_set = [test_lex, test_y, test_z]
119 |     data_set = [train_set, test_set, dicts]
120 |     with open('data_set.pkl', 'w') as f:
121 |         cPickle.dump(data_set, f)
122 |     return data_set
123 | 
124 | 
125 | 
126 | def load_bin_vec(fname,vocab):  # load pre-trained vectors (text format) for words in vocab
127 |     k=0
128 |     word_vecs={}
129 |     with open(fname) as f:
130 |         for line in f:
131 |             word=line.strip().split(' ',1)[0]
132 |             embedding=line.strip().split(' ',1)[1].split()
133 |             if word in vocab:
134 |                 word_vecs[word]=np.asarray(embedding,dtype=np.float32)
135 |             k+=1
136 |             if k%10000==0:
137 |                 print "load_bin_vec %d" % k
138 | 
139 |     return word_vecs
140 | 
141 | def add_unknown_words(word_vecs, vocab, min_df=1, dim=300):
142 |     """
143 |     For words that occur in at least min_df documents, create a separate word vector.
144 |     0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
145 |     """
146 |     k=0
147 |     for w in vocab:
148 |         if w not in word_vecs:
149 |             word_vecs[w]=np.asarray(np.random.uniform(-0.25,0.25,dim),dtype=np.float32)
150 |             k+=1
151 |             if k % 10000==0:
152 |                 print "add_unknown_words %d" % k
153 |     return word_vecs
154 | 
155 | def get_embedding(w2v,words2idx,k=300):
156 |     embedding = np.zeros((len(w2v) + 2, k), dtype=np.float32)
157 |     for (w,idx) in words2idx.items():
158 |         embedding[idx]=w2v[w]
159 |     #embedding[0]=np.asarray(np.random.uniform(-0.25,0.25,k),dtype=np.float32)
160 |     with open('embedding.pkl','w') as f:
161 |         cPickle.dump(embedding,f)
162 |     return embedding
163 | 
164 | 
165 | if __name__ == '__main__':
166 |     data_folder = ["original_data/trnTweet","original_data/testTweet"]
167 |     data_set = get_train_test_dicts(data_folder)
168 |     print "data_set complete!"
169 |     dicts = data_set[2]
170 |     vocab = set(dicts['words2idx'].keys())
171 |     print "total num words: " + str(len(vocab))
172 |     print "dataset created!"
173 |     train_set, test_set, dicts=data_set
174 |     print len(train_set[0])
175 | 
176 |     # GoogleNews-vectors-negative300.txt holds the pre-trained word vectors
177 |     w2v_file='original_data/GoogleNews-vectors-negative300.txt'
178 |     w2v=load_bin_vec(w2v_file,vocab)
179 |     print "word2vec loaded"
180 |     add_unknown_words(w2v,vocab)
181 |     embedding=get_embedding(w2v,dicts['words2idx'])
182 |     print "embedding created"
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | 
213 | 
214 | 
--------------------------------------------------------------------------------
/data/main_example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import cPickle
3 | def main():
4 | 
5 |     f = open('data_set.pkl')
6 |     train_set, test_set, dicts = cPickle.load(f)
7 |     embedding = cPickle.load(open('embedding.pkl'))
8 | 
9 |     word2idx=dicts['words2idx']
10 |     labels2idx=dicts['labels2idx']
11 | 
12 |     train_lex, train_y, train_z = train_set
13 |     test_lex, test_y, test_z = test_set
14 |     # Split the test set into test and validation parts; the final train/valid/test ratio is 7:1:2
15 |     tr = int(len(test_lex)*0.67)
16 |     valid_lex, valid_y, valid_z = test_lex[tr:], test_y[tr:], test_z[tr:]
17 |     test_lex, test_y, test_z = test_lex[:tr],test_y[:tr],test_z[:tr]
18 | 
19 | 
20 |     print 'len(train_data) {}'.format(len(train_lex))
21 |     print 'len(valid_data) {}'.format(len(valid_lex))
22 |     print 'len(test_data) {}'.format(len(test_lex))
23 | 
24 |     vocab_size = len(word2idx)
25 |     print 'len(vocab) {}'.format(vocab_size)
26 |     print "Train started!"
27 | 
28 | main()
--------------------------------------------------------------------------------
/data/original_data/keyphrase_dataset.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fudannlp16/KeyPhrase-Extraction/507f36d2f03d6d89793859a50035391684ab5c52/data/original_data/keyphrase_dataset.tar.gz
--------------------------------------------------------------------------------
/data/original_data/readme:
--------------------------------------------------------------------------------
1 | trnTweet and testTweet are the raw data
2 | testTweet 33755
3 | trainTweet 78760
4 | Each line contains a tweet and its corresponding hashtag, separated by \t
5 | 
--------------------------------------------------------------------------------
/data/readme:
--------------------------------------------------------------------------------
1 | File structure
2 | original_data:
3 |     trnTweet
4 |     testTweet
5 |     GoogleNews-vectors-negative300.txt
6 | data_process.py
7 | data_set.pkl
8 | embedding.pkl
9 | main_example.py
10 | 
11 | 
12 | 
13 | 1. original_data is the raw data directory
14 | trnTweet and testTweet are the preprocessed raw tweets
15 | trainTweet 78760
16 | testTweet 33755
17 | Each line contains a tweet and its corresponding hashtag, separated by \t
18 | GoogleNews-vectors-negative300.txt contains the 300-dimensional word vectors pre-trained on Google News (available at https://code.google.com/archive/p/word2vec/)
19 | 
20 | 
21 | 2. data_process.py is the data-processing script
22 | Running python data_process.py generates the processed data files
23 | data_set.pkl and embedding.pkl
24 | 
25 | 
26 | 3. data_set.pkl is the data file
27 | data_set=[train_set,test_set,dicts]
28 | train_set=[train_lex,train_y,train_z]
29 | test_set=[test_lex,test_y,test_z]
30 | dicts = {'words2idx': words2idx, 'labels2idx': labels2idx} (labels2idx appears to be unused)
31 | 
32 | words2idx and labels2idx are dictionaries whose keys are words/labels and whose values are the corresponding integer ids
33 | train_lex,train_y,train_z and test_lex,test_y,test_z are the processed (id-converted) data
34 | train_lex holds the tweets. train_y marks each token of a tweet with 1 if it belongs to the hashtag and 0 otherwise. train_z carries positional tags: 0 means not part of the hashtag, 1 the beginning of the hashtag, 2 a middle position, 3 the end of the hashtag, and 4 a single-word hashtag, i.e. labels2idx = {'O': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4} (a toy example follows this readme)
35 | train_lex,train_y,train_z and test_lex,test_y,test_z are nested Python lists
36 | e.g. train_lex[0] is the first tweet, itself a list storing the ids of its words
37 | 
38 | 
39 | 4. embedding.pkl contains the 300-dimensional vectors pre-trained with GoogleNews-vectors-negative300 for every word in the dataset; the padding id is 0 and its vector is initialized to all zeros
40 | Usage
41 | embedding=cPickle.load(open('embedding.pkl'))
42 | 
43 | 
44 | 5. main_example.py is a usage example
45 | 
46 | 
47 | 
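The following toy illustration (not one of the repository files, written in the repository's Python 2 style) mirrors the y/z encoding that get_lex_y in data/data_process.py produces, for a made-up tweet/hashtag pair; the subsequence search is simplified to matching the first hashtag word.

```python
# Illustrative only: mirrors the y/z labels described in point 3 of data/readme.
tweet   = "i love machine learning conference".split()
hashtag = "machine learning conference".split()

begin = tweet.index(hashtag[0])          # keyphrase starts at token index 2

y = [0] * len(tweet)                     # 1 = token belongs to the hashtag
for i in range(len(hashtag)):
    y[begin + i] = 1                     # -> [0, 0, 1, 1, 1]

z = [0] * len(tweet)                     # labels2idx = {'O':0,'B':1,'I':2,'E':3,'S':4}
if len(hashtag) == 1:
    z[begin] = 4                         # S: single-word keyphrase
else:
    z[begin] = 1                         # B: first word of the keyphrase
    for i in range(1, len(hashtag) - 1):
        z[begin + i] = 2                 # I: middle words
    z[begin + len(hashtag) - 1] = 3      # E: last word
                                         # -> [0, 0, 1, 2, 3]
print y, z
```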
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import cPickle
4 | import random
5 | def atisfold():
6 |     f = open('data/data_set.pkl')
7 |     train_set, test_set, dicts = cPickle.load(f)
8 |     embedding = cPickle.load(open('data/embedding.pkl'))
9 |     return train_set, test_set,dicts,embedding
10 | 
11 | def pad_sentences(sentences, padding_word=0, forced_sequence_length=None):
12 |     if forced_sequence_length is None:
13 |         sequence_length=max(len(x) for x in sentences)
14 |     else:
15 |         sequence_length=forced_sequence_length
16 |     padded_sentences=[]
17 |     for i in xrange(len(sentences)):
18 |         sentence=sentences[i]
19 |         num_padding=sequence_length-len(sentence)
20 |         if num_padding<0:
21 |             padded_sentence=sentence[0:sequence_length]
22 |         else:
23 |             padded_sentence=sentence+[int(padding_word)]*num_padding
24 | 
25 |         padded_sentences.append(padded_sentence)
26 | 
27 |     return padded_sentences
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import tensorlayer as tl
4 | import numpy as np
5 | import time
6 | 
7 | import os
8 | import random
9 | import load
10 | import models.model as model
11 | import tools
12 | import sys
13 | 
14 | def main():
15 |     s={
16 |         'nh1':300,
17 |         'nh2':300,
18 |         'win':3,
19 |         'emb_dimension':300,
20 |         'lr':0.1,
21 |         'lr_decay':0.5,
22 |         'max_grad_norm':5,
23 |         'seed':345,
24 |         'nepochs':150,
25 |         'batch_size':16,
26 |         'keep_prob':0.5,
27 |         'check_dir':'./checkpoints',
28 |         'display_test_per':3,
29 |         'lr_decay_per':10
30 |     }
31 | 
32 |     train_set,test_set,dic,embedding=load.atisfold()
33 | 
34 | 
35 |     idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())
36 |     idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems())
37 | 
38 |     train_lex, train_y, train_z = train_set
39 | 
40 |     tr = int(len(train_lex)*0.9)
41 |     valid_lex, valid_y, valid_z = train_lex[tr:], train_y[tr:], train_z[tr:]
42 |     train_lex, train_y, train_z = train_lex[:tr], train_y[:tr], train_z[:tr]
43 |     test_lex, test_y, test_z = test_set
44 | 
45 |     print 'len(train_data) {}'.format(len(train_lex))
46 |     print 'len(valid_data) {}'.format(len(valid_lex))
47 |     print 'len(test_data) {}'.format(len(test_lex))
48 | 
49 |     vocab = set(dic['words2idx'].keys())
50 |     vocsize = len(vocab)
51 |     print 'len(vocab) {}'.format(vocsize)
52 |     print "Train started!"
53 | 54 | y_nclasses = 2 55 | z_nclasses = 5 56 | 57 | nsentences = len(train_lex) 58 | 59 | 60 | with tf.Session() as sess: 61 | 62 | rnn=model.Model( 63 | nh1=s['nh1'], 64 | nh2=s['nh2'], 65 | ny=y_nclasses, 66 | nz=z_nclasses, 67 | de=s['emb_dimension'], 68 | cs=s['win'], 69 | lr=s['lr'], 70 | lr_decay=s['lr_decay'], 71 | embedding=embedding, 72 | max_gradient_norm=s['max_grad_norm'], 73 | model_cell='lstm' 74 | ) 75 | 76 | checkpoint_dir=s['check_dir'] 77 | if not os.path.exists(checkpoint_dir): 78 | os.mkdir(checkpoint_dir) 79 | checkpoint_prefix=os.path.join(checkpoint_dir,'model') 80 | 81 | def train_step(cwords,label_y,label_z): 82 | feed={ 83 | rnn.input_x:cwords, 84 | rnn.input_y:label_y, 85 | rnn.input_z:label_z, 86 | rnn.keep_prob:s['keep_prob'], 87 | rnn.batch_size:s['batch_size'] 88 | } 89 | fetches=[rnn.loss,rnn.train_op] 90 | loss,_=sess.run(fetches=fetches,feed_dict=feed) 91 | return loss 92 | 93 | def dev_step(cwords): 94 | feed={ 95 | rnn.input_x:cwords, 96 | rnn.keep_prob:1.0, 97 | rnn.batch_size:s['batch_size'] 98 | } 99 | fetches=rnn.sz_pred 100 | sz_pred=sess.run(fetches=fetches,feed_dict=feed) 101 | return sz_pred 102 | 103 | saver=tf.train.Saver(tf.all_variables()) 104 | sess.run(tf.initialize_all_variables()) 105 | 106 | best_f=-1 107 | best_e=0 108 | test_best_f=-1 109 | test_best_e=0 110 | best_res=None 111 | test_best_res=None 112 | for e in xrange(s['nepochs']): 113 | tools.shuffle([train_lex,train_y,train_z],s['seed']) 114 | t_start=time.time() 115 | for step,batch in enumerate(tl.iterate.minibatches(train_lex,zip(train_y,train_z),batch_size=s['batch_size'])): 116 | input_x,target=batch 117 | label_y,label_z=zip(*target) 118 | input_x=load.pad_sentences(input_x) 119 | label_y=load.pad_sentences(label_y) 120 | label_z=load.pad_sentences(label_z) 121 | cwords=tools.contextwin_2(input_x,s['win']) 122 | loss=train_step(cwords,label_y,label_z) 123 | 124 | print 'loss %.2f' % loss,' [learning] epoch %i>> %2.2f%%' % (e,s['batch_size']*step*100./nsentences),'completed in %.2f (sec) <<\r' % (time.time()-t_start), 125 | 126 | sys.stdout.flush() 127 | 128 | #VALID 129 | 130 | predictions_valid=[] 131 | predictions_test=[] 132 | groundtruth_valid=[] 133 | groundtruth_test=[] 134 | for batch in tl.iterate.minibatches(valid_lex,valid_z,batch_size=s['batch_size']): 135 | x,z=batch 136 | x=load.pad_sentences(x) 137 | x=tools.contextwin_2(x,s['win']) 138 | predictions_valid.extend(dev_step(x)) 139 | groundtruth_valid.extend(z) 140 | 141 | res_valid=tools.conlleval(predictions_valid,groundtruth_valid,'') 142 | 143 | if res_valid['f']>best_f: 144 | best_f=res_valid['f'] 145 | best_e=e 146 | best_res=res_valid 147 | print '\nVALID new best:',res_valid 148 | path = saver.save(sess=sess, save_path=checkpoint_prefix, global_step=e) 149 | print "Save model checkpoint to {}".format(path) 150 | else: 151 | print '\nVALID new curr:',res_valid 152 | 153 | #TEST 154 | if e%s['display_test_per']==0: 155 | for batch in tl.iterate.minibatches(test_lex, test_z, batch_size=s['batch_size']): 156 | x,z = batch 157 | x = load.pad_sentences(x) 158 | x = tools.contextwin_2(x, s['win']) 159 | predictions_test.extend(dev_step(x)) 160 | groundtruth_test.extend(z) 161 | 162 | 163 | res_test = tools.conlleval(predictions_test, groundtruth_test, '') 164 | 165 | if res_test['f'] > test_best_f: 166 | test_best_f = res_test['f'] 167 | test_best_e=e 168 | test_best_res=res_test 169 | print 'TEST new best:',res_test 170 | else: 171 | print 'TEST new curr:',res_test 172 | 173 | # learning rate decay if no 
improvement in 10 epochs 174 | if e-best_e>s['lr_decay_per']: 175 | sess.run(fetches=rnn.learning_rate_decay_op) 176 | lr=sess.run(fetches=rnn.lr) 177 | print 'learning rate:%f' % lr 178 | if lr<1e-5:break 179 | print 180 | 181 | print "Train finished!" 182 | print 'Valid Best Result: epoch %d: ' % (best_e),best_res 183 | print 'Test Best Result: epoch %d: ' %(test_best_e),test_best_res 184 | 185 | if __name__ == '__main__': 186 | main() 187 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fudannlp16/KeyPhrase-Extraction/507f36d2f03d6d89793859a50035391684ab5c52/models/__init__.py -------------------------------------------------------------------------------- /models/bi_lstm_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | class Model(object): 7 | 8 | def __init__(self, 9 | nh1, 10 | nh2, 11 | ny, 12 | nz, 13 | de, 14 | cs, 15 | lr, 16 | lr_decay, 17 | embedding, 18 | max_gradient_norm, 19 | model_cell='rnn', 20 | model='basic_model', 21 | nonstatic=False): 22 | 23 | self.batch_size = tf.placeholder(dtype=tf.int32, shape=None) 24 | self.input_x=tf.placeholder(tf.int32,shape=[None,None,cs],name='input_x') 25 | self.input_y=tf.placeholder(tf.int32,shape=[None,None],name="input_y") 26 | self.input_z=tf.placeholder(tf.int32,shape=[None,None],name='input_z') 27 | self.keep_prob=tf.placeholder(dtype=tf.float32,name='keep_prob') 28 | 29 | self.lr=tf.Variable(lr,dtype=tf.float32) 30 | 31 | self.learning_rate_decay_op = self.lr.assign( 32 | self.lr * lr_decay) 33 | 34 | 35 | #Creating embedding input 36 | with tf.device("/cpu:0"),tf.name_scope('embedding'): 37 | if nonstatic: 38 | W=tf.constant(embedding,name='embW',dtype=tf.float32) 39 | else: 40 | W=tf.Variable(embedding,name='embW',dtype=tf.float32) 41 | inputs=tf.nn.embedding_lookup(W,self.input_x) 42 | inputs=tf.reshape(inputs,[self.batch_size,-1,cs*de]) 43 | 44 | #Droupout embedding input 45 | inputs=tf.nn.dropout(inputs,keep_prob=self.keep_prob,name='drop_inputs') 46 | 47 | #Create the internal multi-layer cell for rnn 48 | if model_cell=='rnn': 49 | single_cell0=tf.nn.rnn_cell.BasicRNNCell(nh1) 50 | single_cell1=tf.nn.rnn_cell.BasicRNNCell(nh1) 51 | single_cell2=tf.nn.rnn_cell.BasicRNNCell(nh2) 52 | elif model_cell=='lstm': 53 | single_cell0=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 54 | single_cell1=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 55 | single_cell2=tf.nn.rnn_cell.BasicLSTMCell(nh2,state_is_tuple=True) 56 | elif model_cell=='gru': 57 | single_cell0=tf.nn.rnn_cell.GRUCell(nh1) 58 | single_cell1=tf.nn.rnn_cell.GRUCell(nh1) 59 | single_cell2=tf.nn.rnn_cell.GRUCell(nh2) 60 | else: 61 | raise 'model_cell error!' 
62 | #DropoutWrapper rnn_cell 63 | single_cell0 = tf.nn.rnn_cell.DropoutWrapper(single_cell0,output_keep_prob=self.keep_prob) 64 | single_cell1 = tf.nn.rnn_cell.DropoutWrapper(single_cell1, output_keep_prob=self.keep_prob) 65 | single_cell2 = tf.nn.rnn_cell.DropoutWrapper(single_cell2, output_keep_prob=self.keep_prob) 66 | 67 | self.init_state=single_cell1.zero_state(self.batch_size,dtype=tf.float32) 68 | 69 | 70 | #Bi-RNN1 71 | 72 | x_len = tf.cast(tf.shape(inputs)[1], tf.int64) 73 | batch=2 74 | with tf.variable_scope('bi_rnn1'): 75 | self.outputs1,self.state1=tf.nn.bidirectional_dynamic_rnn( 76 | single_cell0, 77 | single_cell1, 78 | inputs, 79 | sequence_length=[x_len]*batch, 80 | dtype=tf.float32 81 | ) 82 | 83 | self.outputs1=tf.concat(2,self.outputs1) 84 | 85 | 86 | #RNN2 87 | with tf.variable_scope('rnn2'): 88 | self.outputs2,self.state2=tf.nn.dynamic_rnn( 89 | cell=single_cell2, 90 | inputs=self.outputs1, 91 | initial_state=self.init_state, 92 | dtype=tf.float32 93 | ) 94 | 95 | #outputs_y 96 | with tf.variable_scope('output_sy'): 97 | w_y=tf.get_variable("softmax_w_y",[2*nh1,ny]) 98 | b_y=tf.get_variable("softmax_b_y",[ny]) 99 | outputs1=tf.reshape(self.outputs1,[-1,2*nh1]) 100 | sy=tf.nn.xw_plus_b(outputs1,w_y,b_y) 101 | self.sy_pred = tf.reshape(tf.argmax(sy, 1), [self.batch_size, -1]) 102 | #outputs_z 103 | with tf.variable_scope('output_sz'): 104 | w_z = tf.get_variable("softmax_w_z", [nh2, nz]) 105 | b_z = tf.get_variable("softmax_b_z", [nz]) 106 | outputs2 = tf.reshape(self.outputs2, [-1, nh2]) 107 | sz = tf.nn.xw_plus_b(outputs2, w_z,b_z) 108 | self.sz_pred = tf.reshape(tf.argmax(sz, 1), [self.batch_size, -1]) 109 | #loss 110 | with tf.variable_scope('loss'): 111 | label_y = tf.reshape(self.input_y, [-1]) 112 | loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(sy, label_y) 113 | label_z = tf.reshape(self.input_z, [-1]) 114 | loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(sz, label_z) 115 | self.loss=tf.reduce_sum(0.5*loss1+0.5*loss2)/tf.cast(self.batch_size,tf.float32) 116 | 117 | tvars=tf.trainable_variables() 118 | grads,_=tf.clip_by_global_norm(tf.gradients(self.loss,tvars),max_gradient_norm) 119 | optimizer=tf.train.GradientDescentOptimizer(self.lr) 120 | self.train_op=optimizer.apply_gradients(zip(grads,tvars)) 121 | 122 | def cost(output, target): 123 | # Compute cross entropy for each frame. 124 | cross_entropy = target * tf.log(output) 125 | cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) 126 | mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2)) 127 | cross_entropy *= mask 128 | # Average over actual sequence lengths. 
129 | cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1) 130 | cross_entropy /= tf.reduce_sum(mask, reduction_indices=1) 131 | return tf.reduce_mean(cross_entropy) -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | class Model(object): 6 | 7 | def __init__(self, 8 | nh1, 9 | nh2, 10 | ny, 11 | nz, 12 | de, 13 | cs, 14 | lr, 15 | lr_decay, 16 | embedding, 17 | max_gradient_norm, 18 | model_cell='rnn', 19 | model='basic_model', 20 | nonstatic=False): 21 | 22 | self.batch_size = tf.placeholder(dtype=tf.int32, shape=None) 23 | self.input_x=tf.placeholder(tf.int32,shape=[None,None,cs],name='input_x') 24 | self.input_y=tf.placeholder(tf.int32,shape=[None,None],name="input_y") 25 | self.input_z=tf.placeholder(tf.int32,shape=[None,None],name='input_z') 26 | self.keep_prob=tf.placeholder(dtype=tf.float32,name='keep_prob') 27 | 28 | self.lr=tf.Variable(lr,dtype=tf.float32) 29 | 30 | self.learning_rate_decay_op = self.lr.assign( 31 | self.lr * lr_decay) 32 | 33 | 34 | #Creating embedding input 35 | with tf.device("/cpu:0"),tf.name_scope('embedding'): 36 | if nonstatic: 37 | W=tf.constant(embedding,name='embW',dtype=tf.float32) 38 | else: 39 | W=tf.Variable(embedding,name='embW',dtype=tf.float32) 40 | inputs=tf.nn.embedding_lookup(W,self.input_x) 41 | inputs=tf.reshape(inputs,[self.batch_size,-1,cs*de]) 42 | 43 | #Droupout embedding input 44 | inputs=tf.nn.dropout(inputs,keep_prob=self.keep_prob,name='drop_inputs') 45 | 46 | #Create the internal multi-layer cell for rnn 47 | if model_cell=='rnn': 48 | single_cell1=tf.nn.rnn_cell.BasicRNNCell(nh1) 49 | single_cell2=tf.nn.rnn_cell.BasicRNNCell(nh2) 50 | elif model_cell=='lstm': 51 | single_cell1=tf.nn.rnn_cell.BasicLSTMCell(nh1,state_is_tuple=True) 52 | single_cell2=tf.nn.rnn_cell.BasicLSTMCell(nh2,state_is_tuple=True) 53 | elif model_cell=='gru': 54 | single_cell1=tf.nn.rnn_cell.GRUCell(nh1) 55 | single_cell2=tf.nn.rnn_cell.GRUCell(nh2) 56 | else: 57 | raise 'model_cell error!' 
58 | #DropoutWrapper rnn_cell 59 | single_cell1 = tf.nn.rnn_cell.DropoutWrapper(single_cell1, output_keep_prob=self.keep_prob) 60 | single_cell2 = tf.nn.rnn_cell.DropoutWrapper(single_cell2, output_keep_prob=self.keep_prob) 61 | 62 | self.init_state=single_cell1.zero_state(self.batch_size,dtype=tf.float32) 63 | 64 | #RNN1 65 | with tf.variable_scope('rnn1'): 66 | self.outputs1,self.state1=tf.nn.dynamic_rnn( 67 | cell=single_cell1, 68 | inputs=inputs, 69 | initial_state=self.init_state, 70 | dtype=tf.float32 71 | ) 72 | 73 | #RNN2 74 | with tf.variable_scope('rnn2'): 75 | self.outputs2,self.state2=tf.nn.dynamic_rnn( 76 | cell=single_cell2, 77 | inputs=self.outputs1, 78 | initial_state=self.init_state, 79 | dtype=tf.float32 80 | ) 81 | 82 | #outputs_y 83 | with tf.variable_scope('output_sy'): 84 | w_y=tf.get_variable("softmax_w_y",[nh1,ny]) 85 | b_y=tf.get_variable("softmax_b_y",[ny]) 86 | outputs1=tf.reshape(self.outputs1,[-1,nh1]) 87 | sy=tf.nn.xw_plus_b(outputs1,w_y,b_y) 88 | self.sy_pred = tf.reshape(tf.argmax(sy, 1), [self.batch_size, -1]) 89 | #outputs_z 90 | with tf.variable_scope('output_sz'): 91 | w_z = tf.get_variable("softmax_w_z", [nh2, nz]) 92 | b_z = tf.get_variable("softmax_b_z", [nz]) 93 | outputs2 = tf.reshape(self.outputs2, [-1, nh2]) 94 | sz = tf.nn.xw_plus_b(outputs2, w_z,b_z) 95 | self.sz_pred = tf.reshape(tf.argmax(sz, 1), [self.batch_size, -1]) 96 | #loss 97 | with tf.variable_scope('loss'): 98 | label_y = tf.reshape(self.input_y, [-1]) 99 | loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(sy, label_y) 100 | label_z = tf.reshape(self.input_z, [-1]) 101 | loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(sz, label_z) 102 | self.loss=tf.reduce_sum(0.5*loss1+0.5*loss2)/tf.cast(self.batch_size,tf.float32) 103 | 104 | tvars=tf.trainable_variables() 105 | grads,_=tf.clip_by_global_norm(tf.gradients(self.loss,tvars),max_gradient_norm) 106 | optimizer=tf.train.GradientDescentOptimizer(self.lr) 107 | self.train_op=optimizer.apply_gradients(zip(grads,tvars)) 108 | 109 | def cost(output, target): 110 | # Compute cross entropy for each frame. 111 | cross_entropy = target * tf.log(output) 112 | cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2) 113 | mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2)) 114 | cross_entropy *= mask 115 | # Average over actual sequence lengths. 
116 |         cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
117 |         cross_entropy /= tf.reduce_sum(mask, reduction_indices=1)
118 |         return tf.reduce_mean(cross_entropy)
119 | 
120 | 
121 | 
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import tensorlayer as tl
4 | import numpy as np
5 | import time
6 | 
7 | import os
8 | import random
9 | import load
10 | import models.model as model
11 | 
12 | import tools
13 | import sys
14 | 
15 | def main():
16 |     s={
17 |         'nh1':300,
18 |         'nh2':300,
19 |         'win':3,
20 |         'emb_dimension':300,
21 |         'lr':0.1,
22 |         'lr_decay':0.5,
23 |         'max_grad_norm':5,
24 |         'seed':345,
25 |         'nepochs':50,
26 |         'batch_size':16,
27 |         'keep_prob':1.0,
28 |         'check_dir':'./checkpoints',
29 |         'display_test_per':5,
30 |         'lr_decay_per':10
31 |     }
32 | 
33 | 
34 |     # load the dataset
35 |     train_set,test_set,dic,embedding=load.atisfold()
36 |     idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())
37 |     idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems())
38 | 
39 |     vocab = set(dic['words2idx'].keys())
40 |     vocsize = len(vocab)
41 | 
42 |     test_lex, test_y, test_z = test_set
43 | 
44 |     y_nclasses = 2
45 |     z_nclasses = 5
46 | 
47 | 
48 |     with tf.Session() as sess:
49 | 
50 |         rnn = model.Model(
51 |             nh1=s['nh1'],
52 |             nh2=s['nh2'],
53 |             ny=y_nclasses,
54 |             nz=z_nclasses,
55 |             de=s['emb_dimension'],
56 |             cs=s['win'],
57 |             lr=s['lr'],
58 |             lr_decay=s['lr_decay'],
59 |             embedding=embedding,
60 |             max_gradient_norm=s['max_grad_norm'],
61 |             model_cell='lstm'
62 |         )
63 | 
64 |         checkpoint_dir = s['check_dir']
65 |         saver = tf.train.Saver(tf.all_variables())
66 |         ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
67 |         if ckpt and ckpt.model_checkpoint_path:
68 |             saver.restore(sess, ckpt.model_checkpoint_path)
69 | 
70 |         def dev_step(cwords):
71 |             feed={
72 |                 rnn.input_x:cwords,
73 |                 rnn.keep_prob:1.0,
74 |                 rnn.batch_size:s['batch_size']
75 |             }
76 |             fetches=rnn.sz_pred
77 |             sz_pred=sess.run(fetches=fetches,feed_dict=feed)
78 |             return sz_pred
79 |         print "Test results:"
80 |         predictions_test=[]
81 |         groundtruth_test=[]
82 |         for batch in tl.iterate.minibatches(test_lex, test_z, batch_size=s['batch_size']):
83 |             x, z = batch
84 |             x = load.pad_sentences(x)
85 |             x = tools.contextwin_2(x, s['win'])
86 |             predictions_test.extend(dev_step(x))
87 |             groundtruth_test.extend(z)
88 | 
89 |         res_test = tools.conlleval(predictions_test, groundtruth_test, '')
90 | 
91 |         print res_test
92 | 
93 | if __name__ == '__main__':
94 |     main()
95 | 
96 | 
97 | 
98 | 
99 | 
--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | 
4 | def shuffle(lol,seed):
5 |     '''
6 |     lol :: list of list as input
7 |     seed :: seed the shuffling
8 | 
9 |     shuffle inplace each list in the same order
10 |     '''
11 |     for l in lol:
12 |         random.seed(seed)
13 |         random.shuffle(l)
14 | 
15 | def contextwin(l, win):
16 |     '''
17 |     win :: int corresponding to the size of the window
18 |     given a list of indexes composing a sentence
19 |     it will return a list of list of indexes corresponding
20 |     to context windows surrounding each word in the sentence
21 |     '''
22 |     assert (win % 2) == 1
23 |     assert win >= 1
24 |     l = list(l)
25 | 
26 |     lpadded = win/2 * [0] + l + win/2 * [0]
27 |     out = [ lpadded[i:i+win] for i in range(len(l)) ]
28 | 
29 |     assert len(out) == len(l)
30 |     return out
31 | 
32 | def contextwin_2(ls,win):
33 |     assert (win % 2) == 1
34 |     assert win >= 1
35 |     outs=[]
36 |     for l in ls:
37 |         outs.append(contextwin(l,win))
38 |     return outs
39 | 
40 | def getKeyphraseList(l):
41 |     res, now = [], []
42 |     for i in xrange(len(l)):
43 |         if l[i] != 0:
44 |             now.append(str(i))
45 |         if l[i] == 0 or i == len(l) - 1:
46 |             if len(now) != 0:
47 |                 res.append(' '.join(now))
48 |             now = []
49 |     return set(res)
50 | 
51 | def conlleval(predictions, groundtruth, file):
52 |     assert len(predictions) == len(groundtruth)
53 |     res = {}
54 |     all_cnt, good_cnt = len(predictions), 0
55 |     p_cnt, r_cnt, pr_cnt = 0, 0, 0
56 |     for i in range(all_cnt):
57 |         # print i
58 |         if all(predictions[i][0:len(groundtruth[i])] == groundtruth[i]):
59 |             good_cnt += 1
60 |         pKeyphraseList = getKeyphraseList(predictions[i][0:len(groundtruth[i])])
61 |         gKeyphraseList = getKeyphraseList(groundtruth[i])
62 |         p_cnt += len(pKeyphraseList)
63 |         r_cnt += len(gKeyphraseList)
64 |         pr_cnt += len(pKeyphraseList & gKeyphraseList)
65 |     res['a'] = 1.0*good_cnt/all_cnt
66 |     res['p'] = 1.0*pr_cnt/p_cnt
67 |     res['r'] = 1.0*pr_cnt/r_cnt
68 |     res['f'] = 2.0*res['p']*res['r']/(res['p']+res['r']) if (res['p']+res['r']) > 0 else 0.0
69 |     return res
70 | 
71 | 
72 | 
--------------------------------------------------------------------------------
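For reference, a minimal sketch (not one of the repository files) of how the helpers in load.py and tools.py fit together on toy data, written in the repository's Python 2 style; the toy ids and labels are made up for illustration, and it assumes both modules are importable from the repository root.

```python
# -*- coding: utf-8 -*-
# Illustrative sketch only: exercises pad_sentences, contextwin_2 and conlleval
# on toy data, without a trained model. Run from the repository root (Python 2).
import numpy as np
import load
import tools

# Two "tweets" of different lengths, already mapped to word ids.
batch_x = [[4, 5, 6], [7, 8]]
batch_x = load.pad_sentences(batch_x)        # -> [[4, 5, 6], [7, 8, 0]]
cwords = tools.contextwin_2(batch_x, 3)      # one window of size 3 per token
print cwords[0]                              # -> [[0, 4, 5], [4, 5, 6], [5, 6, 0]]

# conlleval compares predicted z-label sequences against the gold ones.
# Predictions come back from sess.run as numpy arrays; gold labels are plain lists.
pred = [np.array([1, 3, 0]), np.array([0, 4, 0])]   # B E O  /  O S O
gold = [[1, 3, 0], [0, 4, 0]]
print tools.conlleval(pred, gold, '')
```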