├── README.md ├── code ├── README.MD ├── data_preprocess.py ├── glove.sh ├── mydemo.sh ├── runme.sh └── tradition_feat.py ├── data ├── README.MD └── checkpoint │ └── README.MD ├── model ├── Fast_attention.py ├── Fast_attention_withsta.py ├── Fast_attention_withsta2.py ├── RCNN.py ├── README.MD ├── RNN_attention.py ├── RNN_attention2.py ├── RNN_attention_withsta.py ├── RNN_attention_withsta2.py ├── TextCNN.py └── rnnpool.py ├── stacking ├── README.MD ├── generate_presudo_labels.py └── stack.py └── 达观杯-8-redhand.pptx /README.md: -------------------------------------------------------------------------------- 1 | # Text-classifier 2 | 2018达观杯文本智能处理比赛,文本分类主题,最终排名 8/3462,F1score为0.79895 3 | 4 | [链接](http://www.pkbigdata.com/common/cmpt/%E2%80%9C%E8%BE%BE%E8%A7%82%E6%9D%AF%E2%80%9D%E6%96%87%E6%9C%AC%E6%99%BA%E8%83%BD%E5%A4%84%E7%90%86%E6%8C%91%E6%88%98%E8%B5%9B_%E7%AB%9E%E8%B5%9B%E4%BF%A1%E6%81%AF.html) 5 | -------------------------------------------------------------------------------- /code/README.MD: -------------------------------------------------------------------------------- 1 | run code 2 | -------------------------------------------------------------------------------- /code/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd, numpy as np 2 | from tqdm import tqdm 3 | 4 | column='word_seg' 5 | labels=pd.read_csv('../data/train_set.csv',usecols=['class']).values 6 | labels=labels.reshape(-1) 7 | np.save('../data/labels.npy',labels) 8 | train = pd.read_csv('../data/train_set.csv',usecols=[column]) 9 | test=pd.read_csv('../data/test_set.csv',usecols=[column]) 10 | alldoc=np.concatenate((train[column].values,test[column].values),axis=0) 11 | 12 | import collections 13 | def build_vocab(data): 14 | ls=collections.Counter() 15 | for row in tqdm(range(data.shape[0])): 16 | ls.update(collections.Counter(data[row].split())) 17 | return ls 18 | import operator 19 | word=build_vocab(alldoc) 20 | temp = sorted(word.items(),key=operator.itemgetter(1),reverse=True) 21 | 22 | word=dict(filter(lambda x: (x[1]>1)&(x[1]<4000000),temp)) 23 | word2idx={} 24 | for i,k in enumerate(word): 25 | word2idx[k]=i 26 | idx2word=list(word) 27 | print (len(idx2word)) 28 | 29 | def build_word(data,word2idx,maxlen): 30 | ls=data[column].values 31 | embed=np.ones((ls.shape[0],maxlen),dtype=np.int32)*679249 32 | for row in tqdm(range(ls.shape[0])): 33 | s=ls[row].split() 34 | cnt=0 35 | for w in s: 36 | if w in word2idx: 37 | embed[row,cnt]=word2idx[w] 38 | cnt+=1 39 | if cnt>=maxlen: 40 | break 41 | return embed 42 | 43 | train_embed=build_word(train,word2idx,maxlen=1000) 44 | test_embed=build_word(test,word2idx,maxlen=1000) 45 | import gc 46 | gc.collect() 47 | np.save('../data/train_embed.npy',train_embed) 48 | np.save('../data/test_embed.npy',test_embed) 49 | 50 | print ('using glove to train') 51 | alldoc=pd.concat([train,test]) 52 | alldoc.to_csv('alldoc.txt',header=None,index=None) 53 | import subprocess 54 | subprocess.call('./glove.sh',shell=True) 55 | 56 | with open('glove/vectors.txt', 'r+') as f: 57 | content = f.read() 58 | f.seek(0, 0) 59 | f.write('679242 100\n'+content) 60 | 61 | from gensim.models import Word2Vec 62 | import gensim 63 | model = gensim.models.KeyedVectors.load_word2vec_format('glove/vectors.txt', binary=False) 64 | 65 | 66 | word_vec=np.zeros([679250,100],dtype=np.float32) 67 | cnt=0 68 | for i in range(679242): 69 | try: 70 | word_vec[i]=model.wv.word_vec(idx2word[i]) 71 | except: 72 | print 
(idx2word[i],word[idx2word[i]]) 73 | word_vec[i]=np.random.rand() 74 | print (cnt) 75 | np.save('../data/glove.npy',word_vec) 76 | 77 | 78 | alldoc=np.concatenate((train[column].values,test[column].values),axis=0) 79 | print ('now train word2vec') 80 | import gensim 81 | TaggededDocument = gensim.models.doc2vec.TaggedDocument 82 | class sentences_generator(): 83 | def __init__(self, doc): 84 | self.doc = doc 85 | def __iter__(self): 86 | for line in self.doc: 87 | sentence = line.split() 88 | yield sentence 89 | 90 | from gensim.models import word2vec 91 | sents=sentences_generator(alldoc) 92 | print ('start training,need 2hours or more') 93 | model = word2vec.Word2Vec(sents, sg=1, size=100, window=5, min_count=2, hs=1, workers=8,iter=20) 94 | word_vec=np.zeros([679250,100],dtype=np.float32) 95 | for i in range(679242): 96 | word_vec[i]=model.wv.word_vec(idx2word[i]) 97 | np.save('../data/word_vec.npy',word_vec) 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /code/glove.sh: -------------------------------------------------------------------------------- 1 | git clone http://github.com/stanfordnlp/glove 2 | cd glove && make 3 | cd .. 4 | cp mydemo.sh glove 5 | cp alldoc.txt glove 6 | cd glove 7 | ./mydemo.sh 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /code/mydemo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | make 4 | 5 | CORPUS=alldoc.txt 6 | VOCAB_FILE=vocab.txt 7 | COOCCURRENCE_FILE=cooccurrence.bin 8 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 9 | BUILDDIR=build 10 | SAVE_FILE=vectors 11 | VERBOSE=2 12 | MEMORY=4.0 13 | VOCAB_MIN_COUNT=2 14 | VECTOR_SIZE=100 15 | MAX_ITER=50 16 | WINDOW_SIZE=15 17 | BINARY=2 18 | NUM_THREADS=8 19 | X_MAX=10 20 | 21 | echo 22 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE" 23 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 24 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE" 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE" 27 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 28 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE" 29 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 30 | -------------------------------------------------------------------------------- /code/runme.sh: -------------------------------------------------------------------------------- 1 | echo "copy data" 2 | cp ../data/train_set.csv ../team/data/member1_data 3 | cp ../data/test_set.csv ../team/data/member1_data 4 | 5 | echo "run team first" 6 | cd ../team/code 7 | ./train.sh 8 | 9 | 10 | echo "run team2" 11 | cd ../../code 12 | 13 | echo "data_preprocess" 14 | python data_preprocess.py 15 | echo "tradition feature 
for word" 16 | python tradition_feat.py "word_seg" 17 | echo "tradition feature for char" 18 | python tradition_feat.py "article" 19 | 20 | echo "merge team_data" 21 | cp ../team/data/member1_data/team_data/train_x.npy ../stacking/team 22 | cp ../team/data/member1_data/team_data/test_x.npy ../stacking/team 23 | cp ../team/data/member1_data/article_train_tfidf_svd.npy ../data 24 | cp ../team/data/member1_data/article_test_tfidf_svd.npy ../data 25 | 26 | echo "generate presudo labels" 27 | cd .. 28 | cd stacking 29 | python generate_presudo_labels.py 30 | 31 | echo "deep model" 32 | cd .. 33 | cd model 34 | 35 | echo "Fast_attention" 36 | python Fast_attention.py 1 37 | python Fast_attention.py 2 38 | python Fast_attention.py 3 39 | python Fast_attention.py 4 40 | python Fast_attention.py 5 41 | echo "Fast_attention_withsta" 42 | python Fast_attention_withsta.py 1 43 | python Fast_attention_withsta.py 2 44 | python Fast_attention_withsta.py 3 45 | python Fast_attention_withsta.py 4 46 | python Fast_attention_withsta.py 5 47 | 48 | echo "Fast_attention_withsta2" 49 | python Fast_attention_withsta2.py 1 50 | python Fast_attention_withsta2.py 2 51 | python Fast_attention_withsta2.py 3 52 | python Fast_attention_withsta2.py 4 53 | python Fast_attention_withsta2.py 5 54 | 55 | echo "TextCNN" 56 | python TextCNN.py 1 57 | python TextCNN.py 2 58 | python TextCNN.py 3 59 | python TextCNN.py 4 60 | python TextCNN.py 5 61 | 62 | echo "RCNN" 63 | python RCNN.py 1 64 | python RCNN.py 2 65 | python RCNN.py 3 66 | python RCNN.py 4 67 | python RCNN.py 5 68 | 69 | echo "rnnpool" 70 | python rnnpool.py 1 71 | python rnnpool.py 2 72 | python rnnpool.py 3 73 | python rnnpool.py 4 74 | python rnnpool.py 5 75 | 76 | echo "RNN_attention" 77 | python RNN_attention.py 1 78 | python RNN_attention.py 2 79 | python RNN_attention.py 3 80 | python RNN_attention.py 4 81 | python RNN_attention.py 5 82 | 83 | 84 | echo "RNN_attention2" 85 | python RNN_attention2.py 1 86 | python RNN_attention2.py 2 87 | python RNN_attention2.py 3 88 | python RNN_attention2.py 4 89 | python RNN_attention2.py 5 90 | 91 | echo "RNN_attention_withsta" 92 | python RNN_attention_withsta.py 1 93 | python RNN_attention_withsta.py 2 94 | python RNN_attention_withsta.py 3 95 | python RNN_attention_withsta.py 4 96 | python RNN_attention_withsta.py 5 97 | 98 | echo "RNN_attention_withsta2" 99 | python RNN_attention_withsta2.py 1 100 | python RNN_attention_withsta2.py 2 101 | python RNN_attention_withsta2.py 3 102 | python RNN_attention_withsta2.py 4 103 | python RNN_attention_withsta2.py 5 104 | 105 | 106 | echo "stacking" 107 | cd .. 
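# Note: each deep model above is run five times, once per cross-validation fold; the fold
# index (1-5) is read as sys.argv[1] and also selects which slice of the test set is used
# for pseudo-labelled training. stack.py below then combines the saved per-fold predictions
# (together with the tfidf outputs written under ../stacking/) into the final blended result.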
108 | cd stacking 109 | python stack.py 110 | -------------------------------------------------------------------------------- /code/tradition_feat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | column=sys.argv[1] 3 | print (column) 4 | import pandas as pd, numpy as np 5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6 | import gc 7 | 8 | train = pd.read_csv('../data/train_set.csv',usecols=[column]) 9 | test=pd.read_csv('../data/test_set.csv',usecols=[column]) 10 | 11 | vec = TfidfVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8,use_idf=0,smooth_idf=1,stop_words=['816903','520477'], 12 | sublinear_tf=1) 13 | 14 | train_term_doc = vec.fit_transform(train[column]) 15 | test_term_doc=vec.transform(test[column]) 16 | labels=np.load('../data/labels.npy') 17 | 18 | from sklearn.linear_model import SGDClassifier 19 | clf = SGDClassifier(loss='log',n_jobs=-1,max_iter=15,random_state=2018) 20 | 21 | gc.collect() 22 | np.random.seed(2018) 23 | r1=(np.random.uniform(0,1,train_term_doc.shape[0])*5).astype(np.int32) 24 | 25 | val_tf=np.zeros((102277,19)) 26 | test_tf=np.zeros((102277,19)) 27 | 28 | for cv_fold in range(5): 29 | 30 | filter_t=(r1!=cv_fold) 31 | filter_v=(r1==cv_fold) 32 | x_train,y_train=train_term_doc[filter_t].copy(),labels[filter_t] 33 | x_val,y_val=train_term_doc[~filter_t].copy(),labels[~filter_t] 34 | 35 | '''Information gain (entropy-based feature weighting)''' 36 | smooth=0.00000001 37 | KL=np.zeros([1,x_train.shape[1]]) 38 | for c in range(1,20): 39 | filter_c=(y_train==c) 40 | '''Within-class concentration: (occurrences of this feature in class c) / (total occurrences of this feature). The larger this ratio the better, but it needs smoothing, because the term is also tiny for very low-frequency words.''' 41 | CD=np.array((np.sum(x_train[filter_c],axis=0)+smooth)/(np.sum(x_train,axis=0)+19*smooth)) 42 | '''Accumulate the entropy; the smaller the entropy, the better.''' 43 | KL-=(CD*np.log(CD)) 44 | print (KL.min(),KL.mean(),KL.max()) 45 | KL=KL.max()-KL+0.5 46 | print (KL.min(),KL.mean(),KL.max()) 47 | 48 | gc.collect() 49 | x_train=x_train.multiply(KL) 50 | x_val=x_val.multiply(KL) 51 | x_test=test_term_doc.copy() 52 | x_test=x_test.multiply(KL) 53 | 54 | clf.fit(x_train,y_train) 55 | val_tf[filter_v,:] = clf.predict_proba(x_val) 56 | print (np.mean(np.argmax(val_tf[filter_v,:],1)+1==y_val)) 57 | 58 | test_tf+= clf.predict_proba(x_test) 59 | test_tf/=5 60 | 61 | np.save('../stacking/tfidf/val_tfidf_%s'%column,val_tf) 62 | np.save('../stacking/tfidf/test_tfidf_%s'%column,test_tf) 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
-------------------------------------------------------------------------------- /data/README.MD: -------------------------------------------------------------------------------- 1 | Holds the raw input files, intermediate temporary files, and model checkpoints. 2 |
-------------------------------------------------------------------------------- /data/checkpoint/README.MD: -------------------------------------------------------------------------------- 1 | 2 |
-------------------------------------------------------------------------------- /model/Fast_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import sys 5 | 6 | vector1=np.load('../data/word_vec.npy') 7 | word_embed_vector=vector1 8 | print (word_embed_vector.shape) 9 | 10 | class param(object): 11 | num_classes=19 12 | sequence_length=1000 13 | embed_size=100 14 | vocab_size=679250 15 | batch_size=128 16 | lr=5e-3 17 | 18 | class fast_param(param): 19 | drop_keep_prob=0.5 20 | l2_lambda=1e-4 21 | hiddim=80 22 | vector_num=10 23 | atten_size=100 24 | 25 | arg=fast_param() 26 | 27 | class
Basic_model: 28 | def __init__(self,num_classes,sequence_length,vocab_size,embed_size): 29 | self.num_classes = num_classes 30 | self.sequence_length = sequence_length 31 | self.vocab_size = vocab_size 32 | self.embed_size = embed_size 33 | self.global_steps=tf.Variable(0, trainable=False) 34 | self.embed=tf.Variable(word_embed_vector,name='embeding_vector') 35 | 36 | #placeholder 37 | self.x = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_x1") # X 38 | self.y = tf.placeholder(tf.float32,[None,19], name="labels") 39 | self.keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 40 | self.dropembed=tf.placeholder(tf.float32,name="embed_keep_prob") 41 | self.training=tf.placeholder(tf.bool,name="training") 42 | self.lr = tf.placeholder(tf.float32, name="learning_rate") 43 | self.lr_embed = tf.placeholder(tf.float32, name="embed_learning_rate") 44 | self.lamda=tf.placeholder(tf.float32, name="l2_regular") 45 | self.topk=tf.placeholder(tf.int32, name="topk") 46 | 47 | def weight_init(self,shape,name): 48 | with tf.variable_scope(name,reuse=tf.AUTO_REUSE): 49 | weight=tf.get_variable('kernel',shape,initializer=tf.contrib.layers.xavier_initializer()) 50 | return weight 51 | 52 | def bias_init(self,shape,name): 53 | with tf.variable_scope(name,reuse=tf.AUTO_REUSE): 54 | bias=tf.Variable(tf.zeros(shape)+0.1,tf.float32,name='bias') 55 | return bias 56 | 57 | 58 | 59 | class FastText(Basic_model): 60 | def __init__(self,arg): 61 | super(FastText, self).__init__(arg.num_classes,arg.sequence_length,arg.vocab_size,arg.embed_size) 62 | self.hiddim=arg.hiddim 63 | self.atten_size=arg.atten_size 64 | self.vector_num=arg.vector_num 65 | 66 | self.W_atten=self.weight_init([self.embed_size,self.atten_size],name='atten') 67 | self.b_atten=self.bias_init([self.atten_size],name='atten') 68 | self.W_atten2=self.weight_init([self.atten_size,self.atten_size],name='atten2') 69 | self.b_atten2=self.bias_init([self.atten_size],name='atten2') 70 | 71 | self.UW=self.weight_init([self.atten_size,self.vector_num],name='UW') 72 | self.class_vec=self.weight_init([self.vector_num,self.embed_size,self.hiddim],name='class_vec') 73 | self.char_svd=tf.placeholder(tf.float32,[None,200], name="char_svd") 74 | 75 | self.logit=self.forward() 76 | self.proba=tf.nn.softmax(self.logit,axis=1) 77 | self.losses=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y, logits=self.logit)) 78 | self.l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'kernel' in v.name]) * self.lamda 79 | self.loss_add_reg=self.losses+self.l2_losses 80 | [print(v) for v in tf.trainable_variables() if 'kernel' in v.name] 81 | 82 | self.acc=tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.logit,1),tf.argmax(self.y,1)),tf.float32)) 83 | 84 | var1 = [v for v in tf.trainable_variables() if 'embeding_vector' in v.name] 85 | var2 = [v for v in tf.trainable_variables() if 'embeding_vector' not in v.name] 86 | print ('pretrained,fine-tuning',var1[0]) 87 | 88 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 89 | with tf.control_dependencies(update_ops): 90 | self.train_step1=tf.train.AdamOptimizer(self.lr_embed).minimize(self.loss_add_reg,var_list=var1) 91 | self.train_step2=tf.train.AdamOptimizer(self.lr).minimize(self.loss_add_reg,global_step=self.global_steps,var_list=var2) 92 | self.train_op = tf.group(self.train_step1, self.train_step2) 93 | 94 | def forward(self): 95 | s = tf.nn.embedding_lookup(self.embed,self.x)#[None,sentence_length,embed_size,1] 96 | print ('s',s.shape) 97 | 98 | 
o1=tf.nn.tanh(tf.matmul(tf.reshape(s,[-1,self.embed_size]),self.W_atten)+self.b_atten) 99 | o1a=tf.nn.dropout(o1,self.keep_prob) 100 | o1b=tf.nn.tanh(tf.matmul(o1,self.W_atten2)+self.b_atten2) 101 | print ('o1',o1.shape) 102 | 103 | o2=tf.reshape(tf.matmul(o1,self.UW),[-1,self.sequence_length,self.vector_num]) 104 | '''这里可以做点文章!对于一些权重很低的词语,我不想让他们加入!''' 105 | o3=tf.nn.softmax(o2,axis=1) 106 | print ('o3',o3.shape) 107 | 108 | context_vec=tf.reduce_sum(tf.expand_dims(s,axis=-1)*tf.expand_dims(o3,axis=2),axis=1) 109 | print (context_vec.shape) 110 | 111 | newc=tf.transpose(context_vec,[2,0,1]) 112 | print ('newc',newc.shape) 113 | 114 | print ('newc',newc.shape) 115 | print ('classvec',self.class_vec.shape) 116 | o4=tf.transpose(tf.matmul(newc,self.class_vec),[1,0,2]) 117 | o5=tf.reshape(o4,[-1,o4.shape[1]*o4.shape[2]]) 118 | print (o4.shape) 119 | print (o5.shape) 120 | 121 | o5bn=tf.nn.relu(tf.layers.batch_normalization(o5,training=self.training)) 122 | o5all=o5bn 123 | o5drop=tf.nn.dropout(o5all,self.keep_prob) 124 | 125 | print('o5all',o5all.shape) 126 | 127 | score=tf.layers.dense(o5drop,self.num_classes,activation=None,use_bias=True, 128 | kernel_initializer=tf.contrib.layers.xavier_initializer() 129 | ,kernel_regularizer=None) 130 | print ('score',score.shape) 131 | 132 | return score 133 | 134 | tf.reset_default_graph() 135 | ss=FastText(arg) 136 | 137 | import sys 138 | use_test=[] 139 | cv_fold=int(sys.argv[1]) 140 | print (cv_fold) 141 | if cv_fold==1: 142 | use_test=list(range(50001)) 143 | if cv_fold==2: 144 | use_test=list(range(50000,102277)) 145 | if cv_fold==3: 146 | use_test=list(range(30000,80001)) 147 | if cv_fold==4: 148 | use_test=list(range(30001))+list(range(80000,102277)) 149 | if cv_fold==5: 150 | use_test=list(range(12000))+list(range(24000,36000))+list(range(48000,60000))+list(range(72000,90000)) 151 | print (len(use_test),use_test[0],use_test[-1]) 152 | 153 | train_embed=np.load('../data/train_embed.npy') 154 | test_embed=np.load('../data/test_embed.npy')[use_test] 155 | labels=np.load('../data/labels.npy') 156 | labels-=1 157 | slabel=np.zeros((labels.shape[0],19)) 158 | slabel[np.arange(labels.shape[0]),labels]=1.0 159 | np.random.seed(2018) 160 | 161 | r1=(np.random.uniform(0,1,train_embed.shape[0])*5).astype(np.int32) 162 | filter_t=(r1!=(cv_fold-1)) 163 | filter_v=~filter_t 164 | x_train , y_train= train_embed[filter_t],slabel[filter_t] 165 | x_val , y_val= train_embed[filter_v],slabel[filter_v] 166 | 167 | test_pred_labels=np.load('../stacking/stacking.npy')[use_test] 168 | x_train=np.concatenate((x_train,test_embed),axis=0) 169 | y_train=np.concatenate((y_train,test_pred_labels)) 170 | print (x_train.shape,y_train.shape) 171 | 172 | import random 173 | import gc 174 | r2=list(range(x_train.shape[0])) 175 | 176 | 177 | saver = tf.train.Saver() 178 | lastacc=0 179 | lastloss=99999 180 | learning_rate=1e-3 181 | embed_rate=2e-4 182 | finetune=False 183 | with tf.Session(config=config) as sess: 184 | sess.run(tf.global_variables_initializer()) 185 | for ep in range(50): 186 | ite=0 187 | random.shuffle(r2) 188 | while(ite 4 | RNN-attention is the pure deep model which gets the highest score on LB A 0.7852, while the cv score is just 0.7803 5 | -------------------------------------------------------------------------------- /model/RNN_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import sys 5 | 6 | 
vector1=np.load('../data/word_vec.npy') 7 | #vector2=np.load('../data/glove.npy') 8 | word_embed_vector=vector1 9 | #word_embed_vector=np.concatenate((vector1,vector2),axis=1) 10 | print (word_embed_vector.shape) 11 | 12 | class param(object): 13 | num_classes=19 14 | sequence_length=1000 15 | embed_size=100 16 | vocab_size=679250 17 | batch_size=128 18 | lr=5e-3 19 | epoch=10 20 | 21 | class rnn_att_param(param): 22 | drop_keep_prob=0.5 23 | l2_lambda=1e-4 24 | hiddim=80 25 | hidden_size=128 26 | vector_num=10 27 | 28 | arg=rnn_att_param() 29 | 30 | 31 | class Basic_model: 32 | def __init__(self,num_classes,sequence_length,vocab_size,embed_size): 33 | self.num_classes = num_classes 34 | self.sequence_length = sequence_length 35 | self.vocab_size = vocab_size 36 | self.embed_size = embed_size 37 | self.global_steps=tf.Variable(0, trainable=False) 38 | self.embed=tf.Variable(word_embed_vector,name='embeding_vector') 39 | 40 | #placeholder 41 | self.x = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_x1") # X 42 | self.y = tf.placeholder(tf.float32,[None,19], name="labels") 43 | self.keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 44 | self.dropembed=tf.placeholder(tf.float32,name="dropembed") 45 | self.training=tf.placeholder(tf.bool,name="training") 46 | self.lr = tf.placeholder(tf.float32, name="learning_rate") 47 | self.lr_embed = tf.placeholder(tf.float32, name="embed_learning_rate") 48 | self.lamda=tf.placeholder(tf.float32, name="l2_regular") 49 | 50 | 51 | def weight_init(self,shape,name): 52 | with tf.variable_scope(name,reuse=tf.AUTO_REUSE): 53 | weight=tf.get_variable('kernel',shape,initializer=tf.contrib.layers.xavier_initializer()) 54 | return weight 55 | 56 | def bias_init(self,shape,name): 57 | with tf.variable_scope(name,reuse=tf.AUTO_REUSE): 58 | bias=tf.Variable(tf.zeros(shape)+0.1,tf.float32,name='bias') 59 | return bias 60 | 61 | class RNN_att(Basic_model): 62 | def __init__(self,arg): 63 | super(RNN_att, self).__init__(arg.num_classes,arg.sequence_length,arg.vocab_size,arg.embed_size) 64 | self.hiddim=arg.hiddim 65 | self.hidden_size=arg.hidden_size 66 | self.vector_num=arg.vector_num 67 | self.UW=self.weight_init([self.hidden_size,self.vector_num],name='UW') 68 | self.class_vec=self.weight_init([self.vector_num,self.hidden_size,self.hiddim],name='class_vec') 69 | self.W_atten=self.weight_init([self.hidden_size,self.hidden_size],name='atten') 70 | self.b_atten=self.bias_init([self.hidden_size],name='atten') 71 | self.char_svd=tf.placeholder(tf.float32,[None,200], name="char_svd") 72 | 73 | self.logit=self.forward() 74 | self.proba=tf.nn.softmax(self.logit,axis=1) 75 | 76 | self.losses=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y, logits=self.logit)) 77 | 78 | self.l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'kernel' in v.name]) * self.lamda 79 | 80 | self.loss_add_reg=self.losses+self.l2_losses 81 | [print(v) for v in tf.trainable_variables() if 'kernel' in v.name] 82 | 83 | self.acc=tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.logit,1),tf.argmax(self.y,1)),tf.float32)) 84 | 85 | var1 = [v for v in tf.trainable_variables() if 'embeding_vector' in v.name] 86 | var2 = [v for v in tf.trainable_variables() if 'embeding_vector' not in v.name] 87 | print ('pretrained,fine-tuning',var1[0]) 88 | 89 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 90 | with tf.control_dependencies(update_ops): 91 | 
self.train_step1=tf.train.AdamOptimizer(self.lr_embed).minimize(self.loss_add_reg,var_list=var1) 92 | self.train_step2=tf.train.AdamOptimizer(self.lr).minimize(self.loss_add_reg,global_step=self.global_steps,var_list=var2) 93 | self.train_op = tf.group(self.train_step1, self.train_step2) 94 | 95 | def forward(self): 96 | 97 | s = tf.nn.embedding_lookup(self.embed,self.x)#[None,sentence_length,embed_size,1] 98 | print ('s',s.shape) 99 | 100 | s2=tf.transpose(s,[1,0,2]) 101 | print (s2.shape) 102 | 103 | rnn = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_size//2, direction='bidirectional') 104 | rnn_embed = tf.transpose(rnn(s2)[0],[1,0,2]) 105 | 106 | print (rnn_embed.shape) 107 | o1=tf.reshape(rnn_embed,[-1,rnn_embed.shape[2]]) 108 | o1b=tf.nn.tanh(tf.matmul(o1,self.W_atten)+self.b_atten) 109 | print ('o1.shape',o1b.shape) 110 | 111 | o2=tf.reshape(tf.matmul(o1b,self.UW),[-1,self.sequence_length,self.vector_num]) 112 | o3=tf.nn.softmax(o2,axis=1) 113 | print ('o3',o3.shape) 114 | 115 | context_vec=tf.reduce_sum(tf.expand_dims(rnn_embed,axis=-1)*tf.expand_dims(o3,axis=2),axis=1) 116 | print (context_vec.shape) 117 | context_vecb=tf.transpose(context_vec,[2,0,1]) 118 | print ('c',context_vecb.shape) 119 | 120 | newc=context_vecb 121 | print ('newc',newc.shape) 122 | print ('classvec',self.class_vec.shape) 123 | o4=tf.transpose(tf.matmul(newc,self.class_vec),[1,0,2]) 124 | o5=tf.reshape(o4,[-1,o4.shape[1]*o4.shape[2]]) 125 | print (o4.shape) 126 | print (o5.shape) 127 | 128 | o5bn=tf.nn.relu(tf.layers.batch_normalization(o5,training=self.training)) 129 | o5all=o5bn 130 | o5drop=tf.nn.dropout(o5all,self.keep_prob) 131 | print('o5all',o5all.shape) 132 | 133 | score=tf.layers.dense(o5drop,self.num_classes,activation=None,use_bias=True, 134 | kernel_initializer=tf.contrib.layers.xavier_initializer() 135 | ,kernel_regularizer=None) 136 | 137 | return score 138 | 139 | 140 | tf.reset_default_graph() 141 | ss=RNN_att(arg) 142 | 143 | import sys 144 | use_test=[] 145 | cv_fold=int(sys.argv[1]) 146 | print (cv_fold) 147 | if cv_fold==1: 148 | use_test=list(range(50001)) 149 | if cv_fold==2: 150 | use_test=list(range(50000,102277)) 151 | if cv_fold==3: 152 | use_test=list(range(30000,80001)) 153 | if cv_fold==4: 154 | use_test=list(range(30001))+list(range(80000,102277)) 155 | if cv_fold==5: 156 | use_test=list(range(12000))+list(range(24000,36000))+list(range(48000,60000))+list(range(72000,90000)) 157 | print (len(use_test),use_test[0],use_test[-1]) 158 | 159 | 160 | train_embed=np.load('../data/train_embed.npy') 161 | test_embed=np.load('../data/test_embed.npy')[use_test] 162 | labels=np.load('../data/labels.npy') 163 | labels-=1 164 | slabel=np.zeros((labels.shape[0],19)) 165 | slabel[np.arange(labels.shape[0]),labels]=1.0 166 | 167 | np.random.seed(2018) 168 | r1=(np.random.uniform(0,1,train_embed.shape[0])*5).astype(np.int32) 169 | filter_t=(r1!=(cv_fold-1)) 170 | filter_v=~filter_t 171 | x_train , y_train = train_embed[filter_t],slabel[filter_t] 172 | x_val , y_val = train_embed[filter_v],slabel[filter_v] 173 | 174 | test_pred_labels=np.load('../stacking/stacking.npy')[use_test] 175 | 176 | '''在数据中加入一半的测试集,这是使用了虚假标签的方法!由于正确率大概是80%左右, 177 | 可以想象,一共15000个训练样本,正确标记样本有14000,不到10%的噪声而已!这样做是为了增加样本的多样性,防止过拟合。可以见到更多的词语组合。 178 | 在CNN和RNN中可能更加有效果。''' 179 | x_train=np.concatenate((x_train,test_embed),axis=0) 180 | y_train=np.concatenate((y_train,test_pred_labels)) 181 | print (x_train.shape,y_train.shape) 182 | 183 | import random 184 | import gc 185 | r2=list(range(x_train.shape[0])) 
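# English gloss of the comment above: roughly half of the test set is appended to the
# training data with pseudo-labels taken from ../stacking/stacking.npy. Since the stacked
# prediction is about 80% accurate, this adds only limited label noise while exposing the
# model to many more word combinations, which reduces overfitting and tends to help
# CNN/RNN models in particular.
#
# Shape walk-through of RNN_att.forward() with the defaults above (batch size B,
# sequence_length=1000, embed_size=100, hidden_size=128, vector_num=10, hiddim=80):
#   s            [B, 1000, 100]  embedding lookup
#   rnn_embed    [B, 1000, 128]  bidirectional CudnnGRU (2 x 64 units)
#   o3           [B, 1000, 10]   10 attention heads, softmax over the 1000 positions
#   context_vec  [B, 128, 10]    attention-weighted sum of GRU states per head
#   o4           [B, 10, 80]     per-head projection through class_vec
#   o5           [B, 800]        flattened, then BN + ReLU + dropout + dense -> [B, 19]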
186 | 187 | 188 | 189 | saver = tf.train.Saver() 190 | lastacc=0 191 | lastloss=99999 192 | learning_rate=1e-3 193 | embed_rate=1e-4 194 | finetune=False 195 | with tf.Session() as sess: 196 | sess.run(tf.global_variables_initializer()) 197 | for ep in range(50): 198 | ite=0 199 | random.shuffle(r2) 200 | while(ite
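# NOTE: the dump is truncated here, in the middle of the mini-batch loop. The commented
# sketch below only illustrates how the placeholders and train_op defined above could be
# fed -- it is not the authors' code; batch slicing and feed values are assumptions.
#
#             batch = r2[ite:ite + arg.batch_size]
#             feed = {ss.x: x_train[batch], ss.y: y_train[batch],
#                     ss.keep_prob: arg.drop_keep_prob, ss.training: True,
#                     ss.lr: learning_rate, ss.lr_embed: embed_rate,
#                     ss.lamda: arg.l2_lambda}
#             _, batch_loss, batch_acc = sess.run([ss.train_op, ss.losses, ss.acc], feed)
#             ite += arg.batch_size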