├── README.md
├── dataProcess.py
├── textClassify.py
└── trainText.py


/README.md:
--------------------------------------------------------------------------------
1 | LSTM_TextClassify_word2vec 基于word2vector将每一个词转换为固定大小的input_size,采用双层LSTM进行训练，分类结果百分之九十左右.使用google训练好的模型GoogleNews-vectors-negative300.bin.gz， 模型链接: https://pan.baidu.com/s/1lnFJYrOkzE17tBe5Q4RfaQ 密码: dce2 采用数据 aclImdb 处理流程： 1.首先将要分类的文本数据分词（中文需分词，英文不用） 2.将每个词转换成固定大小的vector，并保存至文件中，以备训练加载 3.将以上数据送入双层LSTM训练即可。 4.根据训练好的模型进行分类 5.里面的路径等参数需根据自己的情况进行修改
2 | 


--------------------------------------------------------------------------------
/dataProcess.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import itertools
  3 | import codecs
  4 | from collections import Counter
  5 | import pickle
  6 | import os
  7 | import numpy as np
  8 | import gensim
  9 | import matplotlib.pyplot as plt
 10 | maxLen = 600
 11 | trainPath = '/media/SSD/LinuxData/DataSet/aclImdb/train'
 12 | testPath = '/media/SSD/LinuxData/DataSet/aclImdb/test'
 13 | vocabPath = '/media/SSD/LinuxData/DataSet/aclImdb/imdb.vocab'
 14 | def clean_str(string):
 15 |     """
 16 |     Tokenization/string cleaning for all datasets except for SST.
 17 |     Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
 18 |     """
 19 |     print('data clean.........')
 20 |     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
 21 |     string = re.sub(r"\'s", " \'s", string)
 22 |     string = re.sub(r"\'ve", " \'ve", string)
 23 |     string = re.sub(r"n\'t", " n\'t", string)
 24 |     string = re.sub(r"\'re", " \'re", string)
 25 |     string = re.sub(r"\'d", " \'d", string)
 26 |     string = re.sub(r"\'ll", " \'ll", string)
 27 |     string = re.sub(r",", " , ", string)
 28 |     string = re.sub(r"!", " ! ", string)
 29 |     string = re.sub(r"\(", " \( ", string)
 30 |     string = re.sub(r"\)", " \) ", string)
 31 |     string = re.sub(r"\?", " \? ", string)
 32 |     string = re.sub(r"\s{2,}", " ", string)
 33 |     return string.strip().lower()
 34 | 
 35 | def load_data(dataPath=None):
 36 |     if dataPath == None:
 37 |         print('no dataPath to load......')
 38 |         exit()
 39 |     print('loading text data.....')
 40 |     print('loading pos data.....')
 41 |     posPath = os.path.join(dataPath,'pos')
 42 |     fileList = os.listdir(posPath)
 43 |     positive_example = []
 44 |     positive_label = []
 45 |     for filename in fileList:
 46 |         line = open(os.path.join(posPath,filename)).readline()
 47 |         positive_example.append(line)
 48 |         positive_label.append([1,0])
 49 | 
 50 |     #print(np.shape(positive_example))
 51 |     print('pos data load finish ! shape: ',np.shape(positive_example))
 52 |     print('loading neg data......')
 53 |     negPath = os.path.join(dataPath,'neg')
 54 |     fileList = os.listdir(negPath)
 55 |     neg_example = []
 56 |     neg_label = []
 57 |     for filename in fileList:
 58 |         line = open(os.path.join(negPath,filename)).readline()
 59 |         neg_example.append(line)
 60 |         neg_label.append([0,1])
 61 | 
 62 |     print('neg data load finish ! shape: ',np.shape(neg_example))
 63 |     x_text = positive_example + neg_example
 64 |     x_text = [clean_str(strs).split() for strs in x_text]
 65 |     print('data clean finished !')
 66 |     '''
 67 |     numWords = []
 68 |     for line in x_text:
 69 |         numWords.append(len(line))
 70 |     print('file num: ',len(numWords),'\ntotal words: ',sum(numWords),'\naverage words: ',sum(numWords)/len(numWords))
 71 |     print('max len: ',max(numWords))
 72 |     plt.hist(numWords,50)
 73 |     plt.xlabel('sequence length')
 74 |     plt.ylabel('frequency')
 75 |     plt.axis([0,1200,0,8000])
 76 |     plt.show()
 77 |     '''
 78 | 
 79 |     #x_label = positive_label + neg_label
 80 |     x_label = np.concatenate([positive_label, neg_label], 0)
 81 | 
 82 |     #print('shape: ',np.shape(x_text),np.shape(x_label),x_text[0])
 83 |     print('loading data success .')
 84 |     return [x_text,x_label]
 85 | 
 86 | #load_data(trainPath)
 87 | 
 88 | def build_vocab():
 89 | 
 90 |     with open(vocabPath) as f:
 91 |         vocab_dict = {word:i+1 for i,word in enumerate(f.readlines())}
 92 |     vocab_size = len(vocab_dict)
 93 |     return vocab_dict,vocab_size
 94 | 
 95 | def pad_sentence(sentences,maxLen=maxLen,pading='00'):
 96 |     print('padding sentence..........')
 97 |     padded_sentence = []
 98 |     for i in range(len(sentences)):
 99 |         sentence = sentences[i]
100 |         num_pad = maxLen - len(sentence)
101 |         if num_pad >= 0:
102 |             new_sentence = sentence + [pading]*num_pad
103 |             padded_sentence.append(new_sentence)
104 |         else:
105 |             new_sentence = sentence[:maxLen]
106 |             padded_sentence.append(new_sentence)
107 |     #print('padding shape: ',np.shape(np.array(padded_sentence)))
108 |     print('padding sentence success.')
109 |     print('after padding,the shape of x_text is: ',np.shape(padded_sentence))
110 |     print('x_text: ')
111 |     #for i in range(10):
112 |     #    print(padded_sentence[i])
113 |     return padded_sentence
114 | 
115 | def build_input_data(data,lables,wordvecPath,batch_size,num_epoches,vector_size):
116 |     data = np.array(data)
117 |     lables = np.array(lables)
118 |     data_size = len(data)
119 |     num_batches_per_epoch = int(data_size / batch_size) + 1
120 |     #max_length = max(len(x) for x in data)
121 |     print('loading google word2vec....................')
122 |     model = gensim.models.KeyedVectors.load_word2vec_format(wordvecPath,binary=True)
123 |     print('load google word2vec success !')
124 |     #vocab_dict,vocab_size = build_vocab()
125 |     W = []
126 |     f = open('wordvec.pkl','wb')
127 |     input_data = []
128 |     for epoch in range(num_epoches):
129 |         print('writting epoch: ',epoch,'/',num_epoches,'  .........')
130 |         shuffle_indices = np.random.permutation(np.arange(data_size))
131 |         print('shuffling data....................')
132 |         shuffle_data = data[shuffle_indices]
133 |         shuffle_label = lables[shuffle_indices]
134 |         print('shuffling data finished .')
135 |         for batch_num in range(num_batches_per_epoch):
136 |             print('writing batch ',batch_num,'  at epoch ',epoch,' .............')
137 |             start_index = batch_num * batch_size
138 |             end_index = min((batch_num + 1) * batch_size, data_size)
139 |             batch_data = shuffle_data[start_index:end_index]
140 |             Sentence_vec = []
141 |             for single_sentence in batch_data:
142 |                 sin_sen_vec = []
143 |                 for word in single_sentence:
144 |                     try:
145 |                         vec = model[word]
146 |                     except:
147 |                         vec = np.zeros(shape=(vector_size),dtype=np.float32)
148 |                     sin_sen_vec.append(vec)
149 |                 Sentence_vec.append(sin_sen_vec)
150 |             input_data.append([Sentence_vec,shuffle_label[start_index:end_index]])
151 |             print('batch num: ',len(input_data))
152 |             pickle.dump([Sentence_vec,shuffle_label[start_index:end_index]],f)
153 |             print('batch ',batch_num,' at epoch ',epoch,' writing finished .')
154 | 
155 |         print('epoch ',epoch,' writing finished .')
156 | 
157 |     #pickle.dump(W,open('wordvec.pkl','wb'))
158 |     return input_data
159 | 
160 | 
161 | wordvecPath = '/media/SSD/LinuxData/model/goole_word2vec/GoogleNews-vectors-negative300.bin.gz'
162 | x_text,x_label = load_data(trainPath)
163 | x_text = pad_sentence(x_text)
164 | #build_input_data(np.array(x_text),np.array(x_label),wordvecPath,25,10,300)
165 | build_input_data(x_text,x_label,wordvecPath,100,5,300)
166 | 


--------------------------------------------------------------------------------
/textClassify.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np
  3 | rnn_unit = 200
  4 | input_size = 300
  5 | output_size = 2
  6 | batch_size = 100
  7 | time_step = 600
  8 | #max_length
  9 | testWordVecor = ''  #测试数据的vector向量
 10 | model_path=''    #模型所在路径
 11 | epoches = 5
 12 | lr = 0.001
 13 | dataLen = 25000  #数据长度，即句子的个数
 14 | testNum = 1000   #药测试数据的个数，不能超过25000
 15 | from tensorflow.contrib import rnn
 16 | #import data_process
 17 | import pickle
 18 | 
 19 | 
 20 | def getTextData(num):
 21 |     if num > dataLen:
 22 |         print('数据长度最大不超过25000,请重新输入testNum的值，此次退出！')
 23 |         exit()
 24 |     print('加载测试数据..........')
 25 |     file = open(testWordVecor,'rb')
 26 |     start = np.random.randint(0,dataLen - num-1)
 27 |     end = start + num;
 28 |     data = []
 29 |     label = []
 30 |     for i in range(start,end):
 31 |         t_data,t_label = pickle.load(file)
 32 |         data.append(t_data)
 33 |         label.append(t_label)
 34 |     file.close()
 35 |     data = np.array(data)
 36 |     label = np.array(label)
 37 |     print('测试数据加载完成！')
 38 |     return data,label,num
 39 | 
 40 | 
 41 | 
 42 | 
 43 | weights={
 44 |          'in':tf.Variable(tf.random_normal([input_size,rnn_unit])),
 45 |          'out':tf.Variable(tf.random_normal([rnn_unit,output_size]))
 46 |         }
 47 | biases={
 48 |         'in':tf.Variable(tf.constant(0.1,shape=[rnn_unit,])),
 49 |         'out':tf.Variable(tf.constant(0.1,shape=[output_size,]))
 50 |        }
 51 | 
 52 | 
 53 | def LstmCell():
 54 |     lstm_cell = rnn.BasicLSTMCell(rnn_unit, state_is_tuple=True)
 55 |     #lstm_cell = rnn.DropoutWrapper(cell=lstm_cell,output_keep_prob=0.75)
 56 |     return lstm_cell
 57 | def lstm(X,test=False):
 58 |     batch_size=tf.shape(X)[0]
 59 |     time_step=tf.shape(X)[1]
 60 |     w_in=weights['in']
 61 |     b_in=biases['in']
 62 |     input=tf.reshape(X,[-1,input_size])  #需要将tensor转成2维进行计算，计算后的结果作为隐藏层的输入
 63 |     input_rnn=tf.matmul(input,w_in)+b_in
 64 |     input_rnn=tf.reshape(input_rnn,[-1,time_step,rnn_unit])  #将tensor转成3维，作为lstm cell的输入
 65 | 
 66 |     #cell=tf.nn.rnn_cell.BasicLSTMCell(rnn_unit)
 67 |     cell = rnn.MultiRNNCell([LstmCell() for _ in range(2)])
 68 |     init_state=cell.zero_state(batch_size,dtype=tf.float32)
 69 |     #output_rnn, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32)
 70 |     #output_rnn,final_states=tf.nn.dynamic_rnn(cell, input_rnn,initial_state=init_state, dtype=tf.float32)  #output_rnn是记录lstm每个输出节点的结果，final_states是最后一个cell的结果
 71 |     output_rnn, final_states = tf.nn.dynamic_rnn(cell, input_rnn, dtype=tf.float32)  # output_rnn是记录lstm每个输出节点的结果，final_states是最后一个cell的结果
 72 | 
 73 |     output = tf.transpose(output_rnn,[1,0,2])
 74 |     #output = output[:,-1,:]
 75 |     output = output[-1]
 76 |     output=tf.reshape(output,[-1,rnn_unit]) #作为输出层的输入
 77 |     #output = tf.gather(output,int(output.get_shape()[0])-1)
 78 |     w_out=weights['out']
 79 |     b_out=biases['out']
 80 |     pred=tf.matmul(output,w_out)+b_out
 81 |     return pred,final_states,output
 82 | 
 83 | 
 84 | 
 85 | def text_classify(time_step=time_step):
 86 |     #print("batch_size: ",batch_size,'\n',"time_step: ",time_step,'\n',"hiden_unit: ",rnn_unit,'\n','input_size: ',input_size,'\n','output_size: ',output_size,'\n','train_step: ',train_step,'\n','learning_reate: ',lr)
 87 |     X=tf.placeholder(tf.float32, shape=[None,time_step,input_size])
 88 |     #Y=tf.placeholder(tf.float32, shape=[None,output_size])
 89 |     pred,_,output=lstm(X)
 90 |     #corPred = tf.equal(tf.argmax(pred,1),tf.argmax(Y,1))
 91 |     #accuracy = tf.reduce_mean(tf.cast(corPred,tf.float32))
 92 |     data,label,_ = getTextData(testNum)
 93 |     saver = tf.train.Saver(tf.global_variables())
 94 |     with tf.Session() as sess:
 95 |         #sess.run(tf.global_variables_initializer())
 96 |         ckpt = tf.train.get_checkpoint_state(model_path)
 97 |         if ckpt and ckpt.model_checkpoint_path:
 98 |             saver.restore(sess, ckpt.model_checkpoint_path)
 99 |             print('加载现有模型...')
100 |         else:
101 |             print("模型不存在，程序退出！！！")
102 |             exit()
103 |         f_d = {X:data}
104 |         pred = sess.run(pred, feed_dict=f_d)
105 |         pred = np.array(pred)
106 |         corPred = np.equal(np.argmax(pred, 1), np.argmax(label, 1))
107 |         accuracy = np.mean(np.array(corPred).astype(np.float32))
108 |         print('测试结果为：\n')
109 |         print('accuracy: ', accuracy)
110 | 
111 | 
112 | text_classify()
113 | 


--------------------------------------------------------------------------------
/trainText.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np
  3 | rnn_unit = 200
  4 | input_size = 300
  5 | output_size = 2
  6 | batch_size = 100
  7 | time_step = 600
  8 | #max_length
  9 | epoches = 50
 10 | lr = 0.001
 11 | from tensorflow.contrib import rnn
 12 | #import data_process
 13 | import pickle
 14 | 
 15 | 
 16 | weights={
 17 |          'in':tf.Variable(tf.random_normal([input_size,rnn_unit])),
 18 |          'out':tf.Variable(tf.random_normal([rnn_unit,output_size]))
 19 |         }
 20 | biases={
 21 |         'in':tf.Variable(tf.constant(0.1,shape=[rnn_unit,])),
 22 |         'out':tf.Variable(tf.constant(0.1,shape=[output_size,]))
 23 |        }
 24 | 
 25 | 
 26 | def LstmCell():
 27 |     lstm_cell = rnn.BasicLSTMCell(rnn_unit, state_is_tuple=True)
 28 |     lstm_cell = rnn.DropoutWrapper(cell=lstm_cell,output_keep_prob=0.75)
 29 |     return lstm_cell
 30 | def lstm(X):
 31 |     batch_size=tf.shape(X)[0]
 32 |     time_step=tf.shape(X)[1]
 33 |     w_in=weights['in']
 34 |     b_in=biases['in']
 35 |     input=tf.reshape(X,[-1,input_size])  #需要将tensor转成2维进行计算，计算后的结果作为隐藏层的输入
 36 |     input_rnn=tf.matmul(input,w_in)+b_in
 37 |     input_rnn=tf.reshape(input_rnn,[-1,time_step,rnn_unit])  #将tensor转成3维，作为lstm cell的输入
 38 | 
 39 |     #cell=tf.nn.rnn_cell.BasicLSTMCell(rnn_unit)
 40 |     cell = rnn.MultiRNNCell([LstmCell() for _ in range(2)])
 41 |     init_state=cell.zero_state(batch_size,dtype=tf.float32)
 42 |     #output_rnn, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32)
 43 |     #output_rnn,final_states=tf.nn.dynamic_rnn(cell, input_rnn,initial_state=init_state, dtype=tf.float32)  #output_rnn是记录lstm每个输出节点的结果，final_states是最后一个cell的结果
 44 |     output_rnn, final_states = tf.nn.dynamic_rnn(cell, input_rnn, dtype=tf.float32)  # output_rnn是记录lstm每个输出节点的结果，final_states是最后一个cell的结果
 45 | 
 46 |     output = tf.transpose(output_rnn,[1,0,2])
 47 |     #output = output[:,-1,:]
 48 |     output = output[-1]
 49 |     output=tf.reshape(output,[-1,rnn_unit]) #作为输出层的输入
 50 |     #output = tf.gather(output,int(output.get_shape()[0])-1)
 51 |     w_out=weights['out']
 52 |     b_out=biases['out']
 53 |     pred=tf.matmul(output,w_out)+b_out
 54 |     return pred,final_states,output
 55 | 
 56 | 
 57 | 
 58 | def train_lstm(batch_size=batch_size,time_step=time_step,epoches=epoches):
 59 |     #print("batch_size: ",batch_size,'\n',"time_step: ",time_step,'\n',"hiden_unit: ",rnn_unit,'\n','input_size: ',input_size,'\n','output_size: ',output_size,'\n','train_step: ',train_step,'\n','learning_reate: ',lr)
 60 |     X=tf.placeholder(tf.float32, shape=[None,time_step,input_size])
 61 |     Y=tf.placeholder(tf.float32, shape=[None,output_size])
 62 |     #batch_index,train_x,train_y=get_train_data(batch_size,time_step,train_begin,train_end)
 63 |     #batch_index,train_x,train_y = data_train(batch_size,time_step,path=data_path)
 64 |     pred,_,output=lstm(X)
 65 |     corPred = tf.equal(tf.argmax(pred,1),tf.argmax(Y,1))
 66 |     accuracy = tf.reduce_mean(tf.cast(corPred,tf.float32))
 67 |     #损失函数
 68 |     #loss=tf.reduce_mean(tf.square(tf.reshape(pred,[-1])-tf.reshape(Y, [-1])))
 69 |     loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred,labels=Y))
 70 |     #print("hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh: ", np.shape(tf.reshape(pred, [-1])), np.shape(output))
 71 |     train_op=tf.train.AdamOptimizer(lr).minimize(loss)
 72 |     saver=tf.train.Saver(tf.global_variables(),max_to_keep=5)
 73 |     #module_file = tf.train.latest_checkpoint()
 74 | 
 75 |     with tf.Session() as sess:
 76 |         sess.run(tf.global_variables_initializer())
 77 |         ckpt = tf.train.get_checkpoint_state('Model/')
 78 |         if ckpt and ckpt.model_checkpoint_path:
 79 |             saver.restore(sess, ckpt.model_checkpoint_path)
 80 |             print('加载现有模型...')
 81 |         else:
 82 |             print("模型不存在，开始训练......")
 83 |         i = 0
 84 |         for epoch in range(epoches):
 85 |             print('epoch ',epoch,' training......')
 86 |             file = open('wordvec.pkl', 'rb')
 87 |             x_train, x_label = pickle.load(file)
 88 |             #i = 0
 89 |             while((x_train != None) ):
 90 |                 feed_dict={X:x_train,Y:x_label}
 91 |                 _,loss_ = sess.run([train_op,loss],feed_dict=feed_dict)
 92 |                 print('epoch: ',epoch,'  batch :',i,'  loss: ',loss_)
 93 |                 if i % 5 == 0:
 94 |                     accu = sess.run([accuracy],feed_dict=feed_dict)
 95 |                 x_train, x_label = pickle.load(file)
 96 |                 i = i + 1
 97 |             print('epoch ',epoch,' training end !')
 98 |             saver.save(sess,'Model/model.ckt',global_step=i)
 99 |             file.close()
100 | 
101 | train_lstm()
102 | 


--------------------------------------------------------------------------------