├── README.md
├── dataProcess.py
├── textClassify.py
└── trainText.py

/README.md:
--------------------------------------------------------------------------------
LSTM_TextClassify_word2vec

Each word is converted into a fixed-size vector (input_size) with word2vec, and a two-layer LSTM is trained on the resulting sequences; classification accuracy is around 90%. The pretrained Google model GoogleNews-vectors-negative300.bin.gz is used. Model link: https://pan.baidu.com/s/1lnFJYrOkzE17tBe5Q4RfaQ  password: dce2

Dataset: aclImdb

Processing pipeline:
1. Tokenize the text to be classified (Chinese needs word segmentation; English does not).
2. Convert each word into a fixed-size vector and save the vectors to a file so they can be loaded at training time (see the short sketch after this README).
3. Feed the resulting data into the two-layer LSTM for training.
4. Classify new text with the trained model.
5. Adjust the paths and other parameters to match your own setup.

--------------------------------------------------------------------------------
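The heart of steps 2-3 is the word-to-vector lookup. A minimal sketch of that step, assuming gensim is installed and the GoogleNews model has been downloaded (the path and sample sentence are placeholders; out-of-vocabulary words fall back to zero vectors, exactly as dataProcess.py does below):

import numpy as np
import gensim

# same pretrained embeddings that dataProcess.py loads; the path here is a placeholder
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

tokens = "this movie was surprisingly good".split()
vectors = []
for word in tokens:
    try:
        vectors.append(model[word])                       # 300-dimensional vector
    except KeyError:
        vectors.append(np.zeros(300, dtype=np.float32))   # unknown word -> zero vector

print(np.shape(vectors))   # (5, 300): one 300-d vector per token
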
/dataProcess.py:
--------------------------------------------------------------------------------
import re
import pickle
import os
import numpy as np
import gensim
#import matplotlib.pyplot as plt  # only needed for the optional length histogram below

maxLen = 600
trainPath = '/media/SSD/LinuxData/DataSet/aclImdb/train'
testPath = '/media/SSD/LinuxData/DataSet/aclImdb/test'
vocabPath = '/media/SSD/LinuxData/DataSet/aclImdb/imdb.vocab'


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data(dataPath=None):
    """Load the positive and negative IMDB reviews, clean and tokenize them,
    and return [tokenized_texts, one-hot labels]."""
    if dataPath is None:
        print('no dataPath to load......')
        exit()
    print('loading text data.....')
    print('loading pos data.....')
    posPath = os.path.join(dataPath, 'pos')
    fileList = os.listdir(posPath)
    positive_example = []
    positive_label = []
    for filename in fileList:
        line = open(os.path.join(posPath, filename)).readline()
        positive_example.append(line)
        positive_label.append([1, 0])

    print('pos data load finish ! shape: ', np.shape(positive_example))
    print('loading neg data......')
    negPath = os.path.join(dataPath, 'neg')
    fileList = os.listdir(negPath)
    neg_example = []
    neg_label = []
    for filename in fileList:
        line = open(os.path.join(negPath, filename)).readline()
        neg_example.append(line)
        neg_label.append([0, 1])

    print('neg data load finish ! shape: ', np.shape(neg_example))
    x_text = positive_example + neg_example
    x_text = [clean_str(strs).split() for strs in x_text]
    print('data clean finished !')
    '''
    # Optional: plot the distribution of review lengths to help pick maxLen.
    numWords = []
    for line in x_text:
        numWords.append(len(line))
    print('file num: ', len(numWords), '\ntotal words: ', sum(numWords), '\naverage words: ', sum(numWords) / len(numWords))
    print('max len: ', max(numWords))
    plt.hist(numWords, 50)
    plt.xlabel('sequence length')
    plt.ylabel('frequency')
    plt.axis([0, 1200, 0, 8000])
    plt.show()
    '''

    x_label = np.concatenate([positive_label, neg_label], 0)
    print('loading data success .')
    return [x_text, x_label]


def build_vocab():
    with open(vocabPath) as f:
        vocab_dict = {word.strip(): i + 1 for i, word in enumerate(f.readlines())}
    vocab_size = len(vocab_dict)
    return vocab_dict, vocab_size


def pad_sentence(sentences, maxLen=maxLen, pading='00'):
    """Pad (or truncate) every tokenized sentence to exactly maxLen tokens.
    The padding token '00' is not in the word2vec vocabulary, so it is later
    mapped to an all-zero vector."""
    print('padding sentence..........')
    padded_sentence = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_pad = maxLen - len(sentence)
        if num_pad >= 0:
            new_sentence = sentence + [pading] * num_pad
        else:
            new_sentence = sentence[:maxLen]
        padded_sentence.append(new_sentence)
    print('padding sentence success.')
    print('after padding, the shape of x_text is: ', np.shape(padded_sentence))
    return padded_sentence


def build_input_data(data, labels, wordvecPath, batch_size, num_epoches, vector_size):
    """Shuffle the data, cut it into batches, map every word to its word2vec
    vector and stream the (batch_vectors, batch_labels) pairs into wordvec.pkl,
    one pickle record per batch."""
    data = np.array(data)
    labels = np.array(labels)
    data_size = len(data)
    num_batches_per_epoch = int(np.ceil(data_size / batch_size))
    print('loading google word2vec....................')
    model = gensim.models.KeyedVectors.load_word2vec_format(wordvecPath, binary=True)
    print('load google word2vec success !')
    input_data = []
    f = open('wordvec.pkl', 'wb')
    for epoch in range(num_epoches):
        print('writing epoch: ', epoch, '/', num_epoches, ' .........')
        shuffle_indices = np.random.permutation(np.arange(data_size))
        print('shuffling data....................')
        shuffle_data = data[shuffle_indices]
        shuffle_label = labels[shuffle_indices]
        print('shuffling data finished .')
        for batch_num in range(num_batches_per_epoch):
            print('writing batch ', batch_num, ' at epoch ', epoch, ' .............')
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            batch_data = shuffle_data[start_index:end_index]
            Sentence_vec = []
            for single_sentence in batch_data:
                sin_sen_vec = []
                for word in single_sentence:
                    try:
                        vec = model[word]
                    except KeyError:
                        # out-of-vocabulary words (and the '00' padding token) become zero vectors
                        vec = np.zeros(shape=(vector_size,), dtype=np.float32)
                    sin_sen_vec.append(vec)
                Sentence_vec.append(sin_sen_vec)
            input_data.append([Sentence_vec, shuffle_label[start_index:end_index]])
            print('batch num: ', len(input_data))
            pickle.dump([Sentence_vec, shuffle_label[start_index:end_index]], f)
            print('batch ', batch_num, ' at epoch ', epoch, ' writing finished .')

        print('epoch ', epoch, ' writing finished .')

    f.close()
    return input_data


wordvecPath = '/media/SSD/LinuxData/model/goole_word2vec/GoogleNews-vectors-negative300.bin.gz'
x_text, x_label = load_data(trainPath)
x_text = pad_sentence(x_text)
build_input_data(x_text, x_label, wordvecPath, 100, 5, 300)

--------------------------------------------------------------------------------
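dataProcess.py streams one pickle record per batch into a single wordvec.pkl file; trainText.py and textClassify.py read the records back in the same order. A minimal consumer sketch, assuming wordvec.pkl was produced by build_input_data above:

import pickle
import numpy as np

with open('wordvec.pkl', 'rb') as f:
    while True:
        try:
            batch_vectors, batch_labels = pickle.load(f)   # one record == one batch
        except EOFError:
            break                                          # no more records in the file
        print(np.shape(batch_vectors), np.shape(batch_labels))
        # e.g. (100, 600, 300) and (100, 2): batch_size x time_step x input_size
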
/textClassify.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
import pickle

rnn_unit = 200
input_size = 300
output_size = 2
batch_size = 100
time_step = 600
testWordVecor = ''   # path to the pickled word vectors of the test data
model_path = ''      # directory that holds the trained model checkpoints
epoches = 5
lr = 0.001
dataLen = 25000      # size of the data set, i.e. the number of sentences
testNum = 1000       # number of samples to test; must not exceed 25000


def getTextData(num):
    """Load `num` pickled records from the test-set word-vector file.
    Each record written by dataProcess.py holds one batch of vectorized
    sentences and its labels; the batches are flattened into single sentences."""
    if num > dataLen:
        print('testNum must not exceed 25000; please change it. Exiting!')
        exit()
    print('loading test data..........')
    file = open(testWordVecor, 'rb')
    start = np.random.randint(0, dataLen - num - 1)
    end = start + num
    data = []
    label = []
    for i in range(start, end):
        t_data, t_label = pickle.load(file)
        data.extend(t_data)
        label.extend(t_label)
    file.close()
    data = np.array(data)
    label = np.array(label)
    print('test data loaded!')
    return data, label, num


weights = {
    'in': tf.Variable(tf.random_normal([input_size, rnn_unit])),
    'out': tf.Variable(tf.random_normal([rnn_unit, output_size]))
}
biases = {
    'in': tf.Variable(tf.constant(0.1, shape=[rnn_unit, ])),
    'out': tf.Variable(tf.constant(0.1, shape=[output_size, ]))
}


def LstmCell():
    lstm_cell = rnn.BasicLSTMCell(rnn_unit, state_is_tuple=True)
    #lstm_cell = rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.75)
    return lstm_cell


def lstm(X):
    batch_size = tf.shape(X)[0]
    time_step = tf.shape(X)[1]
    w_in = weights['in']
    b_in = biases['in']
    # reshape to 2-D so the input projection can be applied; the result feeds the hidden layer
    input = tf.reshape(X, [-1, input_size])
    input_rnn = tf.matmul(input, w_in) + b_in
    # back to 3-D as input for the LSTM cells
    input_rnn = tf.reshape(input_rnn, [-1, time_step, rnn_unit])

    cell = rnn.MultiRNNCell([LstmCell() for _ in range(2)])
    init_state = cell.zero_state(batch_size, dtype=tf.float32)
    # output_rnn holds the LSTM output at every time step, final_states the state of the last cell
    output_rnn, final_states = tf.nn.dynamic_rnn(cell, input_rnn, initial_state=init_state, dtype=tf.float32)

    output = tf.transpose(output_rnn, [1, 0, 2])
    output = output[-1]                          # keep only the last time step
    output = tf.reshape(output, [-1, rnn_unit])  # input to the output layer
    w_out = weights['out']
    b_out = biases['out']
    pred = tf.matmul(output, w_out) + b_out
    return pred, final_states, output


def text_classify(time_step=time_step):
    X = tf.placeholder(tf.float32, shape=[None, time_step, input_size])
    pred, _, output = lstm(X)
    data, label, _ = getTextData(testNum)
    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('restoring existing model...')
        else:
            print('no trained model found, exiting!!!')
            exit()
        f_d = {X: data}
        pred = sess.run(pred, feed_dict=f_d)
        pred = np.array(pred)
        corPred = np.equal(np.argmax(pred, 1), np.argmax(label, 1))
        accuracy = np.mean(np.array(corPred).astype(np.float32))
        print('test result:')
        print('accuracy: ', accuracy)


text_classify()

--------------------------------------------------------------------------------
#print("batch_size: ",batch_size,'\n',"time_step: ",time_step,'\n',"hiden_unit: ",rnn_unit,'\n','input_size: ',input_size,'\n','output_size: ',output_size,'\n','train_step: ',train_step,'\n','learning_reate: ',lr) 87 | X=tf.placeholder(tf.float32, shape=[None,time_step,input_size]) 88 | #Y=tf.placeholder(tf.float32, shape=[None,output_size]) 89 | pred,_,output=lstm(X) 90 | #corPred = tf.equal(tf.argmax(pred,1),tf.argmax(Y,1)) 91 | #accuracy = tf.reduce_mean(tf.cast(corPred,tf.float32)) 92 | data,label,_ = getTextData(testNum) 93 | saver = tf.train.Saver(tf.global_variables()) 94 | with tf.Session() as sess: 95 | #sess.run(tf.global_variables_initializer()) 96 | ckpt = tf.train.get_checkpoint_state(model_path) 97 | if ckpt and ckpt.model_checkpoint_path: 98 | saver.restore(sess, ckpt.model_checkpoint_path) 99 | print('加载现有模型...') 100 | else: 101 | print("模型不存在,程序退出!!!") 102 | exit() 103 | f_d = {X:data} 104 | pred = sess.run(pred, feed_dict=f_d) 105 | pred = np.array(pred) 106 | corPred = np.equal(np.argmax(pred, 1), np.argmax(label, 1)) 107 | accuracy = np.mean(np.array(corPred).astype(np.float32)) 108 | print('测试结果为:\n') 109 | print('accuracy: ', accuracy) 110 | 111 | 112 | text_classify() 113 | -------------------------------------------------------------------------------- /trainText.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | rnn_unit = 200 4 | input_size = 300 5 | output_size = 2 6 | batch_size = 100 7 | time_step = 600 8 | #max_length 9 | epoches = 50 10 | lr = 0.001 11 | from tensorflow.contrib import rnn 12 | #import data_process 13 | import pickle 14 | 15 | 16 | weights={ 17 | 'in':tf.Variable(tf.random_normal([input_size,rnn_unit])), 18 | 'out':tf.Variable(tf.random_normal([rnn_unit,output_size])) 19 | } 20 | biases={ 21 | 'in':tf.Variable(tf.constant(0.1,shape=[rnn_unit,])), 22 | 'out':tf.Variable(tf.constant(0.1,shape=[output_size,])) 23 | } 24 | 25 | 26 | def LstmCell(): 27 | lstm_cell = rnn.BasicLSTMCell(rnn_unit, state_is_tuple=True) 28 | lstm_cell = rnn.DropoutWrapper(cell=lstm_cell,output_keep_prob=0.75) 29 | return lstm_cell 30 | def lstm(X): 31 | batch_size=tf.shape(X)[0] 32 | time_step=tf.shape(X)[1] 33 | w_in=weights['in'] 34 | b_in=biases['in'] 35 | input=tf.reshape(X,[-1,input_size]) #需要将tensor转成2维进行计算,计算后的结果作为隐藏层的输入 36 | input_rnn=tf.matmul(input,w_in)+b_in 37 | input_rnn=tf.reshape(input_rnn,[-1,time_step,rnn_unit]) #将tensor转成3维,作为lstm cell的输入 38 | 39 | #cell=tf.nn.rnn_cell.BasicLSTMCell(rnn_unit) 40 | cell = rnn.MultiRNNCell([LstmCell() for _ in range(2)]) 41 | init_state=cell.zero_state(batch_size,dtype=tf.float32) 42 | #output_rnn, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32) 43 | #output_rnn,final_states=tf.nn.dynamic_rnn(cell, input_rnn,initial_state=init_state, dtype=tf.float32) #output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 44 | output_rnn, final_states = tf.nn.dynamic_rnn(cell, input_rnn, dtype=tf.float32) # output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 45 | 46 | output = tf.transpose(output_rnn,[1,0,2]) 47 | #output = output[:,-1,:] 48 | output = output[-1] 49 | output=tf.reshape(output,[-1,rnn_unit]) #作为输出层的输入 50 | #output = tf.gather(output,int(output.get_shape()[0])-1) 51 | w_out=weights['out'] 52 | b_out=biases['out'] 53 | pred=tf.matmul(output,w_out)+b_out 54 | return pred,final_states,output 55 | 56 | 57 | 58 | def train_lstm(batch_size=batch_size,time_step=time_step,epoches=epoches): 59 | 
#print("batch_size: ",batch_size,'\n',"time_step: ",time_step,'\n',"hiden_unit: ",rnn_unit,'\n','input_size: ',input_size,'\n','output_size: ',output_size,'\n','train_step: ',train_step,'\n','learning_reate: ',lr) 60 | X=tf.placeholder(tf.float32, shape=[None,time_step,input_size]) 61 | Y=tf.placeholder(tf.float32, shape=[None,output_size]) 62 | #batch_index,train_x,train_y=get_train_data(batch_size,time_step,train_begin,train_end) 63 | #batch_index,train_x,train_y = data_train(batch_size,time_step,path=data_path) 64 | pred,_,output=lstm(X) 65 | corPred = tf.equal(tf.argmax(pred,1),tf.argmax(Y,1)) 66 | accuracy = tf.reduce_mean(tf.cast(corPred,tf.float32)) 67 | #损失函数 68 | #loss=tf.reduce_mean(tf.square(tf.reshape(pred,[-1])-tf.reshape(Y, [-1]))) 69 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred,labels=Y)) 70 | #print("hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh: ", np.shape(tf.reshape(pred, [-1])), np.shape(output)) 71 | train_op=tf.train.AdamOptimizer(lr).minimize(loss) 72 | saver=tf.train.Saver(tf.global_variables(),max_to_keep=5) 73 | #module_file = tf.train.latest_checkpoint() 74 | 75 | with tf.Session() as sess: 76 | sess.run(tf.global_variables_initializer()) 77 | ckpt = tf.train.get_checkpoint_state('Model/') 78 | if ckpt and ckpt.model_checkpoint_path: 79 | saver.restore(sess, ckpt.model_checkpoint_path) 80 | print('加载现有模型...') 81 | else: 82 | print("模型不存在,开始训练......") 83 | i = 0 84 | for epoch in range(epoches): 85 | print('epoch ',epoch,' training......') 86 | file = open('wordvec.pkl', 'rb') 87 | x_train, x_label = pickle.load(file) 88 | #i = 0 89 | while((x_train != None) ): 90 | feed_dict={X:x_train,Y:x_label} 91 | _,loss_ = sess.run([train_op,loss],feed_dict=feed_dict) 92 | print('epoch: ',epoch,' batch :',i,' loss: ',loss_) 93 | if i % 5 == 0: 94 | accu = sess.run([accuracy],feed_dict=feed_dict) 95 | x_train, x_label = pickle.load(file) 96 | i = i + 1 97 | print('epoch ',epoch,' training end !') 98 | saver.save(sess,'Model/model.ckt',global_step=i) 99 | file.close() 100 | 101 | train_lstm() 102 | --------------------------------------------------------------------------------