├── .DS_Store ├── FastText.py ├── MyModel.py ├── README.md ├── TextAttention.py ├── TextCNNmodel.py ├── TextRCNNmodel.py ├── TextRNNmodel.py ├── dataPreprocess.py ├── main_control.py ├── py2 ├── 01mail.py ├── 02mail.py ├── 03fastText.py ├── 03fastText_keras.py ├── 04textCNN.py ├── 05textRNN.py ├── 06textRCNN.py ├── 07Attention.py ├── README.md ├── mymodel.py └── word2vec.py └── word2vec.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytforgood/TextClassification/9f2dd4621bb45045eba4af09a20f800ab5074e12/.DS_Store -------------------------------------------------------------------------------- /FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.preprocessing import sequence 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Embedding 7 | from keras.layers import GlobalAveragePooling1D 8 | from keras.callbacks import EarlyStopping 9 | import numpy as np 10 | import logging 11 | import pandas as pd 12 | 13 | # Set parameters: 14 | # ngram_range = 2 will add bi-grams features 15 | # ngram_range = 2 16 | # max_features = len(words) 17 | # maxlen = 30 18 | # batch_size = 32 19 | # embedding_dims = 64 20 | # epochs = 100 21 | # embedding_dims,batch_size,epochs 22 | 23 | def getdata_train(path,ngram_range,maxlen,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 24 | print("fastText n-gram sentence new_maxlen"+str(maxlen)) 25 | ##数据获取 26 | print('Loading data...') 27 | # path = './data/nlpmaildatasample2.csv' 28 | d = pd.read_csv(path, header=None) 29 | d.columns = ['title', 'lable'] 30 | 31 | # drop=True 不生成index列 32 | d = d[-pd.isnull(d["title"])].reset_index(drop=True) 33 | 34 | all_data = set() 35 | for line in d["title"]: 36 | ws = line.split(" ") 37 | for w in ws: 38 | all_data.add(w) 39 | words = list(all_data) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | dx = [] 42 | for line in d["title"]: 43 | ws = line.split(" ") 44 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 45 | # dy=list(d['lable']) 46 | dy = d['lable'] 47 | 48 | def create_ngram_set(input_list, ngram_value=2): 49 | """ 50 | Extract a set of n-grams from a list of integers. 51 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 52 | {(4, 9), (4, 1), (1, 4), (9, 4)} 53 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 54 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 55 | """ 56 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 57 | 58 | def add_ngram(sequences, token_indice, ngram_range=2): 59 | """ 60 | Augment the input list of list (sequences) by appending n-grams values. 
61 | Example: adding bi-gram 62 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 63 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 64 | >>> add_ngram(sequences, token_indice, ngram_range=2) 65 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 66 | Example: adding tri-gram 67 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 68 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 69 | >>> add_ngram(sequences, token_indice, ngram_range=3) 70 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] 71 | """ 72 | new_sequences = [] 73 | for input_list in sequences: 74 | new_list = input_list[:] 75 | for i in range(len(new_list) - ngram_range + 1): 76 | for ngram_value in range(2, ngram_range + 1): 77 | ngram = tuple(new_list[i:i + ngram_value]) 78 | if ngram in token_indice: 79 | new_list.append(token_indice[ngram]) 80 | new_sequences.append(new_list) 81 | 82 | return new_sequences 83 | 84 | print('Loading data...') 85 | inx = int(len(dx) / 5 * 3) 86 | x_train, y_train, x_test, y_test = dx[0:inx], dy[0:inx], dx[inx:len(dx)], dy[inx:len(dx)] 87 | 88 | print(len(x_train), 'train sequences') 89 | print(len(x_test), 'test sequences') 90 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 91 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 92 | 93 | if ngram_range > 1: 94 | print('Adding {}-gram features'.format(ngram_range)) 95 | # Create set of unique n-gram from the training set. 96 | ngram_set = set() 97 | for input_list in x_train: 98 | for i in range(2, ngram_range + 1): 99 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 100 | ngram_set.update(set_of_ngram) 101 | 102 | # Dictionary mapping n-gram token to a unique integer. 103 | # Integer values are greater than max_features in order 104 | # to avoid collision with existing features. 105 | start_index = max_token + 1 106 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 107 | indice_token = {token_indice[k]: k for k in token_indice} 108 | 109 | # max_features is the highest integer that could be found in the dataset. 
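# Illustration of the index offset (hypothetical numbers, not taken from the
# real vocabulary): if max_token were 10000 and ngram_set contained
# {(4, 9), (9, 4)}, the mapping above would give, for example,
#   token_indice == {(4, 9): 10001, (9, 4): 10002}
#   indice_token == {10001: (4, 9), 10002: (9, 4)}
# Every n-gram id therefore starts beyond the unigram ids, so a single
# Embedding layer can hold words and n-grams without collisions, and the
# max_features computed next must cover this extended range.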
110 | max_features = np.max(list(indice_token.keys())) + 1 111 | 112 | # Augmenting x_train and x_test with n-grams features 113 | x_train = add_ngram(x_train, token_indice, ngram_range) 114 | x_test = add_ngram(x_test, token_indice, ngram_range) 115 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 116 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 117 | 118 | print('Pad sequences (samples x time)') 119 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 120 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 121 | print('x_train shape:', x_train.shape) 122 | print('x_test shape:', x_test.shape) 123 | 124 | print('Build model...') 125 | model = Sequential() 126 | 127 | # we start off with an efficient embedding layer which maps 128 | # our vocab indices into embedding_dims dimensions 129 | model.add(Embedding(max_features, 130 | embedding_dims, 131 | input_length=maxlen)) 132 | 133 | # we add a GlobalAveragePooling1D, which will average the embeddings 134 | # of all words in the document 135 | model.add(GlobalAveragePooling1D()) 136 | 137 | # We project onto a single unit output layer, and squash it with a sigmoid: 138 | model.add(Dense(1, activation='sigmoid')) 139 | 140 | model.compile(loss='binary_crossentropy', 141 | optimizer='adam', 142 | metrics=['accuracy']) 143 | 144 | # patience经过几个epoch后loss不在变化停止训练 145 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 146 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 147 | 148 | hist = model.fit(x_train, y_train, 149 | batch_size=batch_size, 150 | epochs=epochs, 151 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 152 | 153 | # print(hist.history) 154 | ##输出loss与acc到日志文件 155 | log_format = "%(asctime)s - %(message)s" 156 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 157 | logging.warning(modelname) 158 | for i in range(len(hist.history["acc"])): 159 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 160 | logging.warning(strlog) 161 | 162 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /MyModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 1、自定义模型 Conv-BiGRU 卷积和循环并行 4 | 2、自定义模型 卷积和循环串行 5 | """ 6 | from keras.layers import Dense, Input, Flatten,concatenate 7 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional 8 | from keras.models import Model 9 | import logging 10 | from keras.callbacks import EarlyStopping 11 | 12 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 13 | sentence = Input(shape=(None,), dtype="int32") 14 | embedding_layer = Embedding(max_token + 1, 15 | embedding_dims, 16 | input_length=maxlen, 17 | weights=[embedding_matrix], 18 | trainable=False) 19 | sentence_embedding = embedding_layer(sentence) 20 | c2 = Conv1D(2, 2, activation='relu')(sentence_embedding) 21 | p2 = MaxPooling1D(27)(c2) 22 | p2 = Flatten()(p2) 23 | 24 | c3 = Conv1D(2, 3, activation='relu')(sentence_embedding) 25 | p3 = MaxPooling1D(26)(c3) 26 | p3 = Flatten()(p3) 27 | 28 | c4 = Conv1D(2, 4, activation='relu')(sentence_embedding) 29 | p4 = 
MaxPooling1D(25)(c4) 30 | p4 = Flatten()(p4) 31 | 32 | g1 = Bidirectional(GRU(128))(sentence_embedding) 33 | 34 | x = concatenate([p2, p3, p4, g1]) 35 | output = Dense(1, activation="sigmoid")(x) 36 | model = Model(inputs=sentence, outputs=output) 37 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 38 | 39 | # patience经过几个epoch后loss不在变化停止训练 40 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 41 | hist = model.fit(x_train, y_train, 42 | batch_size=batch_size, 43 | epochs=epochs, 44 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 45 | ##输出loss与acc到日志文件 46 | log_format = "%(asctime)s - %(message)s" 47 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 48 | logging.warning(modelname) 49 | for i in range(len(hist.history["acc"])): 50 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 51 | logging.warning(strlog) 52 | 53 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## keras实现深度学习模型 进行文本分类 2 | 3 | > 实验数据采用真实邮件数据,涉及个人隐私,无法公开,可自行寻找数据测试--格式为:文本内容,标签 4 | 5 | > 模型参数未经过合适调整,目前正在实验修改验证模型当中,修改完成会更新项目 6 | 7 | 8 | - py2 详见py2目录下说明 9 | - main_control.py 主程序入口 10 | - dataPreprocess.py 数据处理 数据输入为:句子中的词(空格分开),标签 11 | - word2vec.py 训练word2vec模型 12 | - FastText.py fastText keras实现 13 | - TextCNNmodel.py word2vecter做词向量的CNN模型 14 | - TextRNNmodel.py SimpleRNN 双向lstm GRU 15 | - TextRCNNmodel.py Recurrent Convolutional Neural Networks for Text Classification 16 | - TextAttention.py 双向LSTM+Attention分层注意网络 -HAN模型 (与论文有区别) 17 | - MyModel.py 并行卷积和双向GRU 18 | -------------------------------------------------------------------------------- /TextAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 3 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 4 | from keras.layers import merge 5 | from keras.models import Model 6 | from keras import backend as K 7 | from keras.layers.core import Lambda,RepeatVector 8 | import logging 9 | from keras.callbacks import EarlyStopping 10 | 11 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 12 | embedding_layer = Embedding(max_token + 1, 13 | embedding_dims, 14 | weights=[embedding_matrix], 15 | input_length=maxlen, 16 | trainable=True) 17 | # LSTM步长 18 | TIME_STEPS = maxlen 19 | SINGLE_ATTENTION_VECTOR = False 20 | 21 | ##不带别名的自编写Attention 22 | # def attention_3d_block(inputs): 23 | # # inputs.shape = (batch_size, time_steps, input_dim) 24 | # input_dim = int(inputs.shape[2]) 25 | # a = Permute((2, 1))(inputs) 26 | # a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 
27 | # a = Dense(TIME_STEPS, activation='softmax')(a) 28 | # if SINGLE_ATTENTION_VECTOR: 29 | # a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 30 | # a = RepeatVector(input_dim)(a) 31 | # a_probs = Permute((2, 1), name='attention_vec')(a) 32 | # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul') 33 | # return output_attention_mul 34 | ##使用多次attention需要新命名 35 | def attention_3d_block2(inputs, new_layer_name): 36 | # inputs.shape = (batch_size, time_steps, input_dim) 37 | input_dim = int(inputs.shape[2]) 38 | a = Permute((2, 1))(inputs) 39 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 40 | a = Dense(TIME_STEPS, activation='softmax')(a) 41 | if SINGLE_ATTENTION_VECTOR: 42 | a = Lambda(lambda x: K.mean(x, axis=1), name=new_layer_name + '_' + 'dim_reduction')(a) 43 | a = RepeatVector(input_dim)(a) 44 | a_probs = Permute((2, 1), name=new_layer_name + '_''attention_vec')(a) 45 | output_attention_mul = merge([inputs, a_probs], name=new_layer_name + '_''attention_mul', mode='mul') 46 | return output_attention_mul 47 | 48 | # 单向LSTM之后加入Attention 49 | # sentence_input = Input(shape=(maxlen,), dtype='int32') 50 | # embedded_sequences = embedding_layer(sentence_input) 51 | # lstm_out = LSTM(100, return_sequences=True)(embedded_sequences) 52 | # attention_mul = attention_3d_block(lstm_out) 53 | # attention_mul = Flatten()(attention_mul) 54 | # output = Dense(1, activation='sigmoid')(attention_mul) 55 | # model = Model(sentence_input, output) 56 | # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 57 | # model.fit(x_train, y_train, validation_data=(x_test, y_test), 58 | # nb_epoch=epochs, batch_size=batch_size) 59 | 60 | # 双向LSTM词encoder 输入是 词标签数组 61 | sentence_input = Input(shape=(maxlen,), dtype='int32') 62 | embedded_sequences = embedding_layer(sentence_input) 63 | forward_rnn = LSTM(100, return_sequences=True) 64 | backward_rnn = LSTM(100, return_sequences=True, go_backwards=True) 65 | lstm_out_f_rnn = forward_rnn(embedded_sequences) 66 | attention_f_mul = attention_3d_block2(lstm_out_f_rnn, "forward") 67 | lstm_out_b_rnn = backward_rnn(embedded_sequences) 68 | attention_b_mul = attention_3d_block2(lstm_out_b_rnn, "backward") 69 | attention_mul = merge([attention_f_mul, attention_b_mul], mode='concat', concat_axis=-1) 70 | attention_mul = Flatten()(attention_mul) 71 | output = Dense(1, activation='sigmoid')(attention_mul) 72 | model = Model(sentence_input, output) 73 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 74 | 75 | # patience经过几个epoch后loss不在变化停止训练 76 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 77 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 78 | print('Train...') 79 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 80 | # loss = history.history["loss"][0] 81 | hist = model.fit(x_train, y_train, validation_data=(x_test, y_test), 82 | nb_epoch=epochs, batch_size=batch_size, callbacks=[early_stopping]) 83 | # print(hist.history) 84 | ##输出loss与acc到日志文件 85 | log_format = "%(asctime)s - %(message)s" 86 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 87 | logging.warning(modelname) 88 | for i in range(len(hist.history["acc"])): 89 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: 
"+str(hist.history["val_acc"][i]) 90 | logging.warning(strlog) 91 | 92 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /TextCNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras import Input, Model 3 | from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout 4 | 5 | from keras.callbacks import EarlyStopping 6 | import logging 7 | 8 | 9 | def train(x_train, y_train, x_test, y_test, maxlen, max_token, embedding_matrix, embedding_dims, batch_size, epochs, 10 | logpath, modelpath, modelname): 11 | print(modelname + 'Build model...') 12 | sentence = Input((maxlen,)) 13 | embedding_layer = Embedding(max_token + 1, 14 | embedding_dims, 15 | input_length=maxlen, 16 | weights=[embedding_matrix]) 17 | sentence_embedding = embedding_layer(sentence) 18 | c2 = Conv1D(128, 3, activation='relu')(sentence_embedding) 19 | p2 = GlobalMaxPooling1D()(c2) 20 | 21 | c3 = Conv1D(128, 4, activation='relu')(sentence_embedding) 22 | p3 = GlobalMaxPooling1D()(c3) 23 | 24 | c4 = Conv1D(128, 5, activation='relu')(sentence_embedding) 25 | p4 = GlobalMaxPooling1D()(c4) 26 | 27 | x = Concatenate()([p2, p3, p4]) 28 | output = Dense(1, activation="sigmoid")(x) 29 | model = Model(inputs=sentence, outputs=output) 30 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 31 | 32 | # print(model.summary()) 33 | # patience经过几个epoch后loss不在变化停止训练 34 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 35 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 36 | print('Train...') 37 | hist = model.fit(x_train, y_train, 38 | batch_size=batch_size, 39 | epochs=epochs, 40 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 41 | 42 | # print(hist.history) 43 | ##输出loss与acc到日志文件 44 | log_format = "%(asctime)s - %(message)s" 45 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 46 | logging.warning(modelname) 47 | for i in range(len(hist.history["acc"])): 48 | strlog = str(i + 1) + " Epoch " + "-loss: " + str(hist.history["loss"][i]) + " -acc: " + str( 49 | hist.history["acc"][i]) + " -val_loss: " + str(hist.history["val_loss"][i]) + " -val_acc: " + str( 50 | hist.history["val_acc"][i]) 51 | logging.warning(strlog) 52 | 53 | model.save(modelpath + modelname + '.h5') 54 | 55 | 56 | if __name__ == '__main__': 57 | print('11') 58 | -------------------------------------------------------------------------------- /TextRCNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras import backend 3 | from keras.layers import Dense, Input, Lambda, LSTM, TimeDistributed 4 | from keras.layers.merge import concatenate 5 | from keras.layers.embeddings import Embedding 6 | from keras.models import Model 7 | 8 | from keras.callbacks import EarlyStopping 9 | import logging 10 | import numpy as np 11 | 12 | 13 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname,hidden_dim_1,hidden_dim_2): 14 | print(modelname + 'Build model...') 15 | document = Input(shape=(None,), dtype="int32") 16 | left_context = Input(shape=(None,), dtype="int32") 17 | right_context = Input(shape=(None,), dtype="int32") 18 | 19 | embedder = Embedding(max_token + 1, embedding_dims, weights=[embedding_matrix], trainable=False) 
#input_length=maxlen 20 | doc_embedding = embedder(document) 21 | l_embedding = embedder(left_context) 22 | r_embedding = embedder(right_context) 23 | 24 | # I use LSTM RNNs instead of vanilla RNNs as described in the paper. 25 | forward = LSTM(hidden_dim_1, return_sequences=True)(l_embedding) # See equation (1). 26 | backward = LSTM(hidden_dim_1, return_sequences=True, go_backwards=True)(r_embedding) # See equation (2). 27 | together = concatenate([forward, doc_embedding, backward], axis=2) # See equation (3). 28 | 29 | semantic = TimeDistributed(Dense(hidden_dim_2, activation="tanh"))(together) # See equation (4). 30 | 31 | # Keras provides its own max-pooling layers, but they cannot handle variable length input 32 | # (as far as I can tell). As a result, I define my own max-pooling layer here. 33 | pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_dim_2,))(semantic) # See equation (5). 34 | 35 | output = Dense(1, input_dim=hidden_dim_2, activation="sigmoid")(pool_rnn) # See equations (6) and (7).NUM_CLASSES=1 36 | 37 | model = Model(inputs=[document, left_context, right_context], outputs=output) 38 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 39 | 40 | ##生成左右上下文 41 | print('Build left and right data') 42 | doc_x_train = np.array(x_train) 43 | # We shift the document to the right to obtain the left-side contexts. 44 | left_x_train = np.array([[max_token] + t_one[:-1].tolist() for t_one in x_train]) 45 | # We shift the document to the left to obtain the right-side contexts. 46 | right_x_train = np.array([t_one[1:].tolist() + [max_token] for t_one in x_train]) 47 | 48 | doc_x_test = np.array(x_test) 49 | # We shift the document to the right to obtain the left-side contexts. 50 | left_x_test = np.array([[max_token] + t_one[:-1].tolist() for t_one in x_test]) 51 | # We shift the document to the left to obtain the right-side contexts. 
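# Worked example of the shifting (hypothetical ids; padding id = max_token):
#   document:       [12,  7, 30]
#   left context:   [max_token, 12,  7]   -> shifted right, padded at the front
#   right context:  [ 7, 30, max_token]   -> shifted left,  padded at the end
# so each position sees its own word plus its left and right contexts,
# matching equations (1)-(3) of the RCNN paper referenced above.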
52 | right_x_test = np.array([t_one[1:].tolist() + [max_token] for t_one in x_test]) 53 | 54 | # patience经过几个epoch后loss不在变化停止训练 55 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 56 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 57 | print('Train...') 58 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 59 | # loss = history.history["loss"][0] 60 | hist = model.fit([doc_x_train, left_x_train, right_x_train], y_train, 61 | batch_size=batch_size, 62 | epochs=epochs, 63 | validation_data=[[doc_x_test, left_x_test, right_x_test], y_test], callbacks=[early_stopping]) 64 | 65 | # print(hist.history) 66 | ##输出loss与acc到日志文件 67 | log_format = "%(asctime)s - %(message)s" 68 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 69 | logging.warning(modelname) 70 | for i in range(len(hist.history["acc"])): 71 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 72 | logging.warning(strlog) 73 | 74 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /TextRNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional,GRU,SimpleRNN 4 | import logging 5 | from keras.callbacks import EarlyStopping 6 | 7 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 8 | embedding_layer = Embedding(max_token + 1, 9 | embedding_dims, 10 | input_length=maxlen, 11 | weights=[embedding_matrix], 12 | trainable=False) 13 | print(modelname + 'Build model...') 14 | model = Sequential() 15 | model.add(embedding_layer) 16 | model.add(SimpleRNN(128, activation="relu")) 17 | # model.add(LSTM(128)) 18 | # model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 19 | # model.add(Bidirectional(GRU(64))) 20 | model.add(Dropout(0.2)) 21 | model.add(Dense(1, activation='sigmoid')) 22 | # try using different optimizers and different optimizer configs 23 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 24 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 25 | # a stateful LSTM model 26 | # lahead: the input sequence length that the LSTM 27 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 28 | # model = Sequential() 29 | # model.add(LSTM(20,input_shape=(lahead, 1), 30 | # batch_size=batch_size, 31 | # stateful=stateful)) 32 | # model.add(Dense(1)) 33 | # model.compile(loss='mse', optimizer='adam') 34 | 35 | # patience经过几个epoch后loss不在变化停止训练 36 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 37 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 38 | print('Train...') 39 | hist = model.fit(x_train, y_train, 40 | batch_size=batch_size, 41 | epochs=epochs, 42 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 43 | # print(hist.history) 44 | ##输出loss与acc到日志文件 45 | log_format = "%(asctime)s - %(message)s" 46 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 47 | logging.warning(modelname) 48 | for i in range(len(hist.history["acc"])): 49 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: 
"+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 50 | logging.warning(strlog) 51 | 52 | model.save(modelpath + modelname + '.h5') 53 | 54 | def train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 55 | embedding_layer = Embedding(max_token + 1, 56 | embedding_dims, 57 | input_length=maxlen, 58 | weights=[embedding_matrix], 59 | trainable=False) 60 | print(modelname + 'Build model...') 61 | model = Sequential() 62 | model.add(embedding_layer) 63 | # model.add(SimpleRNN(128, activation="relu")) 64 | # model.add(LSTM(128)) 65 | model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 66 | # model.add(Bidirectional(GRU(64))) 67 | model.add(Dropout(0.2)) 68 | model.add(Dense(1, activation='sigmoid')) 69 | # try using different optimizers and different optimizer configs 70 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 71 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 72 | # a stateful LSTM model 73 | # lahead: the input sequence length that the LSTM 74 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 75 | # model = Sequential() 76 | # model.add(LSTM(20,input_shape=(lahead, 1), 77 | # batch_size=batch_size, 78 | # stateful=stateful)) 79 | # model.add(Dense(1)) 80 | # model.compile(loss='mse', optimizer='adam') 81 | 82 | # patience经过几个epoch后loss不在变化停止训练 83 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 84 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 85 | print('Train...') 86 | hist = model.fit(x_train, y_train, 87 | batch_size=batch_size, 88 | epochs=epochs, 89 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 90 | # print(hist.history) 91 | ##输出loss与acc到日志文件 92 | log_format = "%(asctime)s - %(message)s" 93 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 94 | logging.warning(modelname) 95 | for i in range(len(hist.history["acc"])): 96 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 97 | logging.warning(strlog) 98 | 99 | model.save(modelpath + modelname + '.h5') 100 | 101 | def train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 102 | embedding_layer = Embedding(max_token + 1, 103 | embedding_dims, 104 | input_length=maxlen, 105 | weights=[embedding_matrix], 106 | trainable=False) 107 | print(modelname+'Build model...') 108 | model = Sequential() 109 | model.add(embedding_layer) 110 | # model.add(SimpleRNN(128, activation="relu")) 111 | # model.add(LSTM(128)) 112 | # model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 113 | model.add(Bidirectional(GRU(128))) 114 | model.add(Dropout(0.2)) 115 | model.add(Dense(1, activation='sigmoid')) 116 | # try using different optimizers and different optimizer configs 117 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 118 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 119 | # a stateful LSTM model 120 | # lahead: the input sequence length that the LSTM 121 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 122 | # model = Sequential() 123 | # model.add(LSTM(20,input_shape=(lahead, 1), 124 | # batch_size=batch_size, 125 | # stateful=stateful)) 126 | # 
model.add(Dense(1)) 127 | # model.compile(loss='mse', optimizer='adam') 128 | 129 | # patience经过几个epoch后loss不在变化停止训练 130 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 131 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 132 | print('Train...') 133 | hist = model.fit(x_train, y_train, 134 | batch_size=batch_size, 135 | epochs=epochs, 136 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 137 | # print(hist.history) 138 | ##输出loss与acc到日志文件 139 | log_format = "%(asctime)s - %(message)s" 140 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 141 | logging.warning(modelname) 142 | for i in range(len(hist.history["acc"])): 143 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 144 | logging.warning(strlog) 145 | 146 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /dataPreprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | import gensim 5 | from keras.preprocessing.sequence import pad_sequences 6 | 7 | """ 8 | dataPreprocess 9 | set 10 | path='./data/nlpmail_re3.txt' 11 | batch_size = 32 12 | embedding_dims = 128 #词向量长度 13 | epochs = 100 14 | w2vpath="./data/w2c_model" 15 | hidden_dim_1 = 200 16 | hidden_dim_2 = 100 17 | return x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix, 18 | """ 19 | def getdata(path,embedding_dims,w2vpath): 20 | print('Loading data...') 21 | d = pd.read_csv(path,header=None) 22 | d.columns=['title','lable'] 23 | 24 | #drop=True 不生成index列 25 | d=d[-pd.isnull(d["title"])].reset_index(drop=True) 26 | 27 | all_data=set() 28 | for line in d["title"]: 29 | ws=str(line).split(" ") 30 | for w in ws: 31 | if w == ' ' or w == '' or w=="\t" or w=="??": 32 | continue 33 | all_data.add(w) 34 | words=list(all_data) 35 | 36 | word_to_id = dict(zip(words, range(len(words)))) 37 | dx=[] 38 | for line in d["title"]: 39 | ws=str(line).split(" ") 40 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 41 | # dy=list(d['lable']) 42 | dy=d['lable'] 43 | 44 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 45 | # set parameters: 46 | maxlen=np.max(list(map(len, dx))) #maxlen = 29 最长文本词数 47 | 48 | inx=int(len(dx)/5*3) 49 | x_train, y_train, x_test, y_test = dx[0:inx],dy[0:inx],dx[inx:len(dx)],dy[inx:len(dx)] 50 | 51 | print(len(x_train), 'train sequences') 52 | print(len(x_test), 'test sequences') 53 | 54 | print('Pad sequences (samples x time)') 55 | x_train = pad_sequences(x_train, maxlen=maxlen) 56 | x_test = pad_sequences(x_test, maxlen=maxlen) 57 | print('x_train shape:', x_train.shape) 58 | print('x_test shape:', x_test.shape) 59 | 60 | 61 | print('Indexing word vectors.') 62 | embeddings_index = {} 63 | model = gensim.models.Word2Vec.load(w2vpath) 64 | 65 | #初始化一个0向量 统计未出现词个数 66 | null_word=np.zeros(embedding_dims) 67 | null_word_count=0 68 | 69 | for word in words: 70 | try: 71 | embeddings_index[word]=model[word] 72 | except: 73 | embeddings_index[word]=null_word 74 | null_word_count+=1 75 | print('Found %s word vectors.' % len(embeddings_index)) 76 | print('Found %s null word.' 
% null_word_count) 77 | 78 | print('Preparing embedding matrix.') 79 | max_token = len(word_to_id) 80 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 81 | for word, i in word_to_id.items(): 82 | if i > max_token: 83 | continue 84 | embedding_vector = embeddings_index.get(word) 85 | if embedding_vector is not None: 86 | # words not found in embedding index will be all-zeros. 87 | embedding_matrix[i] = embedding_vector 88 | 89 | return x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix -------------------------------------------------------------------------------- /main_control.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 使用Python脚本控制多个Python脚本运行 4 | 方法一: 5 | 直接Python顺序运行 6 | import os 7 | os.system("D:\ProgramData\Anaconda3\python D:\mysoft\dlspace\FastText3.py") #因为没有环境变量需要制定python路径 mac/linux os.system("python /xx/a.py") 8 | os.system("D:\ProgramData\Anaconda3\python D:\mysoft\dlspace\main_control.py") 9 | 方法二: 10 | 写成函数形式,调用函数 如下 11 | """ 12 | import dataPreprocess 13 | import FastText 14 | import TextCNNmodel 15 | import TextRNNmodel 16 | import TextRCNNmodel 17 | import TextAttention 18 | import MyModel 19 | print("设置参数") 20 | #获取数据参数 21 | # path = './data/nlpmail_re3.txt' 22 | path="./data/nlpmaildatasample2.csv" #数据输入 23 | w2vpath = "./data/w2c_model" #w2v模型地址 24 | embedding_dims = 128 # 词向量长度 25 | logpath='./model/mylog.txt' #日志记录地址 26 | modelpath='./model/' #模型保存目录 27 | #模型训练参数 28 | batch_size = 32 29 | epochs = 100 30 | #fastText参数 31 | ngram_range=2 32 | #TextRCNNmodel参数 33 | hidden_dim_1 = 200 34 | hidden_dim_2 = 100 35 | 36 | 37 | print("获取数据") 38 | x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix=dataPreprocess.getdata(path,embedding_dims,w2vpath) 39 | 40 | print("调用模型") 41 | FastText.getdata_train(path,ngram_range,maxlen+10,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,"FastText") 42 | TextCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextCNN") 43 | TextRNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextSimpleRNN") 44 | TextRNNmodel.train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiLSTM") 45 | TextRNNmodel.train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiGRU") 46 | TextRCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextRCNN",hidden_dim_1,hidden_dim_2) 47 | TextAttention.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextAttention") 48 | MyModel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"MyConBiGRU") 49 | -------------------------------------------------------------------------------- /py2/01mail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | 文本数据生成 输出 文本 词典 5 | 思考: 6 | 去停用词改为使用信息增益、互信息法、L1正则化选择词特征 7 | ''' 8 | import pandas as pd 9 | import re 10 | import jieba 11 | import cPickle as pickle 12 | import sys #这里只是一个对sys的引用,只能reload才能进行重新加载 13 | from 
sklearn.utils import shuffle 14 | 15 | def data2all(): 16 | ##数据生成 17 | f1 = pd.read_csv("./data/data2016_0730_1028.csv",sep=',',header=None,encoding="utf-8") 18 | f2 = pd.read_csv("./data/data2016_0730_1028.csv",sep=',',header=None,encoding="utf-8") 19 | f3 = pd.read_csv("./data/data20161028_20170108.csv",sep=',',header=None,encoding="utf-8") 20 | f =pd.concat([f1,f2,f3]) 21 | f.columns = ['accept','title','send','accept','time','label','day'] 22 | 23 | all_data=f[["title","label"]] 24 | 25 | x=all_data.groupby(["label"])["title"].count() 26 | # Index([u'个人文件夹(个人过滤器)', u'垃圾箱(系统判断)', u'已退信', u'投递中', u'投递成功', u'收件箱', u'自动转发',u'被拦截(个人过滤器)', u'被拦截(用户黑名单)', u'被拦截(系统拦截)'], 27 | d1=all_data[(all_data["label"]==x.index[1])].reset_index() #垃圾 28 | d2=all_data[(all_data["label"]==x.index[4])].reset_index() #投递成功 29 | d3=all_data[(all_data["label"]==x.index[5])].reset_index() #收件箱 30 | 31 | d=pd.concat([d1,d2,d3]) 32 | d=d[["title","label"]] 33 | d.to_csv("./data/nlpmail.csv",header=False,index=False,encoding="utf-8") 34 | 35 | #合并数据 36 | data2all() 37 | 38 | stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 39 | reload(sys) #通过import引用进来时,setdefaultencoding函数在被系统调用后被删除了,所以必须reload一次 40 | sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 41 | sys.setdefaultencoding('utf-8') 42 | 43 | ##读取文件 44 | d = pd.read_csv("./data/nlpmail.csv",sep=',',header=None,encoding="utf-8") 45 | d.columns = ['title','lable'] 46 | ##类别编码 47 | def label2num(x): 48 | l=0 49 | if(x==u"垃圾箱(系统判断)"): 50 | l=1 51 | return l 52 | d["lable2"]=[label2num(x) for x in d["lable"]] 53 | d["index"]=range(d.shape[0]) 54 | 55 | 56 | ##去除标点符号 57 | def remove_punctuation(line): 58 | #中文标点 !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏. 59 | #英文标点 !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ 60 | try: 61 | line = re.sub("[!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+".decode("utf-8"), "",line.decode("utf-8")) 62 | except Exception as e: 63 | print "error" 64 | return line 65 | 66 | ##结巴分词 67 | def cutline(line): 68 | line=str(line) #防止只有数字的识别为float 纯数字转换成 数字 一词 69 | words = jieba.cut(line, cut_all=False) 70 | re=" ".join(words) 71 | return re 72 | 73 | #创建字典 词级别 74 | def createVocabList(dataSet): 75 | all_data=[] 76 | for line in dataSet: 77 | for words in line.split(" "): 78 | all_data.append(words) 79 | all_data=set(all_data) 80 | return all_data 81 | 82 | #去除空值 83 | d=d.dropna() 84 | #去标点 85 | d["title"]=[remove_punctuation(x) for x in d["title"]] 86 | d=d[["index","title","label2"]] 87 | 88 | #替换特殊空格 89 | def replaySspace(line): 90 | line=line.replace('\xc2\xa0', '') 91 | return line 92 | 93 | 94 | d["title"]=[replaySspace(x) for x in d["title"]] 95 | 96 | ##去掉全英文和字母 97 | def rematch(line): 98 | if re.match('^[A-Za-z0-9]+$',line): 99 | line="q100" 100 | return line 101 | 102 | d["title"]=[rematch(x) for x in d["title"]] 103 | 104 | d=d[d["title"]!="q100"] 105 | 106 | #分词 107 | d["title"]=[cutline(x) for x in d["title"]] 108 | 109 | ##保存文本 110 | # path='./data/nlpmaildata.pkl' 111 | # output = file(path, 'wb') 112 | # pickle.dump(d, output, True) 113 | # output.close() 114 | # ##保存字典 115 | # vocab_dir=createVocabList(d["title"]) 116 | # vocab_dir=list(vocab_dir) 117 | # path='./data/vocab_dir.pkl' 118 | # output = file(path, 'wb') 119 | # pickle.dump(vocab_dir, output, True) 120 | # output.close() 121 | 122 | 123 | #数据清洗 替换英文和字母 选取文本长度>4的文本 124 | def replayxx(line): 125 | words=line.split(" ") 126 | newwords=[] 127 | for w in words: 128 | if w.encode( 'UTF-8' 
).isdigit(): 129 | w="数字" 130 | if w.encode( 'UTF-8' ).isalpha(): 131 | w="英文" 132 | if re.match('^[A-Za-z0-9]+$',w): 133 | w="数字英文" 134 | newwords.append(w) 135 | res=" ".join(newwords) 136 | return res 137 | d["title"]=[replayxx(x) for x in d["title"]] 138 | 139 | d=d[["title","lable2"]].reset_index(drop = True) 140 | d.columns=['title','lable'] 141 | 142 | # d["title"]=[x.encode("utf-8") for x in d["title"]] 143 | 144 | path='./data/nlpmaildata2.pkl' 145 | output = file(path, 'wb') 146 | pickle.dump(d, output, True) 147 | output.close() 148 | 149 | d = shuffle(d) 150 | d.to_csv("./data/nlpmail_re.csv",header=False,index=False,encoding="utf_8_sig") #(452526, 2) 151 | #切分数据集 152 | df1=d[(d["lable"]==1)].sample(frac=0.2) 153 | df2=d[(d["lable"]==0)].sample(frac=0.2) 154 | d=pd.concat([df1,df2]) 155 | from sklearn.utils import shuffle 156 | d = shuffle(d) 157 | d =d.sample(frac=0.2) 158 | path='./data/nlpmaildatasample2.pkl' 159 | output = file(path, 'wb') 160 | pickle.dump(d, output, True) 161 | output.close() 162 | 163 | d.to_csv("./data/nlpmaildatasample2.csv",header=False,index=False,encoding="utf_8_sig") 164 | -------------------------------------------------------------------------------- /py2/02mail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | 文本向量化 词袋方法 TF-IDF 文本Hash 朴素贝叶斯 5 | ''' 6 | import pandas as pd 7 | import re 8 | import jieba 9 | import cPickle as pickle 10 | import numpy as np 11 | 12 | ##读取文件 13 | # path='./data/nlpmaildata2.pkl' 14 | # path='./data/nlpmaildatasample2.pkl' 15 | # f2 = file(path, 'rb') 16 | # d = pickle.load(f2) 17 | # f2.close() 18 | path='./data/nlpmaildatasample2.csv' 19 | d = pd.read_csv(path,header=None) 20 | d.columns=['title','lable'] 21 | #打乱数据 22 | # from sklearn.utils import shuffle 23 | # d = shuffle(d) 24 | #获取停用词表 25 | def get_stopwords(path): 26 | f= open(path) 27 | stopwords=[] 28 | for line in f: 29 | stopwords.append(line.strip().decode("utf-8")) 30 | return stopwords 31 | #停用词导入 32 | stopwords=get_stopwords("./data/stopwords.txt") 33 | #获取训练标签 34 | dy=list(d["lable"]) 35 | ############################################################################################ 36 | ##方法1.1 自定义词袋方法 37 | ##词袋模型 38 | # def bagOfWords2VecMN(vocabList, inputSet): 39 | # returnVec = [0]*len(vocabList) 40 | # for word in inputSet: 41 | # if word in vocabList: 42 | # returnVec[vocabList.index(word)] += 1 43 | # return returnVec 44 | # path='./data/vocab_dir.pkl' 45 | # f2 = file(path, 'rb') 46 | # vocab_dir = pickle.load(f2) 47 | # f2.close() 48 | # #转换成list词袋 字典维度太大 会执行失败!!!行数*字典维度/1024/1024/1024=需要多少G内存 49 | # train=[] 50 | # label=list(d["label2"]) 51 | # for i in range(len(d["title"])): 52 | # if(i%10000 ==0): 53 | # print float(i)/float(len(d["title"])) 54 | # t=d["title"][i] 55 | # words=t.split(" ") 56 | # vec=bagOfWords2VecMN(vocab_dir,words) 57 | # train.append(vec) 58 | ############################################################################################# 59 | ##方法1.2 词袋向量化之sklearn 60 | #词袋向量化 61 | from sklearn.feature_extraction.text import CountVectorizer 62 | vectorizer=CountVectorizer(stop_words=stopwords) 63 | #输入是带空格的分词后list 64 | # d_x=vectorizer.fit_transform(d["title"]).toarray() #训练并转换 65 | vectorizer.fit(d["title"]) 66 | dx=vectorizer.transform(d["title"]).toarray() 67 | #返回满足条件的索引所在位置 68 | # print np.where(d_x[0]>0) 69 | #对应字典获取 70 | vocab_dir=vectorizer.get_feature_names() 71 | 
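## Quick sanity check (a sketch, commented out; assumes dx and vocab_dir built above)
# sample = dx[0]                       # bag-of-words counts of the first title
# for idx in np.where(sample > 0)[0]:  # column indices of words present in it
#     print vocab_dir[idx], sample[idx]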
############################################################################################# 72 | ##方法1.3 词袋向量化之sklearn,TF-IDF和标准化 73 | # from sklearn.feature_extraction.text import TfidfVectorizer 74 | # vector = TfidfVectorizer(stop_words=stopwords) 75 | # vector.fit(d["title"]) 76 | # dx=vector.transform(d["title"]).toarray() 77 | # vocab_dir = vector.get_feature_names()#获取词袋模型中的所有词 78 | ############################################################################################ 79 | ##方法2 文本Hash Trick 用哈希技巧矢量化大文本语料库 80 | ##原理 hash(文本1)=位置5 hash(文本2)=位置5 位置5的值=1+1or新的哈希函数 81 | # from sklearn.feature_extraction.text import HashingVectorizer 82 | # vectorizer2=HashingVectorizer(n_features = 1000,norm = None,stop_words=stopwords) 83 | # vectorizer2.fit(d["title"]) 84 | # dx=vectorizer2.transform(d["title"]).toarray() 85 | ############################################################################################# 86 | ##朴素贝叶斯按比例验证 87 | # from sklearn.naive_bayes import MultinomialNB 88 | # from sklearn.model_selection import cross_val_score 89 | # from sklearn.model_selection import StratifiedKFold 90 | # clf = MultinomialNB() 91 | # ##修改cv分折方法 92 | # skf = StratifiedKFold(n_splits=5) 93 | # ##修改score 94 | # scores = cross_val_score(clf, dx, dy, cv=skf, scoring='accuracy') 95 | # scores2 = cross_val_score(clf, dx, dy, cv=skf, scoring='f1') 96 | # #评分估计的平均得分和 95% 置信区间由此给出 97 | # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 98 | # print("f1: %0.2f (+/- %0.2f)" % (scores2.mean(), scores.std() * 2)) 99 | ############################################################################################# 100 | ##按比例切分训练集 101 | from sklearn.naive_bayes import MultinomialNB 102 | from sklearn.naive_bayes import BernoulliNB 103 | from sklearn.model_selection import train_test_split 104 | from sklearn.metrics import accuracy_score 105 | from sklearn.metrics import f1_score 106 | 107 | X_train, X_test, y_train, y_test = train_test_split(dx, dy, test_size=0.2, random_state=0) 108 | # clf = MultinomialNB() 109 | clf = BernoulliNB() 110 | clf.fit(X_train, y_train) 111 | y_pred=clf.predict(X_test) 112 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 113 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 114 | 115 | ##一致性对比 116 | dtrain=d[0:d.shape[0]/5*3] 117 | dtest=d[d.shape[0]/5*3:d.shape[0]] 118 | X_train, X_test, y_train, y_test=vectorizer.transform(dtrain["title"]).toarray(),vectorizer.transform(dtest["title"]).toarray(),list(dtrain["lable"]),list(dtest["lable"]) 119 | clf = BernoulliNB() 120 | clf.fit(X_train, y_train) 121 | y_pred=clf.predict(X_test) 122 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 123 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 124 | # Accuracy: 0.73 125 | # F1: 0.66 126 | # #评价标准 127 | # from sklearn import metrics 128 | # print "Accuracy : %.2f" % metrics.accuracy_score(label, pre_reduce) 129 | # print "recall : %.2f" % metrics.recall_score(label, pre_reduce) 130 | # print "F1 : %.2f" % metrics.f1_score(label, pre_reduce) 131 | -------------------------------------------------------------------------------- /py2/03fastText.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之fastText 5 | 方法一:自己编写 6 | 方法二:Facebook开源工具https://github.com/facebookresearch/fastText#text-classification 7 | paper:https://arxiv.org/pdf/1607.01759.pdf 8 | fastText的核心思想就是:将整篇文档的词及n-gram向量叠加平均得到文档向量,然后使用文档向量做softmax多分类 9 | 字符级n-gram特征的引入以及分层Softmax分类 10 | 
参考: 11 | http://blog.csdn.net/sinat_26917383/article/details/54850933 12 | http://www.52nlp.cn/category/text-classification 13 | """ 14 | #方法二 fastText对词向量生成考虑到上下文 基于Hierarchical(分层) Softmax 15 | # 输入格式 词(空格分开)_lable_标签 eg:英媒 称 威 __label__affairs 16 | import pandas as pd 17 | import re 18 | import jieba 19 | import cPickle as pickle 20 | import numpy as np 21 | 22 | ##读取文件 23 | path='./data/nlpmaildatasample2.csv' 24 | d = pd.read_csv(path,header=None) 25 | d.columns=['title','lable'] 26 | 27 | dtrain=d[0:d.shape[0]/5*3] 28 | dtest=d[d.shape[0]/5*3:d.shape[0]] 29 | 30 | #生成训练文件 31 | def w2file(data,filename): 32 | f = open(filename,"w") 33 | for i in range(data.shape[0]): 34 | outline = d['title'][i] + "\t__label__" + str(d['lable'][i]) + "\n" 35 | f.write(outline) 36 | f.close() 37 | 38 | w2file(dtrain,"./data/fasttext_train.txt") 39 | w2file(dtest,"./data/fasttext_test.txt") 40 | 41 | import logging 42 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 43 | import fastText 44 | #训练模型 45 | classifier = fastText.FastText.train_supervised("./data/fasttext_train.txt",lr=0.1, dim=100,wordNgrams=1,label=u"__label__") 46 | #参数 47 | # train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss=u'softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label=u'__label__', verbose=2, pretrainedVectors=u'') 48 | # input_file training file path (required) 49 | # output output file path (required) 50 | # lr learning rate [0.05] 51 | # lr_update_rate change the rate of updates for the learning rate [100] 52 | # dim size of word vectors [100] 53 | # ws size of the context window [5] 54 | # epoch number of epochs [5] 55 | # min_count minimal number of word occurences [5] 56 | # neg number of negatives sampled [5] 57 | # word_ngrams max length of word ngram [1] 58 | # loss loss function {ns, hs, softmax} [ns] 59 | # bucket number of buckets [2000000] 60 | # minn min length of char ngram [3] 61 | # maxn max length of char ngram [6] 62 | # thread number of threads [12] 63 | # t sampling threshold [0.0001] 64 | # silent disable the log output from the C++ extension [1] 65 | # encoding specify input_file encoding [utf-8] 66 | ((u'__label__0',), array([ 0.77616984])) 67 | #测试模型 help(classifier) 68 | result = classifier.test("./data/fasttext_test.txt") 69 | print result 70 | texts=[str(t).decode("utf-8") for t in dtest["title"]] #预测与输入编码必须一致 71 | ##predict输出格式((u'__label__0',), array([ 0.77616984])) 72 | y_pred = [int(e[0].replace("__label__","")) for e in classifier.predict(texts)[0]] #预测输出结果为元组 73 | y_test=list(dtest["lable"]) 74 | from sklearn.metrics import accuracy_score 75 | from sklearn.metrics import f1_score 76 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 77 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 78 | # Accuracy: 0.73 79 | # F1: 0.65 80 | -------------------------------------------------------------------------------- /py2/03fastText_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之fastText 5 | 方法一:自己编写 keras官方版本 6 | """ 7 | #建立词典 8 | from __future__ import print_function 9 | from keras.preprocessing import sequence 10 | from keras.models import Sequential 11 | from keras.layers import Dense 12 | from keras.layers import Embedding 13 | from keras.layers import GlobalAveragePooling1D 14 | import pandas as pd 15 | import cPickle as pickle 16 | import numpy 
as np 17 | 18 | ##读取文件 19 | path='./data/nlpmaildatasample2.csv' 20 | d = pd.read_csv(path,header=None) 21 | d.columns=['title','lable'] 22 | 23 | all_data=set() 24 | for line in d["title"]: 25 | ws=line.split(" ") 26 | for w in ws: 27 | all_data.add(w) 28 | words=list(all_data) 29 | word_to_id = dict(zip(words, range(len(words)))) 30 | dx=[] 31 | for line in d["title"]: 32 | ws=line.split(" ") 33 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 34 | dy=list(d['lable']) 35 | # dy=d['lable'] 36 | 37 | def create_ngram_set(input_list, ngram_value=2): 38 | """ 39 | Extract a set of n-grams from a list of integers. 40 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 41 | {(4, 9), (4, 1), (1, 4), (9, 4)} 42 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 43 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 44 | """ 45 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 46 | 47 | 48 | def add_ngram(sequences, token_indice, ngram_range=2): 49 | """ 50 | Augment the input list of list (sequences) by appending n-grams values. 51 | Example: adding bi-gram 52 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 53 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 54 | >>> add_ngram(sequences, token_indice, ngram_range=2) 55 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 56 | Example: adding tri-gram 57 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 58 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 59 | >>> add_ngram(sequences, token_indice, ngram_range=3) 60 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] 61 | """ 62 | new_sequences = [] 63 | for input_list in sequences: 64 | new_list = input_list[:] 65 | for i in range(len(new_list) - ngram_range + 1): 66 | for ngram_value in range(2, ngram_range + 1): 67 | ngram = tuple(new_list[i:i + ngram_value]) 68 | if ngram in token_indice: 69 | new_list.append(token_indice[ngram]) 70 | new_sequences.append(new_list) 71 | 72 | return new_sequences 73 | 74 | # Set parameters: 75 | # ngram_range = 2 will add bi-grams features 76 | ngram_range = 1 77 | max_features = len(words) 78 | maxlen = 400 79 | batch_size = 32 80 | embedding_dims = 50 81 | epochs = 5 82 | 83 | print('Loading data...') 84 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 85 | 86 | print(len(x_train), 'train sequences') 87 | print(len(x_test), 'test sequences') 88 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 89 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 90 | 91 | if ngram_range > 1: 92 | print('Adding {}-gram features'.format(ngram_range)) 93 | # Create set of unique n-gram from the training set. 94 | ngram_set = set() 95 | for input_list in x_train: 96 | for i in range(2, ngram_range + 1): 97 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 98 | ngram_set.update(set_of_ngram) 99 | 100 | # Dictionary mapping n-gram token to a unique integer. 101 | # Integer values are greater than max_features in order 102 | # to avoid collision with existing features. 103 | start_index = max_features + 1 104 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 105 | indice_token = {token_indice[k]: k for k in token_indice} 106 | 107 | # max_features is the highest integer that could be found in the dataset. 
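# Note: ngram_range is set to 1 above, so this whole "if ngram_range > 1"
# branch is skipped and the model trains on unigrams only; set ngram_range = 2
# to actually add the bi-gram features described in the parameter comment.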
108 | max_features = np.max(list(indice_token.keys())) + 1 109 | 110 | # Augmenting x_train and x_test with n-grams features 111 | x_train = add_ngram(x_train, token_indice, ngram_range) 112 | x_test = add_ngram(x_test, token_indice, ngram_range) 113 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 114 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 115 | 116 | print('Pad sequences (samples x time)') 117 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 118 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 119 | print('x_train shape:', x_train.shape) 120 | print('x_test shape:', x_test.shape) 121 | 122 | print('Build model...') 123 | model = Sequential() 124 | 125 | # we start off with an efficient embedding layer which maps 126 | # our vocab indices into embedding_dims dimensions 127 | model.add(Embedding(max_features, 128 | embedding_dims, 129 | input_length=maxlen)) 130 | 131 | # we add a GlobalAveragePooling1D, which will average the embeddings 132 | # of all words in the document 133 | model.add(GlobalAveragePooling1D()) 134 | 135 | # We project onto a single unit output layer, and squash it with a sigmoid: 136 | model.add(Dense(1, activation='sigmoid')) 137 | 138 | model.compile(loss='binary_crossentropy', 139 | optimizer='adam', 140 | metrics=['accuracy']) 141 | 142 | model.fit(x_train, y_train, 143 | batch_size=batch_size, 144 | epochs=epochs, 145 | validation_data=(x_test, y_test)) 146 | -------------------------------------------------------------------------------- /py2/04textCNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之textCNN 疑问MaxPooling1D?使用计算?区别和GlobalMaxPooling1D 5 | 论文:Convolutional Neural Networks for Sentence Classification 6 | 论文解读:http://www.jeyzhang.com/cnn-apply-on-modelling-sentence.html 7 | 输入层:词个数x词向量维数---矩阵的类型可以是静态的(static)word vector是固定不变,动态的(non static)word vector也当做是可优化的参数这一过程称为 Fine tune 8 | 卷积层:若干个Feature Map--不同大小滤波器 卷积核大小为nxk k是词向量维度 1D默认宽度为词向量维度 9 | 池化层:Max-over-time Pooling--输出为各个Feature Map的最大值们,即一个一维的向量 10 | 全连接 + Softmax层:池化层的一维向量的输出通过全连接的方式,连接一个Softmax层 11 | Dropout:倒数第二层的全连接部分,L2正则化,减轻过拟合 12 | 词向量变种: 13 | CNN-rand:对不同单词的向量作随机初始化,BP的时候作调整 Embedding层选择随机初始化方法 14 | static:拿word2vec, FastText or GloVe训练好的词向量 15 | non-static:拿word2vec, FastText or GloVe训练好的词向量,训练过程中再对它们微调Fine tuned(自己理解:先用其他大文本语料训练w2v再用本文本训练w2v) 16 | multiple channel :类比于图像中的RGB通道, 这里也可以用 static 与 non-static 搭两个通道来搞 17 | 结论: 18 | CNN-static较与CNN-rand好,说明pre-training的word vector确实有较大的提升作用(这也难怪,因为pre-training的word vector显然利用了更大规模的文本数据信息); 19 | CNN-non-static较于CNN-static大部分要好,说明适当的Fine tune也是有利的,是因为使得vectors更加贴近于具体的任务; 20 | CNN-multichannel较于CNN-single在小规模的数据集上有更好的表现,实际上CNN-multichannel体现了一种折中思想,即既不希望Fine tuned的vector距离原始值太远,但同时保留其一定的变化空间 21 | github:https://github.com/yoonkim/CNN_sentence 22 | code参考 23 | http://blog.csdn.net/diye2008/article/details/53105652?locationNum=11&fps=1 24 | glove embedding参考http://blog.csdn.net/sscssz/article/details/53333225 25 | """ 26 | from __future__ import print_function 27 | 28 | from keras.preprocessing.sequence import pad_sequences 29 | from keras.layers import Dense, Input, Flatten,GlobalMaxPooling1D 30 | from keras.layers import Conv1D, MaxPooling1D, Embedding,Dropout 31 | from keras.models import Model 32 | from keras.optimizers import * 33 | from keras.models import Sequential 34 | from keras.layers import merge 35 | import pandas 
as pd 36 | import cPickle as pickle 37 | import numpy as np 38 | import gensim 39 | 40 | ##数据获取 41 | print('Loading data...') 42 | path='./data/nlpmaildatasample2.csv' 43 | d = pd.read_csv(path,header=None) 44 | d.columns=['title','lable'] 45 | 46 | all_data=set() 47 | for line in d["title"]: 48 | ws=line.split(" ") 49 | for w in ws: 50 | if w == ' ' or w == '' or w=="\t": 51 | continue 52 | all_data.add(w) 53 | words=list(all_data) 54 | word_to_id = dict(zip(words, range(len(words)))) 55 | dx=[] 56 | for line in d["title"]: 57 | ws=line.split(" ") 58 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 59 | # dy=list(d['lable']) 60 | dy=d['lable'] 61 | 62 | 63 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 64 | 65 | # set parameters: 66 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 67 | max_features = 20000 #字典允许最大大小 68 | batch_size = 32 69 | embedding_dims = 64 #词向量长度 70 | epochs = 2 71 | w2vpath="./data/w2c_model" 72 | 73 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 74 | print(len(x_train), 'train sequences') 75 | print(len(x_test), 'test sequences') 76 | 77 | print('Pad sequences (samples x time)') 78 | x_train = pad_sequences(x_train, maxlen=maxlen) 79 | x_test = pad_sequences(x_test, maxlen=maxlen) 80 | print('x_train shape:', x_train.shape) 81 | print('x_test shape:', x_test.shape) 82 | 83 | 84 | print('Indexing word vectors.') 85 | embeddings_index = {} 86 | model = gensim.models.Word2Vec.load(w2vpath) 87 | for word in words: 88 | embeddings_index[word]=model[word] 89 | print('Found %s word vectors.' % len(embeddings_index)) 90 | 91 | print('Preparing embedding matrix.') 92 | nb_words = min(max_features, len(word_to_id)) 93 | embedding_matrix = np.zeros((nb_words + 1, embedding_dims)) 94 | for word, i in word_to_id.items(): 95 | if i > max_features: 96 | continue 97 | embedding_vector = embeddings_index.get(word) 98 | if embedding_vector is not None: 99 | # words not found in embedding index will be all-zeros. 
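# (Also note: word ids start at 0 in word_to_id while pad_sequences pads with 0
#  as well, so the padding index shares row 0 with one real word; reserving
#  index 0 for padding would avoid that overlap.)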
100 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(nb_words) 101 | 102 | 103 | # 神经网路的第一层,词向量层,本文使用了预训练word2vec词向量,可以把trainable那里设为False 104 | embedding_layer = Embedding(nb_words+1, 105 | embedding_dims, 106 | input_length=maxlen, 107 | weights=[embedding_matrix], 108 | trainable=False) 109 | print('Build model...') 110 | ##最简单cnn 111 | # model = Sequential() 112 | # model.add(Embedding(nb_words + 1, 113 | # embedding_dims, 114 | # input_length=maxlen)) 115 | # model.add(Dropout(0.2)) 116 | # model.add(Conv1D(250,#filters 117 | # 3,#kernel_size 118 | # padding='valid', 119 | # activation='relu', 120 | # strides=1)) 121 | # model.add(GlobalMaxPooling1D()) 122 | # model.add(Dense(250))#hidden layer: 123 | # model.add(Dropout(0.2)) 124 | # model.add(Activation('relu')) 125 | # model.add(Dense(1)) 126 | # model.add(Activation('sigmoid')) 127 | # model.compile(loss='binary_crossentropy', 128 | # optimizer='adam', 129 | # metrics=['accuracy']) 130 | # model.fit(x_train, y_train, 131 | # batch_size=batch_size, 132 | # epochs=epochs, 133 | # validation_data=(x_test, y_test)) 134 | 135 | ###3层合并model 经过词向量表达的文本为一维数据,因此在TextCNN卷积用的是一维卷积 136 | #left model 137 | model_left = Sequential() 138 | #https://keras.io/layers/embeddings/ 139 | # model.add(Embedding(max_features,embedding_dims,input_length=maxlen)) 140 | model_left.add(embedding_layer) 141 | model_left.add(Conv1D(128, 5, activation='relu')) #128输出的维度 5卷积核大小 142 | model_left.add(MaxPooling1D())#5 143 | model_left.add(Conv1D(128, 5, activation='relu')) 144 | model_left.add(MaxPooling1D())#5 145 | model_left.add(Conv1D(128, 5, activation='relu')) 146 | model_left.add(MaxPooling1D()) #35 #model_left.add(GlobalMaxPooling1D()) 147 | model_left.add(Flatten()) 148 | 149 | model_right = Sequential() 150 | model_right.add(embedding_layer) 151 | model_right.add(Conv1D(128, 4, activation='relu')) 152 | model_right.add(MaxPooling1D())#4 153 | model_right.add(Conv1D(128, 4, activation='relu')) 154 | model_right.add(MaxPooling1D())#4 155 | model_right.add(Conv1D(128, 4, activation='relu')) 156 | model_right.add(MaxPooling1D())#28 157 | model_right.add(Flatten()) 158 | 159 | model_3 = Sequential() 160 | model_3.add(embedding_layer) 161 | model_3.add(Conv1D(128, 6, activation='relu')) 162 | model_3.add(MaxPooling1D())#3 163 | model_3.add(Conv1D(128, 6, activation='relu')) 164 | model_3.add(MaxPooling1D())#3 165 | model_3.add(Conv1D(128, 6, activation='relu')) 166 | model_3.add(MaxPooling1D())#30 167 | model_3.add(Flatten()) 168 | 169 | merged = Merge([model_left, model_right,model_3], mode='concat') # 将三种不同卷积窗口的卷积层组合 连接在一起,当然也可以只是用三个model中的一个,一样可以得到不错的效果,只是本文采用论文中的结构设计 170 | model = Sequential() 171 | model.add(merged) # add merge 172 | model.add(Dense(128, activation='relu')) # 全连接层 173 | model.add(Dropout(0.2)) 174 | model.add(Dense(1, activation='sigmoid')) # softmax对应多分类 需要修改loss,输出文本属于类别中每个类别的概率 175 | 176 | model.compile(loss='binary_crossentropy', 177 | optimizer='adam', 178 | metrics=['accuracy']) 179 | model.fit(x_train, y_train, 180 | batch_size=batch_size, 181 | epochs=epochs, 182 | validation_data=(x_test, y_test)) 183 | 184 | score = model.evaluate(x_train, y_train, verbose=0) # 评估模型在训练集中的效果,准确率约99% 185 | print('train score:', score[0]) 186 | print('train accuracy:', score[1]) 187 | score = model.evaluate(x_test, y_test, verbose=0) # 评估模型在测试集中的效果,准确率约为97%,迭代次数多了,会进一步提升 188 | print('Test score:', score[0]) 189 | print('Test accuracy:', score[1]) 190 | 
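# ------------------------------------------------------------------
# Editor's note (not part of the original script): the 3-branch combination
# above is written against the Keras 1 API. Merge([...], mode='concat') was
# removed in Keras 2, and this file only imports the lowercase merge
# function, so the capitalised Merge call raises a NameError as written.
# Below is a minimal, hedged sketch of the same idea using the Keras 2
# functional API; it keeps one Conv1D block per branch for brevity (the
# original stacks three per branch), and the name model2 is illustrative only.
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from keras.models import Model

inp = Input(shape=(maxlen,), dtype='int32')
emb = embedding_layer(inp)                      # reuse the frozen, pre-trained embedding layer defined above
branches = []
for k in (5, 4, 6):                             # the three kernel sizes used by the three branches above
    x = Conv1D(128, k, activation='relu')(emb)  # 128 filters, kernel size k
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    branches.append(x)
x = concatenate(branches)                       # Keras 2 replacement for Merge(..., mode='concat')
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)
model2 = Model(inp, out)
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model2.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#            validation_data=(x_test, y_test))
# ------------------------------------------------------------------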
-------------------------------------------------------------------------------- /py2/05textRNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 未使用word2vector的双向lstm 5 | t 时刻输出不仅取决于之前时刻的序列输入,还取决于将来时刻序列输入 6 | embedding--->bi-directional lstm--->concat output--->average----->softmax 7 | lstm中的Xt-1,Xt代表的是一个样本中的每一个词 所有循环只在一个样本中循环 8 | TimeDistributed包装器=把一个层应用到输入的每一个时间步上-http://keras-cn.readthedocs.io/en/latest/layers/wrapper/ 9 | 思考: 10 | 分类的时候不只使用最后一个隐藏元的输出,而是把所有隐藏元的输出做K-MaxPooling再分类 11 | 在双向GRU前添加单层卷积层提取一次ngram特征-C-GRU 12 | """ 13 | from __future__ import print_function 14 | from keras.preprocessing import sequence 15 | from keras.models import Sequential 16 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 17 | from keras.datasets import imdb 18 | 19 | import pandas as pd 20 | import cPickle as pickle 21 | import numpy as np 22 | import gensim 23 | 24 | ##数据获取 25 | print('Loading data...') 26 | path='./data/nlpmaildatasample2.csv' 27 | d = pd.read_csv(path,header=None) 28 | d.columns=['title','lable'] 29 | 30 | all_data=set() 31 | for line in d["title"]: 32 | ws=line.split(" ") 33 | for w in ws: 34 | if w == ' ' or w == '' or w=="\t": 35 | continue 36 | all_data.add(w) 37 | words=list(all_data) 38 | word_to_id = dict(zip(words, range(len(words)))) 39 | dx=[] 40 | for line in d["title"]: 41 | ws=line.split(" ") 42 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 43 | # dy=list(d['lable']) 44 | dy=d['lable'] 45 | 46 | # set parameters: 47 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 48 | max_features = len(word_to_id)+1 49 | batch_size = 32 50 | embedding_dims=128 51 | 52 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 53 | print(len(x_train), 'train sequences') 54 | print(len(x_test), 'test sequences') 55 | print('Pad sequences (samples x time)') 56 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 57 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 58 | print('x_train shape:', x_train.shape) 59 | print('x_test shape:', x_test.shape) 60 | 61 | print('Build model...') 62 | model = Sequential() 63 | model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) 64 | model.add(Bidirectional(LSTM(64))) ### 输出维度64 GRU 65 | model.add(Dropout(0.5)) 66 | model.add(Dense(1, activation='sigmoid')) 67 | # try using different optimizers and different optimizer configs 68 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 69 | #lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 70 | # a stateful LSTM model 71 | #lahead: the input sequence length that the LSTM 72 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 73 | # model = Sequential() 74 | # model.add(LSTM(20,input_shape=(lahead, 1), 75 | # batch_size=batch_size, 76 | # stateful=stateful)) 77 | # model.add(Dense(1)) 78 | # model.compile(loss='mse', optimizer='adam') 79 | 80 | 81 | print('Train...') 82 | model.fit(x_train, y_train, 83 | batch_size=batch_size, 84 | epochs=4, 85 | validation_data=[x_test, y_test]) 86 | 87 | # y_pred = model.predict_classes(x_test, verbose=0) 88 | -------------------------------------------------------------------------------- /py2/06textRCNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 使用Word2vec定义词向量矩阵 5 | recurrent 
structure (convolutional layer): 6 | 词向量矩阵 7 | left(无意义补0+去最后一个词) max_token对应词向量为0向量 8 | right(去第一个词+无意义补0) 9 | lstm(left)+词向量矩阵+lstm(right)===上一个词+当前词+下一个词 10 | structure:1)recurrent structure (convolutional layer) 2)max pooling 3) fully connected layer+softmax 11 | Recurrent convolutional neural networks for text classification 12 | http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745 13 | tensoflow版https://github.com/brightmart/text_classification/blob/master/a04_TextRCNN/p71_TextRCNN_model.py 14 | """ 15 | import pandas as pd 16 | import cPickle as pickle 17 | import numpy as np 18 | import gensim 19 | from keras.preprocessing import sequence 20 | from keras import backend 21 | from keras.layers import Dense, Input, Lambda, LSTM, TimeDistributed 22 | from keras.layers.merge import concatenate 23 | from keras.layers.embeddings import Embedding 24 | from keras.models import Model 25 | 26 | ##数据获取 27 | print('Loading data...') 28 | path='./data/nlpmaildatasample2.pkl' 29 | f2 = file(path, 'rb') 30 | d = pickle.load(f2) 31 | f2.close() 32 | # path='./data/nlpmaildatasample2.csv' 33 | # d = pd.read_csv(path,header=None) 34 | # d.columns=['title','lable'] 35 | 36 | all_data=set() 37 | for line in d["title"]: 38 | ws=line.split(" ") 39 | for w in ws: 40 | if w == ' ' or w == '' or w=="\t": 41 | continue 42 | all_data.add(w) 43 | words=list(all_data) 44 | word_to_id = dict(zip(words, range(len(words)))) 45 | dx=[] 46 | for line in d["title"]: 47 | ws=line.split(" ") 48 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 49 | # dy=list(d['lable']) 50 | dy=d['lable'] 51 | 52 | 53 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 54 | 55 | # set parameters: 56 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 57 | max_features = 20000 #字典允许最大大小 58 | batch_size = 32 59 | embedding_dims = 64 #词向量长度 60 | epochs = 2 61 | hidden_dim_1 = 200 62 | hidden_dim_2 = 100 63 | w2vpath="./data/w2c_model" 64 | 65 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 66 | print(len(x_train), 'train sequences') 67 | print(len(x_test), 'test sequences') 68 | 69 | print('Pad sequences (samples x time)') 70 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 71 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 72 | print('x_train shape:', x_train.shape) 73 | print('x_test shape:', x_test.shape) 74 | 75 | 76 | print('Indexing word vectors.') 77 | embeddings_index = {} 78 | model = gensim.models.Word2Vec.load(w2vpath) 79 | for word in words: 80 | embeddings_index[word]=model[word] 81 | print('Found %s word vectors.' % len(embeddings_index)) 82 | 83 | print('Preparing embedding matrix.') 84 | max_token = min(max_features, len(word_to_id)) 85 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 86 | for word, i in word_to_id.items(): 87 | if i > max_features: 88 | continue 89 | embedding_vector = embeddings_index.get(word) 90 | if embedding_vector is not None: 91 | # words not found in embedding index will be all-zeros. 
92 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 93 | 94 | print('Build model...') 95 | document = Input(shape = (None, ), dtype = "int32") 96 | left_context = Input(shape = (None, ), dtype = "int32") 97 | right_context = Input(shape = (None, ), dtype = "int32") 98 | 99 | embedder = Embedding(max_token + 1, embedding_dims, weights = [embedding_matrix], trainable = False) 100 | doc_embedding = embedder(document) 101 | l_embedding = embedder(left_context) 102 | r_embedding = embedder(right_context) 103 | 104 | # I use LSTM RNNs instead of vanilla RNNs as described in the paper. 105 | forward = LSTM(hidden_dim_1, return_sequences = True)(l_embedding) # See equation (1). 106 | backward = LSTM(hidden_dim_1, return_sequences = True, go_backwards = True)(r_embedding) # See equation (2). 107 | together = concatenate([forward, doc_embedding, backward], axis = 2) # See equation (3). 108 | 109 | semantic = TimeDistributed(Dense(hidden_dim_2, activation = "tanh"))(together) # See equation (4). 110 | 111 | # Keras provides its own max-pooling layers, but they cannot handle variable length input 112 | # (as far as I can tell). As a result, I define my own max-pooling layer here. 113 | pool_rnn = Lambda(lambda x: backend.max(x, axis = 1), output_shape = (hidden_dim_2, ))(semantic) # See equation (5). 114 | 115 | output = Dense(1, input_dim = hidden_dim_2, activation = "sigmoid")(pool_rnn) # See equations (6) and (7).NUM_CLASSES=1 116 | 117 | model = Model(inputs = [document, left_context, right_context], outputs = output) 118 | model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"]) 119 | 120 | ##生成左右上下文 121 | print('Build left and right data') 122 | doc_x_train = np.array(x_train) 123 | # We shift the document to the right to obtain the left-side contexts. 124 | left_x_train = np.array([[max_token]+t_one[:-1].tolist() for t_one in x_train]) 125 | # We shift the document to the left to obtain the right-side contexts. 126 | right_x_train = np.array([t_one[1:].tolist()+[max_token] for t_one in x_train]) 127 | 128 | doc_x_test = np.array(x_test) 129 | # We shift the document to the right to obtain the left-side contexts. 130 | left_x_test = np.array([[max_token]+t_one[:-1].tolist() for t_one in x_test]) 131 | # We shift the document to the left to obtain the right-side contexts. 132 | right_x_test = np.array([t_one[1:].tolist()+[max_token] for t_one in x_test]) 133 | 134 | 135 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 136 | # loss = history.history["loss"][0] 137 | model.fit([doc_x_train, left_x_train, right_x_train], y_train, 138 | batch_size=batch_size, 139 | epochs=4, 140 | validation_data=[[doc_x_test, left_x_test, right_x_test], y_test]) 141 | 142 | -------------------------------------------------------------------------------- /py2/07Attention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 单双向lstm 之后加 + Attention HAN模型 5 | paper:Hierarchical Attention Networks for Document Classification 6 | 加入Attention之后最大的好处自然是能够直观的解释各个句子和词对分类类别的重要性 7 | Structure: 8 | 1.embedding 9 | 2.Word Encoder: 词级双向GRU,以获得丰富的词汇表征 10 | 3.Word Attention:词级注意在句子中获取重要信息 11 | 4.Sentence Encoder: 句子级双向GRU,以获得丰富的句子表征 12 | 5.Sentence Attetion: 句级注意以获得句子中的重点句子 13 | 6.FC+Softmax 14 | # HierarchicalAttention: 1.Word Encoder. 2.Word Attention. 3.Sentence Encoder 4.Sentence Attention 5.linear classifier. 
2017-06-13 15 | Attention层是一个MLP+softmax机制 16 | code参考:https://github.com/richliao/textClassifier 17 | https://github.com/philipperemy/keras-attention-mechanism 18 | https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py 19 | https://github.com/codekansas/keras-language-modeling 20 | https://github.com/EdGENetworks/attention-networks-for-classification 21 | https://github.com/brightmart/text_classification/tree/master/a05_HierarchicalAttentionNetwork 22 | 原理解说:https://www.zhihu.com/question/68482809/answer/268320399 23 | """ 24 | from keras.preprocessing import sequence 25 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 26 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 27 | from keras.layers import merge 28 | from keras.models import Model 29 | from keras import backend as K 30 | 31 | import numpy as np 32 | import pandas as pd 33 | import cPickle as pickle 34 | import numpy as np 35 | import gensim 36 | 37 | ##数据获取 38 | print('Loading data...') 39 | path='./data/nlpmaildatasample2.pkl' 40 | f2 = file(path, 'rb') 41 | d = pickle.load(f2) 42 | f2.close() 43 | # path='./data/nlpmaildatasample2.csv' 44 | # d = pd.read_csv(path,header=None) 45 | # d.columns=['title','lable'] 46 | 47 | all_data=set() 48 | for line in d["title"]: 49 | ws=line.split(" ") 50 | for w in ws: 51 | if w == ' ' or w == '' or w=="\t": 52 | continue 53 | all_data.add(w) 54 | words=list(all_data) 55 | word_to_id = dict(zip(words, range(len(words)))) 56 | dx=[] 57 | for line in d["title"]: 58 | ws=line.split(" ") 59 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 60 | # dy=list(d['lable']) 61 | dy=d['lable'] 62 | 63 | 64 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 65 | 66 | # set parameters: 67 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 68 | max_features = 20000 #字典允许最大大小 69 | batch_size = 32 70 | embedding_dims = 64 #词向量长度 71 | epochs = 2 72 | hidden_dim_1 = 200 73 | hidden_dim_2 = 100 74 | w2vpath="./data/w2c_model" 75 | 76 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 77 | print(len(x_train), 'train sequences') 78 | print(len(x_test), 'test sequences') 79 | 80 | print('Pad sequences (samples x time)') 81 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 82 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 83 | print('x_train shape:', x_train.shape) 84 | print('x_test shape:', x_test.shape) 85 | 86 | 87 | print('Indexing word vectors.') 88 | embeddings_index = {} 89 | model = gensim.models.Word2Vec.load(w2vpath) 90 | for word in words: 91 | embeddings_index[word]=model[word] 92 | print('Found %s word vectors.' % len(embeddings_index)) 93 | 94 | print('Preparing embedding matrix.') 95 | max_token = min(max_features, len(word_to_id)) 96 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 97 | for word, i in word_to_id.items(): 98 | if i > max_features: 99 | continue 100 | embedding_vector = embeddings_index.get(word) 101 | if embedding_vector is not None: 102 | # words not found in embedding index will be all-zeros. 
103 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 104 | 105 | 106 | ##句子最多几句 107 | max_sents=1 108 | 109 | embedding_layer = Embedding(max_token + 1, 110 | embedding_dims, 111 | weights=[embedding_matrix], 112 | input_length=maxlen, 113 | trainable=True) 114 | #LSTM步长 115 | TIME_STEPS=maxlen 116 | SINGLE_ATTENTION_VECTOR = False 117 | ##不带别名的自编写Attention 118 | # def attention_3d_block(inputs): 119 | # # inputs.shape = (batch_size, time_steps, input_dim) 120 | # input_dim = int(inputs.shape[2]) 121 | # a = Permute((2, 1))(inputs) 122 | # a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 123 | # a = Dense(TIME_STEPS, activation='softmax')(a) 124 | # if SINGLE_ATTENTION_VECTOR: 125 | # a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 126 | # a = RepeatVector(input_dim)(a) 127 | # a_probs = Permute((2, 1), name='attention_vec')(a) 128 | # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul') 129 | # return output_attention_mul 130 | ##使用多次attention需要新命名 131 | def attention_3d_block2(inputs,new_layer_name): 132 | # inputs.shape = (batch_size, time_steps, input_dim) 133 | input_dim = int(inputs.shape[2]) 134 | a = Permute((2, 1))(inputs) 135 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 136 | a = Dense(TIME_STEPS, activation='softmax')(a) 137 | if SINGLE_ATTENTION_VECTOR: 138 | a = Lambda(lambda x: K.mean(x, axis=1), name=new_layer_name+'_'+'dim_reduction')(a) 139 | a = RepeatVector(input_dim)(a) 140 | a_probs = Permute((2, 1), name=new_layer_name+'_''attention_vec')(a) 141 | output_attention_mul = merge([inputs, a_probs], name=new_layer_name+'_''attention_mul', mode='mul') 142 | return output_attention_mul 143 | 144 | #单向LSTM之后加入Attention 145 | # sentence_input = Input(shape=(maxlen,), dtype='int32') 146 | # embedded_sequences = embedding_layer(sentence_input) 147 | # lstm_out = LSTM(100, return_sequences=True)(embedded_sequences) 148 | # attention_mul = attention_3d_block(lstm_out) 149 | # attention_mul = Flatten()(attention_mul) 150 | # output = Dense(1, activation='sigmoid')(attention_mul) 151 | # model = Model(sentence_input, output) 152 | # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 153 | # model.fit(x_train, y_train, validation_data=(x_test, y_test), 154 | # nb_epoch=epochs, batch_size=batch_size) 155 | 156 | #双向LSTM词encoder 输入是 词标签数组 157 | sentence_input = Input(shape=(maxlen,), dtype='int32') 158 | embedded_sequences = embedding_layer(sentence_input) 159 | forward_rnn = LSTM(100, return_sequences=True) 160 | backward_rnn = LSTM(100, return_sequences=True, go_backwards=True) 161 | lstm_out_f_rnn = forward_rnn(embedded_sequences) 162 | attention_f_mul = attention_3d_block2(lstm_out_f_rnn,"forward") 163 | lstm_out_b_rnn = backward_rnn(embedded_sequences) 164 | attention_b_mul = attention_3d_block2(lstm_out_b_rnn,"backward") 165 | attention_mul=merge([attention_f_mul, attention_b_mul], mode='concat', concat_axis=-1) 166 | attention_mul = Flatten()(attention_mul) 167 | output = Dense(1, activation='sigmoid')(attention_mul) 168 | model = Model(sentence_input, output) 169 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 170 | model.fit(x_train, y_train, validation_data=(x_test, y_test), 171 | nb_epoch=epochs, batch_size=batch_size) 172 | 173 | ####先词Attention再句Attention Hierarchical Attention Networks 
for Document Classification 174 | #词encoder 输入是 词标签数组 未完待续 175 | #句encoder 输入是 句子个数x词标签数组 176 | 177 | -------------------------------------------------------------------------------- /py2/README.md: -------------------------------------------------------------------------------- 1 | ## keras实现深度学习模型 进行文本分类 2 | 3 | > 实验数据采用真实邮件数据,涉及个人隐私,无法公开,可自行寻找数据测试--格式为:文本内容,标签 4 | 5 | > 模型参数未经过合适调整,目前正在实验修改验证模型当中,修改完成会更新项目 6 | 7 | 01mail.py 文本数据生成-输出文本 词典 非一次执行 8 | 9 | 02mail.py 文本词袋向量化/TF-IDF标准化/文本Hash+朴素贝叶斯 10 | 11 | 03fastText.py fastText库训练 12 | 13 | 03fastText_keras.py fastText keras实现 14 | 15 | 04textCNN.py word2vecter做词向量的CNN两种模型 16 | 17 | 05textRNN.py 双向lstm随机初始词向量 18 | 19 | 06textRCNN.py Recurrent Convolutional Neural Networks for Text Classification 20 | 21 | 07Attention.py 双向LSTM+Attention分层注意网络 -HAN模型 22 | 23 | -------------------------------------------------------------------------------- /py2/mymodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 1、自定义模型 Conv-BiGRU 卷积和循环并行 5 | 2、自定义模型 卷积和循环串行 6 | """ 7 | from keras.preprocessing import sequence 8 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 9 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 10 | from keras.layers import merge 11 | from keras.models import Model 12 | from keras import backend as K 13 | from keras.models import Sequential 14 | 15 | import numpy as np 16 | import pandas as pd 17 | import cPickle as pickle 18 | import numpy as np 19 | import gensim 20 | 21 | ##数据获取 22 | print('Loading data...') 23 | path='./data/nlpmaildatasample2.pkl' 24 | f2 = file(path, 'rb') 25 | d = pickle.load(f2) 26 | f2.close() 27 | # path='./data/nlpmaildatasample2.csv' 28 | # d = pd.read_csv(path,header=None) 29 | # d.columns=['title','lable'] 30 | 31 | all_data=set() 32 | for line in d["title"]: 33 | ws=line.split(" ") 34 | for w in ws: 35 | if w == ' ' or w == '' or w=="\t": 36 | continue 37 | all_data.add(w) 38 | words=list(all_data) 39 | word_to_id = dict(zip(words, range(len(words)))) 40 | dx=[] 41 | for line in d["title"]: 42 | ws=line.split(" ") 43 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 44 | # dy=list(d['lable']) 45 | dy=d['lable'] 46 | 47 | 48 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 49 | 50 | # set parameters: 51 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 52 | max_features = 20000 #字典允许最大大小 53 | batch_size = 32 54 | embedding_dims = 64 #词向量长度 55 | epochs = 2 56 | hidden_dim_1 = 200 57 | hidden_dim_2 = 100 58 | w2vpath="./data/w2c_model" 59 | 60 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 61 | print(len(x_train), 'train sequences') 62 | print(len(x_test), 'test sequences') 63 | 64 | print('Pad sequences (samples x time)') 65 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 66 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 67 | print('x_train shape:', x_train.shape) 68 | print('x_test shape:', x_test.shape) 69 | 70 | 71 | print('Indexing word vectors.') 72 | embeddings_index = {} 73 | model = gensim.models.Word2Vec.load(w2vpath) 74 | for word in words: 75 | embeddings_index[word]=model[word] 76 | print('Found %s word vectors.' 
% len(embeddings_index)) 77 | 78 | print('Preparing embedding matrix.') 79 | max_token = min(max_features, len(word_to_id)) 80 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 81 | for word, i in word_to_id.items(): 82 | if i > max_features: 83 | continue 84 | embedding_vector = embeddings_index.get(word) 85 | if embedding_vector is not None: 86 | # words not found in embedding index will be all-zeros. 87 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 88 | 89 | embedding_layer = Embedding(max_token+1,embedding_dims,input_length=maxlen,weights=[embedding_matrix],trainable=False) 90 | 91 | ####并行 92 | model_left = Sequential() 93 | model_left.add(embedding_layer) 94 | model_left.add(Bidirectional(GRU(128))) 95 | 96 | model_right = Sequential() 97 | model_right.add(embedding_layer) 98 | model_right.add(Conv1D(128, 5, activation='relu')) #128卷积核的个数 5卷积核大小 99 | model_right.add(MaxPooling1D())#5 100 | model_right.add(Conv1D(128, 1, activation='relu')) 101 | model_right.add(MaxPooling1D())#5 102 | model_right.add(Flatten()) 103 | 104 | merged = Merge([model_left, model_right], mode='concat') 105 | model = Sequential() 106 | model.add(merged) # add merge 107 | model.add(Dense(128, activation='relu')) # 全连接层 108 | model.add(Dropout(0.2)) 109 | model.add(Dense(1, activation='sigmoid')) # 110 | 111 | model.compile(loss='binary_crossentropy', 112 | optimizer='adam', 113 | metrics=['accuracy']) 114 | model.fit(x_train, y_train, 115 | batch_size=batch_size, 116 | epochs=epochs, 117 | validation_data=(x_test, y_test)) 118 | 119 | ####串行 120 | sentence_input = Input(shape=(maxlen,), dtype='int32') 121 | embedded_sequences = embedding_layer(sentence_input) 122 | conv_1=Conv1D(128, 3, activation='relu')(embedded_sequences) 123 | maxpool_1=MaxPooling1D()(conv_1) 124 | drop_1 = Dropout(0.2)(maxpool_1) 125 | biGRU=Bidirectional(GRU(128))(drop_1) 126 | drop_2 = Dropout(0.5)(biGRU) 127 | dense_1 = Dense(1, activation='sigmoid')(drop_2) 128 | model.compile(loss='binary_crossentropy', 129 | optimizer='adam', 130 | metrics=['accuracy']) 131 | model.fit(x_train, y_train, 132 | batch_size=batch_size, 133 | epochs=epochs, 134 | validation_data=(x_test, y_test)) 135 | -------------------------------------------------------------------------------- /py2/word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | gensim的Word2vector使用 5 | pip install gensim 6 | 输入数据要求是:分词后数据,以空格为单词的分隔符 7 | """ 8 | from gensim.models import Word2Vec 9 | import pandas as pd 10 | import cPickle as pickle 11 | path='./data/nlpmaildata2.pkl' 12 | f2 = file(path, 'rb') 13 | d = pickle.load(f2) 14 | f2.close() 15 | 16 | 17 | modelpath="./data/w2c_model" 18 | sentences=list(d["title"]) 19 | sentences= [s.decode("utf-8").encode('utf-8').split() for s in sentences] 20 | 21 | model = Word2Vec(sentences, sg=1, size=64, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4) 22 | # 1.sg=1是skip-gram算法,对低频词敏感;默认sg=0为CBOW算法。 23 | # 2.size是输出词向量的维数,值太小会导致词映射因为冲突而影响结果,值太大则会耗内存并使算法计算变慢,一般值取为100到200之间。 24 | # 3.window是句子中当前词与目标词之间的最大距离,3表示在目标词前看3-b个词,后面看b个词(b在0-3之间随机)。 25 | # 4.min_count是对词进行过滤,频率小于min-count的单词则会被忽视,默认值为5。 26 | # 5.negative和sample可根据训练结果进行微调,sample表示更高频率的词被随机下采样到所设置的阈值,默认值为1e-3。 27 | # 6.hs=1表示层级softmax将会被使用,默认hs=0且negative不为0,则负采样将会被选择使用。 28 | # 7.workers控制训练的并行,此参数只有在安装了Cpython后才有效,否则只能使用单核。 29 | # model["英文"] 30 | model.save(modelpath) 31 | # model = Word2Vec.load(fname) 
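# --- Editor's sketch (not in the original script): how the py2 model
# --- scripts (e.g. 04textCNN.py) typically turn this saved model into a
# --- Keras embedding matrix. word_to_id, max_features and embedding_dims
# --- are assumed to be defined as in those scripts.
# import numpy as np
# w2v = Word2Vec.load(modelpath)
# embeddings_index = {w: w2v[w] for w in word_to_id}      # min_count=1, so every corpus word has a vector
# max_token = min(max_features, len(word_to_id))
# embedding_matrix = np.zeros((max_token + 1, embedding_dims))
# for word, i in word_to_id.items():
#     if i > max_features:
#         continue
#     vec = embeddings_index.get(word)
#     if vec is not None:
#         embedding_matrix[i] = vec                        # words without a vector stay all-zero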
32 | 33 | #模型使用(词语相似度计算等) 34 | # model.most_similar(positive=['woman', 'king'], negative=['man']) 35 | # #输出[('queen', 0.50882536), ...] 36 | 37 | # model.doesnt_match("breakfast cereal dinner lunch".split()) 38 | # #输出'cereal' 39 | 40 | # model.similarity('woman', 'man') 41 | # #输出0.73723527 42 | 43 | # model['computer'] # raw numpy vector of a word 44 | #输出array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) 45 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | gensim的Word2vector使用 5 | pip install gensim 6 | 输入数据要求是:分词后数据,以空格为单词的分隔符 7 | 原理讲解 8 | https://www.cnblogs.com/f-young/p/7906451.html 9 | """ 10 | from gensim.models import Word2Vec 11 | import pandas as pd 12 | # import cPickle as pickle 13 | # path='./data/nlpmaildata2.pkl' 14 | # f2 = file(path, 'rb') 15 | # d = pickle.load(f2) 16 | # f2.close() 17 | 18 | path='./data/nlpmail_re3.txt' 19 | d = pd.read_csv(path,header=None) 20 | d.columns=['title','lable'] 21 | # sentences= [str(s).split() for s in sentences] 22 | 23 | 24 | modelpath="./data/w2c_model" 25 | sentences=list(d["title"]) 26 | sentences= [str(s).split() for s in sentences] 27 | 28 | model = Word2Vec(sentences, sg=1, size=128, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4) 29 | # 1.sg=1是skip-gram算法,对低频词敏感;默认sg=0为CBOW算法。 30 | # 2.size是输出词向量的维数,值太小会导致词映射因为冲突而影响结果,值太大则会耗内存并使算法计算变慢,一般值取为100到200之间。 31 | # 3.window是句子中当前词与目标词之间的最大距离,3表示在目标词前看3-b个词,后面看b个词(b在0-3之间随机)。 32 | # 4.min_count是对词进行过滤,频率小于min-count的单词则会被忽视,默认值为5。 33 | # 5.negative和sample可根据训练结果进行微调,sample表示更高频率的词被随机下采样到所设置的阈值,默认值为1e-3。 34 | #作者在论文中说到,当样本量比较小的时候,选择5-20个negative words效果会比较好,当样本量比较大的时候,2-5个negative words就能得到很好的效果 35 | # 6.hs=1表示层级softmax将会被使用,默认hs=0且negative不为0,则负采样将会被选择使用。 36 | # 7.workers控制训练的并行,此参数只有在安装了Cpython后才有效,否则只能使用单核。 37 | # model["英文"] 38 | model.save(modelpath) 39 | # model = Word2Vec.load(fname) 40 | 41 | #模型使用(词语相似度计算等) 42 | # model.most_similar(positive=['woman', 'king'], negative=['man']) 43 | # #输出[('queen', 0.50882536), ...] 44 | 45 | # model.doesnt_match("breakfast cereal dinner lunch".split()) 46 | # #输出'cereal' 47 | 48 | # model.similarity('woman', 'man') 49 | # #输出0.73723527 50 | 51 | # model['computer'] # raw numpy vector of a word 52 | #输出array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) 53 | --------------------------------------------------------------------------------
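Editor's note: both word2vec.py scripts above target the older gensim API (direct indexing
with model[word], the size= argument). Under gensim >= 4.0 that argument was renamed to
vector_size and word vectors are accessed through model.wv. A hedged sketch of the
equivalent calls, reusing the sentences/modelpath variables defined in the script above:

from gensim.models import Word2Vec

model = Word2Vec(sentences, sg=1, vector_size=128, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=4)
model.save(modelpath)
vec = model.wv['computer']                                   # replaces model['computer']
sims = model.wv.most_similar(positive=['woman', 'king'], negative=['man'])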