├── .DS_Store ├── FastText.py ├── MyModel.py ├── README.md ├── TextAttention.py ├── TextCNNmodel.py ├── TextRCNNmodel.py ├── TextRNNmodel.py ├── dataPreprocess.py ├── main_control.py ├── py2 ├── 01mail.py ├── 02mail.py ├── 03fastText.py ├── 03fastText_keras.py ├── 04textCNN.py ├── 05textRNN.py ├── 06textRCNN.py ├── 07Attention.py ├── README.md ├── mymodel.py └── word2vec.py └── word2vec.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytforgood/TextClassification/9f2dd4621bb45045eba4af09a20f800ab5074e12/.DS_Store -------------------------------------------------------------------------------- /FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.preprocessing import sequence 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Embedding 7 | from keras.layers import GlobalAveragePooling1D 8 | from keras.callbacks import EarlyStopping 9 | import numpy as np 10 | import logging 11 | import pandas as pd 12 | 13 | # Set parameters: 14 | # ngram_range = 2 will add bi-grams features 15 | # ngram_range = 2 16 | # max_features = len(words) 17 | # maxlen = 30 18 | # batch_size = 32 19 | # embedding_dims = 64 20 | # epochs = 100 21 | # embedding_dims,batch_size,epochs 22 | 23 | def getdata_train(path,ngram_range,maxlen,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 24 | print("fastText n-gram sentence new_maxlen"+str(maxlen)) 25 | ##数据获取 26 | print('Loading data...') 27 | # path = './data/nlpmaildatasample2.csv' 28 | d = pd.read_csv(path, header=None) 29 | d.columns = ['title', 'lable'] 30 | 31 | # drop=True 不生成index列 32 | d = d[-pd.isnull(d["title"])].reset_index(drop=True) 33 | 34 | all_data = set() 35 | for line in d["title"]: 36 | ws = line.split(" ") 37 | for w in ws: 38 | all_data.add(w) 39 | words = list(all_data) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | dx = [] 42 | for line in d["title"]: 43 | ws = line.split(" ") 44 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 45 | # dy=list(d['lable']) 46 | dy = d['lable'] 47 | 48 | def create_ngram_set(input_list, ngram_value=2): 49 | """ 50 | Extract a set of n-grams from a list of integers. 51 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 52 | {(4, 9), (4, 1), (1, 4), (9, 4)} 53 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 54 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 55 | """ 56 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 57 | 58 | def add_ngram(sequences, token_indice, ngram_range=2): 59 | """ 60 | Augment the input list of list (sequences) by appending n-grams values. 
61 | Example: adding bi-gram 62 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 63 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 64 | >>> add_ngram(sequences, token_indice, ngram_range=2) 65 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 66 | Example: adding tri-gram 67 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 68 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 69 | >>> add_ngram(sequences, token_indice, ngram_range=3) 70 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] 71 | """ 72 | new_sequences = [] 73 | for input_list in sequences: 74 | new_list = input_list[:] 75 | for i in range(len(new_list) - ngram_range + 1): 76 | for ngram_value in range(2, ngram_range + 1): 77 | ngram = tuple(new_list[i:i + ngram_value]) 78 | if ngram in token_indice: 79 | new_list.append(token_indice[ngram]) 80 | new_sequences.append(new_list) 81 | 82 | return new_sequences 83 | 84 | print('Loading data...') 85 | inx = int(len(dx) / 5 * 3) 86 | x_train, y_train, x_test, y_test = dx[0:inx], dy[0:inx], dx[inx:len(dx)], dy[inx:len(dx)] 87 | 88 | print(len(x_train), 'train sequences') 89 | print(len(x_test), 'test sequences') 90 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 91 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 92 | 93 | if ngram_range > 1: 94 | print('Adding {}-gram features'.format(ngram_range)) 95 | # Create set of unique n-gram from the training set. 96 | ngram_set = set() 97 | for input_list in x_train: 98 | for i in range(2, ngram_range + 1): 99 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 100 | ngram_set.update(set_of_ngram) 101 | 102 | # Dictionary mapping n-gram token to a unique integer. 103 | # Integer values are greater than max_features in order 104 | # to avoid collision with existing features. 105 | start_index = max_token + 1 106 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 107 | indice_token = {token_indice[k]: k for k in token_indice} 108 | 109 | # max_features is the highest integer that could be found in the dataset. 
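# Illustration of the index offset (hypothetical numbers, not taken from the
# real vocabulary): if max_token were 10000 and ngram_set contained
# {(4, 9), (9, 4)}, the mapping above would give, for example,
#   token_indice == {(4, 9): 10001, (9, 4): 10002}
#   indice_token == {10001: (4, 9), 10002: (9, 4)}
# Every n-gram id therefore starts beyond the unigram ids, so a single
# Embedding layer can hold words and n-grams without collisions, and the
# max_features computed next must cover this extended range.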
110 | max_features = np.max(list(indice_token.keys())) + 1 111 | 112 | # Augmenting x_train and x_test with n-grams features 113 | x_train = add_ngram(x_train, token_indice, ngram_range) 114 | x_test = add_ngram(x_test, token_indice, ngram_range) 115 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 116 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 117 | 118 | print('Pad sequences (samples x time)') 119 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 120 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 121 | print('x_train shape:', x_train.shape) 122 | print('x_test shape:', x_test.shape) 123 | 124 | print('Build model...') 125 | model = Sequential() 126 | 127 | # we start off with an efficient embedding layer which maps 128 | # our vocab indices into embedding_dims dimensions 129 | model.add(Embedding(max_features, 130 | embedding_dims, 131 | input_length=maxlen)) 132 | 133 | # we add a GlobalAveragePooling1D, which will average the embeddings 134 | # of all words in the document 135 | model.add(GlobalAveragePooling1D()) 136 | 137 | # We project onto a single unit output layer, and squash it with a sigmoid: 138 | model.add(Dense(1, activation='sigmoid')) 139 | 140 | model.compile(loss='binary_crossentropy', 141 | optimizer='adam', 142 | metrics=['accuracy']) 143 | 144 | # patience经过几个epoch后loss不在变化停止训练 145 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 146 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 147 | 148 | hist = model.fit(x_train, y_train, 149 | batch_size=batch_size, 150 | epochs=epochs, 151 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 152 | 153 | # print(hist.history) 154 | ##输出loss与acc到日志文件 155 | log_format = "%(asctime)s - %(message)s" 156 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 157 | logging.warning(modelname) 158 | for i in range(len(hist.history["acc"])): 159 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 160 | logging.warning(strlog) 161 | 162 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /MyModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 1、自定义模型 Conv-BiGRU 卷积和循环并行 4 | 2、自定义模型 卷积和循环串行 5 | """ 6 | from keras.layers import Dense, Input, Flatten,concatenate 7 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional 8 | from keras.models import Model 9 | import logging 10 | from keras.callbacks import EarlyStopping 11 | 12 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 13 | sentence = Input(shape=(None,), dtype="int32") 14 | embedding_layer = Embedding(max_token + 1, 15 | embedding_dims, 16 | input_length=maxlen, 17 | weights=[embedding_matrix], 18 | trainable=False) 19 | sentence_embedding = embedding_layer(sentence) 20 | c2 = Conv1D(2, 2, activation='relu')(sentence_embedding) 21 | p2 = MaxPooling1D(27)(c2) 22 | p2 = Flatten()(p2) 23 | 24 | c3 = Conv1D(2, 3, activation='relu')(sentence_embedding) 25 | p3 = MaxPooling1D(26)(c3) 26 | p3 = Flatten()(p3) 27 | 28 | c4 = Conv1D(2, 4, activation='relu')(sentence_embedding) 29 | p4 = 
MaxPooling1D(25)(c4) 30 | p4 = Flatten()(p4) 31 | 32 | g1 = Bidirectional(GRU(128))(sentence_embedding) 33 | 34 | x = concatenate([p2, p3, p4, g1]) 35 | output = Dense(1, activation="sigmoid")(x) 36 | model = Model(inputs=sentence, outputs=output) 37 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 38 | 39 | # patience经过几个epoch后loss不在变化停止训练 40 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 41 | hist = model.fit(x_train, y_train, 42 | batch_size=batch_size, 43 | epochs=epochs, 44 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 45 | ##输出loss与acc到日志文件 46 | log_format = "%(asctime)s - %(message)s" 47 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 48 | logging.warning(modelname) 49 | for i in range(len(hist.history["acc"])): 50 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 51 | logging.warning(strlog) 52 | 53 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## keras实现深度学习模型 进行文本分类 2 | 3 | > 实验数据采用真实邮件数据,涉及个人隐私,无法公开,可自行寻找数据测试--格式为:文本内容,标签 4 | 5 | > 模型参数未经过合适调整,目前正在实验修改验证模型当中,修改完成会更新项目 6 | 7 | 8 | - py2 详见py2目录下说明 9 | - main_control.py 主程序入口 10 | - dataPreprocess.py 数据处理 数据输入为:句子中的词(空格分开),标签 11 | - word2vec.py 训练word2vec模型 12 | - FastText.py fastText keras实现 13 | - TextCNNmodel.py word2vecter做词向量的CNN模型 14 | - TextRNNmodel.py SimpleRNN 双向lstm GRU 15 | - TextRCNNmodel.py Recurrent Convolutional Neural Networks for Text Classification 16 | - TextAttention.py 双向LSTM+Attention分层注意网络 -HAN模型 (与论文有区别) 17 | - MyModel.py 并行卷积和双向GRU 18 | -------------------------------------------------------------------------------- /TextAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 3 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 4 | from keras.layers import merge 5 | from keras.models import Model 6 | from keras import backend as K 7 | from keras.layers.core import Lambda,RepeatVector 8 | import logging 9 | from keras.callbacks import EarlyStopping 10 | 11 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 12 | embedding_layer = Embedding(max_token + 1, 13 | embedding_dims, 14 | weights=[embedding_matrix], 15 | input_length=maxlen, 16 | trainable=True) 17 | # LSTM步长 18 | TIME_STEPS = maxlen 19 | SINGLE_ATTENTION_VECTOR = False 20 | 21 | ##不带别名的自编写Attention 22 | # def attention_3d_block(inputs): 23 | # # inputs.shape = (batch_size, time_steps, input_dim) 24 | # input_dim = int(inputs.shape[2]) 25 | # a = Permute((2, 1))(inputs) 26 | # a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 
27 | # a = Dense(TIME_STEPS, activation='softmax')(a) 28 | # if SINGLE_ATTENTION_VECTOR: 29 | # a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 30 | # a = RepeatVector(input_dim)(a) 31 | # a_probs = Permute((2, 1), name='attention_vec')(a) 32 | # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul') 33 | # return output_attention_mul 34 | ##使用多次attention需要新命名 35 | def attention_3d_block2(inputs, new_layer_name): 36 | # inputs.shape = (batch_size, time_steps, input_dim) 37 | input_dim = int(inputs.shape[2]) 38 | a = Permute((2, 1))(inputs) 39 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 40 | a = Dense(TIME_STEPS, activation='softmax')(a) 41 | if SINGLE_ATTENTION_VECTOR: 42 | a = Lambda(lambda x: K.mean(x, axis=1), name=new_layer_name + '_' + 'dim_reduction')(a) 43 | a = RepeatVector(input_dim)(a) 44 | a_probs = Permute((2, 1), name=new_layer_name + '_''attention_vec')(a) 45 | output_attention_mul = merge([inputs, a_probs], name=new_layer_name + '_''attention_mul', mode='mul') 46 | return output_attention_mul 47 | 48 | # 单向LSTM之后加入Attention 49 | # sentence_input = Input(shape=(maxlen,), dtype='int32') 50 | # embedded_sequences = embedding_layer(sentence_input) 51 | # lstm_out = LSTM(100, return_sequences=True)(embedded_sequences) 52 | # attention_mul = attention_3d_block(lstm_out) 53 | # attention_mul = Flatten()(attention_mul) 54 | # output = Dense(1, activation='sigmoid')(attention_mul) 55 | # model = Model(sentence_input, output) 56 | # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 57 | # model.fit(x_train, y_train, validation_data=(x_test, y_test), 58 | # nb_epoch=epochs, batch_size=batch_size) 59 | 60 | # 双向LSTM词encoder 输入是 词标签数组 61 | sentence_input = Input(shape=(maxlen,), dtype='int32') 62 | embedded_sequences = embedding_layer(sentence_input) 63 | forward_rnn = LSTM(100, return_sequences=True) 64 | backward_rnn = LSTM(100, return_sequences=True, go_backwards=True) 65 | lstm_out_f_rnn = forward_rnn(embedded_sequences) 66 | attention_f_mul = attention_3d_block2(lstm_out_f_rnn, "forward") 67 | lstm_out_b_rnn = backward_rnn(embedded_sequences) 68 | attention_b_mul = attention_3d_block2(lstm_out_b_rnn, "backward") 69 | attention_mul = merge([attention_f_mul, attention_b_mul], mode='concat', concat_axis=-1) 70 | attention_mul = Flatten()(attention_mul) 71 | output = Dense(1, activation='sigmoid')(attention_mul) 72 | model = Model(sentence_input, output) 73 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 74 | 75 | # patience经过几个epoch后loss不在变化停止训练 76 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 77 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 78 | print('Train...') 79 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 80 | # loss = history.history["loss"][0] 81 | hist = model.fit(x_train, y_train, validation_data=(x_test, y_test), 82 | nb_epoch=epochs, batch_size=batch_size, callbacks=[early_stopping]) 83 | # print(hist.history) 84 | ##输出loss与acc到日志文件 85 | log_format = "%(asctime)s - %(message)s" 86 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 87 | logging.warning(modelname) 88 | for i in range(len(hist.history["acc"])): 89 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: 
"+str(hist.history["val_acc"][i]) 90 | logging.warning(strlog) 91 | 92 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /TextCNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras import Input, Model 3 | from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout 4 | 5 | from keras.callbacks import EarlyStopping 6 | import logging 7 | 8 | 9 | def train(x_train, y_train, x_test, y_test, maxlen, max_token, embedding_matrix, embedding_dims, batch_size, epochs, 10 | logpath, modelpath, modelname): 11 | print(modelname + 'Build model...') 12 | sentence = Input((maxlen,)) 13 | embedding_layer = Embedding(max_token + 1, 14 | embedding_dims, 15 | input_length=maxlen, 16 | weights=[embedding_matrix]) 17 | sentence_embedding = embedding_layer(sentence) 18 | c2 = Conv1D(128, 3, activation='relu')(sentence_embedding) 19 | p2 = GlobalMaxPooling1D()(c2) 20 | 21 | c3 = Conv1D(128, 4, activation='relu')(sentence_embedding) 22 | p3 = GlobalMaxPooling1D()(c3) 23 | 24 | c4 = Conv1D(128, 5, activation='relu')(sentence_embedding) 25 | p4 = GlobalMaxPooling1D()(c4) 26 | 27 | x = Concatenate()([p2, p3, p4]) 28 | output = Dense(1, activation="sigmoid")(x) 29 | model = Model(inputs=sentence, outputs=output) 30 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 31 | 32 | # print(model.summary()) 33 | # patience经过几个epoch后loss不在变化停止训练 34 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 35 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 36 | print('Train...') 37 | hist = model.fit(x_train, y_train, 38 | batch_size=batch_size, 39 | epochs=epochs, 40 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 41 | 42 | # print(hist.history) 43 | ##输出loss与acc到日志文件 44 | log_format = "%(asctime)s - %(message)s" 45 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 46 | logging.warning(modelname) 47 | for i in range(len(hist.history["acc"])): 48 | strlog = str(i + 1) + " Epoch " + "-loss: " + str(hist.history["loss"][i]) + " -acc: " + str( 49 | hist.history["acc"][i]) + " -val_loss: " + str(hist.history["val_loss"][i]) + " -val_acc: " + str( 50 | hist.history["val_acc"][i]) 51 | logging.warning(strlog) 52 | 53 | model.save(modelpath + modelname + '.h5') 54 | 55 | 56 | if __name__ == '__main__': 57 | print('11') 58 | -------------------------------------------------------------------------------- /TextRCNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras import backend 3 | from keras.layers import Dense, Input, Lambda, LSTM, TimeDistributed 4 | from keras.layers.merge import concatenate 5 | from keras.layers.embeddings import Embedding 6 | from keras.models import Model 7 | 8 | from keras.callbacks import EarlyStopping 9 | import logging 10 | import numpy as np 11 | 12 | 13 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname,hidden_dim_1,hidden_dim_2): 14 | print(modelname + 'Build model...') 15 | document = Input(shape=(None,), dtype="int32") 16 | left_context = Input(shape=(None,), dtype="int32") 17 | right_context = Input(shape=(None,), dtype="int32") 18 | 19 | embedder = Embedding(max_token + 1, embedding_dims, weights=[embedding_matrix], trainable=False) 
#input_length=maxlen 20 | doc_embedding = embedder(document) 21 | l_embedding = embedder(left_context) 22 | r_embedding = embedder(right_context) 23 | 24 | # I use LSTM RNNs instead of vanilla RNNs as described in the paper. 25 | forward = LSTM(hidden_dim_1, return_sequences=True)(l_embedding) # See equation (1). 26 | backward = LSTM(hidden_dim_1, return_sequences=True, go_backwards=True)(r_embedding) # See equation (2). 27 | together = concatenate([forward, doc_embedding, backward], axis=2) # See equation (3). 28 | 29 | semantic = TimeDistributed(Dense(hidden_dim_2, activation="tanh"))(together) # See equation (4). 30 | 31 | # Keras provides its own max-pooling layers, but they cannot handle variable length input 32 | # (as far as I can tell). As a result, I define my own max-pooling layer here. 33 | pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_dim_2,))(semantic) # See equation (5). 34 | 35 | output = Dense(1, input_dim=hidden_dim_2, activation="sigmoid")(pool_rnn) # See equations (6) and (7).NUM_CLASSES=1 36 | 37 | model = Model(inputs=[document, left_context, right_context], outputs=output) 38 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) 39 | 40 | ##生成左右上下文 41 | print('Build left and right data') 42 | doc_x_train = np.array(x_train) 43 | # We shift the document to the right to obtain the left-side contexts. 44 | left_x_train = np.array([[max_token] + t_one[:-1].tolist() for t_one in x_train]) 45 | # We shift the document to the left to obtain the right-side contexts. 46 | right_x_train = np.array([t_one[1:].tolist() + [max_token] for t_one in x_train]) 47 | 48 | doc_x_test = np.array(x_test) 49 | # We shift the document to the right to obtain the left-side contexts. 50 | left_x_test = np.array([[max_token] + t_one[:-1].tolist() for t_one in x_test]) 51 | # We shift the document to the left to obtain the right-side contexts. 
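# Worked example of the shifting (hypothetical ids; padding id = max_token):
#   document:       [12,  7, 30]
#   left context:   [max_token, 12,  7]   -> shifted right, padded at the front
#   right context:  [ 7, 30, max_token]   -> shifted left,  padded at the end
# so each position sees its own word plus its left and right contexts,
# matching equations (1)-(3) of the RCNN paper referenced above.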
52 | right_x_test = np.array([t_one[1:].tolist() + [max_token] for t_one in x_test]) 53 | 54 | # patience经过几个epoch后loss不在变化停止训练 55 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 56 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 57 | print('Train...') 58 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 59 | # loss = history.history["loss"][0] 60 | hist = model.fit([doc_x_train, left_x_train, right_x_train], y_train, 61 | batch_size=batch_size, 62 | epochs=epochs, 63 | validation_data=[[doc_x_test, left_x_test, right_x_test], y_test], callbacks=[early_stopping]) 64 | 65 | # print(hist.history) 66 | ##输出loss与acc到日志文件 67 | log_format = "%(asctime)s - %(message)s" 68 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 69 | logging.warning(modelname) 70 | for i in range(len(hist.history["acc"])): 71 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 72 | logging.warning(strlog) 73 | 74 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /TextRNNmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional,GRU,SimpleRNN 4 | import logging 5 | from keras.callbacks import EarlyStopping 6 | 7 | def train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 8 | embedding_layer = Embedding(max_token + 1, 9 | embedding_dims, 10 | input_length=maxlen, 11 | weights=[embedding_matrix], 12 | trainable=False) 13 | print(modelname + 'Build model...') 14 | model = Sequential() 15 | model.add(embedding_layer) 16 | model.add(SimpleRNN(128, activation="relu")) 17 | # model.add(LSTM(128)) 18 | # model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 19 | # model.add(Bidirectional(GRU(64))) 20 | model.add(Dropout(0.2)) 21 | model.add(Dense(1, activation='sigmoid')) 22 | # try using different optimizers and different optimizer configs 23 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 24 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 25 | # a stateful LSTM model 26 | # lahead: the input sequence length that the LSTM 27 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 28 | # model = Sequential() 29 | # model.add(LSTM(20,input_shape=(lahead, 1), 30 | # batch_size=batch_size, 31 | # stateful=stateful)) 32 | # model.add(Dense(1)) 33 | # model.compile(loss='mse', optimizer='adam') 34 | 35 | # patience经过几个epoch后loss不在变化停止训练 36 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 37 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 38 | print('Train...') 39 | hist = model.fit(x_train, y_train, 40 | batch_size=batch_size, 41 | epochs=epochs, 42 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 43 | # print(hist.history) 44 | ##输出loss与acc到日志文件 45 | log_format = "%(asctime)s - %(message)s" 46 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 47 | logging.warning(modelname) 48 | for i in range(len(hist.history["acc"])): 49 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: 
"+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 50 | logging.warning(strlog) 51 | 52 | model.save(modelpath + modelname + '.h5') 53 | 54 | def train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 55 | embedding_layer = Embedding(max_token + 1, 56 | embedding_dims, 57 | input_length=maxlen, 58 | weights=[embedding_matrix], 59 | trainable=False) 60 | print(modelname + 'Build model...') 61 | model = Sequential() 62 | model.add(embedding_layer) 63 | # model.add(SimpleRNN(128, activation="relu")) 64 | # model.add(LSTM(128)) 65 | model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 66 | # model.add(Bidirectional(GRU(64))) 67 | model.add(Dropout(0.2)) 68 | model.add(Dense(1, activation='sigmoid')) 69 | # try using different optimizers and different optimizer configs 70 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 71 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 72 | # a stateful LSTM model 73 | # lahead: the input sequence length that the LSTM 74 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 75 | # model = Sequential() 76 | # model.add(LSTM(20,input_shape=(lahead, 1), 77 | # batch_size=batch_size, 78 | # stateful=stateful)) 79 | # model.add(Dense(1)) 80 | # model.compile(loss='mse', optimizer='adam') 81 | 82 | # patience经过几个epoch后loss不在变化停止训练 83 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 84 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 85 | print('Train...') 86 | hist = model.fit(x_train, y_train, 87 | batch_size=batch_size, 88 | epochs=epochs, 89 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 90 | # print(hist.history) 91 | ##输出loss与acc到日志文件 92 | log_format = "%(asctime)s - %(message)s" 93 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 94 | logging.warning(modelname) 95 | for i in range(len(hist.history["acc"])): 96 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 97 | logging.warning(strlog) 98 | 99 | model.save(modelpath + modelname + '.h5') 100 | 101 | def train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,modelname): 102 | embedding_layer = Embedding(max_token + 1, 103 | embedding_dims, 104 | input_length=maxlen, 105 | weights=[embedding_matrix], 106 | trainable=False) 107 | print(modelname+'Build model...') 108 | model = Sequential() 109 | model.add(embedding_layer) 110 | # model.add(SimpleRNN(128, activation="relu")) 111 | # model.add(LSTM(128)) 112 | # model.add(Bidirectional(LSTM(200))) ### 输出维度64 GRU 113 | model.add(Bidirectional(GRU(128))) 114 | model.add(Dropout(0.2)) 115 | model.add(Dense(1, activation='sigmoid')) 116 | # try using different optimizers and different optimizer configs 117 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 118 | # lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 119 | # a stateful LSTM model 120 | # lahead: the input sequence length that the LSTM 121 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 122 | # model = Sequential() 123 | # model.add(LSTM(20,input_shape=(lahead, 1), 124 | # batch_size=batch_size, 125 | # stateful=stateful)) 126 | # 
model.add(Dense(1)) 127 | # model.compile(loss='mse', optimizer='adam') 128 | 129 | # patience经过几个epoch后loss不在变化停止训练 130 | early_stopping = EarlyStopping(monitor='val_loss', patience=2) 131 | # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping]) 132 | print('Train...') 133 | hist = model.fit(x_train, y_train, 134 | batch_size=batch_size, 135 | epochs=epochs, 136 | validation_data=(x_test, y_test), callbacks=[early_stopping]) 137 | # print(hist.history) 138 | ##输出loss与acc到日志文件 139 | log_format = "%(asctime)s - %(message)s" 140 | logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format) 141 | logging.warning(modelname) 142 | for i in range(len(hist.history["acc"])): 143 | strlog=str(i+1)+" Epoch "+"-loss: "+str(hist.history["loss"][i])+" -acc: "+str(hist.history["acc"][i])+" -val_loss: "+str(hist.history["val_loss"][i])+" -val_acc: "+str(hist.history["val_acc"][i]) 144 | logging.warning(strlog) 145 | 146 | model.save(modelpath + modelname + '.h5') -------------------------------------------------------------------------------- /dataPreprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | import gensim 5 | from keras.preprocessing.sequence import pad_sequences 6 | 7 | """ 8 | dataPreprocess 9 | set 10 | path='./data/nlpmail_re3.txt' 11 | batch_size = 32 12 | embedding_dims = 128 #词向量长度 13 | epochs = 100 14 | w2vpath="./data/w2c_model" 15 | hidden_dim_1 = 200 16 | hidden_dim_2 = 100 17 | return x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix, 18 | """ 19 | def getdata(path,embedding_dims,w2vpath): 20 | print('Loading data...') 21 | d = pd.read_csv(path,header=None) 22 | d.columns=['title','lable'] 23 | 24 | #drop=True 不生成index列 25 | d=d[-pd.isnull(d["title"])].reset_index(drop=True) 26 | 27 | all_data=set() 28 | for line in d["title"]: 29 | ws=str(line).split(" ") 30 | for w in ws: 31 | if w == ' ' or w == '' or w=="\t" or w=="??": 32 | continue 33 | all_data.add(w) 34 | words=list(all_data) 35 | 36 | word_to_id = dict(zip(words, range(len(words)))) 37 | dx=[] 38 | for line in d["title"]: 39 | ws=str(line).split(" ") 40 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 41 | # dy=list(d['lable']) 42 | dy=d['lable'] 43 | 44 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 45 | # set parameters: 46 | maxlen=np.max(list(map(len, dx))) #maxlen = 29 最长文本词数 47 | 48 | inx=int(len(dx)/5*3) 49 | x_train, y_train, x_test, y_test = dx[0:inx],dy[0:inx],dx[inx:len(dx)],dy[inx:len(dx)] 50 | 51 | print(len(x_train), 'train sequences') 52 | print(len(x_test), 'test sequences') 53 | 54 | print('Pad sequences (samples x time)') 55 | x_train = pad_sequences(x_train, maxlen=maxlen) 56 | x_test = pad_sequences(x_test, maxlen=maxlen) 57 | print('x_train shape:', x_train.shape) 58 | print('x_test shape:', x_test.shape) 59 | 60 | 61 | print('Indexing word vectors.') 62 | embeddings_index = {} 63 | model = gensim.models.Word2Vec.load(w2vpath) 64 | 65 | #初始化一个0向量 统计未出现词个数 66 | null_word=np.zeros(embedding_dims) 67 | null_word_count=0 68 | 69 | for word in words: 70 | try: 71 | embeddings_index[word]=model[word] 72 | except: 73 | embeddings_index[word]=null_word 74 | null_word_count+=1 75 | print('Found %s word vectors.' % len(embeddings_index)) 76 | print('Found %s null word.' 
% null_word_count) 77 | 78 | print('Preparing embedding matrix.') 79 | max_token = len(word_to_id) 80 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 81 | for word, i in word_to_id.items(): 82 | if i > max_token: 83 | continue 84 | embedding_vector = embeddings_index.get(word) 85 | if embedding_vector is not None: 86 | # words not found in embedding index will be all-zeros. 87 | embedding_matrix[i] = embedding_vector 88 | 89 | return x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix -------------------------------------------------------------------------------- /main_control.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 使用Python脚本控制多个Python脚本运行 4 | 方法一: 5 | 直接Python顺序运行 6 | import os 7 | os.system("D:\ProgramData\Anaconda3\python D:\mysoft\dlspace\FastText3.py") #因为没有环境变量需要制定python路径 mac/linux os.system("python /xx/a.py") 8 | os.system("D:\ProgramData\Anaconda3\python D:\mysoft\dlspace\main_control.py") 9 | 方法二: 10 | 写成函数形式,调用函数 如下 11 | """ 12 | import dataPreprocess 13 | import FastText 14 | import TextCNNmodel 15 | import TextRNNmodel 16 | import TextRCNNmodel 17 | import TextAttention 18 | import MyModel 19 | print("设置参数") 20 | #获取数据参数 21 | # path = './data/nlpmail_re3.txt' 22 | path="./data/nlpmaildatasample2.csv" #数据输入 23 | w2vpath = "./data/w2c_model" #w2v模型地址 24 | embedding_dims = 128 # 词向量长度 25 | logpath='./model/mylog.txt' #日志记录地址 26 | modelpath='./model/' #模型保存目录 27 | #模型训练参数 28 | batch_size = 32 29 | epochs = 100 30 | #fastText参数 31 | ngram_range=2 32 | #TextRCNNmodel参数 33 | hidden_dim_1 = 200 34 | hidden_dim_2 = 100 35 | 36 | 37 | print("获取数据") 38 | x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix=dataPreprocess.getdata(path,embedding_dims,w2vpath) 39 | 40 | print("调用模型") 41 | FastText.getdata_train(path,ngram_range,maxlen+10,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,"FastText") 42 | TextCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextCNN") 43 | TextRNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextSimpleRNN") 44 | TextRNNmodel.train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiLSTM") 45 | TextRNNmodel.train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiGRU") 46 | TextRCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextRCNN",hidden_dim_1,hidden_dim_2) 47 | TextAttention.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextAttention") 48 | MyModel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"MyConBiGRU") 49 | -------------------------------------------------------------------------------- /py2/01mail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | 文本数据生成 输出 文本 词典 5 | 思考: 6 | 去停用词改为使用信息增益、互信息法、L1正则化选择词特征 7 | ''' 8 | import pandas as pd 9 | import re 10 | import jieba 11 | import cPickle as pickle 12 | import sys #这里只是一个对sys的引用,只能reload才能进行重新加载 13 | from 
sklearn.utils import shuffle 14 | 15 | def data2all(): 16 | ##数据生成 17 | f1 = pd.read_csv("./data/data2016_0730_1028.csv",sep=',',header=None,encoding="utf-8") 18 | f2 = pd.read_csv("./data/data2016_0730_1028.csv",sep=',',header=None,encoding="utf-8") 19 | f3 = pd.read_csv("./data/data20161028_20170108.csv",sep=',',header=None,encoding="utf-8") 20 | f =pd.concat([f1,f2,f3]) 21 | f.columns = ['accept','title','send','accept','time','label','day'] 22 | 23 | all_data=f[["title","label"]] 24 | 25 | x=all_data.groupby(["label"])["title"].count() 26 | # Index([u'个人文件夹(个人过滤器)', u'垃圾箱(系统判断)', u'已退信', u'投递中', u'投递成功', u'收件箱', u'自动转发',u'被拦截(个人过滤器)', u'被拦截(用户黑名单)', u'被拦截(系统拦截)'], 27 | d1=all_data[(all_data["label"]==x.index[1])].reset_index() #垃圾 28 | d2=all_data[(all_data["label"]==x.index[4])].reset_index() #投递成功 29 | d3=all_data[(all_data["label"]==x.index[5])].reset_index() #收件箱 30 | 31 | d=pd.concat([d1,d2,d3]) 32 | d=d[["title","label"]] 33 | d.to_csv("./data/nlpmail.csv",header=False,index=False,encoding="utf-8") 34 | 35 | #合并数据 36 | data2all() 37 | 38 | stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 39 | reload(sys) #通过import引用进来时,setdefaultencoding函数在被系统调用后被删除了,所以必须reload一次 40 | sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 41 | sys.setdefaultencoding('utf-8') 42 | 43 | ##读取文件 44 | d = pd.read_csv("./data/nlpmail.csv",sep=',',header=None,encoding="utf-8") 45 | d.columns = ['title','lable'] 46 | ##类别编码 47 | def label2num(x): 48 | l=0 49 | if(x==u"垃圾箱(系统判断)"): 50 | l=1 51 | return l 52 | d["lable2"]=[label2num(x) for x in d["lable"]] 53 | d["index"]=range(d.shape[0]) 54 | 55 | 56 | ##去除标点符号 57 | def remove_punctuation(line): 58 | #中文标点 !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏. 59 | #英文标点 !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ 60 | try: 61 | line = re.sub("[!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+".decode("utf-8"), "",line.decode("utf-8")) 62 | except Exception as e: 63 | print "error" 64 | return line 65 | 66 | ##结巴分词 67 | def cutline(line): 68 | line=str(line) #防止只有数字的识别为float 纯数字转换成 数字 一词 69 | words = jieba.cut(line, cut_all=False) 70 | re=" ".join(words) 71 | return re 72 | 73 | #创建字典 词级别 74 | def createVocabList(dataSet): 75 | all_data=[] 76 | for line in dataSet: 77 | for words in line.split(" "): 78 | all_data.append(words) 79 | all_data=set(all_data) 80 | return all_data 81 | 82 | #去除空值 83 | d=d.dropna() 84 | #去标点 85 | d["title"]=[remove_punctuation(x) for x in d["title"]] 86 | d=d[["index","title","label2"]] 87 | 88 | #替换特殊空格 89 | def replaySspace(line): 90 | line=line.replace('\xc2\xa0', '') 91 | return line 92 | 93 | 94 | d["title"]=[replaySspace(x) for x in d["title"]] 95 | 96 | ##去掉全英文和字母 97 | def rematch(line): 98 | if re.match('^[A-Za-z0-9]+$',line): 99 | line="q100" 100 | return line 101 | 102 | d["title"]=[rematch(x) for x in d["title"]] 103 | 104 | d=d[d["title"]!="q100"] 105 | 106 | #分词 107 | d["title"]=[cutline(x) for x in d["title"]] 108 | 109 | ##保存文本 110 | # path='./data/nlpmaildata.pkl' 111 | # output = file(path, 'wb') 112 | # pickle.dump(d, output, True) 113 | # output.close() 114 | # ##保存字典 115 | # vocab_dir=createVocabList(d["title"]) 116 | # vocab_dir=list(vocab_dir) 117 | # path='./data/vocab_dir.pkl' 118 | # output = file(path, 'wb') 119 | # pickle.dump(vocab_dir, output, True) 120 | # output.close() 121 | 122 | 123 | #数据清洗 替换英文和字母 选取文本长度>4的文本 124 | def replayxx(line): 125 | words=line.split(" ") 126 | newwords=[] 127 | for w in words: 128 | if w.encode( 'UTF-8' 
).isdigit(): 129 | w="数字" 130 | if w.encode( 'UTF-8' ).isalpha(): 131 | w="英文" 132 | if re.match('^[A-Za-z0-9]+$',w): 133 | w="数字英文" 134 | newwords.append(w) 135 | res=" ".join(newwords) 136 | return res 137 | d["title"]=[replayxx(x) for x in d["title"]] 138 | 139 | d=d[["title","lable2"]].reset_index(drop = True) 140 | d.columns=['title','lable'] 141 | 142 | # d["title"]=[x.encode("utf-8") for x in d["title"]] 143 | 144 | path='./data/nlpmaildata2.pkl' 145 | output = file(path, 'wb') 146 | pickle.dump(d, output, True) 147 | output.close() 148 | 149 | d = shuffle(d) 150 | d.to_csv("./data/nlpmail_re.csv",header=False,index=False,encoding="utf_8_sig") #(452526, 2) 151 | #切分数据集 152 | df1=d[(d["lable"]==1)].sample(frac=0.2) 153 | df2=d[(d["lable"]==0)].sample(frac=0.2) 154 | d=pd.concat([df1,df2]) 155 | from sklearn.utils import shuffle 156 | d = shuffle(d) 157 | d =d.sample(frac=0.2) 158 | path='./data/nlpmaildatasample2.pkl' 159 | output = file(path, 'wb') 160 | pickle.dump(d, output, True) 161 | output.close() 162 | 163 | d.to_csv("./data/nlpmaildatasample2.csv",header=False,index=False,encoding="utf_8_sig") 164 | -------------------------------------------------------------------------------- /py2/02mail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | 文本向量化 词袋方法 TF-IDF 文本Hash 朴素贝叶斯 5 | ''' 6 | import pandas as pd 7 | import re 8 | import jieba 9 | import cPickle as pickle 10 | import numpy as np 11 | 12 | ##读取文件 13 | # path='./data/nlpmaildata2.pkl' 14 | # path='./data/nlpmaildatasample2.pkl' 15 | # f2 = file(path, 'rb') 16 | # d = pickle.load(f2) 17 | # f2.close() 18 | path='./data/nlpmaildatasample2.csv' 19 | d = pd.read_csv(path,header=None) 20 | d.columns=['title','lable'] 21 | #打乱数据 22 | # from sklearn.utils import shuffle 23 | # d = shuffle(d) 24 | #获取停用词表 25 | def get_stopwords(path): 26 | f= open(path) 27 | stopwords=[] 28 | for line in f: 29 | stopwords.append(line.strip().decode("utf-8")) 30 | return stopwords 31 | #停用词导入 32 | stopwords=get_stopwords("./data/stopwords.txt") 33 | #获取训练标签 34 | dy=list(d["lable"]) 35 | ############################################################################################ 36 | ##方法1.1 自定义词袋方法 37 | ##词袋模型 38 | # def bagOfWords2VecMN(vocabList, inputSet): 39 | # returnVec = [0]*len(vocabList) 40 | # for word in inputSet: 41 | # if word in vocabList: 42 | # returnVec[vocabList.index(word)] += 1 43 | # return returnVec 44 | # path='./data/vocab_dir.pkl' 45 | # f2 = file(path, 'rb') 46 | # vocab_dir = pickle.load(f2) 47 | # f2.close() 48 | # #转换成list词袋 字典维度太大 会执行失败!!!行数*字典维度/1024/1024/1024=需要多少G内存 49 | # train=[] 50 | # label=list(d["label2"]) 51 | # for i in range(len(d["title"])): 52 | # if(i%10000 ==0): 53 | # print float(i)/float(len(d["title"])) 54 | # t=d["title"][i] 55 | # words=t.split(" ") 56 | # vec=bagOfWords2VecMN(vocab_dir,words) 57 | # train.append(vec) 58 | ############################################################################################# 59 | ##方法1.2 词袋向量化之sklearn 60 | #词袋向量化 61 | from sklearn.feature_extraction.text import CountVectorizer 62 | vectorizer=CountVectorizer(stop_words=stopwords) 63 | #输入是带空格的分词后list 64 | # d_x=vectorizer.fit_transform(d["title"]).toarray() #训练并转换 65 | vectorizer.fit(d["title"]) 66 | dx=vectorizer.transform(d["title"]).toarray() 67 | #返回满足条件的索引所在位置 68 | # print np.where(d_x[0]>0) 69 | #对应字典获取 70 | vocab_dir=vectorizer.get_feature_names() 71 | 
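## Quick sanity check (a sketch, commented out; assumes dx and vocab_dir built above)
# sample = dx[0]                       # bag-of-words counts of the first title
# for idx in np.where(sample > 0)[0]:  # column indices of words present in it
#     print vocab_dir[idx], sample[idx]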
############################################################################################# 72 | ##方法1.3 词袋向量化之sklearn,TF-IDF和标准化 73 | # from sklearn.feature_extraction.text import TfidfVectorizer 74 | # vector = TfidfVectorizer(stop_words=stopwords) 75 | # vector.fit(d["title"]) 76 | # dx=vector.transform(d["title"]).toarray() 77 | # vocab_dir = vector.get_feature_names()#获取词袋模型中的所有词 78 | ############################################################################################ 79 | ##方法2 文本Hash Trick 用哈希技巧矢量化大文本语料库 80 | ##原理 hash(文本1)=位置5 hash(文本2)=位置5 位置5的值=1+1or新的哈希函数 81 | # from sklearn.feature_extraction.text import HashingVectorizer 82 | # vectorizer2=HashingVectorizer(n_features = 1000,norm = None,stop_words=stopwords) 83 | # vectorizer2.fit(d["title"]) 84 | # dx=vectorizer2.transform(d["title"]).toarray() 85 | ############################################################################################# 86 | ##朴素贝叶斯按比例验证 87 | # from sklearn.naive_bayes import MultinomialNB 88 | # from sklearn.model_selection import cross_val_score 89 | # from sklearn.model_selection import StratifiedKFold 90 | # clf = MultinomialNB() 91 | # ##修改cv分折方法 92 | # skf = StratifiedKFold(n_splits=5) 93 | # ##修改score 94 | # scores = cross_val_score(clf, dx, dy, cv=skf, scoring='accuracy') 95 | # scores2 = cross_val_score(clf, dx, dy, cv=skf, scoring='f1') 96 | # #评分估计的平均得分和 95% 置信区间由此给出 97 | # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 98 | # print("f1: %0.2f (+/- %0.2f)" % (scores2.mean(), scores.std() * 2)) 99 | ############################################################################################# 100 | ##按比例切分训练集 101 | from sklearn.naive_bayes import MultinomialNB 102 | from sklearn.naive_bayes import BernoulliNB 103 | from sklearn.model_selection import train_test_split 104 | from sklearn.metrics import accuracy_score 105 | from sklearn.metrics import f1_score 106 | 107 | X_train, X_test, y_train, y_test = train_test_split(dx, dy, test_size=0.2, random_state=0) 108 | # clf = MultinomialNB() 109 | clf = BernoulliNB() 110 | clf.fit(X_train, y_train) 111 | y_pred=clf.predict(X_test) 112 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 113 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 114 | 115 | ##一致性对比 116 | dtrain=d[0:d.shape[0]/5*3] 117 | dtest=d[d.shape[0]/5*3:d.shape[0]] 118 | X_train, X_test, y_train, y_test=vectorizer.transform(dtrain["title"]).toarray(),vectorizer.transform(dtest["title"]).toarray(),list(dtrain["lable"]),list(dtest["lable"]) 119 | clf = BernoulliNB() 120 | clf.fit(X_train, y_train) 121 | y_pred=clf.predict(X_test) 122 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 123 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 124 | # Accuracy: 0.73 125 | # F1: 0.66 126 | # #评价标准 127 | # from sklearn import metrics 128 | # print "Accuracy : %.2f" % metrics.accuracy_score(label, pre_reduce) 129 | # print "recall : %.2f" % metrics.recall_score(label, pre_reduce) 130 | # print "F1 : %.2f" % metrics.f1_score(label, pre_reduce) 131 | -------------------------------------------------------------------------------- /py2/03fastText.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之fastText 5 | 方法一:自己编写 6 | 方法二:Facebook开源工具https://github.com/facebookresearch/fastText#text-classification 7 | paper:https://arxiv.org/pdf/1607.01759.pdf 8 | fastText的核心思想就是:将整篇文档的词及n-gram向量叠加平均得到文档向量,然后使用文档向量做softmax多分类 9 | 字符级n-gram特征的引入以及分层Softmax分类 10 | 
参考: 11 | http://blog.csdn.net/sinat_26917383/article/details/54850933 12 | http://www.52nlp.cn/category/text-classification 13 | """ 14 | #方法二 fastText对词向量生成考虑到上下文 基于Hierarchical(分层) Softmax 15 | # 输入格式 词(空格分开)_lable_标签 eg:英媒 称 威 __label__affairs 16 | import pandas as pd 17 | import re 18 | import jieba 19 | import cPickle as pickle 20 | import numpy as np 21 | 22 | ##读取文件 23 | path='./data/nlpmaildatasample2.csv' 24 | d = pd.read_csv(path,header=None) 25 | d.columns=['title','lable'] 26 | 27 | dtrain=d[0:d.shape[0]/5*3] 28 | dtest=d[d.shape[0]/5*3:d.shape[0]] 29 | 30 | #生成训练文件 31 | def w2file(data,filename): 32 | f = open(filename,"w") 33 | for i in range(data.shape[0]): 34 | outline = d['title'][i] + "\t__label__" + str(d['lable'][i]) + "\n" 35 | f.write(outline) 36 | f.close() 37 | 38 | w2file(dtrain,"./data/fasttext_train.txt") 39 | w2file(dtest,"./data/fasttext_test.txt") 40 | 41 | import logging 42 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 43 | import fastText 44 | #训练模型 45 | classifier = fastText.FastText.train_supervised("./data/fasttext_train.txt",lr=0.1, dim=100,wordNgrams=1,label=u"__label__") 46 | #参数 47 | # train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss=u'softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label=u'__label__', verbose=2, pretrainedVectors=u'') 48 | # input_file training file path (required) 49 | # output output file path (required) 50 | # lr learning rate [0.05] 51 | # lr_update_rate change the rate of updates for the learning rate [100] 52 | # dim size of word vectors [100] 53 | # ws size of the context window [5] 54 | # epoch number of epochs [5] 55 | # min_count minimal number of word occurences [5] 56 | # neg number of negatives sampled [5] 57 | # word_ngrams max length of word ngram [1] 58 | # loss loss function {ns, hs, softmax} [ns] 59 | # bucket number of buckets [2000000] 60 | # minn min length of char ngram [3] 61 | # maxn max length of char ngram [6] 62 | # thread number of threads [12] 63 | # t sampling threshold [0.0001] 64 | # silent disable the log output from the C++ extension [1] 65 | # encoding specify input_file encoding [utf-8] 66 | ((u'__label__0',), array([ 0.77616984])) 67 | #测试模型 help(classifier) 68 | result = classifier.test("./data/fasttext_test.txt") 69 | print result 70 | texts=[str(t).decode("utf-8") for t in dtest["title"]] #预测与输入编码必须一致 71 | ##predict输出格式((u'__label__0',), array([ 0.77616984])) 72 | y_pred = [int(e[0].replace("__label__","")) for e in classifier.predict(texts)[0]] #预测输出结果为元组 73 | y_test=list(dtest["lable"]) 74 | from sklearn.metrics import accuracy_score 75 | from sklearn.metrics import f1_score 76 | print("Accuracy: %0.2f" % accuracy_score(y_test, y_pred)) 77 | print("F1: %0.2f" % f1_score(y_test, y_pred)) 78 | # Accuracy: 0.73 79 | # F1: 0.65 80 | -------------------------------------------------------------------------------- /py2/03fastText_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之fastText 5 | 方法一:自己编写 keras官方版本 6 | """ 7 | #建立词典 8 | from __future__ import print_function 9 | from keras.preprocessing import sequence 10 | from keras.models import Sequential 11 | from keras.layers import Dense 12 | from keras.layers import Embedding 13 | from keras.layers import GlobalAveragePooling1D 14 | import pandas as pd 15 | import cPickle as pickle 16 | import numpy 
as np 17 | 18 | ##读取文件 19 | path='./data/nlpmaildatasample2.csv' 20 | d = pd.read_csv(path,header=None) 21 | d.columns=['title','lable'] 22 | 23 | all_data=set() 24 | for line in d["title"]: 25 | ws=line.split(" ") 26 | for w in ws: 27 | all_data.add(w) 28 | words=list(all_data) 29 | word_to_id = dict(zip(words, range(len(words)))) 30 | dx=[] 31 | for line in d["title"]: 32 | ws=line.split(" ") 33 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 34 | dy=list(d['lable']) 35 | # dy=d['lable'] 36 | 37 | def create_ngram_set(input_list, ngram_value=2): 38 | """ 39 | Extract a set of n-grams from a list of integers. 40 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 41 | {(4, 9), (4, 1), (1, 4), (9, 4)} 42 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 43 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 44 | """ 45 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 46 | 47 | 48 | def add_ngram(sequences, token_indice, ngram_range=2): 49 | """ 50 | Augment the input list of list (sequences) by appending n-grams values. 51 | Example: adding bi-gram 52 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 53 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 54 | >>> add_ngram(sequences, token_indice, ngram_range=2) 55 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 56 | Example: adding tri-gram 57 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 58 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 59 | >>> add_ngram(sequences, token_indice, ngram_range=3) 60 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] 61 | """ 62 | new_sequences = [] 63 | for input_list in sequences: 64 | new_list = input_list[:] 65 | for i in range(len(new_list) - ngram_range + 1): 66 | for ngram_value in range(2, ngram_range + 1): 67 | ngram = tuple(new_list[i:i + ngram_value]) 68 | if ngram in token_indice: 69 | new_list.append(token_indice[ngram]) 70 | new_sequences.append(new_list) 71 | 72 | return new_sequences 73 | 74 | # Set parameters: 75 | # ngram_range = 2 will add bi-grams features 76 | ngram_range = 1 77 | max_features = len(words) 78 | maxlen = 400 79 | batch_size = 32 80 | embedding_dims = 50 81 | epochs = 5 82 | 83 | print('Loading data...') 84 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 85 | 86 | print(len(x_train), 'train sequences') 87 | print(len(x_test), 'test sequences') 88 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 89 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 90 | 91 | if ngram_range > 1: 92 | print('Adding {}-gram features'.format(ngram_range)) 93 | # Create set of unique n-gram from the training set. 94 | ngram_set = set() 95 | for input_list in x_train: 96 | for i in range(2, ngram_range + 1): 97 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 98 | ngram_set.update(set_of_ngram) 99 | 100 | # Dictionary mapping n-gram token to a unique integer. 101 | # Integer values are greater than max_features in order 102 | # to avoid collision with existing features. 103 | start_index = max_features + 1 104 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 105 | indice_token = {token_indice[k]: k for k in token_indice} 106 | 107 | # max_features is the highest integer that could be found in the dataset. 
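# Note: ngram_range is set to 1 above, so this whole "if ngram_range > 1"
# branch is skipped and the model trains on unigrams only; set ngram_range = 2
# to actually add the bi-gram features described in the parameter comment.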
108 | max_features = np.max(list(indice_token.keys())) + 1 109 | 110 | # Augmenting x_train and x_test with n-grams features 111 | x_train = add_ngram(x_train, token_indice, ngram_range) 112 | x_test = add_ngram(x_test, token_indice, ngram_range) 113 | print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) 114 | print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) 115 | 116 | print('Pad sequences (samples x time)') 117 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 118 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 119 | print('x_train shape:', x_train.shape) 120 | print('x_test shape:', x_test.shape) 121 | 122 | print('Build model...') 123 | model = Sequential() 124 | 125 | # we start off with an efficient embedding layer which maps 126 | # our vocab indices into embedding_dims dimensions 127 | model.add(Embedding(max_features, 128 | embedding_dims, 129 | input_length=maxlen)) 130 | 131 | # we add a GlobalAveragePooling1D, which will average the embeddings 132 | # of all words in the document 133 | model.add(GlobalAveragePooling1D()) 134 | 135 | # We project onto a single unit output layer, and squash it with a sigmoid: 136 | model.add(Dense(1, activation='sigmoid')) 137 | 138 | model.compile(loss='binary_crossentropy', 139 | optimizer='adam', 140 | metrics=['accuracy']) 141 | 142 | model.fit(x_train, y_train, 143 | batch_size=batch_size, 144 | epochs=epochs, 145 | validation_data=(x_test, y_test)) 146 | -------------------------------------------------------------------------------- /py2/04textCNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 文本分类之textCNN 疑问MaxPooling1D?使用计算?区别和GlobalMaxPooling1D 5 | 论文:Convolutional Neural Networks for Sentence Classification 6 | 论文解读:http://www.jeyzhang.com/cnn-apply-on-modelling-sentence.html 7 | 输入层:词个数x词向量维数---矩阵的类型可以是静态的(static)word vector是固定不变,动态的(non static)word vector也当做是可优化的参数这一过程称为 Fine tune 8 | 卷积层:若干个Feature Map--不同大小滤波器 卷积核大小为nxk k是词向量维度 1D默认宽度为词向量维度 9 | 池化层:Max-over-time Pooling--输出为各个Feature Map的最大值们,即一个一维的向量 10 | 全连接 + Softmax层:池化层的一维向量的输出通过全连接的方式,连接一个Softmax层 11 | Dropout:倒数第二层的全连接部分,L2正则化,减轻过拟合 12 | 词向量变种: 13 | CNN-rand:对不同单词的向量作随机初始化,BP的时候作调整 Embedding层选择随机初始化方法 14 | static:拿word2vec, FastText or GloVe训练好的词向量 15 | non-static:拿word2vec, FastText or GloVe训练好的词向量,训练过程中再对它们微调Fine tuned(自己理解:先用其他大文本语料训练w2v再用本文本训练w2v) 16 | multiple channel :类比于图像中的RGB通道, 这里也可以用 static 与 non-static 搭两个通道来搞 17 | 结论: 18 | CNN-static较与CNN-rand好,说明pre-training的word vector确实有较大的提升作用(这也难怪,因为pre-training的word vector显然利用了更大规模的文本数据信息); 19 | CNN-non-static较于CNN-static大部分要好,说明适当的Fine tune也是有利的,是因为使得vectors更加贴近于具体的任务; 20 | CNN-multichannel较于CNN-single在小规模的数据集上有更好的表现,实际上CNN-multichannel体现了一种折中思想,即既不希望Fine tuned的vector距离原始值太远,但同时保留其一定的变化空间 21 | github:https://github.com/yoonkim/CNN_sentence 22 | code参考 23 | http://blog.csdn.net/diye2008/article/details/53105652?locationNum=11&fps=1 24 | glove embedding参考http://blog.csdn.net/sscssz/article/details/53333225 25 | """ 26 | from __future__ import print_function 27 | 28 | from keras.preprocessing.sequence import pad_sequences 29 | from keras.layers import Dense, Input, Flatten,GlobalMaxPooling1D 30 | from keras.layers import Conv1D, MaxPooling1D, Embedding,Dropout 31 | from keras.models import Model 32 | from keras.optimizers import * 33 | from keras.models import Sequential 34 | from keras.layers import merge 35 | import pandas 
as pd 36 | import cPickle as pickle 37 | import numpy as np 38 | import gensim 39 | 40 | ##数据获取 41 | print('Loading data...') 42 | path='./data/nlpmaildatasample2.csv' 43 | d = pd.read_csv(path,header=None) 44 | d.columns=['title','lable'] 45 | 46 | all_data=set() 47 | for line in d["title"]: 48 | ws=line.split(" ") 49 | for w in ws: 50 | if w == ' ' or w == '' or w=="\t": 51 | continue 52 | all_data.add(w) 53 | words=list(all_data) 54 | word_to_id = dict(zip(words, range(len(words)))) 55 | dx=[] 56 | for line in d["title"]: 57 | ws=line.split(" ") 58 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 59 | # dy=list(d['lable']) 60 | dy=d['lable'] 61 | 62 | 63 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 64 | 65 | # set parameters: 66 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 67 | max_features = 20000 #字典允许最大大小 68 | batch_size = 32 69 | embedding_dims = 64 #词向量长度 70 | epochs = 2 71 | w2vpath="./data/w2c_model" 72 | 73 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 74 | print(len(x_train), 'train sequences') 75 | print(len(x_test), 'test sequences') 76 | 77 | print('Pad sequences (samples x time)') 78 | x_train = pad_sequences(x_train, maxlen=maxlen) 79 | x_test = pad_sequences(x_test, maxlen=maxlen) 80 | print('x_train shape:', x_train.shape) 81 | print('x_test shape:', x_test.shape) 82 | 83 | 84 | print('Indexing word vectors.') 85 | embeddings_index = {} 86 | model = gensim.models.Word2Vec.load(w2vpath) 87 | for word in words: 88 | embeddings_index[word]=model[word] 89 | print('Found %s word vectors.' % len(embeddings_index)) 90 | 91 | print('Preparing embedding matrix.') 92 | nb_words = min(max_features, len(word_to_id)) 93 | embedding_matrix = np.zeros((nb_words + 1, embedding_dims)) 94 | for word, i in word_to_id.items(): 95 | if i > max_features: 96 | continue 97 | embedding_vector = embeddings_index.get(word) 98 | if embedding_vector is not None: 99 | # words not found in embedding index will be all-zeros. 
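# (Also note: word ids start at 0 in word_to_id while pad_sequences pads with 0
#  as well, so the padding index shares row 0 with one real word; reserving
#  index 0 for padding would avoid that overlap.)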
100 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(nb_words) 101 | 102 | 103 | # 神经网路的第一层,词向量层,本文使用了预训练word2vec词向量,可以把trainable那里设为False 104 | embedding_layer = Embedding(nb_words+1, 105 | embedding_dims, 106 | input_length=maxlen, 107 | weights=[embedding_matrix], 108 | trainable=False) 109 | print('Build model...') 110 | ##最简单cnn 111 | # model = Sequential() 112 | # model.add(Embedding(nb_words + 1, 113 | # embedding_dims, 114 | # input_length=maxlen)) 115 | # model.add(Dropout(0.2)) 116 | # model.add(Conv1D(250,#filters 117 | # 3,#kernel_size 118 | # padding='valid', 119 | # activation='relu', 120 | # strides=1)) 121 | # model.add(GlobalMaxPooling1D()) 122 | # model.add(Dense(250))#hidden layer: 123 | # model.add(Dropout(0.2)) 124 | # model.add(Activation('relu')) 125 | # model.add(Dense(1)) 126 | # model.add(Activation('sigmoid')) 127 | # model.compile(loss='binary_crossentropy', 128 | # optimizer='adam', 129 | # metrics=['accuracy']) 130 | # model.fit(x_train, y_train, 131 | # batch_size=batch_size, 132 | # epochs=epochs, 133 | # validation_data=(x_test, y_test)) 134 | 135 | ###3层合并model 经过词向量表达的文本为一维数据,因此在TextCNN卷积用的是一维卷积 136 | #left model 137 | model_left = Sequential() 138 | #https://keras.io/layers/embeddings/ 139 | # model.add(Embedding(max_features,embedding_dims,input_length=maxlen)) 140 | model_left.add(embedding_layer) 141 | model_left.add(Conv1D(128, 5, activation='relu')) #128输出的维度 5卷积核大小 142 | model_left.add(MaxPooling1D())#5 143 | model_left.add(Conv1D(128, 5, activation='relu')) 144 | model_left.add(MaxPooling1D())#5 145 | model_left.add(Conv1D(128, 5, activation='relu')) 146 | model_left.add(MaxPooling1D()) #35 #model_left.add(GlobalMaxPooling1D()) 147 | model_left.add(Flatten()) 148 | 149 | model_right = Sequential() 150 | model_right.add(embedding_layer) 151 | model_right.add(Conv1D(128, 4, activation='relu')) 152 | model_right.add(MaxPooling1D())#4 153 | model_right.add(Conv1D(128, 4, activation='relu')) 154 | model_right.add(MaxPooling1D())#4 155 | model_right.add(Conv1D(128, 4, activation='relu')) 156 | model_right.add(MaxPooling1D())#28 157 | model_right.add(Flatten()) 158 | 159 | model_3 = Sequential() 160 | model_3.add(embedding_layer) 161 | model_3.add(Conv1D(128, 6, activation='relu')) 162 | model_3.add(MaxPooling1D())#3 163 | model_3.add(Conv1D(128, 6, activation='relu')) 164 | model_3.add(MaxPooling1D())#3 165 | model_3.add(Conv1D(128, 6, activation='relu')) 166 | model_3.add(MaxPooling1D())#30 167 | model_3.add(Flatten()) 168 | 169 | merged = Merge([model_left, model_right,model_3], mode='concat') # 将三种不同卷积窗口的卷积层组合 连接在一起,当然也可以只是用三个model中的一个,一样可以得到不错的效果,只是本文采用论文中的结构设计 170 | model = Sequential() 171 | model.add(merged) # add merge 172 | model.add(Dense(128, activation='relu')) # 全连接层 173 | model.add(Dropout(0.2)) 174 | model.add(Dense(1, activation='sigmoid')) # softmax对应多分类 需要修改loss,输出文本属于类别中每个类别的概率 175 | 176 | model.compile(loss='binary_crossentropy', 177 | optimizer='adam', 178 | metrics=['accuracy']) 179 | model.fit(x_train, y_train, 180 | batch_size=batch_size, 181 | epochs=epochs, 182 | validation_data=(x_test, y_test)) 183 | 184 | score = model.evaluate(x_train, y_train, verbose=0) # 评估模型在训练集中的效果,准确率约99% 185 | print('train score:', score[0]) 186 | print('train accuracy:', score[1]) 187 | score = model.evaluate(x_test, y_test, verbose=0) # 评估模型在测试集中的效果,准确率约为97%,迭代次数多了,会进一步提升 188 | print('Test score:', score[0]) 189 | print('Test accuracy:', score[1]) 190 | 
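# ------------------------------------------------------------------
# Editor's note (not part of the original script): the 3-branch combination
# above is written against the Keras 1 API. Merge([...], mode='concat') was
# removed in Keras 2, and this file only imports the lowercase merge
# function, so the capitalised Merge call raises a NameError as written.
# Below is a minimal, hedged sketch of the same idea using the Keras 2
# functional API; it keeps one Conv1D block per branch for brevity (the
# original stacks three per branch), and the name model2 is illustrative only.
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from keras.models import Model

inp = Input(shape=(maxlen,), dtype='int32')
emb = embedding_layer(inp)                      # reuse the frozen, pre-trained embedding layer defined above
branches = []
for k in (5, 4, 6):                             # the three kernel sizes used by the three branches above
    x = Conv1D(128, k, activation='relu')(emb)  # 128 filters, kernel size k
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    branches.append(x)
x = concatenate(branches)                       # Keras 2 replacement for Merge(..., mode='concat')
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)
model2 = Model(inp, out)
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model2.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#            validation_data=(x_test, y_test))
# ------------------------------------------------------------------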
-------------------------------------------------------------------------------- /py2/05textRNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 未使用word2vector的双向lstm 5 | t 时刻输出不仅取决于之前时刻的序列输入,还取决于将来时刻序列输入 6 | embedding--->bi-directional lstm--->concat output--->average----->softmax 7 | lstm中的Xt-1,Xt代表的是一个样本中的每一个词 所有循环只在一个样本中循环 8 | TimeDistributed包装器=把一个层应用到输入的每一个时间步上-http://keras-cn.readthedocs.io/en/latest/layers/wrapper/ 9 | 思考: 10 | 分类的时候不只使用最后一个隐藏元的输出,而是把所有隐藏元的输出做K-MaxPooling再分类 11 | 在双向GRU前添加单层卷积层提取一次ngram特征-C-GRU 12 | """ 13 | from __future__ import print_function 14 | from keras.preprocessing import sequence 15 | from keras.models import Sequential 16 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 17 | from keras.datasets import imdb 18 | 19 | import pandas as pd 20 | import cPickle as pickle 21 | import numpy as np 22 | import gensim 23 | 24 | ##数据获取 25 | print('Loading data...') 26 | path='./data/nlpmaildatasample2.csv' 27 | d = pd.read_csv(path,header=None) 28 | d.columns=['title','lable'] 29 | 30 | all_data=set() 31 | for line in d["title"]: 32 | ws=line.split(" ") 33 | for w in ws: 34 | if w == ' ' or w == '' or w=="\t": 35 | continue 36 | all_data.add(w) 37 | words=list(all_data) 38 | word_to_id = dict(zip(words, range(len(words)))) 39 | dx=[] 40 | for line in d["title"]: 41 | ws=line.split(" ") 42 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 43 | # dy=list(d['lable']) 44 | dy=d['lable'] 45 | 46 | # set parameters: 47 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 48 | max_features = len(word_to_id)+1 49 | batch_size = 32 50 | embedding_dims=128 51 | 52 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 53 | print(len(x_train), 'train sequences') 54 | print(len(x_test), 'test sequences') 55 | print('Pad sequences (samples x time)') 56 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 57 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 58 | print('x_train shape:', x_train.shape) 59 | print('x_test shape:', x_test.shape) 60 | 61 | print('Build model...') 62 | model = Sequential() 63 | model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) 64 | model.add(Bidirectional(LSTM(64))) ### 输出维度64 GRU 65 | model.add(Dropout(0.5)) 66 | model.add(Dense(1, activation='sigmoid')) 67 | # try using different optimizers and different optimizer configs 68 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 69 | #lstm常选参数model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 70 | # a stateful LSTM model 71 | #lahead: the input sequence length that the LSTM 72 | # https://github.com/keras-team/keras/blob/master/examples/lstm_stateful.py 73 | # model = Sequential() 74 | # model.add(LSTM(20,input_shape=(lahead, 1), 75 | # batch_size=batch_size, 76 | # stateful=stateful)) 77 | # model.add(Dense(1)) 78 | # model.compile(loss='mse', optimizer='adam') 79 | 80 | 81 | print('Train...') 82 | model.fit(x_train, y_train, 83 | batch_size=batch_size, 84 | epochs=4, 85 | validation_data=[x_test, y_test]) 86 | 87 | # y_pred = model.predict_classes(x_test, verbose=0) 88 | -------------------------------------------------------------------------------- /py2/06textRCNN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 使用Word2vec定义词向量矩阵 5 | recurrent 
structure (convolutional layer): 6 | 词向量矩阵 7 | left(无意义补0+去最后一个词) max_token对应词向量为0向量 8 | right(去第一个词+无意义补0) 9 | lstm(left)+词向量矩阵+lstm(right)===上一个词+当前词+下一个词 10 | structure:1)recurrent structure (convolutional layer) 2)max pooling 3) fully connected layer+softmax 11 | Recurrent convolutional neural networks for text classification 12 | http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745 13 | tensoflow版https://github.com/brightmart/text_classification/blob/master/a04_TextRCNN/p71_TextRCNN_model.py 14 | """ 15 | import pandas as pd 16 | import cPickle as pickle 17 | import numpy as np 18 | import gensim 19 | from keras.preprocessing import sequence 20 | from keras import backend 21 | from keras.layers import Dense, Input, Lambda, LSTM, TimeDistributed 22 | from keras.layers.merge import concatenate 23 | from keras.layers.embeddings import Embedding 24 | from keras.models import Model 25 | 26 | ##数据获取 27 | print('Loading data...') 28 | path='./data/nlpmaildatasample2.pkl' 29 | f2 = file(path, 'rb') 30 | d = pickle.load(f2) 31 | f2.close() 32 | # path='./data/nlpmaildatasample2.csv' 33 | # d = pd.read_csv(path,header=None) 34 | # d.columns=['title','lable'] 35 | 36 | all_data=set() 37 | for line in d["title"]: 38 | ws=line.split(" ") 39 | for w in ws: 40 | if w == ' ' or w == '' or w=="\t": 41 | continue 42 | all_data.add(w) 43 | words=list(all_data) 44 | word_to_id = dict(zip(words, range(len(words)))) 45 | dx=[] 46 | for line in d["title"]: 47 | ws=line.split(" ") 48 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 49 | # dy=list(d['lable']) 50 | dy=d['lable'] 51 | 52 | 53 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 54 | 55 | # set parameters: 56 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 57 | max_features = 20000 #字典允许最大大小 58 | batch_size = 32 59 | embedding_dims = 64 #词向量长度 60 | epochs = 2 61 | hidden_dim_1 = 200 62 | hidden_dim_2 = 100 63 | w2vpath="./data/w2c_model" 64 | 65 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 66 | print(len(x_train), 'train sequences') 67 | print(len(x_test), 'test sequences') 68 | 69 | print('Pad sequences (samples x time)') 70 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 71 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 72 | print('x_train shape:', x_train.shape) 73 | print('x_test shape:', x_test.shape) 74 | 75 | 76 | print('Indexing word vectors.') 77 | embeddings_index = {} 78 | model = gensim.models.Word2Vec.load(w2vpath) 79 | for word in words: 80 | embeddings_index[word]=model[word] 81 | print('Found %s word vectors.' % len(embeddings_index)) 82 | 83 | print('Preparing embedding matrix.') 84 | max_token = min(max_features, len(word_to_id)) 85 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 86 | for word, i in word_to_id.items(): 87 | if i > max_features: 88 | continue 89 | embedding_vector = embeddings_index.get(word) 90 | if embedding_vector is not None: 91 | # words not found in embedding index will be all-zeros. 
92 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 93 | 94 | print('Build model...') 95 | document = Input(shape = (None, ), dtype = "int32") 96 | left_context = Input(shape = (None, ), dtype = "int32") 97 | right_context = Input(shape = (None, ), dtype = "int32") 98 | 99 | embedder = Embedding(max_token + 1, embedding_dims, weights = [embedding_matrix], trainable = False) 100 | doc_embedding = embedder(document) 101 | l_embedding = embedder(left_context) 102 | r_embedding = embedder(right_context) 103 | 104 | # I use LSTM RNNs instead of vanilla RNNs as described in the paper. 105 | forward = LSTM(hidden_dim_1, return_sequences = True)(l_embedding) # See equation (1). 106 | backward = LSTM(hidden_dim_1, return_sequences = True, go_backwards = True)(r_embedding) # See equation (2). 107 | together = concatenate([forward, doc_embedding, backward], axis = 2) # See equation (3). 108 | 109 | semantic = TimeDistributed(Dense(hidden_dim_2, activation = "tanh"))(together) # See equation (4). 110 | 111 | # Keras provides its own max-pooling layers, but they cannot handle variable length input 112 | # (as far as I can tell). As a result, I define my own max-pooling layer here. 113 | pool_rnn = Lambda(lambda x: backend.max(x, axis = 1), output_shape = (hidden_dim_2, ))(semantic) # See equation (5). 114 | 115 | output = Dense(1, input_dim = hidden_dim_2, activation = "sigmoid")(pool_rnn) # See equations (6) and (7).NUM_CLASSES=1 116 | 117 | model = Model(inputs = [document, left_context, right_context], outputs = output) 118 | model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"]) 119 | 120 | ##生成左右上下文 121 | print('Build left and right data') 122 | doc_x_train = np.array(x_train) 123 | # We shift the document to the right to obtain the left-side contexts. 124 | left_x_train = np.array([[max_token]+t_one[:-1].tolist() for t_one in x_train]) 125 | # We shift the document to the left to obtain the right-side contexts. 126 | right_x_train = np.array([t_one[1:].tolist()+[max_token] for t_one in x_train]) 127 | 128 | doc_x_test = np.array(x_test) 129 | # We shift the document to the right to obtain the left-side contexts. 130 | left_x_test = np.array([[max_token]+t_one[:-1].tolist() for t_one in x_test]) 131 | # We shift the document to the left to obtain the right-side contexts. 132 | right_x_test = np.array([t_one[1:].tolist()+[max_token] for t_one in x_test]) 133 | 134 | 135 | # history = model.fit([doc_x_train, left_x_train, right_x_train], y_train, epochs = 1) 136 | # loss = history.history["loss"][0] 137 | model.fit([doc_x_train, left_x_train, right_x_train], y_train, 138 | batch_size=batch_size, 139 | epochs=4, 140 | validation_data=[[doc_x_test, left_x_test, right_x_test], y_test]) 141 | 142 | -------------------------------------------------------------------------------- /py2/07Attention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 单双向lstm 之后加 + Attention HAN模型 5 | paper:Hierarchical Attention Networks for Document Classification 6 | 加入Attention之后最大的好处自然是能够直观的解释各个句子和词对分类类别的重要性 7 | Structure: 8 | 1.embedding 9 | 2.Word Encoder: 词级双向GRU,以获得丰富的词汇表征 10 | 3.Word Attention:词级注意在句子中获取重要信息 11 | 4.Sentence Encoder: 句子级双向GRU,以获得丰富的句子表征 12 | 5.Sentence Attetion: 句级注意以获得句子中的重点句子 13 | 6.FC+Softmax 14 | # HierarchicalAttention: 1.Word Encoder. 2.Word Attention. 3.Sentence Encoder 4.Sentence Attention 5.linear classifier. 
2017-06-13 15 | Attention层是一个MLP+softmax机制 16 | code参考:https://github.com/richliao/textClassifier 17 | https://github.com/philipperemy/keras-attention-mechanism 18 | https://github.com/codekansas/keras-language-modeling/blob/master/keras_models.py 19 | https://github.com/codekansas/keras-language-modeling 20 | https://github.com/EdGENetworks/attention-networks-for-classification 21 | https://github.com/brightmart/text_classification/tree/master/a05_HierarchicalAttentionNetwork 22 | 原理解说:https://www.zhihu.com/question/68482809/answer/268320399 23 | """ 24 | from keras.preprocessing import sequence 25 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 26 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 27 | from keras.layers import merge 28 | from keras.models import Model 29 | from keras import backend as K 30 | 31 | import numpy as np 32 | import pandas as pd 33 | import cPickle as pickle 34 | import numpy as np 35 | import gensim 36 | 37 | ##数据获取 38 | print('Loading data...') 39 | path='./data/nlpmaildatasample2.pkl' 40 | f2 = file(path, 'rb') 41 | d = pickle.load(f2) 42 | f2.close() 43 | # path='./data/nlpmaildatasample2.csv' 44 | # d = pd.read_csv(path,header=None) 45 | # d.columns=['title','lable'] 46 | 47 | all_data=set() 48 | for line in d["title"]: 49 | ws=line.split(" ") 50 | for w in ws: 51 | if w == ' ' or w == '' or w=="\t": 52 | continue 53 | all_data.add(w) 54 | words=list(all_data) 55 | word_to_id = dict(zip(words, range(len(words)))) 56 | dx=[] 57 | for line in d["title"]: 58 | ws=line.split(" ") 59 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 60 | # dy=list(d['lable']) 61 | dy=d['lable'] 62 | 63 | 64 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 65 | 66 | # set parameters: 67 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 68 | max_features = 20000 #字典允许最大大小 69 | batch_size = 32 70 | embedding_dims = 64 #词向量长度 71 | epochs = 2 72 | hidden_dim_1 = 200 73 | hidden_dim_2 = 100 74 | w2vpath="./data/w2c_model" 75 | 76 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 77 | print(len(x_train), 'train sequences') 78 | print(len(x_test), 'test sequences') 79 | 80 | print('Pad sequences (samples x time)') 81 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 82 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 83 | print('x_train shape:', x_train.shape) 84 | print('x_test shape:', x_test.shape) 85 | 86 | 87 | print('Indexing word vectors.') 88 | embeddings_index = {} 89 | model = gensim.models.Word2Vec.load(w2vpath) 90 | for word in words: 91 | embeddings_index[word]=model[word] 92 | print('Found %s word vectors.' % len(embeddings_index)) 93 | 94 | print('Preparing embedding matrix.') 95 | max_token = min(max_features, len(word_to_id)) 96 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 97 | for word, i in word_to_id.items(): 98 | if i > max_features: 99 | continue 100 | embedding_vector = embeddings_index.get(word) 101 | if embedding_vector is not None: 102 | # words not found in embedding index will be all-zeros. 
103 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 104 | 105 | 106 | ##句子最多几句 107 | max_sents=1 108 | 109 | embedding_layer = Embedding(max_token + 1, 110 | embedding_dims, 111 | weights=[embedding_matrix], 112 | input_length=maxlen, 113 | trainable=True) 114 | #LSTM步长 115 | TIME_STEPS=maxlen 116 | SINGLE_ATTENTION_VECTOR = False 117 | ##不带别名的自编写Attention 118 | # def attention_3d_block(inputs): 119 | # # inputs.shape = (batch_size, time_steps, input_dim) 120 | # input_dim = int(inputs.shape[2]) 121 | # a = Permute((2, 1))(inputs) 122 | # a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 123 | # a = Dense(TIME_STEPS, activation='softmax')(a) 124 | # if SINGLE_ATTENTION_VECTOR: 125 | # a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 126 | # a = RepeatVector(input_dim)(a) 127 | # a_probs = Permute((2, 1), name='attention_vec')(a) 128 | # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul') 129 | # return output_attention_mul 130 | ##使用多次attention需要新命名 131 | def attention_3d_block2(inputs,new_layer_name): 132 | # inputs.shape = (batch_size, time_steps, input_dim) 133 | input_dim = int(inputs.shape[2]) 134 | a = Permute((2, 1))(inputs) 135 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 136 | a = Dense(TIME_STEPS, activation='softmax')(a) 137 | if SINGLE_ATTENTION_VECTOR: 138 | a = Lambda(lambda x: K.mean(x, axis=1), name=new_layer_name+'_'+'dim_reduction')(a) 139 | a = RepeatVector(input_dim)(a) 140 | a_probs = Permute((2, 1), name=new_layer_name+'_''attention_vec')(a) 141 | output_attention_mul = merge([inputs, a_probs], name=new_layer_name+'_''attention_mul', mode='mul') 142 | return output_attention_mul 143 | 144 | #单向LSTM之后加入Attention 145 | # sentence_input = Input(shape=(maxlen,), dtype='int32') 146 | # embedded_sequences = embedding_layer(sentence_input) 147 | # lstm_out = LSTM(100, return_sequences=True)(embedded_sequences) 148 | # attention_mul = attention_3d_block(lstm_out) 149 | # attention_mul = Flatten()(attention_mul) 150 | # output = Dense(1, activation='sigmoid')(attention_mul) 151 | # model = Model(sentence_input, output) 152 | # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 153 | # model.fit(x_train, y_train, validation_data=(x_test, y_test), 154 | # nb_epoch=epochs, batch_size=batch_size) 155 | 156 | #双向LSTM词encoder 输入是 词标签数组 157 | sentence_input = Input(shape=(maxlen,), dtype='int32') 158 | embedded_sequences = embedding_layer(sentence_input) 159 | forward_rnn = LSTM(100, return_sequences=True) 160 | backward_rnn = LSTM(100, return_sequences=True, go_backwards=True) 161 | lstm_out_f_rnn = forward_rnn(embedded_sequences) 162 | attention_f_mul = attention_3d_block2(lstm_out_f_rnn,"forward") 163 | lstm_out_b_rnn = backward_rnn(embedded_sequences) 164 | attention_b_mul = attention_3d_block2(lstm_out_b_rnn,"backward") 165 | attention_mul=merge([attention_f_mul, attention_b_mul], mode='concat', concat_axis=-1) 166 | attention_mul = Flatten()(attention_mul) 167 | output = Dense(1, activation='sigmoid')(attention_mul) 168 | model = Model(sentence_input, output) 169 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 170 | model.fit(x_train, y_train, validation_data=(x_test, y_test), 171 | nb_epoch=epochs, batch_size=batch_size) 172 | 173 | ####先词Attention再句Attention Hierarchical Attention Networks 
for Document Classification 174 | #词encoder 输入是 词标签数组 未完待续 175 | #句encoder 输入是 句子个数x词标签数组 176 | 177 | -------------------------------------------------------------------------------- /py2/README.md: -------------------------------------------------------------------------------- 1 | ## keras实现深度学习模型 进行文本分类 2 | 3 | > 实验数据采用真实邮件数据,涉及个人隐私,无法公开,可自行寻找数据测试--格式为:文本内容,标签 4 | 5 | > 模型参数未经过合适调整,目前正在实验修改验证模型当中,修改完成会更新项目 6 | 7 | 01mail.py 文本数据生成-输出文本 词典 非一次执行 8 | 9 | 02mail.py 文本词袋向量化/TF-IDF标准化/文本Hash+朴素贝叶斯 10 | 11 | 03fastText.py fastText库训练 12 | 13 | 03fastText_keras.py fastText keras实现 14 | 15 | 04textCNN.py word2vecter做词向量的CNN两种模型 16 | 17 | 05textRNN.py 双向lstm随机初始词向量 18 | 19 | 06textRCNN.py Recurrent Convolutional Neural Networks for Text Classification 20 | 21 | 07Attention.py 双向LSTM+Attention分层注意网络 -HAN模型 22 | 23 | -------------------------------------------------------------------------------- /py2/mymodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 1、自定义模型 Conv-BiGRU 卷积和循环并行 5 | 2、自定义模型 卷积和循环串行 6 | """ 7 | from keras.preprocessing import sequence 8 | from keras.layers import Dense, Input, Flatten,Permute,Reshape 9 | from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed 10 | from keras.layers import merge 11 | from keras.models import Model 12 | from keras import backend as K 13 | from keras.models import Sequential 14 | 15 | import numpy as np 16 | import pandas as pd 17 | import cPickle as pickle 18 | import numpy as np 19 | import gensim 20 | 21 | ##数据获取 22 | print('Loading data...') 23 | path='./data/nlpmaildatasample2.pkl' 24 | f2 = file(path, 'rb') 25 | d = pickle.load(f2) 26 | f2.close() 27 | # path='./data/nlpmaildatasample2.csv' 28 | # d = pd.read_csv(path,header=None) 29 | # d.columns=['title','lable'] 30 | 31 | all_data=set() 32 | for line in d["title"]: 33 | ws=line.split(" ") 34 | for w in ws: 35 | if w == ' ' or w == '' or w=="\t": 36 | continue 37 | all_data.add(w) 38 | words=list(all_data) 39 | word_to_id = dict(zip(words, range(len(words)))) 40 | dx=[] 41 | for line in d["title"]: 42 | ws=line.split(" ") 43 | dx.append([word_to_id[w] for w in ws if w in word_to_id]) 44 | # dy=list(d['lable']) 45 | dy=d['lable'] 46 | 47 | 48 | print('Average sequence length: {}'.format(np.mean(list(map(len, dx)), dtype=int))) 49 | 50 | # set parameters: 51 | maxlen=np.max(list(map(len, dx))) #maxlen = 400 最长文本词数 52 | max_features = 20000 #字典允许最大大小 53 | batch_size = 32 54 | embedding_dims = 64 #词向量长度 55 | epochs = 2 56 | hidden_dim_1 = 200 57 | hidden_dim_2 = 100 58 | w2vpath="./data/w2c_model" 59 | 60 | x_train, y_train, x_test, y_test = dx[0:len(dx)/5*3],dy[0:len(dx)/5*3],dx[len(dx)/5*3:len(dx)],dy[len(dx)/5*3:len(dx)] 61 | print(len(x_train), 'train sequences') 62 | print(len(x_test), 'test sequences') 63 | 64 | print('Pad sequences (samples x time)') 65 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 66 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 67 | print('x_train shape:', x_train.shape) 68 | print('x_test shape:', x_test.shape) 69 | 70 | 71 | print('Indexing word vectors.') 72 | embeddings_index = {} 73 | model = gensim.models.Word2Vec.load(w2vpath) 74 | for word in words: 75 | embeddings_index[word]=model[word] 76 | print('Found %s word vectors.' 
% len(embeddings_index)) 77 | 78 | print('Preparing embedding matrix.') 79 | max_token = min(max_features, len(word_to_id)) 80 | embedding_matrix = np.zeros((max_token + 1, embedding_dims)) 81 | for word, i in word_to_id.items(): 82 | if i > max_features: 83 | continue 84 | embedding_vector = embeddings_index.get(word) 85 | if embedding_vector is not None: 86 | # words not found in embedding index will be all-zeros. 87 | embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(max_token) 88 | 89 | embedding_layer = Embedding(max_token+1,embedding_dims,input_length=maxlen,weights=[embedding_matrix],trainable=False) 90 | 91 | ####并行 92 | model_left = Sequential() 93 | model_left.add(embedding_layer) 94 | model_left.add(Bidirectional(GRU(128))) 95 | 96 | model_right = Sequential() 97 | model_right.add(embedding_layer) 98 | model_right.add(Conv1D(128, 5, activation='relu')) #128卷积核的个数 5卷积核大小 99 | model_right.add(MaxPooling1D())#5 100 | model_right.add(Conv1D(128, 1, activation='relu')) 101 | model_right.add(MaxPooling1D())#5 102 | model_right.add(Flatten()) 103 | 104 | merged = Merge([model_left, model_right], mode='concat') 105 | model = Sequential() 106 | model.add(merged) # add merge 107 | model.add(Dense(128, activation='relu')) # 全连接层 108 | model.add(Dropout(0.2)) 109 | model.add(Dense(1, activation='sigmoid')) # 110 | 111 | model.compile(loss='binary_crossentropy', 112 | optimizer='adam', 113 | metrics=['accuracy']) 114 | model.fit(x_train, y_train, 115 | batch_size=batch_size, 116 | epochs=epochs, 117 | validation_data=(x_test, y_test)) 118 | 119 | ####串行 120 | sentence_input = Input(shape=(maxlen,), dtype='int32') 121 | embedded_sequences = embedding_layer(sentence_input) 122 | conv_1=Conv1D(128, 3, activation='relu')(embedded_sequences) 123 | maxpool_1=MaxPooling1D()(conv_1) 124 | drop_1 = Dropout(0.2)(maxpool_1) 125 | biGRU=Bidirectional(GRU(128))(drop_1) 126 | drop_2 = Dropout(0.5)(biGRU) 127 | dense_1 = Dense(1, activation='sigmoid')(drop_2) 128 | model.compile(loss='binary_crossentropy', 129 | optimizer='adam', 130 | metrics=['accuracy']) 131 | model.fit(x_train, y_train, 132 | batch_size=batch_size, 133 | epochs=epochs, 134 | validation_data=(x_test, y_test)) 135 | -------------------------------------------------------------------------------- /py2/word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | gensim的Word2vector使用 5 | pip install gensim 6 | 输入数据要求是:分词后数据,以空格为单词的分隔符 7 | """ 8 | from gensim.models import Word2Vec 9 | import pandas as pd 10 | import cPickle as pickle 11 | path='./data/nlpmaildata2.pkl' 12 | f2 = file(path, 'rb') 13 | d = pickle.load(f2) 14 | f2.close() 15 | 16 | 17 | modelpath="./data/w2c_model" 18 | sentences=list(d["title"]) 19 | sentences= [s.decode("utf-8").encode('utf-8').split() for s in sentences] 20 | 21 | model = Word2Vec(sentences, sg=1, size=64, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4) 22 | # 1.sg=1是skip-gram算法,对低频词敏感;默认sg=0为CBOW算法。 23 | # 2.size是输出词向量的维数,值太小会导致词映射因为冲突而影响结果,值太大则会耗内存并使算法计算变慢,一般值取为100到200之间。 24 | # 3.window是句子中当前词与目标词之间的最大距离,3表示在目标词前看3-b个词,后面看b个词(b在0-3之间随机)。 25 | # 4.min_count是对词进行过滤,频率小于min-count的单词则会被忽视,默认值为5。 26 | # 5.negative和sample可根据训练结果进行微调,sample表示更高频率的词被随机下采样到所设置的阈值,默认值为1e-3。 27 | # 6.hs=1表示层级softmax将会被使用,默认hs=0且negative不为0,则负采样将会被选择使用。 28 | # 7.workers控制训练的并行,此参数只有在安装了Cpython后才有效,否则只能使用单核。 29 | # model["英文"] 30 | model.save(modelpath) 31 | # model = Word2Vec.load(fname) 
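# --- Editor's sketch (not in the original script): how the py2 model
# --- scripts (e.g. 04textCNN.py) typically turn this saved model into a
# --- Keras embedding matrix. word_to_id, max_features and embedding_dims
# --- are assumed to be defined as in those scripts.
# import numpy as np
# w2v = Word2Vec.load(modelpath)
# embeddings_index = {w: w2v[w] for w in word_to_id}      # min_count=1, so every corpus word has a vector
# max_token = min(max_features, len(word_to_id))
# embedding_matrix = np.zeros((max_token + 1, embedding_dims))
# for word, i in word_to_id.items():
#     if i > max_features:
#         continue
#     vec = embeddings_index.get(word)
#     if vec is not None:
#         embedding_matrix[i] = vec                        # words without a vector stay all-zero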
32 | 33 | #模型使用(词语相似度计算等) 34 | # model.most_similar(positive=['woman', 'king'], negative=['man']) 35 | # #输出[('queen', 0.50882536), ...] 36 | 37 | # model.doesnt_match("breakfast cereal dinner lunch".split()) 38 | # #输出'cereal' 39 | 40 | # model.similarity('woman', 'man') 41 | # #输出0.73723527 42 | 43 | # model['computer'] # raw numpy vector of a word 44 | #输出array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) 45 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | gensim的Word2vector使用 5 | pip install gensim 6 | 输入数据要求是:分词后数据,以空格为单词的分隔符 7 | 原理讲解 8 | https://www.cnblogs.com/f-young/p/7906451.html 9 | """ 10 | from gensim.models import Word2Vec 11 | import pandas as pd 12 | # import cPickle as pickle 13 | # path='./data/nlpmaildata2.pkl' 14 | # f2 = file(path, 'rb') 15 | # d = pickle.load(f2) 16 | # f2.close() 17 | 18 | path='./data/nlpmail_re3.txt' 19 | d = pd.read_csv(path,header=None) 20 | d.columns=['title','lable'] 21 | # sentences= [str(s).split() for s in sentences] 22 | 23 | 24 | modelpath="./data/w2c_model" 25 | sentences=list(d["title"]) 26 | sentences= [str(s).split() for s in sentences] 27 | 28 | model = Word2Vec(sentences, sg=1, size=128, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4) 29 | # 1.sg=1是skip-gram算法,对低频词敏感;默认sg=0为CBOW算法。 30 | # 2.size是输出词向量的维数,值太小会导致词映射因为冲突而影响结果,值太大则会耗内存并使算法计算变慢,一般值取为100到200之间。 31 | # 3.window是句子中当前词与目标词之间的最大距离,3表示在目标词前看3-b个词,后面看b个词(b在0-3之间随机)。 32 | # 4.min_count是对词进行过滤,频率小于min-count的单词则会被忽视,默认值为5。 33 | # 5.negative和sample可根据训练结果进行微调,sample表示更高频率的词被随机下采样到所设置的阈值,默认值为1e-3。 34 | #作者在论文中说到,当样本量比较小的时候,选择5-20个negative words效果会比较好,当样本量比较大的时候,2-5个negative words就能得到很好的效果 35 | # 6.hs=1表示层级softmax将会被使用,默认hs=0且negative不为0,则负采样将会被选择使用。 36 | # 7.workers控制训练的并行,此参数只有在安装了Cpython后才有效,否则只能使用单核。 37 | # model["英文"] 38 | model.save(modelpath) 39 | # model = Word2Vec.load(fname) 40 | 41 | #模型使用(词语相似度计算等) 42 | # model.most_similar(positive=['woman', 'king'], negative=['man']) 43 | # #输出[('queen', 0.50882536), ...] 44 | 45 | # model.doesnt_match("breakfast cereal dinner lunch".split()) 46 | # #输出'cereal' 47 | 48 | # model.similarity('woman', 'man') 49 | # #输出0.73723527 50 | 51 | # model['computer'] # raw numpy vector of a word 52 | #输出array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) 53 | --------------------------------------------------------------------------------
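Editor's note: both word2vec.py scripts above target the older gensim API (direct indexing
with model[word], the size= argument). Under gensim >= 4.0 that argument was renamed to
vector_size and word vectors are accessed through model.wv. A hedged sketch of the
equivalent calls, reusing the sentences/modelpath variables defined in the script above:

from gensim.models import Word2Vec

model = Word2Vec(sentences, sg=1, vector_size=128, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=4)
model.save(modelpath)
vec = model.wv['computer']                                   # replaces model['computer']
sims = model.wv.most_similar(positive=['woman', 'king'], negative=['man'])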