├── README.md
├── cnn_lstm.py
├── text_cnn.py
├── text_cnn_lstm.py
└── sklearn-pipeline.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# 20newsgroups-text-classification
Text classification on the 20 newsgroups dataset.
# Methods
- Text classification with traditional machine-learning methods
- Text classification with deep learning
# Test results
- Traditional machine-learning methods

| Classifier                 | Accuracy              |
| -------------------------- | --------------------- |
| MultinomialNB              | 0.8960196779964222    |
| SGDClassifier              | 0.9724955277280859    |
| LogisticRegression         | 0.9304561717352415    |
| SVC                        | 0.13372093023255813   |
| LinearSVC                  | 0.9749552772808586    |
| LinearSVR                  | 0.00022361359570661896 |
| MLPClassifier              | 0.9758497316636852    |
| KNeighborsClassifier       | 0.45840787119856885   |
| RandomForestClassifier     | 0.9680232558139535    |
| GradientBoostingClassifier | 0.9186046511627907    |
| AdaBoostClassifier         | 0.5916815742397138    |
| DecisionTreeClassifier     | 0.9758497316636852    |

Note: sklearn-pipeline.py as originally written loaded `subset='train'` for both splits, so these figures appear to be training-set accuracies; after fixing the second load to `subset='test'`, expect noticeably lower numbers for the high scorers.

- CNN text classification

Requires the GloVe word vectors: http://nlp.stanford.edu/data/glove.6B.zip

The results honestly aren't great...
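A minimal download sketch (not part of the original scripts) for fetching and unpacking the 100-dimensional vectors that text_cnn.py reads; the file name is the standard one inside the archive:

```python
import os
import urllib.request
import zipfile

GLOVE_TXT = 'glove.6B.100d.txt'   # the file text_cnn.py expects in the repo root
if not os.path.exists(GLOVE_TXT):
    # The archive is large; this may take a while.
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip',
                               'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip') as zf:
        zf.extract(GLOVE_TXT)
```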
--------------------------------------------------------------------------------
/cnn_lstm.py:
--------------------------------------------------------------------------------
'''Train a recurrent convolutional network on the IMDB sentiment
classification task.
Gets to 0.8498 test accuracy after 2 epochs. 41s/epoch on K520 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb


class CNNLSTM():
    # Embedding
    max_features = 10000
    maxlen = 1000
    embedding_size = 128

    # Convolution
    kernel_size = 5
    filters = 64
    pool_size = 4

    # LSTM
    lstm_output_size = 70

    # Training
    batch_size = 30
    epochs = 2

    # Note: batch_size is highly sensitive, and only 2 epochs are needed
    # because the dataset is very small.

    def initialize(self, num_classes=1):
        """Build the Conv1D -> LSTM model.

        num_classes=1 keeps the original binary (sigmoid) head for IMDB;
        num_classes > 1 switches to a softmax head with a sparse categorical
        loss so text_cnn_lstm.py can reuse the architecture for 8 classes
        (the original always built the binary head).
        """
        print('Build model...')
        model = Sequential()
        model.add(Embedding(self.max_features, self.embedding_size,
                            input_length=self.maxlen))
        model.add(Dropout(0.25))
        model.add(Conv1D(self.filters,
                         self.kernel_size,
                         padding='valid',  # Keras 2 name for border_mode
                         activation='relu'))
        model.add(MaxPooling1D(pool_size=self.pool_size))  # pool_length in Keras 1
        model.add(LSTM(self.lstm_output_size))
        if num_classes > 1:
            model.add(Dense(num_classes))
            model.add(Activation('softmax'))
            loss = 'sparse_categorical_crossentropy'
        else:
            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            loss = 'binary_crossentropy'

        model.compile(loss=loss,
                      optimizer='adam',
                      metrics=['accuracy'])
        print('Model compiled.')
        return model

    def train(self, model):
        print('Loading data...')
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=self.max_features)
        print(len(x_train), 'train sequences')
        print(len(x_test), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        print('x_train shape:', x_train.shape)
        print('x_test shape:', x_test.shape)

        print('Train...')
        model.fit(x_train, y_train,
                  batch_size=self.batch_size,
                  epochs=self.epochs,
                  validation_data=(x_test, y_test))
        score, acc = model.evaluate(x_test, y_test, batch_size=self.batch_size)
        print('Test score:', score)
        print('Test accuracy:', acc)


if __name__ == '__main__':
    # Build only when run directly, so importing CNNLSTM from other
    # scripts does not construct a model as a side effect.
    model = CNNLSTM().initialize()
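# Added note: shape walk-through for the default (binary) stack above, batch
# dimension omitted; the numbers follow directly from the hyper-parameters:
#   Embedding:     (1000,)     -> (1000, 128)
#   Conv1D valid:  (1000, 128) -> (996, 64)   # 1000 - 5 + 1
#   MaxPooling1D:  (996, 64)   -> (249, 64)   # 996 // 4
#   LSTM:          (249, 64)   -> (70,)       # final hidden state only
#   Dense+sigmoid: (70,)       -> (1,)        # P(positive review)
# Calling model.summary() after initialize() prints the same shapes.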
--------------------------------------------------------------------------------
/text_cnn.py:
--------------------------------------------------------------------------------
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.utils.np_utils import to_categorical

GLOVE_PATH = 'glove.6B.100d.txt'  # renamed from TEXT_DATA_DIR: it is a file, not a directory
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# Build the word -> vector index from the GloVe file
embeddings_index = {}
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Selected categories
selected_categories = [
    'comp.graphics',
    'rec.motorcycles',
    'rec.sport.baseball',
    'misc.forsale',
    'sci.electronics',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

# Load the data
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))
texts = newsgroups_train['data']
labels = newsgroups_train['target']
print(len(texts))
print(np.unique(labels))

# Vectorize the texts (num_words was called nb_words in Keras 1)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad to MAX_SEQUENCE_LENGTH (the original padded to MAX_NB_WORDS by mistake)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('Data validation split.')

# Split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

# Build the embedding matrix (+1 because Keras word indices start at 1);
# words without a pretrained vector keep an all-zero row.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Initializing the model...')

# Convolution and pooling stack
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(8, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
print('Training the model...')
# Train the model
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,  # nb_epoch in Keras 1
          validation_data=(x_val, y_val))
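# Added sketch (not in the original): newsgroups_test is loaded above but never
# used. Assuming the trained `model` and fitted `tokenizer` from this script,
# test-set accuracy could be measured like this:
test_sequences = tokenizer.texts_to_sequences(newsgroups_test['data'])
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = to_categorical(np.asarray(newsgroups_test['target']))
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=128)
print('Test accuracy:', test_acc)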
--------------------------------------------------------------------------------
/text_cnn_lstm.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression

from cnn_lstm import CNNLSTM
import joblib  # sklearn.externals.joblib is removed in recent scikit-learn

selected_categories = [
    'comp.graphics',
    'comp.windows.x',
    'rec.motorcycles',
    'rec.sport.baseball',
    'sci.crypt',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))
texts = newsgroups_train['data']
labels = newsgroups_train['target']

print(len(texts))
print(np.unique(labels))

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 10000
VALIDATION_SPLIT = 0.2

# Training
batch_size = 30
epochs = 2

# Finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Initialize model.')
# Eight integer-labelled classes, so build the softmax variant of the model;
# the original called initialize() and got the binary sigmoid IMDB head,
# which cannot fit 8-class targets.
model = CNNLSTM().initialize(num_classes=8)

print('x_train shape:', x_train.shape)
print('x_val shape:', x_val.shape)

print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_val, y_val), verbose=1)
score, acc = model.evaluate(x_val, y_val, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

# Baseline: logistic regression on the same padded token-id matrix
# (LogisticRegression usage adapted from a scikit-learn documentation example,
# "Modified for documentation by Jaques Grobler, License: BSD 3 clause")
logreg = LogisticRegression(C=1e5, verbose=1)

print('Training')
logreg.fit(x_train, y_train)
joblib.dump(logreg, 'logreg.pkl')

clf = joblib.load('logreg.pkl')
pred = clf.predict(x_train)
print(((pred == y_train).sum() * 100.0) / np.shape(y_train)[0])

# Alternative baseline, kept commented out as in the original:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# gnb.fit(x_train, y_train)
# joblib.dump(gnb, 'gnb.pkl')
# clf = joblib.load('gnb.pkl')
# pred = clf.predict(x_train)
# print(((pred == y_train).sum() * 100.0) / np.shape(y_train)[0])
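# Added sketch (not in the original): the percentage above is training-set
# accuracy, so it flatters the model. The held-out split built earlier gives
# the honest figure:
val_pred = logreg.predict(x_val)
print('LogisticRegression validation accuracy:', (val_pred == y_val).mean())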
--------------------------------------------------------------------------------
/sklearn-pipeline.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
# Public import paths; the private module paths used originally
# (neural_network.multilayer_perceptron, linear_model.stochastic_gradient,
# linear_model.logistic) are deprecated or removed in newer scikit-learn.
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC, LinearSVR
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Use the following 8 categories
selected_categories = [
    'comp.graphics',
    'rec.motorcycles',
    'rec.sport.baseball',
    'misc.forsale',
    'sci.electronics',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

# Load the dataset. The original passed subset='train' for both loads, so
# every accuracy below was really training-set accuracy; subset='test' here
# fixes that (and will lower the reported numbers).
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))

train_texts = newsgroups_train['data']
train_labels = newsgroups_train['target']
test_texts = newsgroups_test['data']
test_labels = newsgroups_test['target']
print(len(train_texts), len(test_texts))

# Naive Bayes
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('MultinomialNB accuracy:', np.mean(predicted == test_labels))

# SGD
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', SGDClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('SGDClassifier accuracy:', np.mean(predicted == test_labels))

# LogisticRegression
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LogisticRegression())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LogisticRegression accuracy:', np.mean(predicted == test_labels))

# SVM with the default RBF kernel
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', SVC())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('SVC accuracy:', np.mean(predicted == test_labels))
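# Added sketch (an assumption, not a measured result): SVC's near-chance score
# is what an RBF kernel with an unsuitable default gamma tends to give on
# sparse TF-IDF features; a small grid search usually recovers a sensible
# model. gamma='scale' requires scikit-learn >= 0.20.
from sklearn.model_selection import GridSearchCV

svc_grid = GridSearchCV(
    Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
              ('clf', SVC())]),
    {'clf__kernel': ['linear', 'rbf'],
     'clf__C': [0.1, 1, 10],
     'clf__gamma': ['scale', 0.01, 0.1]},
    cv=3, n_jobs=-1)
svc_grid.fit(train_texts, train_labels)
print('best SVC params:', svc_grid.best_params_)
print('tuned SVC accuracy:', svc_grid.score(test_texts, test_labels))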
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LinearSVC())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LinearSVC accuracy:', np.mean(predicted == test_labels))

# LinearSVR is a regressor, not a classifier: comparing its real-valued
# predictions to integer labels with == is almost never true, hence the
# near-zero "accuracy" in the README. Kept for completeness.
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LinearSVR())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LinearSVR accuracy:', np.mean(predicted == test_labels))

# MLPClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', MLPClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('MLPClassifier accuracy:', np.mean(predicted == test_labels))

# KNeighborsClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', KNeighborsClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('KNeighborsClassifier accuracy:', np.mean(predicted == test_labels))

# RandomForestClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', RandomForestClassifier(n_estimators=8))])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('RandomForestClassifier accuracy:', np.mean(predicted == test_labels))

# GradientBoostingClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', GradientBoostingClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('GradientBoostingClassifier accuracy:', np.mean(predicted == test_labels))

# AdaBoostClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', AdaBoostClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('AdaBoostClassifier accuracy:', np.mean(predicted == test_labels))

# DecisionTreeClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', DecisionTreeClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('DecisionTreeClassifier accuracy:', np.mean(predicted == test_labels))
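# Added sketch (not in the original): the repeated blocks above could be
# collapsed into one loop over a name -> estimator mapping; for classifiers,
# Pipeline.score reports the same accuracy as np.mean(predicted == test_labels).
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'SGDClassifier': SGDClassifier(),
    'LogisticRegression': LogisticRegression(),
    'LinearSVC': LinearSVC(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    # ...and so on for the remaining estimators...
}
for name, estimator in classifiers.items():
    pipe = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', estimator)])
    pipe.fit(train_texts, train_labels)
    print(name, 'accuracy:', pipe.score(test_texts, test_labels))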