├── README.md
├── cnn_lstm.py
├── text_cnn.py
├── text_cnn_lstm.py
└── sklearn-pipeline.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# 20newsgroups-text-classification
Text classification on the 20 newsgroups dataset.
# Methods
- Text classification with traditional machine-learning methods
- Text classification with deep learning
# Test results
- Traditional machine-learning methods

| Classifier                 | Accuracy              |
| -------------------------- | --------------------- |
| MultinomialNB              | 0.8960196779964222    |
| SGDClassifier              | 0.9724955277280859    |
| LogisticRegression         | 0.9304561717352415    |
| SVC                        | 0.13372093023255813   |
| LinearSVC                  | 0.9749552772808586    |
| LinearSVR                  | 0.00022361359570661896 |
| MLPClassifier              | 0.9758497316636852    |
| KNeighborsClassifier       | 0.45840787119856885   |
| RandomForestClassifier     | 0.9680232558139535    |
| GradientBoostingClassifier | 0.9186046511627907    |
| AdaBoostClassifier         | 0.5916815742397138    |
| DecisionTreeClassifier     | 0.9758497316636852    |

Note: sklearn-pipeline.py as originally written loaded `subset='train'` for both splits, so these figures appear to be training-set accuracies; after fixing the second load to `subset='test'`, expect noticeably lower numbers for the high scorers.

- CNN text classification

Requires the GloVe word vectors: http://nlp.stanford.edu/data/glove.6B.zip

The results honestly aren't great...
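A minimal download sketch (not part of the original scripts) for fetching and unpacking the 100-dimensional vectors that text_cnn.py reads; the file name is the standard one inside the archive:

```python
import os
import urllib.request
import zipfile

GLOVE_TXT = 'glove.6B.100d.txt'   # the file text_cnn.py expects in the repo root
if not os.path.exists(GLOVE_TXT):
    # The archive is large; this may take a while.
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip',
                               'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip') as zf:
        zf.extract(GLOVE_TXT)
```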
--------------------------------------------------------------------------------
/cnn_lstm.py:
--------------------------------------------------------------------------------
'''Train a recurrent convolutional network on the IMDB sentiment
classification task.
Gets to 0.8498 test accuracy after 2 epochs. 41s/epoch on K520 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb


class CNNLSTM():
    # Embedding
    max_features = 10000
    maxlen = 1000
    embedding_size = 128

    # Convolution
    kernel_size = 5
    filters = 64
    pool_size = 4

    # LSTM
    lstm_output_size = 70

    # Training
    batch_size = 30
    epochs = 2

    # Note: batch_size is highly sensitive, and only 2 epochs are needed
    # because the dataset is very small.

    def initialize(self, num_classes=1):
        """Build the Conv1D -> LSTM model.

        num_classes=1 keeps the original binary (sigmoid) head for IMDB;
        num_classes > 1 switches to a softmax head with a sparse categorical
        loss so text_cnn_lstm.py can reuse the architecture for 8 classes
        (the original always built the binary head).
        """
        print('Build model...')
        model = Sequential()
        model.add(Embedding(self.max_features, self.embedding_size,
                            input_length=self.maxlen))
        model.add(Dropout(0.25))
        model.add(Conv1D(self.filters,
                         self.kernel_size,
                         padding='valid',  # Keras 2 name for border_mode
                         activation='relu'))
        model.add(MaxPooling1D(pool_size=self.pool_size))  # pool_length in Keras 1
        model.add(LSTM(self.lstm_output_size))
        if num_classes > 1:
            model.add(Dense(num_classes))
            model.add(Activation('softmax'))
            loss = 'sparse_categorical_crossentropy'
        else:
            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            loss = 'binary_crossentropy'

        model.compile(loss=loss,
                      optimizer='adam',
                      metrics=['accuracy'])
        print('Model compiled.')
        return model

    def train(self, model):
        print('Loading data...')
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=self.max_features)
        print(len(x_train), 'train sequences')
        print(len(x_test), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        print('x_train shape:', x_train.shape)
        print('x_test shape:', x_test.shape)

        print('Train...')
        model.fit(x_train, y_train,
                  batch_size=self.batch_size,
                  epochs=self.epochs,
                  validation_data=(x_test, y_test))
        score, acc = model.evaluate(x_test, y_test, batch_size=self.batch_size)
        print('Test score:', score)
        print('Test accuracy:', acc)


if __name__ == '__main__':
    # Build only when run directly, so importing CNNLSTM from other
    # scripts does not construct a model as a side effect.
    model = CNNLSTM().initialize()
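# Added note: shape walk-through for the default (binary) stack above, batch
# dimension omitted; the numbers follow directly from the hyper-parameters:
#   Embedding:     (1000,)     -> (1000, 128)
#   Conv1D valid:  (1000, 128) -> (996, 64)   # 1000 - 5 + 1
#   MaxPooling1D:  (996, 64)   -> (249, 64)   # 996 // 4
#   LSTM:          (249, 64)   -> (70,)       # final hidden state only
#   Dense+sigmoid: (70,)       -> (1,)        # P(positive review)
# Calling model.summary() after initialize() prints the same shapes.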
--------------------------------------------------------------------------------
/text_cnn.py:
--------------------------------------------------------------------------------
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.utils.np_utils import to_categorical

GLOVE_PATH = 'glove.6B.100d.txt'  # renamed from TEXT_DATA_DIR: it is a file, not a directory
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# Build the word -> vector index from the GloVe file
embeddings_index = {}
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Selected categories
selected_categories = [
    'comp.graphics',
    'rec.motorcycles',
    'rec.sport.baseball',
    'misc.forsale',
    'sci.electronics',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

# Load the data
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))
texts = newsgroups_train['data']
labels = newsgroups_train['target']
print(len(texts))
print(np.unique(labels))

# Vectorize the texts (num_words was called nb_words in Keras 1)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad to MAX_SEQUENCE_LENGTH (the original padded to MAX_NB_WORDS by mistake)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('Data validation split.')

# Split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

# Build the embedding matrix (+1 because Keras word indices start at 1);
# words without a pretrained vector keep an all-zero row.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Initializing the model...')

# Convolution and pooling stack
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(8, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
print('Training the model...')
# Train the model
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,  # nb_epoch in Keras 1
          validation_data=(x_val, y_val))
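# Added sketch (not in the original): newsgroups_test is loaded above but never
# used. Assuming the trained `model` and fitted `tokenizer` from this script,
# test-set accuracy could be measured like this:
test_sequences = tokenizer.texts_to_sequences(newsgroups_test['data'])
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = to_categorical(np.asarray(newsgroups_test['target']))
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=128)
print('Test accuracy:', test_acc)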
--------------------------------------------------------------------------------
/text_cnn_lstm.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression

from cnn_lstm import CNNLSTM
import joblib  # sklearn.externals.joblib is removed in recent scikit-learn

selected_categories = [
    'comp.graphics',
    'comp.windows.x',
    'rec.motorcycles',
    'rec.sport.baseball',
    'sci.crypt',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))
texts = newsgroups_train['data']
labels = newsgroups_train['target']

print(len(texts))
print(np.unique(labels))

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 10000
VALIDATION_SPLIT = 0.2

# Training
batch_size = 30
epochs = 2

# Finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Initialize model.')
# Eight integer-labelled classes, so build the softmax variant of the model;
# the original called initialize() and got the binary sigmoid IMDB head,
# which cannot fit 8-class targets.
model = CNNLSTM().initialize(num_classes=8)

print('x_train shape:', x_train.shape)
print('x_val shape:', x_val.shape)

print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_val, y_val), verbose=1)
score, acc = model.evaluate(x_val, y_val, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

# Baseline: logistic regression on the same padded token-id matrix
# (LogisticRegression usage adapted from a scikit-learn documentation example,
# "Modified for documentation by Jaques Grobler, License: BSD 3 clause")
logreg = LogisticRegression(C=1e5, verbose=1)

print('Training')
logreg.fit(x_train, y_train)
joblib.dump(logreg, 'logreg.pkl')

clf = joblib.load('logreg.pkl')
pred = clf.predict(x_train)
print(((pred == y_train).sum() * 100.0) / np.shape(y_train)[0])

# Alternative baseline, kept commented out as in the original:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# gnb.fit(x_train, y_train)
# joblib.dump(gnb, 'gnb.pkl')
# clf = joblib.load('gnb.pkl')
# pred = clf.predict(x_train)
# print(((pred == y_train).sum() * 100.0) / np.shape(y_train)[0])
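# Added sketch (not in the original): the percentage above is training-set
# accuracy, so it flatters the model. The held-out split built earlier gives
# the honest figure:
val_pred = logreg.predict(x_val)
print('LogisticRegression validation accuracy:', (val_pred == y_val).mean())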
--------------------------------------------------------------------------------
/sklearn-pipeline.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
# Public import paths; the private module paths used originally
# (neural_network.multilayer_perceptron, linear_model.stochastic_gradient,
# linear_model.logistic) are deprecated or removed in newer scikit-learn.
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC, LinearSVR
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Use the following 8 categories
selected_categories = [
    'comp.graphics',
    'rec.motorcycles',
    'rec.sport.baseball',
    'misc.forsale',
    'sci.electronics',
    'sci.med',
    'talk.politics.guns',
    'talk.religion.misc']

# Load the dataset. The original passed subset='train' for both loads, so
# every accuracy below was really training-set accuracy; subset='test' here
# fixes that (and will lower the reported numbers).
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=selected_categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=selected_categories,
                                     remove=('headers', 'footers', 'quotes'))

train_texts = newsgroups_train['data']
train_labels = newsgroups_train['target']
test_texts = newsgroups_test['data']
test_labels = newsgroups_test['target']
print(len(train_texts), len(test_texts))

# Naive Bayes
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('MultinomialNB accuracy:', np.mean(predicted == test_labels))

# SGD
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', SGDClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('SGDClassifier accuracy:', np.mean(predicted == test_labels))

# LogisticRegression
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LogisticRegression())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LogisticRegression accuracy:', np.mean(predicted == test_labels))

# SVM with the default RBF kernel
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', SVC())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('SVC accuracy:', np.mean(predicted == test_labels))
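# Added sketch (an assumption, not a measured result): SVC's near-chance score
# is what an RBF kernel with an unsuitable default gamma tends to give on
# sparse TF-IDF features; a small grid search usually recovers a sensible
# model. gamma='scale' requires scikit-learn >= 0.20.
from sklearn.model_selection import GridSearchCV

svc_grid = GridSearchCV(
    Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
              ('clf', SVC())]),
    {'clf__kernel': ['linear', 'rbf'],
     'clf__C': [0.1, 1, 10],
     'clf__gamma': ['scale', 0.01, 0.1]},
    cv=3, n_jobs=-1)
svc_grid.fit(train_texts, train_labels)
print('best SVC params:', svc_grid.best_params_)
print('tuned SVC accuracy:', svc_grid.score(test_texts, test_labels))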
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LinearSVC())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LinearSVC accuracy:', np.mean(predicted == test_labels))

# LinearSVR is a regressor, not a classifier: comparing its real-valued
# predictions to integer labels with == is almost never true, hence the
# near-zero "accuracy" in the README. Kept for completeness.
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', LinearSVR())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('LinearSVR accuracy:', np.mean(predicted == test_labels))

# MLPClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', MLPClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('MLPClassifier accuracy:', np.mean(predicted == test_labels))

# KNeighborsClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', KNeighborsClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('KNeighborsClassifier accuracy:', np.mean(predicted == test_labels))

# RandomForestClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', RandomForestClassifier(n_estimators=8))])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('RandomForestClassifier accuracy:', np.mean(predicted == test_labels))

# GradientBoostingClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', GradientBoostingClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('GradientBoostingClassifier accuracy:', np.mean(predicted == test_labels))

# AdaBoostClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', AdaBoostClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('AdaBoostClassifier accuracy:', np.mean(predicted == test_labels))

# DecisionTreeClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', DecisionTreeClassifier())])
text_clf = text_clf.fit(train_texts, train_labels)
predicted = text_clf.predict(test_texts)
print('DecisionTreeClassifier accuracy:', np.mean(predicted == test_labels))
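# Added sketch (not in the original): the repeated blocks above could be
# collapsed into one loop over a name -> estimator mapping; for classifiers,
# Pipeline.score reports the same accuracy as np.mean(predicted == test_labels).
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'SGDClassifier': SGDClassifier(),
    'LogisticRegression': LogisticRegression(),
    'LinearSVC': LinearSVC(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    # ...and so on for the remaining estimators...
}
for name, estimator in classifiers.items():
    pipe = Pipeline([('tfidf', TfidfVectorizer(max_features=10000)),
                     ('clf', estimator)])
    pipe.fit(train_texts, train_labels)
    print(name, 'accuracy:', pipe.score(test_texts, test_labels))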