├── .gitattributes ├── .gitignore └── text_classifier ├── load_files.py ├── netease_news_classifier.py ├── netease_predict_data.zip ├── netease_traning_data.zip └── text_classifier_evaluation.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /text_classifier/load_files.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import codecs 3 | from os import listdir 4 | import os 5 | from os.path import isdir, join 6 | 7 | import numpy as np 8 | from sklearn.datasets.base import Bunch 9 | from sklearn.utils import check_random_state 10 | 11 | __author__ = 'rockychi1001@gmail.com' 12 | 13 | 14 | def load_files(container_path, description=None, categories=None, 15 | shuffle=True, encoding='utf-8', random_state=0, 16 | key_path_index=-2): 17 | """Load text files with categories as subfolder names. 18 | 19 | Individual samples are assumed to be files stored a two levels folder 20 | structure such as the following: 21 | 22 | container_folder/ 23 | category_1_folder/ 24 | file_1.txt 25 | line 1 26 | line 2 27 | ... 28 | line n 29 | category_2_folder/ 30 | file_2.txt 31 | line 1 32 | line 2 33 | ... 34 | line n 35 | ... 36 | 37 | The folder names are used as supervised signal label names. The 38 | individual file names are not important. 39 | 40 | Parameters 41 | ---------- 42 | container_path : string or unicode 43 | Path to the main folder holding one subfolder per category 44 | 45 | description: string or unicode, optional (default=None) 46 | A paragraph describing the characteristic of the dataset: its source, 47 | reference, etc. 48 | 49 | categories : A collection of strings or None, optional (default=None) 50 | If None (default), load all the categories. 51 | If not None, list of category names to load (other categories ignored). 
52 | 
53 |     shuffle : bool, optional (default=True)
54 |         Whether or not to shuffle the data: might be important for models that
55 |         make the assumption that the samples are independent and identically
56 |         distributed (i.i.d.), such as stochastic gradient descent.
57 | 
58 |     random_state : int, RandomState instance or None, optional (default=0)
59 |         If int, random_state is the seed used by the random number generator;
60 |         If RandomState instance, random_state is the random number generator;
61 |         If None, the random number generator is the RandomState instance used
62 |         by `np.random`.
63 | 
64 |     key_path_index : int, optional (default=-2). Index of the path component (after splitting the file path on os.sep) used, together with the line number, as the key for each sample; if falsy, the full file path is used.
65 | 
66 |     Returns
67 |     -------
68 |     data : Bunch
69 |         Dictionary-like object, the interesting attributes are: either
70 |         data, the raw text data to learn, or 'filenames', the files
71 |         holding it, 'target', the classification labels (integer index),
72 |         'target_names', the meaning of the labels, and 'DESCR', the full
73 |         description of the dataset.
74 |     """
75 |     target = list()
76 |     target_names = list()
77 |     filenames = list()
78 |     filelines2data = dict()
79 | 
80 |     folders = [f for f in sorted(listdir(container_path))
81 |                if isdir(join(container_path, f))]
82 | 
83 |     if categories is not None:
84 |         folders = [f for f in folders if f in categories]
85 | 
86 |     for label, folder in enumerate(folders):
87 |         target_names.append(folder)
88 |         folder_path = join(container_path, folder)
89 |         documents = [join(folder_path, d)
90 |                      for d in sorted(listdir(folder_path))]
91 |         for training_doc in documents:
92 |             if key_path_index:
93 |                 category = training_doc.split(os.sep)[key_path_index]
94 |             else:
95 |                 category = training_doc
96 |             with codecs.open(training_doc, encoding=encoding) as td:
97 |                 for line_index, data in enumerate(td):
98 |                     key4file = category + str(line_index)  # unique only if each category folder holds a single file
99 |                     filelines2data[key4file] = data
100 |                     target.append(label)
101 |                     filenames.append(key4file)
102 | 
103 |     # convert to array for fancy indexing
104 |     filenames = np.array(filenames)
105 |     target = np.array(target)
106 | 
107 |     if shuffle:
108 |         random_state = check_random_state(random_state)
109 |         indices = np.arange(filenames.shape[0])
110 |         random_state.shuffle(indices)
111 |         filenames = filenames[indices]
112 |         target = target[indices]
113 | 
114 |     data = list()
115 |     for filename in filenames:
116 |         data.append(filelines2data.get(filename))
117 | 
118 |     return Bunch(data=data,
119 |                  filenames=filenames,
120 |                  target_names=target_names,
121 |                  target=target,
122 |                  DESCR=description)
123 | 
--------------------------------------------------------------------------------
/text_classifier/netease_news_classifier.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from sklearn.datasets import load_files
3 | from sklearn.cross_validation import train_test_split
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from sklearn.svm import LinearSVC
6 | import jieba
7 | 
8 | __author__ = 'rockychi1001@gmail.com'
9 | 
10 | 
11 | def jieba_tokenizer(x): return jieba.cut(x)
12 | # Load the training text (categorized data crawled from NetEase news) and split out a training set
13 | training_data = load_files(ur'D:\work_code\workspace\PythonML\crawler\netease', encoding='utf-8')
14 | x_train, _, y_train, _ = train_test_split(training_data.data, training_data.target, test_size=0.00000001)
15 | # Build the TF-IDF vector space model over the terms; note that the training samples go through fit_transform
16 | words_tfidf_vec = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer)
17 | X_train = words_tfidf_vec.fit_transform(x_train)
18 | # Train the classifier
19 | clf = LinearSVC().fit(X_train, y_train)
20 | # Load the text data to be predicted and split out a test set
21 | testing_data = load_files(ur'D:\work_code\workspace\PythonML\text_classify\netease_test\predict_data', encoding='utf-8')
22 | _, x_test, _, _ = train_test_split(testing_data.data, testing_data.target, test_size=0.99999999)
23 | # The test samples only go through transform
24 | X_test = words_tfidf_vec.transform(x_test)
25 | # Run the prediction
26 | pred = clf.predict(X_test)
27 | 
28 | for label in pred:
29 |     print u'predict label: %s ' % training_data.target_names[label]
30 | 
--------------------------------------------------------------------------------
/text_classifier/netease_predict_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rocky1001/Machine-Learning/f06de33181b48a97343ba75816850172bee282db/text_classifier/netease_predict_data.zip
--------------------------------------------------------------------------------
/text_classifier/netease_traning_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rocky1001/Machine-Learning/f06de33181b48a97343ba75816850172bee282db/text_classifier/netease_traning_data.zip
--------------------------------------------------------------------------------
/text_classifier/text_classifier_evaluation.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Author: Peter Prettenhofer
3 | #         Olivier Grisel
4 | #         Mathieu Blondel
5 | #         Lars Buitinck
6 | # License: BSD 3 clause
7 | 
8 | from __future__ import print_function
9 | 
10 | import logging
11 | from optparse import OptionParser
12 | import sys
13 | from time import time
14 | 
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | from sklearn.cross_validation import train_test_split
18 | from sklearn.datasets import load_files
19 | from sklearn.feature_extraction.text import TfidfVectorizer
20 | from sklearn.feature_extraction.text import HashingVectorizer
21 | from sklearn.feature_selection import SelectKBest, chi2
22 | from sklearn.linear_model import RidgeClassifier
23 | from sklearn.pipeline import Pipeline
24 | from sklearn.svm import LinearSVC
25 | from sklearn.linear_model import SGDClassifier
26 | from sklearn.linear_model import Perceptron
27 | from sklearn.linear_model import PassiveAggressiveClassifier
28 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB
29 | from sklearn.neighbors import KNeighborsClassifier
30 | from sklearn.neighbors import NearestCentroid
31 | from sklearn.ensemble import RandomForestClassifier
32 | from sklearn.utils.extmath import density
33 | from sklearn import metrics
34 | 
35 | import jieba
36 | jieba.load_userdict(ur'jieba_customer_dict.txt')
37 | 
38 | 
39 | def jieba_tokenizer(_x): return jieba.cut(_x)
40 | 
41 | # Display progress logs on stdout
42 | logging.basicConfig(level=logging.INFO,
43 |                     format='%(asctime)s %(levelname)s %(message)s')
44 | 
45 | 
46 | # parse commandline arguments
47 | op = OptionParser()
48 | op.add_option("--report",
49 |               action="store_true", dest="print_report",
50 |               help="Print a detailed classification report.")
51 | op.add_option("--chi2_select",
52 |               action="store", type="int", dest="select_chi2",
53 |               help="Select some number of features using a chi-squared test")
54 | op.add_option("--confusion_matrix",
55 |               action="store_true", dest="print_cm",
56 |               help="Print the confusion matrix.")
57 | op.add_option("--top10",
58 |               action="store_true", dest="print_top10",
59 |               help="Print ten most discriminative terms per class"
60 |                    " for every classifier.")
61 | op.add_option("--all_categories",
62 |               action="store_true", dest="all_categories",
63 |               help="Whether to use all categories or not.")
64 | op.add_option("--use_hashing",
65 |               action="store_true",
66 |               help="Use a hashing vectorizer.")
67 | op.add_option("--n_features",
68 |               action="store", type=int, default=2 ** 16,
69 |               help="n_features when using the hashing vectorizer.")
70 | op.add_option("--filtered",
71 |               action="store_true",
72 |               help="Remove newsgroup information that is easily overfit: "
73 |                    "headers, signatures, and quoting.")
74 | 
75 | (opts, args) = op.parse_args()
76 | if len(args) > 0:
77 |     op.error("this script takes no arguments.")
78 |     sys.exit(1)
79 | 
80 | print(__doc__)
81 | op.print_help()
82 | print()
83 | 
84 | 
85 | ###############################################################################
86 | 
87 | print("Loading autohome text data:")
88 | # Load the training text (per-category corpus data)
89 | data_train = load_files(ur'training_data_folder',
90 |                         encoding='utf-8'
91 |                         )
92 | 
93 | # Load the dataset and split it: 80% training, 20% test
94 | x_train, x_test, y_train, y_test \
95 |     = train_test_split(data_train.data, data_train.target, test_size=0.2)
96 | print('data loaded')
97 | 
98 | categories = data_train.target_names    # for case categories == None
99 | 
100 | 
101 | def size_mb(docs):
102 |     return sum(len(s.encode('utf-8')) for s in docs) / 1e6
103 | 
104 | 
105 | data_train_size_mb = size_mb(data_train.data)
106 | data_test_size_mb = size_mb(x_test)
107 | 
108 | print("%d documents - %0.3fMB (training set)" % (
109 |     len(data_train.data), data_train_size_mb))
110 | print("%d documents - %0.3fMB (test set)" % (
111 |     len(x_test), data_test_size_mb))
112 | print("%d categories" % len(categories))
113 | print()
114 | 
115 | print("Extracting features from the training data using a sparse vectorizer")
116 | t0 = time()
117 | if opts.use_hashing:
118 |     print('using hashing...')
119 |     vectorizer = HashingVectorizer(non_negative=True,
120 |                                    n_features=opts.n_features,
121 |                                    tokenizer=jieba_tokenizer
122 |                                    )
123 |     X_train = vectorizer.transform(x_train)
124 | else:
125 |     print('using tfidf...')
126 |     vectorizer = TfidfVectorizer(sublinear_tf=True,
127 |                                  max_df=0.5,
128 |                                  tokenizer=jieba_tokenizer)
129 |     X_train = vectorizer.fit_transform(x_train)
130 | duration = time() - t0
131 | print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
132 | print("n_samples: %d, n_features: %d" % X_train.shape)
133 | print()
134 | 
135 | print("Extracting features from the test data using the same vectorizer")
136 | t0 = time()
137 | X_test = vectorizer.transform(x_test)
138 | duration = time() - t0
139 | print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
140 | print("n_samples: %d, n_features: %d" % X_test.shape)
141 | print()
142 | 
143 | # mapping from integer feature name to original token string
144 | if opts.use_hashing:
145 |     feature_names = None
146 | else:
147 |     feature_names = vectorizer.get_feature_names()
148 | 
149 | if opts.select_chi2:
150 |     print("Extracting %d best features by a chi-squared test" %
151 |           opts.select_chi2)
152 |     t0 = time()
153 |     ch2 = SelectKBest(chi2, k=opts.select_chi2)
154 |     X_train = ch2.fit_transform(X_train, y_train)
155 |     X_test = ch2.transform(X_test)
156 |     if feature_names:
157 |         # keep selected feature names
158 |         feature_names = [feature_names[i] for i
159 |                          in ch2.get_support(indices=True)]
160 |     print("done in %fs" % (time() - t0))
161 |     print()
162 | 
163 | if feature_names:
164 |
feature_names = np.asarray(feature_names) 165 | 166 | 167 | def trim(s): 168 | """Trim string to fit on terminal (assuming 80-column display)""" 169 | return s if len(s) <= 80 else s[:77] + "..." 170 | 171 | 172 | ############################################################################### 173 | # Benchmark classifiers 174 | def benchmark(clf): 175 | print('_' * 80) 176 | print("Training: ") 177 | print(clf) 178 | t0 = time() 179 | clf.fit(X_train, y_train) 180 | train_time = time() - t0 181 | print("train time: %0.3fs" % train_time) 182 | 183 | t0 = time() 184 | pred = clf.predict(X_test) 185 | test_time = time() - t0 186 | print("test time: %0.3fs" % test_time) 187 | 188 | score = metrics.accuracy_score(y_test, pred) 189 | print("accuracy: %0.3f" % score) 190 | 191 | if hasattr(clf, 'coef_'): 192 | print("dimensionality: %d" % clf.coef_.shape[1]) 193 | print("density: %f" % density(clf.coef_)) 194 | 195 | if opts.print_top10 and feature_names is not None: 196 | print("top 10 keywords per class:") 197 | for i, category in enumerate(categories): 198 | top10 = np.argsort(clf.coef_[i])[-10:] 199 | print(trim("%s: %s" 200 | % (category, " ".join(feature_names[top10])))) 201 | print() 202 | 203 | if opts.print_report: 204 | print("classification report:") 205 | print(metrics.classification_report(y_test, pred, 206 | target_names=categories)) 207 | 208 | if opts.print_cm: 209 | print("confusion matrix:") 210 | print(metrics.confusion_matrix(y_test, pred)) 211 | 212 | print() 213 | clf_descr = str(clf).split('(')[0] 214 | return clf_descr, score, train_time, test_time 215 | 216 | 217 | results = [] 218 | for clf, name in ( 219 | (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), 220 | (Perceptron(n_iter=50), "Perceptron"), 221 | (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), 222 | (KNeighborsClassifier(n_neighbors=10), "kNN"), 223 | (RandomForestClassifier(n_estimators=100), "Random forest")): 224 | print('=' * 80) 225 | print(name) 226 | results.append(benchmark(clf)) 227 | 228 | for penalty in ["l2", "l1"]: 229 | print('=' * 80) 230 | print("%s penalty" % penalty.upper()) 231 | # Train Liblinear model 232 | results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, 233 | dual=False, tol=1e-3))) 234 | 235 | # Train SGD model 236 | results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, 237 | penalty=penalty))) 238 | 239 | # Train SGD with Elastic Net penalty 240 | print('=' * 80) 241 | print("Elastic-Net penalty") 242 | results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, 243 | penalty="elasticnet"))) 244 | 245 | # Train NearestCentroid without threshold 246 | print('=' * 80) 247 | print("NearestCentroid (aka Rocchio classifier)") 248 | results.append(benchmark(NearestCentroid())) 249 | 250 | # Train sparse Naive Bayes classifiers 251 | print('=' * 80) 252 | print("Naive Bayes") 253 | results.append(benchmark(MultinomialNB(alpha=.01))) 254 | results.append(benchmark(BernoulliNB(alpha=.01))) 255 | 256 | print('=' * 80) 257 | print("LinearSVC with L1-based feature selection") 258 | # The smaller C, the stronger the regularization. 259 | # The more regularization, the more sparsity. 
260 | results.append(benchmark(Pipeline([ 261 | ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)), 262 | ('classification', LinearSVC()) 263 | ]))) 264 | 265 | # make some plots 266 | 267 | indices = np.arange(len(results)) 268 | 269 | results = [[x[i] for x in results] for i in range(4)] 270 | 271 | clf_names, score, training_time, test_time = results 272 | training_time = np.array(training_time) / np.max(training_time) 273 | test_time = np.array(test_time) / np.max(test_time) 274 | 275 | plt.figure(figsize=(12, 8)) 276 | plt.title("Score") 277 | plt.barh(indices, score, .2, label="score", color='r') 278 | plt.barh(indices + .3, training_time, .2, label="training time", color='g') 279 | plt.barh(indices + .6, test_time, .2, label="test time", color='b') 280 | plt.yticks(()) 281 | plt.legend(loc='best') 282 | plt.subplots_adjust(left=.25) 283 | plt.subplots_adjust(top=.95) 284 | plt.subplots_adjust(bottom=.05) 285 | 286 | for i, c in zip(indices, clf_names): 287 | plt.text(-.3, i, c) 288 | 289 | plt.show() 290 | --------------------------------------------------------------------------------
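
A minimal usage sketch (not part of the repository) showing how the line-per-sample loader in text_classifier/load_files.py could be combined with the jieba tokenizer and a LinearSVC, with a proper held-out evaluation split. The folder name netease_training_data_folder is a placeholder for wherever netease_traning_data.zip has been extracted (one subfolder per category, one sample per line), and the import assumes the script sits next to load_files.py:

# coding=utf-8
# Hypothetical usage sketch: the data path and script location are assumptions.
import jieba
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

from load_files import load_files  # the custom line-per-sample loader in this repo


def jieba_tokenizer(text):
    return jieba.cut(text)

# Load one sample per line, labelled by the containing folder name.
news = load_files(u'netease_training_data_folder', encoding='utf-8')

# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.2)

# Fit the vectorizer on the training split only, then reuse it for the test split.
vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

clf = LinearSVC().fit(X_train, y_train)
pred = clf.predict(X_test)
print(metrics.classification_report(y_test, pred, target_names=news.target_names))

Fitting the vectorizer only on the training split and reusing it via transform on the held-out split mirrors the fit_transform/transform pattern used in netease_news_classifier.py, but the held-out split makes it possible to report per-category precision and recall instead of only printing predicted labels.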