├── .gitattributes ├── .gitignore └── text_classifier ├── load_files.py ├── netease_news_classifier.py ├── netease_predict_data.zip ├── netease_traning_data.zip └── text_classifier_evaluation.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /text_classifier/load_files.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import codecs 3 | from os import listdir 4 | import os 5 | from os.path import isdir, join 6 | 7 | import numpy as np 8 | from sklearn.datasets.base import Bunch 9 | from sklearn.utils import check_random_state 10 | 11 | __author__ = 'rockychi1001@gmail.com' 12 | 13 | 14 | def load_files(container_path, description=None, categories=None, 15 | shuffle=True, encoding='utf-8', random_state=0, 16 | key_path_index=-2): 17 | """Load text files with categories as subfolder names. 18 | 19 | Individual samples are assumed to be files stored a two levels folder 20 | structure such as the following: 21 | 22 | container_folder/ 23 | category_1_folder/ 24 | file_1.txt 25 | line 1 26 | line 2 27 | ... 28 | line n 29 | category_2_folder/ 30 | file_2.txt 31 | line 1 32 | line 2 33 | ... 34 | line n 35 | ... 36 | 37 | The folder names are used as supervised signal label names. The 38 | individual file names are not important. 39 | 40 | Parameters 41 | ---------- 42 | container_path : string or unicode 43 | Path to the main folder holding one subfolder per category 44 | 45 | description: string or unicode, optional (default=None) 46 | A paragraph describing the characteristic of the dataset: its source, 47 | reference, etc. 48 | 49 | categories : A collection of strings or None, optional (default=None) 50 | If None (default), load all the categories. 51 | If not None, list of category names to load (other categories ignored). 
52 | 
53 |     shuffle : bool, optional (default=True)
54 |         Whether or not to shuffle the data: might be important for models that
55 |         make the assumption that the samples are independent and identically
56 |         distributed (i.i.d.), such as stochastic gradient descent.
57 | 
58 |     random_state : int, RandomState instance or None, optional (default=0)
59 |         If int, random_state is the seed used by the random number generator;
60 |         If RandomState instance, random_state is the random number generator;
61 |         If None, the random number generator is the RandomState instance used
62 |         by `np.random`.
63 | 
64 |     key_path_index : int, optional (default=-2). Index of the path component (after splitting the file path on os.sep) used, together with the line number, as the key for each sample; if falsy, the full file path is used.
65 | 
66 |     Returns
67 |     -------
68 |     data : Bunch
69 |         Dictionary-like object, the interesting attributes are: either
70 |         data, the raw text data to learn, or 'filenames', the files
71 |         holding it, 'target', the classification labels (integer index),
72 |         'target_names', the meaning of the labels, and 'DESCR', the full
73 |         description of the dataset.
74 |     """
75 |     target = list()
76 |     target_names = list()
77 |     filenames = list()
78 |     filelines2data = dict()
79 | 
80 |     folders = [f for f in sorted(listdir(container_path))
81 |                if isdir(join(container_path, f))]
82 | 
83 |     if categories is not None:
84 |         folders = [f for f in folders if f in categories]
85 | 
86 |     for label, folder in enumerate(folders):
87 |         target_names.append(folder)
88 |         folder_path = join(container_path, folder)
89 |         documents = [join(folder_path, d)
90 |                      for d in sorted(listdir(folder_path))]
91 |         for training_doc in documents:
92 |             if key_path_index:
93 |                 category = training_doc.split(os.sep)[key_path_index]
94 |             else:
95 |                 category = training_doc
96 |             with codecs.open(training_doc, encoding=encoding) as td:
97 |                 for line_index, data in enumerate(td):
98 |                     key4file = category + str(line_index)  # unique only if each category folder holds a single file
99 |                     filelines2data[key4file] = data
100 |                     target.append(label)
101 |                     filenames.append(key4file)
102 | 
103 |     # convert to array for fancy indexing
104 |     filenames = np.array(filenames)
105 |     target = np.array(target)
106 | 
107 |     if shuffle:
108 |         random_state = check_random_state(random_state)
109 |         indices = np.arange(filenames.shape[0])
110 |         random_state.shuffle(indices)
111 |         filenames = filenames[indices]
112 |         target = target[indices]
113 | 
114 |     data = list()
115 |     for filename in filenames:
116 |         data.append(filelines2data.get(filename))
117 | 
118 |     return Bunch(data=data,
119 |                  filenames=filenames,
120 |                  target_names=target_names,
121 |                  target=target,
122 |                  DESCR=description)
123 | 
--------------------------------------------------------------------------------
/text_classifier/netease_news_classifier.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from sklearn.datasets import load_files
3 | from sklearn.cross_validation import train_test_split
4 | from sklearn.feature_extraction.text import TfidfVectorizer
5 | from sklearn.svm import LinearSVC
6 | import jieba
7 | 
8 | __author__ = 'rockychi1001@gmail.com'
9 | 
10 | 
11 | def jieba_tokenizer(x): return jieba.cut(x)
12 | # Load the training text (categorized data crawled from NetEase news) and split out a training set
13 | training_data = load_files(ur'D:\work_code\workspace\PythonML\crawler\netease', encoding='utf-8')
14 | x_train, _, y_train, _ = train_test_split(training_data.data, training_data.target, test_size=0.00000001)
15 | # Build the TF-IDF vector space model over the terms; note that the training samples go through fit_transform
16 | words_tfidf_vec = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer)
17 | X_train = words_tfidf_vec.fit_transform(x_train)
18 | # Train the classifier
19 | clf = LinearSVC().fit(X_train, y_train)
20 | # Load the text data to be predicted and split out a test set
21 | testing_data = load_files(ur'D:\work_code\workspace\PythonML\text_classify\netease_test\predict_data', encoding='utf-8')
22 | _, x_test, _, _ = train_test_split(testing_data.data, testing_data.target, test_size=0.99999999)
23 | # The test samples only go through transform
24 | X_test = words_tfidf_vec.transform(x_test)
25 | # Run the prediction
26 | pred = clf.predict(X_test)
27 | 
28 | for label in pred:
29 |     print u'predict label: %s ' % training_data.target_names[label]
30 | 
--------------------------------------------------------------------------------
/text_classifier/netease_predict_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rocky1001/Machine-Learning/f06de33181b48a97343ba75816850172bee282db/text_classifier/netease_predict_data.zip
--------------------------------------------------------------------------------
/text_classifier/netease_traning_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rocky1001/Machine-Learning/f06de33181b48a97343ba75816850172bee282db/text_classifier/netease_traning_data.zip
--------------------------------------------------------------------------------
/text_classifier/text_classifier_evaluation.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Author: Peter Prettenhofer
3 | #         Olivier Grisel
4 | #         Mathieu Blondel
5 | #         Lars Buitinck
6 | # License: BSD 3 clause
7 | 
8 | from __future__ import print_function
9 | 
10 | import logging
11 | from optparse import OptionParser
12 | import sys
13 | from time import time
14 | 
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | from sklearn.cross_validation import train_test_split
18 | from sklearn.datasets import load_files
19 | from sklearn.feature_extraction.text import TfidfVectorizer
20 | from sklearn.feature_extraction.text import HashingVectorizer
21 | from sklearn.feature_selection import SelectKBest, chi2
22 | from sklearn.linear_model import RidgeClassifier
23 | from sklearn.pipeline import Pipeline
24 | from sklearn.svm import LinearSVC
25 | from sklearn.linear_model import SGDClassifier
26 | from sklearn.linear_model import Perceptron
27 | from sklearn.linear_model import PassiveAggressiveClassifier
28 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB
29 | from sklearn.neighbors import KNeighborsClassifier
30 | from sklearn.neighbors import NearestCentroid
31 | from sklearn.ensemble import RandomForestClassifier
32 | from sklearn.utils.extmath import density
33 | from sklearn import metrics
34 | 
35 | import jieba
36 | jieba.load_userdict(ur'jieba_customer_dict.txt')
37 | 
38 | 
39 | def jieba_tokenizer(_x): return jieba.cut(_x)
40 | 
41 | # Display progress logs on stdout
42 | logging.basicConfig(level=logging.INFO,
43 |                     format='%(asctime)s %(levelname)s %(message)s')
44 | 
45 | 
46 | # parse commandline arguments
47 | op = OptionParser()
48 | op.add_option("--report",
49 |               action="store_true", dest="print_report",
50 |               help="Print a detailed classification report.")
51 | op.add_option("--chi2_select",
52 |               action="store", type="int", dest="select_chi2",
53 |               help="Select some number of features using a chi-squared test")
54 | op.add_option("--confusion_matrix",
55 |               action="store_true", dest="print_cm",
56 |               help="Print the confusion matrix.")
57 | op.add_option("--top10",
58 |               action="store_true", dest="print_top10",
59 |               help="Print ten most discriminative terms per class"
60 |                    " for every classifier.")
61 | op.add_option("--all_categories",
62 |               action="store_true", dest="all_categories",
63 |               help="Whether to use all categories or not.")
64 | op.add_option("--use_hashing",
65 |               action="store_true",
66 |               help="Use a hashing vectorizer.")
67 | op.add_option("--n_features",
68 |               action="store", type=int, default=2 ** 16,
69 |               help="n_features when using the hashing vectorizer.")
70 | op.add_option("--filtered",
71 |               action="store_true",
72 |               help="Remove newsgroup information that is easily overfit: "
73 |                    "headers, signatures, and quoting.")
74 | 
75 | (opts, args) = op.parse_args()
76 | if len(args) > 0:
77 |     op.error("this script takes no arguments.")
78 |     sys.exit(1)
79 | 
80 | print(__doc__)
81 | op.print_help()
82 | print()
83 | 
84 | 
85 | ###############################################################################
86 | 
87 | print("Loading autohome text data:")
88 | # Load the training text (per-category corpus data)
89 | data_train = load_files(ur'training_data_folder',
90 |                         encoding='utf-8'
91 |                         )
92 | 
93 | # Load the dataset and split it: 80% training, 20% test
94 | x_train, x_test, y_train, y_test \
95 |     = train_test_split(data_train.data, data_train.target, test_size=0.2)
96 | print('data loaded')
97 | 
98 | categories = data_train.target_names    # for case categories == None
99 | 
100 | 
101 | def size_mb(docs):
102 |     return sum(len(s.encode('utf-8')) for s in docs) / 1e6
103 | 
104 | 
105 | data_train_size_mb = size_mb(data_train.data)
106 | data_test_size_mb = size_mb(x_test)
107 | 
108 | print("%d documents - %0.3fMB (training set)" % (
109 |     len(data_train.data), data_train_size_mb))
110 | print("%d documents - %0.3fMB (test set)" % (
111 |     len(x_test), data_test_size_mb))
112 | print("%d categories" % len(categories))
113 | print()
114 | 
115 | print("Extracting features from the training data using a sparse vectorizer")
116 | t0 = time()
117 | if opts.use_hashing:
118 |     print('using hashing...')
119 |     vectorizer = HashingVectorizer(non_negative=True,
120 |                                    n_features=opts.n_features,
121 |                                    tokenizer=jieba_tokenizer
122 |                                    )
123 |     X_train = vectorizer.transform(x_train)
124 | else:
125 |     print('using tfidf...')
126 |     vectorizer = TfidfVectorizer(sublinear_tf=True,
127 |                                  max_df=0.5,
128 |                                  tokenizer=jieba_tokenizer)
129 |     X_train = vectorizer.fit_transform(x_train)
130 | duration = time() - t0
131 | print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
132 | print("n_samples: %d, n_features: %d" % X_train.shape)
133 | print()
134 | 
135 | print("Extracting features from the test data using the same vectorizer")
136 | t0 = time()
137 | X_test = vectorizer.transform(x_test)
138 | duration = time() - t0
139 | print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
140 | print("n_samples: %d, n_features: %d" % X_test.shape)
141 | print()
142 | 
143 | # mapping from integer feature name to original token string
144 | if opts.use_hashing:
145 |     feature_names = None
146 | else:
147 |     feature_names = vectorizer.get_feature_names()
148 | 
149 | if opts.select_chi2:
150 |     print("Extracting %d best features by a chi-squared test" %
151 |           opts.select_chi2)
152 |     t0 = time()
153 |     ch2 = SelectKBest(chi2, k=opts.select_chi2)
154 |     X_train = ch2.fit_transform(X_train, y_train)
155 |     X_test = ch2.transform(X_test)
156 |     if feature_names:
157 |         # keep selected feature names
158 |         feature_names = [feature_names[i] for i
159 |                          in ch2.get_support(indices=True)]
160 |     print("done in %fs" % (time() - t0))
161 |     print()
162 | 
163 | if feature_names:
164 |
feature_names = np.asarray(feature_names) 165 | 166 | 167 | def trim(s): 168 | """Trim string to fit on terminal (assuming 80-column display)""" 169 | return s if len(s) <= 80 else s[:77] + "..." 170 | 171 | 172 | ############################################################################### 173 | # Benchmark classifiers 174 | def benchmark(clf): 175 | print('_' * 80) 176 | print("Training: ") 177 | print(clf) 178 | t0 = time() 179 | clf.fit(X_train, y_train) 180 | train_time = time() - t0 181 | print("train time: %0.3fs" % train_time) 182 | 183 | t0 = time() 184 | pred = clf.predict(X_test) 185 | test_time = time() - t0 186 | print("test time: %0.3fs" % test_time) 187 | 188 | score = metrics.accuracy_score(y_test, pred) 189 | print("accuracy: %0.3f" % score) 190 | 191 | if hasattr(clf, 'coef_'): 192 | print("dimensionality: %d" % clf.coef_.shape[1]) 193 | print("density: %f" % density(clf.coef_)) 194 | 195 | if opts.print_top10 and feature_names is not None: 196 | print("top 10 keywords per class:") 197 | for i, category in enumerate(categories): 198 | top10 = np.argsort(clf.coef_[i])[-10:] 199 | print(trim("%s: %s" 200 | % (category, " ".join(feature_names[top10])))) 201 | print() 202 | 203 | if opts.print_report: 204 | print("classification report:") 205 | print(metrics.classification_report(y_test, pred, 206 | target_names=categories)) 207 | 208 | if opts.print_cm: 209 | print("confusion matrix:") 210 | print(metrics.confusion_matrix(y_test, pred)) 211 | 212 | print() 213 | clf_descr = str(clf).split('(')[0] 214 | return clf_descr, score, train_time, test_time 215 | 216 | 217 | results = [] 218 | for clf, name in ( 219 | (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), 220 | (Perceptron(n_iter=50), "Perceptron"), 221 | (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), 222 | (KNeighborsClassifier(n_neighbors=10), "kNN"), 223 | (RandomForestClassifier(n_estimators=100), "Random forest")): 224 | print('=' * 80) 225 | print(name) 226 | results.append(benchmark(clf)) 227 | 228 | for penalty in ["l2", "l1"]: 229 | print('=' * 80) 230 | print("%s penalty" % penalty.upper()) 231 | # Train Liblinear model 232 | results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, 233 | dual=False, tol=1e-3))) 234 | 235 | # Train SGD model 236 | results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, 237 | penalty=penalty))) 238 | 239 | # Train SGD with Elastic Net penalty 240 | print('=' * 80) 241 | print("Elastic-Net penalty") 242 | results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, 243 | penalty="elasticnet"))) 244 | 245 | # Train NearestCentroid without threshold 246 | print('=' * 80) 247 | print("NearestCentroid (aka Rocchio classifier)") 248 | results.append(benchmark(NearestCentroid())) 249 | 250 | # Train sparse Naive Bayes classifiers 251 | print('=' * 80) 252 | print("Naive Bayes") 253 | results.append(benchmark(MultinomialNB(alpha=.01))) 254 | results.append(benchmark(BernoulliNB(alpha=.01))) 255 | 256 | print('=' * 80) 257 | print("LinearSVC with L1-based feature selection") 258 | # The smaller C, the stronger the regularization. 259 | # The more regularization, the more sparsity. 
260 | results.append(benchmark(Pipeline([ 261 | ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)), 262 | ('classification', LinearSVC()) 263 | ]))) 264 | 265 | # make some plots 266 | 267 | indices = np.arange(len(results)) 268 | 269 | results = [[x[i] for x in results] for i in range(4)] 270 | 271 | clf_names, score, training_time, test_time = results 272 | training_time = np.array(training_time) / np.max(training_time) 273 | test_time = np.array(test_time) / np.max(test_time) 274 | 275 | plt.figure(figsize=(12, 8)) 276 | plt.title("Score") 277 | plt.barh(indices, score, .2, label="score", color='r') 278 | plt.barh(indices + .3, training_time, .2, label="training time", color='g') 279 | plt.barh(indices + .6, test_time, .2, label="test time", color='b') 280 | plt.yticks(()) 281 | plt.legend(loc='best') 282 | plt.subplots_adjust(left=.25) 283 | plt.subplots_adjust(top=.95) 284 | plt.subplots_adjust(bottom=.05) 285 | 286 | for i, c in zip(indices, clf_names): 287 | plt.text(-.3, i, c) 288 | 289 | plt.show() 290 | --------------------------------------------------------------------------------
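
A minimal usage sketch (not part of the repository) showing how the line-per-sample loader in text_classifier/load_files.py could be combined with the jieba tokenizer and a LinearSVC, with a proper held-out evaluation split. The folder name netease_training_data_folder is a placeholder for wherever netease_traning_data.zip has been extracted (one subfolder per category, one sample per line), and the import assumes the script sits next to load_files.py:

# coding=utf-8
# Hypothetical usage sketch: the data path and script location are assumptions.
import jieba
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

from load_files import load_files  # the custom line-per-sample loader in this repo


def jieba_tokenizer(text):
    return jieba.cut(text)

# Load one sample per line, labelled by the containing folder name.
news = load_files(u'netease_training_data_folder', encoding='utf-8')

# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.2)

# Fit the vectorizer on the training split only, then reuse it for the test split.
vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

clf = LinearSVC().fit(X_train, y_train)
pred = clf.predict(X_test)
print(metrics.classification_report(y_test, pred, target_names=news.target_names))

Fitting the vectorizer only on the training split and reusing it via transform on the held-out split mirrors the fit_transform/transform pattern used in netease_news_classifier.py, but the held-out split makes it possible to report per-category precision and recall instead of only printing predicted labels.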