├── c1_knn
    ├── __init__.py
    ├── metrics.py
    ├── model_selection.py
    ├── preprocessing.py
    ├── kNN.py
    ├── 02_kNN_in_scikit_learn.ipynb
    ├── knn.md
    ├── 05_Hyper_Parameters.ipynb
    ├── 03_Train_Test_Split.ipynb
    ├── 04_Hyper_Parameter_kNN.ipynb
    ├── 08_Scaler_in_Scikit_Learn.ipynb
    └── 01_kNN_Basics.ipynb
├── c4_pca
    ├── __init__.py
    └── 07_MNIST.ipynb
├── c8_svm
    └── __init__.py
├── playML
    ├── __init__.py
    ├── model_selection.py
    ├── plot_utils.py
    ├── PCA.py
    ├── logistic_regression.py
    ├── metrics.py
    └── linear_regression.py
├── c2_linear_regression
    ├── __init__.py
    ├── linear_regression.py
    ├── simple_linear_regression.py
    ├── 08_Linear_Regression.ipynb
    ├── 10_More_About_Linear_Regression.ipynb
    └── 09_Regression_in_scikit_learn.ipynb
├── c3_gradient_descent
    ├── __init__.py
    ├── 06_Stochastic_Gradient_Descent.ipynb
    ├── 08_Gradient_Debugging.ipynb
    ├── 05_Vectorize_Gradient_Descent.ipynb
    └── 07_SGD_in_scikit_learn.ipynb
├── c6_logistic_regression
    ├── __init__.py
    ├── plot_utils.py
    ├── 01_Sigmoid.ipynb
    └── 04_implement_logistic_regression.ipynb
├── c5_polynomial_regression
    ├── __init__.py
    └── 06_Validation_and_Cross_Validation.ipynb
├── c7_classification_performance_measures
    ├── __init__.py
    └── 03_implement_confusion_matrix_precision_and_recall.ipynb
├── README.md
├── c0_overview
    └── 数据相关.pdf
├── LICENSE
└── .gitignore


/c1_knn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c4_pca/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c8_svm/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/playML/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c2_linear_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c3_gradient_descent/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c6_logistic_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c5_polynomial_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/c7_classification_performance_measures/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MachineLearningClassicAlgorithm
2 | 慕课网《Python 3 入门机器学习经典算法与应用》的代码和笔记
3 | 


--------------------------------------------------------------------------------
/c0_overview/数据相关.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sea-Monster/MachineLearningClassicAlgorithm/HEAD/c0_overview/数据相关.pdf


--------------------------------------------------------------------------------
/c1_knn/metrics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 度量
3 | import numpy as np
4 | 
5 | 
def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    Both arguments may be NumPy arrays or plain sequences; they are coerced
    with ``np.asarray`` so the comparison is always element-wise.

    :param y_true: ground-truth labels
    :param y_predict: predicted labels, same length as ``y_true``
    :return: number of matching positions divided by ``len(y_true)``
    :raises AssertionError: if the two inputs differ in length
    """
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    assert np.shape(y_true)[0] == np.shape(y_predict)[0], 'the size of y_true must be equal to the size of y_predict'
    # np.sum counts the matches in native code; the builtin sum() iterated
    # element by element and raised TypeError when two lists compared to a bool.
    return np.sum(y_predict == y_true) / len(y_true)
9 | 


--------------------------------------------------------------------------------
/c6_logistic_regression/plot_utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | def plot_decision_boundary(model, axis):
 6 |     """绘制不规则决策边界"""
 7 |     x0, x1 = np.meshgrid(
 8 |         np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(1, -1),
 9 |         np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(1, -1)
10 |     )
11 |     X_new = np.c_[x0.ravel(), x1.ravel()]
12 | 
13 |     y_predict = model.predict(X_new)
14 |     zz = y_predict.reshape(x0.shape)
15 | 
16 |     from matplotlib.colors import ListedColormap
17 |     custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
18 | 
19 |     plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
20 | 


--------------------------------------------------------------------------------
/c1_knn/model_selection.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | 
 4 | 
 5 | def train_test_split(X, y, test_ratio=0.2, seed=None):
 6 |     """
 7 |     将数据X和y按照test_ratio分割成X_train，X_test,y_train，y_test
 8 |     :param X:
 9 |     :param y:
10 |     :param test_ratio:
11 |     :param seed:
12 |     :return:
13 |     """
14 |     if seed:
15 |         np.random.seed(seed)
16 | 
17 |     shuffled_indexes = np.random.permutation(np.shape(X)[0])
18 | 
19 |     test_size = int(np.shape(X)[0] * test_ratio)
20 |     test_indexes = shuffled_indexes[:test_size]
21 |     train_indexes = shuffled_indexes[test_size:]
22 | 
23 |     X_train = X[train_indexes]
24 |     y_train = y[train_indexes]
25 | 
26 |     X_test = X[test_indexes]
27 |     y_test = y[test_indexes]
28 | 
29 |     return X_train, X_test, y_train, y_test
30 | 


--------------------------------------------------------------------------------
/playML/model_selection.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | 
 4 | 
 5 | def train_test_split(X, y, test_ratio=0.2, seed=None):
 6 |     """
 7 |     将数据X和y按照test_ratio分割成X_train，X_test,y_train，y_test
 8 |     :param X:
 9 |     :param y:
10 |     :param test_ratio:
11 |     :param seed:
12 |     :return:
13 |     """
14 |     if seed:
15 |         np.random.seed(seed)
16 | 
17 |     shuffled_indexes = np.random.permutation(np.shape(X)[0])
18 | 
19 |     test_size = int(np.shape(X)[0] * test_ratio)
20 |     test_indexes = shuffled_indexes[:test_size]
21 |     train_indexes = shuffled_indexes[test_size:]
22 | 
23 |     X_train = X[train_indexes]
24 |     y_train = y[train_indexes]
25 | 
26 |     X_test = X[test_indexes]
27 |     y_test = y[test_indexes]
28 | 
29 |     return X_train, X_test, y_train, y_test
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Sea-Monster
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/c2_linear_regression/linear_regression.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | from playML.metrics import r2_score
 4 | 
 5 | 
 6 | class LinearRegression(object):
 7 | 
 8 |     def __init__(self):
 9 |         self.coef_ = None   # 系数
10 |         self.interception_ = None   # 截距
11 | 
12 |         self._theta = None  # 为计算方便，将来会把系数和截距合为一个θ
13 | 
14 |     def fit_normal(self, X_train:np.ndarray, y_train:np.ndarray):
15 |         assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'
16 | 
17 |         # 特征矩阵的最左列加上一个行数等于特征矩阵的由1组成的列向量
18 |         X_b = np.hstack([np.ones(shape=(X_train.shape[0], 1)), X_train])
19 | 
20 |         # 正规方程解求θ
21 |         self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
22 |         self.interception_ = self._theta[0]
23 |         self.coef_ = self._theta[1:]
24 | 
25 |         return self
26 | 
27 |     def predict(self, X_predict:np.ndarray):
28 |         assert self.interception_ is not None and self.coef_ is not None, '评估前必须拟合'
29 |         assert np.shape(X_predict)[1] == len(self.coef_), '要预测的样本的特征数必须与训练的样本的特征数相等'
30 | 
31 |         X_b = np.hstack([np.ones(shape=(X_predict.shape[0], 1)), X_predict])
32 |         return X_b.dot(self._theta)
33 | 
34 |     def score(self, X_test, y_test):
35 |         y_predict = self.predict(X_test)
36 |         return r2_score(y_test,y_predict)
37 | 
38 |     def __repr__(self):
39 |         return 'LinearRegression()'


--------------------------------------------------------------------------------
/playML/plot_utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | def plot_decision_boundary(model, axis):
 6 |     """绘制不规则决策边界"""
 7 |     x0, x1 = np.meshgrid(
 8 |         np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(1, -1),
 9 |         np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(1, -1)
10 |     )
11 |     X_new = np.c_[x0.ravel(), x1.ravel()]
12 | 
13 |     y_predict = model.predict(X_new)
14 |     zz = y_predict.reshape(x0.shape)
15 | 
16 |     from matplotlib.colors import ListedColormap
17 |     custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
18 | 
19 |     plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
20 | 
def plot_svc_decision_boundary(model, axis):
    """Plot a linear classifier's decision regions together with its two margin lines.

    :param model: fitted model exposing ``predict``, ``coef_`` and ``intercept_``
    :param axis: indexable ``[x_min, x_max, y_min, y_max]``
    """
    plot_decision_boundary(model, axis)
    weights = model.coef_[0]
    bias = model.intercept_[0]

    # The decision boundary is w0*x0 + w1*x1 + b = 0, i.e.
    # x1 = -w0/w1 * x0 - b/w1; the margins shift the right-hand side to +1 / -1.
    xs = np.linspace(axis[0], axis[1], 200)
    slope = -weights[0]/weights[1]
    offset = bias/weights[1]
    margin = 1/weights[1]

    upper = slope*xs - offset + margin    # w0*x0 + w1*x1 + b = 1
    lower = slope*xs - offset - margin    # w0*x0 + w1*x1 + b = -1

    # Keep only the points that fall inside the vertical range of the plot.
    y_low, y_high = axis[2], axis[3]
    upper_visible = (upper >= y_low) & (upper <= y_high)
    lower_visible = (lower >= y_low) & (lower <= y_high)

    plt.plot(xs[upper_visible], upper[upper_visible], color='black')
    plt.plot(xs[lower_visible], lower[lower_visible], color='black')


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/c1_knn/preprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | 
 4 | 
 5 | class StandardScaler(object):
 6 |     """
 7 |     照猫画虎的仿照scikit-learn实现一个Standard Scaler
 8 |     """
 9 |     def __init__(self):
10 |         self.mean_ = None
11 |         self.scale_ = None
12 |         
13 |     def fit(self, X:np.ndarray):
14 |         """
15 |         根据训练数据集X获得数据的均值和标准差
16 |         (暂时只处理2维的数据)
17 |         :param X: 
18 |         :return: 
19 |         """
20 |         assert X.ndim == 2, 'The dimension of X must be 2'
21 |         
22 |         self.mean_ = np.array([np.mean(X[:,i]) for i in range(X.shape[1])])
23 |         self.scale_ = np.array([np.std(X[:,i]) for i in range(X.shape[1])])
24 |         
25 |         return self
26 |     
27 |     def transform(self, X):
28 |         """
29 |         将X根据这个StandardScaler进行0均值标准化处理
30 |         :param X: 
31 |         :return: 
32 |         """
33 |         assert X.ndim == 2, 'The dimension of X must be 2'
34 |         assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
35 |         X_standard = (X - self.mean_) / self.scale_
36 |         return X_standard
37 | 
38 |     def transform_standard(self, X):
39 |         """
40 |         将X根据这个StandardScaler进行0均值标准化处理(老师教学版)
41 |         :param X:
42 |         :return:
43 |         """
44 |         assert X.ndim == 2, 'The dimension of X must be 2'
45 |         assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
46 |         X_res = np.empty(shape=X.shape, dtype=float)
47 |         for col in range(X.shape[1]):
48 |             X_res[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col]
49 |         return X_res
50 | 
if __name__ == '__main__':
    # Demo: fit the scaler on an iris training split, then check that the
    # vectorised and the column-by-column transforms print the same values.
    from sklearn import datasets
    # Import from the public package path; `_split` is a private module.
    from sklearn.model_selection import train_test_split
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=666)

    ss = StandardScaler()
    ss.fit(X_train)

    X_standard = ss.transform(X)
    print(ss.transform_standard(X))
    print('-'*100)
    print(X_standard)
67 | 


--------------------------------------------------------------------------------
/playML/PCA.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | 
 4 | 
 5 | class PCA(object):
 6 |     def __init__(self, n_components):
 7 |         assert n_components >= 1, "n_components必须大于等于1"
 8 |         self.n_components = n_components
 9 |         self.components_ = None
10 | 
11 |     def fit(self, X, eta=0.01, n_iters=1e4):
12 |         """
13 |         获得数据集X的前n个主成分
14 |         :param X:
15 |         :param eta:
16 |         :param n_iters:
17 |         :return:
18 |         """
19 |         assert self.n_components <= np.shape(X)[1], 'n_components must not be greater than the feature number of X'
20 | 
21 |         def demean(X):
22 |             return X - np.mean(X, axis=0)
23 | 
24 |         def f(w, X):
25 |             """效用函数"""
26 |             return np.sum((X.dot(w) ** 2))/len(X)
27 | 
28 |         def derivative_f(w, X):
29 |             """求梯度"""
30 |             return X.T.dot(X.dot(w))*2./len(X)
31 | 
32 |         def direction(w):
33 |             return w/np.linalg.norm(w)
34 | 
35 |         def first_component(X, initial_w, eta=0.01, n_iters=1e4, epsilon=1e-8):
36 |             w = direction(initial_w)
37 |             cur_iter = 0
38 | 
39 |             while cur_iter < n_iters:
40 |                 gradient = derivative_f(w, X)
41 |                 last_w = w
42 |                 w = w + eta * gradient
43 |                 w = direction(w)
44 |                 if (abs(f(w, X) - f(last_w, X)) < epsilon):
45 |                     break
46 |                 cur_iter += 1
47 |             return w
48 | 
49 |         X_pca = demean(X)
50 |         self.components_ = np.empty(shape=(self.n_components, np.shape(X)[1]))
51 |         for i in range(self.n_components):
52 |             initial_w = np.random.random(X_pca.shape[1])
53 |             w = first_component(X_pca, initial_w, eta, n_iters)
54 |             self.components_[i,:] = w
55 |             X_pca = X_pca - X_pca.dot(w).reshape(-1,1)*w
56 |         return self
57 | 
58 |     def transform(self, X):
59 |         """将给定的X，映射到各个主成分分量中"""
60 |         assert np.shape(X)[1] == np.shape(self.components_)[1]
61 |         return X.dot(self.components_.T)
62 | 
63 |     def inverse_transform(self, X):
64 |         """将给定的X，反向映射回原来的特征空间"""
65 |         assert np.shape(X)[1] == np.shape(self.components_)[0]
66 |         return X.dot(self.components_)
67 | 
68 |     def __repr__(self):
69 |         return 'PCA(n_components=%d)' %self.n_components
70 | 


--------------------------------------------------------------------------------
/playML/logistic_regression.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | from .metrics import accuracy_score
 4 | 
 5 | 
 6 | class LogisticRegression(object):
 7 |     def __init__(self):
 8 |         """初始化逻辑回归模型"""
 9 |         self.coef_ = None
10 |         self.intercept_ = None
11 |         self._theta = None
12 | 
13 |     def _sigmoid(self, t):
14 |         return 1. / (1. + np.exp(-t))
15 | 
16 |     def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
17 |         """根据训练数据集X_train，y_train，使用梯度下降法训练逻辑回归模型"""
18 |         assert X_train.shape[0] == y_train.shape[0], '训练集与结果集的样本数必须一致'
19 | 
20 |         def J(theta, X_b, y):
21 |             """定义损失函数"""
22 |             y_hat = self._sigmoid(X_b.dot(theta))
23 |             try:
24 |                 return np.sum(np.dot(y, np.log(y_hat)) + np.dot((1 - y), np.log(1 - y_hat))) / -len(y)
25 |             except:
26 |                 return float('inf')
27 | 
28 |         def derivative_J(theta, X_b, y):
29 |             """求逻辑回归的梯度"""
30 |             return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)
31 | 
32 |         def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
33 |             """梯度下降法求θ"""
34 |             theta = initial_theta
35 |             iters = 0
36 |             while iters < n_iters:
37 |                 gradient = derivative_J(theta, X_b, y)
38 |                 last_theta = theta
39 |                 theta = theta - eta * gradient
40 | 
41 |                 if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
42 |                     break
43 |                 iters += 1
44 |             return theta
45 | 
46 |         X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
47 |         initial_theta = np.zeros(X_b.shape[1])  # 初始的θ向量都是0
48 |         self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
49 |         self.intercept_ = self._theta[0]
50 |         self.coef_ = self._theta[1:]
51 | 
52 |         return self
53 | 
54 |     def predict_proba(self, X_predict):
55 |         """给定待预测数据集X_predict，返回表示X_predict的结果概率向量"""
56 |         X_b = np.hstack([np.ones(shape=(X_predict.shape[0], 1)), X_predict])
57 |         return self._sigmoid(X_b.dot(self._theta))
58 | 
59 |     def predict(self, X_predict):
60 |         proba = self.predict_proba(X_predict)
61 |         return np.array(proba >= .5, dtype=int) # 把True/False的向量转化为1，0的向量
62 | 
63 |     def score(self, X_test, y_test):
64 |         y_predict = self.predict(X_test)
65 |         return accuracy_score(y_test, y_predict)
66 | 
67 |     def __repr__(self):
68 |         return 'LogisticRegression()'


--------------------------------------------------------------------------------
/c1_knn/kNN.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | from math import sqrt
 4 | from collections import Counter
 5 | 
 6 | 
 7 | def kNN_classify(k: int, X_train: np.ndarray, y_train: np.ndarray, x: np.ndarray):
 8 |     """
 9 |     kNN分类算法
10 |     :param k:   kNN的k值
11 |     :param X_train: 训练集的特征（矩阵）
12 |     :param y_train: 训练集的标记（向量）
13 |     :param x: 需要预测的特征（向量）
14 |     :return:
15 |     """
16 |     assert 1 <= k <= X_train.shape[0], "k must be valid"
17 |     assert X_train.shape[0] == y_train.shape[0], "训练集中，特征向量的记录数与标记的记录数目必须一致"
18 |     assert X_train.shape[1] == x.shape[0], '需要预测的x的特征数目必须等于训练集中的特征数目'
19 | 
20 |     # 求x与每一条记录的欧拉距离
21 |     distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
22 | 
23 |     nearest = np.argsort(distances)
24 | 
25 |     # 从y_train中取前k个与x距离最近的y
26 |     topK_y = [y_train[i] for i in nearest[:k]]
27 | 
28 |     votes = Counter(topK_y)
29 | 
30 |     return votes.most_common(1)[0][0]
31 | 
32 | 
class KNNClassifier(object):
    """kNN classifier with a scikit-learn style fit / predict / score interface."""

    def __init__(self, k):
        """Create a classifier in which the ``k`` nearest neighbours vote.

        :param k: number of neighbours
        """
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training data (kNN has no training step beyond this) and return ``self``.

        :param X_train: training feature matrix
        :param y_train: training label vector
        """
        self._X_train, self._y_train = X_train, y_train
        return self

    def predict(self, X_predict):
        """Return an array with the predicted label for every row of ``X_predict``.

        :param X_predict: matrix of samples to classify
        """
        return np.array(list(map(self._predict, X_predict)))

    def _predict(self, x):
        """Return the predicted label for a single sample ``x``.

        :param x: feature vector of one sample
        """
        # Euclidean distance from x to every stored training sample.
        distances = []
        for sample in self._X_train:
            distances.append(sqrt(np.sum((sample - x) ** 2)))

        closest_first = np.argsort(distances)

        # Tally the labels of the k closest samples and return the most common one.
        votes = Counter(self._y_train[idx] for idx in closest_first[:self.k])
        (winner, _count), = votes.most_common(1)
        return winner

    def score(self, X_test, y_test):
        """Return the fraction of ``X_test`` rows whose prediction equals ``y_test``."""
        matches = self.predict(X_test) == y_test
        return sum(matches) / len(y_test)

    def __repr__(self):
        return 'kNN(k=%d)'%self.k


--------------------------------------------------------------------------------
/playML/metrics.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | from math import sqrt
  4 | 
  5 | 
  6 | def accuracy_score(y_true, y_predict):
  7 |     """
  8 |     计算y_true和y_predict之间的准确率
  9 |     :param y_true:
 10 |     :param y_predict:
 11 |     :return:
 12 |     """
 13 |     assert y_true.shape[0] == y_predict.shape[0], 'the size of y_true must be equal to the size of y_predict'
 14 | 
 15 |     return sum(y_true == y_predict) / len(y_true)
 16 | 
 17 | 
 18 | def mean_squared_error(y_true, y_predict):
 19 |     """
 20 |     计算y_true和y_predict之间的MSE
 21 |     :param y_true:
 22 |     :param y_predict:
 23 |     :return:
 24 |     """
 25 |     assert len(y_true) == len(y_predict), 'the size of y_true must be equal to the size of y_predict'
 26 | 
 27 |     return np.sum((y_true - y_predict) ** 2) / len(y_true)
 28 | 
 29 | 
 30 | def root_mean_squared_error(y_true, y_predict):
 31 |     return sqrt(mean_squared_error(y_true, y_predict))
 32 | 
 33 | 
 34 | def mean_absolute_error(y_true, y_predict):
 35 |     assert len(y_true) == len(y_predict), 'the size of y_true must be equal to the size of y_predict'
 36 | 
 37 |     return np.sum(np.absolute(y_true - y_predict)) / len(y_true)
 38 | 
 39 | 
 40 | def r2_score(y_true, y_predict):
 41 |     """
 42 |     计算R^2 R Square
 43 |     :param y_true:
 44 |     :param y_predict:
 45 |     :return:
 46 |     """
 47 |     return 1 - mean_squared_error(y_true, y_predict) / np.var(y_true)
 48 | 
 49 | 
 50 | def TN(y_true, y_predict):
 51 |     assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
 52 |     return np.sum((y_true == 0) & (y_predict == 0))
 53 | 
 54 | 
 55 | def FP(y_true, y_predict):
 56 |     assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
 57 |     return np.sum((y_true == 0) & (y_predict == 1))
 58 | 
 59 | 
 60 | def FN(y_true, y_predict):
 61 |     assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
 62 |     return np.sum((y_true == 1) & (y_predict == 0))
 63 | 
 64 | 
 65 | def TP(y_true, y_predict):
 66 |     assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
 67 |     return np.sum((y_true == 1) & (y_predict == 1))
 68 | 
 69 | 
 70 | def confusion_matrix(y_true, y_predict):
 71 |     """返回一个2✖️2的混淆矩阵"""
 72 |     return np.array([
 73 |         [TN(y_true, y_predict), FP(y_true, y_predict)],
 74 |         [FN(y_true, y_predict), TP(y_true, y_predict)]
 75 |     ])
 76 | 
 77 | 
 78 | def precision_score(y_true, y_predict):
 79 |     """求精准率"""
 80 |     tp = TP(y_true, y_predict)
 81 |     fp = FP(y_true, y_predict)
 82 |     try:
 83 |         return tp / (tp + fp)
 84 |     except:  # 分母为0时，结果返回0
 85 |         return 0.0
 86 | 
 87 | 
 88 | def recall_score(y_true, y_predict):
 89 |     """求召回率"""
 90 |     tp = TP(y_true, y_predict)
 91 |     fn = FN(y_true, y_predict)
 92 |     try:
 93 |         return tp / (tp + fn)
 94 |     except:
 95 |         return 0.0
 96 | 
 97 | 
 98 | def f1_score(y_true, y_predict):
 99 |     """f1 score"""
100 |     precision = precision_score(y_true, y_predict)
101 |     recall = recall_score(y_true, y_predict)
102 | 
103 |     try:
104 |         return 2.0 * precision * recall / (precision + recall)
105 |     except:
106 |         return 0.
107 | 
108 | 
def TPR(y_true, y_predict):
    """Return the true positive rate, which is the recall."""
    true_positive_rate = recall_score(y_true, y_predict)
    return true_positive_rate
111 | 
112 | 
def FPR(y_true, y_predict):
    """Return the false positive rate ``FP / (FP + TN)``.

    :return: the rate, or 0.0 when there are no actual negatives
    """
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    actual_negative = fp + tn
    # The counts are NumPy integers, for which 0 / 0 gives NaN with a warning
    # instead of raising, so a try/except never produced the intended 0.0.
    if actual_negative == 0:
        return 0.
    return fp / actual_negative
120 | 


--------------------------------------------------------------------------------
/c2_linear_regression/simple_linear_regression.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | from playML.metrics import r2_score
  4 | 
  5 | class SimpleLinearRegression1(object):
  6 |     """
  7 |     自己手写的简陋实现简单线性回归算法
  8 |     """
  9 |     def __init__(self):
 10 |         self.a_ = None
 11 |         self.b_ = None
 12 | 
 13 |     def fit(self, x_train, y_train):
 14 |         """
 15 | 
 16 |         :param x_train:
 17 |         :param y_train:
 18 |         :return:
 19 |         """
 20 |         assert x_train.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
 21 |         assert len(x_train) == len(y_train), 'the size of x_train must be equal to the size of y_train'
 22 | 
 23 |         x_mean = np.mean(x_train)
 24 |         y_mean = np.mean(y_train)
 25 | 
 26 |         # 分子
 27 |         numerator = 0.0
 28 | 
 29 |         # 分母
 30 |         denominator = 0.0
 31 | 
 32 |         for x_i, y_i in zip(x_train, y_train):
 33 |             numerator += (x_i - x_mean)*(y_i - y_mean)
 34 |             denominator += (x_i - x_mean)**2
 35 | 
 36 |         self.a_ = numerator / denominator
 37 |         self.b_ = y_mean - self.a_ * x_mean
 38 | 
 39 |         return self
 40 | 
 41 |     def predict(self, x_predict):
 42 |         assert x_predict.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
 43 |         assert self.a_ is not None and self.b_ is not None, 'must fit before predict'
 44 |         return np.array([self._predict(x) for x in x_predict])
 45 | 
 46 |     def _predict(self, x_single):
 47 |         return self.a_ * x_single + self.b_
 48 | 
 49 |     def __repr__(self):
 50 |         return 'SimpleLinearRegression1()'
 51 | 
 52 | 
 53 | class SimpleLinearRegression2(object):
 54 |     """
 55 |     自己手写的简陋实现简单线性回归算法，把for循环改为向量化运算，提升效率
 56 |     """
 57 |     def __init__(self):
 58 |         self.a_ = None
 59 |         self.b_ = None
 60 | 
 61 |     def fit(self, x_train, y_train):
 62 |         """
 63 | 
 64 |         :param x_train:
 65 |         :param y_train:
 66 |         :return:
 67 |         """
 68 |         assert x_train.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
 69 |         assert len(x_train) == len(y_train), 'the size of x_train must be equal to the size of y_train'
 70 | 
 71 |         x_mean = np.mean(x_train)
 72 |         y_mean = np.mean(y_train)
 73 | 
 74 |         # 分子
 75 |         numerator = (x_train - x_mean).dot(y_train - y_mean)
 76 | 
 77 |         # 分母
 78 |         denominator = (x_train - x_mean).dot(x_train - x_mean)
 79 | 
 80 |         self.a_ = numerator / denominator
 81 |         self.b_ = y_mean - self.a_ * x_mean
 82 | 
 83 |         return self
 84 | 
 85 |     def predict(self, x_predict):
 86 |         assert x_predict.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
 87 |         assert self.a_ is not None and self.b_ is not None, 'must fit before predict'
 88 |         return np.array([self._predict(x) for x in x_predict])
 89 | 
 90 |     def _predict(self, x_single):
 91 |         return self.a_ * x_single + self.b_
 92 | 
 93 |     def score(self, x_test, y_test):
 94 |         """
 95 |         根据测试数据集x_test和y_test 确定当前模型的准确度
 96 |         :param x_test:
 97 |         :param y_test:
 98 |         :return:
 99 |         """
100 |         y_predict = self.predict(x_test)
101 |         return r2_score(y_test, y_predict)
102 | 
103 |     def __repr__(self):
104 |         return 'SimpleLinearRegression2()'
105 | 
106 | 
class SimpleLinearRegression(SimpleLinearRegression2):
    """Alias of SimpleLinearRegression2: inherits everything and overrides nothing."""
    pass


--------------------------------------------------------------------------------
/c1_knn/02_kNN_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Scikit-Learn中的kNN"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn.neighbors import KNeighborsClassifier"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "kNN_classifier = KNeighborsClassifier(n_neighbors=6)"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 3,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "raw_data_X = np.random.random((10,2))\n",
 38 |     "X_train = raw_data_X * 10\n",
 39 |     "y_train = np.array([0,0,0,0,0,1,1,1,1,1])"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 4,
 45 |    "metadata": {},
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=6, p=2,\n           weights='uniform')"
 51 |       ]
 52 |      },
 53 |      "execution_count": 4,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "# 训练/拟合\n",
 60 |     "kNN_classifier.fit(X_train, y_train)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 5,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "x = np.random.random((1,2))*10"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 6,
 75 |    "metadata": {},
 76 |    "outputs": [
 77 |     {
 78 |      "data": {
 79 |       "text/plain": [
 80 |        "array([0])"
 81 |       ]
 82 |      },
 83 |      "execution_count": 6,
 84 |      "metadata": {},
 85 |      "output_type": "execute_result"
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "kNN_classifier.predict(x)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "## 重新整理我们的kNN代码"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 7,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "%run c1_knn/kNN.py\n"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 8,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "knn_clf = KNNClassifier(k=6)\n",
115 |     "knn_clf.fit(X_train, y_train)\n",
116 |     "y_predict = knn_clf.predict(x)\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 9,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "array([1])"
128 |       ]
129 |      },
130 |      "execution_count": 9,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "y_predict"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": []
145 |   }
146 |  ],
147 |  "metadata": {
148 |   "kernelspec": {
149 |    "display_name": "Python 2",
150 |    "language": "python",
151 |    "name": "python2"
152 |   },
153 |   "language_info": {
154 |    "codemirror_mode": {
155 |     "name": "ipython",
156 |     "version": 2
157 |    },
158 |    "file_extension": ".py",
159 |    "mimetype": "text/x-python",
160 |    "name": "python",
161 |    "nbconvert_exporter": "python",
162 |    "pygments_lexer": "ipython2",
163 |    "version": "2.7.6"
164 |   }
165 |  },
166 |  "nbformat": 4,
167 |  "nbformat_minor": 0
168 | }
169 | 


--------------------------------------------------------------------------------
/playML/linear_regression.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | from playML.metrics import r2_score
  4 | from c2_linear_regression import linear_regression
  5 | 
  6 | class LinearRegression(linear_regression.LinearRegression):
  7 |     def fit_gd(self, X_train:np.ndarray, y_train:np.ndarray, eta=0.01, n_iters=1e4):
  8 |         """
  9 |         根据训练数据集X_train和y_train，使用梯度下降法训练线性回归模型
 10 |         :param X_train:
 11 |         :param y_train:
 12 |         :param eta:
 13 |         :param n_iters:
 14 |         :return:
 15 |         """
 16 |         assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'
 17 | 
 18 |         def J(theta, X_b, y):
 19 |             """
 20 |             给定θ，特征矩阵X，标记向量y，根据损失函数得出其（损失）值
 21 |             :param theta:
 22 |             :param X_b:
 23 |             :param y:
 24 |             :return:
 25 |             """
 26 | 
 27 |             # 分子部分其实等价于 (y - X_b.dot(theta)).T.dot(y - X_b.dot(theta))
 28 |             try:
 29 |                 return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
 30 |             except:
 31 |                 return float('inf')  # 防止溢出？有异常直接返回最大值
 32 | 
 33 |         def derivative_J(theta: np.ndarray, X_b: np.ndarray, y: np.ndarray):
 34 |             """
 35 |             求θ为给定值时的导数
 36 |             :param theta:
 37 |             :param X_b:
 38 |             :param y:
 39 |             :return:
 40 |             """
 41 | 
 42 |             # res = np.empty(len(theta))
 43 |             # res[0] = np.sum(X_b.dot(theta) - y)
 44 |             # for i in range(1, len(theta)):
 45 |             #     res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
 46 |             # return res * 2 / len(X_b)
 47 | 
 48 |             # 改为向量的形式
 49 |             return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)
 50 | 
 51 |         def gradient_descent(X_b, y, initial_theta, eta, n_iters=5, epsilon=1e-8):
 52 |             theta = initial_theta
 53 |             iters = 0
 54 |             while iters < n_iters:
 55 |                 gradient = derivative_J(theta, X_b, y)
 56 |                 last_theta = theta
 57 |                 theta = theta - eta * gradient
 58 | 
 59 |                 if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
 60 |                     break
 61 |                 iters += 1
 62 |             return theta
 63 | 
 64 |         X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
 65 |         initial_theta = np.zeros(X_b.shape[1])  # 初始的θ向量都是0
 66 |         self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
 67 |         self.interception_ = self._theta[0]
 68 |         self.coef_ = self._theta[1:]
 69 | 
 70 |         return self
 71 | 
 72 |     def fit_sgd(self, X_train, y_train, n_iters=1e4, t0=5, t1=50):
 73 |         """
 74 |         使用随机梯度下降法进行拟合
 75 |         :param X_train:
 76 |         :param y_train:
 77 |         :param n_iters:
 78 |         :param t0:
 79 |         :param t1:
 80 |         :return:
 81 |         """
 82 |         assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'
 83 |         assert n_iters >=1 , '所有训练样本至少要被随机一次'
 84 | 
 85 |         def derivative_J_sgd(theta: np.ndarray, X_b_i: np.ndarray, y_i):
 86 |             """
 87 |             求随机搜索方向
 88 |             """
 89 |             return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2.
 90 | 
 91 |         def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
 92 |             """"""
 93 |             def learning_rate(t):
 94 |                 return t0 / (t + t1)
 95 | 
 96 |             theta = initial_theta
 97 |             m = len(X_b)    # 样本数目
 98 |             for cur_iter in range(n_iters):
 99 |                 indexes = np.random.permutation(m)
100 |                 X_b_new = X_b[indexes]
101 |                 y_new = y[indexes]
102 |                 for i in range(m):
103 |                     gradient = derivative_J_sgd(theta, X_b_new[i], y_new[i])
104 |                     # 向搜索方向的相反方向移动η
105 |                     theta = theta - learning_rate(cur_iter * m + i) * gradient
106 |             return theta
107 | 
108 |         X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
109 |         initial_theta = np.zeros(X_b.shape[1])
110 |         self._theta = sgd(X_b, y_train, initial_theta, n_iters=n_iters, t0=t0, t1=t1)
111 |         self.interception_ = self._theta[0]
112 |         self.coef_ = self._theta[1:]


--------------------------------------------------------------------------------
/c2_linear_regression/08_Linear_Regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 实现多元线性回归模型"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt\n",
 20 |     "from sklearn import datasets"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "boston = datasets.load_boston()\n",
 30 |     "\n",
 31 |     "X = boston.data\n",
 32 |     "y = boston.target\n",
 33 |     "\n",
 34 |     "X = X[y<50.0]\n",
 35 |     "y = y[y<50.0]"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 3,
 41 |    "metadata": {},
 42 |    "outputs": [
 43 |     {
 44 |      "data": {
 45 |       "text/plain": [
 46 |        "(490, 13)"
 47 |       ]
 48 |      },
 49 |      "execution_count": 3,
 50 |      "metadata": {},
 51 |      "output_type": "execute_result"
 52 |     }
 53 |    ],
 54 |    "source": [
 55 |     "X.shape"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 4,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "from playML import model_selection\n",
 65 |     "X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,seed=666)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 5,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "data": {
 75 |       "text/plain": [
 76 |        "LinearRegression()"
 77 |       ]
 78 |      },
 79 |      "execution_count": 5,
 80 |      "metadata": {},
 81 |      "output_type": "execute_result"
 82 |     }
 83 |    ],
 84 |    "source": [
 85 |     "from c2_linear_regression.linear_regression import LinearRegression\n",
 86 |     "\n",
 87 |     "reg = LinearRegression()\n",
 88 |     "\n",
 89 |     "reg.fit_normal(X_train, y_train)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 6,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "array([ -1.18919477e-01,   3.63991462e-02,  -3.56494193e-02,\n         5.66737830e-02,  -1.16195486e+01,   3.42022185e+00,\n        -2.31470282e-02,  -1.19509560e+00,   2.59339091e-01,\n        -1.40112724e-02,  -8.36521175e-01,   7.92283639e-03,\n        -3.81966137e-01])"
101 |       ]
102 |      },
103 |      "execution_count": 6,
104 |      "metadata": {},
105 |      "output_type": "execute_result"
106 |     }
107 |    ],
108 |    "source": [
109 |     "reg.coef_"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 7,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "data": {
119 |       "text/plain": [
120 |        "34.161435496212974"
121 |       ]
122 |      },
123 |      "execution_count": 7,
124 |      "metadata": {},
125 |      "output_type": "execute_result"
126 |     }
127 |    ],
128 |    "source": [
129 |     "reg.interception_"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 8,
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "data": {
139 |       "text/plain": [
140 |        "0.81298026026586467"
141 |       ]
142 |      },
143 |      "execution_count": 8,
144 |      "metadata": {},
145 |      "output_type": "execute_result"
146 |     }
147 |    ],
148 |    "source": [
149 |     "reg.score(X_test, y_test)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": []
158 |   }
159 |  ],
160 |  "metadata": {
161 |   "kernelspec": {
162 |    "display_name": "Python 2",
163 |    "language": "python",
164 |    "name": "python2"
165 |   },
166 |   "language_info": {
167 |    "codemirror_mode": {
168 |     "name": "ipython",
169 |     "version": 2
170 |    },
171 |    "file_extension": ".py",
172 |    "mimetype": "text/x-python",
173 |    "name": "python",
174 |    "nbconvert_exporter": "python",
175 |    "pygments_lexer": "ipython2",
176 |    "version": "2.7.6"
177 |   }
178 |  },
179 |  "nbformat": 4,
180 |  "nbformat_minor": 0
181 | }
182 | 


--------------------------------------------------------------------------------
/c1_knn/knn.md:
--------------------------------------------------------------------------------
  1 | # kNN算法
  2 | - 属于监督学习
  3 | - 非参数学习
  4 | - 是解决分类问题的算法，天然可解决多分类问题
  5 | - kNN没有模型，可以说是一个（也许也是唯一一个）不需要训练过程的算法
  6 |     - 为了和其他算法统一，可以认为训练数据集就是模型本身
  7 | 
  8 | ## 本质
  9 | 两个（或几个）样本如果足够相似，那么它们就有极高的概率属于同一个类别。所谓“相似”，就是样本在特征空间中的距离相近。
 10 | ## 优点
 11 | - 思想极其简单
 12 |     - 可以解释机器学习算法使用过程中的很多细节问题
 13 |     - 更完整的刻画机器学习应用的流程
 14 | - 应用数学知识少（近乎为零）
 15 | - 效果好
 16 | - 天然适合解决多分类问题，同时也适合解决回归问题
 17 | 
 18 | ## 缺点
 19 | - 最大的缺点：效率低下
 20 | 如果训练集有m个样本，n个特征，则预测每一个新的数据，需要O(m*n)的时间复杂度
 21 |     - 优化，使用树结构：KD-Tree， Ball-Tree
 22 |     - 即便如此，依然效率低下
 23 | - 高度数据相关，而且对outlier更敏感
 24 | - 预测结果不具有可解释性
 25 | 只知道属于哪个类别，但是无法解释为什么属于某个类别
 26 | - 维数灾难
 27 |     - 随着维度的增加，“看似相似”的两个点之间的距离越来越大
 28 |     - 解决方法：降维，例如PCA
 29 | 
 30 | ## kNN的过程
 31 | ### 计算特征空间中的距离
 32 | #### 欧氏距离（欧几里得距离，最为常见）
 33 | - 平面距离：
 34 | <a href="https://www.codecogs.com/eqnedit.php?latex=\sqrt{(x^{(a)}-x^{(b)})^2&space;&plus;&space;(y^{(a)}-y^{(b)})^2}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\sqrt{(x^{(a)}-x^{(b)})^2&space;&plus;&space;(y^{(a)}-y^{(b)})^2}" title="\sqrt{(x^{(a)}-x^{(b)})^2 + (y^{(a)}-y^{(b)})^2}" /></a>
 35 | - 立体距离
 36 | <a href="https://www.codecogs.com/eqnedit.php?latex=\sqrt{(x^{(a)}-x^{(b)})^2&space;&plus;&space;(y^{(a)}-y^{(b)})^2&space;&plus;&space;(z^{(a)}-z^{(b)})^2}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\sqrt{(x^{(a)}-x^{(b)})^2&space;&plus;&space;(y^{(a)}-y^{(b)})^2&space;&plus;&space;(z^{(a)}-z^{(b)})^2}" title="\sqrt{(x^{(a)}-x^{(b)})^2 + (y^{(a)}-y^{(b)})^2 + (z^{(a)}-z^{(b)})^2}" /></a>
 37 | - n维空间距离
 38 | <a href="https://www.codecogs.com/eqnedit.php?latex=\sqrt{\sum\limits_{i=1}^n(x_i^{(a)}-x_i^{(b)})^2}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\sqrt{\sum\limits_{i=1}^n(x_i^{(a)}-x_i^{(b)})^2}" title="\sqrt{\sum\limits_{i=1}^n(x_i^{(a)}-x_i^{(b)})^2}" /></a>
 39 | #### 曼哈顿距离
 40 | <a href="https://www.codecogs.com/eqnedit.php?latex=\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|" target="_blank"><img src="https://latex.codecogs.com/gif.latex?\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|" title="\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|" /></a>
 41 | #### 明可夫斯基距离
 42 | <a href="https://www.codecogs.com/eqnedit.php?latex=(\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|^p)^{\frac{1}{p}}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?(\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|^p)^{\frac{1}{p}}" title="(\sum\limits_{i=1}^n|x_i^{(a)}-x_i^{(b)}|^p)^{\frac{1}{p}}" /></a>
 43 | - 当p=1，相当于曼哈顿距离
 44 | - 当p=2，相当于欧氏距离（欧几里得距离）
 45 | - 当p≥3，得到其他的距离度量
 46 | 
 47 | ## 参数
 48 | ### 超参数
 49 | - kNN算法中的k是典型的超参数
 50 |     - 默认值为5 （经验数值）
 51 | - 距离的权重
 52 |     - 距离越近，权重越大
 53 | - 关于“距离”的定义
 54 |     - 明可夫斯基距离（默认）
 55 |     明可夫斯基距离的p取值
 56 |         - p=1：曼哈顿距离
 57 |         - p=2（默认）：欧氏距离（欧几里得距离）
 58 |         - p=3：明可夫斯基距离（其他距离）
 59 |     - 其他更多的距离定义
 60 |         - 向量空间余弦相似度Cosine Similarity
 61 |         - 调整余弦相似度Adjusted Cosine Similarity
 62 |         - 皮尔森相关系数 Pearson Correlation Coefficient
 63 |         - Jaccard相似系数 Jaccard Coefficient
 64 | 
 65 | ### 模型参数
 66 | kNN算法没有模型参数
 67 | 
 68 | ## 数据归一化 Feature Scaling
 69 | ### 需要归一化的原因
 70 | 如果某些特征数值较大，会主导最终距离的结果
 71 | ### 解决方案
 72 | 把所有的数据映射到同一尺度
 73 | #### 最值归一化 normalization
 74 | 把所有数据映射到0~1之间:
 75 | <a href="https://www.codecogs.com/eqnedit.php?latex=x_{scale}=\frac{x-x_{min}}{x_{max}-x_{min}}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?x_{scale}=\frac{x-x_{min}}{x_{max}-x_{min}}" title="x_{scale}=\frac{x-x_{min}}{x_{max}-x_{min}}" /></a>
 76 | 
 77 | - 适用于分布有明显边界的情况
 78 |     - 例如考试分数，最大是100，最小是0
 79 |     - 例如每个像素的RGB颜色，都是0~255之间
 80 | - 受outlier影响较大
 81 |     - 例如收入，有些人特别特别高
 82 | 
 83 | #### Standardization（0均值标准化/均值方差归一化）
 84 | 针对最值归一化的缺憾改进
 85 | **把所有数据归一到均值为0，方差为1的分布中**
 86 | 
 87 | <a href="https://www.codecogs.com/eqnedit.php?latex=x_{scale}&space;=&space;\frac{x&space;-&space;x_{mean}}{S}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?x_{scale}&space;=&space;\frac{x&space;-&space;x_{mean}}{S}" title="x_{scale} = \frac{x - x_{mean}}{S}" /></a>
 88 | 
 89 | - 并不保证数据在0~1之间
 90 | - 但是所有数值的均值在0的位置
 91 | - 数据方差/标准差为1
 92 | 
 93 | 适用于数据分布没有明显的分界（有可能存在极端数据值）。其实数据分布有明显边界的情况也是同样适合的，所以选它一般没错。
 94 | 
 95 | ### 数据归一化的一些注意事项
 96 | #### 对测试数据集如何归一化
 97 | 例如训练集有均值<a href="https://www.codecogs.com/eqnedit.php?latex=X_{mean}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?X_{mean}" title="X_{mean}" /></a>，标准差<a href="https://www.codecogs.com/eqnedit.php?latex=X_{std}" target="_blank"><img src="https://latex.codecogs.com/gif.latex?X_{std}" title="X_{std}" /></a>, 那么，测试数据集进行归一化（例如0均值标准化）时，应该使用训练集的均值和标准差，而不是用测试集的均值和标准差。原因有：
 98 | 1. 测试数据是模拟真实环境，真实环境很可能无法得到所有测试数据的均值和标准差。（个人理解，如果使用测试数据集的均值和标准差，那么以后每有一个新的样例进来，岂不是要重新计算（分配）所有测试样例的均值和标准差？）
 99 | 2. 对数据的归一化也是算法的一部分
100 | 
101 | #### 需要保存训练数据集得到的均值和标准差
102 | - 使用scikit-learn进行数据归一化处理
103 |     - 使用StandardScaler进行0均值标准化
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/c1_knn/05_Hyper_Parameters.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 超参数"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "digits = datasets.load_digits()\n",
 29 |     "X = digits.data\n",
 30 |     "y = digits.target"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "from sklearn.model_selection._split import train_test_split"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 4,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=666)"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 5,
 54 |    "metadata": {},
 55 |    "outputs": [
 56 |     {
 57 |      "data": {
 58 |       "text/plain": [
 59 |        "0.98888888888888893"
 60 |       ]
 61 |      },
 62 |      "execution_count": 5,
 63 |      "metadata": {},
 64 |      "output_type": "execute_result"
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "from sklearn.neighbors.classification import KNeighborsClassifier\n",
 69 |     "\n",
 70 |     "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
 71 |     "knn_clf.fit(X_train, y_train)\n",
 72 |     "knn_clf.score(X_test, y_test)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## 寻找最好的k"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 6,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "best_score: 0.991666666667\nbest_k: 4\n"
 92 |      ]
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "best_score = 0.0\n",
 97 |     "best_k = -1\n",
 98 |     "for k in range(1,11):\n",
 99 |     "    knn_clf = KNeighborsClassifier(n_neighbors=k)\n",
100 |     "    knn_clf.fit(X_train, y_train)\n",
101 |     "    score = knn_clf.score(X_test, y_test)\n",
102 |     "    if score > best_score:\n",
103 |     "        best_score = score\n",
104 |     "        best_k = k\n",
105 |     "print('best_score:',best_score)\n",
106 |     "print('best_k:', best_k)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {},
112 |    "source": [
113 |     "## 考虑距离？不考虑距离？  \n",
114 |     "引出另一个超参数：距离权重"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 8,
120 |    "metadata": {},
121 |    "outputs": [
122 |     {
123 |      "name": "stdout",
124 |      "output_type": "stream",
125 |      "text": [
126 |       "best_score: 0.991666666667\nbest_k: 4\nbest weights: uniform\n"
127 |      ]
128 |     }
129 |    ],
130 |    "source": [
131 |     "best_method = ''\n",
132 |     "best_score = 0.0\n",
133 |     "best_k = -1\n",
134 |     "for method in ['uniform','distance']:\n",
135 |     "    for k in range(1,11):\n",
136 |     "        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)\n",
137 |     "        knn_clf.fit(X_train, y_train)\n",
138 |     "        score = knn_clf.score(X_test, y_test)\n",
139 |     "        if score > best_score:\n",
140 |     "            best_score = score\n",
141 |     "            best_k = k\n",
142 |     "            best_method = method\n",
143 |     "print('best_score:',best_score)\n",
144 |     "print('best_k:', best_k)\n",
145 |     "print('best weights:', best_method)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "## 搜索明可夫斯基距离相应的p"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 10,
158 |    "metadata": {},
159 |    "outputs": [
160 |     {
161 |      "name": "stdout",
162 |      "output_type": "stream",
163 |      "text": [
164 |       "best_score: 0.988888888889\nbest_k: 5\nbest p: 1\nCPU times: user 15.7 s, sys: 116 ms, total: 15.8 s\nWall time: 16.1 s\n"
165 |      ]
166 |     }
167 |    ],
168 |    "source": [
169 |     "%%time\n",
170 |     "best_score = 0.0\n",
171 |     "best_k = -1\n",
172 |     "best_p = -1\n",
173 |     "for p in range(1,6):\n",
174 |     "    for k in range(1,11):\n",
175 |     "        knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)\n",
176 |     "        knn_clf.fit(X_train, y_train)\n",
177 |     "        score = knn_clf.score(X_test, y_test)\n",
178 |     "        if score > best_score:\n",
179 |     "            best_score = score\n",
180 |     "            best_k = k\n",
181 |     "            best_p = p\n",
182 |     "print('best_score:',best_score)\n",
183 |     "print('best_k:', best_k)\n",
184 |     "print('best p:', best_p)"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": []
193 |   }
194 |  ],
195 |  "metadata": {
196 |   "kernelspec": {
197 |    "display_name": "Python 2",
198 |    "language": "python",
199 |    "name": "python2"
200 |   },
201 |   "language_info": {
202 |    "codemirror_mode": {
203 |     "name": "ipython",
204 |     "version": 2
205 |    },
206 |    "file_extension": ".py",
207 |    "mimetype": "text/x-python",
208 |    "name": "python",
209 |    "nbconvert_exporter": "python",
210 |    "pygments_lexer": "ipython2",
211 |    "version": "2.7.6"
212 |   }
213 |  },
214 |  "nbformat": 4,
215 |  "nbformat_minor": 0
216 | }
217 | 


--------------------------------------------------------------------------------
/c3_gradient_descent/06_Stochastic_Gradient_Descent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 随机梯度下降法"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "m = 100000\n",
 29 |     "\n",
 30 |     "x = np.random.normal(size=m)\n",
 31 |     "X = x.reshape(-1,1)\n",
 32 |     "y = 4.*x + 3. + np.random.normal(0,3,size=m)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "# 损失函数\n",
 42 |     "def J(theta, X_b, y):\n",
 43 |     "    try:\n",
 44 |     "        return np.sum((y - X_b.dot(theta))**2) / len(y)\n",
 45 |     "    except:\n",
 46 |     "        return float('inf')\n",
 47 |     "    \n",
 48 |     "def derivative_J(theta:np.ndarray, X_b:np.ndarray, y:np.ndarray):\n",
 49 |     "    \"\"\"\n",
 50 |     "    求θ为给定值时的导数(梯度)\n",
 51 |     "    :param theta: \n",
 52 |     "    :param X_b: \n",
 53 |     "    :param y: \n",
 54 |     "    :return: \n",
 55 |     "    \"\"\"\n",
 56 |     "    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)\n",
 57 |     "\n",
 58 |     "# 批量梯度下降法\n",
 59 |     "def gradient_descent(X_b, y, initial_theta, eta=0.01, n_iters=1e4, epsilon=1e-8):\n",
 60 |     "    theta = initial_theta\n",
 61 |     "    cur_iter = 0\n",
 62 |     "    while cur_iter < n_iters:\n",
 63 |     "        gradient = derivative_J(theta, X_b, y)\n",
 64 |     "        last_theta = theta\n",
 65 |     "        theta = theta - eta * gradient\n",
 66 |     "        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n",
 67 |     "            break\n",
 68 |     "        cur_iter += 1\n",
 69 |     "    return theta"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "## 批量梯度下降法效果"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 6,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "CPU times: user 1.36 s, sys: 93.2 ms, total: 1.45 s\nWall time: 1.45 s\n"
 89 |      ]
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "%%time\n",
 94 |     "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
 95 |     "initial_theta = np.zeros(X_b.shape[1])\n",
 96 |     "eta = 0.01\n",
 97 |     "# 我们知道最终的系数和截距，直接肉眼比较吧。。。就不分训练集测试集了\n",
 98 |     "theta = gradient_descent(X_b,y,initial_theta, eta)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 7,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "data": {
108 |       "text/plain": [
109 |        "array([ 3.01042744,  4.00071587])"
110 |       ]
111 |      },
112 |      "execution_count": 7,
113 |      "metadata": {},
114 |      "output_type": "execute_result"
115 |     }
116 |    ],
117 |    "source": [
118 |     "theta"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "## 随机梯度下降法效果"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 8,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "def derivative_J_sgd(theta:np.ndarray, X_b_i:np.ndarray, y_i):\n",
135 |     "    \"\"\"\n",
136 |     "    求随机搜索方向 \n",
137 |     "    \"\"\"\n",
138 |     "    return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2."
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 9,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "def sgd(X_b, y, initial_theta, n_iters):\n",
148 |     "    # 两个超参数\n",
149 |     "    t0 = 5\n",
150 |     "    t1 = 50\n",
151 |     "    \n",
152 |     "    def learning_rate(t):\n",
153 |     "        return t0/(t+t1)\n",
154 |     "    \n",
155 |     "    theta = initial_theta\n",
156 |     "    for cur_iter in range(n_iters):\n",
157 |     "        # 随机选一个\n",
158 |     "        rand_i = np.random.randint(len(X_b))\n",
159 |     "        gradient = derivative_J_sgd(theta, X_b[rand_i], y[rand_i])\n",
160 |     "        # 向搜索方向的相反方向移动η\n",
161 |     "        theta = theta - learning_rate(cur_iter) * gradient\n",
162 |     "    return theta"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 10,
168 |    "metadata": {},
169 |    "outputs": [
170 |     {
171 |      "name": "stdout",
172 |      "output_type": "stream",
173 |      "text": [
174 |       "CPU times: user 276 ms, sys: 6.11 ms, total: 282 ms\nWall time: 283 ms\n"
175 |      ]
176 |     }
177 |    ],
178 |    "source": [
179 |     "%%time\n",
180 |     "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
181 |     "initial_theta = np.zeros(X_b.shape[1])\n",
182 |     "theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 11,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "data": {
192 |       "text/plain": [
193 |        "array([ 3.02984824,  3.9936953 ])"
194 |       ]
195 |      },
196 |      "execution_count": 11,
197 |      "metadata": {},
198 |      "output_type": "execute_result"
199 |     }
200 |    ],
201 |    "source": [
202 |     "theta"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "metadata": {},
208 |    "source": [
209 |     "结论：批量梯度下降法和随机梯度下降法最终效果差不多，但是随机梯度下降法循环次数少得多，计算时间快得多"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": []
218 |   }
219 |  ],
220 |  "metadata": {
221 |   "kernelspec": {
222 |    "display_name": "Python 2",
223 |    "language": "python",
224 |    "name": "python2"
225 |   },
226 |   "language_info": {
227 |    "codemirror_mode": {
228 |     "name": "ipython",
229 |     "version": 2
230 |    },
231 |    "file_extension": ".py",
232 |    "mimetype": "text/x-python",
233 |    "name": "python",
234 |    "nbconvert_exporter": "python",
235 |    "pygments_lexer": "ipython2",
236 |    "version": "2.7.6"
237 |   }
238 |  },
239 |  "nbformat": 4,
240 |  "nbformat_minor": 0
241 | }
242 | 


--------------------------------------------------------------------------------
/c2_linear_regression/10_More_About_Linear_Regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 更多关于线性回归模型的讨论"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets\n",
 20 |     "\n",
 21 |     "boston = datasets.load_boston()\n",
 22 |     "\n",
 23 |     "X = boston.data\n",
 24 |     "y = boston.target\n",
 25 |     "\n",
 26 |     "X = X[y<50.0]\n",
 27 |     "y = y[y<50.0]"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 5,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "data": {
 37 |       "text/plain": [
 38 |        "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
 39 |       ]
 40 |      },
 41 |      "execution_count": 5,
 42 |      "metadata": {},
 43 |      "output_type": "execute_result"
 44 |     }
 45 |    ],
 46 |    "source": [
 47 |     "from sklearn.linear_model.base import LinearRegression\n",
 48 |     "\n",
 49 |     "lin_reg = LinearRegression()\n",
 50 |     "lin_reg.fit(X, y)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 6,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "data": {
 60 |       "text/plain": [
 61 |        "array([ -1.05574295e-01,   3.52748549e-02,  -4.35179251e-02,\n         4.55405227e-01,  -1.24268073e+01,   3.75411229e+00,\n        -2.36116881e-02,  -1.21088069e+00,   2.50740082e-01,\n        -1.37702943e-02,  -8.38888137e-01,   7.93577159e-03,\n        -3.50952134e-01])"
 62 |       ]
 63 |      },
 64 |      "execution_count": 6,
 65 |      "metadata": {},
 66 |      "output_type": "execute_result"
 67 |     }
 68 |    ],
 69 |    "source": [
 70 |     "lin_reg.coef_"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "对系数按从小到大排序（np.argsort 返回的是排序后各元素在原数组中的索引）"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 7,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "data": {
 87 |       "text/plain": [
 88 |        "array([ 4,  7, 10, 12,  0,  2,  6,  9, 11,  1,  8,  3,  5])"
 89 |       ]
 90 |      },
 91 |      "execution_count": 7,
 92 |      "metadata": {},
 93 |      "output_type": "execute_result"
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "arg_sort = np.argsort(lin_reg.coef_)\n",
 98 |     "arg_sort"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "看看按系数值从小到大（即从最负到最正，而不是按影响程度的绝对大小）排序后，各个系数对应的属性名称"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 8,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',\n       'B', 'ZN', 'RAD', 'CHAS', 'RM'],\n      dtype='<U7')"
117 |       ]
118 |      },
119 |      "execution_count": 8,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "boston.feature_names[arg_sort]"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 9,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "Boston House Prices dataset\n===========================\n\nNotes\n------\nData Set Characteristics:  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive\n    \n    :Median Value (attribute 14) is usually the target\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttp://archive.ics.uci.edu/ml/datasets/Housing\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   
\n     \n**References**\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)\n\n"
138 |      ]
139 |     }
140 |    ],
141 |    "source": [
142 |     "print(boston.DESCR)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": []
151 |   }
152 |  ],
153 |  "metadata": {
154 |   "kernelspec": {
155 |    "display_name": "Python 2",
156 |    "language": "python",
157 |    "name": "python2"
158 |   },
159 |   "language_info": {
160 |    "codemirror_mode": {
161 |     "name": "ipython",
162 |     "version": 2
163 |    },
164 |    "file_extension": ".py",
165 |    "mimetype": "text/x-python",
166 |    "name": "python",
167 |    "nbconvert_exporter": "python",
168 |    "pygments_lexer": "ipython2",
169 |    "version": "2.7.6"
170 |   }
171 |  },
172 |  "nbformat": 4,
173 |  "nbformat_minor": 0
174 | }
175 | 


--------------------------------------------------------------------------------
/c4_pca/07_MNIST.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# MNIST"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn.datasets.mldata import fetch_mldata"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "mnist = fetch_mldata('MNIST original')"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 3,
 34 |    "metadata": {},
 35 |    "outputs": [
 36 |     {
 37 |      "data": {
 38 |       "text/plain": [
 39 |        "{'COL_NAMES': ['label', 'data'],\n 'DESCR': 'mldata.org dataset: mnist-original',\n 'data': array([[0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        ..., \n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),\n 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}"
 40 |       ]
 41 |      },
 42 |      "execution_count": 3,
 43 |      "metadata": {},
 44 |      "output_type": "execute_result"
 45 |     }
 46 |    ],
 47 |    "source": [
 48 |     "mnist"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 4,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "X, y = mnist['data'],mnist['target']"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 5,
 63 |    "metadata": {},
 64 |    "outputs": [
 65 |     {
 66 |      "data": {
 67 |       "text/plain": [
 68 |        "(70000, 784)"
 69 |       ]
 70 |      },
 71 |      "execution_count": 5,
 72 |      "metadata": {},
 73 |      "output_type": "execute_result"
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "X.shape"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 6,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "X_train = np.array(X[:60000], dtype=float)\n",
 87 |     "y_train = np.array(y[:60000], dtype=float)\n",
 88 |     "X_test = np.array(X[60000:], dtype=float)\n",
 89 |     "y_test = np.array(y[60000:], dtype=float)\n"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 7,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "((60000, 784), (60000,), (10000, 784), (10000,))"
101 |       ]
102 |      },
103 |      "execution_count": 7,
104 |      "metadata": {},
105 |      "output_type": "execute_result"
106 |     }
107 |    ],
108 |    "source": [
109 |     "X_train.shape, y_train.shape, X_test.shape, y_test.shape"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "## 使用kNN"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 8,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "name": "stdout",
126 |      "output_type": "stream",
127 |      "text": [
128 |       "CPU times: user 26.6 s, sys: 184 ms, total: 26.7 s\nWall time: 26.9 s\n"
129 |      ]
130 |     },
131 |     {
132 |      "data": {
133 |       "text/plain": [
134 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n           weights='uniform')"
135 |       ]
136 |      },
137 |      "execution_count": 8,
138 |      "metadata": {},
139 |      "output_type": "execute_result"
140 |     }
141 |    ],
142 |    "source": [
143 |     "from sklearn.neighbors.classification import KNeighborsClassifier\n",
144 |     "\n",
145 |     "knn_clf = KNeighborsClassifier()\n",
146 |     "%time knn_clf.fit(X_train, y_train)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 9,
152 |    "metadata": {},
153 |    "outputs": [
154 |     {
155 |      "name": "stdout",
156 |      "output_type": "stream",
157 |      "text": [
158 |       "CPU times: user 10min 29s, sys: 2.66 s, total: 10min 31s\nWall time: 10min 35s\n"
159 |      ]
160 |     },
161 |     {
162 |      "data": {
163 |       "text/plain": [
164 |        "0.96879999999999999"
165 |       ]
166 |      },
167 |      "execution_count": 9,
168 |      "metadata": {},
169 |      "output_type": "execute_result"
170 |     }
171 |    ],
172 |    "source": [
173 |     "%time knn_clf.score(X_test, y_test)"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "### 使用PCA进行降维"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 10,
186 |    "metadata": {},
187 |    "outputs": [
188 |     {
189 |      "name": "stdout",
190 |      "output_type": "stream",
191 |      "text": [
192 |       "CPU times: user 22.8 s, sys: 1.13 s, total: 23.9 s\nWall time: 11.4 s\n"
193 |      ]
194 |     }
195 |    ],
196 |    "source": [
197 |     "from sklearn.decomposition.pca import PCA\n",
198 |     "\n",
199 |     "pca = PCA(0.9)\n",
200 |     "%time pca.fit(X_train)\n",
201 |     "X_train_reduction = pca.transform(X_train)\n",
202 |     "X_test_reduction = pca.transform(X_test)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 11,
208 |    "metadata": {},
209 |    "outputs": [
210 |     {
211 |      "data": {
212 |       "text/plain": [
213 |        "(60000, 87)"
214 |       ]
215 |      },
216 |      "execution_count": 11,
217 |      "metadata": {},
218 |      "output_type": "execute_result"
219 |     }
220 |    ],
221 |    "source": [
222 |     "#784维的数据剩下多少维呢？\n",
223 |     "X_train_reduction.shape"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 12,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "name": "stdout",
233 |      "output_type": "stream",
234 |      "text": [
235 |       "CPU times: user 510 ms, sys: 15 ms, total: 525 ms\nWall time: 649 ms\n"
236 |      ]
237 |     },
238 |     {
239 |      "data": {
240 |       "text/plain": [
241 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n           weights='uniform')"
242 |       ]
243 |      },
244 |      "execution_count": 12,
245 |      "metadata": {},
246 |      "output_type": "execute_result"
247 |     }
248 |    ],
249 |    "source": [
250 |     "pca_knn_clf = KNeighborsClassifier()\n",
251 |     "%time pca_knn_clf.fit(X_train_reduction, y_train)"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 13,
257 |    "metadata": {},
258 |    "outputs": [
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "CPU times: user 1min 10s, sys: 313 ms, total: 1min 10s\nWall time: 1min 11s\n"
264 |      ]
265 |     },
266 |     {
267 |      "data": {
268 |       "text/plain": [
269 |        "0.9728"
270 |       ]
271 |      },
272 |      "execution_count": 13,
273 |      "metadata": {},
274 |      "output_type": "execute_result"
275 |     }
276 |    ],
277 |    "source": [
278 |     "%time pca_knn_clf.score(X_test_reduction, y_test)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "可以看到，所需时间大幅下降，而且准确率竟然还稍微提升了。  \n",
286 |     "这是因为PCA在丢失信息的同时，也会把一部分噪音丢掉，起到了降噪的作用。"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": null,
292 |    "metadata": {},
293 |    "outputs": [],
294 |    "source": []
295 |   }
296 |  ],
297 |  "metadata": {
298 |   "kernelspec": {
299 |    "display_name": "Python 2",
300 |    "language": "python",
301 |    "name": "python2"
302 |   },
303 |   "language_info": {
304 |    "codemirror_mode": {
305 |     "name": "ipython",
306 |     "version": 2
307 |    },
308 |    "file_extension": ".py",
309 |    "mimetype": "text/x-python",
310 |    "name": "python",
311 |    "nbconvert_exporter": "python",
312 |    "pygments_lexer": "ipython2",
313 |    "version": "2.7.6"
314 |   }
315 |  },
316 |  "nbformat": 4,
317 |  "nbformat_minor": 0
318 | }
319 | 


--------------------------------------------------------------------------------
/c3_gradient_descent/08_Gradient_Debugging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 如何调试梯度"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 26,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 27,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "np.random.seed(666)\n",
 29 |     "X = np.random.random(size=(1000,10))"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 28,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "true_theta = np.arange(1,12,dtype=float)\n",
 39 |     "X_b = np.hstack((np.ones(shape=(len(X),1)),X))\n",
 40 |     "y = X_b.dot(true_theta) + np.random.normal(size=1000)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 29,
 46 |    "metadata": {},
 47 |    "outputs": [
 48 |     {
 49 |      "data": {
 50 |       "text/plain": [
 51 |        "(1000, 10)"
 52 |       ]
 53 |      },
 54 |      "execution_count": 29,
 55 |      "metadata": {},
 56 |      "output_type": "execute_result"
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "X.shape"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 30,
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/plain": [
 71 |        "(1000,)"
 72 |       ]
 73 |      },
 74 |      "execution_count": 30,
 75 |      "metadata": {},
 76 |      "output_type": "execute_result"
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "y.shape"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 31,
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "data": {
 90 |       "text/plain": [
 91 |        "array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.])"
 92 |       ]
 93 |      },
 94 |      "execution_count": 31,
 95 |      "metadata": {},
 96 |      "output_type": "execute_result"
 97 |     }
 98 |    ],
 99 |    "source": [
100 |     "true_theta"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 32,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "def J(theta, X_b, y):\n",
110 |     "    try:\n",
111 |     "        return np.sum((y - X_b.dot(theta))**2) / len(X_b)\n",
112 |     "    except:\n",
113 |     "        return float('inf')"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 33,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "# 数学推导的求梯度\n",
123 |     "def derivative_J_math(theta, X_b, y):\n",
124 |     "    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 34,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "# 调试用的求梯度\n",
134 |     "def derivative_J_debug(theta, X_b, y, epsilon=0.01):\n",
135 |     "    res = np.empty(len(theta))\n",
136 |     "    for i in range(len(theta)):\n",
137 |     "        theta_1 = theta.copy()\n",
138 |     "        theta_1[i] += epsilon\n",
139 |     "        theta_2 = theta.copy()\n",
140 |     "        theta_2[i] -= epsilon\n",
141 |     "        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2*epsilon)\n",
142 |     "    return res\n",
143 |     "        "
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 35,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "# 批量梯度下降训练法\n",
153 |     "def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):\n",
154 |     "    theta = initial_theta\n",
155 |     "    cur_iter = 0\n",
156 |     "    \n",
157 |     "    while cur_iter < n_iters:\n",
158 |     "        gradient = dJ(theta, X_b, y)\n",
159 |     "        last_theta = theta\n",
160 |     "        theta = theta - eta * gradient\n",
161 |     "        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n",
162 |     "            break\n",
163 |     "        cur_iter += 1\n",
164 |     "    return theta"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 36,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "X_b = np.hstack((np.ones(shape=(len(X),1)),X))\n",
174 |     "initial_theta = np.zeros(X_b.shape[1])\n",
175 |     "eta = 0.01"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "## 调试用梯度下降的效果"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 37,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "CPU times: user 6.97 s, sys: 2.86 s, total: 9.83 s\nWall time: 9.17 s\n"
195 |      ]
196 |     },
197 |     {
198 |      "data": {
199 |       "text/plain": [
200 |        "array([  1.1251597 ,   2.05312521,   2.91522497,   4.11895968,\n         5.05002117,   5.90494046,   6.97383745,   8.00088367,\n         8.86213468,   9.98608331,  10.90529198])"
201 |       ]
202 |      },
203 |      "execution_count": 37,
204 |      "metadata": {},
205 |      "output_type": "execute_result"
206 |     }
207 |    ],
208 |    "source": [
209 |     "%time theta = gradient_descent(derivative_J_debug, X_b, y, initial_theta, eta)\n",
210 |     "theta"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "## 数学方式梯度下降的效果"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 38,
223 |    "metadata": {},
224 |    "outputs": [
225 |     {
226 |      "name": "stdout",
227 |      "output_type": "stream",
228 |      "text": [
229 |       "CPU times: user 833 ms, sys: 340 ms, total: 1.17 s\nWall time: 1.06 s\n"
230 |      ]
231 |     },
232 |     {
233 |      "data": {
234 |       "text/plain": [
235 |        "array([  1.1251597 ,   2.05312521,   2.91522497,   4.11895968,\n         5.05002117,   5.90494046,   6.97383745,   8.00088367,\n         8.86213468,   9.98608331,  10.90529198])"
236 |       ]
237 |      },
238 |      "execution_count": 38,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "%time theta = gradient_descent(derivative_J_math, X_b, y, initial_theta, eta)\n",
245 |     "theta"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 39,
251 |    "metadata": {},
252 |    "outputs": [
253 |     {
254 |      "data": {
255 |       "text/plain": [
256 |        "0.90497929349193762"
257 |       ]
258 |      },
259 |      "execution_count": 39,
260 |      "metadata": {},
261 |      "output_type": "execute_result"
262 |     }
263 |    ],
264 |    "source": [
265 |     "J(theta, X_b, y)"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 40,
271 |    "metadata": {},
272 |    "outputs": [
273 |     {
274 |      "data": {
275 |       "text/plain": [
276 |        "0.91015768339662462"
277 |       ]
278 |      },
279 |      "execution_count": 40,
280 |      "metadata": {},
281 |      "output_type": "execute_result"
282 |     }
283 |    ],
284 |    "source": [
285 |     "J(true_theta,X_b,y)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "两者结果几乎完全一致，但调试用的方式慢得多：它对 theta 的每个分量都要计算两次 J，因此更适合用来验证数学推导的梯度是否正确。"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": null,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": []
301 |   }
302 |  ],
303 |  "metadata": {
304 |   "kernelspec": {
305 |    "display_name": "Python 2",
306 |    "language": "python",
307 |    "name": "python2"
308 |   },
309 |   "language_info": {
310 |    "codemirror_mode": {
311 |     "name": "ipython",
312 |     "version": 2
313 |    },
314 |    "file_extension": ".py",
315 |    "mimetype": "text/x-python",
316 |    "name": "python",
317 |    "nbconvert_exporter": "python",
318 |    "pygments_lexer": "ipython2",
319 |    "version": "2.7.6"
320 |   }
321 |  },
322 |  "nbformat": 4,
323 |  "nbformat_minor": 0
324 | }
325 | 


--------------------------------------------------------------------------------
/c5_polynomial_regression/06_Validation_and_Cross_Validation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 交叉验证"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "digits = datasets.load_digits()\n",
 29 |     "X = digits.data\n",
 30 |     "y = digits.target"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## 测试train_test_split"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 4,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "from sklearn.model_selection._split import train_test_split\n",
 47 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=666)"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 5,
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "Best K = 3\nBest P = 4\nBest Score = 0.986091794159\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "from sklearn.neighbors.classification import KNeighborsClassifier\n",
 65 |     "\n",
 66 |     "best_score, best_p, best_k = 0, 0, 0\n",
 67 |     "for k in range(2, 11):\n",
 68 |     "    for p in range(1, 6):\n",
 69 |     "        knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)\n",
 70 |     "        knn_clf.fit(X_train, y_train)\n",
 71 |     "        score = knn_clf.score(X_test, y_test)\n",
 72 |     "        if score > best_score:\n",
 73 |     "            best_score, best_p, best_k = score, p, k\n",
 74 |     "print('Best K =', best_k)\n",
 75 |     "print('Best P =', best_p)\n",
 76 |     "print('Best Score =', best_score)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "## 使用交叉验证"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 6,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "data": {
 93 |       "text/plain": [
 94 |        "array([ 0.98895028,  0.97777778,  0.96629213])"
 95 |       ]
 96 |      },
 97 |      "execution_count": 6,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "from sklearn.model_selection._validation import cross_val_score\n",
104 |     "\n",
105 |     "knn_clf = KNeighborsClassifier()\n",
106 |     "cross_val_score(knn_clf, X_train, y_train)\n",
107 |     "# 结果返回3个数，表示默认是分为3份做交叉验证"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 7,
113 |    "metadata": {},
114 |    "outputs": [
115 |     {
116 |      "name": "stdout",
117 |      "output_type": "stream",
118 |      "text": [
119 |       "Best K = 2\nBest P = 2\nBest Score = 0.982359987401\n"
120 |      ]
121 |     }
122 |    ],
123 |    "source": [
124 |     "best_score, best_p, best_k = 0, 0, 0\n",
125 |     "for k in range(2, 11):\n",
126 |     "    for p in range(1, 6):\n",
127 |     "        knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)\n",
128 |     "        scores = cross_val_score(knn_clf, X_train, y_train)\n",
129 |     "        score = np.mean(scores)\n",
130 |     "        if score > best_score:\n",
131 |     "            best_score, best_p, best_k = score, p, k\n",
132 |     "print('Best K =', best_k)\n",
133 |     "print('Best P =', best_p)\n",
134 |     "print('Best Score =', best_score)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "`cross_val_score(knn_clf, X_train, y_train)`  \n",
142 |     "可以看到，在使用交叉验证寻找最佳超参数的过程中，完全没有用到测试集。"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "## 回顾网格搜索  \n",
150 |     "网格搜索（GridSearchCV）内部其实就是用交叉验证来评估每一组超参数"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 8,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "name": "stdout",
160 |      "output_type": "stream",
161 |      "text": [
162 |       "Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
163 |      ]
164 |     },
165 |     {
166 |      "name": "stderr",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  1.1min finished\n"
170 |      ]
171 |     },
172 |     {
173 |      "data": {
174 |       "text/plain": [
175 |        "GridSearchCV(cv=None, error_score='raise',\n       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=10, p=5,\n           weights='distance'),\n       fit_params=None, iid=True, n_jobs=1,\n       param_grid=[{'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n       scoring=None, verbose=1)"
176 |       ]
177 |      },
178 |      "execution_count": 8,
179 |      "metadata": {},
180 |      "output_type": "execute_result"
181 |     }
182 |    ],
183 |    "source": [
184 |     "from sklearn.model_selection._search import GridSearchCV\n",
185 |     "param_grid = [\n",
186 |     "    {\n",
187 |     "        'weights':['distance'],\n",
188 |     "        'n_neighbors':[i for i in range(2,11)],\n",
189 |     "        'p': [i for i in range(1,6)]\n",
190 |     "    }\n",
191 |     "]\n",
192 |     "\n",
193 |     "grid_search = GridSearchCV(knn_clf, param_grid, verbose=1)\n",
194 |     "grid_search.fit(X_train, y_train)\n"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 9,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "0.98237476808905377"
206 |       ]
207 |      },
208 |      "execution_count": 9,
209 |      "metadata": {},
210 |      "output_type": "execute_result"
211 |     }
212 |    ],
213 |    "source": [
214 |     "grid_search.best_score_"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 10,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "data": {
224 |       "text/plain": [
225 |        "{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}"
226 |       ]
227 |      },
228 |      "execution_count": 10,
229 |      "metadata": {},
230 |      "output_type": "execute_result"
231 |     }
232 |    ],
233 |    "source": [
234 |     "grid_search.best_params_\n",
235 |     "# 与我们上面手动调用交叉验证得到的超参数一致"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 11,
241 |    "metadata": {},
242 |    "outputs": [
243 |     {
244 |      "data": {
245 |       "text/plain": [
246 |        "0.98052851182197498"
247 |       ]
248 |      },
249 |      "execution_count": 11,
250 |      "metadata": {},
251 |      "output_type": "execute_result"
252 |     }
253 |    ],
254 |    "source": [
255 |     "best_knn_clf = grid_search.best_estimator_\n",
256 |     "best_knn_clf.score(X_test, y_test)"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 12,
262 |    "metadata": {},
263 |    "outputs": [
264 |     {
265 |      "data": {
266 |       "text/plain": [
267 |        "array([ 0.99543379,  0.96803653,  0.98148148,  0.96261682,  0.97619048])"
268 |       ]
269 |      },
270 |      "execution_count": 12,
271 |      "metadata": {},
272 |      "output_type": "execute_result"
273 |     }
274 |    ],
275 |    "source": [
276 |     "# cross_val_score 默认是分3份，如果要分5份：\n",
277 |     "cross_val_score(knn_clf, X_train, y_train, cv=5)"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {},
284 |    "outputs": [],
285 |    "source": [
286 |     "# GridSearchCV 中的交叉验证，如果要分为5份：\n",
287 |     "GridSearchCV(knn_clf, param_grid, verbose=1, cv=5)"
288 |    ]
289 |   }
290 |  ],
291 |  "metadata": {
292 |   "kernelspec": {
293 |    "display_name": "Python 2",
294 |    "language": "python",
295 |    "name": "python2"
296 |   },
297 |   "language_info": {
298 |    "codemirror_mode": {
299 |     "name": "ipython",
300 |     "version": 2
301 |    },
302 |    "file_extension": ".py",
303 |    "mimetype": "text/x-python",
304 |    "name": "python",
305 |    "nbconvert_exporter": "python",
306 |    "pygments_lexer": "ipython2",
307 |    "version": "2.7.6"
308 |   }
309 |  },
310 |  "nbformat": 4,
311 |  "nbformat_minor": 0
312 | }
313 | 


--------------------------------------------------------------------------------
/c7_classification_performance_measures/03_implement_confusion_matrix_precision_and_recall.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 实现混淆矩阵，精准率和召回率"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "digits = datasets.load_digits()\n",
 29 |     "X = digits.data\n",
 30 |     "y = digits.target.copy()\n",
 31 |     "\n",
 32 |     "# 把数据变为极度偏斜的数据\n",
 33 |     "# 把手写数字分为9和非9两大类， 重点关注的是分类为9的数字\n",
 34 |     "y[digits.target==9] = 1\n",
 35 |     "y[digits.target!=9] = 0"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 4,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "from sklearn.model_selection._split import train_test_split\n",
 45 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 5,
 51 |    "metadata": {},
 52 |    "outputs": [
 53 |     {
 54 |      "data": {
 55 |       "text/plain": [
 56 |        "0.97555555555555551"
 57 |       ]
 58 |      },
 59 |      "execution_count": 5,
 60 |      "metadata": {},
 61 |      "output_type": "execute_result"
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "from sklearn.linear_model.logistic import LogisticRegression\n",
 66 |     "\n",
 67 |     "log_reg = LogisticRegression()\n",
 68 |     "log_reg.fit(X_train, y_train)\n",
 69 |     "log_reg.score(X_test, y_test)"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "虽然约0.9756的准确率看上去很高，但因为我们的数据是极度偏斜的数据，即使把全部样本都预测为\"非9\"，也会有0.9左右的准确率"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 6,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "y_predict = log_reg.predict(X_test)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "## 求TP，FP，FN，TN的值"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 7,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "data": {
102 |       "text/plain": [
103 |        "403"
104 |       ]
105 |      },
106 |      "execution_count": 7,
107 |      "metadata": {},
108 |      "output_type": "execute_result"
109 |     }
110 |    ],
111 |    "source": [
112 |     "def TN(y_true, y_predict):\n",
113 |     "    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
114 |     "    return np.sum((y_true == 0) & (y_predict == 0))\n",
115 |     "\n",
116 |     "TN(y_test, y_predict)"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 8,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "2"
128 |       ]
129 |      },
130 |      "execution_count": 8,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "def FP(y_true, y_predict):\n",
137 |     "    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
138 |     "    return np.sum((y_true == 0) & (y_predict == 1))\n",
139 |     "\n",
140 |     "FP(y_test, y_predict)"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 9,
146 |    "metadata": {},
147 |    "outputs": [
148 |     {
149 |      "data": {
150 |       "text/plain": [
151 |        "9"
152 |       ]
153 |      },
154 |      "execution_count": 9,
155 |      "metadata": {},
156 |      "output_type": "execute_result"
157 |     }
158 |    ],
159 |    "source": [
160 |     "def FN(y_true, y_predict):\n",
161 |     "    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
162 |     "    return np.sum((y_true == 1) & (y_predict == 0))\n",
163 |     "\n",
164 |     "FN(y_test, y_predict)"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 10,
170 |    "metadata": {},
171 |    "outputs": [
172 |     {
173 |      "data": {
174 |       "text/plain": [
175 |        "36"
176 |       ]
177 |      },
178 |      "execution_count": 10,
179 |      "metadata": {},
180 |      "output_type": "execute_result"
181 |     }
182 |    ],
183 |    "source": [
184 |     "def TP(y_true, y_predict):\n",
185 |     "    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
186 |     "    return np.sum((y_true == 1) & (y_predict == 1))\n",
187 |     "\n",
188 |     "TP(y_test, y_predict)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 12,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "array([[403,   2],\n       [  9,  36]])"
200 |       ]
201 |      },
202 |      "execution_count": 12,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "def confusion_matrix(y_true, y_predict):\n",
209 |     "    \"\"\"返回一个2✖️2的混淆矩阵\"\"\"\n",
210 |     "    return np.array([\n",
211 |     "        [TN(y_true, y_predict), FP(y_true, y_predict)],\n",
212 |     "        [FN(y_true, y_predict), TP(y_true, y_predict)]\n",
213 |     "    ])\n",
214 |     "\n",
215 |     "confusion_matrix(y_test, y_predict)"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "## 根据混淆矩阵求精准率和召回率"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 13,
228 |    "metadata": {},
229 |    "outputs": [
230 |     {
231 |      "data": {
232 |       "text/plain": [
233 |        "0.94736842105263153"
234 |       ]
235 |      },
236 |      "execution_count": 13,
237 |      "metadata": {},
238 |      "output_type": "execute_result"
239 |     }
240 |    ],
241 |    "source": [
242 |     "def precision_score(y_true, y_predict):\n",
243 |     "    \"\"\"求精准率\"\"\"\n",
244 |     "    tp = TP(y_true, y_predict)\n",
245 |     "    fp = FP(y_true, y_predict)\n",
246 |     "    try:\n",
247 |     "        return tp / (tp + fp)\n",
248 |     "    except:     # 分母为0时，结果返回0\n",
249 |     "        return 0.0\n",
250 |     "\n",
251 |     "# 精准率\n",
252 |     "precision_score(y_test, y_predict)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": 14,
258 |    "metadata": {},
259 |    "outputs": [
260 |     {
261 |      "data": {
262 |       "text/plain": [
263 |        "0.80000000000000004"
264 |       ]
265 |      },
266 |      "execution_count": 14,
267 |      "metadata": {},
268 |      "output_type": "execute_result"
269 |     }
270 |    ],
271 |    "source": [
272 |     "def recall_score(y_true, y_predict):\n",
273 |     "    \"\"\"求召回率\"\"\"\n",
274 |     "    tp = TP(y_true, y_predict)\n",
275 |     "    fn = FN(y_true, y_predict)\n",
276 |     "    try:\n",
277 |     "        return tp / (tp + fn)\n",
278 |     "    except:\n",
279 |     "        return 0.0\n",
280 |     "\n",
281 |     "# 召回率\n",
282 |     "recall_score(y_test, y_predict)"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "markdown",
287 |    "metadata": {},
288 |    "source": [
289 |     "# scikit-learn中的混淆矩阵，精准率和召回率"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "混淆矩阵"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 15,
302 |    "metadata": {},
303 |    "outputs": [
304 |     {
305 |      "data": {
306 |       "text/plain": [
307 |        "array([[403,   2],\n       [  9,  36]])"
308 |       ]
309 |      },
310 |      "execution_count": 15,
311 |      "metadata": {},
312 |      "output_type": "execute_result"
313 |     }
314 |    ],
315 |    "source": [
316 |     "import sklearn.metrics.classification as classification\n",
317 |     "classification.confusion_matrix(y_test, y_predict)"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "markdown",
322 |    "metadata": {},
323 |    "source": [
324 |     "精准率"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 16,
330 |    "metadata": {},
331 |    "outputs": [
332 |     {
333 |      "data": {
334 |       "text/plain": [
335 |        "0.94736842105263153"
336 |       ]
337 |      },
338 |      "execution_count": 16,
339 |      "metadata": {},
340 |      "output_type": "execute_result"
341 |     }
342 |    ],
343 |    "source": [
344 |     "classification.precision_score(y_test, y_predict)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "召回率"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 17,
357 |    "metadata": {},
358 |    "outputs": [
359 |     {
360 |      "data": {
361 |       "text/plain": [
362 |        "0.80000000000000004"
363 |       ]
364 |      },
365 |      "execution_count": 17,
366 |      "metadata": {},
367 |      "output_type": "execute_result"
368 |     }
369 |    ],
370 |    "source": [
371 |     "classification.recall_score(y_test, y_predict)"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": []
380 |   }
381 |  ],
382 |  "metadata": {
383 |   "kernelspec": {
384 |    "display_name": "Python 2",
385 |    "language": "python",
386 |    "name": "python2"
387 |   },
388 |   "language_info": {
389 |    "codemirror_mode": {
390 |     "name": "ipython",
391 |     "version": 2
392 |    },
393 |    "file_extension": ".py",
394 |    "mimetype": "text/x-python",
395 |    "name": "python",
396 |    "nbconvert_exporter": "python",
397 |    "pygments_lexer": "ipython2",
398 |    "version": "2.7.6"
399 |   }
400 |  },
401 |  "nbformat": 4,
402 |  "nbformat_minor": 0
403 | }
404 | 


--------------------------------------------------------------------------------
/c3_gradient_descent/05_Vectorize_Gradient_Descent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 梯度下降的向量化"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "boston = datasets.load_boston()\n",
 29 |     "X = boston.data\n",
 30 |     "y = boston.target\n",
 31 |     "\n",
 32 |     "X = X[y < 50.0]\n",
 33 |     "y = y[y < 50.0]"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 3,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "from playML.model_selection import train_test_split\n",
 43 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "## 使用正规方程解法"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 4,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "CPU times: user 827 µs, sys: 2.74 ms, total: 3.57 ms\nWall time: 8.71 ms\n"
 63 |      ]
 64 |     },
 65 |     {
 66 |      "data": {
 67 |       "text/plain": [
 68 |        "0.81298026026586467"
 69 |       ]
 70 |      },
 71 |      "execution_count": 4,
 72 |      "metadata": {},
 73 |      "output_type": "execute_result"
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "from playML.linear_regression import LinearRegression\n",
 78 |     "\n",
 79 |     "lin_reg1 = LinearRegression()\n",
 80 |     "%time lin_reg1.fit_normal(X_train, y_train)\n",
 81 |     "lin_reg1.score(X_test, y_test)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "## 使用梯度下降法"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 5,
 94 |    "metadata": {},
 95 |    "outputs": [
 96 |     {
 97 |      "name": "stderr",
 98 |      "output_type": "stream",
 99 |      "text": [
100 |       "/Users/SeaMonster/PycharmProjects/MachineLearningClassicAlgorithm/playML/linear_regression.py:29: RuntimeWarning: overflow encountered in square\n  return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)\n/Users/SeaMonster/PycharmProjects/MachineLearningClassicAlgorithm/playML/linear_regression.py:59: RuntimeWarning: invalid value encountered in double_scalars\n  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n"
101 |      ]
102 |     },
103 |     {
104 |      "data": {
105 |       "text/plain": [
106 |        "nan"
107 |       ]
108 |      },
109 |      "execution_count": 5,
110 |      "metadata": {},
111 |      "output_type": "execute_result"
112 |     }
113 |    ],
114 |    "source": [
115 |     "lin_reg2 = LinearRegression()\n",
116 |     "lin_reg2.fit_gd(X_train, y_train)\n",
117 |     "lin_reg2.score(X_test, y_test)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 6,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "data": {
127 |       "text/plain": [
128 |        "array([ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,\n        nan,  nan])"
129 |       ]
130 |      },
131 |      "execution_count": 6,
132 |      "metadata": {},
133 |      "output_type": "execute_result"
134 |     }
135 |    ],
136 |    "source": [
137 |     "lin_reg2.coef_"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "### 调整步长η"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 7,
150 |    "metadata": {},
151 |    "outputs": [
152 |     {
153 |      "data": {
154 |       "text/plain": [
155 |        "0.27556634853389206"
156 |       ]
157 |      },
158 |      "execution_count": 7,
159 |      "metadata": {},
160 |      "output_type": "execute_result"
161 |     }
162 |    ],
163 |    "source": [
164 |     "lin_reg2.fit_gd(X_train, y_train, eta=1e-6)\n",
165 |     "lin_reg2.score(X_test, y_test)"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "metadata": {},
171 |    "source": [
172 |     "结果很差。。。  \n",
173 |     "那么，增加循环次数呢？"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 8,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "name": "stdout",
183 |      "output_type": "stream",
184 |      "text": [
185 |       "CPU times: user 36 s, sys: 159 ms, total: 36.1 s\nWall time: 36.4 s\n"
186 |      ]
187 |     },
188 |     {
189 |      "data": {
190 |       "text/plain": [
191 |        "0.75418523539807647"
192 |       ]
193 |      },
194 |      "execution_count": 8,
195 |      "metadata": {},
196 |      "output_type": "execute_result"
197 |     }
198 |    ],
199 |    "source": [
200 |     "%time lin_reg2.fit_gd(X_train, y_train, eta=1e-6, n_iters=1e6)\n",
201 |     "lin_reg2.score(X_test, y_test)"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "训练时间很长，但是损失函数仍然远未达到最小"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "markdown",
213 |    "metadata": {},
214 |    "source": [
215 |     "## 使用梯度下降法前，最好进行数据归一化"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 9,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "from sklearn.preprocessing.data import StandardScaler"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 10,
230 |    "metadata": {},
231 |    "outputs": [
232 |     {
233 |      "data": {
234 |       "text/plain": [
235 |        "StandardScaler(copy=True, with_mean=True, with_std=True)"
236 |       ]
237 |      },
238 |      "execution_count": 10,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "standard_scaler = StandardScaler()\n",
245 |     "standard_scaler.fit(X_train)"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 11,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "X_train_standard = standard_scaler.transform(X_train)\n",
255 |     "X_test_standard = standard_scaler.transform(X_test)"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 12,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "CPU times: user 212 ms, sys: 5.18 ms, total: 217 ms\nWall time: 223 ms\n"
268 |      ]
269 |     },
270 |     {
271 |      "data": {
272 |       "text/plain": [
273 |        "0.81298806201222351"
274 |       ]
275 |      },
276 |      "execution_count": 12,
277 |      "metadata": {},
278 |      "output_type": "execute_result"
279 |     }
280 |    ],
281 |    "source": [
282 |     "lin_reg3 = LinearRegression()\n",
283 |     "%time lin_reg3.fit_gd(X_train_standard, y_train)\n",
284 |     "lin_reg3.score(X_test_standard, y_test)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "## 梯度下降法的优势"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 13,
297 |    "metadata": {},
298 |    "outputs": [],
299 |    "source": [
300 |     "m = 1000\n",
301 |     "n = 5000\n",
302 |     "\n",
303 |     "big_X = np.random.normal(size=(m,n))\n",
304 |     "\n",
305 |     "true_theta = np.random.uniform(0.0, 100.0, size=n+1) #最终要求（或者说，尽可能接近）的系数和截距\n",
306 |     "\n",
307 |     "big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0.,10.,size=m)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 14,
313 |    "metadata": {},
314 |    "outputs": [
315 |     {
316 |      "name": "stdout",
317 |      "output_type": "stream",
318 |      "text": [
319 |       "CPU times: user 25.9 s, sys: 643 ms, total: 26.6 s\nWall time: 9.16 s\n"
320 |      ]
321 |     },
322 |     {
323 |      "data": {
324 |       "text/plain": [
325 |        "LinearRegression()"
326 |       ]
327 |      },
328 |      "execution_count": 14,
329 |      "metadata": {},
330 |      "output_type": "execute_result"
331 |     }
332 |    ],
333 |    "source": [
334 |     "big_reg1 = LinearRegression()\n",
335 |     "# 主要是看看训练时间，所以就不用train test split了\n",
336 |     "%time big_reg1.fit_normal(big_X, big_y)"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 15,
342 |    "metadata": {},
343 |    "outputs": [
344 |     {
345 |      "name": "stdout",
346 |      "output_type": "stream",
347 |      "text": [
348 |       "CPU times: user 8.58 s, sys: 139 ms, total: 8.72 s\nWall time: 5.01 s\n"
349 |      ]
350 |     },
351 |     {
352 |      "data": {
353 |       "text/plain": [
354 |        "LinearRegression()"
355 |       ]
356 |      },
357 |      "execution_count": 15,
358 |      "metadata": {},
359 |      "output_type": "execute_result"
360 |     }
361 |    ],
362 |    "source": [
363 |     "big_reg2 = LinearRegression()\n",
364 |     "# X 本身就都是标准差为1，均值为0的，所以就不用归一化了\n",
365 |     "%time big_reg2.fit_gd(big_X, big_y)"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "markdown",
370 |    "metadata": {},
371 |    "source": [
372 |     "这个例子中，特征数比较大，梯度下降法比正规方程解法快得多"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": []
381 |   }
382 |  ],
383 |  "metadata": {
384 |   "kernelspec": {
385 |    "display_name": "Python 2",
386 |    "language": "python",
387 |    "name": "python2"
388 |   },
389 |   "language_info": {
390 |    "codemirror_mode": {
391 |     "name": "ipython",
392 |     "version": 2
393 |    },
394 |    "file_extension": ".py",
395 |    "mimetype": "text/x-python",
396 |    "name": "python",
397 |    "nbconvert_exporter": "python",
398 |    "pygments_lexer": "ipython2",
399 |    "version": "2.7.6"
400 |   }
401 |  },
402 |  "nbformat": 4,
403 |  "nbformat_minor": 0
404 | }
405 | 


--------------------------------------------------------------------------------
/c3_gradient_descent/07_SGD_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 使用我们自己的SGD"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "m = 100000\n",
 29 |     "\n",
 30 |     "x = np.random.normal(size=m)\n",
 31 |     "X = x.reshape(-1,1)\n",
 32 |     "y = 4.*x + 3. + np.random.normal(0,3,size=m)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from playML import linear_regression\n",
 42 |     "\n",
 43 |     "lin_reg = linear_regression.LinearRegression()\n",
 44 |     "lin_reg.fit_sgd(X, y, n_iters=2)"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 4,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "data": {
 54 |       "text/plain": [
 55 |        "array([ 4.00642662])"
 56 |       ]
 57 |      },
 58 |      "execution_count": 4,
 59 |      "metadata": {},
 60 |      "output_type": "execute_result"
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "lin_reg.coef_"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 5,
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "data": {
 74 |       "text/plain": [
 75 |        "2.9918217520705057"
 76 |       ]
 77 |      },
 78 |      "execution_count": 5,
 79 |      "metadata": {},
 80 |      "output_type": "execute_result"
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "lin_reg.interception_"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "## 在真实数据上使用我们自己的SGD"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 6,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "from sklearn import datasets\n",
101 |     "boston = datasets.load_boston()\n",
102 |     "\n",
103 |     "X = boston.data\n",
104 |     "y = boston.target\n",
105 |     "\n",
106 |     "X = X[y<50.0]\n",
107 |     "y = y[y<50.0]"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 7,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "from playML.model_selection import train_test_split\n",
117 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "### 使用随机梯度下降法训练前，需要对数据进行归一化处理"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 8,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "from sklearn.preprocessing.data import StandardScaler\n",
134 |     "\n",
135 |     "standard_scaler = StandardScaler()\n",
136 |     "standard_scaler.fit(X_train)\n",
137 |     "X_train_standard = standard_scaler.transform(X_train)\n",
138 |     "X_test_standard = standard_scaler.transform(X_test)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {},
144 |    "source": [
145 |     "### 看看效果"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 10,
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "name": "stdout",
155 |      "output_type": "stream",
156 |      "text": [
157 |       "CPU times: user 10.5 ms, sys: 5.5 ms, total: 16 ms\nWall time: 11.2 ms\n"
158 |      ]
159 |     },
160 |     {
161 |      "data": {
162 |       "text/plain": [
163 |        "0.79233295554251493"
164 |       ]
165 |      },
166 |      "execution_count": 10,
167 |      "metadata": {},
168 |      "output_type": "execute_result"
169 |     }
170 |    ],
171 |    "source": [
172 |     "lin_reg = linear_regression.LinearRegression()\n",
173 |     "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)\n",
174 |     "lin_reg.score(X_test_standard, y_test)"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "#### 增大循环次数，效果会越来越好么？"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 11,
187 |    "metadata": {},
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "CPU times: user 148 ms, sys: 6.43 ms, total: 154 ms\nWall time: 170 ms\n"
194 |      ]
195 |     },
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "0.81324404894409663"
200 |       ]
201 |      },
202 |      "execution_count": 11,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)\n",
209 |     "lin_reg.score(X_test_standard, y_test)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 12,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "name": "stdout",
219 |      "output_type": "stream",
220 |      "text": [
221 |       "CPU times: user 290 ms, sys: 7.03 ms, total: 297 ms\nWall time: 338 ms\n"
222 |      ]
223 |     },
224 |     {
225 |      "data": {
226 |       "text/plain": [
227 |        "0.81316850059297174"
228 |       ]
229 |      },
230 |      "execution_count": 12,
231 |      "metadata": {},
232 |      "output_type": "execute_result"
233 |     }
234 |    ],
235 |    "source": [
236 |     "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)\n",
237 |     "lin_reg.score(X_test_standard, y_test)"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 13,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "name": "stdout",
247 |      "output_type": "stream",
248 |      "text": [
249 |       "CPU times: user 1.23 s, sys: 12.8 ms, total: 1.24 s\nWall time: 1.27 s\n"
250 |      ]
251 |     },
252 |     {
253 |      "data": {
254 |       "text/plain": [
255 |        "0.81207491088465589"
256 |       ]
257 |      },
258 |      "execution_count": 13,
259 |      "metadata": {},
260 |      "output_type": "execute_result"
261 |     }
262 |    ],
263 |    "source": [
264 |     "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=500)\n",
265 |     "lin_reg.score(X_test_standard, y_test)"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "也不是越来越好，只能说比较收敛吧。。。"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "markdown",
277 |    "metadata": {},
278 |    "source": [
279 |     "## scikit-learn中的SGD"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 14,
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "from sklearn.linear_model.stochastic_gradient import SGDRegressor"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 15,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "CPU times: user 2.76 ms, sys: 2.83 ms, total: 5.59 ms\nWall time: 8.37 ms\n"
301 |      ]
302 |     },
303 |     {
304 |      "name": "stderr",
305 |      "output_type": "stream",
306 |      "text": [
307 |       "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDRegressor'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n  \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n"
308 |      ]
309 |     },
310 |     {
311 |      "data": {
312 |       "text/plain": [
313 |        "0.80386489308947862"
314 |       ]
315 |      },
316 |      "execution_count": 15,
317 |      "metadata": {},
318 |      "output_type": "execute_result"
319 |     }
320 |    ],
321 |    "source": [
322 |     "sgd_reg = SGDRegressor()\n",
323 |     "%time sgd_reg.fit(X_train_standard, y_train)\n",
324 |     "sgd_reg.score(X_test_standard, y_test)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 16,
330 |    "metadata": {},
331 |    "outputs": [
332 |     {
333 |      "name": "stdout",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "CPU times: user 6.5 ms, sys: 1.69 ms, total: 8.19 ms\nWall time: 6.22 ms\n"
337 |      ]
338 |     },
339 |     {
340 |      "name": "stderr",
341 |      "output_type": "stream",
342 |      "text": [
343 |       "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n  DeprecationWarning)\n"
344 |      ]
345 |     },
346 |     {
347 |      "data": {
348 |       "text/plain": [
349 |        "0.81255341149152971"
350 |       ]
351 |      },
352 |      "execution_count": 16,
353 |      "metadata": {},
354 |      "output_type": "execute_result"
355 |     }
356 |    ],
357 |    "source": [
358 |     "sgd_reg = SGDRegressor(n_iter=100)\n",
359 |     "%time sgd_reg.fit(X_train_standard, y_train)\n",
360 |     "sgd_reg.score(X_test_standard, y_test)"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "markdown",
365 |    "metadata": {},
366 |    "source": [
367 |     "比我们自己手写的快得多了。。。"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": []
376 |   }
377 |  ],
378 |  "metadata": {
379 |   "kernelspec": {
380 |    "display_name": "Python 2",
381 |    "language": "python",
382 |    "name": "python2"
383 |   },
384 |   "language_info": {
385 |    "codemirror_mode": {
386 |     "name": "ipython",
387 |     "version": 2
388 |    },
389 |    "file_extension": ".py",
390 |    "mimetype": "text/x-python",
391 |    "name": "python",
392 |    "nbconvert_exporter": "python",
393 |    "pygments_lexer": "ipython2",
394 |    "version": "2.7.6"
395 |   }
396 |  },
397 |  "nbformat": 4,
398 |  "nbformat_minor": 0
399 | }
400 | 


--------------------------------------------------------------------------------
/c2_linear_regression/09_Regression_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# scikit-learn中的回归问题"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt\n",
 20 |     "from sklearn import datasets"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "boston = datasets.load_boston()\n",
 30 |     "\n",
 31 |     "X = boston.data\n",
 32 |     "y = boston.target\n",
 33 |     "\n",
 34 |     "X = X[y<50.0]\n",
 35 |     "y = y[y<50.0]"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 3,
 41 |    "metadata": {},
 42 |    "outputs": [
 43 |     {
 44 |      "data": {
 45 |       "text/plain": [
 46 |        "(490, 13)"
 47 |       ]
 48 |      },
 49 |      "execution_count": 3,
 50 |      "metadata": {},
 51 |      "output_type": "execute_result"
 52 |     }
 53 |    ],
 54 |    "source": [
 55 |     "X.shape"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 4,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "from sklearn.model_selection._split import train_test_split\n",
 65 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "## scikit-learn中的线性回归"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 5,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "from sklearn.linear_model.base import LinearRegression\n",
 82 |     "\n",
 83 |     "lin_reg = LinearRegression()"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 6,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "name": "stderr",
 93 |      "output_type": "stream",
 94 |      "text": [
 95 |       "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/scipy/linalg/basic.py:1226: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n  warnings.warn(mesg, RuntimeWarning)\n"
 96 |      ]
 97 |     },
 98 |     {
 99 |      "data": {
100 |       "text/plain": [
101 |        "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
102 |       ]
103 |      },
104 |      "execution_count": 6,
105 |      "metadata": {},
106 |      "output_type": "execute_result"
107 |     }
108 |    ],
109 |    "source": [
110 |     "lin_reg.fit(X_train, y_train)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 7,
116 |    "metadata": {},
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "array([ -1.14235739e-01,   3.12783163e-02,  -4.30926281e-02,\n        -9.16425531e-02,  -1.09940036e+01,   3.49155727e+00,\n        -1.40778005e-02,  -1.06270960e+00,   2.45307516e-01,\n        -1.23179738e-02,  -8.80618320e-01,   8.43243544e-03,\n        -3.99667727e-01])"
122 |       ]
123 |      },
124 |      "execution_count": 7,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "lin_reg.coef_"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 8,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "32.645660839653509"
142 |       ]
143 |      },
144 |      "execution_count": 8,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "lin_reg.intercept_"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 9,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "0.80089161995191005"
162 |       ]
163 |      },
164 |      "execution_count": 9,
165 |      "metadata": {},
166 |      "output_type": "execute_result"
167 |     }
168 |    ],
169 |    "source": [
170 |     "lin_reg.score(X_test,y_test)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "metadata": {},
176 |    "source": [
177 |     "### KNN Regressor （KNN解决回归问题）"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 10,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "from sklearn.neighbors.regression import KNeighborsRegressor\n",
187 |     "\n",
188 |     "knn_reg = KNeighborsRegressor()"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "#### KNN数据归一化"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 11,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "from sklearn.preprocessing.data import StandardScaler\n",
205 |     "standard_scaler = StandardScaler()\n",
206 |     "standard_scaler.fit(X_train)\n",
207 |     "X_train_nor = standard_scaler.transform(X_train)\n",
208 |     "X_test_nor = standard_scaler.transform(X_test)"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 12,
214 |    "metadata": {},
215 |    "outputs": [
216 |     {
217 |      "data": {
218 |       "text/plain": [
219 |        "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n          weights='uniform')"
220 |       ]
221 |      },
222 |      "execution_count": 12,
223 |      "metadata": {},
224 |      "output_type": "execute_result"
225 |     }
226 |    ],
227 |    "source": [
228 |     "knn_reg.fit(X_train_nor, y_train)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 13,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "data": {
238 |       "text/plain": [
239 |        "0.82230080487286983"
240 |       ]
241 |      },
242 |      "execution_count": 13,
243 |      "metadata": {},
244 |      "output_type": "execute_result"
245 |     }
246 |    ],
247 |    "source": [
248 |     "knn_reg.score(X_test_nor, y_test)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "KNN 回归在测试集上的得分（约 0.822）比线性回归（约 0.801）还要高。  \n",
256 |     "如果再用网格搜索调整超参数，效果可能还会更好。"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 14,
262 |    "metadata": {},
263 |    "outputs": [
264 |     {
265 |      "name": "stdout",
266 |      "output_type": "stream",
267 |      "text": [
268 |       "Fitting 3 folds for each of 60 candidates, totalling 180 fits\n"
269 |      ]
270 |     },
271 |     {
272 |      "name": "stderr",
273 |      "output_type": "stream",
274 |      "text": [
275 |       "[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.5s finished\n"
276 |      ]
277 |     },
278 |     {
279 |      "data": {
280 |       "text/plain": [
281 |        "GridSearchCV(cv=None, error_score='raise',\n       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n          weights='uniform'),\n       fit_params=None, iid=True, n_jobs=-1,\n       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n       scoring=None, verbose=1)"
282 |       ]
283 |      },
284 |      "execution_count": 14,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "from sklearn.model_selection._search import GridSearchCV\n",
291 |     "\n",
292 |     "param_grid = [\n",
293 |     "    {\n",
294 |     "        'weights':['uniform'],\n",
295 |     "        'n_neighbors':[i for i in range(1,11)]\n",
296 |     "    },\n",
297 |     "    {\n",
298 |     "        'weights':['distance'],\n",
299 |     "        'n_neighbors':[i for i in range(1,11)],\n",
300 |     "        'p':[i for i in range(1,6)]\n",
301 |     "    }\n",
302 |     "]\n",
303 |     "\n",
304 |     "knn_reg2 = KNeighborsRegressor()\n",
305 |     "grid_search = GridSearchCV(knn_reg2, param_grid, n_jobs=-1, verbose=1)\n",
306 |     "grid_search.fit(X_train_nor, y_train)"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {},
312 |    "source": [
313 |     "注意：下面的 best_score_ 是在训练集上做交叉验证得到的平均得分，而前面回归器的 score 是在测试集上计算的，两者的评估方式不同，所以不能直接比较。"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 15,
319 |    "metadata": {},
320 |    "outputs": [
321 |     {
322 |      "data": {
323 |       "text/plain": [
324 |        "0.79480244433269864"
325 |       ]
326 |      },
327 |      "execution_count": 15,
328 |      "metadata": {},
329 |      "output_type": "execute_result"
330 |     }
331 |    ],
332 |    "source": [
333 |     "grid_search.best_score_"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 16,
339 |    "metadata": {},
340 |    "outputs": [
341 |     {
342 |      "data": {
343 |       "text/plain": [
344 |        "{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}"
345 |       ]
346 |      },
347 |      "execution_count": 16,
348 |      "metadata": {},
349 |      "output_type": "execute_result"
350 |     }
351 |    ],
352 |    "source": [
353 |     "grid_search.best_params_"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 17,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "knn_reg_grid_search = grid_search.best_estimator_"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 18,
368 |    "metadata": {},
369 |    "outputs": [
370 |     {
371 |      "data": {
372 |       "text/plain": [
373 |        "0.85652703298427613"
374 |       ]
375 |      },
376 |      "execution_count": 18,
377 |      "metadata": {},
378 |      "output_type": "execute_result"
379 |     }
380 |    ],
381 |    "source": [
382 |     "knn_reg_grid_search.score(X_test_nor, y_test)"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": []
391 |   }
392 |  ],
393 |  "metadata": {
394 |   "kernelspec": {
395 |    "display_name": "Python 2",
396 |    "language": "python",
397 |    "name": "python2"
398 |   },
399 |   "language_info": {
400 |    "codemirror_mode": {
401 |     "name": "ipython",
402 |     "version": 2
403 |    },
404 |    "file_extension": ".py",
405 |    "mimetype": "text/x-python",
406 |    "name": "python",
407 |    "nbconvert_exporter": "python",
408 |    "pygments_lexer": "ipython2",
409 |    "version": "2.7.6"
410 |   }
411 |  },
412 |  "nbformat": 4,
413 |  "nbformat_minor": 0
414 | }
415 | 


--------------------------------------------------------------------------------
/c1_knn/03_Train_Test_Split.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 测试我们的算法"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt\n",
 20 |     "from sklearn import datasets"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "iris = datasets.load_iris()"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "# 特征矩阵\n",
 39 |     "X = iris.data\n",
 40 |     "\n",
 41 |     "# 结果标签的向量\n",
 42 |     "y = iris.target"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 4,
 48 |    "metadata": {},
 49 |    "outputs": [
 50 |     {
 51 |      "data": {
 52 |       "text/plain": [
 53 |        "(150, 4)"
 54 |       ]
 55 |      },
 56 |      "execution_count": 4,
 57 |      "metadata": {},
 58 |      "output_type": "execute_result"
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "np.shape(X)"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 5,
 68 |    "metadata": {},
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "(150,)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 5,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "np.shape(y)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "## train_test_split"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 6,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
101 |       ]
102 |      },
103 |      "execution_count": 6,
104 |      "metadata": {},
105 |      "output_type": "execute_result"
106 |     }
107 |    ],
108 |    "source": [
109 |     "y"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 7,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "# 生成 0 到 n-1 的整数序列（这里 n=150），再把这个序列随机打乱\n",
119 |     "shuffle_indexes = np.random.permutation(np.shape(y)[0])"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 8,
125 |    "metadata": {},
126 |    "outputs": [
127 |     {
128 |      "data": {
129 |       "text/plain": [
130 |        "array([ 50,  78,  69, 131,  10,  34,   6,   9,  36,  71,  82, 141, 137,\n        79,  59,  93,  22,  91, 122,  75,  88,   3,  89,  86,  12,  61,\n        14, 132, 119, 121, 129,  33, 103,  13,  37,  47, 139, 125,  73,\n        53,   2,  42, 114,  29, 138, 112,  52, 101,  97,  19, 123, 128,\n       144,  81,  11, 109,  26, 116,  44,  80,  64,  83, 124,  74,  39,\n        31,  58, 145, 102, 120,  76,  63,  65, 135,   8,  55,  77,  60,\n        35, 149,  57,  43,   0, 110, 127,  62, 142,  96, 106, 126,  51,\n        40, 104, 118,  68,  27,  87,  45,  15, 113, 115,  49,  16, 136,\n       117,  66,   5,  21,  67, 140,  54, 100,  99,  30,  18,  72, 148,\n        92,  24,  23,  85,  32,  70, 107,  56, 108, 105,  17, 134,  94,\n        95,  38,  48,   7,  46,  20, 146, 130,  28,  84,  90,   1, 111,\n        25, 133, 143,  41, 147,   4,  98])"
131 |       ]
132 |      },
133 |      "execution_count": 8,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "shuffle_indexes"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 9,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "test_ratio = 0.2\n",
149 |     "test_size = int(np.shape(X)[0] * test_ratio)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 10,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "text/plain": [
160 |        "30"
161 |       ]
162 |      },
163 |      "execution_count": 10,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "test_size"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 11,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "test_indexes = shuffle_indexes[:test_size]\n",
179 |     "train_indexes = shuffle_indexes[test_size:]"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 12,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "X_train = X[train_indexes]\n",
189 |     "y_train = y[train_indexes]\n",
190 |     "\n",
191 |     "X_test = X[test_indexes]\n",
192 |     "y_test = y[test_indexes]\n"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "## 使用我们的算法"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 13,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "from c1_knn.model_selection import train_test_split"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 14,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "X_train, X_test, y_train, y_test = train_test_split(X,y,seed=1)"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "### 先试试使用之前自己写的KNNClassifier"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 15,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "from  c1_knn.kNN import KNNClassifier"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 16,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "my_knn_clf = KNNClassifier(k=3)"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 17,
248 |    "metadata": {},
249 |    "outputs": [
250 |     {
251 |      "data": {
252 |       "text/plain": [
253 |        "kNN(k=3)"
254 |       ]
255 |      },
256 |      "execution_count": 17,
257 |      "metadata": {},
258 |      "output_type": "execute_result"
259 |     }
260 |    ],
261 |    "source": [
262 |     "my_knn_clf.fit(X_train, y_train)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 18,
268 |    "metadata": {},
269 |    "outputs": [
270 |     {
271 |      "data": {
272 |       "text/plain": [
273 |        "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n       0, 2, 1, 0, 0, 1, 2])"
274 |       ]
275 |      },
276 |      "execution_count": 18,
277 |      "metadata": {},
278 |      "output_type": "execute_result"
279 |     }
280 |    ],
281 |    "source": [
282 |     "y_predict = my_knn_clf.predict(X_test)\n",
283 |     "y_predict"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": 19,
289 |    "metadata": {},
290 |    "outputs": [
291 |     {
292 |      "data": {
293 |       "text/plain": [
294 |        "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n       0, 2, 1, 0, 0, 1, 2])"
295 |       ]
296 |      },
297 |      "execution_count": 19,
298 |      "metadata": {},
299 |      "output_type": "execute_result"
300 |     }
301 |    ],
302 |    "source": [
303 |     "y_test"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 21,
309 |    "metadata": {},
310 |    "outputs": [
311 |     {
312 |      "data": {
313 |       "text/plain": [
314 |        "1.0"
315 |       ]
316 |      },
317 |      "execution_count": 21,
318 |      "metadata": {},
319 |      "output_type": "execute_result"
320 |     }
321 |    ],
322 |    "source": [
323 |     "# 准确率（accuracy）：预测正确的样本数 / 测试样本总数\n",
324 |     "np.sum(y_predict==y_test) / len(y_test)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "### sklearn中的train_test_split"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": 22,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "from sklearn.model_selection._split import train_test_split"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 25,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 26,
355 |    "metadata": {},
356 |    "outputs": [
357 |     {
358 |      "data": {
359 |       "text/plain": [
360 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n           weights='uniform')"
361 |       ]
362 |      },
363 |      "execution_count": 26,
364 |      "metadata": {},
365 |      "output_type": "execute_result"
366 |     }
367 |    ],
368 |    "source": [
369 |     "from sklearn.neighbors.classification import KNeighborsClassifier\n",
370 |     "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
371 |     "knn_clf.fit(X_train, y_train)\n"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": 27,
377 |    "metadata": {},
378 |    "outputs": [
379 |     {
380 |      "data": {
381 |       "text/plain": [
382 |        "array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2,\n       0, 1, 1, 0, 1, 2, 2])"
383 |       ]
384 |      },
385 |      "execution_count": 27,
386 |      "metadata": {},
387 |      "output_type": "execute_result"
388 |     }
389 |    ],
390 |    "source": [
391 |     "y_predict = knn_clf.predict(X_test)\n",
392 |     "y_predict"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 30,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "data": {
402 |       "text/plain": [
403 |        "1.0"
404 |       ]
405 |      },
406 |      "execution_count": 30,
407 |      "metadata": {},
408 |      "output_type": "execute_result"
409 |     }
410 |    ],
411 |    "source": [
412 |     "# 准确率\n",
413 |     "np.sum(y_predict==y_test)/len(y_test)"
414 |    ]
415 |   }
416 |  ],
417 |  "metadata": {
418 |   "kernelspec": {
419 |    "display_name": "Python 2",
420 |    "language": "python",
421 |    "name": "python2"
422 |   },
423 |   "language_info": {
424 |    "codemirror_mode": {
425 |     "name": "ipython",
426 |     "version": 2
427 |    },
428 |    "file_extension": ".py",
429 |    "mimetype": "text/x-python",
430 |    "name": "python",
431 |    "nbconvert_exporter": "python",
432 |    "pygments_lexer": "ipython2",
433 |    "version": "2.7.6"
434 |   }
435 |  },
436 |  "nbformat": 4,
437 |  "nbformat_minor": 0
438 | }
439 | 


--------------------------------------------------------------------------------
/c6_logistic_regression/01_Sigmoid.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {
 6 |     "collapsed": true
 7 |    },
 8 |    "source": [
 9 |     "# Sigmoid函数"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": 1,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "import numpy as np\n",
19 |     "import matplotlib.pyplot as plt"
20 |    ]
21 |   },
22 |   {
23 |    "cell_type": "markdown",
24 |    "metadata": {},
25 |    "source": [
26 |     "## 绘制Sigmoid函数"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": 4,
32 |    "metadata": {},
33 |    "outputs": [],
34 |    "source": [
35 |     "def sigmoid(t):\n",
36 |     "    return 1/(1 + np.exp(-t))"
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "code",
41 |    "execution_count": 6,
42 |    "metadata": {},
43 |    "outputs": [
44 |     {
45 |      "data": {
46 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3Xt8VPWd//HXJxcSIIQ74Q4qqOCdRBC1FcRSsK24u9TitrbWunTdtZdff/11td2f29rd36/d7uW3Pmrbta2rdW1RW22ppYVKQ7ValasoRCRCIOGSAEEghFxm5vP7YyY6prlMkpmcmcn7+XjMI+fyPWfeOXPymZPvnDnH3B0REckuOUEHEBGR5FNxFxHJQiruIiJZSMVdRCQLqbiLiGQhFXcRkSyk4i4ikoVU3EVEspCKu4hIFsoL6onHjBnj06dP79Wyp0+fZujQockNlATK1TPK1XPpmk25eqYvuTZv3nzU3cd229DdA3mUlpZ6b5WXl/d62VRSrp5Rrp5L12zK1TN9yQVs8gRqrLplRESykIq7iEgWUnEXEclCKu4iIllIxV1EJAt1W9zN7EEzqzOz1zqZb2Z2n5lVmtl2M5uT/JgiItITiRy5PwQs6WL+UmBm7LES+G7fY4mISF90+yUmd3/WzKZ30WQZ8KPY+ZcvmtkIM5vg7oeSlFFEslQk4jSHIjS1hmkKhQmFndZwhHDEaQ07oUiEUMQJtQ2HPTYv2iYUcSLuuIPjRCJEx4l+h+f16lYOvrT/XdPco20i/s64Ex1vW1cbjxt51/QOp/1p23fdxDSu8YgzYRb0cdt1x+LDd9ooWtyfdvcLO5j3NPANd/9DbHw98HfuvqmDtiuJHt1TUlJSumrVql6FbmhooKioqFfLppJy9Yxy9Vw6ZXN3TrfCW83O4RONhHIKaQw5ja1OY4jYz3eGW8JOawRawtAScVrC0BoJ+rfoXxb7edM5ztKZvXsdFy5cuNndy7pr16+XH3D3B4AHAMrKynzBggW9Ws+GDRvo7bKppFw9o1w915/ZwhHnwPEz7Ks/zf76RvYfa2R/fSMH3zpD3almjjY00xpuOzg0oPntZQfl5lA8OJ/iwjyGDc5n0sg8BufnUpifS2F+DoX5uQzOz6UgNt42Lz83h7wcIy/Xoj9zcsjNNfJzcsjNMfJzLfYzh7xcI9eMnBzDgBwzcswwI/YwXnrxj1x15ZUY0XGztnZgGJbDnyxLbLxN3CBxg1hsxrun/en8jvTH65iM4n4AmBI3Pjk2TUQyRHMozGsHTvBqzQleP3yKikMn2VV7iqa4Q+tBuTlMHjWYSSMGM2PcMMYVFzC2qICxwwo4+GYFi66eR/HgPIoL8ynMzw3wt3nHqMIcSooLg44RiGQU99XAnWa2CpgHnFB/u0h6awlF2FRVz3OVR9lUVc8rNSdoCUUL+cgh+cyaUMxfzp3GeeOLmDZ6KFNHDWF8cSE5OR0fjW44/gYzxqVHd5FEdVvczewnwAJgjJnVAP8A5AO4+/eANcD1QCXQCHwyVWFFpPdONrWybkct6ytqeW73URqaQ+TlGBdOGs4n5k+jdNooLp0ygpLigi67FCQzJHK2zM3dzHfgb5OWSESSJhxxNuyq48mtB3hmZy3NoQjjiwv50CUTWHjeOK6aMYahBYFd+VtSSK+qSBY62dTK4xurefiPVVTXn2HU0EGsuHwKN142iUunjNCR+QCg4i6SRU42tfKD5/by4B/20tAc4vLpI7l76SzeN7uE/FxdbWQgUXEXyQLNoTAPPV/Fd3//Jm81trL0wvH8zYIZXDR5eNDRJCAq7iIZ7oXKo/z9z19jz9HTXHPuWL64+DwVdVFxF8lUDc0hvrZ6B09srmHqqCE8fNtcrjm3+1trysCg4i6SgbbsP87nV22j5ngjf7PgHD67aGbafHFI0oOKu0gGcXce+WMVX/vlTkqKC3ns0/O
5fPqooGNJGlJxF8kQLaEID+9sYUP1Dq49fxz//pFLGT44P+hYkqZU3EUyQGNLiE8/spnnqkPcseAcvrj4PHI7uRSACKi4i6S9E2daue2hjWzdf5xPXTiIv1tyftCRJAOouIuksRONrdz8/RfZXXeK73x0DoVHdwUdSTKEvrImkqYaW0J88qGXqaxr4PsfL2PJhROCjiQZRMVdJA01h8J8+pHNbKt+i/tuvpQF540LOpJkGHXLiKQZd+fLT77Gc7uP8s/LL9YRu/SKjtxF0swP/7CXn22p4XOLZnJT2ZTuFxDpgIq7SBr5/RtH+D9rKlh64Xg+t2hm0HEkg6m4i6SJg2+d4bM/2cq5JcP4lw9f0ukt7UQSoeIukgbCEefzj20jFI7wvY+V6u5I0mfag0TSwP3llby8t55/u+kSpo8ZGnQcyQI6chcJ2Nb9x/mP9bu58dKJ/PmcyUHHkSyh4i4SoJZQhL/72XZKhhXw9RsvDDqOZBF1y4gE6DsbKnmjtoEHby1jWKGu8CjJoyN3kYDsrj3F/eWV3HDJRK49vyToOJJlVNxFAuDufPmpVykqyOMfPjQ76DiShVTcRQLw9PZDbKw6zpeWnM/oooKg40gWUnEX6WdNrWG+8evXmT2hWJcXkJRRcRfpZw88u4cDb53hHz40W3dTkpRRcRfpR7Unm/juhje5/qLxzDt7dNBxJIupuIv0o2//rpLWcIS7lswKOopkORV3kX5SXd/Iqo37+cjlU5g6ekjQcSTLJVTczWyJme0ys0ozu6uD+VPNrNzMtprZdjO7PvlRRTLbfet3Y2Z85lpdyldSr9vibma5wP3AUmA2cLOZtT8x9++Bx939MmAF8J1kBxXJZHuONPCzLTXccsU0xg8vDDqODACJHLnPBSrdfY+7twCrgGXt2jhQHBseDhxMXkSRzHff+t0U5OVyx4Jzgo4iA0Qi15aZBFTHjdcA89q1+Sqwzsw+AwwFrktKOpEsUF3fyC+3H+K2q6YzRl9Ykn5i7t51A7PlwBJ3vz02fgswz93vjGvzhdi6/tXM5gM/BC5090i7da0EVgKUlJSUrlq1qlehGxoaKCoq6tWyqaRcPTNQcj2ys5kN1SH+5ZrBjCzs2zkMA2WbJUs25lq4cOFmdy/rtqG7d/kA5gNr48bvBu5u12YHMCVufA8wrqv1lpaWem+Vl5f3etlUUq6eGQi5jpxq8nO/ssa/9MQrSVnfQNhmyZSNuYBN3k3ddveE+tw3AjPN7CwzG0T0A9PV7drsBxYBmNksoBA4ksC6RbLaQ89X0RKOsPKas4OOIgNMt8Xd3UPAncBaoILoWTE7zOxeM7sh1ux/An9lZq8APwFujb3DiAxYp5tD/OiPVSy9cDznjE2/rgHJbgndrMPd1wBr2k27J254J3BVcqOJZLYnt9RwsinE7e/RUbv0P31DVSQFIhHnoRequGTKCOZMHRl0HBmAVNxFUuAPlUd588hpbr1yWtBRZIBScRdJgYdeqGJMUQHXXzQh6CgyQKm4iyRZ1dHTlO+q46PzplKQlxt0HBmgVNxFkuy/X9xHrhkfnTc16CgygKm4iyRRcyjMk1sPsPiCEsYV6wJhEhwVd5Ek+u3OWupPt/CRy3XULsFScRdJosc2VjNpxGCunjEm6CgywKm4iyRJdX0jz+0+yk1lU3TjawmcirtIkjy+qRoz+HDZ5KCjiKi4iyRDKBzhiU01XHPuWCaOGBx0HBEVd5Fk+P0bRzh8sokVl08JOooIoOIukhQ/3VzD6KGDWDSrJOgoIoCKu0ifnTjTyvqKOj50yUTyc/UnJelBe6JIH/361UO0hCP8+ZxJQUcReZuKu0gfPbn1AGePHcpFk4YHHUXkbSruIn1Qc7yRl/fW82eXTsJM57ZL+lBxF+mDX2w7CMCyS9UlI+lFxV2kl9ydp7YeoGzaSKaOHhJ0HJF3UXEX6aUdB09SWdfAn+mDVElDKu4ivfTzrQfIzzU+oLstSRpScRfphUj
EWfPqId47cywjhgwKOo7In1BxF+mFrdVvcfBEEx+8REftkp5U3EV64VfbDzEoL4frdLkBSVMq7iI9FN8lM6wwP+g4Ih1ScRfpoa3Vxzl8sokPXqwuGUlfKu4iPfR0rEtm0axxQUcR6ZSKu0gPtHXJLDhXXTKS3lTcRXpgy/7j1J5s5gPqkpE0p+Iu0gPvdMnoLBlJbyruIglq65JZeN5Yigrygo4j0qWEiruZLTGzXWZWaWZ3ddLmJjPbaWY7zOzHyY0pErzN+49Td6qZ63W5AckA3R5+mFkucD/wPqAG2Ghmq919Z1ybmcDdwFXuftzMdBqBZJ11Ow4zKDeHa8/X7i3pL5Ej97lApbvvcfcWYBWwrF2bvwLud/fjAO5el9yYIsFyd9btrOXKGaN1loxkBHP3rhuYLQeWuPvtsfFbgHnufmdcm58DbwBXAbnAV939Nx2sayWwEqCkpKR01apVvQrd0NBAUVFRr5ZNJeXqmUzKVXMqwt8/f4ZbLxjEginBFfdM2mbpIBtzLVy4cLO7l3Xb0N27fADLgR/Ejd8CfLtdm6eBp4B84CygGhjR1XpLS0u9t8rLy3u9bCopV89kUq77nnnDp9/1tNeePNP/geJk0jZLB9mYC9jk3dRtd0+oW+YAMCVufHJsWrwaYLW7t7r7XqJH8TMTWLdIRli3s5bLpoxg3LDCoKOIJCSR4r4RmGlmZ5nZIGAFsLpdm58DCwDMbAxwLrAniTlFAnPwrTO8euAEiy8YH3QUkYR1W9zdPQTcCawFKoDH3X2Hmd1rZjfEmq0FjpnZTqAc+F/ufixVoUX602931gKweLa+uCSZI6FvYrj7GmBNu2n3xA078IXYQySrrNt5mBnjijh7bPp9MCfSGX1DVaQLJxpbeXFPvY7aJeOouIt04Xe7aglHXP3tknFU3EW6sG5HLSXFBVw8aXjQUUR6RMVdpBNNrWF+/8YR3je7hJwcCzqOSI+ouIt04vnKozS2hFk8W10yknlU3EU6sW5HLcMK8rji7NFBRxHpMRV3kQ6EI84zFbUsPH8cg/L0ZyKZR3utSAe27D/OsdMtLL5Ap0BKZlJxF+lA27Xbrzl3bNBRRHpFxV2kHde12yULqLiLtHOgwdl3rFFnyUhGU3EXaWdLXQgzuG62bqcnmUvFXaSdLbVhXbtdMp6Ku0icg2+doepkRNeSkYyn4i4SR9dul2yh4i4SZ93Ow0wcarp2u2Q8FXeRmLZrt88pSegeNiJpTcVdJKbt2u1zxuUGHUWkz1TcRWLart0+fbj+LCTzaS8WIXrt9g27jnDdrBJyTNdul8yn4i4C/GH3Uc60hnm/ToGULKHiLkL0LJlhhbp2u2QPFXcZ8ELhCM9U1HGtrt0uWUR7sgx4m/cdp/50iy4UJllFxV0GvHU7axmUl8M15+na7ZI9VNxlQHN31u44zNUzxlBUoC8vSfZQcZcBreLQKWqOn9G1ZCTrqLjLgLZu52HMYNEsFXfJLiruMqCt3VFL2bSRjB1WEHQUkaRScZcBq7q+kYpDJ3WWjGSlhIq7mS0xs11mVmlmd3XR7i/MzM2sLHkRRVJjXdu12y9Ql4xkn26Lu5nlAvcDS4HZwM1mNruDdsOAzwEvJTukSCqs3XGY88cPY9rooUFHEUm6RI7c5wKV7r7H3VuAVcCyDtp9Hfgm0JTEfCIpcayhmU1V9TpLRrJWIsV9ElAdN14Tm/Y2M5sDTHH3XyUxm0jKrK+oI+LoXqmStczdu25gthxY4u63x8ZvAea5+52x8Rzgd8Ct7l5lZhuAL7r7pg7WtRJYCVBSUlK6atWqXoVuaGigqCj9boOmXD0TZK5/39xEzakI/3LNYKzdJX7TdXtB+mZTrp7pS66FCxdudvfuP9d09y4fwHxgbdz43cDdcePDgaNAVezRBBwEyrpab2lpqfdWeXl5r5dNJeXqmaBynTjT4jO/vMbv/eWODuen6/ZyT99sytUzfckFbPJu6ra7J9QtsxGYaWZ
nmdkgYAWwOu7N4YS7j3H36e4+HXgRuME7OHIXSQfrK2ppCUe4/qIJQUcRSZlui7u7h4A7gbVABfC4u+8ws3vN7IZUBxRJtl9tP8z44kIumzIi6CgiKZPQlZLcfQ2wpt20ezppu6DvsURS41RTK8/uPsJH500lJ0e305PspW+oyoCyvqKOllCED6hLRrKcirsMKGtePcT44kLmTB0ZdBSRlFJxlwGjoTnEhjeOsOTC8eqSkayn4i4DxvqKWlpCOktGBgYVdxkw1rx6iHHDCiibpi4ZyX4q7jIgnG4OsWHXEZaqS0YGCBV3GRDWv15Hs7pkZABRcZcB4RdbDzC+uJDLp48KOopIv1Bxl6xXf7qF379xhBsunaguGRkwVNwl6/3q1UOEIs6ySycGHUWk36i4S9b7xdYDzBxXxOwJxUFHEek3Ku6S1arrG9m07zg3XjbpT67bLpLNVNwlq61+5SAAN1yiLhkZWFTcJWu5Oz/feoCyaSOZMmpI0HFE+pWKu2StikOn2F3XoA9SZUBScZes9bMtNeTnGh+4WMVdBh4Vd8lKLaEIT209wHWzShg1dFDQcUT6nYq7ZKX1FbXUn27hpsunBB1FJBAq7pKVHttUzfjiQt47c2zQUUQCoeIuWefQiTM8+8YRlpdOJleXG5ABSsVdss6TWw4Qcfhw2eSgo4gERsVdskok4jy+qZorzh7FtNFDg44jEhgVd8kqL+2tZ9+xRj6iD1JlgFNxl6zy6Ev7KC7MY8kFuimHDGwq7pI16k428ZvXDnNT2RQGD8oNOo5IoFTcJWv8+OX9hCLOx66YFnQUkcCpuEtWaA1H+PFL+1lw3limj9EHqSIq7pIV1u44TN2pZj4+X0ftIqDiLlni4ReqmDpqCNecOy7oKCJpQcVdMt626rfYWHWcj8+fpm+kisSouEvGe+DZNxlWmMeKuVODjiKSNhIq7ma2xMx2mVmlmd3VwfwvmNlOM9tuZuvNTB2f0i+qjp7mN68d5mNXTKOoIC/oOCJpo9vibma5wP3AUmA2cLOZzW7XbCtQ5u4XAz8F/jnZQUU68oM/7CEvJ4dPXjk96CgiaSWRI/e5QKW773H3FmAVsCy+gbuXu3tjbPRFQFdskpQ71tDME5tquPGyiYwrLgw6jkhaMXfvuoHZcmCJu98eG78FmOfud3bS/tvAYXf/xw7mrQRWApSUlJSuWrWqV6EbGhooKirq1bKppFw909dcT+xqYc3eVv7p6sFMLErex0fpur0gfbMpV8/0JdfChQs3u3tZtw3dvcsHsBz4Qdz4LcC3O2n7MaJH7gXdrbe0tNR7q7y8vNfLppJy9Uxfch091eSz/vev/W8f3Zy8QDHpur3c0zebcvVMX3IBm7yb+uruJPIJ1AEg/hJ7k2PT3sXMrgO+Alzj7s0JrFek1x54bg9nWsN8btHMoKOIpKVE/pfdCMw0s7PMbBCwAlgd38DMLgP+E7jB3euSH1PkHUcbmvnRC/v40MUTmVkyLOg4Immp2+Lu7iHgTmAtUAE87u47zOxeM7sh1uxbQBHwhJltM7PVnaxOpM8eeHYPzaEwn9VRu0inEjox2N3XAGvaTbsnbvi6JOcS6VB1fSMPvVDFjZdOYsa49PugTCRd6BuqklG+tXYXBnzx/ecFHUUkram4S8bYVv0Wq185yO3vOYuJIwYHHUckram4S0Zwd/7x6Z2MKRrEHQtmBB1HJO2puEtGWP3KQTbtO87/eN+5uoaMSAJU3CXtnWhs5etP7+SSycNZcbmu/CiSCB0CSdr7xm9e53hjKw/fNlfXaxdJkI7cJa1tqqrnJy/v57arpnPBxOFBxxHJGCrukrYaW0J86afbmTRiMJ+/7tyg44hkFHXLSNr6p19VsPfYaR69fR5D9SGqSI/oyF3S0vqKWh59aT8r33M2V54zJug4IhlHxV3SzuETTXzpp9uZNaGYLyxWd4xIb6i4S1ppDoW549HNnGkNc9+KSynIyw06kkhGUkempJW
v/XInW/e/xXc/OkeX8xXpAx25S9r47xf38eOX9nPHgnNYetGEoOOIZDQVd0kLa3cc5p5fvMa154/ji4t1xUeRvlJxl8BtrKrnsz/ZysWTR/Dtv7xM30IVSQIVdwnUlv3Hue2hjUwaOZgHb72cIYP0MZBIMugvSQKzqz7Mfb97iTHDCnjkU/MYNXRQ0JFEsoaO3CUQG3bV8a+bmygZXsjjn57PJN18QySpVNyl3z38QhW3PbSRkiE5PLZyPiXFhUFHEsk66paRftMcCvOPT1fwyIv7uG7WOJZPamDssIKgY4lkJR25S7/Ye/Q0f/HdF3jkxX2sfO/Z/OctZRTm6awYkVTRkbukVCTi/Pjl/fzfNRXk5+Xw/Y+X8b7ZJUHHEsl6Ku6SMrtrT/Hlp15lY9Vxrpoxmm8tv4SJ+uBUpF+ouEvS1Z5s4v898waPbaxmWGE+31p+MctLJ2OmbhiR/qLiLklTe7KJ/3q+iodfqCIUifDx+dP5zLUzGF2kD01F+puKu/TZjoMn+K/nq/jFtgOEI84HL57I/1x8LtNGDw06msiApeIuvXK0oZnV2w7y08017Dx0ksH5ufzl3KncdvVZKuoiaUDFXRLi7uw5eppndtbyTEUtm/cdJ+Jw0aThfO2GC1h26URGDNHlA0TShYq7dCgccfYcaWBj1XFe3nuMl/fWc/BEEwAXTCzmM9fO5PqLJnDeeN1QQyQdJVTczWwJ8B9ALvADd/9Gu/kFwI+AUuAY8BF3r0puVEkFd6fuVDNVR0/z5pHT7Dh4gp2HTvL6oVOcaQ0DMHZYAXPPGsUdZ49m0fnjdDqjSAbotribWS5wP/A+oAbYaGar3X1nXLNPAcfdfYaZrQC+CXwkFYElcaFwhBNnWqk+FWHDrjrqTjVz5FQztSebOHSiif3HGtlXf5qm1sjbywwrzGP2hGJWzJ3CBROHUzZtJNNGD9FpjCIZJpEj97lApbvvATCzVcAyIL64LwO+Ghv+KfBtMzN39yRmzViRiBOKOOGIE4pEYj+j463hd4+HwrHpkQitoQhNoQhnWsI0h8KcaQnT1BrmTGuEptbw24+G5jAnzrRysqmVk2dij6YQDc2hd0I8v/HtweGD8ykpLmDqqKG8Z+YYpo0ZyvTRQ5g+eiiTRw5WIRfJAokU90lAddx4DTCvszbuHjKzE8Bo4GgyQsZ7fGM1//5cI4M3b4hOcPDo89L2TuIOjkd/xr29tLVpm/9O27Z27ad53Ly453DinuudNuFwGHvm17y9JodQJEIkRW9xBXk5FObnMnRQLsWD8xk+OJ8po4YwfHA+xYXR8eGD8zhS/SbXzp/DuGGFjB1WQGF+bmoCiUja6NcPVM1sJbASoKSkhA0bNvR4HQfqQowfHCE/t+md9QJtB5sWN9Ew4o9Bzd6ZH39wam3Lxc/vaLzd89g7T4QBra3OoEG573rO3Jxccg1yDPIMcsyi4zm8PT3XINesw2mDcok+ctqGoz/zc6LrekcEaI494rTCmKImTu3dzingza42bj9raGjo1T6QaumaC9I3m3L1TL/kcvcuH8B8YG3c+N3A3e3arAXmx4bziB6xW1frLS0t9d4qLy/v9bKppFw9o1w9l67ZlKtn+pIL2OTd1G13T+iSvxuBmWZ2lpkNAlYAq9u1WQ18Ija8HPhdLISIiASg224Zj/ah30n06DwXeNDdd5jZvUTfQVYDPwQeMbNKoJ7oG4CIiAQkoT53d18DrGk37Z644Sbgw8mNJiIivaU7MYmIZCEVdxGRLKTiLiKShVTcRUSykIq7iEgWsqBORzezI8C+Xi4+hhRc2iAJlKtnlKvn0jWbcvVMX3JNc/ex3TUKrLj3hZltcveyoHO0p1w9o1w9l67ZlKtn+iOXumVERLKQiruISBbK1OL+QNABOqFcPaNcPZeu2ZSrZ1KeKyP73EVEpGuZeuQuIiJdSNvibmYfNrMdZhYxs7J28+42s0oz22Vm7+9k+bPM7KVYu8dilytOdsb
HzGxb7FFlZts6aVdlZq/G2m1Kdo4Onu+rZnYgLtv1nbRbEtuGlWZ2Vz/k+paZvW5m283sKTMb0Um7ftle3f3+ZlYQe40rY/vS9FRliXvOKWZWbmY7Y/v/5zpos8DMTsS9vvd0tK4UZOvydbGo+2Lba7uZzemHTOfFbYdtZnbSzD7frk2/bS8ze9DM6szstbhpo8zst2a2O/ZzZCfLfiLWZreZfaKjNj2SyEXfg3gAs4DzgA1AWdz02cArQAFwFtGbC+V2sPzjwIrY8PeAO1Kc91+BezqZVwWM6cdt91Xgi920yY1tu7OBQbFtOjvFuRYDebHhbwLfDGp7JfL7A38DfC82vAJ4rB9euwnAnNjwMOCNDnItAJ7ur/0p0dcFuB74NdEbk10BvNTP+XKBw0TPAw9kewHvBeYAr8VN+2fgrtjwXR3t98AoYE/s58jY8Mi+ZEnbI3d3r3D3XR3MWgascvdmd98LVBK9iffbLHqH52uJ3qwb4GHgxlRljT3fTcBPUvUcKfD2jc/dvQVou/F5yrj7Ondvu2v3i8DkVD5fNxL5/ZcR3Xcgui8tshTfPdzdD7n7ltjwKaCC6D2KM8Ey4Ece9SIwwswm9OPzLwLedPfefjmyz9z9WaL3tIgXvx91VoveD/zW3evd/TjwW2BJX7KkbXHvQkc37G6/848G3oorJB21Sab3ALXuvruT+Q6sM7PNsfvI9oc7Y/8aP9jJv4GJbMdUuo3oUV5H+mN7JfL7v+vG70Dbjd/7Rawb6DLgpQ5mzzezV8zs12Z2QT9F6u51CXqfWkHnB1hBbK82Je5+KDZ8GCjpoE3St12/3iC7PTN7BhjfwayvuPsv+jtPRxLMeDNdH7Vf7e4HzGwc8Fszez32Dp+SXMB3ga8T/WP8OtEuo9v68nzJyNW2vczsK0AIeLST1SR9e2UaMysCfgZ83t1Ptpu9hWjXQ0Ps85SfAzP7IVbavi6xz9RuIHqP5/aC2l5/wt3dzPrlFMVAi7u7X9eLxQ4AU+LGJ8emxTtG9F/CvNgRV0dtkpLRzPKAPwdKu1jHgdjPOjN7imiXQJ/+KBLddmb2feDpDmYlsh2TnsvMbgU+CCzyWGdjB+tI+vbqQCK/f1ubmtjrPJzovpVSZpZPtLA/6u5+OtfeAAAB6UlEQVRPtp8fX+zdfY2ZfcfMxrh7Sq+hksDrkpJ9KkFLgS3uXtt+RlDbK06tmU1w90Oxbqq6DtocIPrZQJvJRD9v7LVM7JZZDayInclwFtF34JfjG8SKRjnRm3VD9ObdqfpP4DrgdXev6WimmQ01s2Ftw0Q/VHyto7bJ0q6f8886eb5Ebnye7FxLgC8BN7h7Yydt+mt7peWN32N9+j8EKtz93zppM76t79/M5hL9O07pm06Cr8tq4OOxs2auAE7EdUekWqf/PQexvdqJ3486q0VrgcVmNjLWjbo4Nq33+uMT5N48iBalGqAZqAXWxs37CtEzHXYBS+OmrwEmxobPJlr0K4EngIIU5XwI+Ot20yYCa+JyvBJ77CDaPZHqbfcI8CqwPbZjTWifKzZ+PdGzMd7sp1yVRPsVt8Ue32ufqz+3V0e/P3Av0TcfgMLYvlMZ25fO7odtdDXR7rTtcdvpeuCv2/Yz4M7YtnmF6AfTV/ZDrg5fl3a5DLg/tj1fJe4stxRnG0q0WA+PmxbI9iL6BnMIaI3Vr08R/ZxmPbAbeAYYFWtbBvwgbtnbYvtaJfDJvmbRN1RFRLJQJnbLiIhIN1TcRUSykIq7iEgWUnEXEclCKu4iIllIxV1EJAupuIuIZCEVdxGRLPT/AWDnkQWjIdEtAAAAAElFTkSuQmCC\n",
47 |       "text/plain": [
48 |        "<matplotlib.figure.Figure at 0x1064b7978>"
49 |       ]
50 |      },
51 |      "metadata": {},
52 |      "output_type": "display_data"
53 |     }
54 |    ],
55 |    "source": [
56 |     "x = np.linspace(-10,10,500)\n",
57 |     "y = sigmoid(x)\n",
58 |     "plt.plot(x,y)\n",
59 |     "plt.grid(True)\n",
60 |     "plt.show()"
61 |    ]
62 |   },
63 |   {
64 |    "cell_type": "code",
65 |    "execution_count": null,
66 |    "metadata": {},
67 |    "outputs": [],
68 |    "source": []
69 |   }
70 |  ],
71 |  "metadata": {
72 |   "kernelspec": {
73 |    "display_name": "Python 2",
74 |    "language": "python",
75 |    "name": "python2"
76 |   },
77 |   "language_info": {
78 |    "codemirror_mode": {
79 |     "name": "ipython",
80 |     "version": 2
81 |    },
82 |    "file_extension": ".py",
83 |    "mimetype": "text/x-python",
84 |    "name": "python",
85 |    "nbconvert_exporter": "python",
86 |    "pygments_lexer": "ipython2",
87 |    "version": "2.7.6"
88 |   }
89 |  },
90 |  "nbformat": 4,
91 |  "nbformat_minor": 0
92 | }
93 | 


--------------------------------------------------------------------------------
/c6_logistic_regression/04_implement_logistic_regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 实现逻辑回归"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib.pyplot as plt\n",
 20 |     "from sklearn import datasets\n"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "iris = datasets.load_iris()\n",
 30 |     "X = iris.data\n",
 31 |     "y = iris.target"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "text/plain": [
 42 |        "((100, 2), (100,))"
 43 |       ]
 44 |      },
 45 |      "execution_count": 3,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
  51 |     "# 我们知道逻辑回归是解决2分类问题的，但鸢尾花数据集有3个分类，所以我们只取其中2个分类\n",
  52 |     "# 另外为了可视化，我们只取前2个特征而不是全部4个特征\n",
 53 |     "X = X[y<2,:2]\n",
 54 |     "y = y[y<2]\n",
 55 |     "X.shape, y.shape"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 4,
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "data": {
 65 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFzRJREFUeJzt3X2MXFd5x/Hf45kUMG+RyAqi+GUrgagAhRCvQigIhdhUIVjmD6iaaikNauXiDSW0VLw0UqpaQqhCokDBRiujKqndEhqgDSilDYEW+gep1iEJBNMqUDuJS5uNKUlTt6lsP/3j3sW7s7Mz98zMmTnnzPcjXe3MnZO7z7n3+sndc597xtxdAICybJp0AACA0SO5A0CBSO4AUCCSOwAUiOQOAAUiuQNAgUjuAFAgkjsAFIjkDgAFajdtaGYtSUuSTrr77o7Prpf0UUkn61WfcvdDvbZ30UUX+ezsbFCwADDtjh49+ri7z/Rr1zi5S7pR0jFJz9vg89vc/d1NNzY7O6ulpaWAXw8AMLMTTdo1GpYxsy2S3iyp59U4ACANTcfcPy7p/ZLO9WjzVjN7wMxuN7Ot3RqY2V4zWzKzpeXl5dBYAQAN9U3uZrZb0mPufrRHsy9LmnX3SyXdJemWbo3cfdHd59x9bmam75ARAGBATa7cXytpj5kdl/Q5SVeb2eHVDdz9lLs/Xb89JGnHSKMEAATpm9zd/UPuvsXdZyVdJ+nr7v721W3M7OJVb/eouvEKAJiQkGqZNcxsv6Qld79D0nvMbI+kM5J+Iun60YQHABhE0ENM7v73KzXu7n5zndhXru5f7u6vdPc3uPsPYgQLTMSRI9LsrLRpU/XzyJFJRwT0NfCVOzAVjhyR9u6VTp+u3p84Ub2XpPn5ycUF9MH0A0AvN910PrGvOH26Wg8kjOQO9PLww2HrgUSQ3IFetm0LWw8kguQO9PLhD0ubN69dt3lztR5IGMkd6GV+XlpclLZvl8yqn4uL3ExF8qiWAfqZnyeZIztcuQNAgUjuAFAgkjsAFIjkDgAFIrkDQIFI7gBQIJI7ABSI5A4ABSK5A0CBSO4oB1+qAfwM0w+gDHypBrAGV+4oA1+qAaxBckcZ+FINYA2SO8rAl2oAa5DcUQa+VANYg+SOMvClGsAaVMugHHypBvAzXLljeNSXA8nhyh3Dob4cSBJX7hgO9eVAkkjuGA715UCSSO4YDvXlQJJI7hgO9eVAkkjuGA715UCSGlfLmFlL0pKkk+6+u+OzZ0i6VdIOSack/Yq7Hx9hnEgZ9eVAckKu3G+UdGyDz35D0n+6+4sl/bGkPxo2MCBL1PwjEY2Su5ltkfRmSYc2aPIWSbfUr2+XtNPMbPjwgIys1PyfOCG5n6/5J8FjAppeuX9c0vslndvg80skPSJJ7n5G0hOSXjB0dEBOqPlHQvomdzPbLekxdz867C8zs71mtmRmS8vLy8NuDkgLNf9ISJMr99dK2mNmxyV9TtLVZna4o81JSVslyczakp6v6sbqGu6+6O5z7j43MzMzVOBAcqj5R0L6Jnd3/5C7b3H3WUnXSfq6u7+9o9kdkn69fv22uo2PNFIgddT8IyED17mb2X4z21O//aykF5jZQ5J+V9IHRxEckBVq/pEQm9QF9tzcnC8tLU3kdwNArszsqLvP9WvHE6pI18KC1G5XV8HtdvUeQCPM5440LSxIBw+ef3/27Pn3Bw5MJiYgI1y5I02Li2HrAaxBckeazp4NWw9gDZI70tRqha0HsAbJHWla+R7WpusBrMENVaRp5abp4mI1FNNqVYmdm6lAIyR3pOvAAZI5MCCGZdDdrl1VffnKsmvXpCOaHOZoR4ZI7lhv1y7p7rvXrrv77ulM8MzRjkwx/QDW6/U9K9M2H9zsbJXQO23fLh0/Pu5oAKYfAEaCOdqRKZI70AtztCNTJHest3Nn2PqSMUc7MkVyx3
pf+9r6RL5zZ7V+2jBHOzLFDVUAyAg3VDGcWLXdIdulvhwYGE+oYr2V2u7Tp6v3K7Xd0nDDESHbjRUDMCUYlsF6sWq7Q7ZLfTnQFcMyGFys2u6Q7VJfDgyF5I71YtV2h2yX+nJgKCR3rBertjtku9SXA0MhuWO9WLXdIdulvhwYCjdUASAj3FCNLcca7BxjBjAQ6twHkWMNdo4xAxgYwzKDyLEGO8eYAazDsExMOdZg5xgzgIGR3AeRYw12jjEDGBjJfRA51mDnGDOAgZHcB5FjDXaOMQMYWN8bqmb2TEnflPQMVdU1t7v7H3S0uV7SRyWdrFd9yt0P9dpu1jdUAWBCRnlD9WlJV7v7KyVdJukaM7uyS7vb3P2yeumZ2DEhCwtSu11dubfb1ftRtE2lfj6VOIAE9K1z9+rS/qn67QX1Mpn6SQxuYUE6ePD8+7Nnz78/cGDwtqnUz6cSB5CIRnXuZtaSdFTSiyV92t0/0PH59ZI+ImlZ0r9I+h13f6TXNhmWGbN2u0rSnVot6cyZwdumUj+fShxAZCOtc3f3s+5+maQtkq4ws1d0NPmypFl3v1TSXZJu2SCovWa2ZGZLy8vLTX41RqVbst5ofUjbVOrnU4kDSERQtYy7/1TSNyRd07H+lLs/Xb89JGnHBv/9orvPufvczMzMIPFiUK1W8/UhbVOpn08lDiARfZO7mc2Y2YX162dJeqOkH3S0uXjV2z2Sjo0ySIzAyvhzk/UhbVOpn08lDiAV7t5zkXSppO9IekDS9yTdXK/fL2lP/fojkh6UdL+qK/tf6LfdHTt2OMZs3z73Vstdqn7u2zeatocPu2/f7m5W/Tx8eNSRN5NKHEBEkpa8T351dyYOA4CcMHFYbLFqqkPqy2NuO6R/Oe6LzFDCj2BNLu9jLFkPyxw+7L55czVksbJs3jz8MMC+fWu3ubL0GhKJse2Q/uW4LzITaxcjT2JYJqJYNdUh9eUxtx3Svxz3RWYo4cdqTYdlSO6D2LSpuoDqZCadOzf4ds02/mzY4xSy7ZD+5bgvMhNrFyNPjLnHFKumOqS+POa2Q/qX477IDCX8GATJfRCxaqpD6stjbjukfznui8xQwo+BNBmYj7FkfUPVPV5NdUh9ecxth/Qvx32RGUr4sULcUAWA8jDmjvVSqF1H1jgt8tF3PncUImS+c+ZGRxecFnlhWGZapFC7jqxxWqSBYRmsFTLfOXOjowtOi7yQ3KdFCrXryBqnRV5I7tMihdp1ZI3TIi8k92kxPy8tLlYDpGbVz8XF7nfCQtpianBa5IUbqgCQEW6orohVmBuy3VTmJadIOSmlH47S+xdiIvuiyWOsMZaxTD8QayLskO2mMi85k4InpfTDUXr/Qox6X4jpBxSvMDdku6nMS06RclJKPxyl9y/EqPcF87lL8SbCDtluKvOSMyl4Uko/HKX3L8So9wVj7lK8wtyQ7aYyLzlFykkp/XCU3r8Qk9oXZSf3WIW5IdtNZV5yipSTUvrhKL1/ISa2L5oMzMdYxjafe6yJsEO2m8q85EwKnpTSD0fp/Qsxyn0hbqgCQHkYc48thfr5XbuquzIry65do4kBKEisx0ySr+NvcnkfY8n6a/ZSqJ/fubN7/fzOncPFABQk1mMmk6zjF8MyEaVQP59KiSWQsFiPmUyyjp9hmZhiTWzNhNnASHVL7L3WN5XDP1WS+yBSqJ8H0Fesx0xy+KdKch9ECvXzO3d238ZG64EpFOsxkyzq+JsMzMdYsr6h6p5G/XznTVVupgLrxHrMZFJ1/OKGKgCUZ2Q3VM3smWb2T2Z2v5k9aGZ/2KXNM8zsNjN7yMzuMbPZwcJuILS4NPli1A4hRbmF74uY4cbczU3F7F9mhzpI4af96PS7tJdkkp5Tv75A0j2SruxosyDpM/Xr6yTd1m+7Aw3LhBaX5japdEhRbuH7Ima4MXdzUzH7l9mhDlL4ad+IGg7LBI2TS9os6V5Jr+5Y/7eSXlO/bkt6XPV0whstAy
X37du7/6vcvn007SdtZWCwc2m11rctfF/EDDfmbm4qZv8yO9RBCj/tG2ma3BuNuZtZS9JRSS+W9Gl3/0DH59+TdI27P1q//2H9P4DHO9rtlbRXkrZt27bjRLenAHoJnRg5t0mlQx5MKnxfxAw35m5uKmb/MjvUQQo/7RsZ6UNM7n7W3S+TtEXSFWb2ikGCcvdFd59z97mZmZnwDYQWl+ZQjLpaSFFu4fsiZrgxd3NTMfuX2aEOUvhpP1JBde7u/lNJ35B0TcdHJyVtlSQza0t6vqRTowhwjdDi0iyKUVcJKcotfF/EDDfmbm4qZv8yO9RBCj/tR6vfuI2kGUkX1q+fJelbknZ3tLlBa2+ofr7fdgeucw8tLs1tUumQotzC90XMcGPu5qZi9i+zQx2k8NO+L41qzN3MLpV0i6SWqiv9z7v7fjPbX/+SO8zsmZL+TNKrJP1E0nXu/qNe26XOHQDCNR1zb/dr4O4PqEranetvXvX6fyX9cmiQAIA4yp9bZmqfYEAvIadFCqdQzAd3cntIK4XjkYUmYzcxlrHMLVPiEwwYWshpkcIpFPPBndwe0krheEyamFtGk51RH8kKOS1SOIVCY0ihf7ltNydNx9zLTu4lPsGAoYWcFimcQjEf3MntIa0Ujsek8U1M0nQ/wYANhZwWKZxCMR/cye0hrRSORy7KTu5T/QQDNhJyWqRwCsV8cCe3h7RSOB7ZaDIwH2MZ25d1lPYEA0Yi5LRI4RSK+eBObg9ppXA8JkncUAWA8jDmDoxIyBd7pCK3mFOpXU8ljpFocnkfY8n+O1QxFUK+2CMVucWcSu16KnH0I4ZlgOG129LZs+vXt1rSmTPjj6eJ3GJOpXY9lTj6YVgGGIFuSbLX+hTkFvPDD4etLz2OUSG5Az2EfLFHKnKLOZXa9VTiGBWSO9BDyBd7pCK3mFOpXU8ljpFpMjAfY+GGKnIR8sUeqcgt5lRq11OJoxdxQxUAysMNVYxNjrXBsWKOVV+e4z7GhDW5vI+xMCxThlxqg1eLFXOs+vIc9zHiEcMyGIdcaoNXixVzrPryHPcx4mFYBmORY21wrJhj1ZfnuI8xeSR3DCXH2uBYMceqL89xH2PySO4YSo61wbFijlVfnuM+RgKaDMzHWLihWo4caoM7xYo5Vn15jvsYcYgbqgBQHm6oYurEqgUP2S716EhFe9IBAKNw5Eg1tn36dPX+xInzY93z8+PZbqwYgEEwLIMixKoFD9ku9egYB4ZlMFVi1YKHbJd6dKSE5I4ixKoFD9ku9ehICckdRYhVCx6yXerRkRKSO4owPy8tLlbj22bVz8XF4W9khmw3VgzAIPreUDWzrZJulfRCSS5p0d0/0dHmKkl/Lelf61VfdPf9vbbLDVUACDfKG6pnJL3P3V8m6UpJN5jZy7q0+5a7X1YvPRM70pdjvTb16PGx3zLS5DHW1YuqK/Q3dqy7StJXQrbD9APpynH+8JCYc+xfCthvaVCM6QfMbFbSNyW9wt2fXLX+KklfkPSopH+T9Hvu/mCvbTEsk64c67WpR4+P/ZaGpsMyjZO7mT1H0j9I+rC7f7Hjs+dJOufuT5nZtZI+4e4v6bKNvZL2StK2bdt2nOh2pmDiNm2qrss6mUnnzo0/niZCYs6xfylgv6VhpA8xmdkFqq7Mj3Qmdkly9yfd/an69Z2SLjCzi7q0W3T3OXefm5mZafKrMQE51mtTjx4f+y0vfZO7mZmkz0o65u4f26DNi+p2MrMr6u2eGmWgGJ8c67WpR4+P/ZaZfoPykl6nqgTyAUn31cu1kt4l6V11m3dLelDS/ZK+LekX+22XG6ppy3H+8JCYc+xfCthvkyfmcweA8jBx2BSg5nithQWp3a5u8LXb1XtgWjGfe6aYO3ythQXp4MHz78+ePf/+wIHJxARMEsMymaLmeK12u0ronVot6cyZ8ccDxMKwTOGYO3ytbom913qgdCT3TFFzvFarFbYeKB3JPVPUHK+1cr+h6XqgdC
T3TDF3+FoHDkj79p2/Um+1qvfcTMW04oYqAGSEG6qDKLxwvPDuFd+/FLCPM9LkMdYYS3LTDxQ+WXXh3Su+fylgH6dBTD8QqPDC8cK7V3z/UsA+TsPI53MfteSSe+GTVRfeveL7lwL2cRoYcw9VeOF44d0rvn8pYB/nheS+ovDC8cK7V3z/UsA+zgvJfUXhheOFd6/4/qWAfZwXxtwBICOMuQMFiVlfTu16mZjPHUhczLn7+V6AcjEsAyQuZn05tev5YVgGKETMufv5XoBykdyBxMWsL6d2vVwkdyBxMevLqV0vF8kdSFzM+nJq18vFDVUAyAg3VAFgipHcAaBAJHcAKBDJHQAKRHIHgAKR3AGgQCR3ACgQyR0ACtQ3uZvZVjP7hpl938weNLMbu7QxM/ukmT1kZg+Y2eVxwsUwmLcbmB5N5nM/I+l97n6vmT1X0lEzu8vdv7+qzZskvaReXi3pYP0TiWDebmC69L1yd/cfu/u99ev/knRM0iUdzd4i6VavfFvShWZ28cijxcBuuul8Yl9x+nS1HkB5gsbczWxW0qsk3dPx0SWSHln1/lGt/x+AzGyvmS2Z2dLy8nJYpBgK83YD06Vxcjez50j6gqT3uvuTg/wyd1909zl3n5uZmRlkExgQ83YD06VRcjezC1Ql9iPu/sUuTU5K2rrq/ZZ6HRLBvN3AdGlSLWOSPivpmLt/bINmd0h6R101c6WkJ9z9xyOME0Ni3m5gujSplnmtpF+T9F0zu69e9/uStkmSu39G0p2SrpX0kKTTkt45+lAxrPl5kjkwLfomd3f/R0nWp41LumFUQQEAhsMTqgBQIJI7ABSI5A4ABSK5A0CBSO4AUCCSOwAUiOQOAAWyqkR9Ar/YbFnSiYn88v4ukvT4pIOIiP7lq+S+SfSvie3u3ndyrokl95SZ2ZK7z006jljoX75K7ptE/0aJYRkAKBDJHQAKRHLvbnHSAURG//JVct8k+jcyjLkDQIG4cgeAAk11cjezlpl9x8y+0uWz681s2czuq5ffnESMwzCz42b23Tr+pS6fm5l90sweMrMHzOzyScQ5iAZ9u8rMnlh1/G6eRJyDMrMLzex2M/uBmR0zs9d0fJ7tsZMa9S/b42dmL10V931m9qSZvbejTfTj1+TLOkp2o6Rjkp63wee3ufu7xxhPDG9w943qat8k6SX18mpJB+ufuejVN0n6lrvvHls0o/UJSV9197eZ2c9J6viSxOyPXb/+SZkeP3f/Z0mXSdUFpKqvHP1SR7Pox29qr9zNbIukN0s6NOlYJugtkm71yrclXWhmF086qGlnZs+X9HpVX28pd/8/d/9pR7Nsj13D/pVip6QfunvnA5vRj9/UJndJH5f0fknnerR5a/0n0+1mtrVHu1S5pL8zs6NmtrfL55dIemTV+0frdTno1zdJeo2Z3W9mf2NmLx9ncEP6eUnLkv60HjY8ZGbP7miT87Fr0j8p3+O32nWS/qLL+ujHbyqTu5ntlvSYux/t0ezLkmbd/VJJd0m6ZSzBjdbr3P1yVX8C3mBmr590QCPUr2/3qnpM+5WS/kTSX407wCG0JV0u6aC7v0rSf0v64GRDGqkm/cv5+EmS6uGmPZL+chK/fyqTu6ov/d5jZsclfU7S1WZ2eHUDdz/l7k/Xbw9J2jHeEIfn7ifrn4+pGvO7oqPJSUmr/yLZUq9LXr++ufuT7v5U/fpOSReY2UVjD3Qwj0p61N3vqd/frioZrpbtsVOD/mV+/Fa8SdK97v4fXT6LfvymMrm7+4fcfYu7z6r6s+nr7v721W06xr/2qLrxmg0ze7aZPXfltaRfkvS9jmZ3SHpHfef+SklPuPuPxxxqsCZ9M7MXmZnVr69Qda6fGnesg3D3f5f0iJm9tF61U9L3O5pleeykZv3L+fit8qvqPiQjjeH4TXu1zBpmtl/SkrvfIek9ZrZH0hlJP5F0/SRjG8ALJX2p/vfRlvTn7v5VM3uXJLn7ZyTdKelaSQ9JOi
3pnROKNVSTvr1N0j4zOyPpfyRd53k9sffbko7Uf9r/SNI7Czl2K/r1L+vjV190vFHSb61aN9bjxxOqAFCgqRyWAYDSkdwBoEAkdwAoEMkdAApEcgeAApHcAaBAJHcAKBDJHQAK9P9IUj1h6gimcQAAAABJRU5ErkJggg==\n",
 66 |       "text/plain": [
 67 |        "<matplotlib.figure.Figure at 0x109124550>"
 68 |       ]
 69 |      },
 70 |      "metadata": {},
 71 |      "output_type": "display_data"
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "# 分类0的散点图\n",
 76 |     "plt.scatter(X[y==0,0], X[y==0,1], color='red')\n",
 77 |     "\n",
 78 |     "# 分类1的散点图\n",
 79 |     "plt.scatter(X[y==1,0], X[y==1,1], color='blue')\n",
 80 |     "plt.show()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## 使用我们自己编写的逻辑回归"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 5,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "from playML.model_selection import train_test_split\n",
 97 |     "\n",
 98 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 6,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "data": {
108 |       "text/plain": [
109 |        "LogisticRegression()"
110 |       ]
111 |      },
112 |      "execution_count": 6,
113 |      "metadata": {},
114 |      "output_type": "execute_result"
115 |     }
116 |    ],
117 |    "source": [
118 |     "from playML.logistic_regression import LogisticRegression\n",
119 |     "\n",
120 |     "log_reg = LogisticRegression()\n",
121 |     "log_reg.fit(X_train, y_train)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 7,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "data": {
131 |       "text/plain": [
132 |        "1.0"
133 |       ]
134 |      },
135 |      "execution_count": 7,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "log_reg.score(X_test, y_test)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "评分结果不错，不过当然是因为我们的数据很简单"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 8,
154 |    "metadata": {},
155 |    "outputs": [
156 |     {
157 |      "data": {
158 |       "text/plain": [
159 |        "array([ 0.92972035,  0.98664939,  0.14852024,  0.17601199,  0.0369836 ,\n        0.0186637 ,  0.04936918,  0.99669244,  0.97993941,  0.74524655,\n        0.04473194,  0.00339285,  0.26131273,  0.0369836 ,  0.84192923,\n        0.79892262,  0.82890209,  0.32358166,  0.06535323,  0.20735334])"
160 |       ]
161 |      },
162 |      "execution_count": 8,
163 |      "metadata": {},
164 |      "output_type": "execute_result"
165 |     }
166 |    ],
167 |    "source": [
168 |     "log_reg.predict_proba(X_test)"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 9,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "data": {
178 |       "text/plain": [
179 |        "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])"
180 |       ]
181 |      },
182 |      "execution_count": 9,
183 |      "metadata": {},
184 |      "output_type": "execute_result"
185 |     }
186 |    ],
187 |    "source": [
188 |     "y_test"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 10,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])"
200 |       ]
201 |      },
202 |      "execution_count": 10,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "log_reg.predict(X_test)"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": []
217 |   }
218 |  ],
219 |  "metadata": {
220 |   "kernelspec": {
221 |    "display_name": "Python 2",
222 |    "language": "python",
223 |    "name": "python2"
224 |   },
225 |   "language_info": {
226 |    "codemirror_mode": {
227 |     "name": "ipython",
228 |     "version": 2
229 |    },
230 |    "file_extension": ".py",
231 |    "mimetype": "text/x-python",
232 |    "name": "python",
233 |    "nbconvert_exporter": "python",
234 |    "pygments_lexer": "ipython2",
235 |    "version": "2.7.6"
236 |   }
237 |  },
238 |  "nbformat": 4,
239 |  "nbformat_minor": 0
240 | }
241 | 


--------------------------------------------------------------------------------
/c1_knn/04_Hyper_Parameter_kNN.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# 超参数"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import matplotlib\n",
 20 |     "import matplotlib.pyplot as plt\n",
 21 |     "from sklearn import datasets"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## 一个识别手写数字的例子"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "digits = datasets.load_digits()"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {},
 44 |    "outputs": [
 45 |     {
 46 |      "data": {
 47 |       "text/plain": [
 48 |        "dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])"
 49 |       ]
 50 |      },
 51 |      "execution_count": 3,
 52 |      "metadata": {},
 53 |      "output_type": "execute_result"
 54 |     }
 55 |    ],
 56 |    "source": [
 57 |     "digits.keys()"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 4,
 63 |    "metadata": {},
 64 |    "outputs": [
 65 |     {
 66 |      "name": "stdout",
 67 |      "output_type": "stream",
 68 |      "text": [
 69 |       "Optical Recognition of Handwritten Digits Data Set\n===================================================\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n    Graduate Studies in Science and Engineering, Bogazici University.\n  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n    Linear dimensionalityreduction using relevance weighted LDA. School of\n    Electrical and Electronic Engineering Nanyang Technological University.\n    2005.\n  - Claudio Gentile. 
A New Approximate Maximal Margin Classification\n    Algorithm. NIPS. 2000.\n\n"
 70 |      ]
 71 |     }
 72 |    ],
 73 |    "source": [
 74 |     "print(digits.DESCR)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 5,
 80 |    "metadata": {},
 81 |    "outputs": [
 82 |     {
 83 |      "data": {
 84 |       "text/plain": [
 85 |        "array([[  0.,   0.,   5., ...,   0.,   0.,   0.],\n       [  0.,   0.,   0., ...,  10.,   0.,   0.],\n       [  0.,   0.,   0., ...,  16.,   9.,   0.],\n       ..., \n       [  0.,   0.,   1., ...,   6.,   0.,   0.],\n       [  0.,   0.,   2., ...,  12.,   0.,   0.],\n       [  0.,   0.,  10., ...,  12.,   1.,   0.]])"
 86 |       ]
 87 |      },
 88 |      "execution_count": 5,
 89 |      "metadata": {},
 90 |      "output_type": "execute_result"
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "digits.data"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 6,
100 |    "metadata": {},
101 |    "outputs": [
102 |     {
103 |      "data": {
104 |       "text/plain": [
105 |        "(1797, 64)"
106 |       ]
107 |      },
108 |      "execution_count": 6,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "np.shape(digits.data)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "特征"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 7,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "X = digits.data"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "分类"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 8,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "(1797,)"
149 |       ]
150 |      },
151 |      "execution_count": 8,
152 |      "metadata": {},
153 |      "output_type": "execute_result"
154 |     }
155 |    ],
156 |    "source": [
157 |     "y = digits.target\n",
158 |     "np.shape(y)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "## 可视化"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 9,
171 |    "metadata": {},
172 |    "outputs": [
173 |     {
174 |      "data": {
175 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPgAAAD8CAYAAABaQGkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAACu5JREFUeJzt3d2LXeUZhvH77qi0qTYDTVokid1BJCCFTmQTkBRjIpZYxeSgBwkoJhRypCgtiPZE+g9IelAEiU4EE6WNSkSsVtDRCq11kkxa82FJw5RM0GZCGfw4aIg+PZgViJIya7LX1zxcPwidj81+n01zudbsWVmvI0IAcvpG2wMAqA+BA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYFXU86ZIlS6LX69Xx1K2amZlpdL3JycnG1hoaGmpsreuvv76xtRYtWtTYWk2anJzU2bNnPdfjagm81+tpfHy8jqdu1f79+xtd77777mtsreHh4cbW2rt3b2NrjYyMNLZWk/r9fqnHcYoOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGKlAre90faHtk/YfqTuoQBUY87AbQ9J+q2kOyTdKGmr7RvrHgzA4MocwddIOhERJyPinKTnJW2qdywAVSgT+DJJpy76fKr4GoCOq+xNNts7bI/bHp+enq7qaQEMoEzgpyWtuOjz5cXXviIinoyIfkT0ly5dWtV8AAZQJvD3Jd1ge6XtqyRtkfRyvWMBqMKc/x48Is7bvl/S65KGJD0dEUdqnwzAwErd8CEiXpX0as2zAKgYV7IBiRE4kBiBA4kROJAYgQOJETiQGIEDiRE4kFgtO5tk9dhjj7U9Qm02b97c2Fq33nprY2tNTEw0tpY0u6tPl3AEBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK7OzydO2z9j+oImBAFSnzBF8t6SNNc8BoAZzBh4R70j6TwOzAKgYP4MDibF1EZBYZYGzdRHQPZyiA4mV+TXZc5L+LGmV7SnbP69/LABVKLM32dYmBgFQPU7RgcQIHEiMwIHECBxIjMCBxAgcSIzAgcQIHEhswW9dNDY21thahw8fbmwtSVq3bl1ja+3cubOxtWZmZhpbq8m/H5K0bdu2RtebC0dwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK3PTxRW237J91PYR2w82MRiAwZW5Fv28pF9GxEHb10g6YPuNiDha82wABlRmb7KPIuJg8fGnko5JWlb3YAAGN6+fwW33JK2W9N4lvsfWRUDHlA7c9tWSXpD0UER88vXvs3UR0D2lArd9pWbj3hMRL9Y7EoCqlHkX3ZKeknQsIh6vfyQAVSlzBF8r6V5JG2xPFH9+WvNcACpQZm+ydyW5gVkAVIwr2YDECBxIjMCBxAgcSIzAgcQIHEiMwIHECBxIjL3JOmxkZKTtEWrR6/UaW4u9yQCkReBAYgQOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJFbmpovftP1X24eLrYt+3cRgAAZX5lLV/0raEBGfFbdPftf2HyLiLzXPBmBAZW66GJI+Kz69svgTdQ4FoBplNz4Ysj0h6YykNyKCrYuABaBU4BHxRUSMSFouaY3tH17iMWxdBHTMvN5Fj4gZSW9J2ljPOACqVOZd9KW2h4uPvyXpdknH6x4MwODKvIt+raRnbA9p9j8Iv4uIV+odC0AVyryL/jfN7gkOYIHhSjYgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHElvwWxcNDw83ttbixYsbW0uS1q9f3+h6TWlyO6Em/350EUdwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCCx0oEX90Y/ZJv7sQELxHyO4A9KOlbXIACqV3Znk+WS7pS0q95xAFSp7BF8p6SHJX1Z4ywAKlZm44O7JJ2JiANzPI69yYCOKX
MEXyvpbtuTkp6XtMH2s19/EHuTAd0zZ+AR8WhELI+InqQtkt6MiHtqnwzAwPg9OJDYvO7oEhFjksZqmQRA5TiCA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYgt+6qEm9Xq/R9TZt2tTYWvv3729srbfffruxtUZHRxtbq4s4ggOJETiQGIEDiRE4kBiBA4kROJAYgQOJETiQGIEDiZW6kq24o+qnkr6QdD4i+nUOBaAa87lUdX1EnK1tEgCV4xQdSKxs4CHpj7YP2N5R50AAqlP2FP3HEXHa9vckvWH7eES8c/EDivB3SNJ1111X8ZgALkepI3hEnC7+94yklyStucRj2LoI6Jgymw9+2/Y1Fz6W9BNJH9Q9GIDBlTlF/76kl2xfePzeiHit1qkAVGLOwCPipKQfNTALgIrxazIgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHEnNEVP6k/X4/xsfHK3/ethVX8zVm3bp1ja01MTHR2FpNbgE1NjbW2FqSNDw83Mg6/X5f4+Pjc/6F5AgOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRWKnDbw7b32T5u+5jtm+seDMDgyt4X/TeSXouIn9m+StKiGmcCUJE5A7e9WNItkrZJUkSck3Su3rEAVKHMKfpKSdOSRm0fsr2ruD86gI4rE/gVkm6S9ERErJb0uaRHvv4g2ztsj9sen56ernhMAJejTOBTkqYi4r3i832aDf4r2LoI6J45A4+IjyWdsr2q+NJtko7WOhWASpR9F/0BSXuKd9BPStpe30gAqlIq8IiYkNSveRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSI3AgsbKXqkLS6Ohoo+tt397cFcFN7oO2e/fuxtZqaq+wruIIDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRG4EBiBA4kNmfgtlfZnrjozye2H2piOACDmfNS1Yj4UNKIJNkeknRa0ks1zwWgAvM9Rb9N0j8j4l91DAOgWvMNfIuk5y71DbYuArqndODFpgd3S/r9pb7P1kVA98znCH6HpIMR8e+6hgFQrfkEvlX/5/QcQDeVCrzYD/x2SS/WOw6AKpXdm+xzSd+teRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSc0RU/6T2tKT5/pPSJZLOVj5MN2R9bbyu9vwgIub8V121BH45bI9HRL/tOeqQ9bXxurqPU3QgMQIHEutS4E+2PUCNsr42XlfHdeZncADV69IRHEDFOhG47Y22P7R9wvYjbc9TBdsrbL9l+6jtI7YfbHumKtkesn3I9ittz1Il28O299k+bvuY7ZvbnmkQrZ+iF/da/4dm7xgzJel9SVsj4mirgw3I9rWSro2Ig7avkXRA0uaF/rousP0LSX1J34mIu9qepyq2n5H0p4jYVdxodFFEzLQ91+XqwhF8jaQTEXEyIs5Jel7SppZnGlhEfBQRB4uPP5V0TNKydqeqhu3lku6UtKvtWapke7GkWyQ9JUkRcW4hxy11I/Blkk5d9PmUkoRwge2epNWS3mt3ksrslPSwpC/bHqRiKyVNSxotfvzYVdyPcMHqQuCp2b5a0guSHoqIT9qeZ1C275J0JiIOtD1LDa6QdJOkJyJitaTPJS3o94S6EPhpSSsu+nx58bUFz/aVmo17T0RkuSPtWkl3257U7I9TG2w/2+5IlZmSNBURF8609mk2+AWrC4G/L+kG2yuLNzW2SHq55ZkGZtua/VnuWEQ83vY8VYmIRyNieUT0NPv/1ZsRcU/LY1UiIj6WdMr2quJLt0la0G+Klrptcp0i4rzt+yW9LmlI0tMRcaTlsaqwVtK9kv5ue6L42q8i4tUWZ8LcHpC0pzjYnJS0veV5BtL6r8kA1KcLp+gAakLgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGL/A9ozs2W/5x3pAAAAAElFTkSuQmCC\n",
176 |       "text/plain": [
177 |        "<matplotlib.figure.Figure at 0x1088395c0>"
178 |       ]
179 |      },
180 |      "metadata": {},
181 |      "output_type": "display_data"
182 |     }
183 |    ],
184 |    "source": [
185 |     "some_digit = X[666]\n",
186 |     "some_digit_image = some_digit.reshape(8,8)\n",
187 |     "plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)\n",
188 |     "plt.show()"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "## train test split"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 10,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "from c1_knn.model_selection import train_test_split\n",
205 |     "from c1_knn.kNN import KNNClassifier\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 11,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 12,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "my_knn_clf = KNNClassifier(k=3)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 13,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "data": {
233 |       "text/plain": [
234 |        "kNN(k=3)"
235 |       ]
236 |      },
237 |      "execution_count": 13,
238 |      "metadata": {},
239 |      "output_type": "execute_result"
240 |     }
241 |    ],
242 |    "source": [
243 |     "my_knn_clf.fit(X_train, y_train)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 14,
249 |    "metadata": {},
250 |    "outputs": [
251 |     {
252 |      "data": {
253 |       "text/plain": [
254 |        "0.99164345403899723"
255 |       ]
256 |      },
257 |      "execution_count": 14,
258 |      "metadata": {},
259 |      "output_type": "execute_result"
260 |     }
261 |    ],
262 |    "source": [
263 |     "y_predict = my_knn_clf.predict(X_test)\n",
264 |     "# 正确率\n",
265 |     "np.sum(y_predict==y_test) / np.shape(y_test)[0]"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "### 把统计正确率封装为一个方法"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 15,
278 |    "metadata": {},
279 |    "outputs": [
280 |     {
281 |      "data": {
282 |       "text/plain": [
283 |        "0.99164345403899723"
284 |       ]
285 |      },
286 |      "execution_count": 15,
287 |      "metadata": {},
288 |      "output_type": "execute_result"
289 |     }
290 |    ],
291 |    "source": [
292 |     "from c1_knn.metrics import accuracy_score\n",
293 |     "accuracy_score(y_test, y_predict)"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "markdown",
298 |    "metadata": {},
299 |    "source": [
300 |     "## sklearn中的accuracy_score"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 16,
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": [
309 |     "from sklearn.model_selection._split import train_test_split as train_test_spl\n",
310 |     "from sklearn.neighbors.classification import KNeighborsClassifier\n",
311 |     "from sklearn.metrics import accuracy_score as score\n"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 17,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "X_train, X_test, y_train, y_test = train_test_spl(X, y, test_size=0.2, random_state=666)"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": 18,
326 |    "metadata": {},
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n           weights='uniform')"
332 |       ]
333 |      },
334 |      "execution_count": 18,
335 |      "metadata": {},
336 |      "output_type": "execute_result"
337 |     }
338 |    ],
339 |    "source": [
340 |     "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
341 |     "knn_clf.fit(X_train, y_train)"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 19,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "y_predict = knn_clf.predict(X_test)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 20,
356 |    "metadata": {},
357 |    "outputs": [
358 |     {
359 |      "data": {
360 |       "text/plain": [
361 |        "0.98888888888888893"
362 |       ]
363 |      },
364 |      "execution_count": 20,
365 |      "metadata": {},
366 |      "output_type": "execute_result"
367 |     }
368 |    ],
369 |    "source": [
370 |     "score(y_test,y_predict)"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 21,
376 |    "metadata": {},
377 |    "outputs": [
378 |     {
379 |      "data": {
380 |       "text/plain": [
381 |        "0.98888888888888893"
382 |       ]
383 |      },
384 |      "execution_count": 21,
385 |      "metadata": {},
386 |      "output_type": "execute_result"
387 |     }
388 |    ],
389 |    "source": [
390 |     "knn_clf.score(X_test, y_test)"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "markdown",
395 |    "metadata": {},
396 |    "source": [
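397 |     "`score(y_test, y_predict)`（即 sklearn.metrics 中的 accuracy_score）与分类器自带的 `knn_clf.score(X_test, y_test)` 两种方式得到的结果一样"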
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {},
404 |    "outputs": [],
405 |    "source": []
406 |   }
407 |  ],
408 |  "metadata": {
409 |   "kernelspec": {
410 |    "display_name": "Python 2",
411 |    "language": "python",
412 |    "name": "python2"
413 |   },
414 |   "language_info": {
415 |    "codemirror_mode": {
416 |     "name": "ipython",
417 |     "version": 2
418 |    },
419 |    "file_extension": ".py",
420 |    "mimetype": "text/x-python",
421 |    "name": "python",
422 |    "nbconvert_exporter": "python",
423 |    "pygments_lexer": "ipython2",
424 |    "version": "2.7.6"
425 |   }
426 |  },
427 |  "nbformat": 4,
428 |  "nbformat_minor": 0
429 | }
430 | 


--------------------------------------------------------------------------------
/c1_knn/08_Scaler_in_Scikit_Learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Scikit-learn中的Scaler"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "from sklearn import datasets"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "iris = datasets.load_iris()"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 3,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "X = iris.data\n",
 38 |     "y = iris.target"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 4,
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/plain": [
 49 |        "array([[ 5.1,  3.5,  1.4,  0.2],\n       [ 4.9,  3. ,  1.4,  0.2],\n       [ 4.7,  3.2,  1.3,  0.2],\n       [ 4.6,  3.1,  1.5,  0.2],\n       [ 5. ,  3.6,  1.4,  0.2],\n       [ 5.4,  3.9,  1.7,  0.4],\n       [ 4.6,  3.4,  1.4,  0.3],\n       [ 5. ,  3.4,  1.5,  0.2],\n       [ 4.4,  2.9,  1.4,  0.2],\n       [ 4.9,  3.1,  1.5,  0.1]])"
 50 |       ]
 51 |      },
 52 |      "execution_count": 4,
 53 |      "metadata": {},
 54 |      "output_type": "execute_result"
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "# 可以看到X尚未归一化时的数据\n",
 59 |     "X[:10,:]"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 5,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "from sklearn.model_selection._split import train_test_split\n",
 69 |     "\n",
 70 |     "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666)\n"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "## Scikit-learn中的StandardScaler，进行0均值标准化处理"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 6,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "from sklearn.preprocessing.data import StandardScaler"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 7,
 92 |    "metadata": {},
 93 |    "outputs": [
 94 |     {
 95 |      "data": {
 96 |       "text/plain": [
 97 |        "StandardScaler(copy=True, with_mean=True, with_std=True)"
 98 |       ]
 99 |      },
100 |      "execution_count": 7,
101 |      "metadata": {},
102 |      "output_type": "execute_result"
103 |     }
104 |    ],
105 |    "source": [
106 |     "standard_scaler = StandardScaler()\n",
107 |     "standard_scaler.fit(X_train)"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 8,
113 |    "metadata": {},
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "text/plain": [
118 |        "array([ 5.83416667,  3.0825    ,  3.70916667,  1.16916667])"
119 |       ]
120 |      },
121 |      "execution_count": 8,
122 |      "metadata": {},
123 |      "output_type": "execute_result"
124 |     }
125 |    ],
126 |    "source": [
127 |     "# 训练集X特征矩阵的均值\n",
128 |     "standard_scaler.mean_"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 11,
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "array([ 0.81019502,  0.44076874,  1.76295187,  0.75429833])"
140 |       ]
141 |      },
142 |      "execution_count": 11,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "# 训练集X特征矩阵的标准差\n",
149 |     "standard_scaler.scale_"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 12,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "text/plain": [
160 |        "array([[-0.90616043,  0.94720873, -1.30982967, -1.28485856],\n       [-1.15301457, -0.18717298, -1.30982967, -1.28485856],\n       [-0.16559799, -0.64092567,  0.22169257,  0.17345038],\n       [ 0.45153738,  0.72033239,  0.95909217,  1.49918578],\n       [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],\n       [ 1.43895396,  0.2665797 ,  0.56203085,  0.30602392],\n       [ 0.3281103 , -1.09467835,  1.07253826,  0.30602392],\n       [ 2.1795164 , -0.18717298,  1.63976872,  1.2340387 ],\n       [-0.78273335,  2.30846679, -1.25310662, -1.4174321 ],\n       [ 0.45153738, -2.00218372,  0.44858475,  0.43859746],\n       [ 1.80923518, -0.41404933,  1.46959958,  0.83631808],\n       [ 0.69839152,  0.2665797 ,  0.90236912,  1.49918578],\n       [ 0.20468323,  0.72033239,  0.44858475,  0.571171  ],\n       [-0.78273335, -0.86780201,  0.10824648,  0.30602392],\n       [-0.53587921,  1.40096142, -1.25310662, -1.28485856],\n       [-0.65930628,  1.40096142, -1.25310662, -1.28485856],\n       [-1.0295875 ,  0.94720873, -1.19638358, -0.7545644 ],\n       [-1.77014994, -0.41404933, -1.30982967, -1.28485856],\n       [-0.04217092, -0.86780201,  0.10824648,  0.04087684],\n       [-0.78273335,  0.72033239, -1.30982967, -1.28485856],\n       [-1.52329579,  0.72033239, -1.30982967, -1.15228502],\n       [ 0.82181859,  0.2665797 ,  0.78892303,  1.10146516],\n       [-0.16559799, -0.41404933,  0.27841562,  0.17345038],\n       [ 0.94524567, -0.18717298,  0.39186171,  0.30602392],\n       [ 0.20468323, -0.41404933,  0.44858475,  0.43859746],\n       [-1.39986872,  0.2665797 , -1.19638358, -1.28485856],\n       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],\n       [ 1.06867274,  0.03970336,  1.07253826,  1.63175932],\n       [ 0.57496445, -0.86780201,  0.67547694,  0.83631808],\n       [ 0.3281103 , -0.64092567,  0.56203085,  0.04087684],\n       [ 0.45153738, -0.64092567,  0.61875389,  0.83631808],\n       [-0.16559799,  2.98909581, -1.25310662, -1.01971148],\n  
     [ 0.57496445, -1.3215547 ,  0.67547694,  0.43859746],\n       [ 0.69839152, -0.41404933,  0.33513866,  0.17345038],\n       [-0.90616043,  1.62783776, -1.02621444, -1.01971148],\n       [ 1.19209981, -0.64092567,  0.61875389,  0.30602392],\n       [-0.90616043,  0.94720873, -1.30982967, -1.15228502],\n       [-1.89357701, -0.18717298, -1.47999881, -1.4174321 ],\n       [ 0.08125616, -0.18717298,  0.78892303,  0.83631808],\n       [ 0.69839152, -0.64092567,  1.07253826,  1.2340387 ],\n       [-0.28902506, -0.64092567,  0.67547694,  1.10146516],\n       [-0.41245214, -1.54843104, -0.00519961, -0.22427024],\n       [ 1.31552689,  0.03970336,  0.67547694,  0.43859746],\n       [ 0.57496445,  0.72033239,  1.07253826,  1.63175932],\n       [ 0.82181859, -0.18717298,  1.18598435,  1.36661224],\n       [-0.16559799,  1.62783776, -1.13966053, -1.15228502],\n       [ 0.94524567, -0.41404933,  0.5053078 ,  0.17345038],\n       [ 1.06867274,  0.49345605,  1.12926131,  1.76433286],\n       [-1.27644165, -0.18717298, -1.30982967, -1.4174321 ],\n       [-1.0295875 ,  1.17408507, -1.30982967, -1.28485856],\n       [ 0.20468323, -0.18717298,  0.61875389,  0.83631808],\n       [-1.0295875 , -0.18717298, -1.19638358, -1.28485856],\n       [ 0.3281103 , -0.18717298,  0.67547694,  0.83631808],\n       [ 0.69839152,  0.03970336,  1.01581521,  0.83631808],\n       [-0.90616043,  1.40096142, -1.25310662, -1.01971148],\n       [-0.16559799, -0.18717298,  0.27841562,  0.04087684],\n       [-1.0295875 ,  0.94720873, -1.36655271, -1.15228502],\n       [-0.90616043,  1.62783776, -1.25310662, -1.15228502],\n       [-1.52329579,  0.2665797 , -1.30982967, -1.28485856],\n       [-0.53587921, -0.18717298,  0.44858475,  0.43859746],\n       [ 0.82181859, -0.64092567,  0.5053078 ,  0.43859746],\n       [ 0.3281103 , -0.64092567,  0.16496953,  0.17345038],\n       [-1.27644165,  0.72033239, -1.19638358, -1.28485856],\n       [-0.90616043,  0.49345605, -1.13966053, -0.88713794],\n       
[-0.04217092, -0.86780201,  0.78892303,  0.96889162],\n       [-0.28902506, -0.18717298,  0.22169257,  0.17345038],\n       [ 0.57496445, -0.64092567,  0.78892303,  0.43859746],\n       [ 1.06867274,  0.49345605,  1.12926131,  1.2340387 ],\n       [ 1.68580811, -0.18717298,  1.18598435,  0.571171  ],\n       [ 1.06867274, -0.18717298,  0.84564608,  1.49918578],\n       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],\n       [-1.15301457, -1.3215547 ,  0.44858475,  0.70374454],\n       [-0.16559799, -1.3215547 ,  0.73219998,  1.10146516],\n       [-1.15301457, -1.54843104, -0.2320918 , -0.22427024],\n       [-0.41245214, -1.54843104,  0.05152343, -0.0916967 ],\n       [ 1.06867274, -1.3215547 ,  1.18598435,  0.83631808],\n       [ 0.82181859, -0.18717298,  1.01581521,  0.83631808],\n       [-0.16559799, -1.09467835, -0.1186457 , -0.22427024],\n       [ 0.20468323, -2.00218372,  0.73219998,  0.43859746],\n       [ 1.06867274,  0.03970336,  0.56203085,  0.43859746],\n       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],\n       [ 0.57496445, -1.3215547 ,  0.73219998,  0.96889162],\n       [-1.39986872,  0.2665797 , -1.36655271, -1.28485856],\n       [ 0.20468323, -0.86780201,  0.78892303,  0.571171  ],\n       [-0.04217092, -1.09467835,  0.16496953,  0.04087684],\n       [ 1.31552689,  0.2665797 ,  1.12926131,  1.49918578],\n       [-1.77014994, -0.18717298, -1.36655271, -1.28485856],\n       [ 1.56238103, -0.18717298,  1.2427074 ,  1.2340387 ],\n       [ 1.19209981,  0.2665797 ,  1.2427074 ,  1.49918578],\n       [-0.78273335,  0.94720873, -1.25310662, -1.28485856],\n       [ 2.54979762,  1.62783776,  1.52632263,  1.10146516],\n       [ 0.69839152, -0.64092567,  1.07253826,  1.36661224],\n       [-0.28902506, -0.41404933, -0.06192266,  0.17345038],\n       [-0.41245214,  2.53534313, -1.30982967, -1.28485856],\n       [-1.27644165, -0.18717298, -1.30982967, -1.15228502],\n       [ 0.57496445, -0.41404933,  1.07253826,  0.83631808],\n       [-1.77014994,  
0.2665797 , -1.36655271, -1.28485856],\n       [-0.53587921,  1.8547141 , -1.13966053, -1.01971148],\n       [-1.0295875 ,  0.72033239, -1.19638358, -1.01971148],\n       [ 1.06867274, -0.18717298,  0.73219998,  0.70374454],\n       [-0.53587921,  1.8547141 , -1.36655271, -1.01971148],\n       [ 2.30294347, -0.64092567,  1.69649176,  1.10146516],\n       [-0.28902506, -0.86780201,  0.27841562,  0.17345038],\n       [ 1.19209981, -0.18717298,  1.01581521,  1.2340387 ],\n       [-0.41245214,  0.94720873, -1.36655271, -1.28485856],\n       [-1.27644165,  0.72033239, -1.02621444, -1.28485856],\n       [-0.53587921,  0.72033239, -1.13966053, -1.28485856],\n       [ 2.30294347,  1.62783776,  1.69649176,  1.36661224],\n       [ 1.31552689,  0.03970336,  0.95909217,  1.2340387 ],\n       [-0.28902506, -1.3215547 ,  0.10824648, -0.0916967 ],\n       [-0.90616043,  0.72033239, -1.25310662, -1.28485856],\n       [-0.90616043,  1.62783776, -1.19638358, -1.28485856],\n       [ 0.3281103 , -0.41404933,  0.56203085,  0.30602392],\n       [-0.04217092,  2.08159044, -1.42327576, -1.28485856],\n       [-1.0295875 , -2.45593641, -0.1186457 , -0.22427024],\n       [ 0.69839152,  0.2665797 ,  0.44858475,  0.43859746],\n       [ 0.3281103 , -0.18717298,  0.5053078 ,  0.30602392],\n       [ 0.08125616,  0.2665797 ,  0.61875389,  0.83631808],\n       [ 0.20468323, -2.00218372,  0.16496953, -0.22427024],\n       [ 1.93266225, -0.64092567,  1.35615349,  0.96889162]])"
161 |       ]
162 |      },
163 |      "execution_count": 12,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "# 归一化处理\n",
170 |     "X_train_normalization = standard_scaler.transform(X_train)\n",
171 |     "X_train_normalization"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 13,
177 |    "metadata": {},
178 |    "outputs": [
179 |     {
180 |      "data": {
181 |       "text/plain": [
182 |        "array([[-0.28902506, -0.18717298,  0.44858475,  0.43859746],\n       [-0.04217092, -0.64092567,  0.78892303,  1.63175932],\n       [-1.0295875 , -1.77530738, -0.2320918 , -0.22427024],\n       [-0.04217092, -0.86780201,  0.78892303,  0.96889162],\n       [-1.52329579,  0.03970336, -1.25310662, -1.28485856],\n       [-0.41245214, -1.3215547 ,  0.16496953,  0.17345038],\n       [-0.16559799, -0.64092567,  0.44858475,  0.17345038],\n       [ 0.82181859, -0.18717298,  0.84564608,  1.10146516],\n       [ 0.57496445, -1.77530738,  0.39186171,  0.17345038],\n       [-0.41245214, -1.09467835,  0.39186171,  0.04087684],\n       [ 1.06867274,  0.03970336,  0.39186171,  0.30602392],\n       [-1.64672287, -1.77530738, -1.36655271, -1.15228502],\n       [-1.27644165,  0.03970336, -1.19638358, -1.28485856],\n       [-0.53587921,  0.72033239, -1.25310662, -1.01971148],\n       [ 1.68580811,  1.17408507,  1.35615349,  1.76433286],\n       [-0.04217092, -0.86780201,  0.22169257, -0.22427024],\n       [-1.52329579,  1.17408507, -1.53672185, -1.28485856],\n       [ 1.68580811,  0.2665797 ,  1.29943044,  0.83631808],\n       [ 1.31552689,  0.03970336,  0.78892303,  1.49918578],\n       [ 0.69839152, -0.86780201,  0.90236912,  0.96889162],\n       [ 0.57496445,  0.49345605,  0.56203085,  0.571171  ],\n       [-1.0295875 ,  0.72033239, -1.25310662, -1.28485856],\n       [ 2.30294347, -1.09467835,  1.80993786,  1.49918578],\n       [-1.0295875 ,  0.49345605, -1.30982967, -1.28485856],\n       [ 0.45153738, -0.41404933,  0.33513866,  0.17345038],\n       [ 0.08125616, -0.18717298,  0.27841562,  0.43859746],\n       [-1.0295875 ,  0.2665797 , -1.42327576, -1.28485856],\n       [-0.41245214, -1.77530738,  0.16496953,  0.17345038],\n       [ 0.57496445,  0.49345605,  1.29943044,  1.76433286],\n       [ 2.30294347, -0.18717298,  1.35615349,  1.49918578]])"
183 |       ]
184 |      },
185 |      "execution_count": 13,
186 |      "metadata": {},
187 |      "output_type": "execute_result"
188 |     }
189 |    ],
190 |    "source": [
191 |     "# 对测试数据集的特征矩阵进行归一化处理\n",
192 |     "X_test_normalization = standard_scaler.transform(X_test)\n",
193 |     "X_test_normalization"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 14,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "from sklearn.neighbors.classification import KNeighborsClassifier"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 15,
208 |    "metadata": {},
209 |    "outputs": [
210 |     {
211 |      "data": {
212 |       "text/plain": [
213 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n           metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n           weights='uniform')"
214 |       ]
215 |      },
216 |      "execution_count": 15,
217 |      "metadata": {},
218 |      "output_type": "execute_result"
219 |     }
220 |    ],
221 |    "source": [
222 |     "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
223 |     "knn_clf.fit(X_train_normalization, y_train)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "#### 如果对训练数据集进行了归一化处理，则测试数据集也必须使用同一个（在训练数据集上 fit 得到的）scaler 进行归一化处理，否则结果会很差"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 18,
236 |    "metadata": {},
237 |    "outputs": [
238 |     {
239 |      "data": {
240 |       "text/plain": [
241 |        "0.33333333333333331"
242 |       ]
243 |      },
244 |      "execution_count": 18,
245 |      "metadata": {},
246 |      "output_type": "execute_result"
247 |     }
248 |    ],
249 |    "source": [
250 |     "knn_clf.score(X_test, y_test)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 19,
256 |    "metadata": {},
257 |    "outputs": [
258 |     {
259 |      "data": {
260 |       "text/plain": [
261 |        "1.0"
262 |       ]
263 |      },
264 |      "execution_count": 19,
265 |      "metadata": {},
266 |      "output_type": "execute_result"
267 |     }
268 |    ],
269 |    "source": [
270 |     "knn_clf.score(X_test_normalization, y_test)"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": []
279 |   }
280 |  ],
281 |  "metadata": {
282 |   "kernelspec": {
283 |    "display_name": "Python 2",
284 |    "language": "python",
285 |    "name": "python2"
286 |   },
287 |   "language_info": {
288 |    "codemirror_mode": {
289 |     "name": "ipython",
290 |     "version": 2
291 |    },
292 |    "file_extension": ".py",
293 |    "mimetype": "text/x-python",
294 |    "name": "python",
295 |    "nbconvert_exporter": "python",
296 |    "pygments_lexer": "ipython2",
297 |    "version": "2.7.6"
298 |   }
299 |  },
300 |  "nbformat": 4,
301 |  "nbformat_minor": 0
302 | }
303 | 


--------------------------------------------------------------------------------
/c1_knn/01_kNN_Basics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import matplotlib.pyplot as plt"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 6,
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "raw_data_X = np.random.random((10,2))"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 7,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "raw_data_X = raw_data_X *10"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 8,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "raw_data_y = np.array([0,0,0,0,0,1,1,1,1,1])"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 9,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "X_train = raw_data_X\n",
 49 |     "y_train = raw_data_y"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 10,
 55 |    "metadata": {},
 56 |    "outputs": [
 57 |     {
 58 |      "data": {
 59 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADdVJREFUeJzt3UGIpHeZx/HfrzOzaMWldUkja2JX5bBkEYfdSB2iAQ9pFxbNmD3sIVARXRbqsmgUQZQ6SA61eBBpT0IR1xV8iYcxsDseRBldloUlUDMJ28mMIKzpdnSyaVm2FeuQkTx7eKuTmdnu9Nu9/dZbT9X3A0N1//NO9UPBfPP2W+/7liNCAIA8VpoeAABwPIQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyZ+p40nvuuSc6nU4dTw0AC+ny5cu/joi1KtvWEu5Op6PxeFzHUwPAQrK9XXVbDpUAQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AycxNuIutQp3NjlaeWlFns6Niq2h6JACYS3MR7mKrUP9iX9t72wqFtve21b/YJ94ATkdRSJ2OtLJSPha52zIX4R5cGmhyc3Lb2uTmRINLg4YmArAwikLq96XtbSmifOz3U8d7LsK9s7dzrHUAqGwwkCa37xhqMinXk5qLcK+vrh9rHQAq2zlkB/Cw9QTmItzDjaFaZ1u3rbXOtjTcGDY0EYCFsX7IDuBh6wnMRbh753oanR+pvdqWZbVX2xqdH6l3rtf0aACyGw6l1u07hmq1yvWkHBGn/qTdbje4rSuAuVEU5THtnZ1yT3s4lHrztWNo+3JEdKtsOxd73LPCueLAkur1pJdfll5/vXycs2gfVy0fpDCP9s8V3z/tcP9ccUkckgGQytLscXOuOIBFsTTh5lxxAItiacLNueIAFsXShJtzxQEsiqUJN+eKA1gUnMcNAHOA87gBYIERbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQqhdv252y/ZPtF28/YflvdgwEADnZkuG3fK+kzkroR8X5Jd0l6vO7BAAAHq3qo5Iykt9s+I6kl6Vf1jQQAeCtHhjsifinpq5J2JN2QtBcRP6x7MADAwaocKnmXpMck3S/pPZLutv3EAdv1bY9tj3d3d09/UgCApGqHSj4i6ecRsRsRNyU9K+lDd24UEaOI6EZEd21t7bTnBABMVQn3jqSHbLdsW9KGpGv1jgUAOEyVY9zPSbog6YqkrenfGdU8FwDgEGeqbBQRX5b05ZpnAQBUwJWTAJAM4QaAZAg3ACRDuIFbFYXU6UgrK+VjUTQ9EfB/VHpzElgKRSH1+9JkUn6/vV1+L0m9XnNzAXdgjxvYNxi8Ge19k0m5DswRwg3s29k53jrQEMIN7FtfP9460BDCDewbDqVW6/a1VqtcB+YI4Qb29XrSaCS125JdPo5GvDGJucNZJcCtej1CjbnHHjcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAE6qoZuScTogAJxEgzclY48bAE6iwZuSEW4AOIkGb0pGuAHgJBq8KRnhBoCTaPCmZIQbAE6iwZuScVYJAJxUQzclY48bAJIh3ACQDOEGgGQIdyLFVqHOZkcrT62os9lRsTWby2sBzBfenEyi2CrUv9jX5GZ5pdb23rb6F8vLa3vn+MQWYJmwx53E4NLgjWjvm9ycaHCp/strAcwXwp3Ezt7Bl9Eetg5gcRHuJNZXD76M9rB1AIuLcCcx3Biqdfb2y2tbZ1sabtR/eS2A+UK4k+id62l0fqT2aluW1V5ta3R+xBuTwBJyRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZCqF2/Y7bV+w/VPb12x/sO7BAAAHq3oBztcl/SAi/tr2H0hqHfUXAAD1ODLctlclfV
jSpyQpIl6T9Fq9YwEADlPlUMn9knYlfcv287aftn33nRvZ7tse2x7v7u6e+qAAgFKVcJ+R9AFJ34iIByX9TtIX79woIkYR0Y2I7tra2imPCQDYVyXc1yVdj4jnpt9fUBlyAEADjgx3RLwi6Re2H5gubUi6WutUAIBDVT2r5NOSiukZJf8p6W/qGwkA8FYqhTsiXpBU6YoeAEC9uHISAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwY3kVhdTpSCsr5WNRND0RUEnVKyeBxVIUUr8vTSbl99vb5feS1ONzPDHf2OPGchoM3oz2vsmkXAfmHOHGctrZOd46MEcIN5bT+vrx1oE5QrixnIZDqXXHR6e2WuU6MOcIN5ZTryeNRlK7Ldnl42jEG5NIgbNKsLx6PUKNlNjjBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ABmptgq1NnsaOWpFXU2Oyq2uJXuSXABDoCZKLYK9S/2NblZ3pVxe29b/YvlrXR757gQ6jjY4wYwE4NLgzeivW9yc6LBJW6le1yEG8BM7OwdfMvcw9ZxOMINYCbWVw++Ze5h6zgc4QYwE8ONoVpnb7+VbutsS8MNbqV7XIQbwEz0zvU0Oj9Se7Uty2qvtjU6P+KNyRNwRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZAg3ACRDuAEgGcINAMkQbgBIpnK4bd9l+3nb369zIADAWzvOHveTkq7VNQgAoJpK4bZ9n6SPSXq63nEAAEepuse9KekLkl6vcRYAQAVHhtv2o5JejYjLR2zXtz22Pd7d3T21AQEAt6uyx/2wpI/bflnSdyU9Yvs7d24UEaOI6EZEd21t7ZTHBADsOzLcEfGliLgvIjqSHpf044h4ovbJAAAH4jxuAEjmWOGOiH+JiEfrGgaoTVFInY60slI+FnxILfLiw4Kx+IpC6velyfTzDre3y+8lqce9oJEPh0qw+AaDN6O9bzIp14GECDcW384hH0Z72Dow5wg3Ft/6IR9Ge9g6MOcINxbfcCi1bv+QWrVa5TqQEOHG4uv1pNFIarclu3wcjXhjEmlxVgmWQ69HqLEw2OMGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASObIcNt+r+2f2L5q+yXbT85iMADAwc5U2Ob3kj4fEVds/6Gky7Z/FBFXa54NAHCAI/e4I+JGRFyZfv1bSdck3Vv3YACAgx3rGLftjqQHJT1XxzAAgKNVDrftd0j6nqTPRsRvDvjvfdtj2+Pd3d3TnBEAcItK4bZ9VmW0i4h49qBtImIUEd2I6K6trZ3mjACAW1Q5q8SSvinpWkR8rf6RAABvpcoe98OSPiHpEdsvTP98tOa5AACHOPJ0wIj4N0mewSwAgAq4chIAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwJ1dsFepsdrTy1Io6mx0VW0XTIwGo2ZmmB8DJFVuF+hf7mtycSJK297bVv9iXJPXO9ZocDUCN2ONObHBp8Ea0901uTjS4NGhoIgCzQLgT29nbOdY6gMVAuBNbX10/1jqAxUC4ExtuDNU627ptrXW2peHGsKGJAMwC4U6sd66n0fmR2qttWVZ7ta3R+RFvTAILzhFx6k/a7XZjPB6f+vMCwKKyfTkiulW2ZY8bAJIh3ACQDOEGgGQINwAkQ7gBIJlaziqxvStp+9SfeH7cI+nXTQ8xB3gdSrwOvAb7/j+vQzsi1qpsWEu4F53tcdXTdhYZr0OJ14HXYN
+sXgcOlQBAMoQbAJIh3CczanqAOcHrUOJ14DXYN5PXgWPcAJAMe9wAkAzhrsj2e23/xPZV2y/ZfrLpmZpk+y7bz9v+ftOzNMX2O21fsP1T29dsf7DpmZpg+3PTfxMv2n7G9tuanmkWbP+D7Vdtv3jL2h/Z/pHtn00f31XHzybc1f1e0ucj4n2SHpL0d7bf1/BMTXpS0rWmh2jY1yX9ICL+VNKfaQlfD9v3SvqMpG5EvF/SXZIeb3aqmflHSX95x9oXJV2KiD+RdGn6/akj3BVFxI2IuDL9+rcq/5He2+xUzbB9n6SPSXq66VmaYntV0oclfVOSIuK1iPifZqdqzBlJb7d9RlJL0q8anmcmIuJfJf33HcuPSfr29OtvS/qrOn424T4B2x1JD0p6rtlJGrMp6QuSXm96kAbdL2lX0remh4yetn1300PNWkT8UtJXJe1IuiFpLyJ+2OxUjXp3RNyYfv2KpHfX8UMI9zHZfoek70n6bET8pul5Zs32o5JejYjLTc/SsDOSPiDpGxHxoKTfqaZfi+fZ9BjuYyr/R/YeSXfbfqLZqeZDlKfs1XLaHuE+BttnVUa7iIhnm56nIQ9L+rjtlyV9V9Ijtr/T7EiNuC7pekTs/9Z1QWXIl81HJP08InYj4qakZyV9qOGZmvRftv9YkqaPr9bxQwh3Rbat8njmtYj4WtPzNCUivhQR90VER+WbUD+OiKXbw4qIVyT9wvYD06UNSVcbHKkpO5Iest2a/hvZ0BK+SXuLf5b0yenXn5T0T3X8EMJd3cOSPqFyD/OF6Z+PNj0UGvVpSYXt/5D055L+vuF5Zm76G8cFSVckbalsylJcRWn7GUn/LukB29dt/62kr0j6C9s/U/nbyFdq+dlcOQkAubDHDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgmf8Fyb54/FyWOFoAAAAASUVORK5CYII=\n",
 60 |       "text/plain": [
 61 |        "<matplotlib.figure.Figure at 0x1063a9c88>"
 62 |       ]
 63 |      },
 64 |      "metadata": {},
 65 |      "output_type": "display_data"
 66 |     }
 67 |    ],
 68 |    "source": [
 69 |     "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n",
 70 |     "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n",
 71 |     "plt.show()"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 11,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "x = np.random.random(2)*10"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 12,
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "data": {
 90 |       "text/plain": [
 91 |        "array([ 8.67035706,  7.0760235 ])"
 92 |       ]
 93 |      },
 94 |      "execution_count": 12,
 95 |      "metadata": {},
 96 |      "output_type": "execute_result"
 97 |     }
 98 |    ],
 99 |    "source": [
100 |     "x"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 13,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "data": {
110 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADgBJREFUeJzt3VGIrHd9xvHn2ewWnVhWSxapiTuTi5IiHtrIXEQDXmQtFM0xvehFYCJaCnNTNIogylxILqZ4IbJeCUOsFXyJF8dAe7wQZbWUQgnMnoRuco4g1Mx69KRZKV3FucjK+fXinUnOnu6efed033nf/8z3A4fZ/ec9sz8GzjfvvvO+7zgiBABIx0rVAwAAZkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAErNaxpPed9990Wq1ynhqAFhIu7u7v46IjSLblhLuVqul4XBYxlMDwEKyPSq6LYdKACAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxtQl3tpeptd3SyjMram23lO1lVY8EALVUi3Bne5m6l7saHY4UCo0OR+pe7hJvAOcjy6RWS1pZyR+ztNtSi3D3dnoaH42PrY2Pxurt9CqaCMDCyDKp25VGIykif+x2k453LcK9f7g/0zoAFNbrSePjO4Yaj/P1RNUi3JvrmzOtA0Bh+6fsAJ62noBahLu/1VdjrXFsrbHWUH+rX9FEABbG5ik7gKetJ6AW4e5c6GhwcaDmelOW1VxvanBxoM6FTtWjAUhdvy81ju8YqtHI1xPliDj3J22328FtXQHURpblx7T39/M97X5f6tRrx9D2bkS0i2xbiz3ueeFccWBJdTrSq69KN2/mjzWL9qxK+SCFOpqeKz497XB6rrgkDskASMrS7HFzrjiARbE04eZccQCLYmnCzbniABbF0oSbc8UBLIqlCTfnigNYFJzHDQA1wHncALDACDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJKZQuG1/zvYrtl+2/Zztt5U9GADgZGeG2/b9kj4jqR0R75d0j6Qnyx4MAHCyoodKViW93faqpIakX5U3EgDgTs4Md0T8UtJXJe1LuiHpMCJ+WPZgAICTFTlU8i5JT0h6UNJ7JN1r+6kTtuvaHtoeHhwcnP+kAABJxQ6VfETSzyPiICKOJD0v6UO3bxQRg4hoR0R7Y2PjvOcEAEwUCfe+pEdsN2xb0paka+WOBQA4TZFj3C9IuiTpiqS9yd8ZlDwXAOAUq0U2iogvS/pyybMAAArgykkASAzhBoDEEG4ASAzhBm6VZVKrJa2s5I9ZVvVEwP9R6M1JYClkmdTtSuNx/v1olH8vSZ1OdXMBt2GPG5jq9d6K9tR4nK8DNUK4gan9/dnWgYoQbmBqc3O2daAihBuY6velRuP4WqORrwM1QriBqU5HGgykZlOy88fBgDcmUTucVQLcqtMh1Kg99rgBIDGEG0AtcS3U6ThUAqB2uBbqztjjBlA7XAt1Z4QbQO1wLdSdEW4AtZPMtVAVHYgn3ABqJ4lroaYH4kcjKeKtA/FziDfhBlA7SVwLVeGBeEfEuT9pu92O4XB47s8LALWxspLvad/Olm7enPnpbO9GRLvQj5752QEAlR6IJ9wAcDcqPBBPuAHgblR4IJ4rJwHgblV0UzL2uAEgMYQbABJDuAEgMYQ7IdleptZ2SyvPrKi13VK2x30ugWXEm5OJyPYydS93NT7Kr9QaHY7UvZzf57JzoU6XkwEoG3vciejt9N6M9tT4aKzeDve5BJYN4U7E/uHJ97M8bR3A4iLcidhcP/ky2tPWASwuwp2I/lZfjbXjl9c21hrqb9XpPpcA5oFwJ6JzoaPBxYGa601ZVnO9qcHFAW9MAkuI27oCQA1wW1cAWGCFwm37nbYv2f
6p7Wu2P1j2YACAkxW9AOfrkn4QEX9t+w8kNc76CwCAcpwZbtvrkj4s6VOSFBFvSHqj3LEAAKcpcqjkQUkHkr5l+0Xbz9q+9/aNbHdtD20PDw4Ozn1QAECuSLhXJX1A0jci4mFJv5P0xds3iohBRLQjor2xsXHOYwIApoqE+7qk6xHxwuT7S8pDDgCowJnhjojXJP3C9kOTpS1JV0udCgBwqqJnlXxaUjY5o+Q/Jf1NeSMBAO6kULgj4iVJha7oAQCUiysnASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBvLK8ukVktaWckfs6zqiYBCil45CSyWLJO6XWk8zr8fjfLvJanD53ii3tjjxnLq9d6K9tR4nK8DNUe4sZz292dbB2qEcGM5bW7Otg7UCOHGcur3pcZtH53aaOTrQM0RbiynTkcaDKRmU7Lzx8GANyaRBM4qwfLqdAg1ksQeNwAkhnADQGIINwAkhnADQGIINwAkhnADQGIINwAkhnADmJtsL1Nru6WVZ1bU2m4p2+NWuneDC3AAzEW2l6l7uavxUX5XxtHhSN3L+a10Oxe4EGoW7HEDmIveTu/NaE+Nj8bq7XAr3VkRbgBzsX948i1zT1vH6Qg3gLnYXD/5lrmnreN0hBvAXPS3+mqsHb+VbmOtof4Wt9KdFeEGMBedCx0NLg7UXG/KsprrTQ0uDnhj8i44Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMYXDbfse2y/a/n6ZAwEA7myWPe6nJV0raxAAQDGFwm37AUkfk/RsueMAAM5SdI97W9IXJN0scRYAQAFnhtv245Jej4jdM7br2h7aHh4cHJzbgACA44rscT8q6eO2X5X0XUmP2f7O7RtFxCAi2hHR3tjYOOcxAQBTZ4Y7Ir4UEQ9EREvSk5J+HBFPlT4ZAOBEnMcNAImZKdwR8S8R8XhZwwClyTKp1ZJWVvLHjA+pRbr4sGAsviyTul1pPPm8w9Eo/16SOtwLGunhUAkWX6/3VrSnxuN8HUgQ4cbi2z/lw2hPWwdqjnBj8W2e8mG0p60DNUe4sfj6falx/ENq1Wjk60CCCDcWX6cjDQZSsynZ+eNgwBuTSBZnlWA5dDqEGguDPW4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASMyZ4bb9Xts/sX3V9iu2n57HYACAk60W2Ob3kj4fEVds/6GkXds/ioirJc8GADjBmXvcEXEjIq5Mvv6tpGuS7i97MADAyWY6xm27JelhSS+UMQwA4GyFw237HZK+J+mzEfGbE/571/bQ9vDg4OA8ZwQA3KJQuG2vKY92FhHPn7RNRAwioh0R7Y2NjfOcEQBwiyJnlVjSNyVdi4ivlT8SAOBOiuxxPyrpE5Ies/3S5M9HS54LAHCKM08HjIh/k+Q5zAIAKIArJwEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQ7cdleptZ2SyvPrKi13VK2l1U9EoCSrVY9AO5etpepe7mr8dFYkjQ6HKl7uStJ6lzoVDkagBKxx52w3k7vzWhPjY/G6u30KpoIwDwQ7oTtH+7PtA5gMRDuhG2ub860DmAxEO6E9bf6aqw1jq011hrqb/UrmgjAPBDuhHUudDS4OFBzvSnLaq43Nbg44I1JYME5Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMaWcVWL7QNLo3J+4Pu
6T9Ouqh6gBXoccrwOvwdT/53VoRsRGkQ1LCfeisz0setrOIuN1yPE68BpMzet14FAJACSGcANAYgj33RlUPUBN8DrkeB14Dabm8jpwjBsAEsMeNwAkhnAXZPu9tn9i+6rtV2w/XfVMVbJ9j+0XbX+/6lmqYvudti/Z/qnta7Y/WPVMVbD9ucm/iZdtP2f7bVXPNA+2/8H267ZfvmXtj2z/yPbPJo/vKuNnE+7ifi/p8xHxPkmPSPo72++reKYqPS3pWtVDVOzrkn4QEX8q6c+0hK+H7fslfUZSOyLeL+keSU9WO9Xc/KOkv7xt7YuSdiLiTyTtTL4/d4S7oIi4ERFXJl//Vvk/0vurnaoath+Q9DFJz1Y9S1Vsr0v6sKRvSlJEvBER/1PtVJVZlfR226uSGpJ+VfE8cxER/yrpv29bfkLStydff1vSX5Xxswn3XbDdkvSwpBeqnaQy25K+IOlm1YNU6EFJB5K+NTlk9Kzte6seat4i4peSvippX9INSYcR8cNqp6rUuyPixuTr1yS9u4wfQrhnZPsdkr4n6bMR8Zuq55k3249Lej0idquepWKrkj4g6RsR8bCk36mkX4vrbHIM9wnl/yN7j6R7bT9V7VT1EPkpe6Wctke4Z2B7TXm0s4h4vup5KvKopI/bflXSdyU9Zvs71Y5UieuSrkfE9LeuS8pDvmw+IunnEXEQEUeSnpf0oYpnqtJ/2f5jSZo8vl7GDyHcBdm28uOZ1yLia1XPU5WI+FJEPBARLeVvQv04IpZuDysiXpP0C9sPTZa2JF2tcKSq7Et6xHZj8m9kS0v4Ju0t/lnSJydff1LSP5XxQwh3cY9K+oTyPcyXJn8+WvVQqNSnJWW2/0PSn0v6+4rnmbvJbxyXJF2RtKe8KUtxFaXt5yT9u6SHbF+3/beSviLpL2z/TPlvI18p5Wdz5SQApIU9bgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMT8Lwi/oYhwODxFAAAAAElFTkSuQmCC\n",
111 |       "text/plain": [
112 |        "<matplotlib.figure.Figure at 0x1063d8748>"
113 |       ]
114 |      },
115 |      "metadata": {},
116 |      "output_type": "display_data"
117 |     }
118 |    ],
119 |    "source": [
120 |     "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n",
121 |     "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n",
122 |     "plt.scatter(x[0], x[1], color='b')\n",
123 |     "plt.show()"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "## kNN的过程"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 14,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "[8.231110301142037,\n 2.467672103362692,\n 8.513701036889186,\n 7.405233631062757,\n 5.723981182780471,\n 4.7325594847194985,\n 3.026576262484981,\n 2.3280024672389885,\n 3.5998460221137596,\n 1.259337088954209]"
142 |       ]
143 |      },
144 |      "execution_count": 14,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "from math import sqrt\n",
151 |     "distances = []\n",
152 |     "for x_train in X_train:\n",
153 |     "    d = sqrt(np.sum((x-x_train)**2))\n",
154 |     "    distances.append(d)\n",
155 |     "distances"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 15,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "data": {
165 |       "text/plain": [
166 |        "array([9, 7, 1, 6, 8, 5, 4, 3, 0, 2])"
167 |       ]
168 |      },
169 |      "execution_count": 15,
170 |      "metadata": {},
171 |      "output_type": "execute_result"
172 |     }
173 |    ],
174 |    "source": [
175 |     "top_k = np.argsort(distances)\n",
176 |     "top_k"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 16,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "# 假设k取6\n",
186 |     "k = 6"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 17,
192 |    "metadata": {},
193 |    "outputs": [
194 |     {
195 |      "data": {
196 |       "text/plain": [
197 |        "[(1, 5)]"
198 |       ]
199 |      },
200 |      "execution_count": 17,
201 |      "metadata": {},
202 |      "output_type": "execute_result"
203 |     }
204 |    ],
205 |    "source": [
206 |     "from collections import Counter\n",
207 |     "votes = Counter(y_train[top_k[:k]])\n",
208 |     "\n",
209 |     "votes.most_common(1)    # 前边是标签，后边是个数"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 18,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "data": {
219 |       "text/plain": [
220 |        "1"
221 |       ]
222 |      },
223 |      "execution_count": 18,
224 |      "metadata": {},
225 |      "output_type": "execute_result"
226 |     }
227 |    ],
228 |    "source": [
229 |     "# 最终kNN的结果：\n",
230 |     "predict_y = votes.most_common(1)[0][0]\n",
231 |     "predict_y"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 19,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "%run c1_knn/kNN.py"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 21,
246 |    "metadata": {},
247 |    "outputs": [
248 |     {
249 |      "data": {
250 |       "text/plain": [
251 |        "1"
252 |       ]
253 |      },
254 |      "execution_count": 21,
255 |      "metadata": {},
256 |      "output_type": "execute_result"
257 |     }
258 |    ],
259 |    "source": [
260 |     "predict_y = kNN_classify(6, X_train, y_train, x)\n",
261 |     "predict_y"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": []
270 |   }
271 |  ],
272 |  "metadata": {
273 |   "kernelspec": {
274 |    "display_name": "Python 2",
275 |    "language": "python",
276 |    "name": "python2"
277 |   },
278 |   "language_info": {
279 |    "codemirror_mode": {
280 |     "name": "ipython",
281 |     "version": 2
282 |    },
283 |    "file_extension": ".py",
284 |    "mimetype": "text/x-python",
285 |    "name": "python",
286 |    "nbconvert_exporter": "python",
287 |    "pygments_lexer": "ipython2",
288 |    "version": "2.7.6"
289 |   }
290 |  },
291 |  "nbformat": 4,
292 |  "nbformat_minor": 0
293 | }
294 | 


--------------------------------------------------------------------------------