├── c1_knn
├── __init__.py
├── metrics.py
├── model_selection.py
├── preprocessing.py
├── kNN.py
├── 02_kNN_in_scikit_learn.ipynb
├── knn.md
├── 05_Hyper_Parameters.ipynb
├── 03_Train_Test_Split.ipynb
├── 04_Hyper_Parameter_kNN.ipynb
├── 08_Scaler_in_Scikit_Learn.ipynb
└── 01_kNN_Basics.ipynb
├── c4_pca
├── __init__.py
└── 07_MNIST.ipynb
├── c8_svm
└── __init__.py
├── playML
├── __init__.py
├── model_selection.py
├── plot_utils.py
├── PCA.py
├── logistic_regression.py
├── metrics.py
└── linear_regression.py
├── c2_linear_regression
├── __init__.py
├── linear_regression.py
├── simple_linear_regression.py
├── 08_Linear_Regression.ipynb
├── 10_More_About_Linear_Regression.ipynb
└── 09_Regression_in_scikit_learn.ipynb
├── c3_gradient_descent
├── __init__.py
├── 06_Stochastic_Gradient_Descent.ipynb
├── 08_Gradient_Debugging.ipynb
├── 05_Vectorize_Gradient_Descent.ipynb
└── 07_SGD_in_scikit_learn.ipynb
├── c6_logistic_regression
├── __init__.py
├── plot_utils.py
├── 01_Sigmoid.ipynb
└── 04_implement_logistic_regression.ipynb
├── c5_polynomial_regression
├── __init__.py
└── 06_Validation_and_Cross_Validation.ipynb
├── c7_classification_performance_measures
├── __init__.py
└── 03_implement_confusion_matrix_precision_and_recall.ipynb
├── README.md
├── c0_overview
└── 数据相关.pdf
├── LICENSE
└── .gitignore
/c1_knn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c4_pca/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c8_svm/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/playML/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c2_linear_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c3_gradient_descent/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c6_logistic_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c5_polynomial_regression/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/c7_classification_performance_measures/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MachineLearningClassicAlgorithm
2 | 慕课网《Python 3 入门机器学习经典算法与应用》的代码和笔记
3 |
--------------------------------------------------------------------------------
/c0_overview/数据相关.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sea-Monster/MachineLearningClassicAlgorithm/HEAD/c0_overview/数据相关.pdf
--------------------------------------------------------------------------------
/c1_knn/metrics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 度量
3 | import numpy as np
4 |
5 |
def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    :param y_true: array-like of ground-truth labels
    :param y_predict: array-like of predicted labels, same length as y_true
    :return: number of matching positions divided by len(y_true)
    """
    # Coerce to arrays so plain Python lists compare element-wise;
    # `list == list` would otherwise collapse to a single bool.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    assert y_true.shape[0] == y_predict.shape[0], 'the size of y_true must be equal to the size of y_predict'
    return np.sum(y_predict == y_true) / len(y_true)
9 |
--------------------------------------------------------------------------------
/c6_logistic_regression/plot_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
def plot_decision_boundary(model, axis):
    """Draw the filled decision regions of a classifier over a 2-D grid.

    :param model: object exposing ``predict(X)``; it is called once with an
        (n_points, 2) array of grid coordinates
    :param axis: sequence ``[x0_min, x0_max, x1_min, x1_max]`` giving the
        plotted range of the two features
    """
    from matplotlib.colors import ListedColormap

    # Sample 100 grid points per unit of length along each feature axis.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # `linewidth` is not a contourf option (it only produced an
    # unused-keyword warning), so it is no longer passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
20 |
--------------------------------------------------------------------------------
/c1_knn/model_selection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    :param X: feature array indexed by sample along its first axis
    :param y: label array with one entry per sample of X
    :param test_ratio: fraction of samples (0 to 1) placed in the test set
    :param seed: optional seed for numpy's global random generator;
        ``None`` leaves the generator state untouched
    :return: X_train, X_test, y_train, y_test
    """
    n_samples = np.shape(X)[0]
    assert n_samples == np.shape(y)[0], 'the size of X must be equal to the size of y'
    assert 0.0 <= test_ratio <= 1.0, 'test_ratio must be valid'

    # Compare against None: 0 is a legitimate seed but is falsy.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(n_samples)

    # The first `test_size` shuffled indexes form the test set.
    test_size = int(n_samples * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
30 |
--------------------------------------------------------------------------------
/playML/model_selection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    :param X: feature array indexed by sample along its first axis
    :param y: label array with one entry per sample of X
    :param test_ratio: fraction of samples (0 to 1) placed in the test set
    :param seed: optional seed for numpy's global random generator;
        ``None`` leaves the generator state untouched
    :return: X_train, X_test, y_train, y_test
    """
    n_samples = np.shape(X)[0]
    assert n_samples == np.shape(y)[0], 'the size of X must be equal to the size of y'
    assert 0.0 <= test_ratio <= 1.0, 'test_ratio must be valid'

    # Compare against None: 0 is a legitimate seed but is falsy.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(n_samples)

    # The first `test_size` shuffled indexes form the test set.
    test_size = int(n_samples * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Sea-Monster
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/c2_linear_regression/linear_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from playML.metrics import r2_score
4 |
5 |
class LinearRegression(object):
    """Multivariate linear regression fitted in closed form (least squares)."""

    def __init__(self):
        self.coef_ = None  # feature coefficients, theta[1:]
        self.interception_ = None  # intercept, theta[0]

        # Intercept and coefficients stored as one vector for convenience.
        self._theta = None

    def fit_normal(self, X_train: np.ndarray, y_train: np.ndarray):
        """Fit the model by solving the least-squares problem directly.

        :param X_train: 2-D feature matrix, one row per sample
        :param y_train: target vector with one entry per row of X_train
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'

        # Prepend a column of ones so theta[0] acts as the intercept.
        X_b = np.hstack([np.ones(shape=(X_train.shape[0], 1)), X_train])

        # The pseudo-inverse gives the normal-equation solution when X_b has
        # full column rank, and still yields a least-squares solution when
        # features are collinear (where inv(X_b.T @ X_b) would fail).
        self._theta = np.linalg.pinv(X_b).dot(y_train)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict: np.ndarray):
        """Return the predicted targets for each row of X_predict.

        :param X_predict: 2-D feature matrix with the same number of
            columns as the training data
        :return: vector of predictions
        """
        assert self.interception_ is not None and self.coef_ is not None, '评估前必须拟合'
        assert np.shape(X_predict)[1] == len(self.coef_), '要预测的样本的特征数必须与训练的样本的特征数相等'

        X_b = np.hstack([np.ones(shape=(X_predict.shape[0], 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return the R^2 score of the predictions on X_test against y_test."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return 'LinearRegression()'
--------------------------------------------------------------------------------
/playML/plot_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
def plot_decision_boundary(model, axis):
    """Draw the filled decision regions of a classifier over a 2-D grid.

    :param model: object exposing ``predict(X)``; it is called once with an
        (n_points, 2) array of grid coordinates
    :param axis: sequence ``[x0_min, x0_max, x1_min, x1_max]`` giving the
        plotted range of the two features
    """
    from matplotlib.colors import ListedColormap

    # Sample 100 grid points per unit of length along each feature axis.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # `linewidth` is not a contourf option (it only produced an
    # unused-keyword warning), so it is no longer passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
20 |
def plot_svc_decision_boundary(model, axis):
    """Plot a linear classifier's decision regions plus its two margin lines.

    :param model: fitted model exposing ``predict``, ``coef_`` and
        ``intercept_``; only the first row of ``coef_`` and the first entry
        of ``intercept_`` are used
    :param axis: sequence ``[x0_min, x0_max, x1_min, x1_max]``
    """
    plot_decision_boundary(model, axis)

    w = model.coef_[0]
    b = model.intercept_[0]
    y_low, y_high = axis[2], axis[3]

    # Decision boundary: w0 * x0 + w1 * x1 + b = 0
    #   ->  x1 = -w0 * x0 / w1 - b / w1
    # The margins are the same line with the right-hand side set to +1 / -1.
    plot_x = np.linspace(axis[0], axis[1], 200)
    up_y = -w[0]/w[1]*plot_x - b/w[1] + 1/w[1]
    down_y = -w[0]/w[1]*plot_x - b/w[1] - 1/w[1]

    # Draw only the part of each margin that lies inside the vertical range.
    for margin_y in (up_y, down_y):
        inside = (margin_y >= y_low) & (margin_y <= y_high)
        plt.plot(plot_x[inside], margin_y[inside], color='black')
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/c1_knn/preprocessing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
class StandardScaler(object):
    """Zero-mean, unit-variance feature scaler modelled on scikit-learn's."""

    def __init__(self):
        self.mean_ = None  # per-column mean learned by fit
        self.scale_ = None  # per-column standard deviation learned by fit

    def fit(self, X: np.ndarray):
        """Learn the per-column mean and standard deviation of X.

        Only 2-D input is supported.

        :param X: 2-D training matrix, one row per sample
        :return: self
        """
        assert X.ndim == 2, 'The dimension of X must be 2'

        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0)
        # A constant column has zero deviation; dividing by it would yield
        # NaN/inf, so such columns are scaled by 1 (they become all zeros).
        scale[scale == 0.0] = 1.0
        self.scale_ = scale

        return self

    def transform(self, X):
        """Standardise X with the statistics learned by fit (vectorised).

        :param X: 2-D matrix with the same number of columns as the fit data
        :return: new array ``(X - mean_) / scale_``
        """
        assert X.ndim == 2, 'The dimension of X must be 2'
        assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
        X_standard = (X - self.mean_) / self.scale_
        return X_standard

    def transform_standard(self, X):
        """Standardise X column by column (explicit-loop teaching version).

        :param X: 2-D matrix with the same number of columns as the fit data
        :return: new float array with each column standardised
        """
        assert X.ndim == 2, 'The dimension of X must be 2'
        assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
        X_res = np.empty(shape=X.shape, dtype=float)
        for col in range(X.shape[1]):
            X_res[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col]
        return X_res
50 |
if __name__ == '__main__':
    # Demo: fit the scaler on a training split of the iris data, then print
    # the loop-based and the vectorised transforms of the full data set.
    from sklearn import datasets
    # Import from the public package path rather than the private
    # `_split` module, which is an implementation detail of scikit-learn.
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

    ss = StandardScaler()
    ss.fit(X_train)

    X_standard = ss.transform(X)
    print(ss.transform_standard(X))
    print('-' * 100)
    print(X_standard)
67 |
--------------------------------------------------------------------------------
/playML/PCA.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
class PCA(object):
    """Principal component analysis computed by gradient ascent."""

    def __init__(self, n_components):
        """Store how many principal components fit should extract."""
        assert n_components >= 1, "n_components必须大于等于1"
        self.n_components = n_components
        self.components_ = None  # (n_components, n_features) after fit

    def fit(self, X, eta=0.01, n_iters=1e4):
        """Find the first ``n_components`` principal components of X.

        :param X: 2-D data matrix, one row per sample
        :param eta: gradient-ascent step size
        :param n_iters: maximum number of ascent iterations per component
        :return: self
        """
        assert self.n_components <= np.shape(X)[1], 'n_components must not be greater than the feature number of X'

        def demean(X):
            return X - np.mean(X, axis=0)

        def f(w, X):
            """Objective: mean squared projection of X onto w."""
            return np.sum((X.dot(w) ** 2))/len(X)

        def derivative_f(w, X):
            """Gradient of the objective with respect to w."""
            return X.T.dot(X.dot(w))*2./len(X)

        def direction(w):
            """Scale w to unit length."""
            return w/np.linalg.norm(w)

        def first_component(X, initial_w, eta=0.01, n_iters=1e4, epsilon=1e-8):
            """Ascend from initial_w until the objective changes by < epsilon."""
            w = direction(initial_w)
            cur_iter = 0

            while cur_iter < n_iters:
                gradient = derivative_f(w, X)
                last_w = w
                w = w + eta * gradient
                # Re-normalise every step so w stays a unit direction.
                w = direction(w)
                if (abs(f(w, X) - f(last_w, X)) < epsilon):
                    break
                cur_iter += 1
            return w

        X_pca = demean(X)
        self.components_ = np.empty(shape=(self.n_components, np.shape(X)[1]))
        for i in range(self.n_components):
            initial_w = np.random.random(X_pca.shape[1])
            w = first_component(X_pca, initial_w, eta, n_iters)
            self.components_[i, :] = w
            # Remove the part of the data lying along w so the next pass
            # finds the next component.
            X_pca = X_pca - X_pca.dot(w).reshape(-1, 1)*w
        return self

    def transform(self, X):
        """Project X onto the fitted principal components."""
        # Without this check an unfitted model fails with an opaque
        # IndexError from np.shape(None).
        assert self.components_ is not None, 'must fit before transform'
        assert np.shape(X)[1] == np.shape(self.components_)[1]
        return X.dot(self.components_.T)

    def inverse_transform(self, X):
        """Map component-space X back into the original feature space."""
        assert self.components_ is not None, 'must fit before inverse_transform'
        assert np.shape(X)[1] == np.shape(self.components_)[0]
        return X.dot(self.components_)

    def __repr__(self):
        return 'PCA(n_components=%d)' % self.n_components
70 |
--------------------------------------------------------------------------------
/playML/logistic_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from .metrics import accuracy_score
4 |
5 |
class LogisticRegression(object):
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self):
        """Create an unfitted model; learned attributes start as None."""
        self.coef_ = None  # feature weights, theta[1:]
        self.intercept_ = None  # bias term, theta[0]
        self._theta = None  # full parameter vector [intercept, *coef]

    def _sigmoid(self, t):
        """Apply the logistic function 1 / (1 + exp(-t)) element-wise."""
        return 1. / (1. + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the model on X_train / y_train with gradient descent.

        :param X_train: 2-D feature matrix, one row per sample
        :param y_train: label vector with one entry per row of X_train
        :param eta: learning rate
        :param n_iters: maximum number of gradient-descent iterations
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], '训练集与结果集的样本数必须一致'

        def J(theta, X_b, y):
            """Cross-entropy loss of theta on (X_b, y)."""
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return np.sum(np.dot(y, np.log(y_hat)) + np.dot((1 - y), np.log(1 - y_hat))) / -len(y)
            except Exception:
                # Not a bare `except:`, so Ctrl-C / SystemExit during a long
                # training run still propagate instead of becoming `inf`.
                return float('inf')

        def derivative_J(theta, X_b, y):
            """Gradient of the loss with respect to theta."""
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            """Descend from initial_theta until the loss changes by < epsilon."""
            theta = initial_theta
            iters = 0
            while iters < n_iters:
                gradient = derivative_J(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient

                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                iters += 1
            return theta

        # Prepend a column of ones so theta[0] acts as the intercept.
        X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
        initial_theta = np.zeros(X_b.shape[1])  # start from the zero vector
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict_proba(self, X_predict):
        """Return the sigmoid of the linear score for each row of X_predict."""
        # Fail with a clear message instead of an opaque TypeError from
        # multiplying by a None parameter vector.
        assert self._theta is not None, 'must fit before predict'
        X_b = np.hstack([np.ones(shape=(X_predict.shape[0], 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return 0/1 labels by thresholding predict_proba at 0.5."""
        proba = self.predict_proba(X_predict)
        return np.array(proba >= .5, dtype=int)  # True/False -> 1/0

    def score(self, X_test, y_test):
        """Return the accuracy of the predictions on X_test against y_test."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return 'LogisticRegression()'
--------------------------------------------------------------------------------
/c1_knn/kNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from math import sqrt
4 | from collections import Counter
5 |
6 |
def kNN_classify(k: int, X_train: np.ndarray, y_train: np.ndarray, x: np.ndarray):
    """Classify one sample by majority vote among its k nearest neighbours.

    :param k: number of neighbours that vote
    :param X_train: training feature matrix, one row per sample
    :param y_train: training label vector
    :param x: feature vector of the sample to classify
    :return: the most common label among the k closest training samples
    """
    n_samples = X_train.shape[0]
    assert 1 <= k <= n_samples, "k must be valid"
    assert n_samples == y_train.shape[0], "训练集中,特征向量的记录数与标记的记录数目必须一致"
    assert X_train.shape[1] == x.shape[0], '需要预测的x的特征数目必须等于训练集中的特征数目'

    # Euclidean distance from x to every training row.
    distances = []
    for row in X_train:
        distances.append(sqrt(np.sum((row - x)**2)))

    # Labels of the k training rows closest to x, nearest first.
    closest = np.argsort(distances)[:k]
    votes = Counter([y_train[idx] for idx in closest])

    winner, _count = votes.most_common(1)[0]
    return winner
31 |
32 |
class KNNClassifier(object):
    """k-nearest-neighbours classifier with a scikit-learn style interface."""

    def __init__(self, k):
        """Create a classifier that votes among the k nearest neighbours.

        :param k: number of neighbours that vote; must be at least 1
        """
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training data (kNN has no separate training step).

        :param X_train: training feature matrix, one row per sample
        :param y_train: training labels, one per row of X_train
        :return: self
        """
        assert np.shape(X_train)[0] == np.shape(y_train)[0], \
            "the size of X_train must be equal to the size of y_train"
        # Same bound kNN_classify enforces: cannot vote among more
        # neighbours than there are training samples.
        assert self.k <= np.shape(X_train)[0], "k must be valid"

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Return an array with the predicted label of each row of X_predict.

        :param X_predict: iterable of feature vectors to classify
        :return: numpy array of predicted labels
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single feature vector x."""
        # Euclidean distance from x to every stored training row.
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]

        nearest = np.argsort(distances)

        # Labels of the k training rows closest to x.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]

        votes = Counter(topK_y)

        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return the fraction of X_test rows whose prediction equals y_test."""
        y_predict = self.predict(X_test)
        return np.sum(y_predict == y_test) / len(y_test)

    def __repr__(self):
        return 'kNN(k=%d)' % self.k
--------------------------------------------------------------------------------
/playML/metrics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from math import sqrt
4 |
5 |
def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    :param y_true: array-like of ground-truth labels
    :param y_predict: array-like of predicted labels, same length as y_true
    :return: number of matching positions divided by len(y_true)
    """
    # Coerce to arrays so plain lists are accepted too; they have no
    # `.shape` and do not compare element-wise.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    assert y_true.shape[0] == y_predict.shape[0], 'the size of y_true must be equal to the size of y_predict'

    return np.sum(y_true == y_predict) / len(y_true)
16 |
17 |
def mean_squared_error(y_true, y_predict):
    """Return the mean squared error between y_true and y_predict.

    :param y_true: array of ground-truth values
    :param y_predict: array of predicted values, same length as y_true
    :return: sum of squared differences divided by len(y_true)
    """
    n_samples = len(y_true)
    assert n_samples == len(y_predict), 'the size of y_true must be equal to the size of y_predict'

    residuals = y_true - y_predict
    return np.sum(residuals ** 2) / n_samples
28 |
29 |
def root_mean_squared_error(y_true, y_predict):
    """Return the square root of mean_squared_error(y_true, y_predict)."""
    mse = mean_squared_error(y_true, y_predict)
    return sqrt(mse)
32 |
33 |
def mean_absolute_error(y_true, y_predict):
    """Return the mean absolute error between y_true and y_predict.

    :param y_true: array of ground-truth values
    :param y_predict: array of predicted values, same length as y_true
    :return: sum of absolute differences divided by len(y_true)
    """
    n_samples = len(y_true)
    assert n_samples == len(y_predict), 'the size of y_true must be equal to the size of y_predict'

    abs_errors = np.absolute(y_true - y_predict)
    return np.sum(abs_errors) / n_samples
38 |
39 |
def r2_score(y_true, y_predict):
    """Return the R^2 (R squared) score of the predictions.

    :param y_true: array of ground-truth values
    :param y_predict: array of predicted values
    :return: 1 minus the MSE divided by the variance of y_true
    """
    mse = mean_squared_error(y_true, y_predict)
    variance = np.var(y_true)
    return 1 - mse / variance
48 |
49 |
def TN(y_true, y_predict):
    """Count true negatives: positions where both arrays equal 0."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 0) & (y_predict == 0))


def FP(y_true, y_predict):
    """Count false positives: y_true equals 0 while y_predict equals 1."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 0) & (y_predict == 1))


def FN(y_true, y_predict):
    """Count false negatives: y_true equals 1 while y_predict equals 0."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 1) & (y_predict == 0))


def TP(y_true, y_predict):
    """Count true positives: positions where both arrays equal 1."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 1) & (y_predict == 1))


def confusion_matrix(y_true, y_predict):
    """Return the 2x2 confusion matrix ``[[TN, FP], [FN, TP]]``."""
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])


def precision_score(y_true, y_predict):
    """Return precision TP / (TP + FP), or 0.0 when nothing was predicted 1."""
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    # The counts are numpy integers: dividing by zero yields NaN plus a
    # warning rather than raising, so the zero case is tested explicitly.
    if tp + fp == 0:
        return 0.0
    return tp / (tp + fp)


def recall_score(y_true, y_predict):
    """Return recall TP / (TP + FN), or 0.0 when there are no true 1 labels."""
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    if tp + fn == 0:
        return 0.0
    return tp / (tp + fn)


def f1_score(y_true, y_predict):
    """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
    precision = precision_score(y_true, y_predict)
    recall = recall_score(y_true, y_predict)

    if precision + recall == 0:
        return 0.
    return 2.0 * precision * recall / (precision + recall)


def TPR(y_true, y_predict):
    """Return the true positive rate, which is the recall."""
    return recall_score(y_true, y_predict)


def FPR(y_true, y_predict):
    """Return FP / (FP + TN), or 0.0 when there are no true 0 labels."""
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    if fp + tn == 0:
        return 0.
    return fp / (fp + tn)
120 |
--------------------------------------------------------------------------------
/c2_linear_regression/simple_linear_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from playML.metrics import r2_score
4 |
class SimpleLinearRegression1(object):
    """Single-feature least-squares regression using explicit Python loops."""

    def __init__(self):
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        """Fit slope ``a_`` and intercept ``b_`` to the training data.

        :param x_train: 1-D array of feature values
        :param y_train: target values, same length as x_train
        :return: self
        """
        assert x_train.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
        assert len(x_train) == len(y_train), 'the size of x_train must be equal to the size of y_train'

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Slope = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2),
        # accumulated element by element.
        numerator = sum(
            ((x_i - x_mean)*(y_i - y_mean) for x_i, y_i in zip(x_train, y_train)),
            0.0,
        )
        denominator = sum(
            ((x_i - x_mean)**2 for x_i, _ in zip(x_train, y_train)),
            0.0,
        )

        slope = numerator / denominator
        self.a_ = slope
        self.b_ = y_mean - slope * x_mean

        return self

    def predict(self, x_predict):
        """Return an array with the prediction for each value of x_predict."""
        assert x_predict.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
        assert self.a_ is not None and self.b_ is not None, 'must fit before predict'
        predictions = []
        for x in x_predict:
            predictions.append(self._predict(x))
        return np.array(predictions)

    def _predict(self, x_single):
        """Return ``a_ * x_single + b_`` for one input value."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return 'SimpleLinearRegression1()'
51 |
52 |
class SimpleLinearRegression2(object):
    """Single-feature least-squares regression using vectorised numpy maths."""

    def __init__(self):
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        """Fit slope ``a_`` and intercept ``b_`` to the training data.

        :param x_train: 1-D array of feature values
        :param y_train: target values, same length as x_train
        :return: self
        """
        assert x_train.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
        assert len(x_train) == len(y_train), 'the size of x_train must be equal to the size of y_train'

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Dot products replace the element-wise accumulation loops.
        numerator = (x_train - x_mean).dot(y_train - y_mean)
        denominator = (x_train - x_mean).dot(x_train - x_mean)

        self.a_ = numerator / denominator
        self.b_ = y_mean - self.a_ * x_mean

        return self

    def predict(self, x_predict):
        """Return an array with the prediction for each value of x_predict."""
        assert x_predict.ndim == 1, 'Simple Linear Regressor can only solve single feature training data.'
        assert self.a_ is not None and self.b_ is not None, 'must fit before predict'
        # Evaluate the line for all inputs at once instead of looping in
        # Python; the per-element arithmetic is unchanged.
        return self.a_ * x_predict + self.b_

    def _predict(self, x_single):
        """Return ``a_ * x_single + b_`` for one input value."""
        return self.a_ * x_single + self.b_

    def score(self, x_test, y_test):
        """Return the R^2 score of the predictions on x_test against y_test.

        :param x_test: 1-D array of feature values
        :param y_test: true target values for x_test
        :return: value returned by r2_score(y_test, predictions)
        """
        y_predict = self.predict(x_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return 'SimpleLinearRegression2()'
105 |
106 |
class SimpleLinearRegression(SimpleLinearRegression2):
    """Alias of SimpleLinearRegression2: inherits everything, adds nothing."""
    pass
--------------------------------------------------------------------------------
/c1_knn/02_kNN_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# Scikit-Learn中的kNN"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn.neighbors import KNeighborsClassifier"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "kNN_classifier = KNeighborsClassifier(n_neighbors=6)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "raw_data_X = np.random.random((10,2))\n",
38 | "X_train = raw_data_X * 10\n",
39 | "y_train = np.array([0,0,0,0,0,1,1,1,1,1])"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=6, p=2,\n weights='uniform')"
51 | ]
52 | },
53 | "execution_count": 4,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "# 训练/拟合\n",
60 | "kNN_classifier.fit(X_train, y_train)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 5,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "x = np.random.random((1,2))*10"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 6,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/plain": [
80 | "array([0])"
81 | ]
82 | },
83 | "execution_count": 6,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "kNN_classifier.predict(x)"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## 重新整理我们的kNN代码"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 7,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "%run c1_knn/kNN.py\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 8,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "knn_clf = KNNClassifier(k=6)\n",
115 | "knn_clf.fit(X_train, y_train)\n",
116 | "y_predict = knn_clf.predict(x)\n"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 9,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/plain": [
127 | "array([1])"
128 | ]
129 | },
130 | "execution_count": 9,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "y_predict"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": []
145 | }
146 | ],
147 | "metadata": {
148 | "kernelspec": {
149 | "display_name": "Python 2",
150 | "language": "python",
151 | "name": "python2"
152 | },
153 | "language_info": {
154 | "codemirror_mode": {
155 | "name": "ipython",
156 | "version": 2
157 | },
158 | "file_extension": ".py",
159 | "mimetype": "text/x-python",
160 | "name": "python",
161 | "nbconvert_exporter": "python",
162 | "pygments_lexer": "ipython2",
163 | "version": "2.7.6"
164 | }
165 | },
166 | "nbformat": 4,
167 | "nbformat_minor": 0
168 | }
169 |
--------------------------------------------------------------------------------
/playML/linear_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from playML.metrics import r2_score
4 | from c2_linear_regression import linear_regression
5 |
class LinearRegression(linear_regression.LinearRegression):
    """Linear regression with gradient-descent based solvers.

    Extends ``c2_linear_regression.linear_regression.LinearRegression`` with
    a batch gradient descent solver (``fit_gd``) and a stochastic gradient
    descent solver (``fit_sgd``).  Both store the learned parameters in
    ``_theta`` (full vector), ``interception_`` (first entry) and ``coef_``
    (remaining entries).
    """

    def fit_gd(self, X_train: np.ndarray, y_train: np.ndarray, eta=0.01, n_iters=1e4):
        """Train the model on X_train / y_train with batch gradient descent.

        :param X_train: 2-D feature matrix, one row per training sample
        :param y_train: label vector, one entry per row of ``X_train``
        :param eta: learning rate (step size of every gradient step)
        :param n_iters: maximum number of gradient steps
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'

        def J(theta, X_b, y):
            """Return the mean squared error of ``theta`` on ``X_b`` / ``y``.

            :param theta: parameter vector (intercept first)
            :param X_b: feature matrix with a leading column of ones
            :param y: label vector
            :return: the loss value, or ``inf`` if the computation overflows
            """
            # The numerator is equivalent to
            # (y - X_b.dot(theta)).T.dot(y - X_b.dot(theta)).
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
            except (OverflowError, FloatingPointError):
                # Only numeric overflow is treated as "infinitely bad";
                # any other error (e.g. a shape mismatch) should surface.
                return float('inf')

        def derivative_J(theta: np.ndarray, X_b: np.ndarray, y: np.ndarray):
            """Return the gradient of the loss ``J`` at ``theta``.

            Vectorised form of the per-component formula: component ``i`` is
            ``(X_b.dot(theta) - y).dot(X_b[:, i]) * 2 / len(X_b)``.

            :param theta: parameter vector (intercept first)
            :param X_b: feature matrix with a leading column of ones
            :param y: label vector
            :return: gradient vector with the same length as ``theta``
            """
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters, epsilon=1e-8):
            """Run at most ``n_iters`` gradient steps starting at ``initial_theta``.

            Stops early as soon as one step changes the loss by less than
            ``epsilon``.
            """
            theta = initial_theta
            # Remember the loss of the previous theta so that every
            # iteration evaluates J only once.
            last_cost = J(theta, X_b, y)
            iters = 0
            while iters < n_iters:
                theta = theta - eta * derivative_J(theta, X_b, y)
                cost = J(theta, X_b, y)
                if abs(cost - last_cost) < epsilon:
                    break
                last_cost = cost
                iters += 1
            return theta

        # Prepend a column of ones so that theta[0] acts as the intercept.
        X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
        initial_theta = np.zeros(X_b.shape[1])  # start from the all-zero vector
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def fit_sgd(self, X_train, y_train, n_iters=1e4, t0=5, t1=50):
        """Train the model with stochastic gradient descent.

        :param X_train: 2-D feature matrix, one row per training sample
        :param y_train: label vector, one entry per row of ``X_train``
        :param n_iters: number of passes over the whole training set; a float
            such as the default ``1e4`` is truncated to an integer
        :param t0: numerator of the learning-rate schedule ``t0 / (t + t1)``
        :param t1: offset of the learning-rate schedule ``t0 / (t + t1)``
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记'
        assert n_iters >=1 , '所有训练样本至少要被随机一次'

        # range() only accepts integers, while the default value is the
        # float 1e4 -- convert once up front.
        n_epochs = int(n_iters)

        def derivative_J_sgd(theta: np.ndarray, X_b_i: np.ndarray, y_i):
            """Return the search direction derived from one single sample."""
            return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
            """Run ``n_iters`` shuffled passes over the data and return theta."""

            def learning_rate(t):
                # Decays as the global step counter t grows.
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)  # number of samples
            for cur_iter in range(n_iters):
                # Visit every sample exactly once per pass, in random order.
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = derivative_J_sgd(theta, X_b_new[i], y_new[i])
                    # Move against the search direction by the current rate.
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
            return theta

        # Prepend a column of ones so that theta[0] acts as the intercept.
        X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters=n_epochs, t0=t0, t1=t1)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        # Return self like fit_gd does, so calls can be chained.
        return self
--------------------------------------------------------------------------------
/c2_linear_regression/08_Linear_Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 实现多元线性回归模型"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "from sklearn import datasets"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "boston = datasets.load_boston()\n",
30 | "\n",
31 | "X = boston.data\n",
32 | "y = boston.target\n",
33 | "\n",
34 | "X = X[y<50.0]\n",
35 | "y = y[y<50.0]"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "(490, 13)"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "X.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from playML import model_selection\n",
65 | "X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,seed=666)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 5,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "LinearRegression()"
77 | ]
78 | },
79 | "execution_count": 5,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "from c2_linear_regression.linear_regression import LinearRegression\n",
86 | "\n",
87 | "reg = LinearRegression()\n",
88 | "\n",
89 | "reg.fit_normal(X_train, y_train)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 6,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "data": {
99 | "text/plain": [
100 | "array([ -1.18919477e-01, 3.63991462e-02, -3.56494193e-02,\n 5.66737830e-02, -1.16195486e+01, 3.42022185e+00,\n -2.31470282e-02, -1.19509560e+00, 2.59339091e-01,\n -1.40112724e-02, -8.36521175e-01, 7.92283639e-03,\n -3.81966137e-01])"
101 | ]
102 | },
103 | "execution_count": 6,
104 | "metadata": {},
105 | "output_type": "execute_result"
106 | }
107 | ],
108 | "source": [
109 | "reg.coef_"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 7,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/plain": [
120 | "34.161435496212974"
121 | ]
122 | },
123 | "execution_count": 7,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "reg.interception_"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 8,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "text/plain": [
140 | "0.81298026026586467"
141 | ]
142 | },
143 | "execution_count": 8,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "reg.score(X_test, y_test)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": []
158 | }
159 | ],
160 | "metadata": {
161 | "kernelspec": {
162 | "display_name": "Python 2",
163 | "language": "python",
164 | "name": "python2"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 2
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython2",
176 | "version": "2.7.6"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 0
181 | }
182 |
--------------------------------------------------------------------------------
/c1_knn/knn.md:
--------------------------------------------------------------------------------
1 | # kNN算法
2 | - 属于监督学习
3 | - 非参数学习
4 | - 是解决分类问题的算法,天然可解决多分类问题
5 | - kNN没有模型,可以说是一个(也许也是唯一一个)不需要训练过程的算法
6 | - 为了和其他算法统一,可以认为训练数据集就是模型本身
7 |
8 | ## 本质
9 | 两个(或几个)样本如果足够相似,那么它们就有极高的概率属于同一个类别。所谓“相似”,就是样本在特征空间中的距离相近。
10 | ## 优点
11 | - 思想极其简单
12 | - 可以解释机器学习算法使用过程中的很多细节问题
13 | - 更完整的刻画机器学习应用的流程
14 | - 应用数学知识少(近乎为零)
15 | - 效果好
16 | - 天然适合解决多分类问题,同时也适合解决回归问题
17 |
18 | ## 缺点
19 | - 最大的缺点:效率低下
20 | 如果训练集有m个样本,n个特征,则预测每一个新的数据,需要O(m*n)
21 | - 优化,使用树结构:KD-Tree, Ball-Tree
22 | - 即便如此,依然效率低下
23 | - 高度数据相关,而且对outlier更敏感
24 | - 预测结果不具有可解释性
25 | 只知道属于哪个类别,但是无法解释为什么属于某个类别
26 | - 维数灾难
27 | - 随着维度的增加,“看似相似”的两个点之间的距离越来越大
28 | - 解决方法:降维,例如PCA
29 |
30 | ## kNN的过程
31 | ### 计算特征空间中的距离
32 | #### 欧氏距离(欧几里得距离,最为常见)
33 | - 平面距离:
34 | $\sqrt{(x^{(a)} - x^{(b)})^2 + (y^{(a)} - y^{(b)})^2}$
35 | - 立体距离
36 | $\sqrt{(x^{(a)} - x^{(b)})^2 + (y^{(a)} - y^{(b)})^2 + (z^{(a)} - z^{(b)})^2}$
37 | - n维空间距离
38 | $\sqrt{\sum_{i=1}^{n} (X_i^{(a)} - X_i^{(b)})^2}$
39 | #### 曼哈顿距离
40 | $\sum_{i=1}^{n} |X_i^{(a)} - X_i^{(b)}|$
41 | #### 明可夫斯基距离
42 | $\left( \sum_{i=1}^{n} |X_i^{(a)} - X_i^{(b)}|^p \right)^{\frac{1}{p}}$
43 | - 当p=1,相当于曼哈顿距离
44 | - 当p=2,相当于欧氏距离(欧几里得距离)
45 | - 当p=3,其他距离
46 |
47 | ## 参数
48 | ### 超参数
49 | - kNN算法中的k是典型的超参数
50 | - 默认值为5 (经验数值)
51 | - 距离的权重
52 | - 距离越近,权重越大
53 | - 关于“距离”的定义
54 | - 明可夫斯基距离(默认)
55 | 明可夫斯基距离的p取值
56 | - p=1:曼哈顿距离
57 | - p=2(默认):欧氏距离(欧几里得距离)
58 | - p=3:明可夫斯基距离(其他距离)
59 | - 其他更多的距离定义
60 | - 向量空间余弦相似度Cosine Similarity
61 | - 调整余弦相似度Adjusted Cosine Similarity
62 | - 皮尔森相关系数 Pearson Correlation Coefficient
63 | - Jaccard相似系数 Jaccard Coefficient
64 |
65 | ### 模型参数
66 | kNN算法没有模型参数
67 |
68 | ## 数据归一化 Feature Scaling
69 | ### 需要归一化的原因
70 | 如果某些特征数值较大,会主导最终距离的结果
71 | ### 解决方案
72 | 把所有的数据映射到同一尺度
73 | #### 最值归一化 normalization
74 | 把所有数据映射到0~1之间:
75 | $x_{scale} = \frac{x - x_{min}}{x_{max} - x_{min}}$
76 |
77 | - 适用于分布有明显边界的情况
78 | - 例如考试分数,最大是100,最小是0
79 | - 例如每个像素的RGB颜色,都是0~255之间
80 | - 受outlier影响较大
81 | - 例如收入,有些人特别特别高
82 |
83 | #### Standardization(0均值标准化/均值方差归一化)
84 | 针对最值归一化的缺憾改进
85 | **把所有数据归一到均值为0,方差为1的分布中**
86 |
87 | $x_{scale} = \frac{x - x_{mean}}{x_{std}}$,其中 $x_{mean}$ 为均值,$x_{std}$ 为标准差
88 |
89 | - 并不保证数据在0~1之间
90 | - 但是所有数值的均值在0的位置
91 | - 数据方差/标准差为1
92 |
93 | 适用于数据分布没有明显的分界(有可能存在极端数据值)。其实数据分布有明显边界的情况也是同样适合的,所以选它一般没错。
94 |
95 | ### 数据归一化的一些注意事项
96 | #### 对测试数据集如何归一化
97 | 例如训练集有均值 $mean_{train}$,标准差 $std_{train}$,那么,测试数据集进行归一化(例如0均值标准化)时,应该使用训练集的均值和标准差,而不是用测试集的均值和标准差。原因有:
98 | 1. 测试数据是模拟真实环境,真实环境很可能无法得到所有测试数据的均值和标准差。(个人理解,如果使用测试数据集的均值和标准差,那么以后每有一个新的样例进来,岂不是要重新计算(分配)所有测试样例的均值和标准差?)
99 | 2. 对数据的归一化也是算法的一部分
100 |
101 | #### 需要保存训练数据集得到的均值和标准差
102 | - 使用skLearn进行数据归一化处理
103 | - 使用StandardScaler进行0均值标准化
104 |
105 |
106 |
--------------------------------------------------------------------------------
/c1_knn/05_Hyper_Parameters.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 超参数"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn import datasets"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "digits = datasets.load_digits()\n",
29 | "X = digits.data\n",
30 | "y = digits.target"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.model_selection._split import train_test_split"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=666)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 5,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "0.98888888888888893"
60 | ]
61 | },
62 | "execution_count": 5,
63 | "metadata": {},
64 | "output_type": "execute_result"
65 | }
66 | ],
67 | "source": [
68 | "from sklearn.neighbors.classification import KNeighborsClassifier\n",
69 | "\n",
70 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
71 | "knn_clf.fit(X_train, y_train)\n",
72 | "knn_clf.score(X_test, y_test)"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "## 寻找最好的k"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "best_score: 0.991666666667\nbest_k: 4\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "best_score = 0.0\n",
97 | "best_k = -1\n",
98 | "for k in range(1,11):\n",
99 | " knn_clf = KNeighborsClassifier(n_neighbors=k)\n",
100 | " knn_clf.fit(X_train, y_train)\n",
101 | " score = knn_clf.score(X_test, y_test)\n",
102 | " if score > best_score:\n",
103 | " best_score = score\n",
104 | " best_k = k\n",
105 | "print('best_score:',best_score)\n",
106 | "print('best_k:', best_k)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## 考虑距离?不考虑距离? \n",
114 | "引出另一个超参数:距离权重"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 8,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "name": "stdout",
124 | "output_type": "stream",
125 | "text": [
126 | "best_score: 0.991666666667\nbest_k: 4\nbest weights: uniform\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "best_method = ''\n",
132 | "best_score = 0.0\n",
133 | "best_k = -1\n",
134 | "for method in ['uniform','distance']:\n",
135 | " for k in range(1,11):\n",
136 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)\n",
137 | " knn_clf.fit(X_train, y_train)\n",
138 | " score = knn_clf.score(X_test, y_test)\n",
139 | " if score > best_score:\n",
140 | " best_score = score\n",
141 | " best_k = k\n",
142 | " best_method = method\n",
143 | "print('best_score:',best_score)\n",
144 | "print('best_k:', best_k)\n",
145 | "print('best weights:', best_method)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "## 搜索明可夫斯基距离相应的p"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 10,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "best_score: 0.988888888889\nbest_k: 5\nbest p: 1\nCPU times: user 15.7 s, sys: 116 ms, total: 15.8 s\nWall time: 16.1 s\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "%%time\n",
170 | "best_score = 0.0\n",
171 | "best_k = -1\n",
172 | "best_p = -1\n",
173 | "for p in range(1,6):\n",
174 | " for k in range(1,11):\n",
175 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)\n",
176 | " knn_clf.fit(X_train, y_train)\n",
177 | " score = knn_clf.score(X_test, y_test)\n",
178 | " if score > best_score:\n",
179 | " best_score = score\n",
180 | " best_k = k\n",
181 | " best_p = p\n",
182 | "print('best_score:',best_score)\n",
183 | "print('best_k:', best_k)\n",
184 | "print('best p:', best_p)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": []
193 | }
194 | ],
195 | "metadata": {
196 | "kernelspec": {
197 | "display_name": "Python 2",
198 | "language": "python",
199 | "name": "python2"
200 | },
201 | "language_info": {
202 | "codemirror_mode": {
203 | "name": "ipython",
204 | "version": 2
205 | },
206 | "file_extension": ".py",
207 | "mimetype": "text/x-python",
208 | "name": "python",
209 | "nbconvert_exporter": "python",
210 | "pygments_lexer": "ipython2",
211 | "version": "2.7.6"
212 | }
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 0
216 | }
217 |
--------------------------------------------------------------------------------
/c3_gradient_descent/06_Stochastic_Gradient_Descent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 随机梯度下降法"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "m = 100000\n",
29 | "\n",
30 | "x = np.random.normal(size=m)\n",
31 | "X = x.reshape(-1,1)\n",
32 | "y = 4.*x + 3. + np.random.normal(0,3,size=m)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# 损失函数\n",
42 | "def J(theta, X_b, y):\n",
43 | " try:\n",
44 | " return np.sum((y - X_b.dot(theta))**2) / len(y)\n",
45 | " except:\n",
46 | " return float('inf')\n",
47 | " \n",
48 | "def derivative_J(theta:np.ndarray, X_b:np.ndarray, y:np.ndarray):\n",
49 | " \"\"\"\n",
50 | " 求θ为给定值时的导数(梯度)\n",
51 | " :param theta: \n",
52 | " :param X_b: \n",
53 | " :param y: \n",
54 | " :return: \n",
55 | " \"\"\"\n",
56 | " return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)\n",
57 | "\n",
58 | "# 批量梯度下降法\n",
59 | "def gradient_descent(X_b, y, initial_theta, eta=0.01, n_iters=1e4, epsilon=1e-8):\n",
60 | " theta = initial_theta\n",
61 | " cur_iter = 0\n",
62 | " while cur_iter < n_iters:\n",
63 | " gradient = derivative_J(theta, X_b, y)\n",
64 | " last_theta = theta\n",
65 | " theta = theta - eta * gradient\n",
66 | " if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n",
67 | " break\n",
68 | " cur_iter += 1\n",
69 | " return theta"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "## 批量梯度下降法效果"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "CPU times: user 1.36 s, sys: 93.2 ms, total: 1.45 s\nWall time: 1.45 s\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "%%time\n",
94 | "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
95 | "initial_theta = np.zeros(X_b.shape[1])\n",
96 | "eta = 0.01\n",
97 | "# 我们知道最终的系数和截距,直接肉眼比较吧。。。就不分训练集测试集了\n",
98 | "theta = gradient_descent(X_b,y,initial_theta, eta)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 7,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/plain": [
109 | "array([ 3.01042744, 4.00071587])"
110 | ]
111 | },
112 | "execution_count": 7,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "theta"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "## 随机梯度下降法效果"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 8,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "def derivative_J_sgd(theta:np.ndarray, X_b_i:np.ndarray, y_i):\n",
135 | " \"\"\"\n",
136 | " 求随机搜索方向 \n",
137 | " \"\"\"\n",
138 | " return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 9,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "def sgd(X_b, y, initial_theta, n_iters):\n",
148 | " # 两个超参数\n",
149 | " t0 = 5\n",
150 | " t1 = 50\n",
151 | " \n",
152 | " def learning_rate(t):\n",
153 | " return t0/(t+t1)\n",
154 | " \n",
155 | " theta = initial_theta\n",
156 | " for cur_iter in range(n_iters):\n",
157 | " # 随机选一个\n",
158 | " rand_i = np.random.randint(len(X_b))\n",
159 | " gradient = derivative_J_sgd(theta, X_b[rand_i], y[rand_i])\n",
160 | " # 向搜索方向的相反方向移动η\n",
161 | " theta = theta - learning_rate(cur_iter) * gradient\n",
162 | " return theta"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 10,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "name": "stdout",
172 | "output_type": "stream",
173 | "text": [
174 | "CPU times: user 276 ms, sys: 6.11 ms, total: 282 ms\nWall time: 283 ms\n"
175 | ]
176 | }
177 | ],
178 | "source": [
179 | "%%time\n",
180 | "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
181 | "initial_theta = np.zeros(X_b.shape[1])\n",
182 | "theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 11,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/plain": [
193 | "array([ 3.02984824, 3.9936953 ])"
194 | ]
195 | },
196 | "execution_count": 11,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "theta"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "结论:批量梯度下降法和随机梯度下降法最终效果差不多,但是随机梯度下降法循环次数少得多,计算时间短得多"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": []
218 | }
219 | ],
220 | "metadata": {
221 | "kernelspec": {
222 | "display_name": "Python 2",
223 | "language": "python",
224 | "name": "python2"
225 | },
226 | "language_info": {
227 | "codemirror_mode": {
228 | "name": "ipython",
229 | "version": 2
230 | },
231 | "file_extension": ".py",
232 | "mimetype": "text/x-python",
233 | "name": "python",
234 | "nbconvert_exporter": "python",
235 | "pygments_lexer": "ipython2",
236 | "version": "2.7.6"
237 | }
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 0
241 | }
242 |
--------------------------------------------------------------------------------
/c2_linear_regression/10_More_About_Linear_Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 更多关于线性回归模型的讨论"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn import datasets\n",
20 | "\n",
21 | "boston = datasets.load_boston()\n",
22 | "\n",
23 | "X = boston.data\n",
24 | "y = boston.target\n",
25 | "\n",
26 | "X = X[y<50.0]\n",
27 | "y = y[y<50.0]"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
39 | ]
40 | },
41 | "execution_count": 5,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "from sklearn.linear_model.base import LinearRegression\n",
48 | "\n",
49 | "lin_reg = LinearRegression()\n",
50 | "lin_reg.fit(X, y)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 6,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/plain": [
61 | "array([ -1.05574295e-01, 3.52748549e-02, -4.35179251e-02,\n 4.55405227e-01, -1.24268073e+01, 3.75411229e+00,\n -2.36116881e-02, -1.21088069e+00, 2.50740082e-01,\n -1.37702943e-02, -8.38888137e-01, 7.93577159e-03,\n -3.50952134e-01])"
62 | ]
63 | },
64 | "execution_count": 6,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "lin_reg.coef_"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "把系数按从小到大排一下序"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 7,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "array([ 4, 7, 10, 12, 0, 2, 6, 9, 11, 1, 8, 3, 5])"
89 | ]
90 | },
91 | "execution_count": 7,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "arg_sort = np.argsort(lin_reg.coef_)\n",
98 | "arg_sort"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "看看按照影响程度从小到大排序后的各个系数对应的都是什么属性(名称)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 8,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": [
116 | "array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',\n 'B', 'ZN', 'RAD', 'CHAS', 'RM'],\n dtype=' best_score:\n",
73 | " best_score, best_p, best_k = score, p, k\n",
74 | "print('Best K =', best_k)\n",
75 | "print('Best P =', best_p)\n",
76 | "print('Best Score =', best_score)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "## 使用交叉验证"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 6,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "data": {
93 | "text/plain": [
94 | "array([ 0.98895028, 0.97777778, 0.96629213])"
95 | ]
96 | },
97 | "execution_count": 6,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "from sklearn.model_selection._validation import cross_val_score\n",
104 | "\n",
105 | "knn_clf = KNeighborsClassifier()\n",
106 | "cross_val_score(knn_clf, X_train, y_train)\n",
107 | "# 结果返回3个数,表示默认是分为3份做交叉验证"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 7,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "Best K = 2\nBest P = 2\nBest Score = 0.982359987401\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "best_score, best_p, best_k = 0, 0, 0\n",
125 | "for k in range(2, 11):\n",
126 | " for p in range(1, 6):\n",
127 | " knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)\n",
128 | " scores = cross_val_score(knn_clf, X_train, y_train)\n",
129 | " score = np.mean(scores)\n",
130 | " if score > best_score:\n",
131 | " best_score, best_p, best_k = score, p, k\n",
132 | "print('Best K =', best_k)\n",
133 | "print('Best P =', best_p)\n",
134 | "print('Best Score =', best_score)"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "cross_val_score(knn_clf, X_train, y_train) \n",
142 | "可以看到在使用交叉验证寻找最佳超参数的过程中,是完全不使用测试集的"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "## 回顾网格搜索 \n",
150 | "网格搜索其实就是用了交叉验证"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 8,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "name": "stdout",
160 | "output_type": "stream",
161 | "text": [
162 | "Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
163 | ]
164 | },
165 | {
166 | "name": "stderr",
167 | "output_type": "stream",
168 | "text": [
169 | "[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 1.1min finished\n"
170 | ]
171 | },
172 | {
173 | "data": {
174 | "text/plain": [
175 | "GridSearchCV(cv=None, error_score='raise',\n estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=10, p=5,\n weights='distance'),\n fit_params=None, iid=True, n_jobs=1,\n param_grid=[{'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n scoring=None, verbose=1)"
176 | ]
177 | },
178 | "execution_count": 8,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "from sklearn.model_selection._search import GridSearchCV\n",
185 | "param_grid = [\n",
186 | " {\n",
187 | " 'weights':['distance'],\n",
188 | " 'n_neighbors':[i for i in range(2,11)],\n",
189 | " 'p': [i for i in range(1,6)]\n",
190 | " }\n",
191 | "]\n",
192 | "\n",
193 | "grid_search = GridSearchCV(knn_clf, param_grid, verbose=1)\n",
194 | "grid_search.fit(X_train, y_train)\n"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 9,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/plain": [
205 | "0.98237476808905377"
206 | ]
207 | },
208 | "execution_count": 9,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "grid_search.best_score_"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}"
226 | ]
227 | },
228 | "execution_count": 10,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "grid_search.best_params_\n",
235 | "# 与我们上面手动调用交叉验证得到的超参数一致"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 11,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "0.98052851182197498"
247 | ]
248 | },
249 | "execution_count": 11,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "best_knn_clf = grid_search.best_estimator_\n",
256 | "best_knn_clf.score(X_test, y_test)"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 12,
262 | "metadata": {},
263 | "outputs": [
264 | {
265 | "data": {
266 | "text/plain": [
267 | "array([ 0.99543379, 0.96803653, 0.98148148, 0.96261682, 0.97619048])"
268 | ]
269 | },
270 | "execution_count": 12,
271 | "metadata": {},
272 | "output_type": "execute_result"
273 | }
274 | ],
275 | "source": [
276 | "# cross_val_score 默认是分3份,如果要分5份:\n",
277 | "cross_val_score(knn_clf, X_train, y_train, cv=5)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "# GridSearchCV 中的交叉验证,如果要分为5份:\n",
287 | "GridSearchCV(knn_clf, param_grid, verbose=1, cv=5)"
288 | ]
289 | }
290 | ],
291 | "metadata": {
292 | "kernelspec": {
293 | "display_name": "Python 2",
294 | "language": "python",
295 | "name": "python2"
296 | },
297 | "language_info": {
298 | "codemirror_mode": {
299 | "name": "ipython",
300 | "version": 2
301 | },
302 | "file_extension": ".py",
303 | "mimetype": "text/x-python",
304 | "name": "python",
305 | "nbconvert_exporter": "python",
306 | "pygments_lexer": "ipython2",
307 | "version": "2.7.6"
308 | }
309 | },
310 | "nbformat": 4,
311 | "nbformat_minor": 0
312 | }
313 |
--------------------------------------------------------------------------------
/c7_classification_performance_measures/03_implement_confusion_matrix_precision_and_recall.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 实现混淆矩阵,精准率和召回率"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn import datasets"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "digits = datasets.load_digits()\n",
29 | "X = digits.data\n",
30 | "y = digits.target.copy()\n",
31 | "\n",
32 | "# 把数据变为极度偏斜的数据\n",
33 | "# 把手写数字分为9和非9两大类, 重点关注的是分类为9的数字\n",
34 | "y[digits.target==9] = 1\n",
35 | "y[digits.target!=9] = 0"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "from sklearn.model_selection._split import train_test_split\n",
45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 5,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "0.97555555555555551"
57 | ]
58 | },
59 | "execution_count": 5,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "from sklearn.linear_model.logistic import LogisticRegression\n",
66 | "\n",
67 | "log_reg = LogisticRegression()\n",
68 | "log_reg.fit(X_train, y_train)\n",
69 | "log_reg.score(X_test, y_test)"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "虽然0.975555555551看上去很高了,但因为我们的数据是极度偏斜的数据,即使我们把全部分类预测为\"非9\"也会有0.9左右的正确率"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "y_predict = log_reg.predict(X_test)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## 求TP,FP,FN,TN的值"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 7,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "403"
104 | ]
105 | },
106 | "execution_count": 7,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "def TN(y_true, y_predict):\n",
113 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
114 | " return np.sum((y_true == 0) & (y_predict == 0))\n",
115 | "\n",
116 | "TN(y_test, y_predict)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 8,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/plain": [
127 | "2"
128 | ]
129 | },
130 | "execution_count": 8,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "def FP(y_true, y_predict):\n",
137 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
138 | " return np.sum((y_true == 0) & (y_predict == 1))\n",
139 | "\n",
140 | "FP(y_test, y_predict)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 9,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "9"
152 | ]
153 | },
154 | "execution_count": 9,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "def FN(y_true, y_predict):\n",
161 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
162 | " return np.sum((y_true == 1) & (y_predict == 0))\n",
163 | "\n",
164 | "FN(y_test, y_predict)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 10,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "36"
176 | ]
177 | },
178 | "execution_count": 10,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "def TP(y_true, y_predict):\n",
185 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n",
186 | " return np.sum((y_true == 1) & (y_predict == 1))\n",
187 | "\n",
188 | "TP(y_test, y_predict)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 12,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "array([[403, 2],\n [ 9, 36]])"
200 | ]
201 | },
202 | "execution_count": 12,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "def confusion_matrix(y_true, y_predict):\n",
209 | " \"\"\"返回一个2✖️2的混淆矩阵\"\"\"\n",
210 | " return np.array([\n",
211 | " [TN(y_true, y_predict), FP(y_true, y_predict)],\n",
212 | " [FN(y_true, y_predict), TP(y_true, y_predict)]\n",
213 | " ])\n",
214 | "\n",
215 | "confusion_matrix(y_test, y_predict)"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "## 根据混淆矩阵求精准率和召回率"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 13,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "data": {
232 | "text/plain": [
233 | "0.94736842105263153"
234 | ]
235 | },
236 | "execution_count": 13,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "def precision_score(y_true, y_predict):\n",
243 | " \"\"\"求精准率\"\"\"\n",
244 | " tp = TP(y_true, y_predict)\n",
245 | " fp = FP(y_true, y_predict)\n",
246 | " try:\n",
247 | " return tp / (tp + fp)\n",
248 | " except: # 分母为0时,结果返回0\n",
249 | " return 0.0\n",
250 | "\n",
251 | "# 精准率\n",
252 | "precision_score(y_test, y_predict)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 14,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/plain": [
263 | "0.80000000000000004"
264 | ]
265 | },
266 | "execution_count": 14,
267 | "metadata": {},
268 | "output_type": "execute_result"
269 | }
270 | ],
271 | "source": [
272 | "def recall_score(y_true, y_predict):\n",
273 | " \"\"\"求召回率\"\"\"\n",
274 | " tp = TP(y_true, y_predict)\n",
275 | " fn = FN(y_true, y_predict)\n",
276 | " try:\n",
277 | " return tp / (tp + fn)\n",
278 | " except:\n",
279 | " return 0.0\n",
280 | "\n",
281 | "# 召回率\n",
282 | "recall_score(y_test, y_predict)"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "# scikit-learn中的混淆矩阵,精准率和召回率"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "混淆矩阵"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 15,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "array([[403, 2],\n [ 9, 36]])"
308 | ]
309 | },
310 | "execution_count": 15,
311 | "metadata": {},
312 | "output_type": "execute_result"
313 | }
314 | ],
315 | "source": [
316 | "import sklearn.metrics.classification as classification\n",
317 | "classification.confusion_matrix(y_test, y_predict)"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "精准率"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 16,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/plain": [
335 | "0.94736842105263153"
336 | ]
337 | },
338 | "execution_count": 16,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "classification.precision_score(y_test, y_predict)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "召回率"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 17,
357 | "metadata": {},
358 | "outputs": [
359 | {
360 | "data": {
361 | "text/plain": [
362 | "0.80000000000000004"
363 | ]
364 | },
365 | "execution_count": 17,
366 | "metadata": {},
367 | "output_type": "execute_result"
368 | }
369 | ],
370 | "source": [
371 | "classification.recall_score(y_test, y_predict)"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {},
378 | "outputs": [],
379 | "source": []
380 | }
381 | ],
382 | "metadata": {
383 | "kernelspec": {
384 | "display_name": "Python 2",
385 | "language": "python",
386 | "name": "python2"
387 | },
388 | "language_info": {
389 | "codemirror_mode": {
390 | "name": "ipython",
391 | "version": 2
392 | },
393 | "file_extension": ".py",
394 | "mimetype": "text/x-python",
395 | "name": "python",
396 | "nbconvert_exporter": "python",
397 | "pygments_lexer": "ipython2",
398 | "version": "2.7.6"
399 | }
400 | },
401 | "nbformat": 4,
402 | "nbformat_minor": 0
403 | }
404 |
--------------------------------------------------------------------------------
/c3_gradient_descent/05_Vectorize_Gradient_Descent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 梯度下降的向量化"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn import datasets"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "boston = datasets.load_boston()\n",
29 | "X = boston.data\n",
30 | "y = boston.target\n",
31 | "\n",
32 | "X = X[y < 50.0]\n",
33 | "y = y[y < 50.0]"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "from playML.model_selection import train_test_split\n",
43 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## 使用正规方程解法"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 4,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "CPU times: user 827 µs, sys: 2.74 ms, total: 3.57 ms\nWall time: 8.71 ms\n"
63 | ]
64 | },
65 | {
66 | "data": {
67 | "text/plain": [
68 | "0.81298026026586467"
69 | ]
70 | },
71 | "execution_count": 4,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "from playML.linear_regression import LinearRegression\n",
78 | "\n",
79 | "lin_reg1 = LinearRegression()\n",
80 | "%time lin_reg1.fit_normal(X_train, y_train)\n",
81 | "lin_reg1.score(X_test, y_test)"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## 使用梯度下降法"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stderr",
98 | "output_type": "stream",
99 | "text": [
100 | "/Users/SeaMonster/PycharmProjects/MachineLearningClassicAlgorithm/playML/linear_regression.py:29: RuntimeWarning: overflow encountered in square\n return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)\n/Users/SeaMonster/PycharmProjects/MachineLearningClassicAlgorithm/playML/linear_regression.py:59: RuntimeWarning: invalid value encountered in double_scalars\n if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n"
101 | ]
102 | },
103 | {
104 | "data": {
105 | "text/plain": [
106 | "nan"
107 | ]
108 | },
109 | "execution_count": 5,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "lin_reg2 = LinearRegression()\n",
116 | "lin_reg2.fit_gd(X_train, y_train)\n",
117 | "lin_reg2.score(X_test, y_test)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "array([ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan])"
129 | ]
130 | },
131 | "execution_count": 6,
132 | "metadata": {},
133 | "output_type": "execute_result"
134 | }
135 | ],
136 | "source": [
137 | "lin_reg2.coef_"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "### 调整步长η"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 7,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "data": {
154 | "text/plain": [
155 | "0.27556634853389206"
156 | ]
157 | },
158 | "execution_count": 7,
159 | "metadata": {},
160 | "output_type": "execute_result"
161 | }
162 | ],
163 | "source": [
164 | "lin_reg2.fit_gd(X_train, y_train, eta=1e-6)\n",
165 | "lin_reg2.score(X_test, y_test)"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "结果很差。。。 \n",
173 | "那么,增加循环次数呢?"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 8,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "CPU times: user 36 s, sys: 159 ms, total: 36.1 s\nWall time: 36.4 s\n"
186 | ]
187 | },
188 | {
189 | "data": {
190 | "text/plain": [
191 | "0.75418523539807647"
192 | ]
193 | },
194 | "execution_count": 8,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "%time lin_reg2.fit_gd(X_train, y_train, eta=1e-6, n_iters=1e6)\n",
201 | "lin_reg2.score(X_test, y_test)"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "训练时间很长,但是损失函数仍然远未达到最小"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## 使用梯度下降法前,最好进行数据归一化"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 9,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "from sklearn.preprocessing.data import StandardScaler"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 10,
230 | "metadata": {},
231 | "outputs": [
232 | {
233 | "data": {
234 | "text/plain": [
235 | "StandardScaler(copy=True, with_mean=True, with_std=True)"
236 | ]
237 | },
238 | "execution_count": 10,
239 | "metadata": {},
240 | "output_type": "execute_result"
241 | }
242 | ],
243 | "source": [
244 | "standard_scaler = StandardScaler()\n",
245 | "standard_scaler.fit(X_train)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 11,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "X_train_standard = standard_scaler.transform(X_train)\n",
255 | "X_test_standard = standard_scaler.transform(X_test)"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 12,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "CPU times: user 212 ms, sys: 5.18 ms, total: 217 ms\nWall time: 223 ms\n"
268 | ]
269 | },
270 | {
271 | "data": {
272 | "text/plain": [
273 | "0.81298806201222351"
274 | ]
275 | },
276 | "execution_count": 12,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "lin_reg3 = LinearRegression()\n",
283 | "%time lin_reg3.fit_gd(X_train_standard, y_train)\n",
284 | "lin_reg3.score(X_test_standard, y_test)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## 梯度下降法的优势"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 13,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "m = 1000\n",
301 | "n = 5000\n",
302 | "\n",
303 | "big_X = np.random.normal(size=(m,n))\n",
304 | "\n",
305 | "true_theta = np.random.uniform(0.0, 100.0, size=n+1) #最终要求(或者说,尽可能接近)的系数和截距\n",
306 | "\n",
307 | "big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0.,10.,size=m)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 14,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "CPU times: user 25.9 s, sys: 643 ms, total: 26.6 s\nWall time: 9.16 s\n"
320 | ]
321 | },
322 | {
323 | "data": {
324 | "text/plain": [
325 | "LinearRegression()"
326 | ]
327 | },
328 | "execution_count": 14,
329 | "metadata": {},
330 | "output_type": "execute_result"
331 | }
332 | ],
333 | "source": [
334 | "big_reg1 = LinearRegression()\n",
335 | "# 主要是看看训练时间,所以就不用train test split了\n",
336 | "%time big_reg1.fit_normal(big_X, big_y)"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 15,
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "name": "stdout",
346 | "output_type": "stream",
347 | "text": [
348 | "CPU times: user 8.58 s, sys: 139 ms, total: 8.72 s\nWall time: 5.01 s\n"
349 | ]
350 | },
351 | {
352 | "data": {
353 | "text/plain": [
354 | "LinearRegression()"
355 | ]
356 | },
357 | "execution_count": 15,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "big_reg2 = LinearRegression()\n",
364 | "# X 本身就都是标准差为1,均值为0的,所以就不用归一化了\n",
365 | "%time big_reg2.fit_gd(big_X, big_y)"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "这个例子中,特征数比较大,梯度下降法比正规方程解法快得多"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": []
381 | }
382 | ],
383 | "metadata": {
384 | "kernelspec": {
385 | "display_name": "Python 2",
386 | "language": "python",
387 | "name": "python2"
388 | },
389 | "language_info": {
390 | "codemirror_mode": {
391 | "name": "ipython",
392 | "version": 2
393 | },
394 | "file_extension": ".py",
395 | "mimetype": "text/x-python",
396 | "name": "python",
397 | "nbconvert_exporter": "python",
398 | "pygments_lexer": "ipython2",
399 | "version": "2.7.6"
400 | }
401 | },
402 | "nbformat": 4,
403 | "nbformat_minor": 0
404 | }
405 |
--------------------------------------------------------------------------------
/c3_gradient_descent/07_SGD_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 使用我们自己的SGD"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "m = 100000\n",
29 | "\n",
30 | "x = np.random.normal(size=m)\n",
31 | "X = x.reshape(-1,1)\n",
32 | "y = 4.*x + 3. + np.random.normal(0,3,size=m)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from playML import linear_regression\n",
42 | "\n",
43 | "lin_reg = linear_regression.LinearRegression()\n",
44 | "lin_reg.fit_sgd(X, y, n_iters=2)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 4,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | "array([ 4.00642662])"
56 | ]
57 | },
58 | "execution_count": 4,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | }
62 | ],
63 | "source": [
64 | "lin_reg.coef_"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "2.9918217520705057"
76 | ]
77 | },
78 | "execution_count": 5,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "lin_reg.interception_"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "## 在真实数据上使用我们自己的SGD"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 6,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "from sklearn import datasets\n",
101 | "boston = datasets.load_boston()\n",
102 | "\n",
103 | "X = boston.data\n",
104 | "y = boston.target\n",
105 | "\n",
106 | "X = X[y<50.0]\n",
107 | "y = y[y<50.0]"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 7,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "from playML.model_selection import train_test_split\n",
117 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "### 使用随机梯度下降法训练前,需要对数据进行归一化处理"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 8,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "from sklearn.preprocessing.data import StandardScaler\n",
134 | "\n",
135 | "standard_scaler = StandardScaler()\n",
136 | "standard_scaler.fit(X_train)\n",
137 | "X_train_standard = standard_scaler.transform(X_train)\n",
138 | "X_test_standard = standard_scaler.transform(X_test)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### 看看效果"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 10,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "CPU times: user 10.5 ms, sys: 5.5 ms, total: 16 ms\nWall time: 11.2 ms\n"
158 | ]
159 | },
160 | {
161 | "data": {
162 | "text/plain": [
163 | "0.79233295554251493"
164 | ]
165 | },
166 | "execution_count": 10,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "lin_reg = linear_regression.LinearRegression()\n",
173 | "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)\n",
174 | "lin_reg.score(X_test_standard, y_test)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "#### 增大循环次数,效果会越来越好么?"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 11,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "CPU times: user 148 ms, sys: 6.43 ms, total: 154 ms\nWall time: 170 ms\n"
194 | ]
195 | },
196 | {
197 | "data": {
198 | "text/plain": [
199 | "0.81324404894409663"
200 | ]
201 | },
202 | "execution_count": 11,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)\n",
209 | "lin_reg.score(X_test_standard, y_test)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 12,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "CPU times: user 290 ms, sys: 7.03 ms, total: 297 ms\nWall time: 338 ms\n"
222 | ]
223 | },
224 | {
225 | "data": {
226 | "text/plain": [
227 | "0.81316850059297174"
228 | ]
229 | },
230 | "execution_count": 12,
231 | "metadata": {},
232 | "output_type": "execute_result"
233 | }
234 | ],
235 | "source": [
236 | "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)\n",
237 | "lin_reg.score(X_test_standard, y_test)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 13,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "name": "stdout",
247 | "output_type": "stream",
248 | "text": [
249 | "CPU times: user 1.23 s, sys: 12.8 ms, total: 1.24 s\nWall time: 1.27 s\n"
250 | ]
251 | },
252 | {
253 | "data": {
254 | "text/plain": [
255 | "0.81207491088465589"
256 | ]
257 | },
258 | "execution_count": 13,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=500)\n",
265 | "lin_reg.score(X_test_standard, y_test)"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "也不是越来越好,只能说比较收敛吧。。。"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## scikit-learn中的SGD"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 14,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "from sklearn.linear_model.stochastic_gradient import SGDRegressor"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 15,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "CPU times: user 2.76 ms, sys: 2.83 ms, total: 5.59 ms\nWall time: 8.37 ms\n"
301 | ]
302 | },
303 | {
304 | "name": "stderr",
305 | "output_type": "stream",
306 | "text": [
307 | "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n"
308 | ]
309 | },
310 | {
311 | "data": {
312 | "text/plain": [
313 | "0.80386489308947862"
314 | ]
315 | },
316 | "execution_count": 15,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "sgd_reg = SGDRegressor()\n",
323 | "%time sgd_reg.fit(X_train_standard, y_train)\n",
324 | "sgd_reg.score(X_test_standard, y_test)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 16,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "CPU times: user 6.5 ms, sys: 1.69 ms, total: 8.19 ms\nWall time: 6.22 ms\n"
337 | ]
338 | },
339 | {
340 | "name": "stderr",
341 | "output_type": "stream",
342 | "text": [
343 | "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n DeprecationWarning)\n"
344 | ]
345 | },
346 | {
347 | "data": {
348 | "text/plain": [
349 | "0.81255341149152971"
350 | ]
351 | },
352 | "execution_count": 16,
353 | "metadata": {},
354 | "output_type": "execute_result"
355 | }
356 | ],
357 | "source": [
358 | "sgd_reg = SGDRegressor(n_iter=100)\n",
359 | "%time sgd_reg.fit(X_train_standard, y_train)\n",
360 | "sgd_reg.score(X_test_standard, y_test)"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "比我们自己手写的快得多了。。。"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": []
376 | }
377 | ],
378 | "metadata": {
379 | "kernelspec": {
380 | "display_name": "Python 2",
381 | "language": "python",
382 | "name": "python2"
383 | },
384 | "language_info": {
385 | "codemirror_mode": {
386 | "name": "ipython",
387 | "version": 2
388 | },
389 | "file_extension": ".py",
390 | "mimetype": "text/x-python",
391 | "name": "python",
392 | "nbconvert_exporter": "python",
393 | "pygments_lexer": "ipython2",
394 | "version": "2.7.6"
395 | }
396 | },
397 | "nbformat": 4,
398 | "nbformat_minor": 0
399 | }
400 |
--------------------------------------------------------------------------------
/c2_linear_regression/09_Regression_in_scikit_learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# scikit-learn中的回归问题"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "from sklearn import datasets"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "boston = datasets.load_boston()\n",
30 | "\n",
31 | "X = boston.data\n",
32 | "y = boston.target\n",
33 | "\n",
34 | "X = X[y<50.0]\n",
35 | "y = y[y<50.0]"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "(490, 13)"
47 | ]
48 | },
49 | "execution_count": 3,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "X.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from sklearn.model_selection._split import train_test_split\n",
65 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## scikit-learn中的线性回归"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from sklearn.linear_model.base import LinearRegression\n",
82 | "\n",
83 | "lin_reg = LinearRegression()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 6,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "name": "stderr",
93 | "output_type": "stream",
94 | "text": [
95 | "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/scipy/linalg/basic.py:1226: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n warnings.warn(mesg, RuntimeWarning)\n"
96 | ]
97 | },
98 | {
99 | "data": {
100 | "text/plain": [
101 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
102 | ]
103 | },
104 | "execution_count": 6,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "lin_reg.fit(X_train, y_train)"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 7,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "array([ -1.14235739e-01, 3.12783163e-02, -4.30926281e-02,\n -9.16425531e-02, -1.09940036e+01, 3.49155727e+00,\n -1.40778005e-02, -1.06270960e+00, 2.45307516e-01,\n -1.23179738e-02, -8.80618320e-01, 8.43243544e-03,\n -3.99667727e-01])"
122 | ]
123 | },
124 | "execution_count": 7,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "lin_reg.coef_"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 8,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/plain": [
141 | "32.645660839653509"
142 | ]
143 | },
144 | "execution_count": 8,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "lin_reg.intercept_"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 9,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "0.80089161995191005"
162 | ]
163 | },
164 | "execution_count": 9,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "lin_reg.score(X_test,y_test)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### KNN Regressor (KNN解决回归问题)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 10,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "from sklearn.neighbors.regression import KNeighborsRegressor\n",
187 | "\n",
188 | "knn_reg = KNeighborsRegressor()"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "#### KNN数据归一化"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 11,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "from sklearn.preprocessing.data import StandardScaler\n",
205 | "standard_scaler = StandardScaler()\n",
206 | "standard_scaler.fit(X_train)\n",
207 | "X_train_nor = standard_scaler.transform(X_train)\n",
208 | "X_test_nor = standard_scaler.transform(X_test)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 12,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "data": {
218 | "text/plain": [
219 | "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform')"
220 | ]
221 | },
222 | "execution_count": 12,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "knn_reg.fit(X_train_nor, y_train)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 13,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/plain": [
239 | "0.82230080487286983"
240 | ]
241 | },
242 | "execution_count": 13,
243 | "metadata": {},
244 | "output_type": "execute_result"
245 | }
246 | ],
247 | "source": [
248 | "knn_reg.score(X_test_nor, y_test)"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "呃,knn效果比线性回归还好。。。 \n",
256 | "要是把网格搜索也用上岂不更离谱。。。"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 14,
262 | "metadata": {},
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "Fitting 3 folds for each of 60 candidates, totalling 180 fits\n"
269 | ]
270 | },
271 | {
272 | "name": "stderr",
273 | "output_type": "stream",
274 | "text": [
275 | "[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.5s finished\n"
276 | ]
277 | },
278 | {
279 | "data": {
280 | "text/plain": [
281 | "GridSearchCV(cv=None, error_score='raise',\n estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform'),\n fit_params=None, iid=True, n_jobs=-1,\n param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n scoring=None, verbose=1)"
282 | ]
283 | },
284 | "execution_count": 14,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "from sklearn.model_selection._search import GridSearchCV\n",
291 | "\n",
292 | "param_grid = [\n",
293 | " {\n",
294 | " 'weights':['uniform'],\n",
295 | " 'n_neighbors':[i for i in range(1,11)]\n",
296 | " },\n",
297 | " {\n",
298 | " 'weights':['distance'],\n",
299 | " 'n_neighbors':[i for i in range(1,11)],\n",
300 | " 'p':[i for i in range(1,6)]\n",
301 | " }\n",
302 | "]\n",
303 | "\n",
304 | "knn_reg2 = KNeighborsRegressor()\n",
305 | "grid_search = GridSearchCV(knn_reg2, param_grid, n_jobs=-1, verbose=1)\n",
306 | "grid_search.fit(X_train_nor, y_train)"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "注意:下面的 best_score_ 是最优参数在训练集上做交叉验证得到的平均分(scoring=None 时用的同样是回归器自带的 score),并不是在测试集上算出来的,所以不能直接与前面在测试集上得到的 score 比较"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 15,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/plain": [
324 | "0.79480244433269864"
325 | ]
326 | },
327 | "execution_count": 15,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "grid_search.best_score_"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 16,
339 | "metadata": {},
340 | "outputs": [
341 | {
342 | "data": {
343 | "text/plain": [
344 | "{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}"
345 | ]
346 | },
347 | "execution_count": 16,
348 | "metadata": {},
349 | "output_type": "execute_result"
350 | }
351 | ],
352 | "source": [
353 | "grid_search.best_params_"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 17,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "knn_reg_grid_search = grid_search.best_estimator_"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 18,
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "0.85652703298427613"
374 | ]
375 | },
376 | "execution_count": 18,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "knn_reg_grid_search.score(X_test_nor, y_test)"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": []
391 | }
392 | ],
393 | "metadata": {
394 | "kernelspec": {
395 | "display_name": "Python 2",
396 | "language": "python",
397 | "name": "python2"
398 | },
399 | "language_info": {
400 | "codemirror_mode": {
401 | "name": "ipython",
402 | "version": 2
403 | },
404 | "file_extension": ".py",
405 | "mimetype": "text/x-python",
406 | "name": "python",
407 | "nbconvert_exporter": "python",
408 | "pygments_lexer": "ipython2",
409 | "version": "2.7.6"
410 | }
411 | },
412 | "nbformat": 4,
413 | "nbformat_minor": 0
414 | }
415 |
--------------------------------------------------------------------------------
/c1_knn/03_Train_Test_Split.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 测试我们的算法"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "from sklearn import datasets"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "iris = datasets.load_iris()"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "# 特征矩阵\n",
39 | "X = iris.data\n",
40 | "\n",
41 | "# 结果标签的向量\n",
42 | "y = iris.target"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "(150, 4)"
54 | ]
55 | },
56 | "execution_count": 4,
57 | "metadata": {},
58 | "output_type": "execute_result"
59 | }
60 | ],
61 | "source": [
62 | "np.shape(X)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 5,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(150,)"
74 | ]
75 | },
76 | "execution_count": 5,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "np.shape(y)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## train_test_split"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 6,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "data": {
99 | "text/plain": [
100 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
101 | ]
102 | },
103 | "execution_count": 6,
104 | "metadata": {},
105 | "output_type": "execute_result"
106 | }
107 | ],
108 | "source": [
109 | "y"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 7,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# 生成 0 ~ n-1 的索引序列(这里 n=150,即样本数),并把它随机打乱\n",
119 | "shuffle_indexes = np.random.permutation(np.shape(y)[0])"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 8,
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "array([ 50, 78, 69, 131, 10, 34, 6, 9, 36, 71, 82, 141, 137,\n 79, 59, 93, 22, 91, 122, 75, 88, 3, 89, 86, 12, 61,\n 14, 132, 119, 121, 129, 33, 103, 13, 37, 47, 139, 125, 73,\n 53, 2, 42, 114, 29, 138, 112, 52, 101, 97, 19, 123, 128,\n 144, 81, 11, 109, 26, 116, 44, 80, 64, 83, 124, 74, 39,\n 31, 58, 145, 102, 120, 76, 63, 65, 135, 8, 55, 77, 60,\n 35, 149, 57, 43, 0, 110, 127, 62, 142, 96, 106, 126, 51,\n 40, 104, 118, 68, 27, 87, 45, 15, 113, 115, 49, 16, 136,\n 117, 66, 5, 21, 67, 140, 54, 100, 99, 30, 18, 72, 148,\n 92, 24, 23, 85, 32, 70, 107, 56, 108, 105, 17, 134, 94,\n 95, 38, 48, 7, 46, 20, 146, 130, 28, 84, 90, 1, 111,\n 25, 133, 143, 41, 147, 4, 98])"
131 | ]
132 | },
133 | "execution_count": 8,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "shuffle_indexes"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "test_ratio = 0.2\n",
149 | "test_size = int(np.shape(X)[0] * test_ratio)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 10,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "30"
161 | ]
162 | },
163 | "execution_count": 10,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "test_size"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 11,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "test_indexes = shuffle_indexes[:test_size]\n",
179 | "train_indexes = shuffle_indexes[test_size:]"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 12,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "X_train = X[train_indexes]\n",
189 | "y_train = y[train_indexes]\n",
190 | "\n",
191 | "X_test = X[test_indexes]\n",
192 | "y_test = y[test_indexes]\n"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "## 使用我们的算法"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 13,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "from c1_knn.model_selection import train_test_split"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 14,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "X_train, X_test, y_train, y_test = train_test_split(X,y,seed=1)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "### 先试试使用之前自己写的KNNClassifier"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 15,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "from c1_knn.kNN import KNNClassifier"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 16,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "my_knn_clf = KNNClassifier(k=3)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 17,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "kNN(k=3)"
254 | ]
255 | },
256 | "execution_count": 17,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "my_knn_clf.fit(X_train, y_train)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 18,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/plain": [
273 | "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n 0, 2, 1, 0, 0, 1, 2])"
274 | ]
275 | },
276 | "execution_count": 18,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "y_predict = my_knn_clf.predict(X_test)\n",
283 | "y_predict"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 19,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "data": {
293 | "text/plain": [
294 | "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n 0, 2, 1, 0, 0, 1, 2])"
295 | ]
296 | },
297 | "execution_count": 19,
298 | "metadata": {},
299 | "output_type": "execute_result"
300 | }
301 | ],
302 | "source": [
303 | "y_test"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 21,
309 | "metadata": {},
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "1.0"
315 | ]
316 | },
317 | "execution_count": 21,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "# 正确率\n",
324 | "np.sum(y_predict==y_test) / len(y_test)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "### sklearn中的train_test_split"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 22,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "from sklearn.model_selection._split import train_test_split"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 25,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 26,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')"
361 | ]
362 | },
363 | "execution_count": 26,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "from sklearn.neighbors.classification import KNeighborsClassifier\n",
370 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
371 | "knn_clf.fit(X_train, y_train)\n"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 27,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "data": {
381 | "text/plain": [
382 | "array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2,\n 0, 1, 1, 0, 1, 2, 2])"
383 | ]
384 | },
385 | "execution_count": 27,
386 | "metadata": {},
387 | "output_type": "execute_result"
388 | }
389 | ],
390 | "source": [
391 | "y_predict = knn_clf.predict(X_test)\n",
392 | "y_predict"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 30,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "1.0"
404 | ]
405 | },
406 | "execution_count": 30,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "# 准确率\n",
413 | "np.sum(y_predict==y_test)/len(y_test)"
414 | ]
415 | }
416 | ],
417 | "metadata": {
418 | "kernelspec": {
419 | "display_name": "Python 2",
420 | "language": "python",
421 | "name": "python2"
422 | },
423 | "language_info": {
424 | "codemirror_mode": {
425 | "name": "ipython",
426 | "version": 2
427 | },
428 | "file_extension": ".py",
429 | "mimetype": "text/x-python",
430 | "name": "python",
431 | "nbconvert_exporter": "python",
432 | "pygments_lexer": "ipython2",
433 | "version": "2.7.6"
434 | }
435 | },
436 | "nbformat": 4,
437 | "nbformat_minor": 0
438 | }
439 |
--------------------------------------------------------------------------------
/c6_logistic_regression/01_Sigmoid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# Sigmoid函数"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## 绘制Sigmoid函数"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 4,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def sigmoid(t):\n",
36 | " return 1/(1 + np.exp(-t))"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 6,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3Xt8VPWd//HXJxcSIIQ74Q4qqOCdRBC1FcRSsK24u9TitrbWunTdtZdff/11td2f29rd36/d7uW3Pmrbta2rdW1RW22ppYVKQ7ValasoRCRCIOGSAEEghFxm5vP7YyY6prlMkpmcmcn7+XjMI+fyPWfeOXPymZPvnDnH3B0REckuOUEHEBGR5FNxFxHJQiruIiJZSMVdRCQLqbiLiGQhFXcRkSyk4i4ikoVU3EVEspCKu4hIFsoL6onHjBnj06dP79Wyp0+fZujQockNlATK1TPK1XPpmk25eqYvuTZv3nzU3cd229DdA3mUlpZ6b5WXl/d62VRSrp5Rrp5L12zK1TN9yQVs8gRqrLplRESykIq7iEgWUnEXEclCKu4iIllIxV1EJAt1W9zN7EEzqzOz1zqZb2Z2n5lVmtl2M5uT/JgiItITiRy5PwQs6WL+UmBm7LES+G7fY4mISF90+yUmd3/WzKZ30WQZ8KPY+ZcvmtkIM5vg7oeSlFFEslQk4jSHIjS1hmkKhQmFndZwhHDEaQ07oUiEUMQJtQ2HPTYv2iYUcSLuuIPjRCJEx4l+h+f16lYOvrT/XdPco20i/s64Ex1vW1cbjxt51/QOp/1p23fdxDSu8YgzYRb0cdt1x+LDd9ooWtyfdvcLO5j3NPANd/9DbHw98HfuvqmDtiuJHt1TUlJSumrVql6FbmhooKioqFfLppJy9Yxy9Vw6ZXN3TrfCW83O4RONhHIKaQw5ja1OY4jYz3eGW8JOawRawtAScVrC0BoJ+rfoXxb7edM5ztKZvXsdFy5cuNndy7pr16+XH3D3B4AHAMrKynzBggW9Ws+GDRvo7bKppFw9o1w915/ZwhHnwPEz7Ks/zf76RvYfa2R/fSMH3zpD3almjjY00xpuOzg0oPntZQfl5lA8OJ/iwjyGDc5n0sg8BufnUpifS2F+DoX5uQzOz6UgNt42Lz83h7wcIy/Xoj9zcsjNNfJzcsjNMfJzLfYzh7xcI9eMnBzDgBwzcswwI/YwXnrxj1x15ZUY0XGztnZgGJbDnyxLbLxN3CBxg1hsxrun/en8jvTH65iM4n4AmBI3Pjk2TUQyRHMozGsHTvBqzQleP3yKikMn2VV7iqa4Q+tBuTlMHjWYSSMGM2PcMMYVFzC2qICxwwo4+GYFi66eR/HgPIoL8ynMzw3wt3nHqMIcSooLg44RiGQU99XAnWa2CpgHnFB/u0h6awlF2FRVz3OVR9lUVc8rNSdoCUUL+cgh+cyaUMxfzp3GeeOLmDZ6KFNHDWF8cSE5OR0fjW44/gYzxqVHd5FEdVvczewnwAJgjJnVAP8A5AO4+/eANcD1QCXQCHwyVWFFpPdONrWybkct6ytqeW73URqaQ+TlGBdOGs4n5k+jdNooLp0ygpLigi67FCQzJHK2zM3dzHfgb5OWSESSJhxxNuyq48mtB3hmZy3NoQjjiwv50CUTWHjeOK6aMYahBYFd+VtSSK+qSBY62dTK4xurefiPVVTXn2HU0EGsuHwKN142iUunjNCR+QCg4i6SRU42tfKD5/by4B/20tAc4vLpI7l76SzeN7uE/FxdbWQgUXEXyQLNoTAPPV/Fd3//Jm81trL0wvH8zYIZXDR5eNDRJCAq7iIZ7oXKo/z9z19jz9HTXHPuWL64+DwVdVFxF8lUDc0hvrZ6B09srmHqqCE8fNtcrjm3+1trysCg4i6SgbbsP87nV22j5ngjf7PgHD67aGbafHFI0oOKu0gGcXce+WMVX/vlTkqKC3ns0/O5fPqoo
GNJGlJxF8kQLaEID+9sYUP1Dq49fxz//pFLGT44P+hYkqZU3EUyQGNLiE8/spnnqkPcseAcvrj4PHI7uRSACKi4i6S9E2daue2hjWzdf5xPXTiIv1tyftCRJAOouIuksRONrdz8/RfZXXeK73x0DoVHdwUdSTKEvrImkqYaW0J88qGXqaxr4PsfL2PJhROCjiQZRMVdJA01h8J8+pHNbKt+i/tuvpQF540LOpJkGHXLiKQZd+fLT77Gc7uP8s/LL9YRu/SKjtxF0swP/7CXn22p4XOLZnJT2ZTuFxDpgIq7SBr5/RtH+D9rKlh64Xg+t2hm0HEkg6m4i6SJg2+d4bM/2cq5JcP4lw9f0ukt7UQSoeIukgbCEefzj20jFI7wvY+V6u5I0mfag0TSwP3llby8t55/u+kSpo8ZGnQcyQI6chcJ2Nb9x/mP9bu58dKJ/PmcyUHHkSyh4i4SoJZQhL/72XZKhhXw9RsvDDqOZBF1y4gE6DsbKnmjtoEHby1jWKGu8CjJoyN3kYDsrj3F/eWV3HDJRK49vyToOJJlVNxFAuDufPmpVykqyOMfPjQ76DiShVTcRQLw9PZDbKw6zpeWnM/oooKg40gWUnEX6WdNrWG+8evXmT2hWJcXkJRRcRfpZw88u4cDb53hHz40W3dTkpRRcRfpR7Unm/juhje5/qLxzDt7dNBxJIupuIv0o2//rpLWcIS7lswKOopkORV3kX5SXd/Iqo37+cjlU5g6ekjQcSTLJVTczWyJme0ys0ozu6uD+VPNrNzMtprZdjO7PvlRRTLbfet3Y2Z85lpdyldSr9vibma5wP3AUmA2cLOZtT8x9++Bx939MmAF8J1kBxXJZHuONPCzLTXccsU0xg8vDDqODACJHLnPBSrdfY+7twCrgGXt2jhQHBseDhxMXkSRzHff+t0U5OVyx4Jzgo4iA0Qi15aZBFTHjdcA89q1+Sqwzsw+AwwFrktKOpEsUF3fyC+3H+K2q6YzRl9Ykn5i7t51A7PlwBJ3vz02fgswz93vjGvzhdi6/tXM5gM/BC5090i7da0EVgKUlJSUrlq1qlehGxoaKCoq6tWyqaRcPTNQcj2ys5kN1SH+5ZrBjCzs2zkMA2WbJUs25lq4cOFmdy/rtqG7d/kA5gNr48bvBu5u12YHMCVufA8wrqv1lpaWem+Vl5f3etlUUq6eGQi5jpxq8nO/ssa/9MQrSVnfQNhmyZSNuYBN3k3ddveE+tw3AjPN7CwzG0T0A9PV7drsBxYBmNksoBA4ksC6RbLaQ89X0RKOsPKas4OOIgNMt8Xd3UPAncBaoILoWTE7zOxeM7sh1ux/An9lZq8APwFujb3DiAxYp5tD/OiPVSy9cDznjE2/rgHJbgndrMPd1wBr2k27J254J3BVcqOJZLYnt9RwsinE7e/RUbv0P31DVSQFIhHnoRequGTKCOZMHRl0HBmAVNxFUuAPlUd588hpbr1yWtBRZIBScRdJgYdeqGJMUQHXXzQh6CgyQKm4iyRZ1dHTlO+q46PzplKQlxt0HBmgVNxFkuy/X9xHrhkfnTc16CgygKm4iyRRcyjMk1sPsPiCEsYV6wJhEhwVd5Ek+u3OWupPt/CRy3XULsFScRdJosc2VjNpxGCunjEm6CgywKm4iyRJdX0jz+0+yk1lU3TjawmcirtIkjy+qRoz+HDZ5KCjiKi4iyRDKBzhiU01XHPuWCaOGBx0HBEVd5Fk+P0bRzh8sokVl08JOooIoOIukhQ/3VzD6KGDWDSrJOgoIoCKu0ifnTjTyvqKOj50yUTyc/UnJelBe6JIH/361UO0hCP8+ZxJQUcReZuKu0gfPbn1AGePHcpFk4YHHUXkbSruIn1Qc7yRl/fW82eXTsJM57ZL+lBxF+mDX2w7CMCyS9UlI+lFxV2kl9ydp7YeoGzaSKaOHhJ0HJF3UXEX6aUdB09SWdfAn+mDVElDKu4ivfTzrQfIzzU+oLstSRpScRfphUjEWfPqI
d47cywjhgwKOo7In1BxF+mFrdVvcfBEEx+8REftkp5U3EV64VfbDzEoL4frdLkBSVMq7iI9FN8lM6wwP+g4Ih1ScRfpoa3Vxzl8sokPXqwuGUlfKu4iPfR0rEtm0axxQUcR6ZSKu0gPtHXJLDhXXTKS3lTcRXpgy/7j1J5s5gPqkpE0p+Iu0gPvdMnoLBlJbyruIglq65JZeN5Yigrygo4j0qWEiruZLTGzXWZWaWZ3ddLmJjPbaWY7zOzHyY0pErzN+49Td6qZ63W5AckA3R5+mFkucD/wPqAG2Ghmq919Z1ybmcDdwFXuftzMdBqBZJ11Ow4zKDeHa8/X7i3pL5Ej97lApbvvcfcWYBWwrF2bvwLud/fjAO5el9yYIsFyd9btrOXKGaN1loxkBHP3rhuYLQeWuPvtsfFbgHnufmdcm58DbwBXAbnAV939Nx2sayWwEqCkpKR01apVvQrd0NBAUVFRr5ZNJeXqmUzKVXMqwt8/f4ZbLxjEginBFfdM2mbpIBtzLVy4cLO7l3Xb0N27fADLgR/Ejd8CfLtdm6eBp4B84CygGhjR1XpLS0u9t8rLy3u9bCopV89kUq77nnnDp9/1tNeePNP/geJk0jZLB9mYC9jk3dRtd0+oW+YAMCVufHJsWrwaYLW7t7r7XqJH8TMTWLdIRli3s5bLpoxg3LDCoKOIJCSR4r4RmGlmZ5nZIGAFsLpdm58DCwDMbAxwLrAniTlFAnPwrTO8euAEiy8YH3QUkYR1W9zdPQTcCawFKoDH3X2Hmd1rZjfEmq0FjpnZTqAc+F/ufixVoUX602931gKweLa+uCSZI6FvYrj7GmBNu2n3xA078IXYQySrrNt5mBnjijh7bPp9MCfSGX1DVaQLJxpbeXFPvY7aJeOouIt04Xe7aglHXP3tknFU3EW6sG5HLSXFBVw8aXjQUUR6RMVdpBNNrWF+/8YR3je7hJwcCzqOSI+ouIt04vnKozS2hFk8W10yknlU3EU6sW5HLcMK8rji7NFBRxHpMRV3kQ6EI84zFbUsPH8cg/L0ZyKZR3utSAe27D/OsdMtLL5Ap0BKZlJxF+lA27Xbrzl3bNBRRHpFxV2kHde12yULqLiLtHOgwdl3rFFnyUhGU3EXaWdLXQgzuG62bqcnmUvFXaSdLbVhXbtdMp6Ku0icg2+doepkRNeSkYyn4i4SR9dul2yh4i4SZ93Ow0wcarp2u2Q8FXeRmLZrt88pSegeNiJpTcVdJKbt2u1zxuUGHUWkz1TcRWLart0+fbj+LCTzaS8WIXrt9g27jnDdrBJyTNdul8yn4i4C/GH3Uc60hnm/ToGULKHiLkL0LJlhhbp2u2QPFXcZ8ELhCM9U1HGtrt0uWUR7sgx4m/cdp/50iy4UJllFxV0GvHU7axmUl8M15+na7ZI9VNxlQHN31u44zNUzxlBUoC8vSfZQcZcBreLQKWqOn9G1ZCTrqLjLgLZu52HMYNEsFXfJLiruMqCt3VFL2bSRjB1WEHQUkaRScZcBq7q+kYpDJ3WWjGSlhIq7mS0xs11mVmlmd3XR7i/MzM2sLHkRRVJjXdu12y9Ql4xkn26Lu5nlAvcDS4HZwM1mNruDdsOAzwEvJTukSCqs3XGY88cPY9rooUFHEUm6RI7c5wKV7r7H3VuAVcCyDtp9Hfgm0JTEfCIpcayhmU1V9TpLRrJWIsV9ElAdN14Tm/Y2M5sDTHH3XyUxm0jKrK+oI+LoXqmStczdu25gthxY4u63x8ZvAea5+52x8Rzgd8Ct7l5lZhuAL7r7pg7WtRJYCVBSUlK6atWqXoVuaGigqCj9boOmXD0TZK5/39xEzakI/3LNYKzdJX7TdXtB+mZTrp7pS66FCxdudvfuP9d09y4fwHxgbdz43cDdcePDgaNAVezRBBwEyrpab2lpqfdWeXl5r5dNJeXqmaBynTjT4jO/vMbv/eWODuen6/ZyT99sytUzfckFbPJu6ra7J9QtsxGYaWZnmdkgY
AWwOu7N4YS7j3H36e4+HXgRuME7OHIXSQfrK2ppCUe4/qIJQUcRSZlui7u7h4A7gbVABfC4u+8ws3vN7IZUBxRJtl9tP8z44kIumzIi6CgiKZPQlZLcfQ2wpt20ezppu6DvsURS41RTK8/uPsJH500lJ0e305PspW+oyoCyvqKOllCED6hLRrKcirsMKGtePcT44kLmTB0ZdBSRlFJxlwGjoTnEhjeOsOTC8eqSkayn4i4DxvqKWlpCOktGBgYVdxkw1rx6iHHDCiibpi4ZyX4q7jIgnG4OsWHXEZaqS0YGCBV3GRDWv15Hs7pkZABRcZcB4RdbDzC+uJDLp48KOopIv1Bxl6xXf7qF379xhBsunaguGRkwVNwl6/3q1UOEIs6ySycGHUWk36i4S9b7xdYDzBxXxOwJxUFHEek3Ku6S1arrG9m07zg3XjbpT67bLpLNVNwlq61+5SAAN1yiLhkZWFTcJWu5Oz/feoCyaSOZMmpI0HFE+pWKu2StikOn2F3XoA9SZUBScZes9bMtNeTnGh+4WMVdBh4Vd8lKLaEIT209wHWzShg1dFDQcUT6nYq7ZKX1FbXUn27hpsunBB1FJBAq7pKVHttUzfjiQt47c2zQUUQCoeIuWefQiTM8+8YRlpdOJleXG5ABSsVdss6TWw4Qcfhw2eSgo4gERsVdskok4jy+qZorzh7FtNFDg44jEhgVd8kqL+2tZ9+xRj6iD1JlgFNxl6zy6Ev7KC7MY8kFuimHDGwq7pI16k428ZvXDnNT2RQGD8oNOo5IoFTcJWv8+OX9hCLOx66YFnQUkcCpuEtWaA1H+PFL+1lw3limj9EHqSIq7pIV1u44TN2pZj4+X0ftIqDiLlni4ReqmDpqCNecOy7oKCJpQcVdMt626rfYWHWcj8+fpm+kisSouEvGe+DZNxlWmMeKuVODjiKSNhIq7ma2xMx2mVmlmd3VwfwvmNlOM9tuZuvNTB2f0i+qjp7mN68d5mNXTKOoIC/oOCJpo9vibma5wP3AUmA2cLOZzW7XbCtQ5u4XAz8F/jnZQUU68oM/7CEvJ4dPXjk96CgiaSWRI/e5QKW773H3FmAVsCy+gbuXu3tjbPRFQFdskpQ71tDME5tquPGyiYwrLgw6jkhaMXfvuoHZcmCJu98eG78FmOfud3bS/tvAYXf/xw7mrQRWApSUlJSuWrWqV6EbGhooKirq1bKppFw909dcT+xqYc3eVv7p6sFMLErex0fpur0gfbMpV8/0JdfChQs3u3tZtw3dvcsHsBz4Qdz4LcC3O2n7MaJH7gXdrbe0tNR7q7y8vNfLppJy9Uxfch091eSz/vev/W8f3Zy8QDHpur3c0zebcvVMX3IBm7yb+uruJPIJ1AEg/hJ7k2PT3sXMrgO+Alzj7s0JrFek1x54bg9nWsN8btHMoKOIpKVE/pfdCMw0s7PMbBCwAlgd38DMLgP+E7jB3euSH1PkHUcbmvnRC/v40MUTmVkyLOg4Immp2+Lu7iHgTmAtUAE87u47zOxeM7sh1uxbQBHwhJltM7PVnaxOpM8eeHYPzaEwn9VRu0inEjox2N3XAGvaTbsnbvi6JOcS6VB1fSMPvVDFjZdOYsa49PugTCRd6BuqklG+tXYXBnzx/ecFHUUkram4S8bYVv0Wq185yO3vOYuJIwYHHUckram4S0Zwd/7x6Z2MKRrEHQtmBB1HJO2puEtGWP3KQTbtO87/eN+5uoaMSAJU3CXtnWhs5etP7+SSycNZcbmu/CiSCB0CSdr7xm9e53hjKw/fNlfXaxdJkI7cJa1tqqrnJy/v57arpnPBxOFBxxHJGCrukrYaW0J86afbmTRiMJ+/7tyg44hkFHXLSNr6p19VsPfYaR69fR5D9SGqSI/oyF3S0vqKWh59aT8r33M2V54zJug4IhlHxV3SzuETTXzpp9uZNaGYLyxWd4xIb6i4S1ppDoW549HNnGkNc9+KSynIyw06kkhGUkempJWv/XInW
/e/xXc/OkeX8xXpAx25S9r47xf38eOX9nPHgnNYetGEoOOIZDQVd0kLa3cc5p5fvMa154/ji4t1xUeRvlJxl8BtrKrnsz/ZysWTR/Dtv7xM30IVSQIVdwnUlv3Hue2hjUwaOZgHb72cIYP0MZBIMugvSQKzqz7Mfb97iTHDCnjkU/MYNXRQ0JFEsoaO3CUQG3bV8a+bmygZXsjjn57PJN18QySpVNyl3z38QhW3PbSRkiE5PLZyPiXFhUFHEsk66paRftMcCvOPT1fwyIv7uG7WOJZPamDssIKgY4lkJR25S7/Ye/Q0f/HdF3jkxX2sfO/Z/OctZRTm6awYkVTRkbukVCTi/Pjl/fzfNRXk5+Xw/Y+X8b7ZJUHHEsl6Ku6SMrtrT/Hlp15lY9Vxrpoxmm8tv4SJ+uBUpF+ouEvS1Z5s4v898waPbaxmWGE+31p+MctLJ2OmbhiR/qLiLklTe7KJ/3q+iodfqCIUifDx+dP5zLUzGF2kD01F+puKu/TZjoMn+K/nq/jFtgOEI84HL57I/1x8LtNGDw06msiApeIuvXK0oZnV2w7y08017Dx0ksH5ufzl3KncdvVZKuoiaUDFXRLi7uw5eppndtbyTEUtm/cdJ+Jw0aThfO2GC1h26URGDNHlA0TShYq7dCgccfYcaWBj1XFe3nuMl/fWc/BEEwAXTCzmM9fO5PqLJnDeeN1QQyQdJVTczWwJ8B9ALvADd/9Gu/kFwI+AUuAY8BF3r0puVEkFd6fuVDNVR0/z5pHT7Dh4gp2HTvL6oVOcaQ0DMHZYAXPPGsUdZ49m0fnjdDqjSAbotribWS5wP/A+oAbYaGar3X1nXLNPAcfdfYaZrQC+CXwkFYElcaFwhBNnWqk+FWHDrjrqTjVz5FQztSebOHSiif3HGtlXf5qm1sjbywwrzGP2hGJWzJ3CBROHUzZtJNNGD9FpjCIZJpEj97lApbvvATCzVcAyIL64LwO+Ghv+KfBtMzN39yRmzViRiBOKOOGIE4pEYj+j463hd4+HwrHpkQitoQhNoQhnWsI0h8KcaQnT1BrmTGuEptbw24+G5jAnzrRysqmVk2dij6YQDc2hd0I8v/HtweGD8ykpLmDqqKG8Z+YYpo0ZyvTRQ5g+eiiTRw5WIRfJAokU90lAddx4DTCvszbuHjKzE8Bo4GgyQsZ7fGM1//5cI4M3b4hOcPDo89L2TuIOjkd/xr29tLVpm/9O27Z27ad53Ly453DinuudNuFwGHvm17y9JodQJEIkRW9xBXk5FObnMnRQLsWD8xk+OJ8po4YwfHA+xYXR8eGD8zhS/SbXzp/DuGGFjB1WQGF+bmoCiUja6NcPVM1sJbASoKSkhA0bNvR4HQfqQowfHCE/t+md9QJtB5sWN9Ew4o9Bzd6ZH39wam3Lxc/vaLzd89g7T4QBra3OoEG573rO3Jxccg1yDPIMcsyi4zm8PT3XINesw2mDcok+ctqGoz/zc6LrekcEaI494rTCmKImTu3dzingza42bj9raGjo1T6QaumaC9I3m3L1TL/kcvcuH8B8YG3c+N3A3e3arAXmx4bziB6xW1frLS0t9d4qLy/v9bKppFw9o1w9l67ZlKtn+pIL2OTd1G13T+iSvxuBmWZ2lpkNAlYAq9u1WQ18Ija8HPhdLISIiASg224Zj/ah30n06DwXeNDdd5jZvUTfQVYDPwQeMbNKoJ7oG4CIiAQkoT53d18DrGk37Z644Sbgw8mNJiIivaU7MYmIZCEVdxGRLKTiLiKShVTcRUSykIq7iEgWsqBORzezI8C+Xi4+hhRc2iAJlKtnlKvn0jWbcvVMX3JNc/ex3TUKrLj3hZltcveyoHO0p1w9o1w9l67ZlKtn+iOXumVERLKQiruISBbK1OL+QNABOqFcPaNcPZeu2ZSrZ1KeKyP73EVEpGuZeuQuIiJdSNvibmYfNrMdZhYxs7J28+42s0oz22Vm7+9k+bPM7KVYu8dilytOdsbHzGxb7
FFlZts6aVdlZq/G2m1Kdo4Onu+rZnYgLtv1nbRbEtuGlWZ2Vz/k+paZvW5m283sKTMb0Um7ftle3f3+ZlYQe40rY/vS9FRliXvOKWZWbmY7Y/v/5zpos8DMTsS9vvd0tK4UZOvydbGo+2Lba7uZzemHTOfFbYdtZnbSzD7frk2/bS8ze9DM6szstbhpo8zst2a2O/ZzZCfLfiLWZreZfaKjNj2SyEXfg3gAs4DzgA1AWdz02cArQAFwFtGbC+V2sPzjwIrY8PeAO1Kc91+BezqZVwWM6cdt91Xgi920yY1tu7OBQbFtOjvFuRYDebHhbwLfDGp7JfL7A38DfC82vAJ4rB9euwnAnNjwMOCNDnItAJ7ur/0p0dcFuB74NdEbk10BvNTP+XKBw0TPAw9kewHvBeYAr8VN+2fgrtjwXR3t98AoYE/s58jY8Mi+ZEnbI3d3r3D3XR3MWgascvdmd98LVBK9iffbLHqH52uJ3qwb4GHgxlRljT3fTcBPUvUcKfD2jc/dvQVou/F5yrj7Ondvu2v3i8DkVD5fNxL5/ZcR3Xcgui8tshTfPdzdD7n7ltjwKaCC6D2KM8Ey4Ece9SIwwswm9OPzLwLedPfefjmyz9z9WaL3tIgXvx91VoveD/zW3evd/TjwW2BJX7KkbXHvQkc37G6/848G3oorJB21Sab3ALXuvruT+Q6sM7PNsfvI9oc7Y/8aP9jJv4GJbMdUuo3oUV5H+mN7JfL7v+vG70Dbjd/7Rawb6DLgpQ5mzzezV8zs12Z2QT9F6u51CXqfWkHnB1hBbK82Je5+KDZ8GCjpoE3St12/3iC7PTN7BhjfwayvuPsv+jtPRxLMeDNdH7Vf7e4HzGwc8Fszez32Dp+SXMB3ga8T/WP8OtEuo9v68nzJyNW2vczsK0AIeLST1SR9e2UaMysCfgZ83t1Ptpu9hWjXQ0Ps85SfAzP7IVbavi6xz9RuIHqP5/aC2l5/wt3dzPrlFMVAi7u7X9eLxQ4AU+LGJ8emxTtG9F/CvNgRV0dtkpLRzPKAPwdKu1jHgdjPOjN7imiXQJ/+KBLddmb2feDpDmYlsh2TnsvMbgU+CCzyWGdjB+tI+vbqQCK/f1ubmtjrPJzovpVSZpZPtLA/6u5+OtfeAAAB6UlEQVRPtp8fX+zdfY2ZfcfMxrh7Sq+hksDrkpJ9KkFLgS3uXtt+RlDbK06tmU1w90Oxbqq6DtocIPrZQJvJRD9v7LVM7JZZDayInclwFtF34JfjG8SKRjnRm3VD9ObdqfpP4DrgdXev6WimmQ01s2Ftw0Q/VHyto7bJ0q6f8886eb5Ebnye7FxLgC8BN7h7Yydt+mt7peWN32N9+j8EKtz93zppM76t79/M5hL9O07pm06Cr8tq4OOxs2auAE7EdUekWqf/PQexvdqJ3486q0VrgcVmNjLWjbo4Nq33+uMT5N48iBalGqAZqAXWxs37CtEzHXYBS+OmrwEmxobPJlr0K4EngIIU5XwI+Ot20yYCa+JyvBJ77CDaPZHqbfcI8CqwPbZjTWifKzZ+PdGzMd7sp1yVRPsVt8Ue32ufqz+3V0e/P3Av0TcfgMLYvlMZ25fO7odtdDXR7rTtcdvpeuCv2/Yz4M7YtnmF6AfTV/ZDrg5fl3a5DLg/tj1fJe4stxRnG0q0WA+PmxbI9iL6BnMIaI3Vr08R/ZxmPbAbeAYYFWtbBvwgbtnbYvtaJfDJvmbRN1RFRLJQJnbLiIhIN1TcRUSykIq7iEgWUnEXEclCKu4iIllIxV1EJAupuIuIZCEVdxGRLPT/AWDnkQWjIdEtAAAAAElFTkSuQmCC\n",
47 | "text/plain": [
48 | ""
49 | ]
50 | },
51 | "metadata": {},
52 | "output_type": "display_data"
53 | }
54 | ],
55 | "source": [
56 | "x = np.linspace(-10,10,500)\n",
57 | "y = sigmoid(x)\n",
58 | "plt.plot(x,y)\n",
59 | "plt.grid(True)\n",
60 | "plt.show()"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": []
69 | }
70 | ],
71 | "metadata": {
72 | "kernelspec": {
73 | "display_name": "Python 2",
74 | "language": "python",
75 | "name": "python2"
76 | },
77 | "language_info": {
78 | "codemirror_mode": {
79 | "name": "ipython",
80 | "version": 2
81 | },
82 | "file_extension": ".py",
83 | "mimetype": "text/x-python",
84 | "name": "python",
85 | "nbconvert_exporter": "python",
86 | "pygments_lexer": "ipython2",
87 | "version": "2.7.6"
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 0
92 | }
93 |
--------------------------------------------------------------------------------
/c6_logistic_regression/04_implement_logistic_regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 实现逻辑回归"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "from sklearn import datasets\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "iris = datasets.load_iris()\n",
30 | "X = iris.data\n",
31 | "y = iris.target"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "((100, 2), (100,))"
43 | ]
44 | },
45 | "execution_count": 3,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "# 我们知道逻辑回归是解决2分类问题的,但鸢尾花数据集有3个分类,所以我们需要只取其中2个分类\n",
52 | "# 另外为了可视化,我们只取其中2个特征而不是全部特征\n",
53 | "X = X[y<2,:2]\n",
54 | "y = y[y<2]\n",
55 | "X.shape, y.shape"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFzRJREFUeJzt3X2MXFd5x/Hf45kUMG+RyAqi+GUrgagAhRCvQigIhdhUIVjmD6iaaikNauXiDSW0VLw0UqpaQqhCokDBRiujKqndEhqgDSilDYEW+gep1iEJBNMqUDuJS5uNKUlTt6lsP/3j3sW7s7Mz98zMmTnnzPcjXe3MnZO7z7n3+sndc597xtxdAICybJp0AACA0SO5A0CBSO4AUCCSOwAUiOQOAAUiuQNAgUjuAFAgkjsAFIjkDgAFajdtaGYtSUuSTrr77o7Prpf0UUkn61WfcvdDvbZ30UUX+ezsbFCwADDtjh49+ri7z/Rr1zi5S7pR0jFJz9vg89vc/d1NNzY7O6ulpaWAXw8AMLMTTdo1GpYxsy2S3iyp59U4ACANTcfcPy7p/ZLO9WjzVjN7wMxuN7Ot3RqY2V4zWzKzpeXl5dBYAQAN9U3uZrZb0mPufrRHsy9LmnX3SyXdJemWbo3cfdHd59x9bmam75ARAGBATa7cXytpj5kdl/Q5SVeb2eHVDdz9lLs/Xb89JGnHSKMEAATpm9zd/UPuvsXdZyVdJ+nr7v721W3M7OJVb/eouvEKAJiQkGqZNcxsv6Qld79D0nvMbI+kM5J+Iun60YQHABhE0ENM7v73KzXu7n5zndhXru5f7u6vdPc3uPsPYgQLTMSRI9LsrLRpU/XzyJFJRwT0NfCVOzAVjhyR9u6VTp+u3p84Ub2XpPn5ycUF9MH0A0AvN910PrGvOH26Wg8kjOQO9PLww2HrgUSQ3IFetm0LWw8kguQO9PLhD0ubN69dt3lztR5IGMkd6GV+XlpclLZvl8yqn4uL3ExF8qiWAfqZnyeZIztcuQNAgUjuAFAgkjsAFIjkDgAFIrkDQIFI7gBQIJI7ABSI5A4ABSK5A0CBSO4oB1+qAfwM0w+gDHypBrAGV+4oA1+qAaxBckcZ+FINYA2SO8rAl2oAa5DcUQa+VANYg+SOMvClGsAaVMugHHypBvAzXLljeNSXA8nhyh3Dob4cSBJX7hgO9eVAkkjuGA715UCSSO4YDvXlQJJI7hgO9eVAkkjuGA715UCSGlfLmFlL0pKkk+6+u+OzZ0i6VdIOSack/Yq7Hx9hnEgZ9eVAckKu3G+UdGyDz35D0n+6+4sl/bGkPxo2MCBL1PwjEY2Su5ltkfRmSYc2aPIWSbfUr2+XtNPMbPjwgIys1PyfOCG5n6/5J8FjAppeuX9c0vslndvg80skPSJJ7n5G0hOSXjB0dEBOqPlHQvomdzPbLekxdz867C8zs71mtmRmS8vLy8NuDkgLNf9ISJMr99dK2mNmxyV9TtLVZna4o81JSVslyczakp6v6sbqGu6+6O5z7j43MzMzVOBAcqj5R0L6Jnd3/5C7b3H3WUnXSfq6u7+9o9kdkn69fv22uo2PNFIgddT8IyED17mb2X4z21O//aykF5jZQ5J+V9IHRxEckBVq/pEQm9QF9tzcnC8tLU3kdwNArszsqLvP9WvHE6pI18KC1G5XV8HtdvUeQCPM5440LSxIBw+ef3/27Pn3Bw5MJiYgI1y5I02Li2HrAaxBckeazp4NWw9gDZI70tRqha0HsAbJHWla+R7WpusBrMENVaRp5abp4mI1FNNqVYmdm6lAIyR3pOvAAZI5MCCGZdDdrl1VffnKsmvXpCOaHOZoR4ZI7lhv1y7p7rvXrrv77ulM8MzRjkwx/QDW6/U9K9M2H9zsbJXQO23fLh0/Pu5oAKYfAEaCOdqRKZI70AtztCNTJHest3Nn2PqSMUc7MkVyx3pf+9r6R
L5zZ7V+2jBHOzLFDVUAyAg3VDGcWLXdIdulvhwYGE+oYr2V2u7Tp6v3K7Xd0nDDESHbjRUDMCUYlsF6sWq7Q7ZLfTnQFcMyGFys2u6Q7VJfDgyF5I71YtV2h2yX+nJgKCR3rBertjtku9SXA0MhuWO9WLXdIdulvhwYCjdUASAj3FCNLcca7BxjBjAQ6twHkWMNdo4xAxgYwzKDyLEGO8eYAazDsExMOdZg5xgzgIGR3AeRYw12jjEDGBjJfRA51mDnGDOAgZHcB5FjDXaOMQMYWN8bqmb2TEnflPQMVdU1t7v7H3S0uV7SRyWdrFd9yt0P9dpu1jdUAWBCRnlD9WlJV7v7KyVdJukaM7uyS7vb3P2yeumZ2DEhCwtSu11dubfb1ftRtE2lfj6VOIAE9K1z9+rS/qn67QX1Mpn6SQxuYUE6ePD8+7Nnz78/cGDwtqnUz6cSB5CIRnXuZtaSdFTSiyV92t0/0PH59ZI+ImlZ0r9I+h13f6TXNhmWGbN2u0rSnVot6cyZwdumUj+fShxAZCOtc3f3s+5+maQtkq4ws1d0NPmypFl3v1TSXZJu2SCovWa2ZGZLy8vLTX41RqVbst5ofUjbVOrnU4kDSERQtYy7/1TSNyRd07H+lLs/Xb89JGnHBv/9orvPufvczMzMIPFiUK1W8/UhbVOpn08lDiARfZO7mc2Y2YX162dJeqOkH3S0uXjV2z2Sjo0ySIzAyvhzk/UhbVOpn08lDiAV7t5zkXSppO9IekDS9yTdXK/fL2lP/fojkh6UdL+qK/tf6LfdHTt2OMZs3z73Vstdqn7u2zeatocPu2/f7m5W/Tx8eNSRN5NKHEBEkpa8T351dyYOA4CcMHFYbLFqqkPqy2NuO6R/Oe6LzFDCj2BNLu9jLFkPyxw+7L55czVksbJs3jz8MMC+fWu3ubL0GhKJse2Q/uW4LzITaxcjT2JYJqJYNdUh9eUxtx3Svxz3RWYo4cdqTYdlSO6D2LSpuoDqZCadOzf4ds02/mzY4xSy7ZD+5bgvMhNrFyNPjLnHFKumOqS+POa2Q/qX477IDCX8GATJfRCxaqpD6stjbjukfznui8xQwo+BNBmYj7FkfUPVPV5NdUh9ecxth/Qvx32RGUr4sULcUAWA8jDmjvVSqF1H1jgt8tF3PncUImS+c+ZGRxecFnlhWGZapFC7jqxxWqSBYRmsFTLfOXOjowtOi7yQ3KdFCrXryBqnRV5I7tMihdp1ZI3TIi8k92kxPy8tLlYDpGbVz8XF7nfCQtpianBa5IUbqgCQEW6orohVmBuy3VTmJadIOSmlH47S+xdiIvuiyWOsMZaxTD8QayLskO2mMi85k4InpfTDUXr/Qox6X4jpBxSvMDdku6nMS06RclJKPxyl9y/EqPcF87lL8SbCDtluKvOSMyl4Uko/HKX3L8So9wVj7lK8wtyQ7aYyLzlFykkp/XCU3r8Qk9oXZSf3WIW5IdtNZV5yipSTUvrhKL1/ISa2L5oMzMdYxjafe6yJsEO2m8q85EwKnpTSD0fp/Qsxyn0hbqgCQHkYc48thfr5XbuquzIry65do4kBKEisx0ySr+NvcnkfY8n6a/ZSqJ/fubN7/fzOncPFABQk1mMmk6zjF8MyEaVQP59KiSWQsFiPmUyyjp9hmZhiTWzNhNnASHVL7L3WN5XDP1WS+yBSqJ8H0Fesx0xy+KdKch9ECvXzO3d238ZG64EpFOsxkyzq+JsMzMdYsr6h6p5G/XznTVVupgLrxHrMZFJ1/OKGKgCUZ2Q3VM3smWb2T2Z2v5k9aGZ/2KXNM8zsNjN7yMzuMbPZwcJuILS4NPli1A4hRbmF74uY4cbczU3F7F9mhzpI4af96PS7tJdkkp5Tv75A0j2SruxosyDpM/Xr6yTd1m+7Aw3LhBaX5japdEhRbuH7Ima4MXdzUzH7l9mhDlL4ad+IGg7LBI2TS9os6V5Jr+5Y/7eSXlO/bkt6XPV0whstAyX37du7/
6vcvn007SdtZWCwc2m11rctfF/EDDfmbm4qZv8yO9RBCj/tG2ma3BuNuZtZS9JRSS+W9Gl3/0DH59+TdI27P1q//2H9P4DHO9rtlbRXkrZt27bjRLenAHoJnRg5t0mlQx5MKnxfxAw35m5uKmb/MjvUQQo/7RsZ6UNM7n7W3S+TtEXSFWb2ikGCcvdFd59z97mZmZnwDYQWl+ZQjLpaSFFu4fsiZrgxd3NTMfuX2aEOUvhpP1JBde7u/lNJ35B0TcdHJyVtlSQza0t6vqRTowhwjdDi0iyKUVcJKcotfF/EDDfmbm4qZv8yO9RBCj/tR6vfuI2kGUkX1q+fJelbknZ3tLlBa2+ofr7fdgeucw8tLs1tUumQotzC90XMcGPu5qZi9i+zQx2k8NO+L41qzN3MLpV0i6SWqiv9z7v7fjPbX/+SO8zsmZL+TNKrJP1E0nXu/qNe26XOHQDCNR1zb/dr4O4PqEranetvXvX6fyX9cmiQAIA4yp9bZmqfYEAvIadFCqdQzAd3cntIK4XjkYUmYzcxlrHMLVPiEwwYWshpkcIpFPPBndwe0krheEyamFtGk51RH8kKOS1SOIVCY0ihf7ltNydNx9zLTu4lPsGAoYWcFimcQjEf3MntIa0Ujsek8U1M0nQ/wYANhZwWKZxCMR/cye0hrRSORy7KTu5T/QQDNhJyWqRwCsV8cCe3h7RSOB7ZaDIwH2MZ25d1lPYEA0Yi5LRI4RSK+eBObg9ppXA8JkncUAWA8jDmDoxIyBd7pCK3mFOpXU8ljpFocnkfY8n+O1QxFUK+2CMVucWcSu16KnH0I4ZlgOG129LZs+vXt1rSmTPjj6eJ3GJOpXY9lTj6YVgGGIFuSbLX+hTkFvPDD4etLz2OUSG5Az2EfLFHKnKLOZXa9VTiGBWSO9BDyBd7pCK3mFOpXU8ljpFpMjAfY+GGKnIR8sUeqcgt5lRq11OJoxdxQxUAysMNVYxNjrXBsWKOVV+e4z7GhDW5vI+xMCxThlxqg1eLFXOs+vIc9zHiEcMyGIdcaoNXixVzrPryHPcx4mFYBmORY21wrJhj1ZfnuI8xeSR3DCXH2uBYMceqL89xH2PySO4YSo61wbFijlVfnuM+RgKaDMzHWLihWo4caoM7xYo5Vn15jvsYcYgbqgBQHm6oYurEqgUP2S716EhFe9IBAKNw5Eg1tn36dPX+xInzY93z8+PZbqwYgEEwLIMixKoFD9ku9egYB4ZlMFVi1YKHbJd6dKSE5I4ixKoFD9ku9ehICckdRYhVCx6yXerRkRKSO4owPy8tLlbj22bVz8XF4W9khmw3VgzAIPreUDWzrZJulfRCSS5p0d0/0dHmKkl/Lelf61VfdPf9vbbLDVUACDfKG6pnJL3P3V8m6UpJN5jZy7q0+5a7X1YvPRM70pdjvTb16PGx3zLS5DHW1YuqK/Q3dqy7StJXQrbD9APpynH+8JCYc+xfCthvaVCM6QfMbFbSNyW9wt2fXLX+KklfkPSopH+T9Hvu/mCvbTEsk64c67WpR4+P/ZaGpsMyjZO7mT1H0j9I+rC7f7Hjs+dJOufuT5nZtZI+4e4v6bKNvZL2StK2bdt2nOh2pmDiNm2qrss6mUnnzo0/niZCYs6xfylgv6VhpA8xmdkFqq7Mj3Qmdkly9yfd/an69Z2SLjCzi7q0W3T3OXefm5mZafKrMQE51mtTjx4f+y0vfZO7mZmkz0o65u4f26DNi+p2MrMr6u2eGmWgGJ8c67WpR4+P/ZaZfoPykl6nqgTyAUn31cu1kt4l6V11m3dLelDS/ZK+LekX+22XG6ppy3H+8JCYc+xfCthvkyfmcweA8jBx2BSg5nithQWp3a5u8LXb1XtgWjGfe6aYO3ythQXp4MHz78+ePf/+wIHJxARMEsMymaLmeK12u0ronVot6cyZ8ccDxMKwTOGYO3ytbom913qgdCT3TFFzvFarFbYeKB3JPVPUHK+1cr+h6XqgdCT3TDF3+
FoHDkj79p2/Um+1qvfcTMW04oYqAGSEG6qDKLxwvPDuFd+/FLCPM9LkMdYYS3LTDxQ+WXXh3Su+fylgH6dBTD8QqPDC8cK7V3z/UsA+TsPI53MfteSSe+GTVRfeveL7lwL2cRoYcw9VeOF44d0rvn8pYB/nheS+ovDC8cK7V3z/UsA+zgvJfUXhheOFd6/4/qWAfZwXxtwBICOMuQMFiVlfTu16mZjPHUhczLn7+V6AcjEsAyQuZn05tev5YVgGKETMufv5XoBykdyBxMWsL6d2vVwkdyBxMevLqV0vF8kdSFzM+nJq18vFDVUAyAg3VAFgipHcAaBAJHcAKBDJHQAKRHIHgAKR3AGgQCR3ACgQyR0ACtQ3uZvZVjP7hpl938weNLMbu7QxM/ukmT1kZg+Y2eVxwsUwmLcbmB5N5nM/I+l97n6vmT1X0lEzu8vdv7+qzZskvaReXi3pYP0TiWDebmC69L1yd/cfu/u99ev/knRM0iUdzd4i6VavfFvShWZ28cijxcBuuul8Yl9x+nS1HkB5gsbczWxW0qsk3dPx0SWSHln1/lGt/x+AzGyvmS2Z2dLy8nJYpBgK83YD06Vxcjez50j6gqT3uvuTg/wyd1909zl3n5uZmRlkExgQ83YD06VRcjezC1Ql9iPu/sUuTU5K2rrq/ZZ6HRLBvN3AdGlSLWOSPivpmLt/bINmd0h6R101c6WkJ9z9xyOME0Ni3m5gujSplnmtpF+T9F0zu69e9/uStkmSu39G0p2SrpX0kKTTkt45+lAxrPl5kjkwLfomd3f/R0nWp41LumFUQQEAhsMTqgBQIJI7ABSI5A4ABSK5A0CBSO4AUCCSOwAUiOQOAAWyqkR9Ar/YbFnSiYn88v4ukvT4pIOIiP7lq+S+SfSvie3u3ndyrokl95SZ2ZK7z006jljoX75K7ptE/0aJYRkAKBDJHQAKRHLvbnHSAURG//JVct8k+jcyjLkDQIG4cgeAAk11cjezlpl9x8y+0uWz681s2czuq5ffnESMwzCz42b23Tr+pS6fm5l90sweMrMHzOzyScQ5iAZ9u8rMnlh1/G6eRJyDMrMLzex2M/uBmR0zs9d0fJ7tsZMa9S/b42dmL10V931m9qSZvbejTfTj1+TLOkp2o6Rjkp63wee3ufu7xxhPDG9w943qat8k6SX18mpJB+ufuejVN0n6lrvvHls0o/UJSV9197eZ2c9J6viSxOyPXb/+SZkeP3f/Z0mXSdUFpKqvHP1SR7Pox29qr9zNbIukN0s6NOlYJugtkm71yrclXWhmF086qGlnZs+X9HpVX28pd/8/d/9pR7Nsj13D/pVip6QfunvnA5vRj9/UJndJH5f0fknnerR5a/0n0+1mtrVHu1S5pL8zs6NmtrfL55dIemTV+0frdTno1zdJeo2Z3W9mf2NmLx9ncEP6eUnLkv60HjY8ZGbP7miT87Fr0j8p3+O32nWS/qLL+ujHbyqTu5ntlvSYux/t0ezLkmbd/VJJd0m6ZSzBjdbr3P1yVX8C3mBmr590QCPUr2/3qnpM+5WS/kTSX407wCG0JV0u6aC7v0rSf0v64GRDGqkm/cv5+EmS6uGmPZL+chK/fyqTu6ov/d5jZsclfU7S1WZ2eHUDdz/l7k/Xbw9J2jHeEIfn7ifrn4+pGvO7oqPJSUmr/yLZUq9LXr++ufuT7v5U/fpOSReY2UVjD3Qwj0p61N3vqd/frioZrpbtsVOD/mV+/Fa8SdK97v4fXT6LfvymMrm7+4fcfYu7z6r6s+nr7v721W06xr/2qLrxmg0ze7aZPXfltaRfkvS9jmZ3SHpHfef+SklPuPuPxxxqsCZ9M7MXmZnVr69Qda6fGnesg3D3f5f0iJm9tF61U9L3O5pleeykZv3L+fit8qvqPiQjjeH4TXu1zBpmtl/SkrvfIek9ZrZH0hlJP5F0/SRjG8ALJX2p/vfRlvTn7v5VM3uXJLn7ZyTdKelaSQ9JOi3pnROKN
VSTvr1N0j4zOyPpfyRd53k9sffbko7Uf9r/SNI7Czl2K/r1L+vjV190vFHSb61aN9bjxxOqAFCgqRyWAYDSkdwBoEAkdwAoEMkdAApEcgeAApHcAaBAJHcAKBDJHQAK9P9IUj1h6gimcQAAAABJRU5ErkJggg==\n",
66 | "text/plain": [
67 | ""
68 | ]
69 | },
70 | "metadata": {},
71 | "output_type": "display_data"
72 | }
73 | ],
74 | "source": [
75 | "# 分类0的散点图\n",
76 | "plt.scatter(X[y==0,0], X[y==0,1], color='red')\n",
77 | "\n",
78 | "# 分类1的散点图\n",
79 | "plt.scatter(X[y==1,0], X[y==1,1], color='blue')\n",
80 | "plt.show()"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## 使用我们自己编写的逻辑回归"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 5,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "from playML.model_selection import train_test_split\n",
97 | "\n",
98 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 6,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/plain": [
109 | "LogisticRegression()"
110 | ]
111 | },
112 | "execution_count": 6,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "from playML.logistic_regression import LogisticRegression\n",
119 | "\n",
120 | "log_reg = LogisticRegression()\n",
121 | "log_reg.fit(X_train, y_train)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 7,
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "data": {
131 | "text/plain": [
132 | "1.0"
133 | ]
134 | },
135 | "execution_count": 7,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "log_reg.score(X_test, y_test)"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "评分结果不错,不过这当然是因为我们的数据很简单。"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 8,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "array([ 0.92972035, 0.98664939, 0.14852024, 0.17601199, 0.0369836 ,\n 0.0186637 , 0.04936918, 0.99669244, 0.97993941, 0.74524655,\n 0.04473194, 0.00339285, 0.26131273, 0.0369836 , 0.84192923,\n 0.79892262, 0.82890209, 0.32358166, 0.06535323, 0.20735334])"
160 | ]
161 | },
162 | "execution_count": 8,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "log_reg.predict_proba(X_test)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 9,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])"
180 | ]
181 | },
182 | "execution_count": 9,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "y_test"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 10,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])"
200 | ]
201 | },
202 | "execution_count": 10,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "log_reg.predict(X_test)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": []
217 | }
218 | ],
219 | "metadata": {
220 | "kernelspec": {
221 | "display_name": "Python 2",
222 | "language": "python",
223 | "name": "python2"
224 | },
225 | "language_info": {
226 | "codemirror_mode": {
227 | "name": "ipython",
228 | "version": 2
229 | },
230 | "file_extension": ".py",
231 | "mimetype": "text/x-python",
232 | "name": "python",
233 | "nbconvert_exporter": "python",
234 | "pygments_lexer": "ipython2",
235 | "version": "2.7.6"
236 | }
237 | },
238 | "nbformat": 4,
239 | "nbformat_minor": 0
240 | }
241 |
--------------------------------------------------------------------------------
/c1_knn/04_Hyper_Parameter_kNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# 超参数"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib\n",
20 | "import matplotlib.pyplot as plt\n",
21 | "from sklearn import datasets"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## 一个识别手写数字的例子"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "digits = datasets.load_digits()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "data": {
47 | "text/plain": [
48 | "dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])"
49 | ]
50 | },
51 | "execution_count": 3,
52 | "metadata": {},
53 | "output_type": "execute_result"
54 | }
55 | ],
56 | "source": [
57 | "digits.keys()"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "Optical Recognition of Handwritten Digits Data Set\n===================================================\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 5620\n :Number of Attributes: 64\n :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n :Missing Attribute Values: None\n :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n Graduate Studies in Science and Engineering, Bogazici University.\n - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n Linear dimensionalityreduction using relevance weighted LDA. School of\n Electrical and Electronic Engineering Nanyang Technological University.\n 2005.\n - Claudio Gentile. 
A New Approximate Maximal Margin Classification\n Algorithm. NIPS. 2000.\n\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "print(digits.DESCR)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "array([[ 0., 0., 5., ..., 0., 0., 0.],\n [ 0., 0., 0., ..., 10., 0., 0.],\n [ 0., 0., 0., ..., 16., 9., 0.],\n ..., \n [ 0., 0., 1., ..., 6., 0., 0.],\n [ 0., 0., 2., ..., 12., 0., 0.],\n [ 0., 0., 10., ..., 12., 1., 0.]])"
86 | ]
87 | },
88 | "execution_count": 5,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "digits.data"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 6,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "(1797, 64)"
106 | ]
107 | },
108 | "execution_count": 6,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "np.shape(digits.data)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "特征"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 7,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "X = digits.data"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "分类"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 8,
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "data": {
147 | "text/plain": [
148 | "(1797,)"
149 | ]
150 | },
151 | "execution_count": 8,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "y = digits.target\n",
158 | "np.shape(y)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "## 可视化"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 9,
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "data": {
175 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPgAAAD8CAYAAABaQGkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAACu5JREFUeJzt3d2LXeUZhvH77qi0qTYDTVokid1BJCCFTmQTkBRjIpZYxeSgBwkoJhRypCgtiPZE+g9IelAEiU4EE6WNSkSsVtDRCq11kkxa82FJw5RM0GZCGfw4aIg+PZgViJIya7LX1zxcPwidj81+n01zudbsWVmvI0IAcvpG2wMAqA+BA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYFXU86ZIlS6LX69Xx1K2amZlpdL3JycnG1hoaGmpsreuvv76xtRYtWtTYWk2anJzU2bNnPdfjagm81+tpfHy8jqdu1f79+xtd77777mtsreHh4cbW2rt3b2NrjYyMNLZWk/r9fqnHcYoOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGKlAre90faHtk/YfqTuoQBUY87AbQ9J+q2kOyTdKGmr7RvrHgzA4MocwddIOhERJyPinKTnJW2qdywAVSgT+DJJpy76fKr4GoCOq+xNNts7bI/bHp+enq7qaQEMoEzgpyWtuOjz5cXXviIinoyIfkT0ly5dWtV8AAZQJvD3Jd1ge6XtqyRtkfRyvWMBqMKc/x48Is7bvl/S65KGJD0dEUdqnwzAwErd8CEiXpX0as2zAKgYV7IBiRE4kBiBA4kROJAYgQOJETiQGIEDiRE4kFgtO5tk9dhjj7U9Qm02b97c2Fq33nprY2tNTEw0tpY0u6tPl3AEBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK7OzydO2z9j+oImBAFSnzBF8t6SNNc8BoAZzBh4R70j6TwOzAKgYP4MDibF1EZBYZYGzdRHQPZyiA4mV+TXZc5L+LGmV7SnbP69/LABVKLM32dYmBgFQPU7RgcQIHEiMwIHECBxIjMCBxAgcSIzAgcQIHEhswW9dNDY21thahw8fbmwtSVq3bl1ja+3cubOxtWZmZhpbq8m/H5K0bdu2RtebC0dwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK3PTxRW237J91PYR2w82MRiAwZW5Fv28pF9GxEHb10g6YPuNiDha82wABlRmb7KPIuJg8fGnko5JWlb3YAAGN6+fwW33JK2W9N4lvsfWRUDHlA7c9tWSXpD0UER88vXvs3UR0D2lArd9pWbj3hMRL9Y7EoCqlHkX3ZKeknQsIh6vfyQAVSlzBF8r6V5JG2xPFH9+WvNcACpQZm+ydyW5gVkAVIwr2YDECBxIjMCBxAgcSIzAgcQIHEiMwIHECBxIjL3JOmxkZKTtEWrR6/UaW4u9yQCkReBAYgQOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJFbmpovftP1X24eLrYt+3cRgAAZX5lLV/0raEBGfFbdPftf2HyLiLzXPBmBAZW66GJI+Kz69svgTdQ4FoBplNz4Ysj0h6YykNyKCrYuABaBU4BHxRUSMSFouaY3tH17iMWxdBHTMvN5Fj4gZSW9J2ljPOACqVOZd9KW2h4uPvyXpdknH6x4MwODKvIt+raRnbA9p9j8Iv4uIV+odC0AVyryL/jfN7gkOYIHhSjYgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHElvwWxcNDw83ttbixYsbW0uS1q9f3+h6TWlyO6Em/350EUdwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCCx0oEX90Y/ZJv7sQELxHyO4A9KOlbXIACqV3Znk+WS7pS0q95xAFSp7BF8p6SHJX1Z4ywAKlZm44O7JJ2JiANzPI69yYCOKXMEXyvp
btuTkp6XtMH2s19/EHuTAd0zZ+AR8WhELI+InqQtkt6MiHtqnwzAwPg9OJDYvO7oEhFjksZqmQRA5TiCA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYgt+6qEm9Xq/R9TZt2tTYWvv3729srbfffruxtUZHRxtbq4s4ggOJETiQGIEDiRE4kBiBA4kROJAYgQOJETiQGIEDiZW6kq24o+qnkr6QdD4i+nUOBaAa87lUdX1EnK1tEgCV4xQdSKxs4CHpj7YP2N5R50AAqlP2FP3HEXHa9vckvWH7eES8c/EDivB3SNJ1111X8ZgALkepI3hEnC7+94yklyStucRj2LoI6Jgymw9+2/Y1Fz6W9BNJH9Q9GIDBlTlF/76kl2xfePzeiHit1qkAVGLOwCPipKQfNTALgIrxazIgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHEnNEVP6k/X4/xsfHK3/ethVX8zVm3bp1ja01MTHR2FpNbgE1NjbW2FqSNDw83Mg6/X5f4+Pjc/6F5AgOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRWKnDbw7b32T5u+5jtm+seDMDgyt4X/TeSXouIn9m+StKiGmcCUJE5A7e9WNItkrZJUkSck3Su3rEAVKHMKfpKSdOSRm0fsr2ruD86gI4rE/gVkm6S9ERErJb0uaRHvv4g2ztsj9sen56ernhMAJejTOBTkqYi4r3i832aDf4r2LoI6J45A4+IjyWdsr2q+NJtko7WOhWASpR9F/0BSXuKd9BPStpe30gAqlIq8IiYkNSveRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSI3AgsbKXqkLS6Ohoo+tt397cFcFN7oO2e/fuxtZqaq+wruIIDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRG4EBiBA4kNmfgtlfZnrjozye2H2piOACDmfNS1Yj4UNKIJNkeknRa0ks1zwWgAvM9Rb9N0j8j4l91DAOgWvMNfIuk5y71DbYuArqndODFpgd3S/r9pb7P1kVA98znCH6HpIMR8e+6hgFQrfkEvlX/5/QcQDeVCrzYD/x2SS/WOw6AKpXdm+xzSd+teRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSc0RU/6T2tKT5/pPSJZLOVj5MN2R9bbyu9vwgIub8V121BH45bI9HRL/tOeqQ9bXxurqPU3QgMQIHEutS4E+2PUCNsr42XlfHdeZncADV69IRHEDFOhG47Y22P7R9wvYjbc9TBdsrbL9l+6jtI7YfbHumKtkesn3I9ittz1Il28O299k+bvuY7ZvbnmkQrZ+iF/da/4dm7xgzJel9SVsj4mirgw3I9rWSro2Ig7avkXRA0uaF/rousP0LSX1J34mIu9qepyq2n5H0p4jYVdxodFFEzLQ91+XqwhF8jaQTEXEyIs5Jel7SppZnGlhEfBQRB4uPP5V0TNKydqeqhu3lku6UtKvtWapke7GkWyQ9JUkRcW4hxy11I/Blkk5d9PmUkoRwge2epNWS3mt3ksrslPSwpC/bHqRiKyVNSxotfvzYVdyPcMHqQuCp2b5a0guSHoqIT9qeZ1C275J0JiIOtD1LDa6QdJOkJyJitaTPJS3o94S6EPhpSSsu+nx58bUFz/aVmo17T0RkuSPtWkl3257U7I9TG2w/2+5IlZmSNBURF8609mk2+AWrC4G/L+kG2yuLNzW2SHq55ZkGZtua/VnuWEQ83vY8VYmIRyNieUT0NPv/1ZsRcU/LY1UiIj6WdMr2quJLt0la0G+Klrptcp0i4rzt+yW9LmlI0tMRcaTlsaqwVtK9kv5ue6L42q8i4tUWZ8LcHpC0pzjYnJS0veV5BtL6r8kA1KcLp+gAakLgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGL/A9ozs2W/5x3pAAAAAElFTkSuQmCC\n",
176 | "text/plain": [
177 | ""
178 | ]
179 | },
180 | "metadata": {},
181 | "output_type": "display_data"
182 | }
183 | ],
184 | "source": [
185 | "some_digit = X[666]\n",
186 | "some_digit_image = some_digit.reshape(8,8)\n",
187 | "plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)\n",
188 | "plt.show()"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## train test split"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 10,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "from c1_knn.model_selection import train_test_split\n",
205 | "from c1_knn.kNN import KNNClassifier\n"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 11,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 12,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "my_knn_clf = KNNClassifier(k=3)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 13,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "kNN(k=3)"
235 | ]
236 | },
237 | "execution_count": 13,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "my_knn_clf.fit(X_train, y_train)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 14,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "data": {
253 | "text/plain": [
254 | "0.99164345403899723"
255 | ]
256 | },
257 | "execution_count": 14,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "y_predict = my_knn_clf.predict(X_test)\n",
264 | "# 正确率\n",
265 | "np.sum(y_predict==y_test) / np.shape(y_test)[0]"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "### 把统计正确率封装为一个方法"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 15,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/plain": [
283 | "0.99164345403899723"
284 | ]
285 | },
286 | "execution_count": 15,
287 | "metadata": {},
288 | "output_type": "execute_result"
289 | }
290 | ],
291 | "source": [
292 | "from c1_knn.metrics import accuracy_score\n",
293 | "accuracy_score(y_test, y_predict)"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "## sklearn中的accuracy_score"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 16,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "from sklearn.model_selection import train_test_split as train_test_spl\n",
310 | "from sklearn.neighbors import KNeighborsClassifier\n",
311 | "from sklearn.metrics import accuracy_score as score\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 17,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "X_train, X_test, y_train, y_test = train_test_spl(X, y, test_size=0.2, random_state=666)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 18,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')"
332 | ]
333 | },
334 | "execution_count": 18,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
341 | "knn_clf.fit(X_train, y_train)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 19,
347 | "metadata": {},
348 | "outputs": [],
349 | "source": [
350 | "y_predict = knn_clf.predict(X_test)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 20,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "data": {
360 | "text/plain": [
361 | "0.98888888888888893"
362 | ]
363 | },
364 | "execution_count": 20,
365 | "metadata": {},
366 | "output_type": "execute_result"
367 | }
368 | ],
369 | "source": [
370 | "score(y_test,y_predict)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 21,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/plain": [
381 | "0.98888888888888893"
382 | ]
383 | },
384 | "execution_count": 21,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | }
388 | ],
389 | "source": [
390 | "knn_clf.score(X_test, y_test)"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "两个score的结果一样"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": []
406 | }
407 | ],
408 | "metadata": {
409 | "kernelspec": {
410 | "display_name": "Python 2",
411 | "language": "python",
412 | "name": "python2"
413 | },
414 | "language_info": {
415 | "codemirror_mode": {
416 | "name": "ipython",
417 | "version": 2
418 | },
419 | "file_extension": ".py",
420 | "mimetype": "text/x-python",
421 | "name": "python",
422 | "nbconvert_exporter": "python",
423 | "pygments_lexer": "ipython2",
424 | "version": "2.7.6"
425 | }
426 | },
427 | "nbformat": 4,
428 | "nbformat_minor": 0
429 | }
430 |
--------------------------------------------------------------------------------
/c1_knn/08_Scaler_in_Scikit_Learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# Scikit-learn中的Scaler"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn import datasets"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "iris = datasets.load_iris()"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "X = iris.data\n",
38 | "y = iris.target"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 4,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "array([[ 5.1, 3.5, 1.4, 0.2],\n [ 4.9, 3. , 1.4, 0.2],\n [ 4.7, 3.2, 1.3, 0.2],\n [ 4.6, 3.1, 1.5, 0.2],\n [ 5. , 3.6, 1.4, 0.2],\n [ 5.4, 3.9, 1.7, 0.4],\n [ 4.6, 3.4, 1.4, 0.3],\n [ 5. , 3.4, 1.5, 0.2],\n [ 4.4, 2.9, 1.4, 0.2],\n [ 4.9, 3.1, 1.5, 0.1]])"
50 | ]
51 | },
52 | "execution_count": 4,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "# 可以看到X尚未归一化时的数据\n",
59 | "X[:10,:]"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 5,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.model_selection import train_test_split\n",
69 | "\n",
70 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666)\n"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "## Scikit-learn中的StandardScaler,进行0均值标准化处理"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 6,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "from sklearn.preprocessing import StandardScaler"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "data": {
96 | "text/plain": [
97 | "StandardScaler(copy=True, with_mean=True, with_std=True)"
98 | ]
99 | },
100 | "execution_count": 7,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "standard_scaler = StandardScaler()\n",
107 | "standard_scaler.fit(X_train)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 8,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "array([ 5.83416667, 3.0825 , 3.70916667, 1.16916667])"
119 | ]
120 | },
121 | "execution_count": 8,
122 | "metadata": {},
123 | "output_type": "execute_result"
124 | }
125 | ],
126 | "source": [
127 | "# 训练集X特征矩阵的均值\n",
128 | "standard_scaler.mean_"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 11,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/plain": [
139 | "array([ 0.81019502, 0.44076874, 1.76295187, 0.75429833])"
140 | ]
141 | },
142 | "execution_count": 11,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "# 训练集X特征矩阵的标准差\n",
149 | "standard_scaler.scale_"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 12,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "array([[-0.90616043, 0.94720873, -1.30982967, -1.28485856],\n [-1.15301457, -0.18717298, -1.30982967, -1.28485856],\n [-0.16559799, -0.64092567, 0.22169257, 0.17345038],\n [ 0.45153738, 0.72033239, 0.95909217, 1.49918578],\n [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],\n [ 1.43895396, 0.2665797 , 0.56203085, 0.30602392],\n [ 0.3281103 , -1.09467835, 1.07253826, 0.30602392],\n [ 2.1795164 , -0.18717298, 1.63976872, 1.2340387 ],\n [-0.78273335, 2.30846679, -1.25310662, -1.4174321 ],\n [ 0.45153738, -2.00218372, 0.44858475, 0.43859746],\n [ 1.80923518, -0.41404933, 1.46959958, 0.83631808],\n [ 0.69839152, 0.2665797 , 0.90236912, 1.49918578],\n [ 0.20468323, 0.72033239, 0.44858475, 0.571171 ],\n [-0.78273335, -0.86780201, 0.10824648, 0.30602392],\n [-0.53587921, 1.40096142, -1.25310662, -1.28485856],\n [-0.65930628, 1.40096142, -1.25310662, -1.28485856],\n [-1.0295875 , 0.94720873, -1.19638358, -0.7545644 ],\n [-1.77014994, -0.41404933, -1.30982967, -1.28485856],\n [-0.04217092, -0.86780201, 0.10824648, 0.04087684],\n [-0.78273335, 0.72033239, -1.30982967, -1.28485856],\n [-1.52329579, 0.72033239, -1.30982967, -1.15228502],\n [ 0.82181859, 0.2665797 , 0.78892303, 1.10146516],\n [-0.16559799, -0.41404933, 0.27841562, 0.17345038],\n [ 0.94524567, -0.18717298, 0.39186171, 0.30602392],\n [ 0.20468323, -0.41404933, 0.44858475, 0.43859746],\n [-1.39986872, 0.2665797 , -1.19638358, -1.28485856],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [ 1.06867274, 0.03970336, 1.07253826, 1.63175932],\n [ 0.57496445, -0.86780201, 0.67547694, 0.83631808],\n [ 0.3281103 , -0.64092567, 0.56203085, 0.04087684],\n [ 0.45153738, -0.64092567, 0.61875389, 0.83631808],\n [-0.16559799, 2.98909581, -1.25310662, -1.01971148],\n [ 0.57496445, -1.3215547 , 0.67547694, 0.43859746],\n [ 0.69839152, -0.41404933, 0.33513866, 0.17345038],\n [-0.90616043, 1.62783776, -1.02621444, -1.01971148],\n [ 1.19209981, -0.64092567, 0.61875389, 0.30602392],\n [-0.90616043, 0.94720873, 
-1.30982967, -1.15228502],\n [-1.89357701, -0.18717298, -1.47999881, -1.4174321 ],\n [ 0.08125616, -0.18717298, 0.78892303, 0.83631808],\n [ 0.69839152, -0.64092567, 1.07253826, 1.2340387 ],\n [-0.28902506, -0.64092567, 0.67547694, 1.10146516],\n [-0.41245214, -1.54843104, -0.00519961, -0.22427024],\n [ 1.31552689, 0.03970336, 0.67547694, 0.43859746],\n [ 0.57496445, 0.72033239, 1.07253826, 1.63175932],\n [ 0.82181859, -0.18717298, 1.18598435, 1.36661224],\n [-0.16559799, 1.62783776, -1.13966053, -1.15228502],\n [ 0.94524567, -0.41404933, 0.5053078 , 0.17345038],\n [ 1.06867274, 0.49345605, 1.12926131, 1.76433286],\n [-1.27644165, -0.18717298, -1.30982967, -1.4174321 ],\n [-1.0295875 , 1.17408507, -1.30982967, -1.28485856],\n [ 0.20468323, -0.18717298, 0.61875389, 0.83631808],\n [-1.0295875 , -0.18717298, -1.19638358, -1.28485856],\n [ 0.3281103 , -0.18717298, 0.67547694, 0.83631808],\n [ 0.69839152, 0.03970336, 1.01581521, 0.83631808],\n [-0.90616043, 1.40096142, -1.25310662, -1.01971148],\n [-0.16559799, -0.18717298, 0.27841562, 0.04087684],\n [-1.0295875 , 0.94720873, -1.36655271, -1.15228502],\n [-0.90616043, 1.62783776, -1.25310662, -1.15228502],\n [-1.52329579, 0.2665797 , -1.30982967, -1.28485856],\n [-0.53587921, -0.18717298, 0.44858475, 0.43859746],\n [ 0.82181859, -0.64092567, 0.5053078 , 0.43859746],\n [ 0.3281103 , -0.64092567, 0.16496953, 0.17345038],\n [-1.27644165, 0.72033239, -1.19638358, -1.28485856],\n [-0.90616043, 0.49345605, -1.13966053, -0.88713794],\n [-0.04217092, -0.86780201, 0.78892303, 0.96889162],\n [-0.28902506, -0.18717298, 0.22169257, 0.17345038],\n [ 0.57496445, -0.64092567, 0.78892303, 0.43859746],\n [ 1.06867274, 0.49345605, 1.12926131, 1.2340387 ],\n [ 1.68580811, -0.18717298, 1.18598435, 0.571171 ],\n [ 1.06867274, -0.18717298, 0.84564608, 1.49918578],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [-1.15301457, -1.3215547 , 0.44858475, 0.70374454],\n [-0.16559799, -1.3215547 , 0.73219998, 1.10146516],\n [-1.15301457, 
-1.54843104, -0.2320918 , -0.22427024],\n [-0.41245214, -1.54843104, 0.05152343, -0.0916967 ],\n [ 1.06867274, -1.3215547 , 1.18598435, 0.83631808],\n [ 0.82181859, -0.18717298, 1.01581521, 0.83631808],\n [-0.16559799, -1.09467835, -0.1186457 , -0.22427024],\n [ 0.20468323, -2.00218372, 0.73219998, 0.43859746],\n [ 1.06867274, 0.03970336, 0.56203085, 0.43859746],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [ 0.57496445, -1.3215547 , 0.73219998, 0.96889162],\n [-1.39986872, 0.2665797 , -1.36655271, -1.28485856],\n [ 0.20468323, -0.86780201, 0.78892303, 0.571171 ],\n [-0.04217092, -1.09467835, 0.16496953, 0.04087684],\n [ 1.31552689, 0.2665797 , 1.12926131, 1.49918578],\n [-1.77014994, -0.18717298, -1.36655271, -1.28485856],\n [ 1.56238103, -0.18717298, 1.2427074 , 1.2340387 ],\n [ 1.19209981, 0.2665797 , 1.2427074 , 1.49918578],\n [-0.78273335, 0.94720873, -1.25310662, -1.28485856],\n [ 2.54979762, 1.62783776, 1.52632263, 1.10146516],\n [ 0.69839152, -0.64092567, 1.07253826, 1.36661224],\n [-0.28902506, -0.41404933, -0.06192266, 0.17345038],\n [-0.41245214, 2.53534313, -1.30982967, -1.28485856],\n [-1.27644165, -0.18717298, -1.30982967, -1.15228502],\n [ 0.57496445, -0.41404933, 1.07253826, 0.83631808],\n [-1.77014994, 0.2665797 , -1.36655271, -1.28485856],\n [-0.53587921, 1.8547141 , -1.13966053, -1.01971148],\n [-1.0295875 , 0.72033239, -1.19638358, -1.01971148],\n [ 1.06867274, -0.18717298, 0.73219998, 0.70374454],\n [-0.53587921, 1.8547141 , -1.36655271, -1.01971148],\n [ 2.30294347, -0.64092567, 1.69649176, 1.10146516],\n [-0.28902506, -0.86780201, 0.27841562, 0.17345038],\n [ 1.19209981, -0.18717298, 1.01581521, 1.2340387 ],\n [-0.41245214, 0.94720873, -1.36655271, -1.28485856],\n [-1.27644165, 0.72033239, -1.02621444, -1.28485856],\n [-0.53587921, 0.72033239, -1.13966053, -1.28485856],\n [ 2.30294347, 1.62783776, 1.69649176, 1.36661224],\n [ 1.31552689, 0.03970336, 0.95909217, 1.2340387 ],\n [-0.28902506, -1.3215547 , 0.10824648, -0.0916967 ],\n 
[-0.90616043, 0.72033239, -1.25310662, -1.28485856],\n [-0.90616043, 1.62783776, -1.19638358, -1.28485856],\n [ 0.3281103 , -0.41404933, 0.56203085, 0.30602392],\n [-0.04217092, 2.08159044, -1.42327576, -1.28485856],\n [-1.0295875 , -2.45593641, -0.1186457 , -0.22427024],\n [ 0.69839152, 0.2665797 , 0.44858475, 0.43859746],\n [ 0.3281103 , -0.18717298, 0.5053078 , 0.30602392],\n [ 0.08125616, 0.2665797 , 0.61875389, 0.83631808],\n [ 0.20468323, -2.00218372, 0.16496953, -0.22427024],\n [ 1.93266225, -0.64092567, 1.35615349, 0.96889162]])"
161 | ]
162 | },
163 | "execution_count": 12,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "# 归一化处理\n",
170 | "X_train_normalization = standard_scaler.transform(X_train)\n",
171 | "X_train_normalization"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 13,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "array([[-0.28902506, -0.18717298, 0.44858475, 0.43859746],\n [-0.04217092, -0.64092567, 0.78892303, 1.63175932],\n [-1.0295875 , -1.77530738, -0.2320918 , -0.22427024],\n [-0.04217092, -0.86780201, 0.78892303, 0.96889162],\n [-1.52329579, 0.03970336, -1.25310662, -1.28485856],\n [-0.41245214, -1.3215547 , 0.16496953, 0.17345038],\n [-0.16559799, -0.64092567, 0.44858475, 0.17345038],\n [ 0.82181859, -0.18717298, 0.84564608, 1.10146516],\n [ 0.57496445, -1.77530738, 0.39186171, 0.17345038],\n [-0.41245214, -1.09467835, 0.39186171, 0.04087684],\n [ 1.06867274, 0.03970336, 0.39186171, 0.30602392],\n [-1.64672287, -1.77530738, -1.36655271, -1.15228502],\n [-1.27644165, 0.03970336, -1.19638358, -1.28485856],\n [-0.53587921, 0.72033239, -1.25310662, -1.01971148],\n [ 1.68580811, 1.17408507, 1.35615349, 1.76433286],\n [-0.04217092, -0.86780201, 0.22169257, -0.22427024],\n [-1.52329579, 1.17408507, -1.53672185, -1.28485856],\n [ 1.68580811, 0.2665797 , 1.29943044, 0.83631808],\n [ 1.31552689, 0.03970336, 0.78892303, 1.49918578],\n [ 0.69839152, -0.86780201, 0.90236912, 0.96889162],\n [ 0.57496445, 0.49345605, 0.56203085, 0.571171 ],\n [-1.0295875 , 0.72033239, -1.25310662, -1.28485856],\n [ 2.30294347, -1.09467835, 1.80993786, 1.49918578],\n [-1.0295875 , 0.49345605, -1.30982967, -1.28485856],\n [ 0.45153738, -0.41404933, 0.33513866, 0.17345038],\n [ 0.08125616, -0.18717298, 0.27841562, 0.43859746],\n [-1.0295875 , 0.2665797 , -1.42327576, -1.28485856],\n [-0.41245214, -1.77530738, 0.16496953, 0.17345038],\n [ 0.57496445, 0.49345605, 1.29943044, 1.76433286],\n [ 2.30294347, -0.18717298, 1.35615349, 1.49918578]])"
183 | ]
184 | },
185 | "execution_count": 13,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "# 对测试数据集的特征矩阵进行归一化处理\n",
192 | "X_test_normalization = standard_scaler.transform(X_test)\n",
193 | "X_test_normalization"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 14,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "from sklearn.neighbors.classification import KNeighborsClassifier"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 15,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/plain": [
213 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')"
214 | ]
215 | },
216 | "execution_count": 15,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
223 | "knn_clf.fit(X_train_normalization, y_train)"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "#### 如果对训练数据集进行了归一化处理,则测试数据集也必须使用同一个 scaler 进行归一化处理(而不是直接使用原始的 X_test),否则结果会很差"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 18,
236 | "metadata": {},
237 | "outputs": [
238 | {
239 | "data": {
240 | "text/plain": [
241 | "0.33333333333333331"
242 | ]
243 | },
244 | "execution_count": 18,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "knn_clf.score(X_test, y_test)"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 19,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "data": {
260 | "text/plain": [
261 | "1.0"
262 | ]
263 | },
264 | "execution_count": 19,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "knn_clf.score(X_test_normalization, y_test)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": []
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "Python 2",
284 | "language": "python",
285 | "name": "python2"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 2
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython2",
297 | "version": "2.7.6"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 0
302 | }
303 |
--------------------------------------------------------------------------------
/c1_knn/01_kNN_Basics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import matplotlib.pyplot as plt"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 6,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "raw_data_X = np.random.random((10,2))"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 7,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "raw_data_X = raw_data_X *10"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 8,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "raw_data_y = np.array([0,0,0,0,0,1,1,1,1,1])"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 9,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "X_train = raw_data_X\n",
49 | "y_train = raw_data_y"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 10,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADdVJREFUeJzt3UGIpHeZx/HfrzOzaMWldUkja2JX5bBkEYfdSB2iAQ9pFxbNmD3sIVARXRbqsmgUQZQ6SA61eBBpT0IR1xV8iYcxsDseRBldloUlUDMJ28mMIKzpdnSyaVm2FeuQkTx7eKuTmdnu9Nu9/dZbT9X3A0N1//NO9UPBfPP2W+/7liNCAIA8VpoeAABwPIQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyZ+p40nvuuSc6nU4dTw0AC+ny5cu/joi1KtvWEu5Op6PxeFzHUwPAQrK9XXVbDpUAQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AycxNuIutQp3NjlaeWlFns6Niq2h6JACYS3MR7mKrUP9iX9t72wqFtve21b/YJ94ATkdRSJ2OtLJSPha52zIX4R5cGmhyc3Lb2uTmRINLg4YmArAwikLq96XtbSmifOz3U8d7LsK9s7dzrHUAqGwwkCa37xhqMinXk5qLcK+vrh9rHQAq2zlkB/Cw9QTmItzDjaFaZ1u3rbXOtjTcGDY0EYCFsX7IDuBh6wnMRbh753oanR+pvdqWZbVX2xqdH6l3rtf0aACyGw6l1u07hmq1yvWkHBGn/qTdbje4rSuAuVEU5THtnZ1yT3s4lHrztWNo+3JEdKtsOxd73LPCueLAkur1pJdfll5/vXycs2gfVy0fpDCP9s8V3z/tcP9ccUkckgGQytLscXOuOIBFsTTh5lxxAItiacLNueIAFsXShJtzxQEsiqUJN+eKA1gUnMcNAHOA87gBYIERbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQqhdv252y/ZPtF28/YflvdgwEADnZkuG3fK+kzkroR8X5Jd0l6vO7BAAAHq3qo5Iykt9s+I6kl6Vf1jQQAeCtHhjsifinpq5J2JN2QtBcRP6x7MADAwaocKnmXpMck3S/pPZLutv3EAdv1bY9tj3d3d09/UgCApGqHSj4i6ecRsRsRNyU9K+lDd24UEaOI6EZEd21t7bTnBABMVQn3jqSHbLdsW9KGpGv1jgUAOEyVY9zPSbog6YqkrenfGdU8FwDgEGeqbBQRX5b05ZpnAQBUwJWTAJAM4QaAZAg3ACRDuIFbFYXU6UgrK+VjUTQ9EfB/VHpzElgKRSH1+9JkUn6/vV1+L0m9XnNzAXdgjxvYNxi8Ge19k0m5DswRwg3s29k53jrQEMIN7FtfP9460BDCDewbDqVW6/a1VqtcB+YI4Qb29XrSaCS125JdPo5GvDGJucNZJcCtej1CjbnHHjcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAE6qoZuScTogAJxEgzclY48bAE6iwZuSEW4AOIkGb0pGuAHgJBq8KRnhBoCTaPCmZIQbAE6iwZuScVYJAJxUQzclY48bAJIh3ACQDOEGgGQIdyLFVqHOZkcrT62os9lRsTWby2sBzBfenEyi2CrUv9jX5GZ5pdb23rb6F8vLa3vn+MQWYJmwx53E4NLgjWjvm9ycaHCp/strAcwXwp3Ezt7Bl9Eetg5gcRHuJNZXD76M9rB1AIuLcCcx3Biqdfb2y2tbZ1sabtR/eS2A+UK4k+id62l0fqT2aluW1V5ta3R+xBuTwBJyRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZCqF2/Y7bV+w/VPb12x/sO7BAAAHq3oBztcl/SAi/tr2H0hqHfUXAAD1ODLctlclfVjSpyQpI
l6T9Fq9YwEADlPlUMn9knYlfcv287aftn33nRvZ7tse2x7v7u6e+qAAgFKVcJ+R9AFJ34iIByX9TtIX79woIkYR0Y2I7tra2imPCQDYVyXc1yVdj4jnpt9fUBlyAEADjgx3RLwi6Re2H5gubUi6WutUAIBDVT2r5NOSiukZJf8p6W/qGwkA8FYqhTsiXpBU6YoeAEC9uHISAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwY3kVhdTpSCsr5WNRND0RUEnVKyeBxVIUUr8vTSbl99vb5feS1ONzPDHf2OPGchoM3oz2vsmkXAfmHOHGctrZOd46MEcIN5bT+vrx1oE5QrixnIZDqXXHR6e2WuU6MOcIN5ZTryeNRlK7Ldnl42jEG5NIgbNKsLx6PUKNlNjjBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ABmptgq1NnsaOWpFXU2Oyq2uJXuSXABDoCZKLYK9S/2NblZ3pVxe29b/YvlrXR757gQ6jjY4wYwE4NLgzeivW9yc6LBJW6le1yEG8BM7OwdfMvcw9ZxOMINYCbWVw++Ze5h6zgc4QYwE8ONoVpnb7+VbutsS8MNbqV7XIQbwEz0zvU0Oj9Se7Uty2qvtjU6P+KNyRNwRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZAg3ACRDuAEgGcINAMkQbgBIpnK4bd9l+3nb369zIADAWzvOHveTkq7VNQgAoJpK4bZ9n6SPSXq63nEAAEepuse9KekLkl6vcRYAQAVHhtv2o5JejYjLR2zXtz22Pd7d3T21AQEAt6uyx/2wpI/bflnSdyU9Yvs7d24UEaOI6EZEd21t7ZTHBADsOzLcEfGliLgvIjqSHpf044h4ovbJAAAH4jxuAEjmWOGOiH+JiEfrGgaoTVFInY60slI+FnxILfLiw4Kx+IpC6velyfTzDre3y+8lqce9oJEPh0qw+AaDN6O9bzIp14GECDcW384hH0Z72Dow5wg3Ft/6IR9Ge9g6MOcINxbfcCi1bv+QWrVa5TqQEOHG4uv1pNFIarclu3wcjXhjEmlxVgmWQ69HqLEw2OMGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASObIcNt+r+2f2L5q+yXbT85iMADAwc5U2Ob3kj4fEVds/6Gky7Z/FBFXa54NAHCAI/e4I+JGRFyZfv1bSdck3Vv3YACAgx3rGLftjqQHJT1XxzAAgKNVDrftd0j6nqTPRsRvDvjvfdtj2+Pd3d3TnBEAcItK4bZ9VmW0i4h49qBtImIUEd2I6K6trZ3mjACAW1Q5q8SSvinpWkR8rf6RAABvpcoe98OSPiHpEdsvTP98tOa5AACHOPJ0wIj4N0mewSwAgAq4chIAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwJ1dsFepsdrTy1Io6mx0VW0XTIwGo2ZmmB8DJFVuF+hf7mtycSJK297bVv9iXJPXO9ZocDUCN2ONObHBp8Ea0901uTjS4NGhoIgCzQLgT29nbOdY6gMVAuBNbX10/1jqAxUC4ExtuDNU627ptrXW2peHGsKGJAMwC4U6sd66n0fmR2qttWVZ7ta3R+RFvTAILzhFx6k/a7XZjPB6f+vMCwKKyfTkiulW2ZY8bAJIh3ACQDOEGgGQINwAkQ7gBIJlaziqxvStp+9SfeH7cI+nXTQ8xB3gdSrwOvAb7/j+vQzsi1qpsWEu4F53tcdXTdhYZr0OJ14HXYN+sXgcOl
QBAMoQbAJIh3CczanqAOcHrUOJ14DXYN5PXgWPcAJAMe9wAkAzhrsj2e23/xPZV2y/ZfrLpmZpk+y7bz9v+ftOzNMX2O21fsP1T29dsf7DpmZpg+3PTfxMv2n7G9tuanmkWbP+D7Vdtv3jL2h/Z/pHtn00f31XHzybc1f1e0ucj4n2SHpL0d7bf1/BMTXpS0rWmh2jY1yX9ICL+VNKfaQlfD9v3SvqMpG5EvF/SXZIeb3aqmflHSX95x9oXJV2KiD+RdGn6/akj3BVFxI2IuDL9+rcq/5He2+xUzbB9n6SPSXq66VmaYntV0oclfVOSIuK1iPifZqdqzBlJb7d9RlJL0q8anmcmIuJfJf33HcuPSfr29OtvS/qrOn424T4B2x1JD0p6rtlJGrMp6QuSXm96kAbdL2lX0remh4yetn1300PNWkT8UtJXJe1IuiFpLyJ+2OxUjXp3RNyYfv2KpHfX8UMI9zHZfoek70n6bET8pul5Zs32o5JejYjLTc/SsDOSPiDpGxHxoKTfqaZfi+fZ9BjuYyr/R/YeSXfbfqLZqeZDlKfs1XLaHuE+BttnVUa7iIhnm56nIQ9L+rjtlyV9V9Ijtr/T7EiNuC7pekTs/9Z1QWXIl81HJP08InYj4qakZyV9qOGZmvRftv9YkqaPr9bxQwh3Rbat8njmtYj4WtPzNCUivhQR90VER+WbUD+OiKXbw4qIVyT9wvYD06UNSVcbHKkpO5Iest2a/hvZ0BK+SXuLf5b0yenXn5T0T3X8EMJd3cOSPqFyD/OF6Z+PNj0UGvVpSYXt/5D055L+vuF5Zm76G8cFSVckbalsylJcRWn7GUn/LukB29dt/62kr0j6C9s/U/nbyFdq+dlcOQkAubDHDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgmf8Fyb54/FyWOFoAAAAASUVORK5CYII=\n",
60 | "text/plain": [
61 | ""
62 | ]
63 | },
64 | "metadata": {},
65 | "output_type": "display_data"
66 | }
67 | ],
68 | "source": [
69 | "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n",
70 | "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n",
71 | "plt.show()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 11,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "x = np.random.random(2)*10"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 12,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/plain": [
91 | "array([ 8.67035706, 7.0760235 ])"
92 | ]
93 | },
94 | "execution_count": 12,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "x"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 13,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADgBJREFUeJzt3VGIrHd9xvHn2ewWnVhWSxapiTuTi5IiHtrIXEQDXmQtFM0xvehFYCJaCnNTNIogylxILqZ4IbJeCUOsFXyJF8dAe7wQZbWUQgnMnoRuco4g1Mx69KRZKV3FucjK+fXinUnOnu6efed033nf/8z3A4fZ/ec9sz8GzjfvvvO+7zgiBABIx0rVAwAAZkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAErNaxpPed9990Wq1ynhqAFhIu7u7v46IjSLblhLuVqul4XBYxlMDwEKyPSq6LYdKACAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxtQl3tpeptd3SyjMram23lO1lVY8EALVUi3Bne5m6l7saHY4UCo0OR+pe7hJvAOcjy6RWS1pZyR+ztNtSi3D3dnoaH42PrY2Pxurt9CqaCMDCyDKp25VGIykif+x2k453LcK9f7g/0zoAFNbrSePjO4Yaj/P1RNUi3JvrmzOtA0Bh+6fsAJ62noBahLu/1VdjrXFsrbHWUH+rX9FEABbG5ik7gKetJ6AW4e5c6GhwcaDmelOW1VxvanBxoM6FTtWjAUhdvy81ju8YqtHI1xPliDj3J22328FtXQHURpblx7T39/M97X5f6tRrx9D2bkS0i2xbiz3ueeFccWBJdTrSq69KN2/mjzWL9qxK+SCFOpqeKz497XB6rrgkDskASMrS7HFzrjiARbE04eZccQCLYmnCzbniABbF0oSbc8UBLIqlCTfnigNYFJzHDQA1wHncALDACDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJKZQuG1/zvYrtl+2/Zztt5U9GADgZGeG2/b9kj4jqR0R75d0j6Qnyx4MAHCyoodKViW93faqpIakX5U3EgDgTs4Md0T8UtJXJe1LuiHpMCJ+WPZgAICTFTlU8i5JT0h6UNJ7JN1r+6kTtuvaHtoeHhwcnP+kAABJxQ6VfETSzyPiICKOJD0v6UO3bxQRg4hoR0R7Y2PjvOcEAEwUCfe+pEdsN2xb0paka+WOBQA4TZFj3C9IuiTpiqS9yd8ZlDwXAOAUq0U2iogvS/pyybMAAArgykkASAzhBoDEEG4ASAzhBm6VZVKrJa2s5I9ZVvVEwP9R6M1JYClkmdTtSuNx/v1olH8vSZ1OdXMBt2GPG5jq9d6K9tR4nK8DNUK4gan9/dnWgYoQbmBqc3O2daAihBuY6velRuP4WqORrwM1QriBqU5HGgykZlOy88fBgDcmUTucVQLcqtMh1Kg99rgBIDGEG0AtcS3U6ThUAqB2uBbqztjjBlA7XAt1Z4QbQO1wLdSdEW4AtZPMtVAVHYgn3ABqJ4lroaYH4kcjKeKtA/FziDfhBlA7SVwLVeGBeEfEuT9pu92O4XB47s8LALWxspLvad/Olm7enPnpbO9GRLvQj5752QEAlR6IJ9wAcDcqPBBPuAHgblR4IJ4rJwHgblV0UzL2uAEgMYQbABJDuAEgMYQ7IdleptZ2SyvPrKi13VK2x30ugWXEm5OJyPYydS93NT7Kr9QaHY7UvZzf57JzoU6XkwEoG3vciejt9N6M9tT4aKzeDve5BJYN4U7E/uHJ97M8bR3A4iLcidhcP/ky2tPWASwuwp2I/lZfjbXjl9c21hrqb9XpPpcA5oFwJ6JzoaPBxYGa601ZVnO9qcHFAW9MAkuI27oCQA1wW1cAWGCFwm37nbYv2f6p7Wu2
P1j2YACAkxW9AOfrkn4QEX9t+w8kNc76CwCAcpwZbtvrkj4s6VOSFBFvSHqj3LEAAKcpcqjkQUkHkr5l+0Xbz9q+9/aNbHdtD20PDw4Ozn1QAECuSLhXJX1A0jci4mFJv5P0xds3iohBRLQjor2xsXHOYwIApoqE+7qk6xHxwuT7S8pDDgCowJnhjojXJP3C9kOTpS1JV0udCgBwqqJnlXxaUjY5o+Q/Jf1NeSMBAO6kULgj4iVJha7oAQCUiysnASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBvLK8ukVktaWckfs6zqiYBCil45CSyWLJO6XWk8zr8fjfLvJanD53ii3tjjxnLq9d6K9tR4nK8DNUe4sZz292dbB2qEcGM5bW7Otg7UCOHGcur3pcZtH53aaOTrQM0RbiynTkcaDKRmU7Lzx8GANyaRBM4qwfLqdAg1ksQeNwAkhnADQGIINwAkhnADQGIINwAkhnADQGIINwAkhnADmJtsL1Nru6WVZ1bU2m4p2+NWuneDC3AAzEW2l6l7uavxUX5XxtHhSN3L+a10Oxe4EGoW7HEDmIveTu/NaE+Nj8bq7XAr3VkRbgBzsX948i1zT1vH6Qg3gLnYXD/5lrmnreN0hBvAXPS3+mqsHb+VbmOtof4Wt9KdFeEGMBedCx0NLg7UXG/KsprrTQ0uDnhj8i44Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMYXDbfse2y/a/n6ZAwEA7myWPe6nJV0raxAAQDGFwm37AUkfk/RsueMAAM5SdI97W9IXJN0scRYAQAFnhtv245Jej4jdM7br2h7aHh4cHJzbgACA44rscT8q6eO2X5X0XUmP2f7O7RtFxCAi2hHR3tjYOOcxAQBTZ4Y7Ir4UEQ9EREvSk5J+HBFPlT4ZAOBEnMcNAImZKdwR8S8R8XhZwwClyTKp1ZJWVvLHjA+pRbr4sGAsviyTul1pPPm8w9Eo/16SOtwLGunhUAkWX6/3VrSnxuN8HUgQ4cbi2z/lw2hPWwdqjnBj8W2e8mG0p60DNUe4sfj6falx/ENq1Wjk60CCCDcWX6cjDQZSsynZ+eNgwBuTSBZnlWA5dDqEGguDPW4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASMyZ4bb9Xts/sX3V9iu2n57HYACAk60W2Ob3kj4fEVds/6GkXds/ioirJc8GADjBmXvcEXEjIq5Mvv6tpGuS7i97MADAyWY6xm27JelhSS+UMQwA4GyFw237HZK+J+mzEfGbE/571/bQ9vDg4OA8ZwQA3KJQuG2vKY92FhHPn7RNRAwioh0R7Y2NjfOcEQBwiyJnlVjSNyVdi4ivlT8SAOBOiuxxPyrpE5Ies/3S5M9HS54LAHCKM08HjIh/k+Q5zAIAKIArJwEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQ7cdleptZ2SyvPrKi13VK2l1U9EoCSrVY9AO5etpepe7mr8dFYkjQ6HKl7uStJ6lzoVDkagBKxx52w3k7vzWhPjY/G6u30KpoIwDwQ7oTtH+7PtA5gMRDuhG2ub860DmAxEO6E9bf6aqw1jq011hrqb/UrmgjAPBDuhHUudDS4OFBzvSnLaq43Nbg44I1JYME5Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMaWcVWL7QNLo3J+4Pu6T9Ouq
h6gBXoccrwOvwdT/53VoRsRGkQ1LCfeisz0setrOIuN1yPE68BpMzet14FAJACSGcANAYgj33RlUPUBN8DrkeB14Dabm8jpwjBsAEsMeNwAkhnAXZPu9tn9i+6rtV2w/XfVMVbJ9j+0XbX+/6lmqYvudti/Z/qnta7Y/WPVMVbD9ucm/iZdtP2f7bVXPNA+2/8H267ZfvmXtj2z/yPbPJo/vKuNnE+7ifi/p8xHxPkmPSPo72++reKYqPS3pWtVDVOzrkn4QEX8q6c+0hK+H7fslfUZSOyLeL+keSU9WO9Xc/KOkv7xt7YuSdiLiTyTtTL4/d4S7oIi4ERFXJl//Vvk/0vurnaoath+Q9DFJz1Y9S1Vsr0v6sKRvSlJEvBER/1PtVJVZlfR226uSGpJ+VfE8cxER/yrpv29bfkLStydff1vSX5Xxswn3XbDdkvSwpBeqnaQy25K+IOlm1YNU6EFJB5K+NTlk9Kzte6seat4i4peSvippX9INSYcR8cNqp6rUuyPixuTr1yS9u4wfQrhnZPsdkr4n6bMR8Zuq55k3249Lej0idquepWKrkj4g6RsR8bCk36mkX4vrbHIM9wnl/yN7j6R7bT9V7VT1EPkpe6Wctke4Z2B7TXm0s4h4vup5KvKopI/bflXSdyU9Zvs71Y5UieuSrkfE9LeuS8pDvmw+IunnEXEQEUeSnpf0oYpnqtJ/2f5jSZo8vl7GDyHcBdm28uOZ1yLia1XPU5WI+FJEPBARLeVvQv04IpZuDysiXpP0C9sPTZa2JF2tcKSq7Et6xHZj8m9kS0v4Ju0t/lnSJydff1LSP5XxQwh3cY9K+oTyPcyXJn8+WvVQqNSnJWW2/0PSn0v6+4rnmbvJbxyXJF2RtKe8KUtxFaXt5yT9u6SHbF+3/beSviLpL2z/TPlvI18p5Wdz5SQApIU9bgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMT8Lwi/oYhwODxFAAAAAElFTkSuQmCC\n",
111 | "text/plain": [
112 | ""
113 | ]
114 | },
115 | "metadata": {},
116 | "output_type": "display_data"
117 | }
118 | ],
119 | "source": [
120 | "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n",
121 | "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n",
122 | "plt.scatter(x[0], x[1], color='b')\n",
123 | "plt.show()"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## kNN的过程"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 14,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/plain": [
141 | "[8.231110301142037,\n 2.467672103362692,\n 8.513701036889186,\n 7.405233631062757,\n 5.723981182780471,\n 4.7325594847194985,\n 3.026576262484981,\n 2.3280024672389885,\n 3.5998460221137596,\n 1.259337088954209]"
142 | ]
143 | },
144 | "execution_count": 14,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "from math import sqrt\n",
151 | "distances = []\n",
152 | "for x_train in X_train:\n",
153 | " d = sqrt(np.sum((x-x_train)**2))\n",
154 | " distances.append(d)\n",
155 | "distances"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 15,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "array([9, 7, 1, 6, 8, 5, 4, 3, 0, 2])"
167 | ]
168 | },
169 | "execution_count": 15,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "top_k = np.argsort(distances)\n",
176 | "top_k"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 16,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "# 假设k取6\n",
186 | "k = 6"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 17,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "[(1, 5)]"
198 | ]
199 | },
200 | "execution_count": 17,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "from collections import Counter\n",
207 | "votes = Counter(y_train[top_k[:k]])\n",
208 | "\n",
209 | "votes.most_common(1) # 前边是标签,后边是个数"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 18,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "data": {
219 | "text/plain": [
220 | "1"
221 | ]
222 | },
223 | "execution_count": 18,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "# 最终kNN的结果:\n",
230 | "predict_y = votes.most_common(1)[0][0]\n",
231 | "predict_y"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 19,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "%run c1_knn/kNN.py"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 21,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "1"
252 | ]
253 | },
254 | "execution_count": 21,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "predict_y = kNN_classify(6, X_train, y_train, x)\n",
261 | "predict_y"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": []
270 | }
271 | ],
272 | "metadata": {
273 | "kernelspec": {
274 | "display_name": "Python 2",
275 | "language": "python",
276 | "name": "python2"
277 | },
278 | "language_info": {
279 | "codemirror_mode": {
280 | "name": "ipython",
281 | "version": 2
282 | },
283 | "file_extension": ".py",
284 | "mimetype": "text/x-python",
285 | "name": "python",
286 | "nbconvert_exporter": "python",
287 | "pygments_lexer": "ipython2",
288 | "version": "2.7.6"
289 | }
290 | },
291 | "nbformat": 4,
292 | "nbformat_minor": 0
293 | }
294 |
--------------------------------------------------------------------------------