├── README.md
├── self-defining
│   ├── train_test_split.py
│   ├── preprocessing.py
│   ├── KNN.py
│   ├── PCA.py
│   ├── LogisticRegression.py
│   ├── metrics.py
│   ├── LinearRegression.py
│   └── SimpleLinearRegression.py
├── SimpleLinearRegression.py
├── Machine-learning-algorithm
│   ├── 78 SVM思想解决回归问题.ipynb
│   ├── 05 超参数.ipynb
│   ├── 25 随机梯度下降法.ipynb
│   ├── 63 F1 Score.ipynb
│   ├── 27 如何调试梯度.ipynb
│   ├── 02 scikit-learn 中的 kNN.ipynb
│   ├── 62 实现混淆矩阵,精准率和召回率.ipynb
│   ├── 18 线性回归的可解释性和更多思考.ipynb
│   ├── 45 验证数据集与交叉验证.ipynb
│   ├── 06 网格搜索与k近邻算法中更多的超参数.ipynb
│   ├── 50 什么是逻辑回归.ipynb
│   ├── 26 sklearn中的随机梯度下降法.ipynb
│   ├── 17 实现多元线性回归.ipynb
│   ├── 24 梯度下降法的向量化.ipynb
│   ├── 66 ROC曲线.ipynb
│   ├── 81 信息熵.ipynb
│   └── 76 高斯核函数.ipynb
└── Jupyter-Notebook
    └── 01 Jupyter Notebook 高级 - 魔法命令.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning
2 | ## 核心工具
3 | Python3 scikit-learn Numpy matplotlib jupyter notebook ...
4 | ## 机器学习经典算法
5 | K近邻算法 线性回归 多项式回归 逻辑回归 PCA SVM 决策树 随机森林 集成学习 boosting ...
6 | ## 机器学习应用
7 | 算法推导 算法对比 算法调试 模型正则化 模型选择 模型调用 模型评价 超参调整 ...
8 | 
9 | self-defining 文件夹下是自己实现封装的算法~~
10 | 
--------------------------------------------------------------------------------
/self-defining/train_test_split.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # X:原始数据集
3 | # y:原始数据集的 label
4 | # test_ratio:测试数据集占原始数据集的比例
5 | # seed:随机种子
6 | def train_test_split(X, y, test_ratio=0.2, seed=None):
7 |     """将数据X和y按照test_ratio分割成X_train, X_test, y_train, y_test"""
8 |     assert X.shape[0] == y.shape[0], \
9 |         "the size of X must be equal to the size of y"
10 |     assert 0.0 <= test_ratio <= 1.0, \
11 |         "test_ratio must be valid"
12 | 
13 |     if seed is not None:
14 |         np.random.seed(seed)
15 | 
16 |     shuffled_indexes = np.random.permutation(len(X))
17 |     test_size = int(len(X) * test_ratio)  # 测试数据集大小
18 |     test_indexes = shuffled_indexes[:test_size]
19 |     train_indexes = shuffled_indexes[test_size:]
20 | 
21 |     X_train = X[train_indexes]  # 训练数据集
22 |     y_train = y[train_indexes]
23 | 
24 |     X_test = X[test_indexes]  # 测试数据集
25 |     y_test = y[test_indexes]
26 | 
27 |     return X_train, X_test, y_train, y_test
--------------------------------------------------------------------------------
/self-defining/preprocessing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | class StandardScaler:
4 |     def __init__(self):
5 |         self.mean_ = None
6 |         self.scale_ = None
7 |     def fit(self, X):
8 |         """根据训练数据集X获得数据的均值和标准差,只处理二维数据"""
9 |         assert X.ndim == 2, "The dimension of X must be 2"
10 |         self.mean_ = np.array([np.mean(X[:, i]) for i in range(X.shape[1])])
11 |         self.scale_ = np.array([np.std(X[:, i]) for i in range(X.shape[1])])
12 | 
13 |         return self
14 | 
15 |     def transform(self, X):
16 |         """将X根据这个StandardScaler进行均值方差归一化处理"""
17 |         assert X.ndim == 2, "The dimension of X must be 2"
18 |         # fit必须在transform之前执行,所以mean_和scale_必须是非空的
19 |         assert self.mean_ is not None and self.scale_ is not None, \
20 |             "must fit before transform!"
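    # 补充说明(编辑添加,非原实现):下面的逐列计算与一次 NumPy 广播运算等价,
    # 例如 resX = (X - self.mean_) / self.scale_,结果相同,仅作参考。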
21 | assert X.shape[1] == len(self.mean_), \ 22 | "the feature nunmber of X must be equal to mean_ and std_" 23 | resX = np.empty(shape=X.shape, dtype=float) 24 | for col in range(X.shape[1]): 25 | resX[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col] 26 | return resX 27 | -------------------------------------------------------------------------------- /self-defining/KNN.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # from math import sqrt 3 | # from collections import Counter 4 | # 5 | # def kNN_classify(k, X_train, y_train, x): 6 | # assert 1 <= k <= X_train.shape[0], "k must be valid" 7 | # assert X_train.shape[0] == y_train.shape[0], \ 8 | # "the size of X_train must equal to the size of y_train" 9 | # assert X_train.shape[1] == x.shape[0], \ 10 | # "the feature number of x must be equal to X_train" 11 | # 12 | # distance = [ sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train] 13 | # nearest = np.argsort(distance) 14 | # 15 | # topK_y = [ y_train[i] for i in nearest[:k] ] 16 | # votes = Counter(topK_y) 17 | # 18 | # return votes.most_common(1)[0][0] 19 | 20 | # 重新整理kNN算法 21 | import numpy as np 22 | from collections import Counter 23 | from math import sqrt 24 | from metrics import accuracy_score 25 | 26 | class kNNClassifier: 27 | def __init__(self, k): 28 | """初始化kNN分类器""" 29 | assert k >= 1, "k must be valid" 30 | self.k = k 31 | self._X_tarin = None 32 | self._y_train = None 33 | def fit(self, X_train, y_train): 34 | """根据训练数据集X_train和y_train训练kNN分类器""" 35 | self._X_train = X_train 36 | self._y_train = y_train 37 | return self 38 | def predict(self, X_predict): 39 | """给定待预测数据集X_predict,返回表示X_predict的结果向量""" 40 | y_predict = [self._predict(x) for x in X_predict] 41 | return np.array(y_predict) 42 | def _predict(self, x): 43 | """给定单个待预测数据x,返回x_predict的预测结果值""" 44 | distances = [ sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train ] 45 | nearest = np.argsort(distances) 46 | topK_y = [ self._y_train[i] for i in nearest[:self.k] ] 47 | votes = Counter(topK_y) 48 | 49 | return votes.most_common(1)[0][0] 50 | def score(self, X_test, y_test): 51 | """根据测试数据集 X_test 和 y_test 确定当前模型的准确度""" 52 | y_predict = self.predict(X_test) 53 | return accuracy_score(y_test, y_predict) 54 | def __repr__(self): 55 | return "KNN(k=%d)" % self.k -------------------------------------------------------------------------------- /self-defining/PCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class PCA: 4 | def __init__(self, n_components): 5 | """初始化PCA""" 6 | assert n_components >= 1, "n_components must be valid" 7 | self.n_components = n_components 8 | self.components_ = None 9 | def fit(self, X, eta=0.01, n_iters=1e4): 10 | """获得数据集X的前n个主成分""" 11 | assert self.n_components <= X.shape[1], \ 12 | "n_components must not be greater than the feature number of X" 13 | def deamen(X): # 均值归为0 14 | return X - np.mean(X, axis=0) 15 | def f(w, X): 16 | return np.sum((X.dot(w)) ** 2) / len(X) 17 | def df(w, X): 18 | return X.T.dot(X.dot(w)) * 2. 
/ len(X) 19 | def direction(w): 20 | return w / np.linalg.norm(w) 21 | def first_component(X, initial_w, eta, n_iters=1e4, epsilon=1e-8): 22 | w = direction(initial_w) 23 | i_iter = 0 24 | while i_iter < n_iters: 25 | gradient = df(w, X) 26 | last_w = w 27 | w = w + eta * gradient 28 | w = direction(w) 29 | 30 | if(abs(f(w, X) - f(last_w, X)) < epsilon): 31 | break 32 | i_iter += 1 33 | return w 34 | X_pca = deamen(X) 35 | self.components_ = np.empty(shape=(self.n_components, X.shape[1])) 36 | for i in range(self.n_components): 37 | initial_w = np.random.random(X_pca.shape[1]) 38 | w = first_component(X_pca, initial_w, eta, n_iters) 39 | self.components_[i, :] = w 40 | X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w 41 | return self 42 | def transform(self, X): 43 | """将给定的X,映射到各个主成分分量中""" 44 | assert X.shape[1] == self.components_.shape[1] 45 | 46 | return X.dot(self.components_.T) 47 | def inverse_transform(self, X): 48 | """将给定的X,反向映射会原来的特征空间""" 49 | assert X.shape[1] == self.components_.shape[0] 50 | return X.dot(self.components_) 51 | def __repr__(self): 52 | return "PCA(n_components=%d)" % self.n_components -------------------------------------------------------------------------------- /self-defining/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from metrics import accuracy_score 3 | 4 | class LogisticRegression: 5 | def __init__(self): 6 | """初始化 Linear Regression""" 7 | self.coef_ = None # 系数 8 | self.interception_ = None # 截距 9 | self._theta = None # θ 10 | def _sigmoid(self, t): 11 | return 1. / (1. + np.exp(-t)) 12 | def fit(self, X_train, y_train, eta=0.01, n_iters=1e4): 13 | """根据训练数据集X_train,y_train,使用梯度下降法训练Logistic Regression模型""" 14 | assert X_train.shape[0] == y_train.shape[0], \ 15 | "the size of X_train, y_train must be equal to the size of y_train" 16 | def J(theta, X_b, y): 17 | y_hat = self._sigmoid(X_b.dot(theta)) 18 | try: 19 | return -np.sum(y*np.log(y_hat) + (1-y)*np.log(1- y_hat)) / len(y) 20 | except: 21 | return float('inf') 22 | def dJ(theta, X_b, y): 23 | return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b) 24 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8): 25 | theta = initial_theta 26 | i_iter = 0 27 | while i_iter < n_iters: 28 | gradient = dJ(theta, X_b, y) 29 | last_theta = theta 30 | theta = theta - eta * gradient 31 | 32 | if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon): 33 | break 34 | i_iter += 1 35 | return theta 36 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 37 | initial_theta = np.zeros(X_b.shape[1]) 38 | self._theta = gradient_descent(X_b, y_train, initial_theta, eta) 39 | 40 | self.interception_ = self._theta[0] 41 | self.coef_ = self._theta[1:] 42 | return self 43 | def predict_proba(self, X_predict): 44 | """给定待预测数据集X_predict,返回表示X_predict的结果概率向量""" 45 | assert self.interception_ is not None and self.coef_ is not None, \ 46 | "must fit before predict!" 47 | assert X_predict.shape[1] == len(self.coef_), \ 48 | "the feature number of X_predict must be equal to X_train" 49 | X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict]) 50 | return self._sigmoid(X_b.dot(self._theta)) 51 | def predict(self, X_predict): 52 | """给定待预测数据集X_predict,返回表示X_predict的结果向量""" 53 | assert self.interception_ is not None and self.coef_ is not None, \ 54 | "must fit before predict!" 
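        # 补充说明(编辑添加,非原实现):predict 将 predict_proba 返回的概率以 0.5 为阈值
        # 转成 0/1 标签,例如 proba = [0.2, 0.7, 0.5] 会得到 [0, 1, 1];
        # 由于 sigmoid(0) = 0.5,这等价于以 θ·x_b = 0 作为决策边界。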
55 | assert X_predict.shape[1] == len(self.coef_), \ 56 | "the feature number of X_predict must be equal to X_train" 57 | proba = self.predict_proba(X_predict) 58 | return np.array(proba >= 0.5, dtype='int') 59 | def score(self, X_test, y_test): 60 | """根据测试数据集X_test和y_test确定当前模型的准确度""" 61 | y_predict = self.predict(X_test) 62 | return accuracy_score(y_test, y_predict) 63 | def __repr__(self): 64 | return "LogisticRegression()" -------------------------------------------------------------------------------- /SimpleLinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SimpleLinearRegression1: 4 | def __init__(self): 5 | """初始化 Simple Linear Regression 模型""" 6 | self.a_ = None 7 | self.b_ = None 8 | def fit(self, x_train, y_train): 9 | """根据训练数据集 x_train, y_train训练模型""" 10 | assert x_train.ndim == 1, \ 11 | "Simple Linear Regression can only solve single feature training data" 12 | assert len(x_train) == len(y_train), \ 13 | "the size of x_train must be equal to the size of y_train" 14 | 15 | x_mean = np.mean(x_train) 16 | y_mean = np.mean(y_train) 17 | 18 | num = 0.0 19 | d = 0.0 20 | for x, y in zip(x_train, y_train): 21 | num += (x - x_mean) * (y - y_mean) 22 | d += (x - x_mean) ** 2 23 | self.a_ = num / d 24 | self.b_ = y_mean - self.a_ * x_mean 25 | 26 | return self 27 | def predict(self, x_predict): # x_predict 为一个向量 28 | """给定预测数据集x_predict, 返回表示x_predict的结果向量""" 29 | assert x_predict.ndim == 1, \ 30 | "Simple Linear Regression can only solve single feature training data" 31 | assert self.a_ is not None and self.b_ is not None, \ 32 | "must fit before predict!" 33 | return np.array([self._predict(x) for x in x_predict]) 34 | def _predict(self, x_single): # x_single 为一个数 35 | """给定单个预测数据x_single, 返回x_single的预测结果值""" 36 | return self.a_ * x_single + self.b_ 37 | def __repr__(self): 38 | return "SimpleLinearRegression1()" 39 | 40 | # 使用向量化运算 41 | # 只需要改变 fit 函数 42 | class SimpleLinearRegression2: 43 | def __init__(self): 44 | """初始化 Simple Linear Regression 模型""" 45 | self.a_ = None 46 | self.b_ = None 47 | def fit(self, x_train, y_train): 48 | """根据训练数据集 x_train, y_train训练模型""" 49 | assert x_train.ndim == 1, \ 50 | "Simple Linear Regression can only solve single feature training data" 51 | assert len(x_train) == len(y_train), \ 52 | "the size of x_train must be equal to the size of y_train" 53 | 54 | x_mean = np.mean(x_train) 55 | y_mean = np.mean(y_train) 56 | 57 | num = (x_train - x_mean).dot(y_train - y_mean) #分子点乘 58 | d = (x_train - x_mean).dot(x_train - x_mean) #分母点乘 59 | 60 | self.a_ = num / d 61 | self.b_ = y_mean - self.a_ * x_mean 62 | 63 | return self 64 | def predict(self, x_predict): # x_predict 为一个向量 65 | """给定预测数据集x_predict, 返回表示x_predict的结果向量""" 66 | assert x_predict.ndim == 1, \ 67 | "Simple Linear Regression can only solve single feature training data" 68 | assert self.a_ is not None and self.b_ is not None, \ 69 | "must fit before predict!" 
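        # 用法示意(编辑添加的假设示例,非原仓库内容):
        #     reg = SimpleLinearRegression2()
        #     reg.fit(np.array([1., 2., 3.]), np.array([2., 4., 6.]))
        #     reg.a_, reg.b_                 # -> 2.0, 0.0
        #     reg.predict(np.array([4.]))    # -> array([8.])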
70 | return np.array([self._predict(x) for x in x_predict]) 71 | def _predict(self, x_single): # x_single 为一个数 72 | """给定单个预测数据x_single, 返回x_single的预测结果值""" 73 | return self.a_ * x_single + self.b_ 74 | def __repr__(self): 75 | return "SimpleLinearRegression2()" 76 | -------------------------------------------------------------------------------- /self-defining/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import sqrt 3 | 4 | def accuracy_score(y_true, y_predict): 5 | """计算y_true和y_predict之间的准确率""" 6 | assert y_true.shape[0] == y_predict.shape[0], \ 7 | "the size of y_true must be equal to the size of y_predict" 8 | return sum(y_true == y_predict) / len(y_true) 9 | 10 | # MSE:均方误差 11 | def mean_squared_error(y_true, y_predict): 12 | """计算y_true与y_predict之阿的MSE""" 13 | assert len(y_true) == len(y_predict), \ 14 | "the size of y_true must be equal to the size of y_predict" 15 | return np.sum((y_true - y_predict) ** 2) / len(y_true) 16 | 17 | # RMSE:均方根误差 18 | def root_mean_squared_error(y_true, y_predict): 19 | """计算y_true与y_predict之阿的RMSE""" 20 | return sqrt(mean_squared_error(y_true, y_predict)) 21 | 22 | # MAE:平均绝对误差 23 | def mean_absolute_error(y_true, y_predict): 24 | """计算y_true与y_predict之阿的MAE""" 25 | assert len(y_true) == len(y_predict), \ 26 | "the size of y_true must be equal to the size of y_predict" 27 | return np.sum(np.absolute(y_true - y_predict)) / len(y_true) 28 | 29 | # R Square 30 | def r2_score(y_true, y_predict): 31 | """计算y_true和y_predict之间的R Square""" 32 | return 1 - mean_squared_error(y_true, y_predict) / np.var(y_true) 33 | 34 | def TN(y_true, y_predict): 35 | assert len(y_true) == len(y_predict) 36 | return np.sum((y_true == 0) & (y_predict == 0)) 37 | def FP(y_true, y_predict): 38 | assert len(y_true) == len(y_predict) 39 | return np.sum((y_true == 0) & (y_predict == 1)) 40 | def FN(y_true, y_predict): 41 | assert len(y_true) == len(y_predict) 42 | return np.sum((y_true == 1) & (y_predict == 0)) 43 | def TP(y_true, y_predict): 44 | assert len(y_true) == len(y_predict) 45 | return np.sum((y_true == 1) & (y_predict == 1)) 46 | # 混淆矩阵 47 | def confusion_matrix(y_true, y_predict): 48 | return np.array([ 49 | [TN(y_true, y_predict), FP(y_true, y_predict)], 50 | [FN(y_true, y_predict), TP(y_true, y_predict)] 51 | ]) 52 | # 精准率 53 | def precision_score(y_true, y_predict): 54 | assert len(y_true) == len(y_predict) 55 | tp = TP(y_true, y_predict) 56 | fp = FP(y_true, y_predict) 57 | try: 58 | return tp / (tp + fp) 59 | except: 60 | return 0.0 61 | # 召回率 62 | def recall_score(y_true, y_predict): 63 | assert len(y_true) == len(y_predict) 64 | tp = TP(y_true, y_predict) 65 | fn = FN(y_true, y_predict) 66 | try: 67 | return tp / (tp + fn) 68 | except: 69 | return 0.0 70 | # f1_score 71 | def f1_score(y_true, y_predict): 72 | precision = precision_score(y_true, y_predict) 73 | recall = recall_score(y_true, y_predict) 74 | try: 75 | return 2. 
* precision * recall / (precision + recall) 76 | except: 77 | return 0.0 78 | def TPR(y_true, y_predict): 79 | tp = TP(y_true, y_predict) 80 | fn = FN(y_true, y_predict) 81 | try: 82 | return tp / (tp + fn) 83 | except: 84 | return 0.0 85 | def FPR(y_true, y_predict): 86 | tn = TN(y_true, y_predict) 87 | fp = FP(y_true, y_predict) 88 | try: 89 | return fp / (tn + fp) 90 | except: 91 | return 0.0 -------------------------------------------------------------------------------- /Machine-learning-algorithm/78 SVM思想解决回归问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## SVM思想解决回归问题" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from sklearn import datasets\n", 27 | "\n", 28 | "boston = datasets.load_boston()\n", 29 | "X = boston.data\n", 30 | "y = boston.target" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.model_selection import train_test_split\n", 40 | "\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.svm import LinearSVR\n", 51 | "from sklearn.svm import SVR\n", 52 | "from sklearn.preprocessing import StandardScaler\n", 53 | "from sklearn.pipeline import Pipeline\n", 54 | "\n", 55 | "def StandardLinearSVR(epsilon=0.1):\n", 56 | " return Pipeline([\n", 57 | " (\"std_scaler\", StandardScaler()),\n", 58 | " (\"linearSVR\", LinearSVR(epsilon=epsilon))\n", 59 | " ])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "Pipeline(memory=None,\n", 71 | " steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('linearSVR', LinearSVR(C=1.0, dual=True, epsilon=0.1, fit_intercept=True,\n", 72 | " intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,\n", 73 | " random_state=None, tol=0.0001, verbose=0))])" 74 | ] 75 | }, 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "svr = StandardLinearSVR()\n", 83 | "svr.fit(X_train, y_train)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 7, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "0.6357989242240044" 95 | ] 96 | }, 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "svr.score(X_test, y_test)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | 
"nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.0" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /self-defining/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from metrics import r2_score 3 | 4 | class LinearRegression: 5 | def __init__(self): 6 | """初始化 Linear Regression""" 7 | self.coef_ = None # 系数 8 | self.interception_ = None # 截距 9 | self._theta = None # θ 10 | def fit_normal(self, X_train, y_train): 11 | """根据训练数据集X_train,y_train训练Linear Regression模型""" 12 | assert X_train.shape[0] == y_train.shape[0], \ 13 | "the size of X_train must be equal to the size of y_train" 14 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) # 在 X_train 前加一列 1 15 | self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train) 16 | 17 | self.interception_ = self._theta[0] #截距 18 | self.coef_ = self._theta[1:] #系数 19 | 20 | return self 21 | def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4): 22 | """根据训练数据集X_train,y_train,使用梯度下降法训练Linear Regression模型""" 23 | assert X_train.shape[0] == y_train.shape[0], \ 24 | "the size of X_train, y_train must be equal to the size of y_train" 25 | def J(theta, X_b, y): 26 | try: 27 | return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b) 28 | except: 29 | return float('inf') 30 | def dJ(theta, X_b, y): 31 | # res = np.empty(len(theta)) 32 | # res[0] = np.sum(X_b.dot(theta) - y) 33 | # for i in range(1, len(theta)): 34 | # res[i] = np.sum((X_b.dot(theta) - y).dot(X_b[:, i])) 35 | # return res * 2 / len(X_b) 36 | return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(X_b) 37 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8): 38 | theta = initial_theta 39 | i_iter = 0 40 | while i_iter < n_iters: 41 | gradient = dJ(theta, X_b, y) 42 | last_theta = theta 43 | theta = theta - eta * gradient 44 | 45 | if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon): 46 | break 47 | i_iter += 1 48 | return theta 49 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 50 | initial_theta = np.zeros(X_b.shape[1]) 51 | self._theta = gradient_descent(X_b, y_train, initial_theta, eta) 52 | 53 | self.interception_ = self._theta[0] 54 | self.coef_ = self._theta[1:] 55 | return self 56 | 57 | def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50): 58 | """根据训练数据集X_train,y_train, 使用梯度下降法训练LinearRegression模型""" 59 | assert X_train.shape[0] == y_train.shape[0], \ 60 | "the size of X_train must be equal to the size of y_train" 61 | assert n_iters >= 1 # 至少将样本看一次 62 | def dJ_sgd(theta, X_b_i, y_i): 63 | return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2 64 | def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50): 65 | def learning_rate(t): 66 | return t0 / (t + t1) 67 | theta = initial_theta 68 | m = len(X_b) 69 | for i_iters in range(n_iters): #至少要将将我们的样本个数(m)看5遍 70 | # 为了保证每一遍都能够遍历了所有的样本数,我们先将下标乱序,然后依次进行遍历 71 | # 这样既保证了随机性,又保证了能够遍历到每一个样本 72 | indexes = np.random.permutation(m) 73 | X_b_new = X_b[indexes] 74 | y_new = y[indexes] 75 | for i in range(m): 76 | gradient = dJ_sgd(theta, X_b_new[i], y_new[i]) 77 | theta = theta - learning_rate(i_iters * m + i) * gradient 78 | return theta 79 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 80 | initial_theta = np.zeros(X_b.shape[1]) 81 | self._theta = sgd(X_b, y_train, initial_theta, n_iters) 82 | self.interception_ = self._theta[0] 83 | self.coef_ = self._theta[1:] 84 | 
return self 85 | def predict(self, X_predict): 86 | """给定待预测数据集X_predict,返回表示X_predict的结果向量""" 87 | assert self.interception_ is not None and self.coef_ is not None, \ 88 | "must fit before predict!" 89 | assert X_predict.shape[1] == len(self.coef_), \ 90 | "the feature number of X_predict must be equal to X_train" 91 | X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict]) 92 | return X_b.dot(self._theta) 93 | def score(self, X_test, y_test): 94 | """根据测试数据集X_test和y_test确定当前模型的准确度""" 95 | y_predict = self.predict(X_test) 96 | return r2_score(y_test, y_predict) 97 | def __repr__(self): 98 | return "LinearRegression()" -------------------------------------------------------------------------------- /self-defining/SimpleLinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from metrics import r2_score 3 | # 使用向量化运算 4 | class SimpleLinearRegression: 5 | def __init__(self): 6 | """初始化 Simple Linear Regression 模型""" 7 | self.a_ = None 8 | self.b_ = None 9 | def fit(self, x_train, y_train): 10 | """根据训练数据集 x_train, y_train训练模型""" 11 | assert x_train.ndim == 1, \ 12 | "Simple Linear Regression can only solve single feature training data" 13 | assert len(x_train) == len(y_train), \ 14 | "the size of x_train must be equal to the size of y_train" 15 | 16 | x_mean = np.mean(x_train) 17 | y_mean = np.mean(y_train) 18 | 19 | num = (x_train - x_mean).dot(y_train - y_mean) #分子点乘 20 | d = (x_train - x_mean).dot(x_train - x_mean) #分母点乘 21 | 22 | self.a_ = num / d 23 | self.b_ = y_mean - self.a_ * x_mean 24 | 25 | return self 26 | def predict(self, x_predict): # x_predict 为一个向量 27 | """给定预测数据集x_predict, 返回表示x_predict的结果向量""" 28 | assert x_predict.ndim == 1, \ 29 | "Simple Linear Regression can only solve single feature training data" 30 | assert self.a_ is not None and self.b_ is not None, \ 31 | "must fit before predict!" 32 | return np.array([self._predict(x) for x in x_predict]) 33 | def _predict(self, x_single): # x_single 为一个数 34 | """给定单个预测数据x_single, 返回x_single的预测结果值""" 35 | return self.a_ * x_single + self.b_ 36 | def score(self, x_test, y_test): 37 | """根据测试数据集x_test和y_test确定当前模型的准确度""" 38 | y_predict = self.predict(x_test) 39 | return r2_score(y_test, y_predict) 40 | def __repr__(self): 41 | return "SimpleLinearRegression()" 42 | 43 | # import numpy as np 44 | # 45 | # class SimpleLinearRegression1: 46 | # def __init__(self): 47 | # """初始化 Simple Linear Regression 模型""" 48 | # self.a_ = None 49 | # self.b_ = None 50 | # def fit(self, x_train, y_train): 51 | # """根据训练数据集 x_train, y_train训练模型""" 52 | # assert x_train.ndim == 1, \ 53 | # "Simple Linear Regression can only solve single feature training data" 54 | # assert len(x_train) == len(y_train), \ 55 | # "the size of x_train must be equal to the size of y_train" 56 | # 57 | # x_mean = np.mean(x_train) 58 | # y_mean = np.mean(y_train) 59 | # 60 | # num = 0.0 61 | # d = 0.0 62 | # for x, y in zip(x_train, y_train): 63 | # num += (x - x_mean) * (y - y_mean) 64 | # d += (x - x_mean) ** 2 65 | # self.a_ = num / d 66 | # self.b_ = y_mean - self.a_ * x_mean 67 | # 68 | # return self 69 | # def predict(self, x_predict): # x_predict 为一个向量 70 | # """给定预测数据集x_predict, 返回表示x_predict的结果向量""" 71 | # assert x_predict.ndim == 1, \ 72 | # "Simple Linear Regression can only solve single feature training data" 73 | # assert self.a_ is not None and self.b_ is not None, \ 74 | # "must fit before predict!" 
75 | # return np.array([self._predict(x) for x in x_predict]) 76 | # def _predict(self, x_single): # x_single 为一个数 77 | # """给定单个预测数据x_single, 返回x_single的预测结果值""" 78 | # return self.a_ * x_single + self.b_ 79 | # def __repr__(self): 80 | # return "SimpleLinearRegression1()" 81 | # 82 | # # 使用向量化运算 83 | # # 只需要改变 fit 函数 84 | # class SimpleLinearRegression2: 85 | # def __init__(self): 86 | # """初始化 Simple Linear Regression 模型""" 87 | # self.a_ = None 88 | # self.b_ = None 89 | # def fit(self, x_train, y_train): 90 | # """根据训练数据集 x_train, y_train训练模型""" 91 | # assert x_train.ndim == 1, \ 92 | # "Simple Linear Regression can only solve single feature training data" 93 | # assert len(x_train) == len(y_train), \ 94 | # "the size of x_train must be equal to the size of y_train" 95 | # 96 | # x_mean = np.mean(x_train) 97 | # y_mean = np.mean(y_train) 98 | # 99 | # num = (x_train - x_mean).dot(y_train - y_mean) #分子点乘 100 | # d = (x_train - x_mean).dot(x_train - x_mean) #分母点乘 101 | # 102 | # self.a_ = num / d 103 | # self.b_ = y_mean - self.a_ * x_mean 104 | # 105 | # return self 106 | # def predict(self, x_predict): # x_predict 为一个向量 107 | # """给定预测数据集x_predict, 返回表示x_predict的结果向量""" 108 | # assert x_predict.ndim == 1, \ 109 | # "Simple Linear Regression can only solve single feature training data" 110 | # assert self.a_ is not None and self.b_ is not None, \ 111 | # "must fit before predict!" 112 | # return np.array([self._predict(x) for x in x_predict]) 113 | # def _predict(self, x_single): # x_single 为一个数 114 | # """给定单个预测数据x_single, 返回x_single的预测结果值""" 115 | # return self.a_ * x_single + self.b_ 116 | # def __repr__(self): 117 | # return "SimpleLinearRegression2()" 118 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/05 超参数.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 超参数" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from sklearn import datasets" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "digits = datasets.load_digits()\n", 27 | "X = digits.data\n", 28 | "y = digits.target" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.model_selection import train_test_split\n", 38 | "\n", 39 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.neighbors import KNeighborsClassifier" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "0.9888888888888889" 60 | ] 61 | }, 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n", 69 | "knn_clf.fit(X_train, y_train)\n", 70 | "knn_clf.score(X_test, y_test)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 寻找最好的k" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 
| "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "best_k = 4\n", 90 | "best_score = 0.9916666666666667\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "best_score = 0.0\n", 96 | "best_k = -1\n", 97 | "for k in range(1, 11):\n", 98 | " knn_clf = KNeighborsClassifier(n_neighbors=k)\n", 99 | " knn_clf.fit(X_train, y_train)\n", 100 | " score = knn_clf.score(X_test, y_test)\n", 101 | " if score > best_score:\n", 102 | " best_k = k;\n", 103 | " best_score = score\n", 104 | "\n", 105 | "print(\"best_k = \", best_k)\n", 106 | "print(\"best_score = \", best_score)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## 考虑距离?不考虑距离?" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "best_method = uniform\n", 126 | "best_k = 4\n", 127 | "best_score = 0.9916666666666667\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "best_method = \"\"\n", 133 | "best_score = 0.0\n", 134 | "best_k = -1\n", 135 | "for method in [\"uniform\", \"distance\"]: # uniform:不考虑距离,distance:考虑距离\n", 136 | " for k in range(1, 11):\n", 137 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)\n", 138 | " knn_clf.fit(X_train, y_train)\n", 139 | " score = knn_clf.score(X_test, y_test)\n", 140 | " if score > best_score:\n", 141 | " best_k = k;\n", 142 | " best_score = score\n", 143 | " best_method = method\n", 144 | "print(\"best_method = \", best_method)\n", 145 | "print(\"best_k = \", best_k)\n", 146 | "print(\"best_score = \", best_score)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## 搜索明可夫斯基距离相应的 p" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "best_p = 2\n", 166 | "best_k = 3\n", 167 | "best_score = 0.9888888888888889\n", 168 | "Wall time: 17 s\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "%%time\n", 174 | "best_p = -1\n", 175 | "best_score = 0.0\n", 176 | "best_k = -1\n", 177 | "\n", 178 | "for k in range(1, 11):\n", 179 | " for p in range(1, 6):\n", 180 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights=\"distance\", p=p)\n", 181 | " knn_clf.fit(X_train, y_train)\n", 182 | " score = knn_clf.score(X_test, y_test)\n", 183 | " if score > best_score:\n", 184 | " best_k = k;\n", 185 | " best_score = score\n", 186 | " best_p = p\n", 187 | "print(\"best_p = \", best_p)\n", 188 | "print(\"best_k = \", best_k)\n", 189 | "print(\"best_score = \", best_score)" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 3", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.7.0" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/25 随机梯度下降法.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 随机梯度下降法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from sklearn import datasets" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### 1. 模拟数据" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "m = 100000 # 样本数\n", 34 | "\n", 35 | "x = np.random.normal(size=m)\n", 36 | "X = x.reshape(-1, 1) # 特征数只有一个\n", 37 | "y = 4.* x + 3. + np.random.normal(0, 3, size=m)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def J(theta, X_b, y):\n", 47 | " try:\n", 48 | " return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)\n", 49 | " except:\n", 50 | " return float('inf')\n", 51 | "def dJ(theta, X_b, y):\n", 52 | " return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)\n", 53 | "def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):\n", 54 | " theta = initial_theta\n", 55 | " i_iters = 0\n", 56 | " \n", 57 | " while i_iters < n_iters:\n", 58 | " gradient = dJ(theta, X_b, y)\n", 59 | " last_theta = theta\n", 60 | " theta = theta - eta * gradient\n", 61 | " \n", 62 | " if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n", 63 | " break\n", 64 | " i_iters += 1\n", 65 | " return theta" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 8, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Wall time: 442 ms\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "%%time\n", 83 | "X_b = np.hstack([np.ones((len(X), 1)), X])\n", 84 | "initial_theta = np.zeros(X_b.shape[1])\n", 85 | "eta = 0.01\n", 86 | "theta = gradient_descent(X_b, y, initial_theta, eta)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 9, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "array([3.00933084, 3.99019421])" 98 | ] 99 | }, 100 | "execution_count": 9, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "theta" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "从结果可以看出,斜率大致为 4,截距大致为 3,和我们设计的数据是拟合的" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### 使用随机梯度下降法" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 10, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# X_b_i:传入的是 X_b 的第 i 行\n", 130 | "# y_i:传入的是 y 的第 i 个数值\n", 131 | "def dJ_sgd(theta, X_b_i, y_i):\n", 132 | " return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 11, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "def sgd(X_b, y, initial_theta, n_iters):\n", 142 | " t0 = 5\n", 143 | " t1= 50\n", 144 | " # 学习率函数\n", 145 | " def learning_rate(t):\n", 146 | " return t0 / (t + t1)\n", 147 | " theta = initial_theta\n", 148 | " for i_iter in range(n_iters):\n", 149 | " rand_i = np.random.randint(len(X_b)) # 随机生成一个下标\n", 150 | " gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])\n", 151 | " theta = theta - learning_rate(i_iter) * gradient\n", 152 | " return theta" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 12, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Wall time: 187 ms\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "%%time\n", 170 | "X_b = np.hstack([np.ones((len(X), 1)), X])\n", 171 | "initial_theta = np.zeros(X_b.shape[1])\n", 172 | "theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 13, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "array([2.9904752 , 3.95833456])" 184 | ] 185 | }, 186 | "execution_count": 13, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "theta" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "  我们从结果可以发现,使用随机梯度下降法得到的也差不多是3和4,但是时间却比梯度下降的442ms快很多,而且我们只循环了1/3次样本数就得到了比较准确的结果,比梯度下降法一次循环的次数还要少。\n", 200 | "\n", 201 | "  但是,我们在实际对高维的样本应用随机梯度下降法的时候可能不能这么随意的只是用三分之一的样本数,在这里只是一个例子展示它的强大之处。" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.0" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/63 F1 Score.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## F1 Score" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "def f1_score(precision, recall):\n", 26 | " try:\n", 27 | " return 2 * precision * recall / (precision + recall)\n", 28 | " except:\n", 29 | " return 0.0" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0.5" 41 | ] 42 | }, 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "precision = 0.5\n", 50 | "recall = 0.5\n", 51 | "f1_score(precision, recall)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | 
"text/plain": [ 62 | "0.18000000000000002" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "precision = 0.1\n", 72 | "recall = 0.9\n", 73 | "f1_score(precision, recall)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "如果此时是算术平均值,得到的结果还是0.5,但是使用调和平均值得到的结果为0.18,远远小于0.5,所以这也是调和平均值的优势。当其中某一个值特别小的时候,F1 Score 的值也特别小。" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "0.0" 92 | ] 93 | }, 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "precision = 0.0\n", 101 | "recall = 1.0\n", 102 | "f1_score(precision, recall)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## 使用真实数据" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "from sklearn import datasets\n", 119 | "\n", 120 | "digits = datasets.load_digits()\n", 121 | "X = digits.data\n", 122 | "y = digits.target.copy()\n", 123 | "\n", 124 | "y[digits.target==9] = 1\n", 125 | "y[digits.target!=9] = 0" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "from sklearn.model_selection import train_test_split\n", 135 | "\n", 136 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "0.9755555555555555" 148 | ] 149 | }, 150 | "execution_count": 11, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "from sklearn.linear_model import LogisticRegression\n", 157 | "\n", 158 | "log_reg = LogisticRegression()\n", 159 | "log_reg.fit(X_train, y_train)\n", 160 | "log_reg.score(X_test, y_test)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 12, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "y_predict = log_reg.predict(X_test)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 13, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "array([[403, 2],\n", 181 | " [ 9, 36]], dtype=int64)" 182 | ] 183 | }, 184 | "execution_count": 13, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "from sklearn.metrics import confusion_matrix\n", 191 | "# 混淆矩阵\n", 192 | "confusion_matrix(y_test, y_predict)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 14, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "0.9473684210526315" 204 | ] 205 | }, 206 | "execution_count": 14, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "from sklearn.metrics import precision_score\n", 213 | "# 精准率\n", 214 | "precision_score(y_test, y_predict)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 15, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "0.8" 226 | ] 227 | }, 228 | "execution_count": 15, 229 | "metadata": {}, 230 | 
"output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "from sklearn.metrics import recall_score\n", 235 | "# 召回率\n", 236 | "recall_score(y_test, y_predict)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### 计算 F1 Score" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 16, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "0.8674698795180723" 255 | ] 256 | }, 257 | "execution_count": 16, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "from sklearn.metrics import f1_score\n", 264 | "\n", 265 | "f1_score(y_test, y_predict)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "我们发现 F1 Score 远远要低于分类准确度求出来的 97.5%,而且也要低于精准率的 94.7%,这是因为 F1 Score 综合了精准率和召回率。所以在这种情况下,F1 Score 是能更好的反映算法的准确度的。" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.7.0" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/27 如何调试梯度.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 如何调试梯度" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### 1. 模拟数据" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 12, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "np.random.seed(666)\n", 34 | "X = np.random.random(size=(1000, 10)) # 有1000个样本,每个样本有10个特征" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 13, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])" 46 | ] 47 | }, 48 | "execution_count": 13, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "true_theta = np.arange(1, 12, dtype=float) #随机生成11个θ值\n", 55 | "true_theta" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 14, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "X_b = np.hstack([np.ones((len(X), 1)), X])\n", 65 | "y = X_b.dot(true_theta) + np.random.normal(size=1000)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 15, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "(1000, 10)\n", 78 | "(1000,)\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "print(X.shape)\n", 84 | "print(y.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "### 2. 
定义损失函数 J" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 16, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "def J(theta, X_b, y):\n", 101 | " try:\n", 102 | " return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)\n", 103 | " except:\n", 104 | " return float('inf')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### 3. 定义导数" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 17, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def dJ_math(theta, X_b, y): # 使用数学公式计算导数\n", 121 | " return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)\n", 122 | "\n", 123 | "def dJ_debug(theta, X_b, y, epsilon=0.01): # 使用调试的方法求导数\n", 124 | " res = np.empty(len(theta))\n", 125 | " for i in range(len(theta)):\n", 126 | " theta_1 = theta.copy()\n", 127 | " theta_1[i] += epsilon\n", 128 | " theta_2 = theta.copy()\n", 129 | " theta_2[i] -= epsilon\n", 130 | " res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)\n", 131 | " return res" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### 3. 使用梯度下降法进行验证\n", 139 | "\n", 140 | "这里使用批量梯度下降法" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 20, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):\n", 150 | " theta = initial_theta\n", 151 | " i_iter = 0\n", 152 | " \n", 153 | " while i_iter < n_iters:\n", 154 | " gradient = dJ(theta, X_b, y)\n", 155 | " last_theta = theta\n", 156 | " theta = theta - eta * gradient\n", 157 | " \n", 158 | " if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n", 159 | " break\n", 160 | " i_iter += 1\n", 161 | " return theta" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 22, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Wall time: 2.6 s\n" 174 | ] 175 | }, 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968, 5.05002117,\n", 180 | " 5.90494046, 6.97383745, 8.00088367, 8.86213468, 9.98608331,\n", 181 | " 10.90529198])" 182 | ] 183 | }, 184 | "execution_count": 22, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "# 使用 dJ_debug\n", 191 | "X_b = np.hstack([np.ones((len(X), 1)), X])\n", 192 | "initial_theta = np.zeros(X_b.shape[1])\n", 193 | "eta = 0.01\n", 194 | "\n", 195 | "%time theta = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)\n", 196 | "theta" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "theta大概的结果就是1,2,3,4...11 这样的数,和我们模拟的 true_theta 是吻合的。" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 23, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Wall time: 439 ms\n" 216 | ] 217 | }, 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968, 5.05002117,\n", 222 | " 5.90494046, 6.97383745, 8.00088367, 8.86213468, 9.98608331,\n", 223 | " 10.90529198])" 224 | ] 225 | }, 226 | "execution_count": 23, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "# 使用 dJ_math\n", 233 | "%time theta = gradient_descent(dJ_math, X_b, y, 
initial_theta, eta)\n", 234 | "theta" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "可以看到使用数学公式速度快了很多,而且结果也比较准确。" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.7.0" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 2 271 | } 272 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/02 scikit-learn 中的 kNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## scikit-learn 中的 kNN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from sklearn.neighbors import KNeighborsClassifier" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 9, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "kNN_classifier = KNeighborsClassifier(n_neighbors=6) #创建算法所对应的实例,n_neighbors就是传入的k" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 10, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import numpy as np\n", 35 | "import matplotlib.pyplot as plt" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 11, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "raw_data_X = [[3.393533211, 2.331273381],\n", 45 | " [3.110073483, 1.781539638],\n", 46 | " [1.343808831, 3.368360954],\n", 47 | " [3.582294042, 4.679179110],\n", 48 | " [2.280362439, 2.866990263],\n", 49 | " [7.423436942, 4.696522875],\n", 50 | " [5.745051997, 3.533989803],\n", 51 | " [9.172168622, 2.511101045],\n", 52 | " [7.792783481, 3.424088941],\n", 53 | " [7.939820817, 0.791637231]\n", 54 | " ] # 原始数据特征\n", 55 | "raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] # 原始数据的label,即所属类别,可以理解为0为良性肿瘤,1为恶性肿瘤" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 12, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "X_train = np.array(raw_data_X)\n", 65 | "y_train = np.array(raw_data_y)\n", 66 | "x = np.array([8.093607318, 3.365731514])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 13, 72 | "metadata": { 73 | "scrolled": true 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 80 | " metric_params=None, n_jobs=1, n_neighbors=6, p=2,\n", 81 | " weights='uniform')" 82 | ] 83 | }, 84 | "execution_count": 13, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "kNN_classifier.fit(X_train, y_train) #拟合训练数据集,fit函数具有返回值" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 15, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "x_predict = x.reshape(1, -1)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 16, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 
| "array([1])" 111 | ] 112 | }, 113 | "execution_count": 16, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "kNN_classifier.predict(x_predict) # predict中希望传入一个矩阵" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 17, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "array([1])" 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "y_predict = kNN_classifier.predict(x_predict)\n", 140 | "y_predict" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 18, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "1" 152 | ] 153 | }, 154 | "execution_count": 18, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "y_predict[0]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### 重新整理我们的kNN的代码" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 39, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "%run D://python-code/KNN.py" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 40, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "knn_clf = kNNClassifier(k=6)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 41, 191 | "metadata": { 192 | "scrolled": true 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "KNN(k=6)" 199 | ] 200 | }, 201 | "execution_count": 41, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "knn_clf.fit(X_train, y_train)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 42, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "array([[8.09360732, 3.36573151]])" 219 | ] 220 | }, 221 | "execution_count": 42, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "x_predict" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 43, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "y_predict = knn_clf.predict(x_predict)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 44, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "array([1])" 248 | ] 249 | }, 250 | "execution_count": 44, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "y_predict" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 45, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/plain": [ 267 | "1" 268 | ] 269 | }, 270 | "execution_count": 45, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "y_predict[0]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | 
"name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.7.0" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 2 308 | } 309 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/62 实现混淆矩阵,精准率和召回率.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 实现混淆矩阵,精准率和召回率" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from sklearn import datasets" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 23, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "digits = datasets.load_digits()\n", 27 | "X = digits.data\n", 28 | "y = digits.target.copy()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 24, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# 使数据变得极度偏斜\n", 38 | "y[digits.target==9] = 1\n", 39 | "y[digits.target!=9] = 0" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 使用逻辑回归" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 25, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.model_selection import train_test_split\n", 56 | "\n", 57 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 26, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.9755555555555555" 69 | ] 70 | }, 71 | "execution_count": 26, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "from sklearn.linear_model import LogisticRegression\n", 78 | "\n", 79 | "log_reg = LogisticRegression()\n", 80 | "log_reg.fit(X_train, y_train)\n", 81 | "log_reg.score(X_test, y_test)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 27, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "y_log_predict = log_reg.predict(X_test)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 28, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "403" 102 | ] 103 | }, 104 | "execution_count": 28, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "def TN(y_true, y_predict):\n", 111 | " assert len(y_true) == len(y_predict)\n", 112 | " return np.sum((y_true == 0) & (y_predict == 0))\n", 113 | "\n", 114 | "TN(y_test, y_log_predict)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 29, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def TP(y_true, y_predict):\n", 124 | " assert len(y_true) == len(y_predict)\n", 125 | " return np.sum((y_true == 1) & (y_predict == 1))\n", 126 | "def FN(y_true, y_predict):\n", 127 | " assert len(y_true) == len(y_predict)\n", 128 | " return np.sum((y_true == 1) & (y_predict == 0))\n", 129 | "def FP(y_true, y_predict):\n", 130 | " assert len(y_true) == len(y_predict)\n", 131 | " return np.sum((y_true == 0) & (y_predict == 1))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 31, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 
| "36\n", 144 | "9\n", 145 | "2\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "print(TP(y_test, y_log_predict))\n", 151 | "print(FN(y_test, y_log_predict))\n", 152 | "print(FP(y_test, y_log_predict))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 32, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "array([[403, 2],\n", 164 | " [ 9, 36]])" 165 | ] 166 | }, 167 | "execution_count": 32, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "# 混淆矩阵\n", 174 | "def confusion_matrix(y_true, y_predict):\n", 175 | " return np.array([\n", 176 | " [TN(y_test, y_predict), FP(y_test, y_predict)],\n", 177 | " [FN(y_test, y_predict), TP(y_test, y_predict)]\n", 178 | " ])\n", 179 | "confusion_matrix(y_test, y_log_predict)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 33, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "0.9473684210526315" 191 | ] 192 | }, 193 | "execution_count": 33, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "# 精准率\n", 200 | "def precision_score(y_true, y_predict):\n", 201 | " tp = TP(y_true, y_predict)\n", 202 | " fp = FP(y_true, y_predict)\n", 203 | " try:\n", 204 | " return tp / (tp + fp)\n", 205 | " except:\n", 206 | " return 0.0\n", 207 | " \n", 208 | "precision_score(y_test, y_log_predict)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 34, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "0.8" 220 | ] 221 | }, 222 | "execution_count": 34, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# 召回率\n", 229 | "def recall_score(y_true, y_predict):\n", 230 | " tp = TP(y_true, y_predict)\n", 231 | " fn = FN(y_true, y_predict)\n", 232 | " try:\n", 233 | " return tp / (tp + fn)\n", 234 | " except:\n", 235 | " return 0.0\n", 236 | " \n", 237 | "recall_score(y_test, y_log_predict)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## sklearn中混淆矩阵,精准率和召回率" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 35, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "array([[403, 2],\n", 256 | " [ 9, 36]], dtype=int64)" 257 | ] 258 | }, 259 | "execution_count": 35, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "from sklearn.metrics import confusion_matrix\n", 266 | "\n", 267 | "confusion_matrix(y_test, y_log_predict)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 36, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "0.9473684210526315" 279 | ] 280 | }, 281 | "execution_count": 36, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "from sklearn.metrics import precision_score\n", 288 | "\n", 289 | "precision_score(y_test, y_log_predict)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 37, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "0.8" 301 | ] 302 | }, 303 | "execution_count": 37, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "from sklearn.metrics import recall_score\n", 310 | "\n", 311 | 
"recall_score(y_test, y_log_predict)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.7.0" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 2 343 | } 344 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/18 线性回归的可解释性和更多思考.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 更多关于线性回归模型的讨论" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from sklearn import datasets\n", 18 | "\n", 19 | "boston = datasets.load_boston()\n", 20 | "\n", 21 | "X = boston.data\n", 22 | "y = boston.target\n", 23 | "\n", 24 | "X = X[y < 50.0]\n", 25 | "y = y[y < 50.0]" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.linear_model import LinearRegression" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "这里不需要进行 train_test_split,因为接下来要做的不是进行预测,所以我们也不需要看预测准确度,那么我们就不需要测试数据集。我们直接对整个数据进行拟合。" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" 53 | ] 54 | }, 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "lin_reg = LinearRegression()\n", 62 | "lin_reg.fit(X, y)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "array([-1.05574295e-01, 3.52748549e-02, -4.35179251e-02, 4.55405227e-01,\n", 74 | " -1.24268073e+01, 3.75411229e+00, -2.36116881e-02, -1.21088069e+00,\n", 75 | " 2.50740082e-01, -1.37702943e-02, -8.38888137e-01, 7.93577159e-03,\n", 76 | " -3.50952134e-01])" 77 | ] 78 | }, 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "lin_reg.coef_ # 系数" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "我们看见这些系数有正有负,正负代表的就是样本的特征和我们最终要预测的那个目标也就是房价他们之间是正相关还是负相关。如果系数为正,说明是正相关,换句话说,这个特征越大,最终得到的房价就越高;而如果这个系数为负的话,相应的就是负相关,也就是这个特征越大,我们的房价就越便宜。而系数的绝对值的大小就决定了这个影响的程度。\n", 93 | "\n", 94 | "因此,我们在这里对系数进行一下排序。" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "array([ 4, 7, 10, 12, 0, 2, 6, 9, 11, 1, 8, 3, 5], dtype=int64)" 106 | ] 107 | }, 108 | "execution_count": 7, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "np.argsort(lin_reg.coef_) #默认从小到大排序,最小的为负相关程度最大的,最大值为正相关程度最大的" 115 | ] 116 
| }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 8, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", 126 | " 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype=' best_score:\n", 73 | " best_score = score\n", 74 | " best_p = p\n", 75 | " best_k = k\n", 76 | "print(\"best_k = \", best_k)\n", 77 | "print(\"best_p = \", best_p)\n", 78 | "print(\"best_score = \", best_score)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## 使用交叉验证进行超参数调整" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([0.98895028, 0.97777778, 0.96629213])" 97 | ] 98 | }, 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "from sklearn.model_selection import cross_val_score\n", 106 | "\n", 107 | "knn_clf = KNeighborsClassifier()\n", 108 | "cross_val_score(knn_clf, X_train, y_train)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "我们的结果得到一个数组,里面有三个score,说明cross默认将我们的训练数据集分成3份进行验证。" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "best_k = 2\n", 128 | "best_p = 2\n", 129 | "best_score = 0.9823599874006478\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "from sklearn.neighbors import KNeighborsClassifier\n", 135 | "\n", 136 | "best_score, best_p, best_k = 0, 0, 0\n", 137 | "for k in range(2, 11):\n", 138 | " for p in range(1, 6):\n", 139 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights=\"distance\", p=p)\n", 140 | " scores = cross_val_score(knn_clf, X_train, y_train) #得到一个数组\n", 141 | " score = np.mean(scores)\n", 142 | " if score > best_score:\n", 143 | " best_score = score\n", 144 | " best_p = p\n", 145 | " best_k = k\n", 146 | "print(\"best_k = \", best_k)\n", 147 | "print(\"best_p = \", best_p)\n", 148 | "print(\"best_score = \", best_score)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "这里得到的 best_score 就是我们最好的准确率吗?当然不是,我们进行交叉验证的目的只是为了拿到最好的 k 和 p 而已。" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.980528511821975" 167 | ] 168 | }, 169 | "execution_count": 9, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "best_knn_clf = KNeighborsClassifier(n_neighbors=2, weights=\"distance\", p=2)\n", 176 | "best_knn_clf.fit(X_train, y_train)\n", 177 | "best_knn_clf.score(X_test, y_test)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "此时得到的0.98才能说是我们模型的分类准确度" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "## 回顾网格搜索" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 11, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "Fitting 3 folds for each of 45 candidates, totalling 135 fits\n" 204 | ] 205 | }, 206 | { 207 | "name": "stderr", 208 | "output_type": "stream", 209 | "text": [ 210 | "[Parallel(n_jobs=1)]: 
Done 135 out of 135 | elapsed: 25.4s finished\n" 211 | ] 212 | }, 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "GridSearchCV(cv=None, error_score='raise',\n", 217 | " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 218 | " metric_params=None, n_jobs=1, n_neighbors=10, p=5,\n", 219 | " weights='distance'),\n", 220 | " fit_params={}, iid=True, n_jobs=1,\n", 221 | " param_grid=[{'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n", 222 | " pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)" 223 | ] 224 | }, 225 | "execution_count": 11, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "from sklearn.grid_search import GridSearchCV # GridSearchCV:这个CV就是交叉验证\n", 232 | "param_gird = [\n", 233 | " {\n", 234 | " \"weights\" : [\"distance\"],\n", 235 | " \"n_neighbors\": [i for i in range(2, 11)],\n", 236 | " \"p\" : [i for i in range(1, 6)]\n", 237 | " }\n", 238 | "]\n", 239 | "grid_search = GridSearchCV(knn_clf, param_gird, verbose=1)\n", 240 | "grid_search.fit(X_train, y_train)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 12, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}" 252 | ] 253 | }, 254 | "execution_count": 12, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "grid_search.best_params_" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 13, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "0.9823747680890538" 272 | ] 273 | }, 274 | "execution_count": 13, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "grid_search.best_score_" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 14, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "0.980528511821975" 292 | ] 293 | }, 294 | "execution_count": 14, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "best_knn_clf = grid_search.best_estimator_\n", 301 | "best_knn_clf.score(X_test, y_test)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 15, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "array([0.99543379, 0.96803653, 0.98148148, 0.96261682, 0.97619048])" 313 | ] 314 | }, 315 | "execution_count": 15, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "#交叉验证默认将训练数据集分成三份,如果我们想分成5份,里面有一个cv的参数\n", 322 | "cross_val_score(knn_clf, X_train, y_train, cv=5)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.7.0" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 2 354 | } 355 | 
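The grid search above still imports `GridSearchCV` from `sklearn.grid_search`, a module that was removed in later scikit-learn releases in favour of `sklearn.model_selection`. A minimal sketch of the same kNN search with the current import path and an explicit `cv` — an illustration assuming a digits split like the one used in this notebook, not a drop-in copy of it:

```python
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

digits = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.4, random_state=666)  # illustrative split

param_grid = [{
    "weights": ["distance"],
    "n_neighbors": list(range(2, 11)),
    "p": list(range(1, 6)),
}]

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)                            # best hyperparameters found by CV
print(grid_search.best_score_)                             # mean cross-validated accuracy on the training folds
print(grid_search.best_estimator_.score(X_test, y_test))   # accuracy on the held-out test set
```

As in the notebook, `best_score_` is a cross-validation average over the training folds, so it is not directly comparable to the final hold-out score.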
-------------------------------------------------------------------------------- /Machine-learning-algorithm/06 网格搜索与k近邻算法中更多的超参数.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 网格搜索" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from sklearn import datasets\n", 18 | "from sklearn.neighbors import KNeighborsClassifier\n", 19 | "from sklearn.model_selection import train_test_split" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "0.9916666666666667" 31 | ] 32 | }, 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "digits = datasets.load_digits()\n", 40 | "X = digits.data\n", 41 | "y = digits.target\n", 42 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)\n", 43 | "sk_knn_clf = KNeighborsClassifier(n_neighbors=4, weights=\"uniform\")\n", 44 | "sk_knn_clf.fit(X_train, y_train)\n", 45 | "sk_knn_clf.score(X_test, y_test)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Grid Search" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "### 1. 首先定义我们所需要的参数" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "param_grid = [\n", 69 | " {\n", 70 | " 'weights': ['uniform'],\n", 71 | " 'n_neighbors':[i for i in range(1, 11)]\n", 72 | " },\n", 73 | " {\n", 74 | " 'weights':['distance'],\n", 75 | " 'n_neighbors':[i for i in range(1, 11)],\n", 76 | " 'p': [i for i in range(1, 6)]\n", 77 | " }\n", 78 | "]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "knn_clf = KNeighborsClassifier()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn.model_selection import GridSearchCV # CV交叉验证\n", 97 | "\n", 98 | "grid_search = GridSearchCV(knn_clf, param_grid) # 定义网格搜索对象" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Wall time: 2min 16s\n" 111 | ] 112 | }, 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "GridSearchCV(cv=None, error_score='raise',\n", 117 | " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 118 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", 119 | " weights='uniform'),\n", 120 | " fit_params=None, iid=True, n_jobs=1,\n", 121 | " param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n", 122 | " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", 123 | " scoring=None, verbose=0)" 124 | ] 125 | }, 126 | "execution_count": 8, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "%%time\n", 133 | "grid_search.fit(X_train, y_train)" 134 | ] 135 | }, 136 | { 137 | 
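For reference (my note, not from the notebook): the two dictionaries in `param_grid` expand to 10 `uniform` candidates plus 10 × 5 `distance` candidates, i.e. 60 parameter combinations in total, which matches the "60 candidates" reported by the verbose run later in this notebook. `ParameterGrid` can be used to confirm the count:

```python
from sklearn.model_selection import ParameterGrid

param_grid = [
    {"weights": ["uniform"],  "n_neighbors": list(range(1, 11))},
    {"weights": ["distance"], "n_neighbors": list(range(1, 11)), "p": list(range(1, 6))},
]

# 10 (uniform) + 10 * 5 (distance with p) = 60 candidate models
print(len(ParameterGrid(param_grid)))
```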
"cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 145 | " metric_params=None, n_jobs=1, n_neighbors=3, p=3,\n", 146 | " weights='distance')" 147 | ] 148 | }, 149 | "execution_count": 9, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "grid_search.best_estimator_" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 10, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.9853862212943633" 167 | ] 168 | }, 169 | "execution_count": 10, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "grid_search.best_score_ # 似乎比之前的准确率低一些,因为评判标准不同" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}" 187 | ] 188 | }, 189 | "execution_count": 11, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "grid_search.best_params_ # 最佳参数" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 12, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "knn_clf = grid_search.best_estimator_" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 13, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "array([8, 1, 3, 4, 4, 0, 7, 0, 8, 0, 4, 6, 1, 1, 2, 0, 1, 6, 7, 3, 3, 6,\n", 216 | " 5, 2, 9, 4, 0, 2, 0, 3, 0, 8, 7, 2, 3, 5, 1, 3, 1, 5, 8, 6, 2, 6,\n", 217 | " 3, 1, 3, 0, 0, 4, 9, 9, 2, 8, 7, 0, 5, 4, 0, 9, 5, 5, 8, 7, 4, 2,\n", 218 | " 8, 8, 7, 5, 4, 3, 0, 2, 7, 2, 1, 2, 4, 0, 9, 0, 6, 6, 2, 0, 0, 5,\n", 219 | " 4, 4, 3, 1, 3, 8, 6, 4, 4, 7, 5, 6, 8, 4, 8, 4, 6, 9, 7, 7, 0, 8,\n", 220 | " 8, 3, 9, 7, 1, 8, 4, 2, 7, 0, 0, 4, 9, 6, 7, 3, 4, 6, 4, 8, 4, 7,\n", 221 | " 2, 6, 9, 5, 8, 7, 2, 5, 5, 9, 7, 9, 3, 1, 9, 4, 4, 1, 5, 1, 6, 4,\n", 222 | " 4, 8, 1, 6, 2, 5, 2, 1, 4, 4, 3, 9, 4, 0, 6, 0, 8, 3, 8, 7, 3, 0,\n", 223 | " 3, 0, 5, 9, 2, 7, 1, 8, 1, 4, 3, 3, 7, 8, 2, 7, 2, 2, 8, 0, 5, 7,\n", 224 | " 6, 7, 3, 4, 7, 1, 7, 0, 9, 2, 8, 9, 3, 8, 9, 1, 1, 1, 9, 8, 8, 0,\n", 225 | " 3, 7, 3, 3, 4, 8, 2, 1, 8, 6, 0, 1, 7, 7, 5, 8, 3, 8, 7, 6, 8, 4,\n", 226 | " 2, 6, 2, 3, 7, 4, 9, 3, 5, 0, 6, 3, 8, 3, 3, 1, 4, 5, 3, 2, 5, 6,\n", 227 | " 9, 6, 9, 5, 5, 3, 6, 5, 9, 3, 7, 7, 0, 2, 4, 9, 9, 9, 2, 5, 6, 1,\n", 228 | " 9, 6, 9, 7, 7, 4, 5, 0, 0, 5, 3, 8, 4, 4, 3, 2, 5, 3, 2, 2, 3, 0,\n", 229 | " 9, 8, 2, 1, 4, 0, 6, 2, 8, 0, 6, 4, 9, 9, 8, 3, 9, 8, 6, 3, 2, 7,\n", 230 | " 9, 4, 2, 7, 5, 1, 1, 6, 1, 0, 4, 9, 2, 9, 0, 3, 3, 0, 7, 4, 8, 5,\n", 231 | " 9, 5, 9, 5, 0, 7, 9, 8])" 232 | ] 233 | }, 234 | "execution_count": 13, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "knn_clf.predict(X_test)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "0.9833333333333333" 252 | ] 253 | }, 254 | "execution_count": 14, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "knn_clf.score(X_test, y_test)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 
266 | "source": [ 267 | "### grid_search中还可以传入更多参数" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 15, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "Fitting 3 folds for each of 60 candidates, totalling 180 fits\n" 280 | ] 281 | }, 282 | { 283 | "name": "stderr", 284 | "output_type": "stream", 285 | "text": [ 286 | "[Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 5.1s\n", 287 | "[Parallel(n_jobs=-1)]: Done 138 tasks | elapsed: 18.6s\n" 288 | ] 289 | }, 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Wall time: 24.9 s\n" 295 | ] 296 | }, 297 | { 298 | "name": "stderr", 299 | "output_type": "stream", 300 | "text": [ 301 | "[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 24.1s finished\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "%%time\n", 307 | "grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2) \n", 308 | "# n_jobs:并行计算,默认为1,单核;传入-1,默认使用全部的核\n", 309 | "# verbose:显示一些搜索信息,值越大,显示的信息越详细\n", 310 | "grid_search.fit(X_train, y_train)\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.7.0" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/50 什么是逻辑回归.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Sigmoid" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "def sigmoid(t):\n", 27 | " return 1 / (1 + np.exp(-t))" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "[]" 39 | ] 40 | }, 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | }, 45 | { 46 | "data": { 47 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAH8RJREFUeJzt3Xt83HWd7/HXJ/dekl6T3i/UFmi5YyggKpVCLaiU9YFYVtcLal324OXselbUfbBe9pzj6jm7Z32IuhVZELmqKBWrLVQQFIGmFwptKA1t2qSXJG1C0zbNZWY+54+ZwhAmzaSZyW/ml/fz8Ugzv9/vOzOf/ObXd375zm++X3N3REQkXAqCLkBERDJP4S4iEkIKdxGREFK4i4iEkMJdRCSEFO4iIiGkcBcRCSGFu4hICCncRURCqCioJ544caLPnj07qKcXEclLGzZsOOjulf21CyzcZ8+eTU1NTVBPLyKSl8xsdzrt1C0jIhJCCncRkRBSuIuIhJDCXUQkhBTuIiIh1G+4m9mdZtZsZi/1sd3M7HtmVmdmW8zswsyXKSIiA5HOmftdwNKTbL8amJf4WgH8cPBliYjIYPR7nbu7P2Vms0/SZBnwU4/P1/esmY01synuvj9DNYpISMViTlckRmdPlM5IlEjU6YnGiMacnqgTicWIxJzIidtRT2yLt4nEnJg77uA4sRjxZcAT62Oeap0T8zeWHX+jXdLMo8nTkL5pfcp1b237pklMkxovnj+J82aMHezuO6lMfIhpGtCQtNyYWPeWcDezFcTP7pk5c2YGnlpEguLuvNbRQ/ORLlqOdHHoWBftnRHaj/fQ3tlD+/EI7Z09HEms6+yJJr5iHE/c7orEgv4xhpRZ/HtVRVlehLulWJdy1m13XwmsBKiurtbM3CI5LBpz9rYdZ3frMfa0drDnUAd7WjvY99pxmo90cfBoFz3R1P+NSwoLqBhRTEVZEeWJ71XlpZQVF1JWXEBZcSEjigspTSyPKC6krLiQ4sICigqMokKLfy8ooLDQKC4ooLDAKC60xPcCigqNQjMKCgwDCswoMMOMxJdRYPH1RnzZ7EQ7MAwr4C33JbF8QtLNN4WdJTa8ed1btwclE+HeCMxIWp4O7MvA44rIEOmKRHlp72FebDzMyweOULu/ne1NR+jseePMuqSwgOnjRzBt7AjmVpVTVVFK5ehSKstLqSovZcLoUipGFFFRVkxZcWGAP41AZsJ9FXCLmT0AXAwcVn+7SG7rjsSoqW/l6bqD1NS38kLjYboTXSTjRhYzf0oFf71wFmdMHs2sCaOYOX4kkyvKKCgI9mxU0tdvuJvZ/cAiYKKZNQL/DBQDuPuPgNXANUAd0AF8MlvFisipa+/sYe3WJtbVNvH0joMc7YpQVGCcPW0MH790Fm+fNZ7zZ4xlUkVp4F0KMnjpXC1zYz/bHfhvGatIRDImGnOe3N7Mw5v28vi2JroiMSZXlPGB86bwnjOquGzuREaVBjY4rGSRXlWREGrv7OGh9Q3c/Zd6GlqPM35UCcsvmsF1F0zj/BljdWY+DCjcRUKkvbOHO57exZ1/2sXRrggXzR7HV66ez1ULJlFcqNFGhhOFu0gIdEWi3PXnen74x1d5raOHq8+ezN8tmss508cEXZoEROEukueeqTvIP/36JXYePMblp1fypSVnKNRF4S6Sr452RfjGqq38fEMjM8eP5O6bFnL56f1OrSnDhMJdJA9t3NPGFx/YTGNbB3+36G18fvE8fXBI3kThLpJH3J2fPbubb/xmG5Mqynjws5dy0ezxQZclOUjhLpInuiMxvv6brdz33B6uOLOKf//w+YwZURx0WZKjFO4ieaCjO8Jn79nA0zsOcvOit/GlJWdQqKEA5CQU7iI57vDxHm66az2b9rTxnevP5YbqGf3fSYY9hbtIDjvc0cONP36WHc1H+MFHLmTp2VOCLknyhMJdJEd1dEf45F3PU9d8lB9/rJpFZ1QFXZLkEX0eWSQHdUWifPaeDWxueI3v3Xi+gl0GTGfuIjnG3fnqwy/x9I6DfOf6c9UVI6dEZ+4iOeYnf9rFLzc28oXF8/TmqZwyhbtIDvnjKy38r9W1XH32ZL6weF7Q5UgeU7iL5Ih9rx3n8/dv4vRJ5fyfD52nKe1kUBTuIjkgGnO++OBmItEYP/ro2zU7kgyajiCRHHD7E3U8v6uVf7vhPGZPHBV0ORICOnMXCdimPW38x7odXHf+VD544fSgy5GQULiLBKg7EuPLv9zCpPJSvnXd2UGXIyGibhmRAP3gyTpeaTrKnZ+oprxMIzxK5ujMXSQgO5qOcPsTdVx73lSuOHNS0OVIyCjcRQLg7nz1Vy8yurSIf/7AgqDLkRBSuIsE4NEt+1lf38Y/Lj2TCaNLgy5HQkjhLjLEOnuifPt3L7NgSoWGF5CsUbiLDLGVT+1k72vH+ecPLNBsSpI1CneRIdTU3skPn3yVa86ZzMVzJgRdjoSYwl1kCH3/D3X0RGPcunR+0KVIyCncRYZIQ2sHD6zfw4cvmsHMCSODLkdCLq1wN7OlZrbdzOrM7NYU22ea2RNmtsnMtpjZNZkvVSS/fW/dDsyMz12hoXwl+/oNdzMrBG4HrgYWADeaWe8Lc/8JeMjdLwCWAz/IdKEi+Wxny1F+ubGRv7lkFpPHlAVdjgwD6Zy5LwTq3H2nu3cDDwDLerVxoCJxewywL3MliuS/763bQWlRITcvelvQpcgwkc7YMtOAhqTlRuDiXm2+Dqw1s88Bo4ArM1KdSAg0tHbwmy37uemy2UzUB5ZkiKRz5p7qQlzvtXwjcJe7TweuAe4xs7c8tpmtMLMaM6tpaWkZeLUieejHT++kwOBT75wTdCkyjKQT7o1A8sfopvPWbpdPAQ8BuPtfgDJgYu8HcveV7l7t7tWVlZWnVrFIHjl4tIsH1zfwwQumq69dhlQ64b4emGdmp5lZCfE3TFf1arMHWAxgZvOJh7tOzWXYu+vP9XRHY6y4XGftMrT6DXd3jwC3AGuAWuJXxWw1s2+a2bWJZv8AfMbMXgDuBz7h7r27bkSGlWNdEX76l3quPnsyb6scHXQ5MsykNVmHu68GVvdad1vS7W3AZZktTSS/PbyxkfbOCJ9+l87aZejpE6oiWRCLOXc9U895M8Zy4cxxQZcjw5DCXSQL/lR3kFdbjvGJd8wKuhQZphTuIllw1zP1TBxdyjXnTAm6FBmmFO4iGVZ/8BhPbG/mIxfPpLSoMOhyZJhSuItk2M+e3U2hGR+5eGbQpcgwpnAXyaCuSJSHN+1lyVmTqKrQh5YkOAp3kQx6bFsTrce6+fBFOmuXYCncRTLowfUNTBs7gnfOfcvoGyJDSuEukiENrR08veMgN1TP0MTXEjiFu0iGPFTTgBl8qHp60KWIKNxFMiESjfHzmkYuP72SqWNHBF2OiMJdJBP++EoLB9o7WX7RjP4biwwBhbtIBvxiQyMTRpWweP6koEsRARTuIoN2+HgP62qb+cB5Uyku1H8pyQ06EkUG6Xcv7qc7GuODF04LuhSR1y
ncRQbp4U17mVM5inOmjQm6FJHXKdxFBqGxrYPnd7XyV+dPw0zXtkvuULiLDMIjm+NzxS87X10yklsU7iKnyN351aa9VM8ax8wJI4MuR+RNFO4ip2jrvnbqmo/yV3ojVXKQwl3kFP16016KC433abYlyUEKd5FTEIs5q1/cz7vnVTJ2ZEnQ5Yi8hcJd5BRsaniNfYc7ef95OmuX3KRwFzkFv92yn5KiAq7UcAOSoxTuIgOU3CVTXlYcdDkiKSncRQZoU0MbB9o7ef+56pKR3KVwFxmgRxNdMovnVwVdikifFO4iA3CiS2bR6eqSkdymcBcZgI172mhq7+J96pKRHKdwFxmAN7pkdJWM5DaFu0iaTnTJvOeMSkaXFgVdjshJpRXuZrbUzLabWZ2Z3dpHmxvMbJuZbTWz+zJbpkjwNuxpo/lIF9douAHJA/2efphZIXA7cBXQCKw3s1Xuvi2pzTzgK8Bl7t5mZrqMQEJn7dYDlBQWcMWZOrwl96Vz5r4QqHP3ne7eDTwALOvV5jPA7e7eBuDuzZktUyRY7s7abU28Y+4EXSUjeSGdcJ8GNCQtNybWJTsdON3M/mxmz5rZ0lQPZGYrzKzGzGpaWlpOrWKRALzSdJTdhzpYsmBy0KWIpCWdcE81d5j3Wi4C5gGLgBuBO8xs7Fvu5L7S3avdvbqysnKgtYoEZu3WA5jBlQvUJSP5IZ1wbwRmJC1PB/alaPOIu/e4+y5gO/GwFwmFtduauGDGWKrKy4IuRSQt6YT7emCemZ1mZiXAcmBVrza/Bt4DYGYTiXfT7MxkoSJB2ffacV7ce5glZ6lLRvJHv+Hu7hHgFmANUAs85O5bzeybZnZtotka4JCZbQOeAP6Hux/KVtEiQ+mxbU0ALFmgDy5J/kjrkxjuvhpY3WvdbUm3Hfj7xJdIqKzddoC5VaOZUzk66FJE0qZPqIqcxOGOHp7d2aqzdsk7CneRk/jD9iaiMVd/u+QdhbvISazd2sSkilLOnTYm6FJEBkThLtKHzp4of3ylhasWTKKgINXHPURyl8JdpA9/rjtIR3dUn0qVvKRwF+nD2q1NlJcWccmcCUGXIjJgCneRFKIx5/HaJt5zZhUlRfpvIvlHR61IChv3tHHoWDdLztIlkJKfFO4iKZwYu/3y0zXAneQnhbtILxq7XcJA4S7Si8ZulzBQuIv0orHbJQwU7iK9aOx2CQOFu0gSjd0uYaFwF0misdslLBTuIkk0druEhcJdJEFjt0uYKNxFEjR2u4SJwl0kQWO3S5go3EWIj93+5PYWrpyvsdslHBTuIsCfdhzkeE+U96pLRkJC4S5C/CqZ8jKN3S7hoXCXYS8SjfF4bTNXaOx2CREdyTLsbdjdRuuxbg0UJqGicJdhb+22JkqKCrj8DI3dLuGhcJdhzd1Zs/UA75w7kdGlRUGXI5IxCncZ1mr3H6Gx7bg+lSqho3CXYW3ttvjY7YvnK9wlXBTuMqyt2dpE9axxVJaXBl2KSEYp3GXYamjtoHZ/u66SkVBKK9zNbKmZbTezOjO79STtrjczN7PqzJUokh1rT4zdfpa6ZCR8+g13MysEbgeuBhYAN5rZghTtyoHPA89lukiRbFiz9QBnTi5n1oRRQZciknHpnLkvBOrcfae7dwMPAMtStPsW8B2gM4P1iWTFoaNd1NRr7HYJr3TCfRrQkLTcmFj3OjO7AJjh7o9msDaRrFlX20zM0djtElrphHuq8U/99Y1mBcC/A//Q7wOZrTCzGjOraWlpSb9KkQxbs/UA08aO4KypFUGXIpIV6YR7IzAjaXk6sC9puRw4G3jSzOqBS4BVqd5UdfeV7l7t7tWVlfqotwSjvbOHp3ccZOnZkzHT2O0STumE+3pgnpmdZmYlwHJg1YmN7n7Y3Se6+2x3nw08C1zr7jVZqVhkkNbVNtEdjXHNOVOCLkUka/oNd3ePALcAa4Ba4CF332pm3zSza7NdoEim/XbLASZXlHHBjLFBlyKSNWmNlOTuq4HVvdbd1kfbRYMvSyQ7jnT28NSOFj5y8UxNpyehpk+oyrCyrraZ7kiM96lLRkJO4S7DyuoX9zO5oowLZ44LuhSRrFK4y7BxtCvCk6+0sPTsyeqSkdBTuMuwsa62ie6IrpKR4UHhLsPG6hf3U1VeSvUsdclI+CncZVg41hXhye0tXK0uGRkmFO4yLKx7uZkudcnIMKJwl2HhkU17mVxRxkWzxwddisiQULhL6LUe6+aPr7Rw7flT1SUjw4bCXULvty/uJxJzlp0/NehSRIaMwl1C75FNe5lXNZoFUzS8rwwfCncJtYbWDmp2t3HdBdM0vK8MKwp3CbVVL8SnHrj2PHXJyPCicJfQcnd+vWkv1bPGMWP8yKDLERlSCncJrdr9R9jRfFRvpMqwpHCX0PrlxkaKC433natwl+FH4S6h1B2J8atNe7ly/iTGjyoJuhyRIadwl1BaV9tE67FubrhoRv+NRUJI4S6h9GBNA5Mrynj3vMqgSxEJhMJdQmf/4eM89UoL1799OoUabkCGKYW7hM7DG/cSc/hQ9fSgSxEJjMJdQiUWcx6qaeCSOeOZNWFU0OWIBEbhLqHy3K5Wdh/q4MN6I1WGOYW7hMq9z+2moqyIpWdpUg4Z3hTuEhrN7Z38/qUD3FA9gxElhUGXIxIohbuExn3P7yEScz56yaygSxEJnMJdQqEnGuO+5/aw6IxKZk/UG6kiCncJhTVbD9B8pIuPXaqzdhFQuEtI3P1MPTPHj+Ty06uCLkUkJyjcJe9tbniN9fVtfOzSWfpEqkiCwl3y3sqnXqW8rIjlC2cGXYpIzkgr3M1sqZltN7M6M7s1xfa/N7NtZrbFzNaZmTo+ZUjUHzzG7186wEcvmcXo0qKgyxHJGf2Gu5kVArcDVwMLgBvNbEGvZpuAanc/F/gF8J1MFyqSyh1/2klRQQGffMfsoEsRySnpnLkvBOrcfae7dwMPAMuSG7j7E+7ekVh8FtCITZJ1h4528fOaRq67YCpVFWVBlyOSU9IJ92lAQ9JyY2JdXz4F/C7VBjNbYWY1ZlbT0tKSfpUiKdzxp110R2OsePecoEsRyTnphHuqyw88ZUOzjwLVwHdTbXf3le5e7e7VlZWaREFO3aGjXdz9TD3vO2cKc6vKgy5HJOek8w5UI5A8xN50YF/vRmZ2JfA14HJ378pMeSKprXx6J8d7onxh8bygSxHJSemcua8H5pnZaWZWAiwHViU3MLMLgP8ErnX35syXKfKGg0e7+Okzu/nAuVOZN0ln7SKp9Bvu7h4BbgHWALXAQ+6+1cy+aWbXJpp9FxgN/NzMNpvZqj4eTmTQVj61k65IlM/rrF2kT2ldGOzuq4HVvdbdlnT7ygzXJZJSQ2sHdz1Tz3XnT2Nu1eigyxHJWfqEquSV767ZjgFfeu8ZQZciktMU7pI3Nje8xqoX9vHpd53G1LEjgi5HJKcp3CUvuDv/8ug2Jo4u4eZFc4MuRyTnKdwlL6x6YR81u9v471edrjFkRNKgcJecd7ijh289uo3zpo9h+UUa+VEkHToFkpz37d+/TFtHD
3fftFDjtYukSWfuktNq6lu5//k93HTZbM6aOibockTyhsJdclZHd4R//MUWpo0dwRevPD3ockTyirplJGf9z9/WsuvQMe799MWM0puoIgOiM3fJSetqm7j3uT2seNcc3vG2iUGXI5J3FO6Scw4c7uQff7GF+VMq+Psl6o4RORUKd8kpXZEoN9+7geM9Ub63/HxKiwqDLkkkL6kjU3LKN36zjU17XuOHH7lQw/mKDILO3CVn/OzZ3dz33B5uXvQ2rj5nStDliOQ1hbvkhDVbD3DbIy9xxZlVfGmJRnwUGSyFuwRufX0rn79/E+dOH8v3//oCfQpVJAMU7hKojXvauOmu9UwbN4I7P3ERI0v0NpBIJijcJTDP72rlb+54jvGjSrjnUxczflRJ0CWJhIZOkyQQT25v5uafbWTK2DLu/8wlTKooC7okkVDRmbsMubufqeemu9Yze+IoHlxxqYJdJAt05i5DpisS5V8ereWeZ3dz5fwq/mP5BRozRiRL9D9LhsSug8f43P0beWlvOyvePYcvLz1TV8WIZJHCXbIqFnPue34P/3t1LcVFBfz4Y9VctWBS0GWJhJ7CXbJmR9MRvvqrF1lf38Zlcyfw3evPY+rYEUGXJTIsKNwl45raO/l/j7/Cg+sbKC8r5rvXn8v1b5+OmbphRIaKwl0ypqm9k//6cz13P1NPJBbjY5fO5nNXzGXC6NKgSxMZdhTuMmhb9x3mv/5czyOb9xKNOe8/dyr/sOR0Zk0YFXRpIsOWwl1OycGjXazavI9fbGhk2/52RhQX8tcLZ3LTO09TqIvkAIW7pMXd2XnwGI9va+Lx2iY27G4j5nDOtDF849qzWHb+VMaO1PABIrlC4S4pRWPOzpajrK9v4/ldh3h+Vyv7DncCcNbUCj53xTyuOWcKZ0zWhBoiuSitcDezpcB/AIXAHe7+7V7bS4GfAm8HDgEfdvf6zJYq2eDuNB/pov7gMV5tOcbWfYfZtr+dl/cf4XhPFIDK8lIWnjaem+dMYPGZVbqcUSQP9BvuZlYI3A5cBTQC681slbtvS2r2KaDN3eea2XLgX4EPZ6NgSV8kGuPw8R6aj3TR1N5J85EuWhK39x/uZM+hDna3HqOzJ/b6fcrLilgwpYLlC2dw1tQxVM8ax6wJI3UZo0ieSefMfSFQ5+47AczsAWAZkBzuy4CvJ27/Avi+mZm7ewZrzVuxmBOJOdGYE4nFEt/jyz3RNy9Hoon1sRg9kRidkRjHu6N0RaIc747S2RPleE+Mzp7o619Hu6IcPt5De2cP7ccTX50RjnZFUtYzZkQxkypKmTl+FO+aN5FZE0cxe8JIZk8YxfRxIxTkIiGQTrhPAxqSlhuBi/tq4+4RMzsMTAAOZqLIZA+tb+A/n3qV139rOHj8eV9f5w6Ox78n/Xo50ebE9jfanmjXe50nbUt6Difpud78mK+3TfwTicWIZelXXGlRAWXFhYwqKaRiRDFjRhQzY/xIxowopqIsvjxmRBGTKsqoqiilqryMyvJSyooLs1OQiOSMdMI91Wlc77hKpw1mtgJYATBz5sw0nvqtxo0q4czJFWBvPKmZYYClWBdvZ4l1JLUzTpygWqLhm+//RhtLPB4ptp344c3e/JyJR6S40CgsMIoKjKLCAooK3lguLEhaLjSKCgre2FZoFBcUMKIkHuBlxYWMSPpeWlRAgQbeEpE+pBPujcCMpOXpwL4+2jSaWREwBmjt/UDuvhJYCVBdXX1K57NXLZikgadERPqRzmQd64F5ZnaamZUAy4FVvdqsAj6euH098Af1t4uIBKffM/dEH/otwBril0Le6e5bzeybQI27rwJ+AtxjZnXEz9iXZ7NoERE5ubSuc3f31cDqXutuS7rdCXwos6WJiMip0hyqIiIhpHAXEQkhhbuISAgp3EVEQkjhLiISQhbU5ehm1gLsPsW7TyQLQxtkgOoaGNU1cLlam+oamMHUNcvdK/trFFi4D4aZ1bh7ddB19Ka6BkZ1DVyu1qa6BmYo6lK3jIhICCncRURCKF/DfWXQBfRBdQ2M6hq4XK1NdQ1M1uvKyz53ERE5uXw9cxcRkZPI2XA3sw+Z2VYzi5lZda9tXzGzOjPbbmbv7eP+p5nZc2a2w8weTAxXnOkaHzSzzYmvejPb3Ee7ejN7MdGuJtN1pHi+r5vZ3qTarumj3dLEPqwzs1uHoK7vmtnLZrbFzH5lZmP7aDck+6u/n9/MShOvcV3iWJqdrVqSnnOGmT1hZrWJ4/8LKdosMrPDSa/vbakeKwu1nfR1sbjvJfbXFjO7cAhqOiNpP2w2s3Yz+2KvNkO2v8zsTjNrNrOXktaNN7PHEln0mJmN6+O+H0+02WFmH0/VZkDcPSe/gPnAGcCTQHXS+gXAC0ApcBrwKlCY4v4PAcsTt38E3Jzlev8vcFsf2+qBiUO4774OfKmfNoWJfTcHKEns0wVZrmsJUJS4/a/Avwa1v9L5+YG/A36UuL0ceHAIXrspwIWJ2+XAKynqWgQ8OlTHU7qvC3AN8Dvic5FdAjw3xPUVAgeIXwceyP4C3g1cCLyUtO47wK2J27emOu6B8cDOxPdxidvjBlNLzp65u3utu29PsWkZ8IC7d7n7LqCO+CTer7P4nHhXEJ+sG+Bu4Lps1Zp4vhuA+7P1HFnw+sTn7t4NnJj4PGvcfa27n5i1+1nis3oFJZ2ffxnxYwfix9Jiy/Ls4e6+3903Jm4fAWqJz1GcD5YBP/W4Z4GxZjZlCJ9/MfCqu5/qhyMHzd2f4q2z0CUfR31l0XuBx9y91d3bgMeApYOpJWfD/SRSTdjd++CfALyWFCSp2mTSu4Amd9/Rx3YH1prZhsQ8skPhlsSfxnf28WdgOvsxm24ifpaXylDsr3R+/jdN/A6cmPh9SCS6gS4Ankux+VIze8HMfmdmZw1RSf29LkEfU8vp+wQriP11wiR33w/xX95AVYo2Gd93aU3WkS1m9jgwOcWmr7n7I33dLcW6U5qwOx1p1ngjJz9rv8zd95lZFfCYmb2c+A1/yk5WF/BD4FvEf+ZvEe8yuqn3Q6S476AvnUpnf5nZ14AIcG8fD5Px/ZWq1BTrsnYcDZSZjQZ+CXzR3dt7bd5IvOvhaOL9lF8D84agrP5elyD3VwlwLfCVFJuD2l8DkfF9F2i4u/uVp3C3dCbsPkj8T8KixBlXqjYZqdHiE4J/EHj7SR5jX+J7s5n9iniXwKDCKt19Z2Y/Bh5NsSmd/ZjxuhJvFL0fWOyJzsYUj5Hx/ZVCxiZ+zzQzKyYe7Pe6+8O9tyeHvbuvNrMfmNlEd8/qGCppvC5ZOabSdDWw0d2bem8Ian8laTKzKe6+P9FN1ZyiTSPx9wZOmE78/cZTlo/dMquA5YkrGU4j/hv4+eQGidB4gvhk3RCfvLuvvwQG60rgZXdvTLXRzEaZWfmJ28TfVHwpVdtM6dXP+Vd9PF86E59nuq6lwJeBa929o482Q7W/cnLi90Sf/k+AWnf/tz7aTD7R929mC4n/Pz6U5brSeV1WAR9LXDVzCXD4RHfEEOjzr+cg9lcvycdRX1m0Blhi
ZuMS3ahLEutO3VC8g3wqX8RDqRHoApqANUnbvkb8SoftwNVJ61cDUxO35xAP/Trg50Bpluq8C/jbXuumAquT6ngh8bWVePdEtvfdPcCLwJbEgTWld12J5WuIX43x6hDVVUe8X3Fz4utHvesayv2V6ucHvkn8lw9AWeLYqUscS3OGYB+9k/if41uS9tM1wN+eOM6AWxL75gXib0y/YwjqSvm69KrLgNsT+/NFkq5yy3JtI4mH9ZikdYHsL+K/YPYDPYn8+hTx92nWATsS38cn2lYDdyTd96bEsVYHfHKwtegTqiIiIZSP3TIiItIPhbuISAgp3EVEQkjhLiISQgp3EZEQUriLiISQwl1EJIQU7iIiIfT/AfDvTn1iOeRyAAAAAElFTkSuQmCC\n", 48 | "text/plain": [ 49 | "
" 50 | ] 51 | }, 52 | "metadata": { 53 | "needs_background": "light" 54 | }, 55 | "output_type": "display_data" 56 | } 57 | ], 58 | "source": [ 59 | "x = np.linspace(-10, 10, 500)\n", 60 | "y = sigmoid(x)\n", 61 | "\n", 62 | "plt.plot(x, y)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.7.0" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 2 94 | } 95 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/26 sklearn中的随机梯度下降法.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 使用我们自己的SGD\n", 8 | "\n", 9 | "SGD:随机梯度下降法" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### 使用模拟的数据" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "m = 100000\n", 36 | "\n", 37 | "x = np.random.normal(size=m)\n", 38 | "X = x.reshape(-1, 1)\n", 39 | "y = 4.*x + 3. 
+ np.random.normal(0, 3, size=m)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "%run D:\\\\python-code\\LinearRegression.py" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 12, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "LinearRegression()" 60 | ] 61 | }, 62 | "execution_count": 12, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "lin_reg = LinearRegression()\n", 69 | "lin_reg.fit_sgd(X, y, n_iters=2)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 13, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "array([4.02354565])" 81 | ] 82 | }, 83 | "execution_count": 13, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "lin_reg.coef_" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 14, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "2.964567909253512" 101 | ] 102 | }, 103 | "execution_count": 14, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "lin_reg.interception_" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "  通过上面的验证,我们发现系数和截距跟我们假设的相差不了多少,可以验证该方法的准确性" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### 使用真实的数据" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 15, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn import datasets\n", 133 | "\n", 134 | "boston = datasets.load_boston()\n", 135 | "\n", 136 | "X = boston.data\n", 137 | "y = boston.target\n", 138 | "\n", 139 | "X = X[y < 50.0]\n", 140 | "y = y[y < 50.0]" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 17, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%run D:\\\\python-code\\train_test_split.py\n", 150 | "\n", 151 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### 进行归一化处理\n", 159 | "\n", 160 | "对于真实的数据,进行归一化处理" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 18, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "%run D:\\\\python-code\\preprocessing.py" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 19, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "standardScaler = StandardScaler()\n", 179 | "standardScaler.fit(X_train)\n", 180 | "\n", 181 | "X_train_stardard = standardScaler.transform(X_train)\n", 182 | "X_test_stardard = standardScaler.transform(X_test)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 20, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Wall time: 4.02 ms\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "0.7923329555425147" 201 | ] 202 | }, 203 | "execution_count": 20, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "lin_reg1 = LinearRegression()\n", 210 | "%time lin_reg1.fit_sgd(X_train_stardard, y_train, n_iters=2)\n", 211 | "lin_reg1.score(X_test_stardard, y_test)" 
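`fit_sgd` comes from the self-defined `LinearRegression.py`, which is not reproduced in this excerpt. The following is only a minimal sketch of how such a stochastic-gradient least-squares fit is commonly written — an illustration with an assumed `t0/(t + t1)` learning-rate schedule, not the repository's actual implementation:

```python
import numpy as np

def fit_sgd(X, y, n_iters=5, t0=5.0, t1=50.0):
    """SGD for least squares; returns theta with the intercept in theta[0]."""
    X_b = np.hstack([np.ones((len(X), 1)), X])   # prepend a column of ones
    theta = np.zeros(X_b.shape[1])
    m = len(X_b)

    def learning_rate(t):                        # decaying step size
        return t0 / (t + t1)

    for i_iter in range(n_iters):                # n_iters passes over shuffled data
        for i, idx in enumerate(np.random.permutation(m)):
            xi, yi = X_b[idx], y[idx]
            gradient = 2.0 * xi * (xi.dot(theta) - yi)   # single-sample squared-error gradient
            theta -= learning_rate(i_iter * m + i) * gradient
    return theta
```

On standardized inputs such as `X_train_stardard` above, `theta[0]` plays the role of `interception_` and `theta[1:]` of `coef_`.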
212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "下面我们是否可以通过增加 n_iters 的值来增加score的结果呢?" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 21, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Wall time: 75.9 ms\n" 231 | ] 232 | }, 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "0.8132440489440966" 237 | ] 238 | }, 239 | "execution_count": 21, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "%time lin_reg1.fit_sgd(X_train_stardard, y_train, n_iters=50)\n", 246 | "lin_reg1.score(X_test_stardard, y_test)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "从结果可以看出,当n_iters从2增加到50时,R方的值从0.79增加到了0.81,但是相应的,时间也从4ms增加到了76ms。\n", 254 | "\n", 255 | "如果我们给n_iters传入100,会怎样?" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 22, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Wall time: 153 ms\n" 268 | ] 269 | }, 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "0.8131685005929717" 274 | ] 275 | }, 276 | "execution_count": 22, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "%time lin_reg1.fit_sgd(X_train_stardard, y_train, n_iters=100)\n", 283 | "lin_reg1.score(X_test_stardard, y_test)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "可以看到,R方值基本上没有变化,但是花费的时间也更多了。所以这个R方值整体对于这个数据来说,应该是最好的结果了。" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### sklearn中的SGD" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 23, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "from sklearn.linear_model import SGDRegressor\n", 307 | "\n", 308 | "# 可以看到 SGDRegressor 是在 linear_model 这个模块中的,所以它只能用于线性回归模型" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 24, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Wall time: 9.43 ms\n" 321 | ] 322 | }, 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "D:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. 
From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n", 328 | " \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n" 329 | ] 330 | }, 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "0.8047845970157298" 335 | ] 336 | }, 337 | "execution_count": 24, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "sgd_reg = SGDRegressor()\n", 344 | "%time sgd_reg.fit(X_train_stardard, y_train)\n", 345 | "sgd_reg.score(X_test_stardard, y_test)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 25, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "Wall time: 4.02 ms\n" 358 | ] 359 | }, 360 | { 361 | "name": "stderr", 362 | "output_type": "stream", 363 | "text": [ 364 | "D:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n", 365 | " DeprecationWarning)\n" 366 | ] 367 | }, 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "0.813280406080372" 372 | ] 373 | }, 374 | "execution_count": 25, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "sgd_reg = SGDRegressor(n_iter=100) # n_iter默认为5\n", 381 | "%time sgd_reg.fit(X_train_stardard, y_train)\n", 382 | "sgd_reg.score(X_test_stardard, y_test)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 27, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "name": "stdout", 392 | "output_type": "stream", 393 | "text": [ 394 | "Wall time: 1.99 ms\n" 395 | ] 396 | }, 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "D:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n", 402 | " \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n" 403 | ] 404 | }, 405 | { 406 | "data": { 407 | "text/plain": [ 408 | "0.8028391318091583" 409 | ] 410 | }, 411 | "execution_count": 27, 412 | "metadata": {}, 413 | "output_type": "execute_result" 414 | } 415 | ], 416 | "source": [ 417 | "from sklearn.linear_model.stochastic_gradient import SGDRegressor\n", 418 | "sgd_reg1 = SGDRegressor()\n", 419 | "%time sgd_reg1.fit(X_train_stardard, y_train)\n", 420 | "sgd_reg1.score(X_test_stardard, y_test)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 28, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "Wall time: 4.08 ms\n" 433 | ] 434 | }, 435 | { 436 | "name": "stderr", 437 | "output_type": "stream", 438 | "text": [ 439 | "D:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. 
Use max_iter and tol instead.\n", 440 | " DeprecationWarning)\n" 441 | ] 442 | }, 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "0.8129140138273009" 447 | ] 448 | }, 449 | "execution_count": 28, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "sgd_reg1 = SGDRegressor(n_iter=100) # n_iter默认为5\n", 456 | "%time sgd_reg1.fit(X_train_stardard, y_train)\n", 457 | "sgd_reg1.score(X_test_stardard, y_test)" 458 | ] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.7.0" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /Jupyter-Notebook/01 Jupyter Notebook 高级 - 魔法命令.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# %run" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "hello Machine Learning !\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "%run myscript/hello.py" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "hello wlr !\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "hello(\"wlr\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import FirstML" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "?\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "FirstML.predict(1)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 1, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import mymodule.hello" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "1\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "mymodule.hello.hello(1)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "2\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "from mymodule import hello\n", 111 | "hello.hello(2)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## %timeit" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "295 µs ± 1.07 µs per loop (mean ± std. dev. 
of 7 runs, 1000 loops each)\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "%timeit L = [i**2 for i in range(1000)]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "29.9 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "%timeit L = [i**2 for i in range(100000)]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "2.94 µs ± 24.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "%timeit L = [i**2 for i in range(10)]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 10, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "321 µs ± 1.95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "%%timeit\n", 187 | "L = []\n", 188 | "for n in range(1000):\n", 189 | " L.append(n ** 2)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## %time" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 17, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "Wall time: 998 µs\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "%time L = [i ** 2 for i in range(1000)]" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 19, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "Wall time: 995 µs\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "%%time\n", 231 | "L = []\n", 232 | "for n in range(1000):\n", 233 | " L.append(n ** 2)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 20, 239 | "metadata": { 240 | "scrolled": true 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "402 µs ± 24 µs per loop (mean ± std. dev. 
of 7 runs, 1000 loops each)\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "import random\n", 253 | "L = [random.random() for i in range(100000)]\n", 254 | "%timeit L.sort()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 21, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "Wall time: 16 ms\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "L = [random.random() for i in range(100000)]\n", 272 | "%time L.sort()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 22, 278 | "metadata": { 279 | "scrolled": true 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Wall time: 998 µs\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "%time L.sort()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 23, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "application/json": { 302 | "cell": { 303 | "!": "OSMagics", 304 | "HTML": "Other", 305 | "SVG": "Other", 306 | "bash": "Other", 307 | "capture": "ExecutionMagics", 308 | "cmd": "Other", 309 | "debug": "ExecutionMagics", 310 | "file": "Other", 311 | "html": "DisplayMagics", 312 | "javascript": "DisplayMagics", 313 | "js": "DisplayMagics", 314 | "latex": "DisplayMagics", 315 | "markdown": "DisplayMagics", 316 | "perl": "Other", 317 | "prun": "ExecutionMagics", 318 | "pypy": "Other", 319 | "python": "Other", 320 | "python2": "Other", 321 | "python3": "Other", 322 | "ruby": "Other", 323 | "script": "ScriptMagics", 324 | "sh": "Other", 325 | "svg": "DisplayMagics", 326 | "sx": "OSMagics", 327 | "system": "OSMagics", 328 | "time": "ExecutionMagics", 329 | "timeit": "ExecutionMagics", 330 | "writefile": "OSMagics" 331 | }, 332 | "line": { 333 | "alias": "OSMagics", 334 | "alias_magic": "BasicMagics", 335 | "autocall": "AutoMagics", 336 | "automagic": "AutoMagics", 337 | "autosave": "KernelMagics", 338 | "bookmark": "OSMagics", 339 | "cd": "OSMagics", 340 | "clear": "KernelMagics", 341 | "cls": "KernelMagics", 342 | "colors": "BasicMagics", 343 | "config": "ConfigMagics", 344 | "connect_info": "KernelMagics", 345 | "copy": "Other", 346 | "ddir": "Other", 347 | "debug": "ExecutionMagics", 348 | "dhist": "OSMagics", 349 | "dirs": "OSMagics", 350 | "doctest_mode": "BasicMagics", 351 | "echo": "Other", 352 | "ed": "Other", 353 | "edit": "KernelMagics", 354 | "env": "OSMagics", 355 | "gui": "BasicMagics", 356 | "hist": "Other", 357 | "history": "HistoryMagics", 358 | "killbgscripts": "ScriptMagics", 359 | "ldir": "Other", 360 | "less": "KernelMagics", 361 | "load": "CodeMagics", 362 | "load_ext": "ExtensionMagics", 363 | "loadpy": "CodeMagics", 364 | "logoff": "LoggingMagics", 365 | "logon": "LoggingMagics", 366 | "logstart": "LoggingMagics", 367 | "logstate": "LoggingMagics", 368 | "logstop": "LoggingMagics", 369 | "ls": "Other", 370 | "lsmagic": "BasicMagics", 371 | "macro": "ExecutionMagics", 372 | "magic": "BasicMagics", 373 | "matplotlib": "PylabMagics", 374 | "mkdir": "Other", 375 | "more": "KernelMagics", 376 | "notebook": "BasicMagics", 377 | "page": "BasicMagics", 378 | "pastebin": "CodeMagics", 379 | "pdb": "ExecutionMagics", 380 | "pdef": "NamespaceMagics", 381 | "pdoc": "NamespaceMagics", 382 | "pfile": "NamespaceMagics", 383 | "pinfo": "NamespaceMagics", 384 | "pinfo2": "NamespaceMagics", 385 | "pip": "BasicMagics", 386 | "popd": "OSMagics", 387 | "pprint": "BasicMagics", 388 | "precision": 
"BasicMagics", 389 | "profile": "BasicMagics", 390 | "prun": "ExecutionMagics", 391 | "psearch": "NamespaceMagics", 392 | "psource": "NamespaceMagics", 393 | "pushd": "OSMagics", 394 | "pwd": "OSMagics", 395 | "pycat": "OSMagics", 396 | "pylab": "PylabMagics", 397 | "qtconsole": "KernelMagics", 398 | "quickref": "BasicMagics", 399 | "recall": "HistoryMagics", 400 | "rehashx": "OSMagics", 401 | "reload_ext": "ExtensionMagics", 402 | "ren": "Other", 403 | "rep": "Other", 404 | "rerun": "HistoryMagics", 405 | "reset": "NamespaceMagics", 406 | "reset_selective": "NamespaceMagics", 407 | "rmdir": "Other", 408 | "run": "ExecutionMagics", 409 | "save": "CodeMagics", 410 | "sc": "OSMagics", 411 | "set_env": "OSMagics", 412 | "store": "StoreMagics", 413 | "sx": "OSMagics", 414 | "system": "OSMagics", 415 | "tb": "ExecutionMagics", 416 | "time": "ExecutionMagics", 417 | "timeit": "ExecutionMagics", 418 | "unalias": "OSMagics", 419 | "unload_ext": "ExtensionMagics", 420 | "who": "NamespaceMagics", 421 | "who_ls": "NamespaceMagics", 422 | "whos": "NamespaceMagics", 423 | "xdel": "NamespaceMagics", 424 | "xmode": "BasicMagics" 425 | } 426 | }, 427 | "text/plain": [ 428 | "Available line magics:\n", 429 | "%alias %alias_magic %autocall %automagic %autosave %bookmark %cd %clear %cls %colors %config %connect_info %copy %ddir %debug %dhist %dirs %doctest_mode %echo %ed %edit %env %gui %hist %history %killbgscripts %ldir %less %load %load_ext %loadpy %logoff %logon %logstart %logstate %logstop %ls %lsmagic %macro %magic %matplotlib %mkdir %more %notebook %page %pastebin %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %popd %pprint %precision %profile %prun %psearch %psource %pushd %pwd %pycat %pylab %qtconsole %quickref %recall %rehashx %reload_ext %ren %rep %rerun %reset %reset_selective %rmdir %run %save %sc %set_env %store %sx %system %tb %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode\n", 430 | "\n", 431 | "Available cell magics:\n", 432 | "%%! %%HTML %%SVG %%bash %%capture %%cmd %%debug %%file %%html %%javascript %%js %%latex %%markdown %%perl %%prun %%pypy %%python %%python2 %%python3 %%ruby %%script %%sh %%svg %%sx %%system %%time %%timeit %%writefile\n", 433 | "\n", 434 | "Automagic is ON, % prefix IS NOT needed for line magics." 435 | ] 436 | }, 437 | "execution_count": 23, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "lsmagic" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 24, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "%run?" 
453 | ] 454 | } 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "Python 3", 459 | "language": "python", 460 | "name": "python3" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 3 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython3", 472 | "version": "3.7.0" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 2 477 | } 478 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/17 实现多元线性回归.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 实现多元线性回归模型" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "from sklearn import datasets" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "boston = datasets.load_boston()\n", 28 | "\n", 29 | "X = boston.data\n", 30 | "y = boston.target\n", 31 | "\n", 32 | "X = X[y < 50.0]\n", 33 | "y = y[y < 50.0]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "(490, 13)" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "X.shape" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### 1. 切分训练数据集和测试数据集" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.model_selection import train_test_split\n", 70 | "\n", 71 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### 2. 
使用我们自己的LinearRegression" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "%run D:\\\\python-code\\LinearRegression.py" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "LinearRegression()" 99 | ] 100 | }, 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "reg = LinearRegression()\n", 108 | "reg.fit_normal(X_train, y_train)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "array([-1.18919477e-01, 3.63991462e-02, -3.56494193e-02, 5.66737830e-02,\n", 120 | " -1.16195486e+01, 3.42022185e+00, -2.31470282e-02, -1.19509560e+00,\n", 121 | " 2.59339091e-01, -1.40112724e-02, -8.36521175e-01, 7.92283639e-03,\n", 122 | " -3.81966137e-01])" 123 | ] 124 | }, 125 | "execution_count": 8, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "reg.coef_ # 系数" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "34.16143549621691" 143 | ] 144 | }, 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "reg.interception_ # 截距" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "因为现在已经是多维,所以我们已经不能进行可视化处理了" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "0.8129802602658533" 170 | ] 171 | }, 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "reg.score(X_test, y_test) # R方" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "简单线性回归中只用一个特征预测的R方为0.6左右,使用了多个特征值,R方达到了0.8。从某种程度上也印证了如果我们的数据它的特征更多的话,并且这些特征如果真的能够非常好的反映我们最终要预测的那个指标,在这里就是房价那个指标的话,那么相应的,使用更多的特征这样的数据最终的预测结果会是更好的。" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## 使用 sklearn 中多元线性回归模型" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 12, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "from sklearn.linear_model import LinearRegression\n", 202 | "\n", 203 | "lin_reg = LinearRegression()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 13, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" 215 | ] 216 | }, 217 | "execution_count": 13, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "lin_reg.fit(X_train, y_train)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 14, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "array([-1.18919477e-01, 3.63991462e-02, -3.56494193e-02, 5.66737830e-02,\n", 235 | " -1.16195486e+01, 3.42022185e+00, -2.31470282e-02, -1.19509560e+00,\n", 236 | " 2.59339091e-01, -1.40112724e-02, -8.36521175e-01, 7.92283639e-03,\n", 237 | " 
-3.81966137e-01])" 238 | ] 239 | }, 240 | "execution_count": 14, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "lin_reg.coef_ # 系数" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "34.16143549624624" 258 | ] 259 | }, 260 | "execution_count": 15, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "lin_reg.intercept_ # 截距" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 16, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "0.8129802602658492" 278 | ] 279 | }, 280 | "execution_count": 16, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "lin_reg.score(X_test, y_test)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## 使用 kNN 算法解决回归问题" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### kNN Regressor" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 17, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "0.5865412198300899" 312 | ] 313 | }, 314 | "execution_count": 17, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "from sklearn.neighbors import KNeighborsRegressor\n", 321 | "\n", 322 | "knn_reg = KNeighborsRegressor() # 默认k=5\n", 323 | "knn_reg.fit(X_train, y_train)\n", 324 | "knn_reg.score(X_test, y_test)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "从结果可以看出,kNN 算法的准确性只有 0.58,是远远低于线性回归的,但是不要忘记了,kNN 算法中存在很多的超参数,下面我们就用网格搜索来实验一下最好的超参数" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 20, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from sklearn.grid_search import GridSearchCV\n", 341 | "param_grid = [\n", 342 | " {\n", 343 | " 'weights': ['uniform'],\n", 344 | " 'n_neighbors': [i for i in range(1, 11)]\n", 345 | " },\n", 346 | " {\n", 347 | " 'weights': ['distance'],\n", 348 | " 'n_neighbors': [i for i in range(1, 11)],\n", 349 | " 'p': [i for i in range(1, 6)]\n", 350 | " }\n", 351 | "]" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 25, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Fitting 3 folds for each of 60 candidates, totalling 180 fits\n" 364 | ] 365 | }, 366 | { 367 | "name": "stderr", 368 | "output_type": "stream", 369 | "text": [ 370 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 1.8s\n", 371 | "[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 2.0s finished\n" 372 | ] 373 | }, 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "GridSearchCV(cv=None, error_score='raise',\n", 378 | " estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n", 379 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", 380 | " weights='uniform'),\n", 381 | " fit_params={}, iid=True, n_jobs=-1,\n", 382 | " param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n", 383 | " pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)" 384 | ] 385 
| }, 386 | "execution_count": 25, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "knn_reg = KNeighborsRegressor()\n", 393 | "grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)\n", 394 | "grid_search.fit(X_train, y_train)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 26, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}" 406 | ] 407 | }, 408 | "execution_count": 26, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "grid_search.best_params_" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 27, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/plain": [ 425 | "0.634093080186858" 426 | ] 427 | }, 428 | "execution_count": 27, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "grid_search.best_score_" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "我们发现0.6还是比0.8差远了。但是这里的best_score_方法与我们之前的 score 方法求解方式不一样,这是因为网格搜索使用了交叉验证(CV)的方式。\n", 442 | "\n", 443 | "那么为了得到和前面数据同样衡量标准,我们应该使用下面这种方法。" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 28, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "0.7044357727037996" 455 | ] 456 | }, 457 | "execution_count": 28, 458 | "metadata": {}, 459 | "output_type": "execute_result" 460 | } 461 | ], 462 | "source": [ 463 | "grid_search.best_estimator_.score(X_test, y_test)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "最终得到的结果为0.7,这个值才是真正的和我们前面所得到的结果在同一个衡量标准下的相应的值。\n", 471 | "\n", 472 | "我们发现0.7要比使用kNN默认的参数所得到的0.5要好,但是它是不如我们使用线性回归算法得到的结果的。但是,这里需要注意,其实有一部分原因在于我们真正使用网格搜索的时候,我们在搜索的过程中比较的score函数是那个GridSeachCV里面的score的计算方法,换句话说,我们没有跳出来使用我们的score的计算方法在这些参数中搜索获得到的那组最佳的参数,这一点需要注意。\n", 473 | "\n", 474 | "通过这个例子,其实也想告诉大家。我们在使用机器学习算法的过程中,会用各种方法得到各种不同的评价标准,我们在比较这些评价标准的时候,一定要非常小心,很多时候可能我们不能非常武断的直接说通过我们这样的工作流程得到的这种算法比这种算法更加的好,这里就是一个很好的例子。" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 32, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "best_k = 9\n", 487 | "best_p = 1\n", 488 | "best_method = distance\n", 489 | "best_score = 0.7287667475661718\n", 490 | "Wall time: 17.2 ms\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "%%time\n", 496 | "best_method = \"\"\n", 497 | "best_p = -1\n", 498 | "best_score = 0.0\n", 499 | "best_k = -1\n", 500 | "for method in [\"uniform\", \"distance\"]:\n", 501 | " for k in range(1, 11):\n", 502 | " knn_reg = KNeighborsRegressor(n_neighbors=k, weights=method, p=p)\n", 503 | " knn_reg.fit(X_train, y_train)\n", 504 | " score = knn_reg.score(X_test, y_test)\n", 505 | " if(score > best_score):\n", 506 | " best_k = k\n", 507 | " best_p = p\n", 508 | " best_method = method\n", 509 | " best_score = score\n", 510 | "\n", 511 | "print(\"best_k = \", best_k)\n", 512 | "print(\"best_p = \", best_p)\n", 513 | "print(\"best_method = \", best_method)\n", 514 | "print(\"best_score = \", best_score)\n", 515 | " " 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 33, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [ 526 | "0.7287667475661718" 527 | ] 528 | }, 529 | 
"execution_count": 33, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "knn_reg = KNeighborsRegressor(n_neighbors=9, weights=\"distance\", p=1)\n", 536 | "knn_reg.fit(X_train, y_train)\n", 537 | "knn_reg.score(X_test, y_test)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "我们发现使用for循环搜索出来的超参数是和使用网格搜索搜索出来的超参数是不同的,而且得到的准确率也要比网格搜索得到的准确率要高一些,但是还是低于线性回归算法。" 545 | ] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "Python 3", 551 | "language": "python", 552 | "name": "python3" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.7.0" 565 | } 566 | }, 567 | "nbformat": 4, 568 | "nbformat_minor": 2 569 | } 570 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/24 梯度下降法的向量化.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from sklearn import datasets" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "boston = datasets.load_boston()\n", 20 | "X = boston.data\n", 21 | "y = boston.target\n", 22 | "\n", 23 | "X = X[y < 50.0]\n", 24 | "y = y[y < 50.0]" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 39, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "%run D:\\\\python-code\\train_test_split.py\n", 34 | "\n", 35 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 40, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "%run D:\\\\python-code\\LinearRegression.py" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 41, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Wall time: 0 ns\n" 57 | ] 58 | }, 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "0.8129802602658533" 63 | ] 64 | }, 65 | "execution_count": 41, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "lin_reg = LinearRegression()\n", 72 | "%time lin_reg.fit_normal(X_train, y_train)\n", 73 | "lin_reg.score(X_test, y_test)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### 使用梯度下降法" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 42, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "D:\\Anaconda\\lib\\site-packages\\numpy\\core\\fromnumeric.py:83: RuntimeWarning: overflow encountered in reduce\n", 93 | " return ufunc.reduce(obj, axis, dtype, out, **passkwargs)\n", 94 | "D:\\python-code\\LinearRegression.py:27: RuntimeWarning: overflow encountered in square\n", 95 | " return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)\n", 96 | "D:\\python-code\\LinearRegression.py:45: RuntimeWarning: invalid value encountered in double_scalars\n", 97 | " if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n" 98 | ] 
99 | }, 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "LinearRegression()" 104 | ] 105 | }, 106 | "execution_count": 42, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "lin_reg2 = LinearRegression()\n", 113 | "lin_reg2.fit_gd(X_train, y_train)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "可以看到报错了,其中有一个关键字 overflow。" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 43, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])" 132 | ] 133 | }, 134 | "execution_count": 43, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "lin_reg2.coef_" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "从上面结果来看,还是溢出了。这是因为此时是一个真实的数据。\n", 148 | "\n", 149 | "下面我们来看 X_train 前十行的数据" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 44, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "array([[1.42362e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00, 6.93000e-01,\n", 161 | " 6.34300e+00, 1.00000e+02, 1.57410e+00, 2.40000e+01, 6.66000e+02,\n", 162 | " 2.02000e+01, 3.96900e+02, 2.03200e+01],\n", 163 | " [3.67822e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 7.70000e-01,\n", 164 | " 5.36200e+00, 9.62000e+01, 2.10360e+00, 2.40000e+01, 6.66000e+02,\n", 165 | " 2.02000e+01, 3.80790e+02, 1.01900e+01],\n", 166 | " [1.04690e-01, 4.00000e+01, 6.41000e+00, 1.00000e+00, 4.47000e-01,\n", 167 | " 7.26700e+00, 4.90000e+01, 4.78720e+00, 4.00000e+00, 2.54000e+02,\n", 168 | " 1.76000e+01, 3.89250e+02, 6.05000e+00],\n", 169 | " [1.15172e+00, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,\n", 170 | " 5.70100e+00, 9.50000e+01, 3.78720e+00, 4.00000e+00, 3.07000e+02,\n", 171 | " 2.10000e+01, 3.58770e+02, 1.83500e+01],\n", 172 | " [6.58800e-02, 0.00000e+00, 2.46000e+00, 0.00000e+00, 4.88000e-01,\n", 173 | " 7.76500e+00, 8.33000e+01, 2.74100e+00, 3.00000e+00, 1.93000e+02,\n", 174 | " 1.78000e+01, 3.95560e+02, 7.56000e+00],\n", 175 | " [2.49800e-02, 0.00000e+00, 1.89000e+00, 0.00000e+00, 5.18000e-01,\n", 176 | " 6.54000e+00, 5.97000e+01, 6.26690e+00, 1.00000e+00, 4.22000e+02,\n", 177 | " 1.59000e+01, 3.89960e+02, 8.65000e+00],\n", 178 | " [7.75223e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 7.13000e-01,\n", 179 | " 6.30100e+00, 8.37000e+01, 2.78310e+00, 2.40000e+01, 6.66000e+02,\n", 180 | " 2.02000e+01, 2.72210e+02, 1.62300e+01],\n", 181 | " [9.88430e-01, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,\n", 182 | " 5.81300e+00, 1.00000e+02, 4.09520e+00, 4.00000e+00, 3.07000e+02,\n", 183 | " 2.10000e+01, 3.94540e+02, 1.98800e+01],\n", 184 | " [1.14320e-01, 0.00000e+00, 8.56000e+00, 0.00000e+00, 5.20000e-01,\n", 185 | " 6.78100e+00, 7.13000e+01, 2.85610e+00, 5.00000e+00, 3.84000e+02,\n", 186 | " 2.09000e+01, 3.95580e+02, 7.67000e+00],\n", 187 | " [5.69175e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 5.83000e-01,\n", 188 | " 6.11400e+00, 7.98000e+01, 3.54590e+00, 2.40000e+01, 6.66000e+02,\n", 189 | " 2.02000e+01, 3.92680e+02, 1.49800e+01]])" 190 | ] 191 | }, 192 | "execution_count": 44, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "X_train[:10, :]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | 
"可以看到数据规模是不一样的,有些数据是零点几,有些可以达到几百这样一个维度。那么面对这样的一个数据,我们实际最终求到的梯度很有可能结果也是那么大的,我们使用默认的η最终形成的步长还是太大,使得我们这个梯度下降法的过程是不收敛的。\n", 206 | "\n", 207 | "那么为了验证我们的假设,我们还是使用 fit_gd 函数进行训练,但是我们传入的 eta=0.000001,小了非常多倍。" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 53, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "LinearRegression()" 219 | ] 220 | }, 221 | "execution_count": 53, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "lin_reg2.fit_gd(X_train, y_train, eta=0.000001)# 此时不报错" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 50, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "nan" 239 | ] 240 | }, 241 | "execution_count": 50, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "lin_reg2.score(X_test, y_test)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "可以看到此时得到的R方值为0.27,很显然,此时我们使用梯度下降法所找到的θ还没有达到我们损失函数的最小值。很有可能是因为我们的η太小了,导致每一步行径都非常小,所以我们需要在梯度下降法中用更多的循环次数才能找到损失函数的最小值。\n", 255 | "\n", 256 | "所以我们再进行一次训练,手动将循环次数 n_iters 100万" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 54, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Wall time: 320 ms\n" 269 | ] 270 | }, 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "LinearRegression()" 275 | ] 276 | }, 277 | "execution_count": 54, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 55, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "0.27556634853389206" 295 | ] 296 | }, 297 | "execution_count": 55, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "lin_reg2.score(X_test, y_test)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "  换句话说,我们增加了循环次数,但是结果还是没有达到我们损失函数达到最小值,其实我们的循环次数还需要更多,这样显然太耗时了,那么对于这种情况应该怎么办呢?\n", 311 | "\n", 312 | "  其实我们之前已经分析出来了,之所以出现这种情况,是因为这些数据整体不在一个规模上,其实解决的的方式我们之前也学习过,就是**数据的归一化**。\n", 313 | "\n", 314 | "  我们之前使用正规方程解来解决线性回归算法的时候,其实不需要进行数据的归一化,这是因为我们将线性回归模型的求解过程整体变成了一个公式的计算,那么在这个公式的计算中,牵扯这种中间搜索的过程比较少,所以我们不需要进行数据归一化。\n", 315 | "\n", 316 | "  可是当我们使用梯度下降法的时候,就变得不一样了,由于我们有η这个变量,所以首先会出现一个问题,如果我们最终这些数值不在一个维度上,将会影响我们梯度的结果,而梯度的结果再乘以η是我们真正每一次走的步长,这个步长就有可能或者太大,或者太小。如果太大,会导致结果不收敛,就像之前使用默认的η得到的结果那样,如果太小,又会导致我们的搜索过程太慢,就像我们上面实验的那样。但是如果我们将所有的数据进行归一化,那么这个问题就完全解决了。\n", 317 | "\n", 318 | "\n", 319 | "  下面,我们来具体实验一下。" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "### 使用梯度下降法前进行数据归一化" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 56, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "from sklearn.preprocessing import StandardScaler" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 58, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "StandardScaler(copy=True, with_mean=True, with_std=True)" 347 | ] 348 | }, 349 | "execution_count": 58, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 
| "standardScaler = StandardScaler()\n", 356 | "standardScaler.fit(X_train)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 59, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "X_train_standard = standardScaler.transform(X_train)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 63, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "Wall time: 163 ms\n" 378 | ] 379 | }, 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "LinearRegression()" 384 | ] 385 | }, 386 | "execution_count": 63, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "lin_reg3 = LinearRegression()\n", 393 | "%time lin_reg3.fit_gd(X_train_standard, y_train)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 61, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "X_test_stardard = standardScaler.transform(X_test)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 62, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "0.8129880620122235" 414 | ] 415 | }, 416 | "execution_count": 62, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "lin_reg3.score(X_test_stardard, y_test)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "  从结果可以看出,和我们使用正规方程解得到的结果是一致的,说明我们找到了这个损失函数的最小值。与此同时,速度是非常快的。这就是数据归一化的威力。\n", 430 | "\n", 431 | "  但是我们发现,使用梯度下降法需要花费163ms,而使用正规方程解是0ns,那么梯度下降法有什么优势呢?\n", 432 | "\n", 433 | "  在这里,我们再举一个例子。" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "### 梯度下降法的优势" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "##### 设计一个虚拟的样本数据" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 66, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "m = 1000 # 样本数\n", 457 | "n = 5000 # 样本特征\n", 458 | "\n", 459 | "big_X = np.random.normal(size=(m, n))#随机化的正态分布,所以自送生成的数据已经归一化了\n", 460 | "\n", 461 | "# 随即生成n+1个theta的取值\n", 462 | "true_theta = np.random.uniform(0.0, 100.0, size=n+1)\n", 463 | "big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0, 10., size=m)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 67, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stdout", 473 | "output_type": "stream", 474 | "text": [ 475 | "Wall time: 2.9 s\n" 476 | ] 477 | }, 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "LinearRegression()" 482 | ] 483 | }, 484 | "execution_count": 67, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "big_reg1 = LinearRegression()\n", 491 | "%time big_reg1.fit_normal(big_X, big_y) # 使用正规方程解" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 68, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "name": "stdout", 501 | "output_type": "stream", 502 | "text": [ 503 | "Wall time: 2.06 s\n" 504 | ] 505 | }, 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "LinearRegression()" 510 | ] 511 | }, 512 | "execution_count": 68, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "big_reg2 = LinearRegression()\n", 519 | "%time big_reg2.fit_gd(big_X, big_y) # 使用梯度下降法" 520 | 
] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "  从上面结果可以看出,使用正规方程解耗时2.9s,使用梯度下降法为2.06s,梯度下降法稍微快一些。如果我们加大我们的特征数,梯度下降法的优势会更加明显一些。\n", 527 | "\n", 528 | "  这就是所说的对于正规方程解,它处理的是 m x n 这样大的矩阵进行非常多的乘法运算,所以这个矩阵比较大的时候,其实我们的正规方程法相应的就要更耗时一些。\n", 529 | "\n", 530 | "  不过在我们所举的例子中,样本数量是小于样本特征数的。这是因为我们现在所使用的梯度下降法的这个公式,在计算梯度的时候,我们要让每一个样本都参与计算,这使得当样本数量比较大的时候,我们计算这个梯度其实相应的也比较慢,其实这也有一种改进的方式,就是所谓的**随机梯度下降法**。" 531 | ] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": "Python 3", 537 | "language": "python", 538 | "name": "python3" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | "pygments_lexer": "ipython3", 550 | "version": "3.7.0" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/66 ROC曲线.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import matplotlib.pyplot as plt" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn import datasets\n", 20 | "\n", 21 | "digits = datasets.load_digits()\n", 22 | "X = digits.data\n", 23 | "y = digits.target.copy()\n", 24 | "\n", 25 | "y[digits.target==9] = 1\n", 26 | "y[digits.target!=9] = 0" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from sklearn.model_selection import train_test_split\n", 36 | "\n", 37 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.linear_model import LogisticRegression\n", 47 | "\n", 48 | "log_reg = LogisticRegression()\n", 49 | "log_reg.fit(X_train, y_train)\n", 50 | "decision_scores = log_reg.decision_function(X_test)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## 计算TPR和FRP" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 10, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "%run D:\\\\python-code\\metrics.py" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 11, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "fprs = []\n", 76 | "tprs = []\n", 77 | "\n", 78 | "thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)\n", 79 | "for threshold in thresholds:\n", 80 | " y_predict = np.array(decision_scores >= threshold, dtype='int')\n", 81 | " fprs.append(FPR(y_test, y_predict))\n", 82 | " tprs.append(TPR(y_test, y_predict))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 12, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "[]" 94 | ] 95 | }, 96 | "execution_count": 12, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | }, 100 | { 101 | "data": { 102 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAEMNJREFUeJzt3X2QXXV9x/H3NxsSkIeA7PLQPLixBssKtKE7lNa2okEawkwydqyTWMbqUDPaYv/QdgalpQ5OZ1qtOnWIDxl1UGcE0Znqjkap1VCsNUIcFEhocI1g1kSSUAxgCHnYb/+4F+dms5t7dvfu3r2/fb9m7sx5+OXc72/v7ofD75zzu5GZSJLKMqfdBUiSWs9wl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBVobrveuLu7O3t7e9v19pLUkX7wgx/sz8yeZu3aFu69vb1s3bq1XW8vSR0pIh6v0s5hGUkqkOEuSQUy3CWpQIa7JBXIcJekAjUN94j4dETsjYiHx9gfEfGRiBiMiAcj4vLWlylJGo8qZ+63AytPsv9aYFn9tR742OTLkiRNRtP73DPz3ojoPUmTNcBns/Z9fVsi4uyIuDAz97SoRk1AZnLHfbv4xYHn2l2KpBFWXHw+v7347Cl9j1Y8xLQQ2NWwPlTfdkK4R8R6amf3LFmypAVvrbFs3/M07/n3hwCIaHMxko5z3lmndkS4jxYdo37rdmZuBDYC9Pf3+83cU+hrD+6ha05w33tWcO4Z89tdjqRp1oq7ZYaAxQ3ri4DdLTiuJigz+dpDe/iD3zzXYJdmqVaE+wDwpvpdM1cCBxxvn3rDw8nRY8Ojvh76+QEef/Igqy69sN1lSmqTpsMyEXEHcBXQHRFDwD8CpwBk5seBTcAqYBA4CLxlqopVzYHnjrDig//F/mefH7NN15zgT15xwTRWJWkmqXK3zLom+xP465ZVpKa+uf0J9j/7PH/5h0tZcNopo7ZZdv4ZvPj0edNcmaSZom1T/mrivvbgbhadcxo3X3cx4a0wkkbh9AMd5sDBI/z34H6uu/RCg13SmDxzb/CNh/fw0M8PtLuMk3rsyYMcOZZcd5kXSyWNzXBv8A9f2cb+Z5+na4afES9fcjaXLlzQ7jIkzWCGe4PM5I1XLOGfXndpu0uRpElxzF2SCjRrz9yPHhvmWB4/A0I6IYKkQszKcH/i6UO85l/v4VeHj52wb+6cmT3eLklVzMpwf+Bnv+RXh4/xllf20t0w90oEXOcj+5IKMCvD/cdPPAPA317zck6fPyt/BJIKNysvqD6691kWnn2awS6pWLMq3fY+fYjPfu9x7vvpk/RdeFa7y5GkKTOrwv0b237BbZsHmT93Dq/+rfPaXY4kTZlZFe7Dw7V7Hbe8ewXnOGOipILNyjF3SSpd8Wfuh48Oc3R4uLZ8bLjN1UjS9Cg63Pc+fYhXfeAenjty/MNKc3xQSVLhig73/c8e5rkjx/jTyxfy8vPPBOCCBaeO+e1FklSKosP9Bdf0XcDKS/w+UUmzhxdUJalARZy5Hzk2zMfu+QlPP3fkuO1P/upwmyqSpPYqItwffeIZPvTNR5k/d84Jszqee/o8ertf1KbKJKk9igj3F+Zhv+2Nl/PavvPbW4wkzQCOuUtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCdeytkIePDnPbt3/M04eO+rCSJI3QseG+4xfP8JFvD3LaKV2c0hWcf9Z8es/1YSVJgorhHhErgX8DuoBPZuY/j9i/BPgMcHa9zU2ZuanFtR4nqT25dNsbl7PiYh9ckqRGTcfcI6IL2ABcC/QB6yKib0SzvwfuyszlwFrgo60uVJJUXZULqlcAg5m5MzMPA3cCa0a0SeCs+vICYHfrShzdEb9VSZLGVGVYZiGwq2F9CPi9EW3eC/xHRLwDOB24uiXVncT1n7wPgLld3vAjSSNVScbRvpMuR6yvA27PzEXAKuBzEXHCsSNifURsjYit+/btG3+1DY4OD9N9xnyufOmLJ3UcSSpRlXAfAhY3rC/ixGGXG4C7ADLze8CpQPfIA2Xmxszsz8z+np6eiVVcd0rXHF63/DeYP7drUseRpBJVCff7gWURsTQi5lG7YDowos3PgBUAEXExtXCf3Km5JGnCmoZ7Zh4FbgTuBh6hdlfMtoi4NSJW15u9C3hrRPwIuAN4c2aOHLqRJE2TSve51+9Z3zRi2y0Ny9uBV7a2NEnSRHmriSQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQB0Z7vc+uo+Dh4+1uwxJmrE6Mtxv/5/HAPidxee0txBJmqE6Mtwzk8sWLeC6yy5sdymSNCN1ZLhLkk7OcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpUKdwjYmVE7IiIwYi4aYw2b4iI7RGxLSI+39oyJUnjMbdZg4joAjYArwWGgPsjYiAztze0WQa8G3hlZj4VEedNVcGSpOaqnLlfAQxm5s7MPAzcCawZ0eatwIbMfAogM/e2tkxJ0nhUCfeFwK6G9aH6tkYXARdFxHcjYktErGxVgZKk8Ws6LAPEKNtylOMsA64CFgHfiYhLMvOXxx0oYj2wHmDJkiXjLlaSVE2VM/chYHHD+iJg9yhtvpKZRzLzp8AOamF/nMzcmJn9mdnf09Mz0ZolSU1UCff7gWURsTQi5gFrgYERbb4MvBogIrqpDdPsbGWhkqTqmoZ7Zh4FbgTuBh4B7srMbRFxa0Ssrje7G3gyIrYDm4G/y8wnp6poSdLJVRlzJzM3AZtGbLulYTmBd9ZfkqQ28wlVSSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAJ1XLg/+sQzbN6xj8x2VyJJM1fHhfvm/90LwB8t625zJZI0c3VcuL/gxte8rN0lSNKM1bHhLkkam+EuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBKoV7RKyMiB0RMRgRN52k3esjIiOiv3UlSpLGq2m4R0QXsAG4FugD1kVE3yjtzgT+Bvh+q4uUJI1PlTP3K4DBzNyZmYeBO4E1o7R7H/B+4FAL65MkTUCVcF8I7GpYH6pv+7WIWA4szsyvtrA2SdIEVQn3GGXbryfcjYg5wIeBdzU9UMT6iNgaEVv37dtXvUpJ0rhUCfchYHHD+iJgd8P6mcAlwD0R8RhwJTAw2kXVzNyYmf2Z2d/T0zPxqiVJJ1Ul3O8HlkXE0oiYB6wFBl7YmZkHMrM7M3szsxfYAqzOzK1TUrEkqamm4Z6ZR4EbgbuBR4C7MnNbRNwaEaunukBJ0vjNrdIoMzcBm0Zsu2WMtldNvixJ0mT4hKokFchwl6QCGe6SVC
DDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQWqFO4RsTIidkTEYETcNMr+d0bE9oh4MCK+FREvaX2pkqSqmoZ7RHQBG4BrgT5gXUT0jWj2ANCfmZcBXwLe3+pCJUnVVTlzvwIYzMydmXkYuBNY09ggMzdn5sH66hZgUWvLlCSNR5VwXwjsalgfqm8byw3A1ydTlCRpcuZWaBOjbMtRG0ZcD/QDrxpj/3pgPcCSJUsqlihJGq8qZ+5DwOKG9UXA7pGNIuJq4GZgdWY+P9qBMnNjZvZnZn9PT89E6pUkVVAl3O8HlkXE0oiYB6wFBhobRMRy4BPUgn1v68uUJI1H03DPzKPAjcDdwCPAXZm5LSJujYjV9WYfAM4AvhgRP4yIgTEOJ0maBlXG3MnMTcCmEdtuaVi+usV1SZImwSdUJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqUMeF+9Lu01l16QXMiWh3KZI0Y1UK94hYGRE7ImIwIm4aZf/8iPhCff/3I6K31YW+4JpXXMBH//x3OfWUrql6C0nqeE3DPSK6gA3AtUAfsC4i+kY0uwF4KjNfBnwY+JdWFypJqq7KmfsVwGBm7szMw8CdwJoRbdYAn6kvfwlYEeG4iSS1S5VwXwjsalgfqm8btU1mHgUOAOeOPFBErI+IrRGxdd++fROrWJLUVJVwH+0MPCfQhszcmJn9mdnf09NTpT5J0gRUCfchYHHD+iJg91htImIusAD4v1YUKEkavyrhfj+wLCKWRsQ8YC0wMKLNAPAX9eXXA9/OzBPO3CVJ02NuswaZeTQibgTuBrqAT2fmtoi4FdiamQPAp4DPRcQgtTP2tVNZtCTp5JqGO0BmbgI2jdh2S8PyIeDPWluaJGmiol2jJxGxD3h8gv+8G9jfwnI6gX2eHezz7DCZPr8kM5vekdK2cJ+MiNiamf3trmM62efZwT7PDtPR546bW0aS1JzhLkkF6tRw39juAtrAPs8O9nl2mPI+d+SYuyTp5Dr1zF2SdBIzOtxn0jzy06VCn98ZEdsj4sGI+FZEvKQddbZSsz43tHt9RGREdPydFVX6HBFvqH/W2yLi89NdY6tV+N1eEhGbI+KB+u/3qnbU2SoR8emI2BsRD4+xPyLiI/Wfx4MRcXlLC8jMGfmi9jTsT4CXAvOAHwF9I9r8FfDx+vJa4Avtrnsa+vxq4EX15bfPhj7X250J3AtsAfrbXfc0fM7LgAeAc+rr57W77mno80bg7fXlPuCxdtc9yT7/MXA58PAY+1cBX6c28eKVwPdb+f4z+cx9Ns4j37TPmbk5Mw/WV7dQm8itk1X5nAHeB7wfODSdxU2RKn1+K7AhM58CyMy901xjq1XpcwJn1ZcXcOIEhR0lM+/l5BMorgE+mzVbgLMj4sJWvf9MDveWzSPfQar0udEN1P7L38ma9jkilgOLM/Or01nYFKryOV8EXBQR342ILRGxctqqmxpV+vxe4PqIGKI23ck7pqe0thnv3/u4VJpbpk1aNo98B6ncn4i4HugHXjWlFU29k/Y5IuZQ++rGN09XQdOgyuc8l9rQzFXU/u/sOxFxSWb+coprmypV+rwOuD0zPxgRv09tMsJLMnN46striynNr5l85j4b55Gv0mci4mrgZmB1Zj4/TbVNlWZ9PhO4BLgnIh6jNjY50OEXVav+bn8lM49k5k+BHdTCvlNV6fMNwF0Amfk94FRqc7CUqtLf+0TN5HCfjfPIN+1zfYjiE9SCvdPHYaFJnzPzQGZ2Z2ZvZvZSu86wOjO3tqfclqjyu/1lahfPiYhuasM0O6e1ytaq0uefASsAIuJiauFe8vdxDgBvqt81cyVwIDP3tOzo7b6i3ORq8yrgUWpX2W+ub7uV2h831D78LwKDwH3AS9td8zT0+T+BJ4Af1l8D7a55qvs8ou09dPjdMhU/5wA+BGwHHgLWtrvmaehzH/BdanfS/BC4pt01T7K/dwB7gCPUztJvAN4GvK3hM95Q/3k81Orfa59QlaQCzeRhGUnSBBnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQV6P8BtCHSRpKHg0MAAAAASUVORK5CYII=\n", 103 | "text/plain": [ 104 | "
" 105 | ] 106 | }, 107 | "metadata": { 108 | "needs_background": "light" 109 | }, 110 | "output_type": "display_data" 111 | } 112 | ], 113 | "source": [ 114 | "plt.plot(fprs, tprs)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### sklearn中的ROC" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 13, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "from sklearn.metrics import roc_curve\n", 131 | "\n", 132 | "fprs, tprs, thresholds = roc_curve(y_test, decision_scores)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 15, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "[]" 144 | ] 145 | }, 146 | "execution_count": 15, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | }, 150 | { 151 | "data": { 152 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADx1JREFUeJzt3X+IZWd9x/H3x91GKU20dkeI+8Nd6QYcQzF2SCJCjWjLJuDuP6ndlWAtwa22sX8ohRRLlPiPtbSisK0uVvwBMUb/MIOsBGojirjbjLhGd0PKdhOTSUIzmhj/EI2h3/5xb+x1dnbumZk7c+c+837BwD3nPHvu99k7+8mT5zn3nFQVkqS2vGDcBUiSRs9wl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDVo+7jeeMeOHbV3795xvb0kTaTvfve7P66qqWHtxhbue/fuZW5ublxvL0kTKcmPurRzWkaSGmS4S1KDDHdJapDhLkkNMtwlqUFDwz3Jp5M8meSHFzmeJB9Pci7J/UleO/oyJUkr0WXk/hngwDLHrwf293+OAv+69rIkSWsx9Dr3qvpmkr3LNDkEfK56z+s7meQlSS6vqidGVKNW6Y5Tj3D36cfGXYakRaZffhkfeMur1/U9RjHnvhN4dGB7vr/vAkmOJplLMrewsDCCt9Zy7j79GGef+Nm4y5A0BqP4hmqW2LfkU7er6jhwHGBmZsYnc2+A6csv44t/+bpxlyFpg41i5D4P7B7Y3gU8PoLzSpJWaRThPgu8vX/VzLXAM863S9J4DZ2WSfIF4DpgR5J54APAbwFU1SeAE8ANwDng58BfrFexWtki6dknfsb05Zetc0WSNqMuV8scGXK8gL8eWUVa1vOLpF1Ce/ryyzj0miXXtiU1bmy3/NXquUgqaRhvPyBJDXLkPmASvvTjPLqkLhy5D5iEL/04jy6pC0fuizifLakFjtwlqUGGuyQ1aMtOyyy1eOpipaRWbNmR+1KLpy5WSmrFlh25g4unktq1ZUfuktSyLTVyH5xnd35dUsu21Mh9cJ7d+XVJLdtSI3dwnl3S1rClRu6StFUY7pLUoOanZVxElbQVNT9ydxFV0lbU/MgdXESVtPU0P3KXpK2omZH7xZ6i5Dy7pK2omZH7xZ6i5Dy7pK2omZE7OLcuSc9rZuQuSfp/hrskNchwl6QGGe6S1CDDXZIaZLhLUoMm+lJIbwomSUub6JG7NwWTpKV1GrknOQB8DNgGfKqqPrzo+B7gs8BL+m1uraoTI651SX5xSZIuNHTknmQbcAy4HpgGjiSZXtTs74G7quoq4DDwL6MuVJLUXZdpmauBc1V1vqqeBe4EDi1qU8DzE94vBh4fXYmSpJXqEu47gUcHtuf7+wZ9ELgpyTxwAnjPSKpbxh2nHuHUQ0+t99tI0kTqEu5ZYl8t2j4CfKaqdgE3AJ9PcsG5kxxNMpdkbmFhYeXVDnj+KhkXUSXpQl3CfR7YPbC9iwunXW4G7gKoqu8ALwJ2LD5RVR2vqpmqmpmamlpdxQOu2fdS3nbNnjWfR5Ja0yXc7wP2J9mX5BJ6C6azi9o8ArwJIMmr6IX72obmkqRVGxruVfUccAtwD/AAvatiziS5PcnBfrP3Ae9M8n3gC8A7qmrx1I0kaYN0us69f836iUX7bht4fRZ4/WhLkySt1kR/Q1WStDTDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAZNZLjfceoRTj301LjLkKRNayLD/e7TjwFw6DU7x1yJJG1OExnuANfseylvu2bPuMuQpE1pYsNdknRxhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgTuGe5ECSB5OcS3LrRdq8NcnZJGeS3DHaMiVJK7F9WIMk24BjwB8D88B9SWar6uxAm/3A3wGvr6qnk7xsvQqWJA3XZeR+NXCuqs5X1bPAncChRW3eCRyrqqcBqurJ0ZYpSVqJLuG+E3h0YHu+v2/QFcAVSb6d5GSSA6MqUJK0ckOnZYAssa+WOM9+4DpgF/CtJFdW1U9/40TJUeAowJ49PmhDktZLl5H7PLB7YHsX8PgSbe6uql9V1UPAg/TC/jdU1fGqmqmqmampqdXWLEkaoku43wfsT7IvySXAYWB2UZuvAG8ESLKD3jTN+VEWKknqbmi4V9VzwC3APcADwF1VdSbJ7UkO9pvdA/wkyVngXuBvq+on61W0JGl5XebcqaoTwIlF+24beF3Ae/s/kqQx8xuqktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ2auHC/49QjnHroqXGXIUmb2sSF+92nHwPg0Gt2jrkSSdq8Ji7cAa7Z91Leds2ecZchSZvWRIa7JGl5hrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ3qFO5JDiR5MMm5JLcu0+7GJJVkZnQlSpJ
Wami4J9kGHAOuB6aBI0mml2h3KfA3wKlRFylJWpkuI/ergXNVdb6qngXuBA4t0e5DwEeAX4ywPknSKnQJ953AowPb8/19v5bkKmB3VX11hLVJklapS7hniX3164PJC4CPAu8beqLkaJK5JHMLCwvdq5QkrUiXcJ8Hdg9s7wIeH9i+FLgS+EaSh4FrgdmlFlWr6nhVzVTVzNTU1OqrliQtq0u43wfsT7IvySXAYWD2+YNV9UxV7aiqvVW1FzgJHKyquXWpWJI01NBwr6rngFuAe4AHgLuq6kyS25McXO8CJUkrt71Lo6o6AZxYtO+2i7S9bu1lSZLWwm+oSlKDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoE7hnuRAkgeTnEty6xLH35vkbJL7k3w9yStGX6okqauh4Z5kG3AMuB6YBo4kmV7U7HvATFX9AfBl4COjLlSS1F2XkfvVwLmqOl9VzwJ3AocGG1TVvVX18/7mSWDXaMuUJK1El3DfCTw6sD3f33cxNwNfW0tRkqS12d6hTZbYV0s2TG4CZoA3XOT4UeAowJ49ezqWKElaqS4j93lg98D2LuDxxY2SvBl4P3Cwqn651Imq6nhVzVTVzNTU1GrqlSR10CXc7wP2J9mX5BLgMDA72CDJVcAn6QX7k6MvU5K0EkPDvaqeA24B7gEeAO6qqjNJbk9ysN/sH4HfAb6U5HSS2YucTpK0AbrMuVNVJ4ATi/bdNvD6zSOuS5K0Bn5DVZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBm0fdwErNf3yy8ZdgiRtep3CPckB4GPANuBTVfXhRcdfCHwO+EPgJ8CfVdXDoy215wNvefV6nFaSmjJ0WibJNuAYcD0wDRxJMr2o2c3A01X1+8BHgX8YdaGSpO66zLlfDZyrqvNV9SxwJ3BoUZtDwGf7r78MvClJRlemJGkluoT7TuDRge35/r4l21TVc8AzwO8tPlGSo0nmkswtLCysrmJJ0lBdwn2pEXitog1VdbyqZqpqZmpqqkt9kqRV6BLu88Duge1dwOMXa5NkO/Bi4KlRFChJWrku4X4fsD/JviSXAIeB2UVtZoE/77++EfiPqrpg5C5J2hhDL4WsqueS3ALcQ+9SyE9X1ZkktwNzVTUL/Bvw+STn6I3YD69n0ZKk5XW6zr2qTgAnFu27beD1L4A/HW1pkqTVyrhmT5IsAD9a5R/fAfx4hOVMAvu8NdjnrWEtfX5FVQ29ImVs4b4WSeaqambcdWwk+7w12OetYSP67I3DJKlBhrskNWhSw/34uAsYA/u8NdjnrWHd+zyRc+6SpOVN6shdkrSMTR3uSQ4keTDJuSS3LnH8hUm+2D9+Ksneja9ytDr0+b1Jzia5P8nXk7xiHHWO0rA+D7S7MUklmfgrK7r0Oclb+5/1mSR3bHSNo9bhd3tPknuTfK//+33DOOoclSSfTvJkkh9e5HiSfLz/93F/kteOtICq2pQ/9L4N+9/AK4FLgO8D04va/BXwif7rw8AXx133BvT5jcBv91+/eyv0ud/uUuCbwElgZtx1b8DnvB/4HvC7/e2XjbvuDejzceDd/dfTwMPjrnuNff4j4LXADy9y/Abga/RuvHgtcGqU77+ZR+5b8T7yQ/tcVfdW1c/7myfp3chtknX5nAE+BHwE+MVGFrdOuvT5ncCxqnoaoKqe3OAaR61Lnwt4/jmaL+bCGxROlKr6JsvfQPEQ8LnqOQm8JMnlo3r/zRzuI7uP/ATp0udBN9P7L/8kG9rnJFcBu6vqqxtZ2Drq8jlfAVyR5NtJTvYfdTnJuvT5g8BNSebp3e7kPRtT2tis9N/7imzmB2SP7D7yE6Rzf5LcBMwAb1jXitbfsn1O8gJ6j258x0YVtAG6fM7b6U3NXEfv/86+leTKqvrpOte2Xrr0+Qjwmar6pySvo3czwiur6n/Xv7yxWNf82swj9614H/kufSbJm4H3Awer6pcbVNt6GdbnS4ErgW8keZje3OTshC+qdv3dvruqflVVDwEP0gv7SdWlzzcDdwFU1XeAF9G7B0urOv17X63NHO5b8T7yQ/vcn6L4JL1gn/R5WBjS56p6pqp2VNXeqtpLb53hYFXNjafckejyu/0VeovnJNlBb5rm/IZWOVpd+vwI8CaAJK+iF+4tP49zFnh7/6qZa4FnquqJkZ193CvKQ1abbwD+i94q+/v7+26n948beh/+l4BzwH8Crxx3zRvQ538H/gc43f+ZHXfN693nRW2/wYRfLdPxcw7wz8BZ4AfA4XHXvAF9nga+Te9KmtPAn4y75jX29wvAE8Cv6I3SbwbeBbxr4DM+1v/7+MGof6/9hqokNWgzT8tIklbJcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUH/B3Y1ff004EIsAAAAAElFTkSuQmCC\n", 153 | "text/plain": [ 154 | "
" 155 | ] 156 | }, 157 | "metadata": { 158 | "needs_background": "light" 159 | }, 160 | "output_type": "display_data" 161 | } 162 | ], 163 | "source": [ 164 | "plt.plot(fprs, tprs)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "关于ROC曲线,我们通常关注这条曲线下面面积的大小。面积越大,代表当前模型分类效果越好" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 16, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "0.9830452674897119" 183 | ] 184 | }, 185 | "execution_count": 16, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# 求面积:面积是在0-1之间取值\n", 192 | "from sklearn.metrics import roc_auc_score\n", 193 | "\n", 194 | "roc_auc_score(y_test, decision_scores)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "从结果可以看出,ROC其实对于有偏的数据其实没有 Precision 和 Recall 那么敏感。" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.0" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/81 信息熵.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 信息熵" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 6, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# 信息熵\n", 27 | "def entropy(p):\n", 28 | " return -p * np.log(p) - (1-p) * np.log(1-p)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 7, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "x = np.linspace(0.01, 0.99, 200) # x不为0和1,log0为无穷" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 8, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "[]" 49 | ] 50 | }, 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | }, 55 | { 56 | "data": { 57 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xd8lfXd//HX52QnZEEGIQkkgQQIICtsEdyoFdyCkzqoWmtr1621tb3t3XFrq71rsRZX1SponYgoOHCwCZswE2b2gOydfH9/JPaX0kAO5JxznfF5Ph48HjknX5P3ZfDtN9/rur6XGGNQSinlXWxWB1BKKeV4Wu5KKeWFtNyVUsoLabkrpZQX0nJXSikvpOWulFJeSMtdKaW8kJa7Ukp5IS13pZTyQv5WfeOYmBiTkpJi1bdXSimPtHnz5nJjTGxP4ywr95SUFLKzs6369kop5ZFE5Ig943RZRimlvJCWu1JKeSG7yl1EZonIPhHJFZGHuvn8UyKyrfPPfhGpdHxUpZRS9upxzV1E/ICFwMVAPrBJRJYaY3Z/M8YY82CX8d8Dxjohq1JKKTvZM3OfCOQaYw4aY5qBJcCc04yfByx2RDillFJnx55yTwSOdXmd3/nefxCRQUAq8HnvoymllDpb9pS7dPPeqR7fNBd4yxjT1u0XElkgItkikl1WVmZvRqWUUmfInuvc84HkLq+TgMJTjJ0LfPdUX8gYswhYBJCVlaXP91MuZ4yhsr6F0pomSmsaOVHfQmNzGw0tnX+a2xCBAD8bAX6Cv81GUICNvqGBRIcF0jcskOjQQPqFBWKzdTfvUco92FPum4B0EUkFCugo8JtOHiQiQ4FoYJ1DEyp1Fk7UNbO3uIa8slryymrJLa3lUHkdpdVNNLe19/rrB/rZSO4bwqB+YQzsG0pqTBiZAyLITIggLMiyewOV+pce/xYaY1pF5H5gBeAHvGiMyRGRx4BsY8zSzqHzgCVGn7itXKy1rZ3dRdVsPVrJ1qMn2HasksMV9f/6fGigH4Nj+zBuYDQJUcHEhwcTFxFEXHgwfcMCCAn0JyTAj5AAP4L8O1YqW9rbaW0ztLYZGlvbOF7XzIm6Zo7XN3O8rpmCygaOlNdz5Hg96w9WUN/csRIpAmkxYYxMjGRMchRTB8eQEd8HEZ3lK9cSq7o4KyvL6PYD6mwVVDbw1f4yvtpfxprccqobWwGICw9i7MAoxg6MJjMhgiFxfUiIDHZquRpjKKluIqewil0F1ewqrGJXQRVFVY0AxPQJZHJaP6YOjuH8YbEkRIY4LYvyfiKy2RiT1dM4/f1ReYzc0hqW7Shi+c4i9pfUApAQGcxlIxM4Nz2G8YOinV7k3RER+kcG0z8ymAuHx//r/fwT9azLq2BdXgVr8spZtqMIgNHJUVw6Ip5LR/RncGwfl2ZVvkNn7sqtHTtez3tbC/hwZxF7i2sQgYkpfbk4M54ZGbEMifOMJQ9jDHlltazIKWFlTjHb86sAyIjvw7Xjkrh6XCJx4cEWp1SewN6Zu5a7cjstbe18uruE1zceZXVuOcZA1qBovnVOApePSiAuwvNLsLCygZU5xSzdXsiWo5X42YSZGbFcn5XEBcPiCfTXbZ9U97TclccprmrklXWHeTM7n/LaJhIig7lxQjLXZyWTGOW969R5ZbW8tTmfd7bkU1LdRHxEELdNSWHexIH0DQu0Op5yM1ruymMcKKlh0VcHeW9bAW3thguGxXPTpGRmZMTh50PXkre1G77cX8pLaw7z9YFygvxtXDMuiTumpZAeH251POUmtNyV29t85AR//SKXT/eUEhxg48asZO6ankZy31Cro1luf0kNL605xDtbCmhqbefyUf154MJ0hvWPsDqaspiWu3JbOYVV/HHlfj7fW0p0aAC3T03htikpugTRjYraJl5ee5gX1xymtqmVK0Yl8MCF6QztrzN5X6XlrtzOwbJanvr0AB9sLyQi2J97Zg5m/tQUQgP1ityeVNY388LqQ7y05jB1za1cec4AfjprKEnR+luOr9FyV26jqr6Fpz7dz6vrjxDkb+OOaancfV4akSEBVkfzOCfqmnnu64O8sPoQBrjr3FTuO38IfXTLA5+h5a4s19ZueGPTMZ5YsZeqhhZunjSIBy5MJzY8yOpoHq+wsoHHP97Le9sKiekTxI8vyeD6rGSfOgHtq7TclaU2HznOL5fmsKugmompffnVlSPIHKAnAx1t27FKfr1sN5uPnGBMchS/u2YUwxP037M303JXlqhtauXxj/fyyrojJEQG87PLh/OtcxI84i5ST2WM4f1thfx62W6qGlq4a3oa378wnZBAP6ujKSfQvWWUy325v4yfvbOTwqoGvj0thR9fMlS3v3UBEeGqsYnMyIjldx/t4dkv81i+s4jfXD2S6emxVsdTFtF7nFWvVdW38KM3t3P7ixsJDrDx1j1T+OWVI7TYXSw6LJDHrxvN4rsn428Tbn1hI794bxcNzd0+GE15Of2vT/XK+oMVPPjGNkprmvju+YP53gXpBAfocoCVpgzux/LvT+cPK/bx/OpDrMkt58kbxzAmOcrqaMqFdOauzkpLWzt/XLmPec+tJ8jfxjv3TuUnlw7TYncTwQF+/Pxbmbx+1yQaWtq49q9r+b9PD9DqgKdQKc+g5a7O2NGKem742zqe/jyX68Yl8eED0xmts0K3NHVIDB//4DyuPCeBpz7dz9xF6ynufIiI8m5a7uqMrMgp5oo/f01uaS1PzxvLE9eP1rV1NxcZEsCf5o7lTzeOYXdRNVf8+WtWHyi3OpZyMi13ZZe2dsPjH+/lO69uJi02jI++P50rRw+wOpY6A1eNTWTp/dOIDgvk1hc38OfPDtDero889lZa7qpHx+uamf/SRp75Io95E5N54ztTdE8TDzUkLpz3vzuNOaMH8OQn+5n/901U1bdYHUs5gZa7Oq1dBVVc+fRqNhw6zv9eO4rfXXOOnjT1cGFB/jx14xh+c/VI1uWVc9Uza8grq7U6lnIwLXd1Sityirnu2bUYY3jrnincOGGg1ZGUg4gIN08axOt3T6aqoYWrF67h6wNlVsdSDmRXuYvILBHZJyK5IvLQKcbcICK7RSRHRF53bEzlSsYYFn2Vxz3/2Myw/hG8f/+5nJOkV8N4owkpfXn/u9NIiAxh/kubeGXdYasjKQfpsdxFxA9YCFwGZALzRCTzpDHpwMPANGPMCOAHTsiqXKClrZ1H3tvFb5fv5fKRCSxZMFl3cfRyyX1Defu+qZw/NJZH38/hl+/vok1PtHo8e2buE4FcY8xBY0wzsASYc9KYu4GFxpgTAMaYUsfGVK5Q09jCHX/fxOsbjnLfzME8PW+srq/7iD5B/vzt1izunp7Ky+uO8L3FW2hs0W0LPJk9FygnAse6vM4HJp00JgNARNYAfsCvjDEfOyShcomK2ibmv7SJPUXVPH7tOdwwIdnqSMrF/GzCI1dkEh8RzP98uIeK2o08d3sWEcH6UBVPZM/Mvbu9Wk/+nc0fSAdmAvOA50XkPxZpRWSBiGSLSH
ZZmZ68cReFlQ1c/7d17C+pYdFt47XYfdxd09P4v7lj2HL0BDc8u46Sar2j1RPZU+75QNf/2pOAwm7GvG+MaTHGHAL20VH2/8YYs8gYk2WMyYqN1a1I3UFeWS3X/XUtZdVNvHrnJC4YFm91JOUG5oxJ5IXbJ3D0eD3XPLOWw+V1VkdSZ8iect8EpItIqogEAnOBpSeNeQ84H0BEYuhYpjnoyKDK8XYVVHH9s+tobmtn8YLJTEzta3Uk5UbOy4hlyYLJNLS0ceOideSW6rXwnqTHcjfGtAL3AyuAPcCbxpgcEXlMRGZ3DlsBVIjIbmAV8BNjTIWzQqve23asknnPrSckwI9/3jOVkYmRVkdSbuicpCgW3z2ZtnbD3EXr2V9SY3UkZSd9zJ4P2naskltf2EB0aCCLF0wmMSrE6kjKzeWW1nLTc+tpbTf8485J+jxcC9n7mD29Q9XHaLGrszEkrg9vfGcKQf42bnp+PbsKqqyOpHqg5e5DtNhVb6TGhPHGgimEBfpz03Pr2VNUbXUkdRpa7j5iR74Wu+q9gf1CWbJgMqGB/tz6wgbdcMyNabn7gNzSGm5/cSORIQFa7KrXkvuG8trdHfcx3vL8Bo4dr7c4keqOlruXyz9Rzy3Pb8TPZuMfd07SYlcOMTi2D6/cMYm6plZueWGD3ujkhrTcvVh5bRO3vrCR+uZWXr1zIikxYVZHUl4kc0AEL98xkfKaJm55fgPH65qtjqS60HL3UtWNLdz2wkaKqhp46dsTGJ6gl64pxxs7MJoX5nfcyXrH3zfR0KybjbkLLXcv1NjSxl0vZ3OgtIZnbxnP+EF656lynslp/fjzvLHsyK/ke4u30trWbnUkhZa712lvN/z4n9vZeOg4f7xhDDOHxlkdSfmAS0f051ezR/DpnhJ+uTQHq26OVP+fPVv+Kg/yh5X7WLajiIcuG8bs0QOsjqN8yG1TUiisbOTZL/MYEBXCd88fYnUkn6bl7kWWbDzKM1/kMW/iQL5zXprVcZQP+umlQymuauCJFftIiAzmmnFJVkfyWVruXuKr/WU88t4uzsuI5ddzRiDS3Tb8SjmXzSY8ft1oSmua+OlbO0iMCmFSWj+rY/kkXXP3AnuLq7nvtS2kx/Vh4U1j8ffTH6uyTqC/jb/eMp6B/UK597UtepOTRbQFPNzxumbuejmb0EA/Xpw/gXB9JJpyA5EhATx/Wxatbe3c/Uo2tU2tVkfyOVruHqy1rZ37X99CaU0Tz92WxQC9+1S5kbTYPiy8eRwHSmt58I1ttLfrFTSupOXuwX67fC9r8yr4zVUjGZ38H4+sVcpy09Nj+fkVw/lkdwl//GSf1XF8ip5Q9VBvb87nxTWHmD81heuz9IHWyn3Nn5rCvuIaFq7KIyM+nDljEq2O5BN05u6Bth+r5OF3dzIlrR+PXDHc6jhKnZaI8NickUxIieaht3fqo/pcRMvdw5TVNPGdVzcT2yeIhTePI0CvjFEeINDfxsKbxhEW5M89/9isJ1hdQJvBg7S1Gx58Yxsn6ptZdNt4+oYFWh1JKbvFRQTz9LyxHC6v47/e2qFbFDiZlrsH+cvnuazOLee/Z49gxIBIq+ModcamDO7HTy4dxoc7i3hpzWGr43g1LXcPsTa3nD99tp+rxyZy4wQ9gao81z0z0rg4M57fLt/D5iPHrY7jtbTcPUBpTSMPLNlGWkwY/3PVSN1aQHk0EeEP148mMTqE+17bQkVtk9WRvJJd5S4is0Rkn4jkishD3Xx+voiUici2zj93OT6qb2prN3x/8TZqm1p45ubxhAXp1avK80WGBPDMzeM4Ud/CT3T93Sl6LHcR8QMWApcBmcA8EcnsZugbxpgxnX+ed3BOn/V/nx1g3cEKfj1nJEP7h1sdRymHGTEgkocvG8bne0t5Zd0Rq+N4HXtm7hOBXGPMQWNMM7AEmOPcWApg0+Hj/OXzA1w7LklvVFJeaf7UFM4fGstvlu9hT1G11XG8ij3lnggc6/I6v/O9k10rIjtE5C0R6baJRGSBiGSLSHZZWdlZxPUd1Y0tPPjGNpKiQ/nvOSOsjqOUU4gIT1w/mojgAB5YvFWfwepA9pR7d2fvTl4g+wBIMcacA3wKvNzdFzLGLDLGZBljsmJjY88sqY/51dIciqoaeerGMfTRdXblxWL6BPHkDaM5UFrLb5bvtjqO17Cn3POBrjPxJKCw6wBjTIUx5ptT3s8B4x0Tzzct21HIO1sKuP/8IYwfFG11HKWc7ryMWBacl8Y/1h9lRU6x1XG8gj3lvglIF5FUEQkE5gJLuw4QkYQuL2cDexwX0bcUVTXwyLu7GJMcxfcu0GdQKt/x40uGMjIxgofe3kFZjV4e2Vs9lrsxphW4H1hBR2m/aYzJEZHHRGR257AHRCRHRLYDDwDznRXYm7W3G3705nZa2tr5041j9IlKyqcE+tt46oYx1DW38fP3durlkb1k12KuMWY5sPyk9x7t8vHDwMOOjeZ7/r72MGvzKvjfa0eREhNmdRylXC49PpwfXZzB7z7ay/vbCrlqrG4PfLZ0augmDpfX8fiKvVwwLI4b9LJH5cPump7GuIFR/HJpDiXVjVbH8Vha7m6gvd3w07d3EOBn47dXj9LtBZRP87N1bE/Q1NrGw+/o8szZ0nJ3A//YcISNh47ziysy6R8ZbHUcpSyXFtuHn17acffqW5vzrY7jkbTcLXbseD2//2gv52XEcn1WktVxlHIb86emMDG1L499sJvCygar43gcLXcLGWN46J0d2ET43TW6HKNUVzab8IfrRtPabnj0/V26PHOGtNwttHjjMdbkVvDw5cNIjAqxOo5Sbmdgv1B+eHEGn+4p5eNdenPTmdByt0hJdSO/Xb6HqYP7cdPEgVbHUcptfXtaCiMGRPDLpTlUNbRYHcdjaLlb5LFlu2lua9erY5Tqgb+fjd9fcw7ltU08/vFeq+N4DC13C3yxr5QPdxTxvfOH6M1KStlhVFIk356WymsbjpJ9WB/NZw8tdxdrbGnj0fdzSIsNY8GMNKvjKOUxfnhxBolRITz8zk6aW9utjuP2tNxdbOGqXI4er+d/rhpJkL+f1XGU8hhhQf78+qoRHCit5W9f5lkdx+1pubtQbmktz36ZxzVjE5k6OMbqOEp5nAuGxXPFOQk8vSqXoxX1Vsdxa1ruLmKM4efv7SQkwI+fXTHc6jhKeaxfXJGJv014bJk+2ON0tNxd5N2tBaw/eJyHLhtOTJ8gq+Mo5bH6RwbzwIXpfLqnhFX7Sq2O47a03F2gtqmV3320lzHJUcydoDs+KtVbd0xLJS0mjMc+2E1Tqz53tTta7i6wcFUuZTVN/Gr2CGw2vaZdqd4K9Lfx6JWZHCqv48XVh62O45a03J3sSEUdL3x9iGvHJTEmOcrqOEp5jZlD47g4M56nPz9AcZXu+34yLXcn+82HewjwE/5r1lCroyjldX5xRSat7YbfLtfHNp9My92JVh8oZ+XuEr57wRDiInSfdqUcbWC/UO45L42l2wvZcLDC6jhuRcvdSVrb2nlsWQ4D+4Zyx7RUq+Mo5bXunTmEx
[remaining base64 PNG data for the plt.plot(x, entropy(x)) figure omitted]\n", 58 | "text/plain": [ 59 | "
" 60 | ] 61 | }, 62 | "metadata": { 63 | "needs_background": "light" 64 | }, 65 | "output_type": "display_data" 66 | } 67 | ], 68 | "source": [ 69 | "plt.plot(x, entropy(x))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "该图像是以0.5为对称轴的,意味着当 x=0.5 时,这根曲线得到最大值。如果对于信息熵只有两个类别,当一个类别时1/2,另一个也是1/2时,此时整个数据的信息熵就是最大,此时数据是最不稳定的,确定性是最低的。" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.7.0" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Machine-learning-algorithm/76 高斯核函数.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 直观理解高斯核函数" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 8, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "array([-4, -3, -2, -1, 0, 1, 2, 3, 4])" 29 | ] 30 | }, 31 | "execution_count": 8, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "x = np.arange(-4, 5, 1)\n", 38 | "x" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "array([0, 0, 1, 1, 1, 1, 1, 0, 0])" 50 | ] 51 | }, 52 | "execution_count": 9, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "y = np.array((x >= -2) & (x <= 2), dtype='int')\n", 59 | "y" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 10, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "" 71 | ] 72 | }, 73 | "execution_count": 10, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | }, 77 | { 78 | "data": { 79 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYwAAAD8CAYAAABkbJM/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAE9VJREFUeJzt3X+wXPV93vH3UwkIcQPmh3BAgoopiqfEcUmyI7vjacc1BmTXg0hLamXcRJniUZMx43TaOIUyMVRuOmCmdScDk45sqIkbBxhih2unrizATCYzNtZVjPkZFQ2xowuqkStM7ZTYFvn0jz3XXa733v1e7ZV2Be/XzM7u+e73nPMgtPe5Z8/ZVaoKSZJG+RuTDiBJOj5YGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpyYoURpJNSfYm2ZfkmiHPn5Tkru75h5Ks78bPSPKFJN9JcsuCdR7stvlwdztrJbJKko7M6nE3kGQVcCtwCTAH7E4yU1VPDEy7Cni+qi5IsgW4CXg38FfAbwJv6G4LvaeqZsfNKEka39iFAWwE9lXV0wBJ7gQ2A4OFsRm4oXt8D3BLklTVXwJ/kuSCFcjBmWeeWevXr1+JTUnSq8aePXu+WVVrRs1bicJYC+wfWJ4D3rTYnKo6nOQF4AzgmyO2/V+TvAT8AfDva8T3mKxfv57ZWQ9IJGk5kny9Zd5KnMPIkLGFP9hb5iz0nqr6KeDvd7dfHLrzZFuS2SSzBw8eHBlWknRkVqIw5oBzB5bXAc8uNifJauBU4NBSG62qZ7r7bwOfpP/W17B5O6qqV1W9NWtGHlFJko7QShTGbmBDkvOTnAhsAWYWzJkBtnaPrwQeWOrtpSSrk5zZPT4BeBfw2ApklSQdobHPYXTnJK4GdgKrgNur6vEk24HZqpoBbgM+kWQf/SOLLfPrJ/kacApwYpIrgEuBrwM7u7JYBdwHfHTcrJKkI5dX0r+H0ev1ypPekrQ8SfZUVW/UPD/pLUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJitSGEk2JdmbZF+Sa4Y8f1KSu7rnH0qyvhs/I8kXknwnyS0L1vnZJI926/x2kqxEVknSkRm7MJKsAm4F3gFcCPxCkgsXTLsKeL6qLgA+AtzUjf8V8JvArw/Z9O8A24AN3W3TuFklSUduJY4wNgL7qurpqvoecCewecGczcAd3eN7gIuTpKr+sqr+hH5x/ECSs4FTquqLVVXA7wJXrEBWSdIRWonCWAvsH1ie68aGzqmqw8ALwBkjtjk3YpsAJNmWZDbJ7MGDB5cZXZLUaiUKY9i5hTqCOUc0v6p2VFWvqnpr1qxZYpOSpHGsRGHMAecOLK8Dnl1sTpLVwKnAoRHbXDdim5KkY2glCmM3sCHJ+UlOBLYAMwvmzABbu8dXAg905yaGqqoDwLeTvLm7OuqXgHtXIKsk6QitHncDVXU4ydXATmAVcHtVPZ5kOzBbVTPAbcAnkuyjf2SxZX79JF8DTgFOTHIFcGlVPQH8KvBx4GTgc91NkjQhWeIX/eNOr9er2dnZSceQpONKkj1V1Rs1z096S5KaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpyYoURpJNSfYm2ZfkmiHPn5Tkru75h5KsH3ju2m58b5LLBsa/luTRJA8nmV2JnJKkI7d63A0kWQXcClwCzAG7k8xU1RMD064Cnq+qC5JsAW4C3p3kQmAL8JPAOcB9SX6iql7q1vuHVfXNcTNKksa3EkcYG4F9VfV0VX0PuBPYvGDOZuCO7vE9wMVJ0o3fWVXfrao/B/Z125MkTZmVKIy1wP6B5blubOicqjoMvACcMWLdAj6fZE+SbYvtPMm2JLNJZg8ePDjWf4gkaXErURgZMlaNc5Za9y1V9TPAO4D3JfkHw3ZeVTuqqldVvTVr1rRmliQt00oUxhxw7sDyOuDZxeYkWQ2cChxaat2qmr9/Dvg0vlUlSRO1EoWxG9iQ5PwkJ9I/iT2zYM4MsLV7fCXwQFVVN76lu4rqfGAD8OUkr0nyYwBJXgNcCjy2AlklSUdo7KukqupwkquBncAq4PaqejzJdmC2qmaA24BPJNlH/8hiS7fu40nuBp4ADgPvq6qXkrwO+HT/vDirgU9W1f8YN6sk6cil/4v+K0Ov16vZWT+yIUnLkWRPVfVGzfOT3pKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKnJihRGkk1J9ibZl+SaIc+flOSu7vmHkqwfeO7abnxvkstatylJOrZWj7uBJKuAW4FLgDlgd5KZqnpiYNpVwPNVdUGSLcBNwLuTXAhsAX4SOAe4L8lPdOuM2uaK+cOvPMPNO/fy7Lde5JzXnswHLns9V/z02qOxq1dELh65G+7fDi/Mwanr4OIPwhv/6aRTTWeuacxkrmWb1tfisc41dmEAG4F9VfU0QJI7gc3A4A/3zcAN3eN7gFuSpBu/s6q+C/x5kn3d9mjY5or4w688w7WfepQXv/8SAM9860Wu/dSjABP9CzGtuXjkbvjM++H7L/aXX9jfX4bJvrCnMdc0ZjLXsk3ra3ESuVbiLam1wP6B5blubOicqjoMvACcscS6LdtcETfv3PuDP/B5L37/JW7eufdo7K7ZtObi/u3//wU97/sv9scnaRpzTWMmMNcyTetrcRK5VqIwMmSsGucsd/yHd55sSzKbZPbgwYNLBh3m2W+9uKzxY2Vac/HC3PLGj5VpzDWNmZbav7mGmtbX4iRyrURhzAHnDiyvA55dbE6S1cCpwKEl1m3ZJgBVtaOqelXVW7NmzbLDn/Pak5c1fqxMay5OXbe88WNlGnNNY6al9m+uoab1tTiJXCtRGLuBDUnOT3Ii/ZPYMwvmzABbu8dXAg9UVXXjW7qrqM4HNgBfbtzmivjAZa/n5BNWvWzs5BNW8YHLXn80dtdsWnNx8QfhhAV/IU84uT8+SdOYaxozgbmWaVpfi5PINfZJ76o6nORqYCewCri9qh5Psh2YraoZ4DbgE91J7UP0C4Bu3t30T2YfBt5XVS8BDNvmuFmHmT85NG1XQExrrh+cfJy2K1mmMdc0ZjLXsk3ra3ESudL/Rf+Vodfr1ezs7KRjSNJxJcmequqNmucnvSVJTSwMSVITC0OS1MTCkC
Q1sTAkSU0sDElSEwtDktTEwpAkNbEwJElNLAxJUhMLQ5LUxMKQJDWxMCRJTSwMSVITC0OS1MTCkCQ1sTAkSU0sDElSEwtDktTEwpAkNbEwJElNLAxJUhMLQ5LUxMKQJDWxMCRJTSwMSVITC0OS1GSswkhyepJdSZ7q7k9bZN7Wbs5TSbYOjP9skkeT7Evy20nSjd+Q5JkkD3e3d46TU5I0vnGPMK4B7q+qDcD93fLLJDkduB54E7ARuH6gWH4H2AZs6G6bBlb9SFVd1N3++5g5JUljGrcwNgN3dI/vAK4YMucyYFdVHaqq54FdwKYkZwOnVNUXq6qA311kfUnSFBi3MF5XVQcAuvuzhsxZC+wfWJ7rxtZ2jxeOz7s6ySNJbl/srS5J0rEzsjCS3JfksSG3zY37yJCxWmIc+m9V/W3gIuAA8B+XyLctyWyS2YMHDzZGkiQt1+pRE6rq7Ys9l+QbSc6uqgPdW0zPDZk2B7x1YHkd8GA3vm7B+LPdPr8xsI+PAp9dIt8OYAdAr9erxeZJksYz7ltSM8D8VU9bgXuHzNkJXJrktO6tpUuBnd1bWN9O8ubu6qhfml+/K595Pwc8NmZOSdKYRh5hjHAjcHeSq4C/AH4eIEkP+JWqem9VHUryIWB3t872qjrUPf5V4OPAycDnuhvAh5NcRP8tqq8B/2LMnJKkMaV/gdIrQ6/Xq9nZ2UnHkKTjSpI9VdUbNc9PekuSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqYmFIUlqYmFIkppYGJKkJhaGJKmJhSFJamJhSJKaWBiSpCYWhiSpiYUhSWpiYUiSmlgYkqQmFoYkqclYhZHk9CS7kjzV3Z+2yLyt3ZynkmwdGP+tJPuTfGfB/JOS3JVkX5KHkqwfJ6ckaXzjHmFcA9xfVRuA+7vll0lyOnA98CZgI3D9QLF8phtb6Crg+aq6APgIcNOYOSVJYxq3MDYDd3SP7wCuGDLnMmBXVR2qqueBXcAmgKr6UlUdGLHde4CLk2TMrJKkMYxbGK+b/4Hf3Z81ZM5aYP/A8lw3tpQfrFNVh4EXgDPGzCpJGsPqUROS3Af8+JCnrmvcx7Ajg1qpdZJsA7YBnHfeeY2RJEnLNbIwqurtiz2X5BtJzq6qA0nOBp4bMm0OeOvA8jrgwRG7nQPOBeaSrAZOBQ4tkm8HsAOg1+uNKiJJ0hEa9y2pGWD+qqetwL1D5uwELk1yWney+9JurHW7VwIPVJVlIEkTNG5h3AhckuQp4JJumSS9JB8DqKpDwIeA3d1tezdGkg8nmQN+NMlckhu67d4GnJFkH/CvGHL1lSTp2Mor6Rf3Xq9Xs7Ozk44hSceVJHuqqjdqnp/0liQ1sTAkSU0sDElSEwtDktTEwpAkNbEwJElNLAxJUhMLQ5LUxMKQJDWxMCRJTSwMSVITC0OS1MTCkCQ1sTAkSU0sDElSEwtDktTEwpAkNbEwJElNLAxJUhMLQ5LUxMKQJDWxMCRJTSwMSVITC0OS1MTCkCQ1sTAkSU0sDElSk7EKI8npSXYleaq7P22ReVu7OU8l2Tow/ltJ9if5zoL5v5zkYJKHu9t7x8kpSRrfuEcY1wD3V9UG4P5u+WWSnA5cD7wJ2AhcP1Asn+nGhrmrqi7qbh8bM6ckaUzjFsZm4I7u8R3AFUPmXAbsqqpDVfU8sAvYBFBVX6qqA2NmkCQdA+MWxuvmf+B392cNmbMW2D+wPNeNjfJPkjyS5J4k546ZU5I0ptWjJiS5D/jxIU9d17iPDBmrEet8Bvj9qvpukl+hf/TytkXybQO2AZx33nmNkSRJyzWyMKrq7Ys9l+QbSc6uqgNJzgaeGzJtDnjrwPI64MER+/zfA4sfBW5aYu4OYEeX52CSry+17RHOBL45xvpHi7mWZxpzTWMmMNdyvVJz/a2WSSMLY4QZYCtwY3d/75A5O4H/MHCi+1Lg2qU2Ol9C3eLlwJMtYapqTcu8JfY7W1W9cbZxNJhreaYx1zRmAnMt16s917jnMG4ELknyFHBJt0ySXpKPAVTVIeBDwO7utr0bI8mHk8wBP5pkLskN3Xbfn+TxJF8F3g/88pg5JUljGusIo3vr6OIh47PAeweWbwduHzLvN4DfGDJ+LSOOQiRJx5af9H65HZMOsAhzLc805prGTGCu5XpV50rVqAuWJEnyCEOS1MjCGCLJryepJGdOOgtAkg91H2J8OMnnk5wz6UwASW5O8mddtk8nee2kMwEk+fnuoom/TjLxK1qSbEqyN8m+JD/09TmTkOT2JM8leWzSWQYlOTfJF5I82f0//LVJZwJI8iNJvpzkq12ufzfpTIOSrErylSSfPZr7sTAW6D5VfgnwF5POMuDmqnpjVV0EfBb44KQDdXYBb6iqNwL/k+m5UOEx4B8DfzzpIElWAbcC7wAuBH4hyYWTTQXAx+m+omfKHAb+dVX9HeDNwPum5M/ru8DbqurvAhcBm5K8ecKZBv0ajR8/GIeF8cM+Qv/Krak5uVNV/2dg8TVMSbaq+nxVHe4Wv0T/Q5kTV1VPVtXeSefobAT2VdXTVfU94E7638E2UVX1x8ChSedYqKoOVNWfdo+/Tf+HYMtXCR1V1Tf/rdondLepeB0mWQf8I+Cof0mrhTEgyeXAM1X11UlnWWj+q+CB9zA9RxiD/jnwuUmHmEJH+l1qr3pJ1gM/DTw02SR93ds+D9P/RotdVTUVuYD/TP+X3L8+2jsa95Pex50R3431b+l/Ev2YWypXVd1bVdcB1yW5Fria/lfGTzxXN+c6+m8l/N6xyNSaa0ocyXepveol+ZvAHwD/csER9sRU1UvARd25uk8neUNVTfQcUJJ3Ac9V1Z4kbz3a+3vVFcZi342V5KeA84GvJoH+2yt/mmRjVf2vSeUa4pPAH3GMCmNUru4fxHoXcHEdw2u0l/HnNWlzwOC3La8Dnp1QluNCkhPol8XvVdWnJp1noar6VpIH6Z8DmvRFA28BLk/yTuBHgFOS/Leq+mdHY2e+JdWpqker6qyqWl9V6+m/0H/mWJTFKEk2DCxeDvzZpLIMSrIJ+DfA5VX1fyedZ0rtBjYkOT/JicAW+t/BpiHS/23tNuDJqvpPk84zL8ma+asAk5wMvJ0peB1W1bVVta77mbUFeOBolQVYGMeLG5M8luQR+m+ZTcWlhsAtwI8Bu7pLfv/LpAMBJPm57jvK/h7wR0l2TipLd1HA1fS/hPNJ4O6qenxSeeYl+X3gi8Dru+9xu2rSmTpvAX4ReNvAP9H8zkmHAs4GvtC9BnfTP4dxVC9hnUZ+0luS1MQjDElSEwtDktTEwpAkNbEwJElNLAxJUhMLQ5LUxMKQJDWxMCRJTf4f/hu4N65yoy8AAAAASUVORK5CYII=\n", 80 | "text/plain": [ 81 | "
" 82 | ] 83 | }, 84 | "metadata": { 85 | "needs_background": "light" 86 | }, 87 | "output_type": "display_data" 88 | } 89 | ], 90 | "source": [ 91 | "plt.scatter(x[y==0], [0]*len(x[y==0]))\n", 92 | "plt.scatter(x[y==1], [0]*len(x[y==1]))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "可以看见现在的数据是线性不可分的" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## 使用高斯核函数" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 11, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "def gaussian(x, l): # x:数据点 l:地标\n", 116 | " gamma = 1.0 # 先固定该值\n", 117 | " return np.exp(-gamma * (x-l)**2) " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 14, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "l1, l2 = -1, 1 # 取地标\n", 127 | "\n", 128 | "X_new = np.empty((len(x), 2))\n", 129 | "for i, data in enumerate(x):\n", 130 | " X_new [i, 0] = gaussian(data, l1)\n", 131 | " X_new[i, 1] = gaussian(data, l2)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 16, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 16, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | }, 149 | { 150 | "data": { 151 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAELxJREFUeJzt3X+s3XV9x/Hny7Zol2G72ZpoWy1uxdggCeaGYUgmBjcLCa0xrKEJURcC0Q35Q2MCcWGkbtFJNhYSNm0W449MsRqDxdWQDDAuRhiXoEUgXbr6oxfMuCr0H6sUfO+Pc4DL5bbne9pz7un99PlImvP9fr7vfs/70++9r3zP95xvT6oKSVJbXjHpBiRJo2e4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhq0fFJPvGbNmtq4ceOknl6SlqQHH3zwF1W1dlDdxMJ948aNTE9PT+rpJWlJSvLTLnVelpGkBhnuktQgw12SGmS4S1KDDHdJatDAcE/yuSRPJvnRMbYnya1JDiTZl+Rto29zCPt2wy3nwE2re4/7dk+0HUmahC5n7p8Hthxn+yXApv6fa4B/Pfm2TtC+3XDndXD4EFC9xzuvM+AlnXYGhntVfRf41XFKtgFfrJ77gNVJXjeqBody9044euSlY0eP9MYl6TQyimvu64BDc9Zn+mMvk+SaJNNJpmdnZ0fw1PMcnhluXJIaNYpwzwJjC37rdlXtqqqpqppau3bg3bPDW7V+uHFJatQown0G2DBnfT3wxAj2O7yLb4QVK186tmJlb1ySTiOjCPc9wPv6n5q5ADhcVT8fwX6Hd+52uOxWWLUBSO/xslt745J0Ghn4H4cl+QpwEbAmyQzwt8AKgKr6DLAXuBQ4APwa+MtxNdvJudsNc0mnvYHhXlU7Bmwv4K9H1pEk6aR5h6okNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqUKdwT7Ilyf4kB5Jcv8D2NyS5N8lDSfYluXT0rUqSuhoY7kmWAbcBlwCbgR1JNs8r+xtgd1WdB1wB/MuoG5UkddflzP184EBVHayqZ4DbgW3zagp4dX95FfDE6FqUJA2rS7ivAw7NWZ/pj811E3BlkhlgL/DhhXaU5Jok00mmZ2dnT6BdSVIXXcI9C4zVvPUdwOeraj1wKfClJC/bd1Xtqqqpqppau3bt8N1KkjrpEu4zwIY56+t5+WWXq4DdAFX1feBVwJpRNChJGl6XcH8A2JTkrCRn0HvDdM+8mp8BFwMkeQu9cPe6iyRNyMBwr6pngWuBu4DH6H0q5pEkO5Ns7Zd9FLg6yQ+BrwAfqKr5l24kSYtkeZeiqtpL743SuWM3zll+FLhwtK1Jkk6Ud6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBnUK9yRbkuxPciDJ9ceo2Z7k0SSPJPnyaNuUJA1j+aCCJMuA24A/A2aAB5LsqapH59RsAm4ALqyqp5K8dlwNS5IG63Lmfj5woKoOVtUzwO3Atnk1VwO3VdVTAFX15GjblCQNo0u4rwMOzVmf6Y/NdTZwdpLvJbkvyZaFdpTkmiTTSaZnZ2dPrGNJ0kBdwj0LjNW89eXAJuAiYAfwb0lWv+wvVe2qqqmqmlq7du2wvUqSOuoS7jPAhjnr64EnFqj5ZlUdraofA/vphb0kaQK6hPsDwKYkZyU5A7gC2DOv5g7gnQBJ1tC7THNwlI1KkrobGO5V9SxwLXAX8Biwu6oeSbIzydZ+2V3AL5M8CtwLfKyqfjmupiVJx5eq+ZfPF8fU1FRNT09P5LklaalK8mBVTQ2q8w5VSWq
Q4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1KBO4Z5kS5L9SQ4kuf44dZcnqSRTo2tRkjSsgeGeZBlwG3AJsBnYkWTzAnVnAtcB94+6SUnScLqcuZ8PHKiqg1X1DHA7sG2Buk8AnwZ+M8L+JEknoEu4rwMOzVmf6Y+9IMl5wIaq+tbxdpTkmiTTSaZnZ2eHblaS1E2XcM8CY/XCxuQVwC3ARwftqKp2VdVUVU2tXbu2e5eSpKF0CfcZYMOc9fXAE3PWzwTOAb6T5CfABcAe31SVpMnpEu4PAJuSnJXkDOAKYM/zG6vqcFWtqaqNVbURuA/YWlXTY+lYkjTQwHCvqmeBa4G7gMeA3VX1SJKdSbaOu8Fj2rcbbjkHblrde9y3e2KtaEgeO2nslncpqqq9wN55Yzceo/aik29rgH274c7r4OiR3vrhQ711gHO3j/3pdRI8dtKiWJp3qN6988VweN7RI71xndo8dtKiWJrhfnhmuHGdOjx20qJYmuG+av1w4zp1eOykRbE0w/3iG2HFypeOrVjZG9epzWMnLYqlGe7nbofLboVVG4D0Hi+71TfklgKPnbQoUlWDq8Zgamqqpqf9KLwkDSPJg1U18CbRpXnmLkk6LsNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAZ1CvckW5LsT3IgyfULbP9IkkeT7Etyd5I3jr5VSVJXA8M9yTLgNuASYDOwI8nmeWUPAVNVdS7wdeDTo25UktRdlzP384EDVXWwqp4Bbge2zS2oqnur6tf91fuA9aNtU5I0jC7hvg44NGd9pj92LFcB315oQ5JrkkwnmZ6dne3epSRpKF3CPQuM1YKFyZXAFHDzQturaldVTVXV1Nq1a7t3KUkayvIONTPAhjnr64En5hcleRfwceAdVfXb0bQnSToRXc7cHwA2JTkryRnAFcCeuQVJzgM+C2ytqidH36YkaRgDw72qngWuBe4CHgN2V9UjSXYm2dovuxn4feBrSX6QZM8xdidJWgRdLstQVXuBvfPGbpyz/K4R9yVJOgneoSpJDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNWj5pBs4UXc89Dg337WfJ54+wutXr+Rj734z7zlv3aTbkqRj27cb7t4Jh2dg1Xq4+EY4d/tYnmpJhvsdDz3ODd94mCNHnwPg8aePcMM3HgYw4CWdmvbthjuvg6NHeuuHD/XWYSwBvyQvy9x81/4Xgv15R44+x8137Z9QR5I0wN07Xwz25x090hsfg07hnmRLkv1JDiS5foHtr0zy1f72+5NsHHWjcz3x9JFjjt/x0ONc+Kl7OOv6/+DCT93DHQ89Ps5WdCL27YZbzoGbVvce9+2edEfS+B2eGW78JA0M9yTLgNuAS4DNwI4km+eVXQU8VVV/DNwC/MOoG51r5YqF216xLNzwjYd5/OkjFC9erjHgTyHPvzQ9fAioF1+aGvBq3ar1w42fpC5n7ucDB6rqYFU9A9wObJtXsw34Qn/568DFSTK6Nl/qyLO/W3D8mefKyzWnukV+aSqdMi6+EVasfOnYipW98THoEu7rgENz1mf6YwvWVNWzwGHgNfN3lOSaJNNJpmdnZ0+sY6BquPpjXcbRBCzyS1PplHHudrjsVli1AUjv8bJbJ/ppmYXOwOfHa5caqmoXsAtgampqyIh+0bKE54ZI+NevXjm4SItj1fr+JZkFxqXWnbt9bGE+X5cz9xlgw5z19cATx6pJshxYBfxqFA0uZMefbFhw/MI/+kNWrlj2krGVK5bxsXe/eVytaFiL/NJUOl11CfcHgE1JzkpyBnAFsGdezR7g/f3ly4F7qoa9eNLd373nrVx5wRtY1r+svyzhygvewL9f/XY++d63sm71SgKsW72ST773rX72/VSyyC9NpdNVumRwkkuBfwaWAZ+rqr9PshOYrqo9SV4FfAk4j94Z+xVVdfB4+5yamqrp6emTnoAknU6SPFhVU4PqOt2hWlV7gb3zxm6cs/wb4C+GbVKSNB5L8g5VSdLxGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQZ1uYhrLEyezwE9HsKs1wC9GsJ+l4nSbL5x+c3a+7TuZOb+xqtYOKppYuI9Kkukud2u14nSbL5x+c3a+7VuMOXtZRpIaZLhLUoNaCPddk25gkZ1u84XTb87Ot31jn/OSv+YuSXq5Fs7cJUnzLJlwT7Ilyf4kB5Jcv8D2Vyb5an/7/Uk2Ln6Xo9Nhvh9J8miSfUnuTvLGSfQ5KoPmO6fu8iSVZMl/uqLLnJNs7x/nR5J8ebF7HKUOP9NvSHJvkof6P9eXTqLPUUnyuSRPJvnRMbYnya39f499Sd420gaq6pT/Q+9LQv4XeBNwBvBDYPO8mr8CPtNfvgL46qT7HvN83wn8Xn/5Q63Pt193JvBd4D5gatJ9L8Ix3gQ8BPxBf/21k+57zPPdBXyov7wZ+Mmk+z7JOf8p8DbgR8fYfinwbXrfQX0BcP8on3+pnLmfDxyoqoNV9QxwO7BtXs024Av95a8DFydZ6Iu7l4KB862qe6vq1/3V++h9t+1S1eX4AnwC+DTwm8Vsbky6zPlq4Laqegqgqp5c5B5Hqct8C3h1f3kVL/+u5iWlqr7L8b9Lehvwxeq5D1id5HWjev6lEu7rgENz1mf6YwvWVNWzwGHgNYvS3eh1me9cV9E7A1iqBs43yXnAhqr61mI2NkZdjvHZwNlJvpfkviRbFq270esy35uAK5PM0Pvmtw8vTmsTM+zv+VA6fc3eKWChM/D5H/PpUrNUdJ5LkiuBKeAdY+1ovI473ySvAG4BPrBYDS2CLsd4Ob1LMxfRe2X2X0nOqaqnx9zbOHSZ7w7g81X1j0neDnypP9/fjb+9iRhrZi2VM/cZYMOc9fW8/CXbCzVJltN7WXe8l0Snsi7zJcm7gI8DW6vqt4vU2zgMmu+ZwDnAd5L8hN71yT1L/E3Vrj/T36yqo1X1Y2A/vbBfirrM9ypgN0BVfR94Fb3/g6VVnX7PT9RSCfcHgE1JzkpyBr03TPfMq9kDvL+/fDlwT/XftViCBs63f5nis/SCfSlfi4UB862qw1W1pqo2VtVGeu8xbK2q6cm0OxJdfqbvoPfGOUnW0LtMc3BRuxydLvP9GXAxQJK30Av32UXtcnHtAd7X/9TMBcDhqvr5yPY+6XeUh3
jn+VLgf+i94/7x/thOer/k0PtB+BpwAPhv4E2T7nnM8/1P4P+AH/T/7Jl0z+Oc77za77DEPy3T8RgH+CfgUeBh4IpJ9zzm+W4GvkfvkzQ/AP580j2f5Hy/AvwcOErvLP0q4IPAB+cc39v6/x4Pj/pn2jtUJalBS+WyjCRpCIa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkN+n/fUuPv2PVpmAAAAABJRU5ErkJggg==\n", 152 | "text/plain": [ 153 | "
" 154 | ] 155 | }, 156 | "metadata": { 157 | "needs_background": "light" 158 | }, 159 | "output_type": "display_data" 160 | } 161 | ], 162 | "source": [ 163 | "plt.scatter(X_new[y==0, 0], X_new[y==0, 1])\n", 164 | "plt.scatter(X_new[y==1, 0], X_new[y==1, 1])" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [] 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Python 3", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.7.0" 192 | } 193 | }, 194 | "nbformat": 4, 195 | "nbformat_minor": 2 196 | } 197 | --------------------------------------------------------------------------------