├── .gitignore ├── LICENSE ├── README.md ├── c0_overview └── 数据相关.pdf ├── c1_knn ├── 01_kNN_Basics.ipynb ├── 02_kNN_in_scikit_learn.ipynb ├── 03_Train_Test_Split.ipynb ├── 04_Hyper_Parameter_kNN.ipynb ├── 05_Hyper_Parameters.ipynb ├── 06_Grid_Search.ipynb ├── 07_Feature_Scaling.ipynb ├── 08_Scaler_in_Scikit_Learn.ipynb ├── __init__.py ├── kNN.py ├── knn.md ├── metrics.py ├── model_selection.py └── preprocessing.py ├── c2_linear_regression ├── 03_Simple_Linear_Regression.ipynb ├── 05_MSE_VS_MAE.ipynb ├── 08_Linear_Regression.ipynb ├── 09_Regression_in_scikit_learn.ipynb ├── 10_More_About_Linear_Regression.ipynb ├── __init__.py ├── linear_regression.py └── simple_linear_regression.py ├── c3_gradient_descent ├── 02_Gradient_Descent_Simulate.ipynb ├── 04_Implement_Gradient_Descent_in_Linear_Regression.ipynb ├── 05_Vectorize_Gradient_Descent.ipynb ├── 06_Stochastic_Gradient_Descent.ipynb ├── 07_SGD_in_scikit_learn.ipynb ├── 08_Gradient_Debugging.ipynb └── __init__.py ├── c4_pca ├── 03_implement_PCA_in_SGA.ipynb ├── 04_Getting_First_N_Components.ipynb ├── 05_Data_Projection.ipynb ├── 06_PCA_in_scikit_learn.ipynb ├── 07_MNIST.ipynb ├── 08_PCA_for_noise_reduction.ipynb ├── 09_Eigenface.ipynb └── __init__.py ├── c5_polynomial_regression ├── 01_Polynomial_Regression.ipynb ├── 02_Polynomial_Regression_and_pipeline.ipynb ├── 03_underfitting_and_overfitting.ipynb ├── 04_why_tain_test_split.ipynb ├── 05_learning_curve.ipynb ├── 06_Validation_and_Cross_Validation.ipynb ├── 08_Ridge_Regression.ipynb ├── 09_LASSO.ipynb └── __init__.py ├── c6_logistic_regression ├── 01_Sigmoid.ipynb ├── 04_implement_logistic_regression.ipynb ├── 05_decision_boundry.ipynb ├── 06_Polynomial_Features_in_Logistic_Regression.ipynb ├── 07_Logistic_Regression_in_scikit_learn.ipynb ├── 08_OvR_and_OvO.ipynb ├── __init__.py └── plot_utils.py ├── c7_classification_performance_measures ├── 03_implement_confusion_matrix_precision_and_recall.ipynb ├── 05_Precision_Recall_trade_off.ipynb ├── 
06_Precision_Recall_Curve.ipynb ├── 07_ROC.ipynb ├── 08_Confusion_Matrix_in_Multiclass_Classification.ipynb └── __init__.py ├── c8_svm ├── 04_SVM_in_scikit_learn.ipynb ├── 05_polynomial_features_in_svm_and_kernel_function.ipynb └── __init__.py └── playML ├── PCA.py ├── __init__.py ├── linear_regression.py ├── logistic_regression.py ├── metrics.py ├── model_selection.py └── plot_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Sea-Monster 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MachineLearningClassicAlgorithm 2 | 慕课网《Python 3 入门机器学习经典算法与应用》的代码和笔记 3 | -------------------------------------------------------------------------------- /c0_overview/数据相关.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sea-Monster/MachineLearningClassicAlgorithm/2aaad1965e7e4b8659b6296dfe938181825fa259/c0_overview/数据相关.pdf -------------------------------------------------------------------------------- /c1_knn/01_kNN_Basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import matplotlib.pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 6, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "raw_data_X = np.random.random((10,2))" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 7, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "raw_data_X = raw_data_X *10" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 8, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | 
"raw_data_y = np.array([0,0,0,0,0,1,1,1,1,1])" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 9, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "X_train = raw_data_X\n", 49 | "y_train = raw_data_y" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADdVJREFUeJzt3UGIpHeZx/HfrzOzaMWldUkja2JX5bBkEYfdSB2iAQ9pFxbNmD3sIVARXRbqsmgUQZQ6SA61eBBpT0IR1xV8iYcxsDseRBldloUlUDMJ28mMIKzpdnSyaVm2FeuQkTx7eKuTmdnu9Nu9/dZbT9X3A0N1//NO9UPBfPP2W+/7liNCAIA8VpoeAABwPIQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyZ+p40nvuuSc6nU4dTw0AC+ny5cu/joi1KtvWEu5Op6PxeFzHUwPAQrK9XXVbDpUAQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AycxNuIutQp3NjlaeWlFns6Niq2h6JACYS3MR7mKrUP9iX9t72wqFtve21b/YJ94ATkdRSJ2OtLJSPha52zIX4R5cGmhyc3Lb2uTmRINLg4YmArAwikLq96XtbSmifOz3U8d7LsK9s7dzrHUAqGwwkCa37xhqMinXk5qLcK+vrh9rHQAq2zlkB/Cw9QTmItzDjaFaZ1u3rbXOtjTcGDY0EYCFsX7IDuBh6wnMRbh753oanR+pvdqWZbVX2xqdH6l3rtf0aACyGw6l1u07hmq1yvWkHBGn/qTdbje4rSuAuVEU5THtnZ1yT3s4lHrztWNo+3JEdKtsOxd73LPCueLAkur1pJdfll5/vXycs2gfVy0fpDCP9s8V3z/tcP9ccUkckgGQytLscXOuOIBFsTTh5lxxAItiacLNueIAFsXShJtzxQEsiqUJN+eKA1gUnMcNAHOA87gBYIERbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQqhdv252y/ZPtF28/YflvdgwEADnZkuG3fK+kzkroR8X5Jd0l6vO7BAAAHq3qo5Iykt9s+I6kl6Vf1jQQAeCtHhjsifinpq5J2JN2QtBcRP6x7MADAwaocKnmXpMck3S/pPZLutv3EAdv1bY9tj3d3d09/UgCApGqHSj4i6ecRsRsRNyU9K+lDd24UEaOI6EZEd21t7bTnBABMVQn3jqSHbLdsW9KGpGv1jgUAOEyVY9zPSbog6YqkrenfGdU8FwDgEGeqbBQRX5b05ZpnAQBUwJWTAJAM4QaAZAg3ACRDuIFbFYXU6UgrK+VjUTQ9EfB/VHpzElgKRSH1+9JkUn6/vV1+L0m9XnNzAXdgjxvYNxi8Ge19k0m5DswRwg3s29k53jrQEMIN7FtfP9460BDCDewbDqVW6/a1VqtcB+YI4Qb29XrSaCS125JdPo5GvDGJucNZJcCtej1CjbnHHjcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAE6qoZuScTogAJxEgzclY48bAE6iwZuSEW4AOIkGb
0pGuAHgJBq8KRnhBoCTaPCmZIQbAE6iwZuScVYJAJxUQzclY48bAJIh3ACQDOEGgGQIdyLFVqHOZkcrT62os9lRsTWby2sBzBfenEyi2CrUv9jX5GZ5pdb23rb6F8vLa3vn+MQWYJmwx53E4NLgjWjvm9ycaHCp/strAcwXwp3Ezt7Bl9Eetg5gcRHuJNZXD76M9rB1AIuLcCcx3Biqdfb2y2tbZ1sabtR/eS2A+UK4k+id62l0fqT2aluW1V5ta3R+xBuTwBJyRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZCqF2/Y7bV+w/VPb12x/sO7BAAAHq3oBztcl/SAi/tr2H0hqHfUXAAD1ODLctlclfVjSpyQpIl6T9Fq9YwEADlPlUMn9knYlfcv287aftn33nRvZ7tse2x7v7u6e+qAAgFKVcJ+R9AFJ34iIByX9TtIX79woIkYR0Y2I7tra2imPCQDYVyXc1yVdj4jnpt9fUBlyAEADjgx3RLwi6Re2H5gubUi6WutUAIBDVT2r5NOSiukZJf8p6W/qGwkA8FYqhTsiXpBU6YoeAEC9uHISAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwY3kVhdTpSCsr5WNRND0RUEnVKyeBxVIUUr8vTSbl99vb5feS1ONzPDHf2OPGchoM3oz2vsmkXAfmHOHGctrZOd46MEcIN5bT+vrx1oE5QrixnIZDqXXHR6e2WuU6MOcIN5ZTryeNRlK7Ldnl42jEG5NIgbNKsLx6PUKNlNjjBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ABmptgq1NnsaOWpFXU2Oyq2uJXuSXABDoCZKLYK9S/2NblZ3pVxe29b/YvlrXR757gQ6jjY4wYwE4NLgzeivW9yc6LBJW6le1yEG8BM7OwdfMvcw9ZxOMINYCbWVw++Ze5h6zgc4QYwE8ONoVpnb7+VbutsS8MNbqV7XIQbwEz0zvU0Oj9Se7Uty2qvtjU6P+KNyRNwRJz6k3a73RiPx6f+vACwqGxfjohulW3Z4waAZAg3ACRDuAEgGcINAMkQbgBIpnK4bd9l+3nb369zIADAWzvOHveTkq7VNQgAoJpK4bZ9n6SPSXq63nEAAEepuse9KekLkl6vcRYAQAVHhtv2o5JejYjLR2zXtz22Pd7d3T21AQEAt6uyx/2wpI/bflnSdyU9Yvs7d24UEaOI6EZEd21t7ZTHBADsOzLcEfGliLgvIjqSHpf044h4ovbJAAAH4jxuAEjmWOGOiH+JiEfrGgaoTVFInY60slI+FnxILfLiw4Kx+IpC6velyfTzDre3y+8lqce9oJEPh0qw+AaDN6O9bzIp14GECDcW384hH0Z72Dow5wg3Ft/6IR9Ge9g6MOcINxbfcCi1bv+QWrVa5TqQEOHG4uv1pNFIarclu3wcjXhjEmlxVgmWQ69HqLEw2OMGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASObIcNt+r+2f2L5q+yXbT85iMADAwc5U2Ob3kj4fEVds/6Gky7Z/FBFXa54NAHCAI/e4I+JGRFyZfv1bSdck3Vv3YACAgx3rGLftjqQHJT1XxzAAgKNVDrftd0j6nqTPRsRvDvjvfdtj2+Pd3d3TnBEAcItK4bZ9VmW0i4h49qBtImIUEd2I6K6trZ3mjACAW1Q5q8SSvinpWkR8rf6RAABvpcoe98OSPiHpEdsvTP98tOa5AACHOPJ0wIj4N0mewSwAgAq4chIAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbA
JIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwJ1dsFepsdrTy1Io6mx0VW0XTIwGo2ZmmB8DJFVuF+hf7mtycSJK297bVv9iXJPXO9ZocDUCN2ONObHBp8Ea0901uTjS4NGhoIgCzQLgT29nbOdY6gMVAuBNbX10/1jqAxUC4ExtuDNU627ptrXW2peHGsKGJAMwC4U6sd66n0fmR2qttWVZ7ta3R+RFvTAILzhFx6k/a7XZjPB6f+vMCwKKyfTkiulW2ZY8bAJIh3ACQDOEGgGQINwAkQ7gBIJlaziqxvStp+9SfeH7cI+nXTQ8xB3gdSrwOvAb7/j+vQzsi1qpsWEu4F53tcdXTdhYZr0OJ14HXYN+sXgcOlQBAMoQbAJIh3CczanqAOcHrUOJ14DXYN5PXgWPcAJAMe9wAkAzhrsj2e23/xPZV2y/ZfrLpmZpk+y7bz9v+ftOzNMX2O21fsP1T29dsf7DpmZpg+3PTfxMv2n7G9tuanmkWbP+D7Vdtv3jL2h/Z/pHtn00f31XHzybc1f1e0ucj4n2SHpL0d7bf1/BMTXpS0rWmh2jY1yX9ICL+VNKfaQlfD9v3SvqMpG5EvF/SXZIeb3aqmflHSX95x9oXJV2KiD+RdGn6/akj3BVFxI2IuDL9+rcq/5He2+xUzbB9n6SPSXq66VmaYntV0oclfVOSIuK1iPifZqdqzBlJb7d9RlJL0q8anmcmIuJfJf33HcuPSfr29OtvS/qrOn424T4B2x1JD0p6rtlJGrMp6QuSXm96kAbdL2lX0remh4yetn1300PNWkT8UtJXJe1IuiFpLyJ+2OxUjXp3RNyYfv2KpHfX8UMI9zHZfoek70n6bET8pul5Zs32o5JejYjLTc/SsDOSPiDpGxHxoKTfqaZfi+fZ9BjuYyr/R/YeSXfbfqLZqeZDlKfs1XLaHuE+BttnVUa7iIhnm56nIQ9L+rjtlyV9V9Ijtr/T7EiNuC7pekTs/9Z1QWXIl81HJP08InYj4qakZyV9qOGZmvRftv9YkqaPr9bxQwh3Rbat8njmtYj4WtPzNCUivhQR90VER+WbUD+OiKXbw4qIVyT9wvYD06UNSVcbHKkpO5Iest2a/hvZ0BK+SXuLf5b0yenXn5T0T3X8EMJd3cOSPqFyD/OF6Z+PNj0UGvVpSYXt/5D055L+vuF5Zm76G8cFSVckbalsylJcRWn7GUn/LukB29dt/62kr0j6C9s/U/nbyFdq+dlcOQkAubDHDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgmf8Fyb54/FyWOFoAAAAASUVORK5CYII=\n", 60 | "text/plain": [ 61 | "" 62 | ] 63 | }, 64 | "metadata": {}, 65 | "output_type": "display_data" 66 | } 67 | ], 68 | "source": [ 69 | "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n", 70 | "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n", 71 | "plt.show()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 11, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "x = np.random.random(2)*10" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 12, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "array([ 
8.67035706, 7.0760235 ])" 92 | ] 93 | }, 94 | "execution_count": 12, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "x" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 13, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADgBJREFUeJzt3VGIrHd9xvHn2ewWnVhWSxapiTuTi5IiHtrIXEQDXmQtFM0xvehFYCJaCnNTNIogylxILqZ4IbJeCUOsFXyJF8dAe7wQZbWUQgnMnoRuco4g1Mx69KRZKV3FucjK+fXinUnOnu6efed033nf/8z3A4fZ/ec9sz8GzjfvvvO+7zgiBABIx0rVAwAAZkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAErNaxpPed9990Wq1ynhqAFhIu7u7v46IjSLblhLuVqul4XBYxlMDwEKyPSq6LYdKACAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxtQl3tpeptd3SyjMram23lO1lVY8EALVUi3Bne5m6l7saHY4UCo0OR+pe7hJvAOcjy6RWS1pZyR+ztNtSi3D3dnoaH42PrY2Pxurt9CqaCMDCyDKp25VGIykif+x2k453LcK9f7g/0zoAFNbrSePjO4Yaj/P1RNUi3JvrmzOtA0Bh+6fsAJ62noBahLu/1VdjrXFsrbHWUH+rX9FEABbG5ik7gKetJ6AW4e5c6GhwcaDmelOW1VxvanBxoM6FTtWjAUhdvy81ju8YqtHI1xPliDj3J22328FtXQHURpblx7T39/M97X5f6tRrx9D2bkS0i2xbiz3ueeFccWBJdTrSq69KN2/mjzWL9qxK+SCFOpqeKz497XB6rrgkDskASMrS7HFzrjiARbE04eZccQCLYmnCzbniABbF0oSbc8UBLIqlCTfnigNYFJzHDQA1wHncALDACDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJIZwA0BiCDcAJKZQuG1/zvYrtl+2/Zztt5U9GADgZGeG2/b9kj4jqR0R75d0j6Qnyx4MAHCyoodKViW93faqpIakX5U3EgDgTs4Md0T8UtJXJe1LuiHpMCJ+WPZgAICTFTlU8i5JT0h6UNJ7JN1r+6kTtuvaHtoeHhwcnP+kAABJxQ6VfETSzyPiICKOJD0v6UO3bxQRg4hoR0R7Y2PjvOcEAEwUCfe+pEdsN2xb0paka+WOBQA4TZFj3C9IuiTpiqS9yd8ZlDwXAOAUq0U2iogvS/pyybMAAArgykkASAzhBoDEEG4ASAzhBm6VZVKrJa2s5I9ZVvVEwP9R6M1JYClkmdTtSuNx/v1olH8vSZ1OdXMBt2GPG5jq9d6K9tR4nK8DNUK4gan9/dnWgYoQbmBqc3O2daAihBuY6velRuP4WqORrwM1QriBqU5HGgykZlOy88fBgDcmUTucVQLcqtMh1Kg99rgBIDGEG0AtcS3U6ThUAqB2uBbqztjjBlA7XAt1Z4QbQO1wLdSdEW4AtZPMtVAVHYgn3ABqJ4lroaYH4kcjKeKtA/FziDfhBlA7SVwLVeGBeEfEuT9pu92O4XB47s8LALWxspLva
d/Olm7enPnpbO9GRLvQj5752QEAlR6IJ9wAcDcqPBBPuAHgblR4IJ4rJwHgblV0UzL2uAEgMYQbABJDuAEgMYQ7IdleptZ2SyvPrKi13VK2x30ugWXEm5OJyPYydS93NT7Kr9QaHY7UvZzf57JzoU6XkwEoG3vciejt9N6M9tT4aKzeDve5BJYN4U7E/uHJ97M8bR3A4iLcidhcP/ky2tPWASwuwp2I/lZfjbXjl9c21hrqb9XpPpcA5oFwJ6JzoaPBxYGa601ZVnO9qcHFAW9MAkuI27oCQA1wW1cAWGCFwm37nbYv2f6p7Wu2P1j2YACAkxW9AOfrkn4QEX9t+w8kNc76CwCAcpwZbtvrkj4s6VOSFBFvSHqj3LEAAKcpcqjkQUkHkr5l+0Xbz9q+9/aNbHdtD20PDw4Ozn1QAECuSLhXJX1A0jci4mFJv5P0xds3iohBRLQjor2xsXHOYwIApoqE+7qk6xHxwuT7S8pDDgCowJnhjojXJP3C9kOTpS1JV0udCgBwqqJnlXxaUjY5o+Q/Jf1NeSMBAO6kULgj4iVJha7oAQCUiysnASAxhBsAEkO4ASAxhBsAEkO4ASAxhBsAEkO4ASAxhBvLK8ukVktaWckfs6zqiYBCil45CSyWLJO6XWk8zr8fjfLvJanD53ii3tjjxnLq9d6K9tR4nK8DNUe4sZz292dbB2qEcGM5bW7Otg7UCOHGcur3pcZtH53aaOTrQM0RbiynTkcaDKRmU7Lzx8GANyaRBM4qwfLqdAg1ksQeNwAkhnADQGIINwAkhnADQGIINwAkhnADQGIINwAkhnADmJtsL1Nru6WVZ1bU2m4p2+NWuneDC3AAzEW2l6l7uavxUX5XxtHhSN3L+a10Oxe4EGoW7HEDmIveTu/NaE+Nj8bq7XAr3VkRbgBzsX948i1zT1vH6Qg3gLnYXD/5lrmnreN0hBvAXPS3+mqsHb+VbmOtof4Wt9KdFeEGMBedCx0NLg7UXG/KsprrTQ0uDnhj8i44Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMYXDbfse2y/a/n6ZAwEA7myWPe6nJV0raxAAQDGFwm37AUkfk/RsueMAAM5SdI97W9IXJN0scRYAQAFnhtv245Jej4jdM7br2h7aHh4cHJzbgACA44rscT8q6eO2X5X0XUmP2f7O7RtFxCAi2hHR3tjYOOcxAQBTZ4Y7Ir4UEQ9EREvSk5J+HBFPlT4ZAOBEnMcNAImZKdwR8S8R8XhZwwClyTKp1ZJWVvLHjA+pRbr4sGAsviyTul1pPPm8w9Eo/16SOtwLGunhUAkWX6/3VrSnxuN8HUgQ4cbi2z/lw2hPWwdqjnBj8W2e8mG0p60DNUe4sfj6falx/ENq1Wjk60CCCDcWX6cjDQZSsynZ+eNgwBuTSBZnlWA5dDqEGguDPW4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASAzhBoDEEG4ASMyZ4bb9Xts/sX3V9iu2n57HYACAk60W2Ob3kj4fEVds/6GkXds/ioirJc8GADjBmXvcEXEjIq5Mvv6tpGuS7i97MADAyWY6xm27JelhSS+UMQwA4GyFw237HZK+J+mzEfGbE/571/bQ9vDg4OA8ZwQA3KJQuG2vKY92FhHPn7RNRAwioh0R7Y2NjfOcEQBwiyJnlVjSNyVdi4ivlT8SAOBOiuxxPyrpE5Ies/3S5M9HS54LAHCKM08HjIh/k+Q5zAIAKIArJwEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgM
YQbABJDuAEgMYQbABJDuAEgMYQbABJDuAEgMYQ7cdleptZ2SyvPrKi13VK2l1U9EoCSrVY9AO5etpepe7mr8dFYkjQ6HKl7uStJ6lzoVDkagBKxx52w3k7vzWhPjY/G6u30KpoIwDwQ7oTtH+7PtA5gMRDuhG2ub860DmAxEO6E9bf6aqw1jq011hrqb/UrmgjAPBDuhHUudDS4OFBzvSnLaq43Nbg44I1JYME5Is79SdvtdgyHw3N/XgBYVLZ3I6JdZFv2uAEgMYQbABJDuAEgMYQbABJDuAEgMaWcVWL7QNLo3J+4Pu6T9Ouqh6gBXoccrwOvwdT/53VoRsRGkQ1LCfeisz0setrOIuN1yPE68BpMzet14FAJACSGcANAYgj33RlUPUBN8DrkeB14Dabm8jpwjBsAEsMeNwAkhnAXZPu9tn9i+6rtV2w/XfVMVbJ9j+0XbX+/6lmqYvudti/Z/qnta7Y/WPVMVbD9ucm/iZdtP2f7bVXPNA+2/8H267ZfvmXtj2z/yPbPJo/vKuNnE+7ifi/p8xHxPkmPSPo72++reKYqPS3pWtVDVOzrkn4QEX8q6c+0hK+H7fslfUZSOyLeL+keSU9WO9Xc/KOkv7xt7YuSdiLiTyTtTL4/d4S7oIi4ERFXJl//Vvk/0vurnaoath+Q9DFJz1Y9S1Vsr0v6sKRvSlJEvBER/1PtVJVZlfR226uSGpJ+VfE8cxER/yrpv29bfkLStydff1vSX5Xxswn3XbDdkvSwpBeqnaQy25K+IOlm1YNU6EFJB5K+NTlk9Kzte6seat4i4peSvippX9INSYcR8cNqp6rUuyPixuTr1yS9u4wfQrhnZPsdkr4n6bMR8Zuq55k3249Lej0idquepWKrkj4g6RsR8bCk36mkX4vrbHIM9wnl/yN7j6R7bT9V7VT1EPkpe6Wctke4Z2B7TXm0s4h4vup5KvKopI/bflXSdyU9Zvs71Y5UieuSrkfE9LeuS8pDvmw+IunnEXEQEUeSnpf0oYpnqtJ/2f5jSZo8vl7GDyHcBdm28uOZ1yLia1XPU5WI+FJEPBARLeVvQv04IpZuDysiXpP0C9sPTZa2JF2tcKSq7Et6xHZj8m9kS0v4Ju0t/lnSJydff1LSP5XxQwh3cY9K+oTyPcyXJn8+WvVQqNSnJWW2/0PSn0v6+4rnmbvJbxyXJF2RtKe8KUtxFaXt5yT9u6SHbF+3/beSviLpL2z/TPlvI18p5Wdz5SQApIU9bgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMQQbgBIDOEGgMT8Lwi/oYhwODxFAAAAAElFTkSuQmCC\n", 111 | "text/plain": [ 112 | "" 113 | ] 114 | }, 115 | "metadata": {}, 116 | "output_type": "display_data" 117 | } 118 | ], 119 | "source": [ 120 | "plt.scatter(X_train[y_train==0, 0], X_train[y_train==0,1], color='g')\n", 121 | "plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')\n", 122 | "plt.scatter(x[0], x[1], color='b')\n", 123 | "plt.show()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## kNN的过程" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 14, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | 
"[8.231110301142037,\n 2.467672103362692,\n 8.513701036889186,\n 7.405233631062757,\n 5.723981182780471,\n 4.7325594847194985,\n 3.026576262484981,\n 2.3280024672389885,\n 3.5998460221137596,\n 1.259337088954209]" 142 | ] 143 | }, 144 | "execution_count": 14, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "from math import sqrt\n", 151 | "distances = []\n", 152 | "for x_train in X_train:\n", 153 | " d = sqrt(np.sum((x-x_train)**2))\n", 154 | " distances.append(d)\n", 155 | "distances" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 15, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "array([9, 7, 1, 6, 8, 5, 4, 3, 0, 2])" 167 | ] 168 | }, 169 | "execution_count": 15, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "top_k = np.argsort(distances)\n", 176 | "top_k" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 16, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# 假设k取6\n", 186 | "k = 6" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 17, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "[(1, 5)]" 198 | ] 199 | }, 200 | "execution_count": 17, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "from collections import Counter\n", 207 | "votes = Counter(y_train[top_k[:k]])\n", 208 | "\n", 209 | "votes.most_common(1) # 前边是标签,后边是个数" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 18, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "1" 221 | ] 222 | }, 223 | "execution_count": 18, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "# 最终kNN的结果:\n", 230 | "predict_y = votes.most_common(1)[0][0]\n", 231 | 
"predict_y" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 19, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "%run c1_knn/kNN.py" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 21, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "1" 252 | ] 253 | }, 254 | "execution_count": 21, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "predict_y = kNN_classify(6, X_train, y_train, x)\n", 261 | "predict_y" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "Python 2", 275 | "language": "python", 276 | "name": "python2" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 2 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython2", 288 | "version": "2.7.6" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 0 293 | } 294 | -------------------------------------------------------------------------------- /c1_knn/02_kNN_in_scikit_learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Scikit-Learn中的kNN" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.neighbors import KNeighborsClassifier" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "kNN_classifier = 
KNeighborsClassifier(n_neighbors=6)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "raw_data_X = np.random.random((10,2))\n", 38 | "X_train = raw_data_X * 10\n", 39 | "y_train = np.array([0,0,0,0,0,1,1,1,1,1])" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=6, p=2,\n weights='uniform')" 51 | ] 52 | }, 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "# 训练/拟合\n", 60 | "kNN_classifier.fit(X_train, y_train)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "x = np.random.random((1,2))*10" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "array([0])" 81 | ] 82 | }, 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "kNN_classifier.predict(x)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## 重新整理我们的kNN代码" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "%run c1_knn/kNN.py\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "knn_clf = KNNClassifier(k=6)\n", 115 | "knn_clf.fit(X_train, y_train)\n", 116 | "y_predict = knn_clf.predict(x)\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 9, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | 
"data": { 126 | "text/plain": [ 127 | "array([1])" 128 | ] 129 | }, 130 | "execution_count": 9, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "y_predict" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 2", 150 | "language": "python", 151 | "name": "python2" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 2 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython2", 163 | "version": "2.7.6" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 0 168 | } 169 | -------------------------------------------------------------------------------- /c1_knn/03_Train_Test_Split.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 测试我们的算法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from sklearn import datasets" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "iris = datasets.load_iris()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# 特征矩阵\n", 39 | "X = iris.data\n", 40 | "\n", 41 | "# 结果标签的向量\n", 42 | "y = iris.target" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | 
"text/plain": [ 53 | "(150, 4)" 54 | ] 55 | }, 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "np.shape(X)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "(150,)" 74 | ] 75 | }, 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "np.shape(y)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## train_test_split" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "y" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# 生成一个序列(例如0~100),再把这个序列打乱\n", 119 | "shuffle_indexes = np.random.permutation(np.shape(y)[0])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "array([ 50, 78, 69, 131, 10, 34, 6, 9, 36, 71, 82, 141, 137,\n 79, 59, 93, 22, 91, 122, 75, 88, 3, 89, 86, 12, 61,\n 14, 132, 119, 121, 129, 33, 103, 13, 37, 47, 139, 125, 
73,\n 53, 2, 42, 114, 29, 138, 112, 52, 101, 97, 19, 123, 128,\n 144, 81, 11, 109, 26, 116, 44, 80, 64, 83, 124, 74, 39,\n 31, 58, 145, 102, 120, 76, 63, 65, 135, 8, 55, 77, 60,\n 35, 149, 57, 43, 0, 110, 127, 62, 142, 96, 106, 126, 51,\n 40, 104, 118, 68, 27, 87, 45, 15, 113, 115, 49, 16, 136,\n 117, 66, 5, 21, 67, 140, 54, 100, 99, 30, 18, 72, 148,\n 92, 24, 23, 85, 32, 70, 107, 56, 108, 105, 17, 134, 94,\n 95, 38, 48, 7, 46, 20, 146, 130, 28, 84, 90, 1, 111,\n 25, 133, 143, 41, 147, 4, 98])" 131 | ] 132 | }, 133 | "execution_count": 8, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "shuffle_indexes" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "test_ratio = 0.2\n", 149 | "test_size = int(np.shape(X)[0] * test_ratio)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "30" 161 | ] 162 | }, 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "test_size" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "test_indexes = shuffle_indexes[:test_size]\n", 179 | "train_indexes = shuffle_indexes[test_size:]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 12, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "X_train = X[train_indexes]\n", 189 | "y_train = y[train_indexes]\n", 190 | "\n", 191 | "X_test = X[test_indexes]\n", 192 | "y_test = y[test_indexes]\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## 使用我们的算法" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 13, 205 | "metadata": {}, 206 | 
"outputs": [], 207 | "source": [ 208 | "from c1_knn.model_selection import train_test_split" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 14, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "X_train, X_test, y_train, y_test = train_test_split(X,y,seed=1)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "### 先试试使用之前自己写的KNNClassifier" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from c1_knn.kNN import KNNClassifier" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 16, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "my_knn_clf = KNNClassifier(k=3)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 17, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "kNN(k=3)" 254 | ] 255 | }, 256 | "execution_count": 17, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "my_knn_clf.fit(X_train, y_train)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 18, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n 0, 2, 1, 0, 0, 1, 2])" 274 | ] 275 | }, 276 | "execution_count": 18, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "y_predict = my_knn_clf.predict(X_test)\n", 283 | "y_predict" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 19, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n 0, 2, 1, 0, 0, 1, 2])" 295 | ] 296 | }, 297 | "execution_count": 19, 298 | 
"metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "y_test" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 21, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "1.0" 315 | ] 316 | }, 317 | "execution_count": 21, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "# 正确率\n", 324 | "np.sum(y_predict==y_test) / len(y_test)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### sklearn中的train_test_split" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 22, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from sklearn.model_selection._split import train_test_split" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 25, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 26, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')" 361 | ] 362 | }, 363 | "execution_count": 26, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "from sklearn.neighbors.classification import KNeighborsClassifier\n", 370 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n", 371 | "knn_clf.fit(X_train, y_train)\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 27, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2,\n 0, 1, 1, 0, 1, 2, 2])" 383 | ] 384 
| }, 385 | "execution_count": 27, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "y_predict = knn_clf.predict(X_test)\n", 392 | "y_predict" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 30, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "1.0" 404 | ] 405 | }, 406 | "execution_count": 30, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "# 准确率\n", 413 | "np.sum(y_predict==y_test)/len(y_test)" 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "kernelspec": { 419 | "display_name": "Python 2", 420 | "language": "python", 421 | "name": "python2" 422 | }, 423 | "language_info": { 424 | "codemirror_mode": { 425 | "name": "ipython", 426 | "version": 2 427 | }, 428 | "file_extension": ".py", 429 | "mimetype": "text/x-python", 430 | "name": "python", 431 | "nbconvert_exporter": "python", 432 | "pygments_lexer": "ipython2", 433 | "version": "2.7.6" 434 | } 435 | }, 436 | "nbformat": 4, 437 | "nbformat_minor": 0 438 | } 439 | -------------------------------------------------------------------------------- /c1_knn/04_Hyper_Parameter_kNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 超参数" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "from sklearn import datasets" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## 一个识别手写数字的例子" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "digits = datasets.load_digits()" 38 | 
] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])" 49 | ] 50 | }, 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "digits.keys()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Optical Recognition of Handwritten Digits Data Set\n===================================================\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 5620\n :Number of Attributes: 64\n :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n :Missing Attribute Values: None\n :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n - C. 
Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n Graduate Studies in Science and Engineering, Bogazici University.\n - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n Linear dimensionalityreduction using relevance weighted LDA. School of\n Electrical and Electronic Engineering Nanyang Technological University.\n 2005.\n - Claudio Gentile. A New Approximate Maximal Margin Classification\n Algorithm. NIPS. 2000.\n\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "print(digits.DESCR)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "array([[ 0., 0., 5., ..., 0., 0., 0.],\n [ 0., 0., 0., ..., 10., 0., 0.],\n [ 0., 0., 0., ..., 16., 9., 0.],\n ..., \n [ 0., 0., 1., ..., 6., 0., 0.],\n [ 0., 0., 2., ..., 12., 0., 0.],\n [ 0., 0., 10., ..., 12., 1., 0.]])" 86 | ] 87 | }, 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "digits.data" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "(1797, 64)" 106 | ] 107 | }, 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "np.shape(digits.data)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "特征" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "X = digits.data" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "分类" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | 
"execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "(1797,)" 149 | ] 150 | }, 151 | "execution_count": 8, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "y = digits.target\n", 158 | "np.shape(y)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## 可视化" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 9, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPgAAAD8CAYAAABaQGkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAACu5JREFUeJzt3d2LXeUZhvH77qi0qTYDTVokid1BJCCFTmQTkBRjIpZYxeSgBwkoJhRypCgtiPZE+g9IelAEiU4EE6WNSkSsVtDRCq11kkxa82FJw5RM0GZCGfw4aIg+PZgViJIya7LX1zxcPwidj81+n01zudbsWVmvI0IAcvpG2wMAqA+BA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYFXU86ZIlS6LX69Xx1K2amZlpdL3JycnG1hoaGmpsreuvv76xtRYtWtTYWk2anJzU2bNnPdfjagm81+tpfHy8jqdu1f79+xtd77777mtsreHh4cbW2rt3b2NrjYyMNLZWk/r9fqnHcYoOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGKlAre90faHtk/YfqTuoQBUY87AbQ9J+q2kOyTdKGmr7RvrHgzA4MocwddIOhERJyPinKTnJW2qdywAVSgT+DJJpy76fKr4GoCOq+xNNts7bI/bHp+enq7qaQEMoEzgpyWtuOjz5cXXviIinoyIfkT0ly5dWtV8AAZQJvD3Jd1ge6XtqyRtkfRyvWMBqMKc/x48Is7bvl/S65KGJD0dEUdqnwzAwErd8CEiXpX0as2zAKgYV7IBiRE4kBiBA4kROJAYgQOJETiQGIEDiRE4kFgtO5tk9dhjj7U9Qm02b97c2Fq33nprY2tNTEw0tpY0u6tPl3AEBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK7OzydO2z9j+oImBAFSnzBF8t6SNNc8BoAZzBh4R70j6TwOzAKgYP4MDibF1EZBYZYGzdRHQPZyiA4mV+TXZc5L+LGmV7SnbP69/LABVKLM32dYmBgFQPU7RgcQIHEiMwIHECBxIjMCBxAgcSIzAgcQIHEhswW9dNDY21thahw8fbmwtSVq3bl1ja+3cubOxtWZmZhpbq8m/H5K0bdu2RtebC0dwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSK3PTxRW237J91PYR2w82MRiAwZW5Fv28pF9GxEHb10g6YPuNiDha82wABlRmb7KPIuJg8fGnko5JWlb3YAAGN6+fwW33JK2W9N4lvsfWRUDHlA7c9tWSXpD0UER88vXvs3UR0D2lArd9pWbj3hMRL9Y7EoCqlHkX3ZKeknQsIh6vfyQAVSlzBF8r6V5JG2xPFH9+WvNcAC
pQZm+ydyW5gVkAVIwr2YDECBxIjMCBxAgcSIzAgcQIHEiMwIHECBxIjL3JOmxkZKTtEWrR6/UaW4u9yQCkReBAYgQOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJFbmpovftP1X24eLrYt+3cRgAAZX5lLV/0raEBGfFbdPftf2HyLiLzXPBmBAZW66GJI+Kz69svgTdQ4FoBplNz4Ysj0h6YykNyKCrYuABaBU4BHxRUSMSFouaY3tH17iMWxdBHTMvN5Fj4gZSW9J2ljPOACqVOZd9KW2h4uPvyXpdknH6x4MwODKvIt+raRnbA9p9j8Iv4uIV+odC0AVyryL/jfN7gkOYIHhSjYgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHElvwWxcNDw83ttbixYsbW0uS1q9f3+h6TWlyO6Em/350EUdwIDECBxIjcCAxAgcSI3AgMQIHEiNwIDECBxIjcCCx0oEX90Y/ZJv7sQELxHyO4A9KOlbXIACqV3Znk+WS7pS0q95xAFSp7BF8p6SHJX1Z4ywAKlZm44O7JJ2JiANzPI69yYCOKXMEXyvpbtuTkp6XtMH2s19/EHuTAd0zZ+AR8WhELI+InqQtkt6MiHtqnwzAwPg9OJDYvO7oEhFjksZqmQRA5TiCA4kROJAYgQOJETiQGIEDiRE4kBiBA4kROJDYgt+6qEm9Xq/R9TZt2tTYWvv3729srbfffruxtUZHRxtbq4s4ggOJETiQGIEDiRE4kBiBA4kROJAYgQOJETiQGIEDiZW6kq24o+qnkr6QdD4i+nUOBaAa87lUdX1EnK1tEgCV4xQdSKxs4CHpj7YP2N5R50AAqlP2FP3HEXHa9vckvWH7eES8c/EDivB3SNJ1111X8ZgALkepI3hEnC7+94yklyStucRj2LoI6Jgymw9+2/Y1Fz6W9BNJH9Q9GIDBlTlF/76kl2xfePzeiHit1qkAVGLOwCPipKQfNTALgIrxazIgMQIHEiNwIDECBxIjcCAxAgcSI3AgMQIHEnNEVP6k/X4/xsfHK3/ethVX8zVm3bp1ja01MTHR2FpNbgE1NjbW2FqSNDw83Mg6/X5f4+Pjc/6F5AgOJEbgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRWKnDbw7b32T5u+5jtm+seDMDgyt4X/TeSXouIn9m+StKiGmcCUJE5A7e9WNItkrZJUkSck3Su3rEAVKHMKfpKSdOSRm0fsr2ruD86gI4rE/gVkm6S9ERErJb0uaRHvv4g2ztsj9sen56ernhMAJejTOBTkqYi4r3i832aDf4r2LoI6J45A4+IjyWdsr2q+NJtko7WOhWASpR9F/0BSXuKd9BPStpe30gAqlIq8IiYkNSveRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSI3AgsbKXqkLS6Ohoo+tt397cFcFN7oO2e/fuxtZqaq+wruIIDiRG4EBiBA4kRuBAYgQOJEbgQGIEDiRG4EBiBA4kNmfgtlfZnrjozye2H2piOACDmfNS1Yj4UNKIJNkeknRa0ks1zwWgAvM9Rb9N0j8j4l91DAOgWvMNfIuk5y71DbYuArqndODFpgd3S/r9pb7P1kVA98znCH6HpIMR8e+6hgFQrfkEvlX/5/QcQDeVCrzYD/x2SS/WOw6AKpXdm+xzSd+teRYAFeNKNiAxAgcSI3AgMQIHEiNwIDECBxIjcCAxAgcSc0RU/6T2tKT5/pPSJZLOVj5MN2R9bbyu9vwgIub8V121BH45bI9HRL/tOeqQ9bXxurqPU3QgMQIHEutS4E+2PUCNsr42XlfHdeZncADV69IRHEDFOhG47Y22P7R9wvYjbc9TBdsrbL9l+6jtI7YfbHumKtkesn3I9ittz1Il28O299k+bvuY7ZvbnmkQrZ+iF/da/4dm7xgzJel9SVsj4mirgw3I9rWSro2Ig7avkXRA0uaF/rousP0LSX1J34mIu9qepyq2n5H0p4
jYVdxodFFEzLQ91+XqwhF8jaQTEXEyIs5Jel7SppZnGlhEfBQRB4uPP5V0TNKydqeqhu3lku6UtKvtWapke7GkWyQ9JUkRcW4hxy11I/Blkk5d9PmUkoRwge2epNWS3mt3ksrslPSwpC/bHqRiKyVNSxotfvzYVdyPcMHqQuCp2b5a0guSHoqIT9qeZ1C275J0JiIOtD1LDa6QdJOkJyJitaTPJS3o94S6EPhpSSsu+nx58bUFz/aVmo17T0RkuSPtWkl3257U7I9TG2w/2+5IlZmSNBURF8609mk2+AWrC4G/L+kG2yuLNzW2SHq55ZkGZtua/VnuWEQ83vY8VYmIRyNieUT0NPv/1ZsRcU/LY1UiIj6WdMr2quJLt0la0G+Klrptcp0i4rzt+yW9LmlI0tMRcaTlsaqwVtK9kv5ue6L42q8i4tUWZ8LcHpC0pzjYnJS0veV5BtL6r8kA1KcLp+gAakLgQGIEDiRG4EBiBA4kRuBAYgQOJEbgQGL/A9ozs2W/5x3pAAAAAElFTkSuQmCC\n", 176 | "text/plain": [ 177 | "" 178 | ] 179 | }, 180 | "metadata": {}, 181 | "output_type": "display_data" 182 | } 183 | ], 184 | "source": [ 185 | "some_digit = X[666]\n", 186 | "some_digit_image = some_digit.reshape(8,8)\n", 187 | "plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)\n", 188 | "plt.show()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## train test split" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 10, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from c1_knn.model_selection import train_test_split\n", 205 | "from c1_knn.kNN import KNNClassifier\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 11, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 12, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "my_knn_clf = KNNClassifier(k=3)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 13, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "kNN(k=3)" 235 | ] 236 | }, 237 | "execution_count": 13, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "my_knn_clf.fit(X_train, y_train)" 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 14, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "0.99164345403899723" 255 | ] 256 | }, 257 | "execution_count": 14, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "y_predict = my_knn_clf.predict(X_test)\n", 264 | "# 正确率\n", 265 | "np.sum(y_predict==y_test) / np.shape(y_test)[0]" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "### 把统计正确率封装为一个方法" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 15, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "0.99164345403899723" 284 | ] 285 | }, 286 | "execution_count": 15, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "from c1_knn.metrics import accuracy_score\n", 293 | "accuracy_score(y_test, y_predict)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## sklearn中的accuracy_score" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 16, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "from sklearn.model_selection._split import train_test_split as train_test_spl\n", 310 | "from sklearn.neighbors.classification import KNeighborsClassifier\n", 311 | "from sklearn.metrics import accuracy_score as score\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 17, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "X_train, X_test, y_train, y_test = train_test_spl(X, y, test_size=0.2, random_state=666)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 18, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, 
metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')" 332 | ] 333 | }, 334 | "execution_count": 18, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n", 341 | "knn_clf.fit(X_train, y_train)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 19, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "y_predict = knn_clf.predict(X_test)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 20, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "0.98888888888888893" 362 | ] 363 | }, 364 | "execution_count": 20, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "score(y_test,y_predict)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 21, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/plain": [ 381 | "0.98888888888888893" 382 | ] 383 | }, 384 | "execution_count": 21, 385 | "metadata": {}, 386 | "output_type": "execute_result" 387 | } 388 | ], 389 | "source": [ 390 | "knn_clf.score(X_test, y_test)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "两个score的结果一样" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [] 406 | } 407 | ], 408 | "metadata": { 409 | "kernelspec": { 410 | "display_name": "Python 2", 411 | "language": "python", 412 | "name": "python2" 413 | }, 414 | "language_info": { 415 | "codemirror_mode": { 416 | "name": "ipython", 417 | "version": 2 418 | }, 419 | "file_extension": ".py", 420 | "mimetype": "text/x-python", 421 | "name": "python", 422 | "nbconvert_exporter": "python", 423 | "pygments_lexer": "ipython2", 424 | "version": "2.7.6" 425 | } 426 | }, 427 | 
"nbformat": 4, 428 | "nbformat_minor": 0 429 | } 430 | -------------------------------------------------------------------------------- /c1_knn/05_Hyper_Parameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 超参数" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn import datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "digits = datasets.load_digits()\n", 29 | "X = digits.data\n", 30 | "y = digits.target" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.model_selection._split import train_test_split" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=666)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "0.98888888888888893" 60 | ] 61 | }, 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "from sklearn.neighbors.classification import KNeighborsClassifier\n", 69 | "\n", 70 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n", 71 | "knn_clf.fit(X_train, y_train)\n", 72 | "knn_clf.score(X_test, y_test)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 寻找最好的k" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "outputs": [ 87 | 
{ 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "best_score: 0.991666666667\nbest_k: 4\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "best_score = 0.0\n", 97 | "best_k = -1\n", 98 | "for k in range(1,11):\n", 99 | " knn_clf = KNeighborsClassifier(n_neighbors=k)\n", 100 | " knn_clf.fit(X_train, y_train)\n", 101 | " score = knn_clf.score(X_test, y_test)\n", 102 | " if score > best_score:\n", 103 | " best_score = score\n", 104 | " best_k = k\n", 105 | "print('best_score:',best_score)\n", 106 | "print('best_k:', best_k)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## 考虑距离?不考虑距离? \n", 114 | "引出另一个超参数:距离权重" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 8, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "best_score: 0.991666666667\nbest_k: 4\nbest weights: uniform\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "best_method = ''\n", 132 | "best_score = 0.0\n", 133 | "best_k = -1\n", 134 | "for method in ['uniform','distance']:\n", 135 | " for k in range(1,11):\n", 136 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)\n", 137 | " knn_clf.fit(X_train, y_train)\n", 138 | " score = knn_clf.score(X_test, y_test)\n", 139 | " if score > best_score:\n", 140 | " best_score = score\n", 141 | " best_k = k\n", 142 | " best_method = method\n", 143 | "print('best_score:',best_score)\n", 144 | "print('best_k:', best_k)\n", 145 | "print('best weights:', best_method)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## 搜索明可夫斯基距离相应的p" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "best_score: 0.988888888889\nbest_k: 5\nbest p: 1\nCPU times: user 15.7 s, sys: 116 ms, 
total: 15.8 s\nWall time: 16.1 s\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "%%time\n", 170 | "best_score = 0.0\n", 171 | "best_k = -1\n", 172 | "best_p = -1\n", 173 | "for p in range(1,6):\n", 174 | " for k in range(1,11):\n", 175 | " knn_clf = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)\n", 176 | " knn_clf.fit(X_train, y_train)\n", 177 | " score = knn_clf.score(X_test, y_test)\n", 178 | " if score > best_score:\n", 179 | " best_score = score\n", 180 | " best_k = k\n", 181 | " best_p = p\n", 182 | "print('best_score:',best_score)\n", 183 | "print('best_k:', best_k)\n", 184 | "print('best p:', best_p)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 2", 198 | "language": "python", 199 | "name": "python2" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 2 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython2", 211 | "version": "2.7.6" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 0 216 | } 217 | -------------------------------------------------------------------------------- /c1_knn/08_Scaler_in_Scikit_Learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Scikit-learn中的Scaler" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn import datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "iris 
= datasets.load_iris()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "X = iris.data\n", 38 | "y = iris.target" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "array([[ 5.1, 3.5, 1.4, 0.2],\n [ 4.9, 3. , 1.4, 0.2],\n [ 4.7, 3.2, 1.3, 0.2],\n [ 4.6, 3.1, 1.5, 0.2],\n [ 5. , 3.6, 1.4, 0.2],\n [ 5.4, 3.9, 1.7, 0.4],\n [ 4.6, 3.4, 1.4, 0.3],\n [ 5. , 3.4, 1.5, 0.2],\n [ 4.4, 2.9, 1.4, 0.2],\n [ 4.9, 3.1, 1.5, 0.1]])" 50 | ] 51 | }, 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "# 可以看到X尚未归一化时的数据\n", 59 | "X[:10,:]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.model_selection._split import train_test_split\n", 69 | "\n", 70 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666)\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Scikit-learn中的StandardScaler,进行0均值标准化处理" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from sklearn.preprocessing.data import StandardScaler" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "StandardScaler(copy=True, with_mean=True, with_std=True)" 98 | ] 99 | }, 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "standard_scaler = StandardScaler()\n", 107 | "standard_scaler.fit(X_train)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | 
"metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "array([ 5.83416667, 3.0825 , 3.70916667, 1.16916667])" 119 | ] 120 | }, 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# 训练集X特征矩阵的均值\n", 128 | "standard_scaler.mean_" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 11, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "array([ 0.81019502, 0.44076874, 1.76295187, 0.75429833])" 140 | ] 141 | }, 142 | "execution_count": 11, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "# 训练集X特征矩阵的标准差\n", 149 | "standard_scaler.scale_" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 12, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "array([[-0.90616043, 0.94720873, -1.30982967, -1.28485856],\n [-1.15301457, -0.18717298, -1.30982967, -1.28485856],\n [-0.16559799, -0.64092567, 0.22169257, 0.17345038],\n [ 0.45153738, 0.72033239, 0.95909217, 1.49918578],\n [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],\n [ 1.43895396, 0.2665797 , 0.56203085, 0.30602392],\n [ 0.3281103 , -1.09467835, 1.07253826, 0.30602392],\n [ 2.1795164 , -0.18717298, 1.63976872, 1.2340387 ],\n [-0.78273335, 2.30846679, -1.25310662, -1.4174321 ],\n [ 0.45153738, -2.00218372, 0.44858475, 0.43859746],\n [ 1.80923518, -0.41404933, 1.46959958, 0.83631808],\n [ 0.69839152, 0.2665797 , 0.90236912, 1.49918578],\n [ 0.20468323, 0.72033239, 0.44858475, 0.571171 ],\n [-0.78273335, -0.86780201, 0.10824648, 0.30602392],\n [-0.53587921, 1.40096142, -1.25310662, -1.28485856],\n [-0.65930628, 1.40096142, -1.25310662, -1.28485856],\n [-1.0295875 , 0.94720873, -1.19638358, -0.7545644 ],\n [-1.77014994, -0.41404933, -1.30982967, -1.28485856],\n [-0.04217092, -0.86780201, 0.10824648, 0.04087684],\n [-0.78273335, 
0.72033239, -1.30982967, -1.28485856],\n [-1.52329579, 0.72033239, -1.30982967, -1.15228502],\n [ 0.82181859, 0.2665797 , 0.78892303, 1.10146516],\n [-0.16559799, -0.41404933, 0.27841562, 0.17345038],\n [ 0.94524567, -0.18717298, 0.39186171, 0.30602392],\n [ 0.20468323, -0.41404933, 0.44858475, 0.43859746],\n [-1.39986872, 0.2665797 , -1.19638358, -1.28485856],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [ 1.06867274, 0.03970336, 1.07253826, 1.63175932],\n [ 0.57496445, -0.86780201, 0.67547694, 0.83631808],\n [ 0.3281103 , -0.64092567, 0.56203085, 0.04087684],\n [ 0.45153738, -0.64092567, 0.61875389, 0.83631808],\n [-0.16559799, 2.98909581, -1.25310662, -1.01971148],\n [ 0.57496445, -1.3215547 , 0.67547694, 0.43859746],\n [ 0.69839152, -0.41404933, 0.33513866, 0.17345038],\n [-0.90616043, 1.62783776, -1.02621444, -1.01971148],\n [ 1.19209981, -0.64092567, 0.61875389, 0.30602392],\n [-0.90616043, 0.94720873, -1.30982967, -1.15228502],\n [-1.89357701, -0.18717298, -1.47999881, -1.4174321 ],\n [ 0.08125616, -0.18717298, 0.78892303, 0.83631808],\n [ 0.69839152, -0.64092567, 1.07253826, 1.2340387 ],\n [-0.28902506, -0.64092567, 0.67547694, 1.10146516],\n [-0.41245214, -1.54843104, -0.00519961, -0.22427024],\n [ 1.31552689, 0.03970336, 0.67547694, 0.43859746],\n [ 0.57496445, 0.72033239, 1.07253826, 1.63175932],\n [ 0.82181859, -0.18717298, 1.18598435, 1.36661224],\n [-0.16559799, 1.62783776, -1.13966053, -1.15228502],\n [ 0.94524567, -0.41404933, 0.5053078 , 0.17345038],\n [ 1.06867274, 0.49345605, 1.12926131, 1.76433286],\n [-1.27644165, -0.18717298, -1.30982967, -1.4174321 ],\n [-1.0295875 , 1.17408507, -1.30982967, -1.28485856],\n [ 0.20468323, -0.18717298, 0.61875389, 0.83631808],\n [-1.0295875 , -0.18717298, -1.19638358, -1.28485856],\n [ 0.3281103 , -0.18717298, 0.67547694, 0.83631808],\n [ 0.69839152, 0.03970336, 1.01581521, 0.83631808],\n [-0.90616043, 1.40096142, -1.25310662, -1.01971148],\n [-0.16559799, -0.18717298, 0.27841562, 0.04087684],\n 
[-1.0295875 , 0.94720873, -1.36655271, -1.15228502],\n [-0.90616043, 1.62783776, -1.25310662, -1.15228502],\n [-1.52329579, 0.2665797 , -1.30982967, -1.28485856],\n [-0.53587921, -0.18717298, 0.44858475, 0.43859746],\n [ 0.82181859, -0.64092567, 0.5053078 , 0.43859746],\n [ 0.3281103 , -0.64092567, 0.16496953, 0.17345038],\n [-1.27644165, 0.72033239, -1.19638358, -1.28485856],\n [-0.90616043, 0.49345605, -1.13966053, -0.88713794],\n [-0.04217092, -0.86780201, 0.78892303, 0.96889162],\n [-0.28902506, -0.18717298, 0.22169257, 0.17345038],\n [ 0.57496445, -0.64092567, 0.78892303, 0.43859746],\n [ 1.06867274, 0.49345605, 1.12926131, 1.2340387 ],\n [ 1.68580811, -0.18717298, 1.18598435, 0.571171 ],\n [ 1.06867274, -0.18717298, 0.84564608, 1.49918578],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [-1.15301457, -1.3215547 , 0.44858475, 0.70374454],\n [-0.16559799, -1.3215547 , 0.73219998, 1.10146516],\n [-1.15301457, -1.54843104, -0.2320918 , -0.22427024],\n [-0.41245214, -1.54843104, 0.05152343, -0.0916967 ],\n [ 1.06867274, -1.3215547 , 1.18598435, 0.83631808],\n [ 0.82181859, -0.18717298, 1.01581521, 0.83631808],\n [-0.16559799, -1.09467835, -0.1186457 , -0.22427024],\n [ 0.20468323, -2.00218372, 0.73219998, 0.43859746],\n [ 1.06867274, 0.03970336, 0.56203085, 0.43859746],\n [-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],\n [ 0.57496445, -1.3215547 , 0.73219998, 0.96889162],\n [-1.39986872, 0.2665797 , -1.36655271, -1.28485856],\n [ 0.20468323, -0.86780201, 0.78892303, 0.571171 ],\n [-0.04217092, -1.09467835, 0.16496953, 0.04087684],\n [ 1.31552689, 0.2665797 , 1.12926131, 1.49918578],\n [-1.77014994, -0.18717298, -1.36655271, -1.28485856],\n [ 1.56238103, -0.18717298, 1.2427074 , 1.2340387 ],\n [ 1.19209981, 0.2665797 , 1.2427074 , 1.49918578],\n [-0.78273335, 0.94720873, -1.25310662, -1.28485856],\n [ 2.54979762, 1.62783776, 1.52632263, 1.10146516],\n [ 0.69839152, -0.64092567, 1.07253826, 1.36661224],\n [-0.28902506, -0.41404933, -0.06192266, 
0.17345038],\n [-0.41245214, 2.53534313, -1.30982967, -1.28485856],\n [-1.27644165, -0.18717298, -1.30982967, -1.15228502],\n [ 0.57496445, -0.41404933, 1.07253826, 0.83631808],\n [-1.77014994, 0.2665797 , -1.36655271, -1.28485856],\n [-0.53587921, 1.8547141 , -1.13966053, -1.01971148],\n [-1.0295875 , 0.72033239, -1.19638358, -1.01971148],\n [ 1.06867274, -0.18717298, 0.73219998, 0.70374454],\n [-0.53587921, 1.8547141 , -1.36655271, -1.01971148],\n [ 2.30294347, -0.64092567, 1.69649176, 1.10146516],\n [-0.28902506, -0.86780201, 0.27841562, 0.17345038],\n [ 1.19209981, -0.18717298, 1.01581521, 1.2340387 ],\n [-0.41245214, 0.94720873, -1.36655271, -1.28485856],\n [-1.27644165, 0.72033239, -1.02621444, -1.28485856],\n [-0.53587921, 0.72033239, -1.13966053, -1.28485856],\n [ 2.30294347, 1.62783776, 1.69649176, 1.36661224],\n [ 1.31552689, 0.03970336, 0.95909217, 1.2340387 ],\n [-0.28902506, -1.3215547 , 0.10824648, -0.0916967 ],\n [-0.90616043, 0.72033239, -1.25310662, -1.28485856],\n [-0.90616043, 1.62783776, -1.19638358, -1.28485856],\n [ 0.3281103 , -0.41404933, 0.56203085, 0.30602392],\n [-0.04217092, 2.08159044, -1.42327576, -1.28485856],\n [-1.0295875 , -2.45593641, -0.1186457 , -0.22427024],\n [ 0.69839152, 0.2665797 , 0.44858475, 0.43859746],\n [ 0.3281103 , -0.18717298, 0.5053078 , 0.30602392],\n [ 0.08125616, 0.2665797 , 0.61875389, 0.83631808],\n [ 0.20468323, -2.00218372, 0.16496953, -0.22427024],\n [ 1.93266225, -0.64092567, 1.35615349, 0.96889162]])" 161 | ] 162 | }, 163 | "execution_count": 12, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# 归一化处理\n", 170 | "X_train_normalization = standard_scaler.transform(X_train)\n", 171 | "X_train_normalization" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 13, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "array([[-0.28902506, -0.18717298, 0.44858475, 0.43859746],\n [-0.04217092, 
-0.64092567, 0.78892303, 1.63175932],\n [-1.0295875 , -1.77530738, -0.2320918 , -0.22427024],\n [-0.04217092, -0.86780201, 0.78892303, 0.96889162],\n [-1.52329579, 0.03970336, -1.25310662, -1.28485856],\n [-0.41245214, -1.3215547 , 0.16496953, 0.17345038],\n [-0.16559799, -0.64092567, 0.44858475, 0.17345038],\n [ 0.82181859, -0.18717298, 0.84564608, 1.10146516],\n [ 0.57496445, -1.77530738, 0.39186171, 0.17345038],\n [-0.41245214, -1.09467835, 0.39186171, 0.04087684],\n [ 1.06867274, 0.03970336, 0.39186171, 0.30602392],\n [-1.64672287, -1.77530738, -1.36655271, -1.15228502],\n [-1.27644165, 0.03970336, -1.19638358, -1.28485856],\n [-0.53587921, 0.72033239, -1.25310662, -1.01971148],\n [ 1.68580811, 1.17408507, 1.35615349, 1.76433286],\n [-0.04217092, -0.86780201, 0.22169257, -0.22427024],\n [-1.52329579, 1.17408507, -1.53672185, -1.28485856],\n [ 1.68580811, 0.2665797 , 1.29943044, 0.83631808],\n [ 1.31552689, 0.03970336, 0.78892303, 1.49918578],\n [ 0.69839152, -0.86780201, 0.90236912, 0.96889162],\n [ 0.57496445, 0.49345605, 0.56203085, 0.571171 ],\n [-1.0295875 , 0.72033239, -1.25310662, -1.28485856],\n [ 2.30294347, -1.09467835, 1.80993786, 1.49918578],\n [-1.0295875 , 0.49345605, -1.30982967, -1.28485856],\n [ 0.45153738, -0.41404933, 0.33513866, 0.17345038],\n [ 0.08125616, -0.18717298, 0.27841562, 0.43859746],\n [-1.0295875 , 0.2665797 , -1.42327576, -1.28485856],\n [-0.41245214, -1.77530738, 0.16496953, 0.17345038],\n [ 0.57496445, 0.49345605, 1.29943044, 1.76433286],\n [ 2.30294347, -0.18717298, 1.35615349, 1.49918578]])" 183 | ] 184 | }, 185 | "execution_count": 13, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# 对测试数据集的特征矩阵进行归一化处理\n", 192 | "X_test_normalization = standard_scaler.transform(X_test)\n", 193 | "X_test_normalization" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 14, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "from 
sklearn.neighbors.classification import KNeighborsClassifier" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 15, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n weights='uniform')" 214 | ] 215 | }, 216 | "execution_count": 15, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "knn_clf = KNeighborsClassifier(n_neighbors=3)\n", 223 | "knn_clf.fit(X_train_normalization, y_train)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "#### 如果对训练数据集进行了归一化处理,则测试数据集也必须进行归一化处理,否则结果会很差" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 18, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "0.33333333333333331" 242 | ] 243 | }, 244 | "execution_count": 18, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "knn_clf.score(X_test, y_test)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 19, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "1.0" 262 | ] 263 | }, 264 | "execution_count": 19, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "knn_clf.score(X_test_normalization, y_test)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 2", 284 | "language": "python", 285 | "name": "python2" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 2 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": 
"text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython2", 297 | "version": "2.7.6" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 0 302 | } 303 | -------------------------------------------------------------------------------- /c1_knn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c1_knn/kNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from math import sqrt 4 | from collections import Counter 5 | 6 | 7 | def kNN_classify(k: int, X_train: np.ndarray, y_train: np.ndarray, x: np.ndarray): 8 | """ 9 | kNN分类算法 10 | :param k: kNN的k值 11 | :param X_train: 训练集的特征(矩阵) 12 | :param y_train: 训练集的标记(向量) 13 | :param x: 需要预测的特征(向量) 14 | :return: 15 | """ 16 | assert 1 <= k <= X_train.shape[0], "k must be valid" 17 | assert X_train.shape[0] == y_train.shape[0], "训练集中,特征向量的记录数与标记的记录数目必须一致" 18 | assert X_train.shape[1] == x.shape[0], '需要预测的x的特征数目必须等于训练集中的特征数目' 19 | 20 | # 求x与每一条记录的欧拉距离 21 | distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train] 22 | 23 | nearest = np.argsort(distances) 24 | 25 | # 从y_train中取前k个与x距离最近的y 26 | topK_y = [y_train[i] for i in nearest[:k]] 27 | 28 | votes = Counter(topK_y) 29 | 30 | return votes.most_common(1)[0][0] 31 | 32 | 33 | class KNNClassifier(object): 34 | """ 35 | 重新整理自己写的kNN算法,使他更符合scikit-Learn的模式 36 | """ 37 | def __init__(self, k): 38 | """ 39 | 初始化kNN分类器 40 | :param k: 41 | """ 42 | self.k = k 43 | self._X_train = None 44 | self._y_train = None 45 | 46 | def fit(self, X_train, y_train): 47 | """ 48 | 根据训练数据集X_train和y_train训练kNN分类器 49 | :param X_train: 50 | :param y_train: 51 | :return: 52 | """ 53 | self._X_train = X_train 54 | self._y_train = y_train 55 | return self 56 | 57 | def predict(self, X_predict): 58 | 
""" 59 | 给定待预测数据集X_predict,返回表示X_predict的结果向量 60 | :param X_predict: 61 | :return: 62 | """ 63 | y_predict = [self._predict(x) for x in X_predict] 64 | return np.array(y_predict) 65 | 66 | def _predict(self, x): 67 | """ 68 | 给定单个带预测数据x,返回x_predict的预测结果值 69 | :param x: 70 | :return: 71 | """ 72 | # 差不多就是把kNN_classify方法的内容全部搬过来 73 | 74 | # 求x与每一条记录的欧拉距离 75 | distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train] 76 | 77 | nearest = np.argsort(distances) 78 | 79 | # 从y_train中取前k个与x距离最近的y 80 | topK_y = [self._y_train[i] for i in nearest[:self.k]] 81 | 82 | votes = Counter(topK_y) 83 | 84 | return votes.most_common(1)[0][0] 85 | 86 | def score(self, X_test, y_test): 87 | y_predict = self.predict(X_test) 88 | return sum(y_predict == y_test) / len(y_test) 89 | 90 | def __repr__(self): 91 | return 'kNN(k=%d)'%self.k -------------------------------------------------------------------------------- /c1_knn/knn.md: -------------------------------------------------------------------------------- 1 | # kNN算法 2 | - 属于监督学习 3 | - 非参数学习 4 | - 是解决分类问题的算法,天然可解决多分类问题 5 | - kNN没有模型,可以说是一个(也许也是唯一一个)不需要训练过程的算法 6 | - 为了和其他算法统一,可以认为训练数据集就是模型本身 7 | 8 | ## 本质 9 | 两个(或几个)样本如果足够相似,那么它们就有极高的概率属于同一个类别。所谓“相似”,就是样本就特征空间中的距离相近。 10 | ## 优点 11 | - 思想极其简单 12 | - 可以解释机器学习算法使用过程中的很多细节问题 13 | - 更完整的刻画机器学习应用的流程 14 | - 应用数学知识少(近乎为零) 15 | - 效果好 16 | - 天然适合解决多分类问题,同时也适合解决回归问题 17 | 18 | ## 缺点 19 | - 最大的缺点:效率低下 20 | 如果训练集有m个样本,n个特征,则预测每一个新的数据,需要O(m*n) 21 | - 优化,使用树结构:KD-Tree, Ball-Tree 22 | - 即便如此,依然效率低下 23 | - 高度数据相关,而且对outlier更敏感 24 | - 预测结果不具有可解释性 25 | 只知道属于哪个类别,但是无法解释为什么属于某个类别 26 | - 维数灾难 27 | - 随着维度的增加,“看似相似”的两个点之间的距离越来越大 28 | - 解决方法:降维,例如PCA 29 | 30 | ## kNN的过程 31 | ### 计算特征空间中的距离 32 | #### 欧拉距离(最为常见) 33 | - 平面距离: 34 | 35 | - 立体距离 36 | 37 | - n维空间距离 38 | 39 | #### 曼哈顿距离 40 | 41 | #### 明可夫斯基距离 42 | 43 | - 当p=1,相当于曼哈顿距离 44 | - 当p=2,相当于欧拉距离 45 | - 当p=3,其他距离 46 | 47 | ## 参数 48 | ### 超参数 49 | - kNN算法中的k是典型的超参数 50 | - 默认值为5 (经验数值) 51 | - 距离的权重 52 | - 距离越近,权重越大 53 | - 关于“距离”的定义 54 | - 
明可夫斯基距离(默认) 55 | 明可夫斯基距离的p取值 56 | - p=1:曼哈顿距离 57 | - p=2(默认):欧氏距离(欧几里得距离,Euclidean distance) 58 | - p=3:明可夫斯基距离(其他距离) 59 | - 其他更多的距离定义 60 | - 向量空间余弦相似度Cosine Similarity 61 | - 调整余弦相似度Adjusted Cosine Similarity 62 | - 皮尔森相关系数 Pearson Correlation Coefficient 63 | - Jaccard相似系数 Jaccard Coefficient 64 | 65 | ### 模型参数 66 | kNN算法没有模型参数 67 | 68 | ## 数据归一化 Feature Scaling 69 | ### 需要归一化的原因 70 | 如果某些特征数值较大,会主导最终距离的结果 71 | ### 解决方案 72 | 把所有的数据映射到同一尺度 73 | #### 最值归一化 normalization 74 | 把所有数据映射到0~1之间: 75 | $$x_{scale} = \frac{x - x_{min}}{x_{max} - x_{min}}$$ 76 | 77 | - 适用于分布有明显边界的情况 78 | - 例如考试分数,最大是100,最小是0 79 | - 例如每个像素的RGB颜色,都是0~255之间 80 | - 受outlier影响较大 81 | - 例如收入,有些人特别特别高 82 | 83 | #### Standardization(0均值标准化/均值方差归一化) 84 | 针对最值归一化的缺憾改进 85 | **把所有数据归一到均值为0,方差为1的分布中** 86 | 87 | $$x_{scale} = \frac{x - x_{mean}}{S}$$ 其中 $x_{mean}$ 为该特征的均值,$S$ 为该特征的标准差。 88 | 89 | - 并不保证数据在0~1之间 90 | - 但是所有数值的均值在0的位置 91 | - 数据方差/标准差为1 92 | 93 | 适用于数据分布没有明显的分界(有可能存在极端数据值)。其实数据分布有明显边界的情况也是同样适合的,所以选它一般没错。 94 | 95 | ### 数据归一化的一些注意事项 96 | #### 对测试数据集如何归一化 97 | 例如训练集有均值 $mean_{train}$、标准差 $std_{train}$,那么测试数据集进行归一化(例如0均值标准化)时,应该使用训练集的均值和标准差,即 $(x_{test} - mean_{train}) / std_{train}$,而不是用测试集的均值和标准差。原因有: 98 | 1. 测试数据是模拟真实环境,真实环境很可能无法得到所有测试数据的均值和标准差。(个人理解,如果使用测试数据集的均值和标准差,那么以后每有一个新的样例进来,岂不是要重新计算(分配)所有测试样例的均值和标准差?) 99 | 2. 
# ---------------------------------------------------------------------------
# c1_knn/metrics.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Metrics
import numpy as np


def accuracy_score(y_true, y_predict):
    """
    Return the fraction of predictions that equal the true labels.

    Both arguments are converted to numpy arrays first, so plain Python
    sequences compare element by element instead of as whole objects.

    :param y_true: true labels
    :param y_predict: predicted labels, same length as y_true
    :return: number of matching positions divided by len(y_true)
    :raises AssertionError: if the two inputs differ in length
    """
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    assert y_true.shape[0] == y_predict.shape[0], 'the size of y_true must be equal to the size of y_predict'
    return np.sum(y_predict == y_true) / len(y_true)


# ---------------------------------------------------------------------------
# c1_knn/model_selection.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


def train_test_split(X, y, test_ratio=0.2, seed=None):
    """
    Split X and y into X_train, X_test, y_train, y_test according to test_ratio.

    :param X: feature matrix, one row per sample
    :param y: labels, one entry per row of X
    :param test_ratio: fraction of the samples placed in the test set (0..1)
    :param seed: if not None, passed to np.random.seed before shuffling so the
                 split is reproducible; 0 is a valid seed
    :return: X_train, X_test, y_train, y_test
    :raises AssertionError: if X and y differ in length or test_ratio is
                            outside [0, 1]
    """
    X = np.asarray(X)
    y = np.asarray(y)
    assert X.shape[0] == y.shape[0], 'the size of X must be equal to the size of y'
    assert 0.0 <= test_ratio <= 1.0, 'test_ratio must be valid'

    # Compare against None explicitly: a truthiness test would skip seed=0.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(X.shape[0])

    # The first test_size shuffled indexes form the test set, the rest train.
    test_size = int(X.shape[0] * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test


# ---------------------------------------------------------------------------
# c1_knn/preprocessing.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


class StandardScaler(object):
    """
    A minimal Standard Scaler modelled on the scikit-learn one.
    """
    def __init__(self):
        # Per-feature mean and standard deviation; None until fit is called.
        self.mean_ = None
        self.scale_ = None

    def fit(self, X: np.ndarray):
        """
        Compute the per-feature mean and standard deviation of X.
        (Only 2-dimensional data is handled for now.)

        :param X: training feature matrix, one row per sample
        :return: self, so calls can be chained
        :raises AssertionError: if X is not 2-dimensional
        """
        assert X.ndim == 2, 'The dimension of X must be 2'

        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0)
        # A constant feature has zero standard deviation; dividing by it would
        # yield NaN/inf. Use a factor of 1 so such a feature is only centred.
        scale[scale == 0.0] = 1.0
        self.scale_ = scale

        return self

    def transform(self, X):
        """
        Standardise X with the mean and standard deviation learned by fit.

        :param X: feature matrix with the same number of columns as seen in fit
        :return: (X - mean_) / scale_, computed column-wise by broadcasting
        :raises AssertionError: if X is not 2-dimensional, fit has not been
                                called, or the column count differs
        """
        assert X.ndim == 2, 'The dimension of X must be 2'
        assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
        assert X.shape[1] == len(self.mean_), 'the feature number of X must be equal to mean_ and scale_'
        X_standard = (X - self.mean_) / self.scale_
        return X_standard

    def transform_standard(self, X):
        """
        Standardise X column by column (explicit-loop teaching version of
        transform; produces the same values).

        :param X: feature matrix with the same number of columns as seen in fit
        :return: float array of the same shape as X
        :raises AssertionError: if X is not 2-dimensional, fit has not been
                                called, or the column count differs
        """
        assert X.ndim == 2, 'The dimension of X must be 2'
        assert self.mean_ is not None and self.scale_ is not None, "must fit before transform"
        assert X.shape[1] == len(self.mean_), 'the feature number of X must be equal to mean_ and scale_'
        X_res = np.empty(shape=X.shape, dtype=float)
        for col in range(X.shape[1]):
            X_res[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col]
        return X_res


if __name__ == '__main__':
    from sklearn import datasets
    # Import from the public package path rather than the private _split module.
    from sklearn.model_selection import train_test_split
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

    ss = StandardScaler()
    ss.fit(X_train)

    X_standard = ss.transform(X)
    print(ss.transform_standard(X))
    print('-'*100)
    print(X_standard)
{}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from sklearn import datasets" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "boston = datasets.load_boston()\n", 30 | "\n", 31 | "X = boston.data\n", 32 | "y = boston.target\n", 33 | "\n", 34 | "X = X[y<50.0]\n", 35 | "y = y[y<50.0]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "(490, 13)" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "X.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from playML import model_selection\n", 65 | "X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,seed=666)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "LinearRegression()" 77 | ] 78 | }, 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "from c2_linear_regression.linear_regression import LinearRegression\n", 86 | "\n", 87 | "reg = LinearRegression()\n", 88 | "\n", 89 | "reg.fit_normal(X_train, y_train)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "array([ -1.18919477e-01, 3.63991462e-02, -3.56494193e-02,\n 5.66737830e-02, -1.16195486e+01, 3.42022185e+00,\n -2.31470282e-02, -1.19509560e+00, 2.59339091e-01,\n -1.40112724e-02, -8.36521175e-01, 7.92283639e-03,\n -3.81966137e-01])" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | 
"output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "reg.coef_" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "34.161435496212974" 121 | ] 122 | }, 123 | "execution_count": 7, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "reg.interception_" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 8, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "0.81298026026586467" 141 | ] 142 | }, 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "reg.score(X_test, y_test)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 2", 163 | "language": "python", 164 | "name": "python2" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 2 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython2", 176 | "version": "2.7.6" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 0 181 | } 182 | -------------------------------------------------------------------------------- /c2_linear_regression/09_Regression_in_scikit_learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# scikit-learn中的回归问题" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import 
numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from sklearn import datasets" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "boston = datasets.load_boston()\n", 30 | "\n", 31 | "X = boston.data\n", 32 | "y = boston.target\n", 33 | "\n", 34 | "X = X[y<50.0]\n", 35 | "y = y[y<50.0]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "(490, 13)" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "X.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from sklearn.model_selection._split import train_test_split\n", 65 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## scikit-learn中的线性回归" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from sklearn.linear_model.base import LinearRegression\n", 82 | "\n", 83 | "lin_reg = LinearRegression()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/scipy/linalg/basic.py:1226: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). 
Falling back to 'gelss' driver.\n warnings.warn(mesg, RuntimeWarning)\n" 96 | ] 97 | }, 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" 102 | ] 103 | }, 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "lin_reg.fit(X_train, y_train)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "array([ -1.14235739e-01, 3.12783163e-02, -4.30926281e-02,\n -9.16425531e-02, -1.09940036e+01, 3.49155727e+00,\n -1.40778005e-02, -1.06270960e+00, 2.45307516e-01,\n -1.23179738e-02, -8.80618320e-01, 8.43243544e-03,\n -3.99667727e-01])" 122 | ] 123 | }, 124 | "execution_count": 7, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "lin_reg.coef_" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 8, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "32.645660839653509" 142 | ] 143 | }, 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "lin_reg.intercept_" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 9, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "0.80089161995191005" 162 | ] 163 | }, 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "lin_reg.score(X_test,y_test)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### KNN Regressor (KNN解决回归问题)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "from 
sklearn.neighbors.regression import KNeighborsRegressor\n", 187 | "\n", 188 | "knn_reg = KNeighborsRegressor()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "#### KNN数据归一化" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 11, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.preprocessing.data import StandardScaler\n", 205 | "standard_scaler = StandardScaler()\n", 206 | "standard_scaler.fit(X_train)\n", 207 | "X_train_nor = standard_scaler.transform(X_train)\n", 208 | "X_test_nor = standard_scaler.transform(X_test)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 12, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform')" 220 | ] 221 | }, 222 | "execution_count": 12, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "knn_reg.fit(X_train_nor, y_train)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 13, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "0.82230080487286983" 240 | ] 241 | }, 242 | "execution_count": 13, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "knn_reg.score(X_test_nor, y_test)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "呃,knn效果比线性回归还好。。。 \n", 256 | "要是把网格搜索也用上岂不更离谱。。。" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 14, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Fitting 3 folds for each of 60 candidates, totalling 180 fits\n" 269 | ] 270 | }, 271 | { 272 | "name": "stderr", 273 
| "output_type": "stream", 274 | "text": [ 275 | "[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.5s finished\n" 276 | ] 277 | }, 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "GridSearchCV(cv=None, error_score='raise',\n estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform'),\n fit_params=None, iid=True, n_jobs=-1,\n param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n scoring=None, verbose=1)" 282 | ] 283 | }, 284 | "execution_count": 14, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "from sklearn.model_selection._search import GridSearchCV\n", 291 | "\n", 292 | "param_grid = [\n", 293 | " {\n", 294 | " 'weights':['uniform'],\n", 295 | " 'n_neighbors':[i for i in range(1,11)]\n", 296 | " },\n", 297 | " {\n", 298 | " 'weights':['distance'],\n", 299 | " 'n_neighbors':[i for i in range(1,11)],\n", 300 | " 'p':[i for i in range(1,6)]\n", 301 | " }\n", 302 | "]\n", 303 | "\n", 304 | "knn_reg2 = KNeighborsRegressor()\n", 305 | "grid_search = GridSearchCV(knn_reg2, param_grid, n_jobs=-1, verbose=1)\n", 306 | "grid_search.fit(X_train_nor, y_train)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "注意下面的分数与那些分类器回归器的score用的不是同一种标准,所以不能直接与它们比较" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 15, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "0.79480244433269864" 325 | ] 326 | }, 327 | "execution_count": 15, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "grid_search.best_score_" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 
| "execution_count": 16, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}" 345 | ] 346 | }, 347 | "execution_count": 16, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "grid_search.best_params_" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 17, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "knn_reg_grid_search = grid_search.best_estimator_" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "0.85652703298427613" 374 | ] 375 | }, 376 | "execution_count": 18, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "knn_reg_grid_search.score(X_test_nor, y_test)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 2", 396 | "language": "python", 397 | "name": "python2" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 2 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython2", 409 | "version": "2.7.6" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 0 414 | } 415 | -------------------------------------------------------------------------------- /c2_linear_regression/10_More_About_Linear_Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 更多关于线性回归模型的讨论" 10 | ] 11 | }, 12 | { 13 | 
"cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn import datasets\n", 20 | "\n", 21 | "boston = datasets.load_boston()\n", 22 | "\n", 23 | "X = boston.data\n", 24 | "y = boston.target\n", 25 | "\n", 26 | "X = X[y<50.0]\n", 27 | "y = y[y<50.0]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 5, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" 39 | ] 40 | }, 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "from sklearn.linear_model.base import LinearRegression\n", 48 | "\n", 49 | "lin_reg = LinearRegression()\n", 50 | "lin_reg.fit(X, y)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "array([ -1.05574295e-01, 3.52748549e-02, -4.35179251e-02,\n 4.55405227e-01, -1.24268073e+01, 3.75411229e+00,\n -2.36116881e-02, -1.21088069e+00, 2.50740082e-01,\n -1.37702943e-02, -8.38888137e-01, 7.93577159e-03,\n -3.50952134e-01])" 62 | ] 63 | }, 64 | "execution_count": 6, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "lin_reg.coef_" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "把系数按从小到大排一下序" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "array([ 4, 7, 10, 12, 0, 2, 6, 9, 11, 1, 8, 3, 5])" 89 | ] 90 | }, 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "arg_sort = np.argsort(lin_reg.coef_)\n", 98 | "arg_sort" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | 
"metadata": {}, 104 | "source": [ 105 | "看看按照影响程度从小到大排序后的各个系数对应的都是什么属性(名称)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',\n 'B', 'ZN', 'RAD', 'CHAS', 'RM'],\n dtype=' in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n" 308 | ] 309 | }, 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "0.80386489308947862" 314 | ] 315 | }, 316 | "execution_count": 15, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "sgd_reg = SGDRegressor()\n", 323 | "%time sgd_reg.fit(X_train_standard, y_train)\n", 324 | "sgd_reg.score(X_test_standard, y_test)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 16, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "CPU times: user 6.5 ms, sys: 1.69 ms, total: 8.19 ms\nWall time: 6.22 ms\n" 337 | ] 338 | }, 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "/usr/local/seamonster/MachineLearningClassicAlgorithmEnv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. 
Use max_iter and tol instead.\n DeprecationWarning)\n" 344 | ] 345 | }, 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "0.81255341149152971" 350 | ] 351 | }, 352 | "execution_count": 16, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "sgd_reg = SGDRegressor(n_iter=100)\n", 359 | "%time sgd_reg.fit(X_train_standard, y_train)\n", 360 | "sgd_reg.score(X_test_standard, y_test)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "比我们自己手写的快得多了。。。" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "Python 2", 381 | "language": "python", 382 | "name": "python2" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 2 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython2", 394 | "version": "2.7.6" 395 | } 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 0 399 | } 400 | -------------------------------------------------------------------------------- /c3_gradient_descent/08_Gradient_Debugging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 如何调试梯度" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 26, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 27, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "np.random.seed(666)\n", 29 | "X = np.random.random(size=(1000,10))" 30 | ] 31 | }, 
32 | { 33 | "cell_type": "code", 34 | "execution_count": 28, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "true_theta = np.arange(1,12,dtype=float)\n", 39 | "X_b = np.hstack((np.ones(shape=(len(X),1)),X))\n", 40 | "y = X_b.dot(true_theta) + np.random.normal(size=1000)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 29, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "(1000, 10)" 52 | ] 53 | }, 54 | "execution_count": 29, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "X.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 30, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "(1000,)" 72 | ] 73 | }, 74 | "execution_count": 30, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "y.shape" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 31, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])" 92 | ] 93 | }, 94 | "execution_count": 31, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "true_theta" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 32, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def J(theta, X_b, y):\n", 110 | " try:\n", 111 | " return np.sum((y - X_b.dot(theta))**2) / len(X_b)\n", 112 | " except:\n", 113 | " return float('inf')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 33, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# 数学推导的求梯度\n", 123 | "def derivative_J_math(theta, X_b, y):\n", 124 | " return X_b.T.dot(X_b.dot(theta) - y) * 2. 
/ len(y)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 34, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# 调试用的求梯度\n", 134 | "def derivative_J_debug(theta, X_b, y, epsilon=0.01):\n", 135 | " res = np.empty(len(theta))\n", 136 | " for i in range(len(theta)):\n", 137 | " theta_1 = theta.copy()\n", 138 | " theta_1[i] += epsilon\n", 139 | " theta_2 = theta.copy()\n", 140 | " theta_2[i] -= epsilon\n", 141 | " res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2*epsilon)\n", 142 | " return res\n", 143 | " " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 35, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# 批量梯度下降训练法\n", 153 | "def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):\n", 154 | " theta = initial_theta\n", 155 | " cur_iter = 0\n", 156 | " \n", 157 | " while cur_iter < n_iters:\n", 158 | " gradient = dJ(theta, X_b, y)\n", 159 | " last_theta = theta\n", 160 | " theta = theta - eta * gradient\n", 161 | " if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n", 162 | " break\n", 163 | " cur_iter += 1\n", 164 | " return theta" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 36, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "X_b = np.hstack((np.ones(shape=(len(X),1)),X))\n", 174 | "initial_theta = np.zeros(X_b.shape[1])\n", 175 | "eta = 0.01" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## 调试用梯度下降的效果" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 37, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "CPU times: user 6.97 s, sys: 2.86 s, total: 9.83 s\nWall time: 9.17 s\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,\n 5.05002117, 
5.90494046, 6.97383745, 8.00088367,\n 8.86213468, 9.98608331, 10.90529198])" 201 | ] 202 | }, 203 | "execution_count": 37, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "%time theta = gradient_descent(derivative_J_debug, X_b, y, initial_theta, eta)\n", 210 | "theta" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "## 数学方式梯度下降的效果" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 38, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "CPU times: user 833 ms, sys: 340 ms, total: 1.17 s\nWall time: 1.06 s\n" 230 | ] 231 | }, 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,\n 5.05002117, 5.90494046, 6.97383745, 8.00088367,\n 8.86213468, 9.98608331, 10.90529198])" 236 | ] 237 | }, 238 | "execution_count": 38, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "%time theta = gradient_descent(derivative_J_math, X_b, y, initial_theta, eta)\n", 245 | "theta" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "0.90497929349193762" 257 | ] 258 | }, 259 | "execution_count": 39, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "J(theta, X_b, y)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 40, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "0.91015768339662462" 277 | ] 278 | }, 279 | "execution_count": 40, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "J(true_theta,X_b,y)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | 
"source": [ 292 | "两者结果差不多,但调试用的方式慢许多" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [] 301 | } 302 | ], 303 | "metadata": { 304 | "kernelspec": { 305 | "display_name": "Python 2", 306 | "language": "python", 307 | "name": "python2" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 2 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython2", 319 | "version": "2.7.6" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 0 324 | } 325 | -------------------------------------------------------------------------------- /c3_gradient_descent/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c4_pca/07_MNIST.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# MNIST" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.datasets.mldata import fetch_mldata" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "mnist = fetch_mldata('MNIST original')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "{'COL_NAMES': ['label', 'data'],\n 'DESCR': 'mldata.org dataset: mnist-original',\n 'data': array([[0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n ..., 
\n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),\n 'target': array([ 0., 0., 0., ..., 9., 9., 9.])}" 40 | ] 41 | }, 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "mnist" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "X, y = mnist['data'],mnist['target']" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "(70000, 784)" 69 | ] 70 | }, 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "X.shape" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "X_train = np.array(X[:60000], dtype=float)\n", 87 | "y_train = np.array(y[:60000], dtype=float)\n", 88 | "X_test = np.array(X[60000:], dtype=float)\n", 89 | "y_test = np.array(y[60000:], dtype=float)\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "((60000, 784), (60000,), (10000, 784), (10000,))" 101 | ] 102 | }, 103 | "execution_count": 7, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "X_train.shape, y_train.shape, X_test.shape, y_test.shape" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## 使用kNN" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "CPU times: user 26.6 s, sys: 184 ms, total: 26.7 s\nWall time: 26.9 s\n" 129 | ] 130 | }, 131 | { 132 | "data": { 133 | 
"text/plain": [ 134 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform')" 135 | ] 136 | }, 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "from sklearn.neighbors.classification import KNeighborsClassifier\n", 144 | "\n", 145 | "knn_clf = KNeighborsClassifier()\n", 146 | "%time knn_clf.fit(X_train, y_train)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 9, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "CPU times: user 10min 29s, sys: 2.66 s, total: 10min 31s\nWall time: 10min 35s\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "0.96879999999999999" 165 | ] 166 | }, 167 | "execution_count": 9, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "%time knn_clf.score(X_test, y_test)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### 使用PCA进行降维" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 10, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "CPU times: user 22.8 s, sys: 1.13 s, total: 23.9 s\nWall time: 11.4 s\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "from sklearn.decomposition.pca import PCA\n", 198 | "\n", 199 | "pca = PCA(0.9)\n", 200 | "%time pca.fit(X_train)\n", 201 | "X_train_reduction = pca.transform(X_train)\n", 202 | "X_test_reduction = pca.transform(X_test)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "(60000, 87)" 214 | ] 215 | }, 216 | "execution_count": 11, 217 | "metadata": {}, 218 | "output_type": 
"execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "#784维的数据剩下多少维呢?\n", 223 | "X_train_reduction.shape" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "CPU times: user 510 ms, sys: 15 ms, total: 525 ms\nWall time: 649 ms\n" 236 | ] 237 | }, 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n weights='uniform')" 242 | ] 243 | }, 244 | "execution_count": 12, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "pca_knn_clf = KNeighborsClassifier()\n", 251 | "%time pca_knn_clf.fit(X_train_reduction, y_train)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 13, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "CPU times: user 1min 10s, sys: 313 ms, total: 1min 10s\nWall time: 1min 11s\n" 264 | ] 265 | }, 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "0.9728" 270 | ] 271 | }, 272 | "execution_count": 13, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "%time pca_knn_clf.score(X_test_reduction, y_test)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "可以看到,所需时间大幅下降。而且准确率竟然还稍微提升了 \n", 286 | "因为PCA在丢失信息的同时,会把一些噪音也丢掉,达到了降噪的目的" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 2", 300 | "language": "python", 301 | "name": "python2" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 2 307 | }, 
308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython2", 313 | "version": "2.7.6" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 0 318 | } 319 | -------------------------------------------------------------------------------- /c4_pca/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c5_polynomial_regression/06_Validation_and_Cross_Validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 交叉验证" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn import datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "digits = datasets.load_digits()\n", 29 | "X = digits.data\n", 30 | "y = digits.target" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## 测试train_test_split" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.model_selection._split import train_test_split\n", 47 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=666)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Best K = 3\nBest P = 4\nBest Score = 0.986091794159\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "from 
sklearn.neighbors.classification import KNeighborsClassifier\n", 65 | "\n", 66 | "best_score, best_p, best_k = 0, 0, 0\n", 67 | "for k in range(2, 11):\n", 68 | " for p in range(1, 6):\n", 69 | " knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)\n", 70 | " knn_clf.fit(X_train, y_train)\n", 71 | " score = knn_clf.score(X_test, y_test)\n", 72 | " if score > best_score:\n", 73 | " best_score, best_p, best_k = score, p, k\n", 74 | "print('Best K =', best_k)\n", 75 | "print('Best P =', best_p)\n", 76 | "print('Best Score =', best_score)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## 使用交叉验证" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "array([ 0.98895028, 0.97777778, 0.96629213])" 95 | ] 96 | }, 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "from sklearn.model_selection._validation import cross_val_score\n", 104 | "\n", 105 | "knn_clf = KNeighborsClassifier()\n", 106 | "cross_val_score(knn_clf, X_train, y_train)\n", 107 | "# 结果返回3个数,表示默认是分为3份做交叉验证" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "Best K = 2\nBest P = 2\nBest Score = 0.982359987401\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "best_score, best_p, best_k = 0, 0, 0\n", 125 | "for k in range(2, 11):\n", 126 | " for p in range(1, 6):\n", 127 | " knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)\n", 128 | " scores = cross_val_score(knn_clf, X_train, y_train)\n", 129 | " score = np.mean(scores)\n", 130 | " if score > best_score:\n", 131 | " best_score, best_p, best_k = score, p, k\n", 132 | "print('Best K =', best_k)\n", 133 | "print('Best P =', best_p)\n", 134 
| "print('Best Score =', best_score)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "cross_val_score(knn_clf, X_train, y_train) \n", 142 | "可以看到在使用交叉验证寻找最佳超参数的过程中,是完全不使用测试集的" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## 回顾网格搜索 \n", 150 | "网格搜索其实就是用了交叉验证" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "Fitting 3 folds for each of 45 candidates, totalling 135 fits\n" 163 | ] 164 | }, 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 1.1min finished\n" 170 | ] 171 | }, 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "GridSearchCV(cv=None, error_score='raise',\n estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=1, n_neighbors=10, p=5,\n weights='distance'),\n fit_params=None, iid=True, n_jobs=1,\n param_grid=[{'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],\n pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n scoring=None, verbose=1)" 176 | ] 177 | }, 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "from sklearn.model_selection._search import GridSearchCV\n", 185 | "param_grid = [\n", 186 | " {\n", 187 | " 'weights':['distance'],\n", 188 | " 'n_neighbors':[i for i in range(2,11)],\n", 189 | " 'p': [i for i in range(1,6)]\n", 190 | " }\n", 191 | "]\n", 192 | "\n", 193 | "grid_search = GridSearchCV(knn_clf, param_grid, verbose=1)\n", 194 | "grid_search.fit(X_train, y_train)\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | 
"data": { 204 | "text/plain": [ 205 | "0.98237476808905377" 206 | ] 207 | }, 208 | "execution_count": 9, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "grid_search.best_score_" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 10, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}" 226 | ] 227 | }, 228 | "execution_count": 10, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "grid_search.best_params_\n", 235 | "# 与我们上面手动调用交叉验证得到的超参数一致" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 11, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "0.98052851182197498" 247 | ] 248 | }, 249 | "execution_count": 11, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "best_knn_clf = grid_search.best_estimator_\n", 256 | "best_knn_clf.score(X_test, y_test)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 12, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/plain": [ 267 | "array([ 0.99543379, 0.96803653, 0.98148148, 0.96261682, 0.97619048])" 268 | ] 269 | }, 270 | "execution_count": 12, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "# cross_val_score 默认是分3份,如果要分5份:\n", 277 | "cross_val_score(knn_clf, X_train, y_train, cv=5)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# GridSearchCV 中的交叉验证,如果要分为5份:\n", 287 | "GridSearchCV(knn_clf, param_grid, verbose=1, cv=5)" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 2", 294 | "language": "python", 295 | "name": "python2" 296 | }, 297 
| "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 2 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython2", 307 | "version": "2.7.6" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 0 312 | } 313 | -------------------------------------------------------------------------------- /c5_polynomial_regression/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c6_logistic_regression/01_Sigmoid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Sigmoid函数" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## 绘制Sigmoid函数" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "def sigmoid(t):\n", 36 | " return 1/(1 + np.exp(-t))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 6, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3Xt8VPWd//HXJxcSIIQ74Q4qqOCdRBC1FcRSsK24u9TitrbWunTdtZdff/11td2f29rd36/d7uW3Pmrbta2rdW1RW22ppYVKQ7ValasoRCRCIOGSAEEghFxm5vP7YyY6prlMkpmcmcn7+XjMI+fyPWfeOXPymZPvnDnH3B0REckuOUEHEBGR5FNxFxHJQiruIiJZSMVdRCQLqbiLiGQhFXcRkSyk4i4ikoVU3EVEspCKu4hIFsoL6onHjBnj06dP79Wyp0+fZujQockNlATK1TPK1XPpmk25eqYvuTZv3nzU3cd229DdA3mUlpZ6b5WXl/d62VRSrp5Rrp5L12zK1TN9yQVs8gRqrLplRESykIq7iEgWUnEXEclCKu4iIllIxV1EJAt1W9zN7EEzqzOz1zqZb2Z2n5lVmtl2M5uT/JgiItITiRy5PwQs6WL+UmBm7LES+G7fY4mISF90+yUmd3/WzKZ30WQZ8KPY+ZcvmtkIM5vg7oeSlFFEslQk4jSHIjS1hmkKhQmFndZwhHDEaQ07oUiEUMQJtQ2HPTYv2iYUcSLuuIPjRCJEx4l+h+f16lYOvrT/XdPco20i/s64Ex1vW1cbjxt51/QOp/1p23fdxDSu8YgzYRb0cdt1x+LDd9ooWtyfdvcLO5j3NPANd/9DbHw98HfuvqmDtiuJHt1TUlJSumrVql6FbmhooKioqFfLppJy9Yxy9Vw6ZXN3TrfCW83O4RONhHIKaQw5ja1OY4jYz3eGW8JOawRawtAScVrC0BoJ+rfoXxb7edM5ztKZvXsdFy5cuNndy7pr16+XH3D3B4AHAMrKynzBggW9Ws+GDRvo7bKppFw9o1w915/ZwhHnwPEz7Ks/zf76RvYfa2R/fSMH3zpD3almjjY00xpuOzg0oPntZQfl5lA8OJ/iwjyGDc5n0sg8BufnUpifS2F+DoX5uQzOz6UgNt42Lz83h7wcIy/Xoj9zcsjNNfJzcsjNMfJzLfYzh7xcI9eMnBzDgBwzcswwI/YwXnrxj1x15ZUY0XGztnZgGJbDnyxLbLxN3CBxg1hsxrun/en8jvTH65iM4n4AmBI3Pjk2TUQyRHMozGsHTvBqzQleP3yKikMn2VV7iqa4Q+tBuTlMHjWYSSMGM2PcMMYVFzC2qICxwwo4+GYFi66eR/HgPIoL8ynMzw3wt3nHqMIcSooLg44RiGQU99XAnWa2CpgHnFB/u0h6awlF2FRVz3OVR9lUVc8rNSdoCUUL+cgh+cyaUMxfzp3GeeOLmDZ6KFNHDWF8cSE5OR0fjW44/gYzxqVHd5FEdVvczewnwAJgjJnVAP8A5AO4+/eANcD1QCXQCHwyVWFFpPdONrWybkct6ytqeW73URqaQ+TlGBdOGs4n5k+jdNooLp0ygpLigi67FCQzJHK2zM3dzHfgb5OWSESSJhxxNuyq48mtB3hmZy3NoQjjiwv50CUTWHjeOK6aMYahBYFd+VtSSK+qSBY62dTK4xurefiPVVTXn2HU0EGsuHwKN142iUunjNCR+QCg4i6SRU42tfKD5/by4B/20tAc4vLpI7l76SzeN7uE/FxdbWQgUXEXyQLNoTAPPV/Fd3//Jm81trL0wvH8zYIZXDR5eNDRJCAq7iIZ7oXKo/z9z19jz9HTXHPuWL64+DwVdVFxF8lUDc0hvrZ6B09srmHqqCE8fNtcrjm3+1trysCg4i6SgbbsP87nV22j5ngjf7PgHD67aGbafHFI0oOKu0gGcXce+WMVX/vlTkqKC3ns0/O5fPqooGNJGlJxF8kQLaEID+9
sYUP1Dq49fxz//pFLGT44P+hYkqZU3EUyQGNLiE8/spnnqkPcseAcvrj4PHI7uRSACKi4i6S9E2daue2hjWzdf5xPXTiIv1tyftCRJAOouIuksRONrdz8/RfZXXeK73x0DoVHdwUdSTKEvrImkqYaW0J88qGXqaxr4PsfL2PJhROCjiQZRMVdJA01h8J8+pHNbKt+i/tuvpQF540LOpJkGHXLiKQZd+fLT77Gc7uP8s/LL9YRu/SKjtxF0swP/7CXn22p4XOLZnJT2ZTuFxDpgIq7SBr5/RtH+D9rKlh64Xg+t2hm0HEkg6m4i6SJg2+d4bM/2cq5JcP4lw9f0ukt7UQSoeIukgbCEefzj20jFI7wvY+V6u5I0mfag0TSwP3llby8t55/u+kSpo8ZGnQcyQI6chcJ2Nb9x/mP9bu58dKJ/PmcyUHHkSyh4i4SoJZQhL/72XZKhhXw9RsvDDqOZBF1y4gE6DsbKnmjtoEHby1jWKGu8CjJoyN3kYDsrj3F/eWV3HDJRK49vyToOJJlVNxFAuDufPmpVykqyOMfPjQ76DiShVTcRQLw9PZDbKw6zpeWnM/oooKg40gWUnEX6WdNrWG+8evXmT2hWJcXkJRRcRfpZw88u4cDb53hHz40W3dTkpRRcRfpR7Unm/juhje5/qLxzDt7dNBxJIupuIv0o2//rpLWcIS7lswKOopkORV3kX5SXd/Iqo37+cjlU5g6ekjQcSTLJVTczWyJme0ys0ozu6uD+VPNrNzMtprZdjO7PvlRRTLbfet3Y2Z85lpdyldSr9vibma5wP3AUmA2cLOZtT8x9++Bx939MmAF8J1kBxXJZHuONPCzLTXccsU0xg8vDDqODACJHLnPBSrdfY+7twCrgGXt2jhQHBseDhxMXkSRzHff+t0U5OVyx4Jzgo4iA0Qi15aZBFTHjdcA89q1+Sqwzsw+AwwFrktKOpEsUF3fyC+3H+K2q6YzRl9Ykn5i7t51A7PlwBJ3vz02fgswz93vjGvzhdi6/tXM5gM/BC5090i7da0EVgKUlJSUrlq1qlehGxoaKCoq6tWyqaRcPTNQcj2ys5kN1SH+5ZrBjCzs2zkMA2WbJUs25lq4cOFmdy/rtqG7d/kA5gNr48bvBu5u12YHMCVufA8wrqv1lpaWem+Vl5f3etlUUq6eGQi5jpxq8nO/ssa/9MQrSVnfQNhmyZSNuYBN3k3ddveE+tw3AjPN7CwzG0T0A9PV7drsBxYBmNksoBA4ksC6RbLaQ89X0RKOsPKas4OOIgNMt8Xd3UPAncBaoILoWTE7zOxeM7sh1ux/An9lZq8APwFujb3DiAxYp5tD/OiPVSy9cDznjE2/rgHJbgndrMPd1wBr2k27J254J3BVcqOJZLYnt9RwsinE7e/RUbv0P31DVSQFIhHnoRequGTKCOZMHRl0HBmAVNxFUuAPlUd588hpbr1yWtBRZIBScRdJgYdeqGJMUQHXXzQh6CgyQKm4iyRZ1dHTlO+q46PzplKQlxt0HBmgVNxFkuy/X9xHrhkfnTc16CgygKm4iyRRcyjMk1sPsPiCEsYV6wJhEhwVd5Ek+u3OWupPt/CRy3XULsFScRdJosc2VjNpxGCunjEm6CgywKm4iyRJdX0jz+0+yk1lU3TjawmcirtIkjy+qRoz+HDZ5KCjiKi4iyRDKBzhiU01XHPuWCaOGBx0HBEVd5Fk+P0bRzh8sokVl08JOooIoOIukhQ/3VzD6KGDWDSrJOgoIoCKu0ifnTjTyvqKOj50yUTyc/UnJelBe6JIH/361UO0hCP8+ZxJQUcReZuKu0gfPbn1AGePHcpFk4YHHUXkbSruIn1Qc7yRl/fW82eXTsJM57ZL+lBxF+mDX2w7CMCyS9UlI+lFxV2kl9ydp7YeoGzaSKaOHhJ0HJF3UXEX6aUdB09SWdfAn+mDVElDKu4ivfTzrQfIzzU+oLstSRpScRfphUjEWfPqId47cywjhgwKOo7In1B
xF+mFrdVvcfBEEx+8REftkp5U3EV64VfbDzEoL4frdLkBSVMq7iI9FN8lM6wwP+g4Ih1ScRfpoa3Vxzl8sokPXqwuGUlfKu4iPfR0rEtm0axxQUcR6ZSKu0gPtHXJLDhXXTKS3lTcRXpgy/7j1J5s5gPqkpE0p+Iu0gPvdMnoLBlJbyruIglq65JZeN5Yigrygo4j0qWEiruZLTGzXWZWaWZ3ddLmJjPbaWY7zOzHyY0pErzN+49Td6qZ63W5AckA3R5+mFkucD/wPqAG2Ghmq919Z1ybmcDdwFXuftzMdBqBZJ11Ow4zKDeHa8/X7i3pL5Ej97lApbvvcfcWYBWwrF2bvwLud/fjAO5el9yYIsFyd9btrOXKGaN1loxkBHP3rhuYLQeWuPvtsfFbgHnufmdcm58DbwBXAbnAV939Nx2sayWwEqCkpKR01apVvQrd0NBAUVFRr5ZNJeXqmUzKVXMqwt8/f4ZbLxjEginBFfdM2mbpIBtzLVy4cLO7l3Xb0N27fADLgR/Ejd8CfLtdm6eBp4B84CygGhjR1XpLS0u9t8rLy3u9bCopV89kUq77nnnDp9/1tNeePNP/geJk0jZLB9mYC9jk3dRtd0+oW+YAMCVufHJsWrwaYLW7t7r7XqJH8TMTWLdIRli3s5bLpoxg3LDCoKOIJCSR4r4RmGlmZ5nZIGAFsLpdm58DCwDMbAxwLrAniTlFAnPwrTO8euAEiy8YH3QUkYR1W9zdPQTcCawFKoDH3X2Hmd1rZjfEmq0FjpnZTqAc+F/ufixVoUX602931gKweLa+uCSZI6FvYrj7GmBNu2n3xA078IXYQySrrNt5mBnjijh7bPp9MCfSGX1DVaQLJxpbeXFPvY7aJeOouIt04Xe7aglHXP3tknFU3EW6sG5HLSXFBVw8aXjQUUR6RMVdpBNNrWF+/8YR3je7hJwcCzqOSI+ouIt04vnKozS2hFk8W10yknlU3EU6sW5HLcMK8rji7NFBRxHpMRV3kQ6EI84zFbUsPH8cg/L0ZyKZR3utSAe27D/OsdMtLL5Ap0BKZlJxF+lA27Xbrzl3bNBRRHpFxV2kHde12yULqLiLtHOgwdl3rFFnyUhGU3EXaWdLXQgzuG62bqcnmUvFXaSdLbVhXbtdMp6Ku0icg2+doepkRNeSkYyn4i4SR9dul2yh4i4SZ93Ow0wcarp2u2Q8FXeRmLZrt88pSegeNiJpTcVdJKbt2u1zxuUGHUWkz1TcRWLart0+fbj+LCTzaS8WIXrt9g27jnDdrBJyTNdul8yn4i4C/GH3Uc60hnm/ToGULKHiLkL0LJlhhbp2u2QPFXcZ8ELhCM9U1HGtrt0uWUR7sgx4m/cdp/50iy4UJllFxV0GvHU7axmUl8M15+na7ZI9VNxlQHN31u44zNUzxlBUoC8vSfZQcZcBreLQKWqOn9G1ZCTrqLjLgLZu52HMYNEsFXfJLiruMqCt3VFL2bSRjB1WEHQUkaRScZcBq7q+kYpDJ3WWjGSlhIq7mS0xs11mVmlmd3XR7i/MzM2sLHkRRVJjXdu12y9Ql4xkn26Lu5nlAvcDS4HZwM1mNruDdsOAzwEvJTukSCqs3XGY88cPY9rooUFHEUm6RI7c5wKV7r7H3VuAVcCyDtp9Hfgm0JTEfCIpcayhmU1V9TpLRrJWIsV9ElAdN14Tm/Y2M5sDTHH3XyUxm0jKrK+oI+LoXqmStczdu25gthxY4u63x8ZvAea5+52x8Rzgd8Ct7l5lZhuAL7r7pg7WtRJYCVBSUlK6atWqXoVuaGigqCj9boOmXD0TZK5/39xEzakI/3LNYKzdJX7TdXtB+mZTrp7pS66FCxdudvfuP9d09y4fwHxgbdz43cDdcePDgaNAVezRBBwEyrpab2lpqfdWeXl5r5dNJeXqmaBynTjT4jO/vMbv/eWODuen6/ZyT99sytUzfckFbPJu6ra7J9QtsxGYaWZnmdkgYAWwOu7N4YS7j3H36e4
+HXgRuME7OHIXSQfrK2ppCUe4/qIJQUcRSZlui7u7h4A7gbVABfC4u+8ws3vN7IZUBxRJtl9tP8z44kIumzIi6CgiKZPQlZLcfQ2wpt20ezppu6DvsURS41RTK8/uPsJH500lJ0e305PspW+oyoCyvqKOllCED6hLRrKcirsMKGtePcT44kLmTB0ZdBSRlFJxlwGjoTnEhjeOsOTC8eqSkayn4i4DxvqKWlpCOktGBgYVdxkw1rx6iHHDCiibpi4ZyX4q7jIgnG4OsWHXEZaqS0YGCBV3GRDWv15Hs7pkZABRcZcB4RdbDzC+uJDLp48KOopIv1Bxl6xXf7qF379xhBsunaguGRkwVNwl6/3q1UOEIs6ySycGHUWk36i4S9b7xdYDzBxXxOwJxUFHEek3Ku6S1arrG9m07zg3XjbpT67bLpLNVNwlq61+5SAAN1yiLhkZWFTcJWu5Oz/feoCyaSOZMmpI0HFE+pWKu2StikOn2F3XoA9SZUBScZes9bMtNeTnGh+4WMVdBh4Vd8lKLaEIT209wHWzShg1dFDQcUT6nYq7ZKX1FbXUn27hpsunBB1FJBAq7pKVHttUzfjiQt47c2zQUUQCoeIuWefQiTM8+8YRlpdOJleXG5ABSsVdss6TWw4Qcfhw2eSgo4gERsVdskok4jy+qZorzh7FtNFDg44jEhgVd8kqL+2tZ9+xRj6iD1JlgFNxl6zy6Ev7KC7MY8kFuimHDGwq7pI16k428ZvXDnNT2RQGD8oNOo5IoFTcJWv8+OX9hCLOx66YFnQUkcCpuEtWaA1H+PFL+1lw3limj9EHqSIq7pIV1u44TN2pZj4+X0ftIqDiLlni4ReqmDpqCNecOy7oKCJpQcVdMt626rfYWHWcj8+fpm+kisSouEvGe+DZNxlWmMeKuVODjiKSNhIq7ma2xMx2mVmlmd3VwfwvmNlOM9tuZuvNTB2f0i+qjp7mN68d5mNXTKOoIC/oOCJpo9vibma5wP3AUmA2cLOZzW7XbCtQ5u4XAz8F/jnZQUU68oM/7CEvJ4dPXjk96CgiaSWRI/e5QKW773H3FmAVsCy+gbuXu3tjbPRFQFdskpQ71tDME5tquPGyiYwrLgw6jkhaMXfvuoHZcmCJu98eG78FmOfud3bS/tvAYXf/xw7mrQRWApSUlJSuWrWqV6EbGhooKirq1bKppFw909dcT+xqYc3eVv7p6sFMLErex0fpur0gfbMpV8/0JdfChQs3u3tZtw3dvcsHsBz4Qdz4LcC3O2n7MaJH7gXdrbe0tNR7q7y8vNfLppJy9Uxfch091eSz/vev/W8f3Zy8QDHpur3c0zebcvVMX3IBm7yb+uruJPIJ1AEg/hJ7k2PT3sXMrgO+Alzj7s0JrFek1x54bg9nWsN8btHMoKOIpKVE/pfdCMw0s7PMbBCwAlgd38DMLgP+E7jB3euSH1PkHUcbmvnRC/v40MUTmVkyLOg4Immp2+Lu7iHgTmAtUAE87u47zOxeM7sh1uxbQBHwhJltM7PVnaxOpM8eeHYPzaEwn9VRu0inEjox2N3XAGvaTbsnbvi6JOcS6VB1fSMPvVDFjZdOYsa49PugTCRd6BuqklG+tXYXBnzx/ecFHUUkram4S8bYVv0Wq185yO3vOYuJIwYHHUckram4S0Zwd/7x6Z2MKRrEHQtmBB1HJO2puEtGWP3KQTbtO87/eN+5uoaMSAJU3CXtnWhs5etP7+SSycNZcbmu/CiSCB0CSdr7xm9e53hjKw/fNlfXaxdJkI7cJa1tqqrnJy/v57arpnPBxOFBxxHJGCrukrYaW0J86afbmTRiMJ+/7tyg44hkFHXLSNr6p19VsPfYaR69fR5D9SGqSI/oyF3S0vqKWh59aT8r33M2V54zJug4IhlHxV3SzuETTXzpp9uZNaGYLyxWd4xIb6i4S1ppDoW549HNnGkNc9+KSynIyw06kkhGUkempJWv/XInW/e/xXc/OkeX8xXpAx2
5S9r47xf38eOX9nPHgnNYetGEoOOIZDQVd0kLa3cc5p5fvMa154/ji4t1xUeRvlJxl8BtrKrnsz/ZysWTR/Dtv7xM30IVSQIVdwnUlv3Hue2hjUwaOZgHb72cIYP0MZBIMugvSQKzqz7Mfb97iTHDCnjkU/MYNXRQ0JFEsoaO3CUQG3bV8a+bmygZXsjjn57PJN18QySpVNyl3z38QhW3PbSRkiE5PLZyPiXFhUFHEsk66paRftMcCvOPT1fwyIv7uG7WOJZPamDssIKgY4lkJR25S7/Ye/Q0f/HdF3jkxX2sfO/Z/OctZRTm6awYkVTRkbukVCTi/Pjl/fzfNRXk5+Xw/Y+X8b7ZJUHHEsl6Ku6SMrtrT/Hlp15lY9Vxrpoxmm8tv4SJ+uBUpF+ouEvS1Z5s4v898waPbaxmWGE+31p+MctLJ2OmbhiR/qLiLklTe7KJ/3q+iodfqCIUifDx+dP5zLUzGF2kD01F+puKu/TZjoMn+K/nq/jFtgOEI84HL57I/1x8LtNGDw06msiApeIuvXK0oZnV2w7y08017Dx0ksH5ufzl3KncdvVZKuoiaUDFXRLi7uw5eppndtbyTEUtm/cdJ+Jw0aThfO2GC1h26URGDNHlA0TShYq7dCgccfYcaWBj1XFe3nuMl/fWc/BEEwAXTCzmM9fO5PqLJnDeeN1QQyQdJVTczWwJ8B9ALvADd/9Gu/kFwI+AUuAY8BF3r0puVEkFd6fuVDNVR0/z5pHT7Dh4gp2HTvL6oVOcaQ0DMHZYAXPPGsUdZ49m0fnjdDqjSAbotribWS5wP/A+oAbYaGar3X1nXLNPAcfdfYaZrQC+CXwkFYElcaFwhBNnWqk+FWHDrjrqTjVz5FQztSebOHSiif3HGtlXf5qm1sjbywwrzGP2hGJWzJ3CBROHUzZtJNNGD9FpjCIZJpEj97lApbvvATCzVcAyIL64LwO+Ghv+KfBtMzN39yRmzViRiBOKOOGIE4pEYj+j463hd4+HwrHpkQitoQhNoQhnWsI0h8KcaQnT1BrmTGuEptbw24+G5jAnzrRysqmVk2dij6YQDc2hd0I8v/HtweGD8ykpLmDqqKG8Z+YYpo0ZyvTRQ5g+eiiTRw5WIRfJAokU90lAddx4DTCvszbuHjKzE8Bo4GgyQsZ7fGM1//5cI4M3b4hOcPDo89L2TuIOjkd/xr29tLVpm/9O27Z27ad53Ly453DinuudNuFwGHvm17y9JodQJEIkRW9xBXk5FObnMnRQLsWD8xk+OJ8po4YwfHA+xYXR8eGD8zhS/SbXzp/DuGGFjB1WQGF+bmoCiUja6NcPVM1sJbASoKSkhA0bNvR4HQfqQowfHCE/t+md9QJtB5sWN9Ew4o9Bzd6ZH39wam3Lxc/vaLzd89g7T4QBra3OoEG573rO3Jxccg1yDPIMcsyi4zm8PT3XINesw2mDcok+ctqGoz/zc6LrekcEaI494rTCmKImTu3dzingza42bj9raGjo1T6QaumaC9I3m3L1TL/kcvcuH8B8YG3c+N3A3e3arAXmx4bziB6xW1frLS0t9d4qLy/v9bKppFw9o1w9l67ZlKtn+pIL2OTd1G13T+iSvxuBmWZ2lpkNAlYAq9u1WQ18Ija8HPhdLISIiASg224Zj/ah30n06DwXeNDdd5jZvUTfQVYDPwQeMbNKoJ7oG4CIiAQkoT53d18DrGk37Z644Sbgw8mNJiIivaU7MYmIZCEVdxGRLKTiLiKShVTcRUSykIq7iEgWsqBORzezI8C+Xi4+hhRc2iAJlKtnlKvn0jWbcvVMX3JNc/ex3TUKrLj3hZltcveyoHO0p1w9o1w9l67ZlKtn+iOXumVERLKQiruISBbK1OL+QNABOqFcPaNcPZeu2ZSrZ1KeKyP73EVEpGuZeuQuIiJdSNvibmYfNrMdZhYxs7J28+42s0oz22Vm7+9k+bPM7KVYu8dilytOdsbHzGxb7FFlZts6aVdlZq/G2m1
Kdo4Onu+rZnYgLtv1nbRbEtuGlWZ2Vz/k+paZvW5m283sKTMb0Um7ftle3f3+ZlYQe40rY/vS9FRliXvOKWZWbmY7Y/v/5zpos8DMTsS9vvd0tK4UZOvydbGo+2Lba7uZzemHTOfFbYdtZnbSzD7frk2/bS8ze9DM6szstbhpo8zst2a2O/ZzZCfLfiLWZreZfaKjNj2SyEXfg3gAs4DzgA1AWdz02cArQAFwFtGbC+V2sPzjwIrY8PeAO1Kc91+BezqZVwWM6cdt91Xgi920yY1tu7OBQbFtOjvFuRYDebHhbwLfDGp7JfL7A38DfC82vAJ4rB9euwnAnNjwMOCNDnItAJ7ur/0p0dcFuB74NdEbk10BvNTP+XKBw0TPAw9kewHvBeYAr8VN+2fgrtjwXR3t98AoYE/s58jY8Mi+ZEnbI3d3r3D3XR3MWgascvdmd98LVBK9iffbLHqH52uJ3qwb4GHgxlRljT3fTcBPUvUcKfD2jc/dvQVou/F5yrj7Ondvu2v3i8DkVD5fNxL5/ZcR3Xcgui8tshTfPdzdD7n7ltjwKaCC6D2KM8Ey4Ece9SIwwswm9OPzLwLedPfefjmyz9z9WaL3tIgXvx91VoveD/zW3evd/TjwW2BJX7KkbXHvQkc37G6/848G3oorJB21Sab3ALXuvruT+Q6sM7PNsfvI9oc7Y/8aP9jJv4GJbMdUuo3oUV5H+mN7JfL7v+vG70Dbjd/7Rawb6DLgpQ5mzzezV8zs12Z2QT9F6u51CXqfWkHnB1hBbK82Je5+KDZ8GCjpoE3St12/3iC7PTN7BhjfwayvuPsv+jtPRxLMeDNdH7Vf7e4HzGwc8Fszez32Dp+SXMB3ga8T/WP8OtEuo9v68nzJyNW2vczsK0AIeLST1SR9e2UaMysCfgZ83t1Ptpu9hWjXQ0Ps85SfAzP7IVbavi6xz9RuIHqP5/aC2l5/wt3dzPrlFMVAi7u7X9eLxQ4AU+LGJ8emxTtG9F/CvNgRV0dtkpLRzPKAPwdKu1jHgdjPOjN7imiXQJ/+KBLddmb2feDpDmYlsh2TnsvMbgU+CCzyWGdjB+tI+vbqQCK/f1ubmtjrPJzovpVSZpZPtLA/6u5+OtfeAAAB6UlEQVRPtp8fX+zdfY2ZfcfMxrh7Sq+hksDrkpJ9KkFLgS3uXtt+RlDbK06tmU1w90Oxbqq6DtocIPrZQJvJRD9v7LVM7JZZDayInclwFtF34JfjG8SKRjnRm3VD9ObdqfpP4DrgdXev6WimmQ01s2Ftw0Q/VHyto7bJ0q6f8886eb5Ebnye7FxLgC8BN7h7Yydt+mt7peWN32N9+j8EKtz93zppM76t79/M5hL9O07pm06Cr8tq4OOxs2auAE7EdUekWqf/PQexvdqJ3486q0VrgcVmNjLWjbo4Nq33+uMT5N48iBalGqAZqAXWxs37CtEzHXYBS+OmrwEmxobPJlr0K4EngIIU5XwI+Ot20yYCa+JyvBJ77CDaPZHqbfcI8CqwPbZjTWifKzZ+PdGzMd7sp1yVRPsVt8Ue32ufqz+3V0e/P3Av0TcfgMLYvlMZ25fO7odtdDXR7rTtcdvpeuCv2/Yz4M7YtnmF6AfTV/ZDrg5fl3a5DLg/tj1fJe4stxRnG0q0WA+PmxbI9iL6BnMIaI3Vr08R/ZxmPbAbeAYYFWtbBvwgbtnbYvtaJfDJvmbRN1RFRLJQJnbLiIhIN1TcRUSykIq7iEgWUnEXEclCKu4iIllIxV1EJAupuIuIZCEVdxGRLPT/AWDnkQWjIdEtAAAAAElFTkSuQmCC\n", 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "metadata": {}, 52 | "output_type": "display_data" 53 | } 54 | ], 55 | "source": [ 56 | "x = np.linspace(-10,10,500)\n", 57 | "y = 
sigmoid(x)\n", 58 | "plt.plot(x,y)\n", 59 | "plt.grid(True)\n", 60 | "plt.show()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 2", 74 | "language": "python", 75 | "name": "python2" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 2 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython2", 87 | "version": "2.7.6" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 0 92 | } 93 | -------------------------------------------------------------------------------- /c6_logistic_regression/04_implement_logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 实现逻辑回归" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from sklearn import datasets\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "iris = datasets.load_iris()\n", 30 | "X = iris.data\n", 31 | "y = iris.target" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "((100, 2), (100,))" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "# 我们知道逻辑回归是解决2分类问题的,但鸢尾花数据集有3个分类,所以我们需要只取其中2个分类\n", 52 | "# 另外为了可视化,我们只取其中2个特征而不是全部特征\n", 53 | "X = X[y<2,:2]\n", 54 | "y = y[y<2]\n", 55
| "X.shape, y.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFzRJREFUeJzt3X2MXFd5x/Hf45kUMG+RyAqi+GUrgagAhRCvQigIhdhUIVjmD6iaaikNauXiDSW0VLw0UqpaQqhCokDBRiujKqndEhqgDSilDYEW+gep1iEJBNMqUDuJS5uNKUlTt6lsP/3j3sW7s7Mz98zMmTnnzPcjXe3MnZO7z7n3+sndc597xtxdAICybJp0AACA0SO5A0CBSO4AUCCSOwAUiOQOAAUiuQNAgUjuAFAgkjsAFIjkDgAFajdtaGYtSUuSTrr77o7Prpf0UUkn61WfcvdDvbZ30UUX+ezsbFCwADDtjh49+ri7z/Rr1zi5S7pR0jFJz9vg89vc/d1NNzY7O6ulpaWAXw8AMLMTTdo1GpYxsy2S3iyp59U4ACANTcfcPy7p/ZLO9WjzVjN7wMxuN7Ot3RqY2V4zWzKzpeXl5dBYAQAN9U3uZrZb0mPufrRHsy9LmnX3SyXdJemWbo3cfdHd59x9bmam75ARAGBATa7cXytpj5kdl/Q5SVeb2eHVDdz9lLs/Xb89JGnHSKMEAATpm9zd/UPuvsXdZyVdJ+nr7v721W3M7OJVb/eouvEKAJiQkGqZNcxsv6Qld79D0nvMbI+kM5J+Iun60YQHABhE0ENM7v73KzXu7n5zndhXru5f7u6vdPc3uPsPYgQLTMSRI9LsrLRpU/XzyJFJRwT0NfCVOzAVjhyR9u6VTp+u3p84Ub2XpPn5ycUF9MH0A0AvN910PrGvOH26Wg8kjOQO9PLww2HrgUSQ3IFetm0LWw8kguQO9PLhD0ubN69dt3lztR5IGMkd6GV+XlpclLZvl8yqn4uL3ExF8qiWAfqZnyeZIztcuQNAgUjuAFAgkjsAFIjkDgAFIrkDQIFI7gBQIJI7ABSI5A4ABSK5A0CBSO4oB1+qAfwM0w+gDHypBrAGV+4oA1+qAaxBckcZ+FINYA2SO8rAl2oAa5DcUQa+VANYg+SOMvClGsAaVMugHHypBvAzXLljeNSXA8nhyh3Dob4cSBJX7hgO9eVAkkjuGA715UCSSO4YDvXlQJJI7hgO9eVAkkjuGA715UCSGlfLmFlL0pKkk+6+u+OzZ0i6VdIOSack/Yq7Hx9hnEgZ9eVAckKu3G+UdGyDz35D0n+6+4sl/bGkPxo2MCBL1PwjEY2Su5ltkfRmSYc2aPIWSbfUr2+XtNPMbPjwgIys1PyfOCG5n6/5J8FjAppeuX9c0vslndvg80skPSJJ7n5G0hOSXjB0dEBOqPlHQvomdzPbLekxdz867C8zs71mtmRmS8vLy8NuDkgLNf9ISJMr99dK2mNmxyV9TtLVZna4o81JSVslyczakp6v6sbqGu6+6O5z7j43MzMzVOBAcqj5R0L6Jnd3/5C7b3H3WUnXSfq6u7+9o9kdkn69fv22uo2PNFIgddT8IyED17mb2X4z21O//aykF5jZQ5J+V9IHRxEckBVq/pEQm9QF9tzcnC8tLU3kdwNArszsqLvP9WvHE6pI18KC1G5XV8HtdvUeQCPM5440LSxIBw+ef3/27Pn3Bw5MJiYgI1y5I02Li2HrAaxBckeazp4NWw9gDZI70tRqha0HsAbJHWla+R7WpusBrMENVaRp5abp4mI1FNNqVYmdm6lA
IyR3pOvAAZI5MCCGZdDdrl1VffnKsmvXpCOaHOZoR4ZI7lhv1y7p7rvXrrv77ulM8MzRjkwx/QDW6/U9K9M2H9zsbJXQO23fLh0/Pu5oAKYfAEaCOdqRKZI70AtztCNTJHest3Nn2PqSMUc7MkVyx3pf+9r6RL5zZ7V+2jBHOzLFDVUAyAg3VDGcWLXdIdulvhwYGE+oYr2V2u7Tp6v3K7Xd0nDDESHbjRUDMCUYlsF6sWq7Q7ZLfTnQFcMyGFys2u6Q7VJfDgyF5I71YtV2h2yX+nJgKCR3rBertjtku9SXA0MhuWO9WLXdIdulvhwYCjdUASAj3FCNLcca7BxjBjAQ6twHkWMNdo4xAxgYwzKDyLEGO8eYAazDsExMOdZg5xgzgIGR3AeRYw12jjEDGBjJfRA51mDnGDOAgZHcB5FjDXaOMQMYWN8bqmb2TEnflPQMVdU1t7v7H3S0uV7SRyWdrFd9yt0P9dpu1jdUAWBCRnlD9WlJV7v7KyVdJukaM7uyS7vb3P2yeumZ2DEhCwtSu11dubfb1ftRtE2lfj6VOIAE9K1z9+rS/qn67QX1Mpn6SQxuYUE6ePD8+7Nnz78/cGDwtqnUz6cSB5CIRnXuZtaSdFTSiyV92t0/0PH59ZI+ImlZ0r9I+h13f6TXNhmWGbN2u0rSnVot6cyZwdumUj+fShxAZCOtc3f3s+5+maQtkq4ws1d0NPmypFl3v1TSXZJu2SCovWa2ZGZLy8vLTX41RqVbst5ofUjbVOrnU4kDSERQtYy7/1TSNyRd07H+lLs/Xb89JGnHBv/9orvPufvczMzMIPFiUK1W8/UhbVOpn08lDiARfZO7mc2Y2YX162dJeqOkH3S0uXjV2z2Sjo0ySIzAyvhzk/UhbVOpn08lDiAV7t5zkXSppO9IekDS9yTdXK/fL2lP/fojkh6UdL+qK/tf6LfdHTt2OMZs3z73Vstdqn7u2zeatocPu2/f7m5W/Tx8eNSRN5NKHEBEkpa8T351dyYOA4CcMHFYbLFqqkPqy2NuO6R/Oe6LzFDCj2BNLu9jLFkPyxw+7L55czVksbJs3jz8MMC+fWu3ubL0GhKJse2Q/uW4LzITaxcjT2JYJqJYNdUh9eUxtx3Svxz3RWYo4cdqTYdlSO6D2LSpuoDqZCadOzf4ds02/mzY4xSy7ZD+5bgvMhNrFyNPjLnHFKumOqS+POa2Q/qX477IDCX8GATJfRCxaqpD6stjbjukfznui8xQwo+BNBmYj7FkfUPVPV5NdUh9ecxth/Qvx32RGUr4sULcUAWA8jDmjvVSqF1H1jgt8tF3PncUImS+c+ZGRxecFnlhWGZapFC7jqxxWqSBYRmsFTLfOXOjowtOi7yQ3KdFCrXryBqnRV5I7tMihdp1ZI3TIi8k92kxPy8tLlYDpGbVz8XF7nfCQtpianBa5IUbqgCQEW6orohVmBuy3VTmJadIOSmlH47S+xdiIvuiyWOsMZaxTD8QayLskO2mMi85k4InpfTDUXr/Qox6X4jpBxSvMDdku6nMS06RclJKPxyl9y/EqPcF87lL8SbCDtluKvOSMyl4Uko/HKX3L8So9wVj7lK8wtyQ7aYyLzlFykkp/XCU3r8Qk9oXZSf3WIW5IdtNZV5yipSTUvrhKL1/ISa2L5oMzMdYxjafe6yJsEO2m8q85EwKnpTSD0fp/Qsxyn0hbqgCQHkYc48thfr5XbuquzIry65do4kBKEisx0ySr+NvcnkfY8n6a/ZSqJ/fubN7/fzOncPFABQk1mMmk6zjF8MyEaVQP59KiSWQsFiPmUyyjp9hmZhiTWzNhNnASHVL7L3WN5XDP1WS+yBSqJ8H0Fesx0xy+KdKch9ECvXzO3d238ZG64EpFOsxkyzq+JsMzMdYsr6h6p5G/XznTVVupgLrxHrMZFJ1/OKGKgCUZ2Q3VM3smWb2T2Z2v5k9aGZ/2KXNM8zsNjN7yMzuMbPZwcJuILS4NPli1A4hRbmF
74uY4cbczU3F7F9mhzpI4af96PS7tJdkkp5Tv75A0j2SruxosyDpM/Xr6yTd1m+7Aw3LhBaX5japdEhRbuH7Ima4MXdzUzH7l9mhDlL4ad+IGg7LBI2TS9os6V5Jr+5Y/7eSXlO/bkt6XPV0whstAyX37du7/6vcvn007SdtZWCwc2m11rctfF/EDDfmbm4qZv8yO9RBCj/tG2ma3BuNuZtZS9JRSS+W9Gl3/0DH59+TdI27P1q//2H9P4DHO9rtlbRXkrZt27bjRLenAHoJnRg5t0mlQx5MKnxfxAw35m5uKmb/MjvUQQo/7RsZ6UNM7n7W3S+TtEXSFWb2ikGCcvdFd59z97mZmZnwDYQWl+ZQjLpaSFFu4fsiZrgxd3NTMfuX2aEOUvhpP1JBde7u/lNJ35B0TcdHJyVtlSQza0t6vqRTowhwjdDi0iyKUVcJKcotfF/EDDfmbm4qZv8yO9RBCj/tR6vfuI2kGUkX1q+fJelbknZ3tLlBa2+ofr7fdgeucw8tLs1tUumQotzC90XMcGPu5qZi9i+zQx2k8NO+L41qzN3MLpV0i6SWqiv9z7v7fjPbX/+SO8zsmZL+TNKrJP1E0nXu/qNe26XOHQDCNR1zb/dr4O4PqEranetvXvX6fyX9cmiQAIA4yp9bZmqfYEAvIadFCqdQzAd3cntIK4XjkYUmYzcxlrHMLVPiEwwYWshpkcIpFPPBndwe0krheEyamFtGk51RH8kKOS1SOIVCY0ihf7ltNydNx9zLTu4lPsGAoYWcFimcQjEf3MntIa0Ujsek8U1M0nQ/wYANhZwWKZxCMR/cye0hrRSORy7KTu5T/QQDNhJyWqRwCsV8cCe3h7RSOB7ZaDIwH2MZ25d1lPYEA0Yi5LRI4RSK+eBObg9ppXA8JkncUAWA8jDmDoxIyBd7pCK3mFOpXU8ljpFocnkfY8n+O1QxFUK+2CMVucWcSu16KnH0I4ZlgOG129LZs+vXt1rSmTPjj6eJ3GJOpXY9lTj6YVgGGIFuSbLX+hTkFvPDD4etLz2OUSG5Az2EfLFHKnKLOZXa9VTiGBWSO9BDyBd7pCK3mFOpXU8ljpFpMjAfY+GGKnIR8sUeqcgt5lRq11OJoxdxQxUAysMNVYxNjrXBsWKOVV+e4z7GhDW5vI+xMCxThlxqg1eLFXOs+vIc9zHiEcMyGIdcaoNXixVzrPryHPcx4mFYBmORY21wrJhj1ZfnuI8xeSR3DCXH2uBYMceqL89xH2PySO4YSo61wbFijlVfnuM+RgKaDMzHWLihWo4caoM7xYo5Vn15jvsYcYgbqgBQHm6oYurEqgUP2S716EhFe9IBAKNw5Eg1tn36dPX+xInzY93z8+PZbqwYgEEwLIMixKoFD9ku9egYB4ZlMFVi1YKHbJd6dKSE5I4ixKoFD9ku9ehICckdRYhVCx6yXerRkRKSO4owPy8tLlbj22bVz8XF4W9khmw3VgzAIPreUDWzrZJulfRCSS5p0d0/0dHmKkl/Lelf61VfdPf9vbbLDVUACDfKG6pnJL3P3V8m6UpJN5jZy7q0+5a7X1YvPRM70pdjvTb16PGx3zLS5DHW1YuqK/Q3dqy7StJXQrbD9APpynH+8JCYc+xfCthvaVCM6QfMbFbSNyW9wt2fXLX+KklfkPSopH+T9Hvu/mCvbTEsk64c67WpR4+P/ZaGpsMyjZO7mT1H0j9I+rC7f7Hjs+dJOufuT5nZtZI+4e4v6bKNvZL2StK2bdt2nOh2pmDiNm2qrss6mUnnzo0/niZCYs6xfylgv6VhpA8xmdkFqq7Mj3Qmdkly9yfd/an69Z2SLjCzi7q0W3T3OXefm5mZafKrMQE51mtTjx4f+y0vfZO7mZmkz0o65u4f26DNi+p2MrMr6u2eGmWgGJ8c67WpR4+P/ZaZfoPykl6nqgTyAUn31cu1kt4l6V11m3dLelDS/ZK+LekX+22XG6ppy3H+8JCYc+xfCthvkyfmcweA
8jBx2BSg5nithQWp3a5u8LXb1XtgWjGfe6aYO3ythQXp4MHz78+ePf/+wIHJxARMEsMymaLmeK12u0ronVot6cyZ8ccDxMKwTOGYO3ytbom913qgdCT3TFFzvFarFbYeKB3JPVPUHK+1cr+h6XqgdCT3TDF3+FoHDkj79p2/Um+1qvfcTMW04oYqAGSEG6qDKLxwvPDuFd+/FLCPM9LkMdYYS3LTDxQ+WXXh3Su+fylgH6dBTD8QqPDC8cK7V3z/UsA+TsPI53MfteSSe+GTVRfeveL7lwL2cRoYcw9VeOF44d0rvn8pYB/nheS+ovDC8cK7V3z/UsA+zgvJfUXhheOFd6/4/qWAfZwXxtwBICOMuQMFiVlfTu16mZjPHUhczLn7+V6AcjEsAyQuZn05tev5YVgGKETMufv5XoBykdyBxMWsL6d2vVwkdyBxMevLqV0vF8kdSFzM+nJq18vFDVUAyAg3VAFgipHcAaBAJHcAKBDJHQAKRHIHgAKR3AGgQCR3ACgQyR0ACtQ3uZvZVjP7hpl938weNLMbu7QxM/ukmT1kZg+Y2eVxwsUwmLcbmB5N5nM/I+l97n6vmT1X0lEzu8vdv7+qzZskvaReXi3pYP0TiWDebmC69L1yd/cfu/u99ev/knRM0iUdzd4i6VavfFvShWZ28cijxcBuuul8Yl9x+nS1HkB5gsbczWxW0qsk3dPx0SWSHln1/lGt/x+AzGyvmS2Z2dLy8nJYpBgK83YD06Vxcjez50j6gqT3uvuTg/wyd1909zl3n5uZmRlkExgQ83YD06VRcjezC1Ql9iPu/sUuTU5K2rrq/ZZ6HRLBvN3AdGlSLWOSPivpmLt/bINmd0h6R101c6WkJ9z9xyOME0Ni3m5gujSplnmtpF+T9F0zu69e9/uStkmSu39G0p2SrpX0kKTTkt45+lAxrPl5kjkwLfomd3f/R0nWp41LumFUQQEAhsMTqgBQIJI7ABSI5A4ABSK5A0CBSO4AUCCSOwAUiOQOAAWyqkR9Ar/YbFnSiYn88v4ukvT4pIOIiP7lq+S+SfSvie3u3ndyrokl95SZ2ZK7z006jljoX75K7ptE/0aJYRkAKBDJHQAKRHLvbnHSAURG//JVct8k+jcyjLkDQIG4cgeAAk11cjezlpl9x8y+0uWz681s2czuq5ffnESMwzCz42b23Tr+pS6fm5l90sweMrMHzOzyScQ5iAZ9u8rMnlh1/G6eRJyDMrMLzex2M/uBmR0zs9d0fJ7tsZMa9S/b42dmL10V931m9qSZvbejTfTj1+TLOkp2o6Rjkp63wee3ufu7xxhPDG9w943qat8k6SX18mpJB+ufuejVN0n6lrvvHls0o/UJSV9197eZ2c9J6viSxOyPXb/+SZkeP3f/Z0mXSdUFpKqvHP1SR7Pox29qr9zNbIukN0s6NOlYJugtkm71yrclXWhmF086qGlnZs+X9HpVX28pd/8/d/9pR7Nsj13D/pVip6QfunvnA5vRj9/UJndJH5f0fknnerR5a/0n0+1mtrVHu1S5pL8zs6NmtrfL55dIemTV+0frdTno1zdJeo2Z3W9mf2NmLx9ncEP6eUnLkv60HjY8ZGbP7miT87Fr0j8p3+O32nWS/qLL+ujHbyqTu5ntlvSYux/t0ezLkmbd/VJJd0m6ZSzBjdbr3P1yVX8C3mBmr590QCPUr2/3qnpM+5WS/kTSX407wCG0JV0u6aC7v0rSf0v64GRDGqkm/cv5+EmS6uGmPZL+chK/fyqTu6ov/d5jZsclfU7S1WZ2eHUDdz/l7k/Xbw9J2jHeEIfn7ifrn4+pGvO7oqPJSUmr/yLZUq9LXr++ufuT7v5U/fpOSReY2UVjD3Qwj0p61N3vqd/frioZrpbtsVOD/mV+/Fa8SdK97v4fXT6LfvymMrm7+4fcfYu7z6r6s+nr7v721W06xr/2qLrxmg0ze7aZPXfltaRfkvS9jmZ3SHpHfef+SklPuPuPxxxq
sCZ9M7MXmZnVr69Qda6fGnesg3D3f5f0iJm9tF61U9L3O5pleeykZv3L+fit8qvqPiQjjeH4TXu1zBpmtl/SkrvfIek9ZrZH0hlJP5F0/SRjG8ALJX2p/vfRlvTn7v5VM3uXJLn7ZyTdKelaSQ9JOi3pnROKNVSTvr1N0j4zOyPpfyRd53k9sffbko7Uf9r/SNI7Czl2K/r1L+vjV190vFHSb61aN9bjxxOqAFCgqRyWAYDSkdwBoEAkdwAoEMkdAApEcgeAApHcAaBAJHcAKBDJHQAK9P9IUj1h6gimcQAAAABJRU5ErkJggg==\n", 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "metadata": {}, 71 | "output_type": "display_data" 72 | } 73 | ], 74 | "source": [ 75 | "# 分类0的散点图\n", 76 | "plt.scatter(X[y==0,0], X[y==0,1], color='red')\n", 77 | "\n", 78 | "# 分类1的散点图\n", 79 | "plt.scatter(X[y==1,0], X[y==1,1], color='blue')\n", 80 | "plt.show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## 使用我们自己编写的逻辑回归" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from playML.model_selection import train_test_split\n", 97 | "\n", 98 | "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "LogisticRegression()" 110 | ] 111 | }, 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "from playML.logistic_regression import LogisticRegression\n", 119 | "\n", 120 | "log_reg = LogisticRegression()\n", 121 | "log_reg.fit(X_train, y_train)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "1.0" 133 | ] 134 | }, 135 | "execution_count": 7, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "log_reg.score(X_test, y_test)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | 
"评分结果不错,不过当然是因为我们的数据很简单" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 8, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "array([ 0.92972035, 0.98664939, 0.14852024, 0.17601199, 0.0369836 ,\n 0.0186637 , 0.04936918, 0.99669244, 0.97993941, 0.74524655,\n 0.04473194, 0.00339285, 0.26131273, 0.0369836 , 0.84192923,\n 0.79892262, 0.82890209, 0.32358166, 0.06535323, 0.20735334])" 160 | ] 161 | }, 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "log_reg.predict_proba(X_test)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])" 180 | ] 181 | }, 182 | "execution_count": 9, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "y_test" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 10, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])" 200 | ] 201 | }, 202 | "execution_count": 10, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "log_reg.predict(X_test)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 2", 222 | "language": "python", 223 | "name": "python2" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 2 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | 
"pygments_lexer": "ipython2", 235 | "version": "2.7.6" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 0 240 | } 241 | -------------------------------------------------------------------------------- /c6_logistic_regression/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c6_logistic_regression/plot_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | def plot_decision_boundary(model, axis): 6 | """绘制不规则决策边界""" 7 | x0, x1 = np.meshgrid( 8 | np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(1, -1), 9 | np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(1, -1) 10 | ) 11 | X_new = np.c_[x0.ravel(), x1.ravel()] 12 | 13 | y_predict = model.predict(X_new) 14 | zz = y_predict.reshape(x0.shape) 15 | 16 | from matplotlib.colors import ListedColormap 17 | custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9']) 18 | 19 | plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap) 20 | -------------------------------------------------------------------------------- /c7_classification_performance_measures/03_implement_confusion_matrix_precision_and_recall.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 实现混淆矩阵,精准率和召回率" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn import datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "digits = 
datasets.load_digits()\n", 29 | "X = digits.data\n", 30 | "y = digits.target.copy()\n", 31 | "\n", 32 | "# 把数据变为极度偏斜的数据\n", 33 | "# 把手写数字分为9和非9两大类, 重点关注的是分类为9的数字\n", 34 | "y[digits.target==9] = 1\n", 35 | "y[digits.target!=9] = 0" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from sklearn.model_selection._split import train_test_split\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "0.97555555555555551" 57 | ] 58 | }, 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "from sklearn.linear_model.logistic import LogisticRegression\n", 66 | "\n", 67 | "log_reg = LogisticRegression()\n", 68 | "log_reg.fit(X_train, y_train)\n", 69 | "log_reg.score(X_test, y_test)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "虽然0.975555555551看上去很高了,但因为我们的数据是极度偏斜的数据,即使我们把全部分类预测为\"非9\"也会有0.9左右的正确率" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "y_predict = log_reg.predict(X_test)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## 求TP,FP,FN,TN的值" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "403" 104 | ] 105 | }, 106 | "execution_count": 7, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "def TN(y_true, y_predict):\n", 113 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n", 114 | " return np.sum((y_true == 0) & (y_predict == 0))\n", 115 | "\n", 116 | 
"TN(y_test, y_predict)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "2" 128 | ] 129 | }, 130 | "execution_count": 8, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "def FP(y_true, y_predict):\n", 137 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n", 138 | " return np.sum((y_true == 0) & (y_predict == 1))\n", 139 | "\n", 140 | "FP(y_test, y_predict)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "9" 152 | ] 153 | }, 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "def FN(y_true, y_predict):\n", 161 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n", 162 | " return np.sum((y_true == 1) & (y_predict == 0))\n", 163 | "\n", 164 | "FN(y_test, y_predict)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 10, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "36" 176 | ] 177 | }, 178 | "execution_count": 10, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "def TP(y_true, y_predict):\n", 185 | " assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'\n", 186 | " return np.sum((y_true == 1) & (y_predict == 1))\n", 187 | "\n", 188 | "TP(y_test, y_predict)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 12, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "array([[403, 2],\n [ 9, 36]])" 200 | ] 201 | }, 202 | "execution_count": 12, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "def 
confusion_matrix(y_true, y_predict):\n", 209 | " \"\"\"返回一个2✖️2的混淆矩阵\"\"\"\n", 210 | " return np.array([\n", 211 | " [TN(y_true, y_predict), FP(y_true, y_predict)],\n", 212 | " [FN(y_true, y_predict), TP(y_true, y_predict)]\n", 213 | " ])\n", 214 | "\n", 215 | "confusion_matrix(y_test, y_predict)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "## 根据混淆矩阵求精准率和召回率" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "0.94736842105263153" 234 | ] 235 | }, 236 | "execution_count": 13, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "def precision_score(y_true, y_predict):\n", 243 | " \"\"\"求精准率\"\"\"\n", 244 | " tp = TP(y_true, y_predict)\n", 245 | " fp = FP(y_true, y_predict)\n", 246 | " try:\n", 247 | " return tp / (tp + fp)\n", 248 | " except: # 分母为0时,结果返回0\n", 249 | " return 0.0\n", 250 | "\n", 251 | "# 精准率\n", 252 | "precision_score(y_test, y_predict)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 14, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "0.80000000000000004" 264 | ] 265 | }, 266 | "execution_count": 14, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "def recall_score(y_true, y_predict):\n", 273 | " \"\"\"求召回率\"\"\"\n", 274 | " tp = TP(y_true, y_predict)\n", 275 | " fn = FN(y_true, y_predict)\n", 276 | " try:\n", 277 | " return tp / (tp + fn)\n", 278 | " except:\n", 279 | " return 0.0\n", 280 | "\n", 281 | "# 召回率\n", 282 | "recall_score(y_test, y_predict)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "# scikit-learn中的混淆矩阵,精准率和召回率" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "混淆矩阵" 297 | 
] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 15, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([[403, 2],\n [ 9, 36]])" 308 | ] 309 | }, 310 | "execution_count": 15, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "import sklearn.metrics.classification as classification\n", 317 | "classification.confusion_matrix(y_test, y_predict)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "精准率" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 16, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "0.94736842105263153" 336 | ] 337 | }, 338 | "execution_count": 16, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "classification.precision_score(y_test, y_predict)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "召回率" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 17, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "0.80000000000000004" 363 | ] 364 | }, 365 | "execution_count": 17, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "classification.recall_score(y_test, y_predict)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": "Python 2", 385 | "language": "python", 386 | "name": "python2" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 2 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": 
"python", 397 | "pygments_lexer": "ipython2", 398 | "version": "2.7.6" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 0 403 | } 404 | -------------------------------------------------------------------------------- /c7_classification_performance_measures/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /c8_svm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /playML/PCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | class PCA(object): 6 | def __init__(self, n_components): 7 | assert n_components >= 1, "n_components必须大于等于1" 8 | self.n_components = n_components 9 | self.components_ = None 10 | 11 | def fit(self, X, eta=0.01, n_iters=1e4): 12 | """ 13 | 获得数据集X的前n个主成分 14 | :param X: 15 | :param eta: 16 | :param n_iters: 17 | :return: 18 | """ 19 | assert self.n_components <= np.shape(X)[1], 'n_components must not be greater than the feature number of X' 20 | 21 | def demean(X): 22 | return X - np.mean(X, axis=0) 23 | 24 | def f(w, X): 25 | """效用函数""" 26 | return np.sum((X.dot(w) ** 2))/len(X) 27 | 28 | def derivative_f(w, X): 29 | """求梯度""" 30 | return X.T.dot(X.dot(w))*2./len(X) 31 | 32 | def direction(w): 33 | return w/np.linalg.norm(w) 34 | 35 | def first_component(X, initial_w, eta=0.01, n_iters=1e4, epsilon=1e-8): 36 | w = direction(initial_w) 37 | cur_iter = 0 38 | 39 | while cur_iter < n_iters: 40 | gradient = derivative_f(w, X) 41 | last_w = w 42 | w = w + eta * gradient 43 | w = direction(w) 44 | if (abs(f(w, X) - f(last_w, X)) < epsilon): 45 | break 46 | cur_iter += 1 47 | return w 48 | 49 | X_pca = demean(X) 50 | self.components_ = 
np.empty(shape=(self.n_components, np.shape(X)[1])) 51 | for i in range(self.n_components): 52 | initial_w = np.random.random(X_pca.shape[1]) 53 | w = first_component(X_pca, initial_w, eta, n_iters) 54 | self.components_[i,:] = w 55 | X_pca = X_pca - X_pca.dot(w).reshape(-1,1)*w 56 | return self 57 | 58 | def transform(self, X): 59 | """将给定的X,映射到各个主成分分量中""" 60 | assert np.shape(X)[1] == np.shape(self.components_)[1] 61 | return X.dot(self.components_.T) 62 | 63 | def inverse_transform(self, X): 64 | """将给定的X,反向映射回原来的特征空间""" 65 | assert np.shape(X)[1] == np.shape(self.components_)[0] 66 | return X.dot(self.components_) 67 | 68 | def __repr__(self): 69 | return 'PCA(n_components=%d)' %self.n_components 70 | -------------------------------------------------------------------------------- /playML/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /playML/linear_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from playML.metrics import r2_score 4 | from c2_linear_regression import linear_regression 5 | 6 | class LinearRegression(linear_regression.LinearRegression): 7 | def fit_gd(self, X_train:np.ndarray, y_train:np.ndarray, eta=0.01, n_iters=1e4): 8 | """ 9 | 根据训练数据集X_train和y_train,使用梯度下降法训练线性回归模型 10 | :param X_train: 11 | :param y_train: 12 | :param eta: 13 | :param n_iters: 14 | :return: 15 | """ 16 | assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记' 17 | 18 | def J(theta, X_b, y): 19 | """ 20 | 给定θ,特征矩阵X,标记向量y,根据损失函数得出其(损失)值 21 | :param theta: 22 | :param X_b: 23 | :param y: 24 | :return: 25 | """ 26 | 27 | # 分子部分其实等价于 (y - X_b.dot(theta)).T.dot(y - X_b.dot(theta)) 28 | try: 29 | return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b) 30 | except: 31 | return float('inf') # 防止溢出?有异常直接返回最大值 32 | 33 | def 
derivative_J(theta: np.ndarray, X_b: np.ndarray, y: np.ndarray): 34 | """ 35 | 求θ为给定值时的导数 36 | :param theta: 37 | :param X_b: 38 | :param y: 39 | :return: 40 | """ 41 | 42 | # res = np.empty(len(theta)) 43 | # res[0] = np.sum(X_b.dot(theta) - y) 44 | # for i in range(1, len(theta)): 45 | # res[i] = (X_b.dot(theta) - y).dot(X_b[:, i]) 46 | # return res * 2 / len(X_b) 47 | 48 | # 改为向量的形式 49 | return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b) 50 | 51 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=5, epsilon=1e-8): 52 | theta = initial_theta 53 | iters = 0 54 | while iters < n_iters: 55 | gradient = derivative_J(theta, X_b, y) 56 | last_theta = theta 57 | theta = theta - eta * gradient 58 | 59 | if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon): 60 | break 61 | iters += 1 62 | return theta 63 | 64 | X_b = np.hstack((np.ones((len(X_train), 1)), X_train)) 65 | initial_theta = np.zeros(X_b.shape[1]) # 初始的θ向量都是0 66 | self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters) 67 | self.interception_ = self._theta[0] 68 | self.coef_ = self._theta[1:] 69 | 70 | return self 71 | 72 | def fit_sgd(self, X_train, y_train, n_iters=1e4, t0=5, t1=50): 73 | """ 74 | 使用随机梯度下降法进行拟合 75 | :param X_train: 76 | :param y_train: 77 | :param n_iters: 78 | :param t0: 79 | :param t1: 80 | :return: 81 | """ 82 | assert X_train.shape[0] == y_train.shape[0], '每一个训练样本必须对应一个标记' 83 | assert n_iters >=1 , '所有训练样本至少要被随机一次' 84 | 85 | def derivative_J_sgd(theta: np.ndarray, X_b_i: np.ndarray, y_i): 86 | """ 87 | 求随机搜索方向 88 | """ 89 | return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2. 
class LogisticRegression(object):
    """Binary logistic-regression classifier trained by batch gradient descent."""

    def __init__(self):
        """Create an unfitted model; every learned attribute starts as None."""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def _sigmoid(self, t):
        """Map a real value (or array) to the open interval (0, 1)."""
        return 1. / (1. + np.exp(-t))

    def _loss(self, theta, X_b, y):
        """Return the mean cross-entropy loss of ``theta`` on ``(X_b, y)``.

        NumPy does not raise on ``log(0)``; it returns ``-inf`` and the
        product with a zero label becomes ``nan``. The probabilities are
        therefore clipped away from 0 and 1 so the loss is always finite.
        """
        eps = np.finfo(float).eps
        y_hat = np.clip(self._sigmoid(X_b.dot(theta)), eps, 1. - eps)
        return -(np.dot(y, np.log(y_hat)) + np.dot(1 - y, np.log(1 - y_hat))) / len(y)

    def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the model on X_train / y_train with gradient descent.

        :param X_train: 2-D array of training samples, one row per sample
        :param y_train: 1-D array of 0/1 labels, one per row of X_train
        :param eta: learning rate
        :param n_iters: maximum number of gradient steps
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], '训练集与结果集的样本数必须一致'

        def derivative_J(theta, X_b, y):
            """Gradient of the cross-entropy loss with respect to theta."""
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            """Iterate until the loss changes by less than epsilon or n_iters is hit."""
            theta = initial_theta
            # Keep the previous loss so it is evaluated once per step.
            loss = self._loss(theta, X_b, y)
            iters = 0
            while iters < n_iters:
                theta = theta - eta * derivative_J(theta, X_b, y)
                new_loss = self._loss(theta, X_b, y)
                if abs(new_loss - loss) < epsilon:
                    break
                loss = new_loss
                iters += 1
            return theta

        # Prepend a column of ones so theta[0] acts as the intercept.
        X_b = np.hstack((np.ones((len(X_train), 1)), X_train))
        initial_theta = np.zeros(X_b.shape[1])  # start from the all-zero vector
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict_proba(self, X_predict):
        """Return, for every row of X_predict, the sigmoid of the linear score."""
        assert self._theta is not None, 'must fit before predict!'
        X_b = np.hstack([np.ones(shape=(X_predict.shape[0], 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return 0/1 labels for X_predict using a 0.5 probability threshold."""
        proba = self.predict_proba(X_predict)
        # Turn the boolean vector into a vector of 1s and 0s.
        return np.array(proba >= .5, dtype=int)

    def score(self, X_test, y_test):
        """Return the accuracy of the model's predictions on X_test / y_test."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return 'LogisticRegression()'
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    The counts used by the metrics below are NumPy scalars; dividing them by
    zero yields nan with a warning instead of raising, so the zero case has
    to be tested explicitly rather than caught with try/except.
    """
    if denominator == 0:
        return 0.0
    return numerator / denominator


def accuracy_score(y_true, y_predict):
    """
    Return the fraction of entries where y_true equals y_predict.

    :param y_true: array of ground-truth labels
    :param y_predict: array of predicted labels, same length as y_true
    :return: accuracy in [0, 1]
    """
    assert y_true.shape[0] == y_predict.shape[0], 'the size of y_true must be equal to the size of y_predict'

    return np.sum(y_true == y_predict) / len(y_true)


def mean_squared_error(y_true, y_predict):
    """
    Return the mean squared error (MSE) between y_true and y_predict.

    :param y_true: array of ground-truth values
    :param y_predict: array of predicted values, same length as y_true
    :return: mean of the squared differences
    """
    assert len(y_true) == len(y_predict), 'the size of y_true must be equal to the size of y_predict'

    return np.sum((y_true - y_predict) ** 2) / len(y_true)


def root_mean_squared_error(y_true, y_predict):
    """Return the square root of the mean squared error (RMSE)."""
    return sqrt(mean_squared_error(y_true, y_predict))


def mean_absolute_error(y_true, y_predict):
    """Return the mean absolute error (MAE) between y_true and y_predict."""
    assert len(y_true) == len(y_predict), 'the size of y_true must be equal to the size of y_predict'

    return np.sum(np.absolute(y_true - y_predict)) / len(y_true)


def r2_score(y_true, y_predict):
    """
    Return the R^2 (R squared) score: 1 - MSE / variance of y_true.

    :param y_true: array of ground-truth values
    :param y_predict: array of predicted values
    :return: R^2 score
    """
    return 1 - mean_squared_error(y_true, y_predict) / np.var(y_true)


def TN(y_true, y_predict):
    """Count true negatives: entries where both arrays are 0."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 0) & (y_predict == 0))


def FP(y_true, y_predict):
    """Count false positives: y_true is 0 but y_predict is 1."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 0) & (y_predict == 1))


def FN(y_true, y_predict):
    """Count false negatives: y_true is 1 but y_predict is 0."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 1) & (y_predict == 0))


def TP(y_true, y_predict):
    """Count true positives: entries where both arrays are 1."""
    assert len(y_true) == len(y_predict), 'y_true与y_predict的样本数目必须一致'
    return np.sum((y_true == 1) & (y_predict == 1))


def confusion_matrix(y_true, y_predict):
    """Return the 2x2 confusion matrix [[TN, FP], [FN, TP]]."""
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])


def precision_score(y_true, y_predict):
    """Return precision TP / (TP + FP), or 0.0 when nothing was predicted positive."""
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    return _safe_ratio(tp, tp + fp)


def recall_score(y_true, y_predict):
    """Return recall TP / (TP + FN), or 0.0 when there are no positive samples."""
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    return _safe_ratio(tp, tp + fn)


def f1_score(y_true, y_predict):
    """Return the harmonic mean of precision and recall, or 0.0 when both are 0."""
    precision = precision_score(y_true, y_predict)
    recall = recall_score(y_true, y_predict)
    return _safe_ratio(2.0 * precision * recall, precision + recall)


def TPR(y_true, y_predict):
    """Return the true-positive rate, which is the same value as recall."""
    return recall_score(y_true, y_predict)


def FPR(y_true, y_predict):
    """Return the false-positive rate FP / (FP + TN), or 0.0 when there are no negatives."""
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    return _safe_ratio(fp, fp + tn)
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """
    Split X and y into X_train, X_test, y_train, y_test.

    :param X: array of samples, indexed along its first axis
    :param y: array of labels, indexed in step with X
    :param test_ratio: fraction of the samples placed in the test set
    :param seed: when not None, seeds NumPy's global random generator so the
        split is reproducible (0 is a valid seed)
    :return: X_train, X_test, y_train, y_test
    """
    # Compare against None: a plain truthiness test would ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    n_samples = np.shape(X)[0]
    shuffled_indexes = np.random.permutation(n_samples)

    # The first test_size shuffled indexes form the test set, the rest train.
    test_size = int(n_samples * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test


def plot_decision_boundary(model, axis):
    """Draw the (possibly irregular) decision regions of a classifier.

    :param model: object exposing ``predict`` for an array with two columns
    :param axis: sequence ``[x0_min, x0_max, x1_min, x1_max]`` giving the
        plotted range; about 100 grid points are sampled per unit length
    """
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(1, -1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(1, -1)
    )
    # One row per grid point, columns are the two feature values.
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # contourf does not take a ``linewidth`` keyword (matplotlib reports it
    # as unused), so only the colour map is passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)


def plot_svc_decision_boundary(model, axis):
    """Draw the decision regions plus the two margin lines of a linear model.

    :param model: object exposing ``predict``, ``coef_`` and ``intercept_``
        (presumably a fitted linear SVC with two features; verify against
        the caller)
    :param axis: sequence ``[x0_min, x0_max, x1_min, x1_max]``
    """
    plot_decision_boundary(model, axis)
    w = model.coef_[0]
    b = model.intercept_[0]

    # Margin lines. The decision boundary satisfies
    # w0 * x0 + w1 * x1 + b = 0  ->  x1 = -w0 * x0 / w1 - b / w1
    plot_x = np.linspace(axis[0], axis[1], 200)

    # w0 * x0 + w1 * x1 + b = 1  ->  x1 = 1/w1 - w0 * x0 / w1 - b / w1
    up_y = -w[0]/w[1]*plot_x - b/w[1] + 1/w[1]

    # w0 * x0 + w1 * x1 + b = -1  ->  x1 = -1/w1 - w0 * x0 / w1 - b / w1
    down_y = -w[0]/w[1]*plot_x - b/w[1] - 1/w[1]

    # Keep only the points that fall inside the plotted vertical range.
    up_index = (up_y >= axis[2]) & (up_y <= axis[3])
    down_index = (down_y >= axis[2]) & (down_y <= axis[3])

    plt.plot(plot_x[up_index], up_y[up_index], color='black')
    plt.plot(plot_x[down_index], down_y[down_index], color='black')