├── README.md ├── wk1-linear-regression ├── .ipynb_checkpoints │ ├── Linear Regression 3D - CO2&Temp-checkpoint.ipynb │ └── Linear Regression live - learn-m&b-checkpoint.ipynb ├── Linear Regression 3D - CO2&Temp.ipynb ├── Linear Regression live - learn-m&b.ipynb ├── annual_temp.csv ├── brain_boday.txt ├── challenge1.py ├── challenge_dataset.txt ├── data.csv ├── demo.py └── global_co2.csv ├── wk2-neural-Networks ├── .ipynb_checkpoints │ ├── NN-3layers-checkpoint.ipynb │ └── feedForwardNN-checkpoint.ipynb ├── NN-3layers.ipynb └── feedForwardNN.ipynb ├── wk4-earthquakes ├── .ipynb_checkpoints │ └── earthquakes_NN-checkpoint.ipynb ├── database.csv └── earthquakes_NN.ipynb ├── wk5-speed-dating ├── .ipynb_checkpoints │ └── Speed dating prediction-checkpoint.ipynb ├── Speed Dating Data.csv └── Speed dating prediction.ipynb ├── wk6-image-classifier ├── .ipynb_checkpoints │ └── image-classifier-sheeps-goats-checkpoint.ipynb └── image-classifier-sheeps-goats.ipynb ├── wk7-stock-price-prediction ├── .ipynb_checkpoints │ └── Google-stock-prices-prediction-checkpoint.ipynb └── Google-stock-prices-prediction.ipynb └── wk8-generate-art ├── .ipynb_checkpoints ├── Generate-art-style-transform1-checkpoint.ipynb └── Generate-art-style-transform2-checkpoint.ipynb ├── Generate-art-style-transform1.ipynb ├── Generate-art-style-transform2.ipynb ├── house.jpg ├── the_scream.jpg └── wave.jpg /README.md: -------------------------------------------------------------------------------- 1 | # Deep-Learning-projects 2 | Deep Learning projects 3 | 4 | In this repository, I keep all of my codes taking Siraj Raval's [Intro to Deep Learning](https://www.youtube.com/channel/UCWN3xxRkmTPmbKwht9FuE5A) courses. I am having a lot of fun learning from his videos. 
5 | -------------------------------------------------------------------------------- /wk1-linear-regression/.ipynb_checkpoints/Linear Regression live - learn-m&b-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learn Linear Function m and b \n", 8 | " - Siraj's linear regression - live session\n", 9 | "\n", 10 | "for y = mx + b, (m is slope, b is y-intercept) --- \n", 11 | "Learn to find out m and b from data.csv" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from numpy import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Compute Error\n", 30 | "To calculate our error use [Sum of squared distances formula](https://spin.atomicobject.com/wp-content/uploads/linear_regression_error1.png)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "def compute_error_for_line_given_points(b, m, points):\n", 42 | " totalError = 0\n", 43 | " for i in range(0, len(points)):\n", 44 | " x = points[i, 0]\n", 45 | " y = points[i, 1]\n", 46 | " totalError += (y - (m * x + b)) ** 2\n", 47 | " return totalError / float(len(points))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Compute Gradient\n", 55 | "To perform gradient descent use [Partial derivative with respect to b and m](https://spin.atomicobject.com/wp-content/uploads/linear_regression_gradient1.png)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def step_gradient(b_current, m_current, points, learningRate):\n", 67 | " 
b_gradient = 0\n", 68 | " m_gradient = 0\n", 69 | " N = float(len(points))\n", 70 | " for i in range(0, len(points)):\n", 71 | " x = points[i, 0]\n", 72 | " y = points[i, 1]\n", 73 | " b_gradient += -(2/N) * (y - ((m_current * x) + b_current))\n", 74 | " m_gradient += -(2/N) * x * (y - ((m_current * x) + b_current))\n", 75 | " new_b = b_current - (learningRate * b_gradient)\n", 76 | " new_m = m_current - (learningRate * m_gradient)\n", 77 | " return [new_b, new_m]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Iterate Gradient steps\n", 85 | " [Gradient descent visualization](https://raw.githubusercontent.com/mattnedrich/GradientDescentExample/master/gradient_descent_example.gif)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):\n", 97 | " b = starting_b\n", 98 | " m = starting_m\n", 99 | " for i in range(num_iterations):\n", 100 | " b, m = step_gradient(b, m, array(points), learning_rate)\n", 101 | " return [b, m]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### import data and learn m & b" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Starting gradient descent at b = 0, m = 0, error = 5565.107834483211\n", 123 | "Running...\n", 124 | "After 1000 iterations b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "points = genfromtxt(\"data.csv\", delimiter=\",\")\n", 130 | "learning_rate = 0.0001\n", 131 | "initial_b = 0 # initial y-intercept guess\n", 132 | "initial_m = 0 # initial 
slope guess\n", 133 | "num_iterations = 1000\n", 134 | "print(\"Starting gradient descent at b = {0}, m = {1}, error = {2}\"\n", 135 | " .format(initial_b, initial_m,compute_error_for_line_given_points(initial_b, initial_m, points)))\n", 136 | "print(\"Running...\")\n", 137 | "[b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)\n", 138 | "print(\"After {0} iterations b = {1}, m = {2}, error = {3}\"\n", 139 | " .format(num_iterations, b, m, compute_error_for_line_given_points(b, m, points)))" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.5.2" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /wk1-linear-regression/Linear Regression live - learn-m&b.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learn Linear Function m and b \n", 8 | " - Siraj's linear regression - live session\n", 9 | "\n", 10 | "for y = mx + b, (m is slope, b is y-intercept) --- \n", 11 | "Learn to find out m and b from data.csv" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from numpy import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Compute Error\n", 30 | "To calculate our error use [Sum of squared distances 
formula](https://spin.atomicobject.com/wp-content/uploads/linear_regression_error1.png)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "def compute_error_for_line_given_points(b, m, points):\n", 42 | " totalError = 0\n", 43 | " for i in range(0, len(points)):\n", 44 | " x = points[i, 0]\n", 45 | " y = points[i, 1]\n", 46 | " totalError += (y - (m * x + b)) ** 2\n", 47 | " return totalError / float(len(points))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Compute Gradient\n", 55 | "To perform gradient descent use [Partial derivative with respect to b and m](https://spin.atomicobject.com/wp-content/uploads/linear_regression_gradient1.png)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def step_gradient(b_current, m_current, points, learningRate):\n", 67 | " b_gradient = 0\n", 68 | " m_gradient = 0\n", 69 | " N = float(len(points))\n", 70 | " for i in range(0, len(points)):\n", 71 | " x = points[i, 0]\n", 72 | " y = points[i, 1]\n", 73 | " b_gradient += -(2/N) * (y - ((m_current * x) + b_current))\n", 74 | " m_gradient += -(2/N) * x * (y - ((m_current * x) + b_current))\n", 75 | " new_b = b_current - (learningRate * b_gradient)\n", 76 | " new_m = m_current - (learningRate * m_gradient)\n", 77 | " return [new_b, new_m]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Iterate Gradient steps\n", 85 | " [Gradient descent visualization](https://raw.githubusercontent.com/mattnedrich/GradientDescentExample/master/gradient_descent_example.gif)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "def 
gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):\n", 97 | " b = starting_b\n", 98 | " m = starting_m\n", 99 | " for i in range(num_iterations):\n", 100 | " b, m = step_gradient(b, m, array(points), learning_rate)\n", 101 | " return [b, m]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### import data and learn m & b" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Starting gradient descent at b = 0, m = 0, error = 5565.107834483211\n", 123 | "Running...\n", 124 | "After 1000 iterations b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "points = genfromtxt(\"data.csv\", delimiter=\",\")\n", 130 | "learning_rate = 0.0001\n", 131 | "initial_b = 0 # initial y-intercept guess\n", 132 | "initial_m = 0 # initial slope guess\n", 133 | "num_iterations = 1000\n", 134 | "print(\"Starting gradient descent at b = {0}, m = {1}, error = {2}\"\n", 135 | " .format(initial_b, initial_m,compute_error_for_line_given_points(initial_b, initial_m, points)))\n", 136 | "print(\"Running...\")\n", 137 | "[b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)\n", 138 | "print(\"After {0} iterations b = {1}, m = {2}, error = {3}\"\n", 139 | " .format(num_iterations, b, m, compute_error_for_line_given_points(b, m, points)))" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | 
"nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.5.2" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /wk1-linear-regression/annual_temp.csv: -------------------------------------------------------------------------------- 1 | Source,Year,Mean 2 | GCAG,2015,0.8990 3 | GISTEMP,2015,0.87 4 | GCAG,2014,0.7402 5 | GISTEMP,2014,0.75 6 | GCAG,2013,0.6687 7 | GISTEMP,2013,0.66 8 | GCAG,2012,0.6219 9 | GISTEMP,2012,0.64 10 | GCAG,2011,0.5759 11 | GISTEMP,2011,0.61 12 | GCAG,2010,0.7008 13 | GISTEMP,2010,0.72 14 | GCAG,2009,0.6354 15 | GISTEMP,2009,0.65 16 | GCAG,2008,0.5415 17 | GISTEMP,2008,0.54 18 | GCAG,2007,0.6113 19 | GISTEMP,2007,0.66 20 | GCAG,2006,0.6139 21 | GISTEMP,2006,0.63 22 | GCAG,2005,0.6583 23 | GISTEMP,2005,0.69 24 | GCAG,2004,0.5806 25 | GISTEMP,2004,0.55 26 | GCAG,2003,0.6145 27 | GISTEMP,2003,0.62 28 | GCAG,2002,0.6018 29 | GISTEMP,2002,0.63 30 | GCAG,2001,0.5455 31 | GISTEMP,2001,0.55 32 | GCAG,2000,0.4255 33 | GISTEMP,2000,0.42 34 | GCAG,1999,0.4427 35 | GISTEMP,1999,0.42 36 | GCAG,1998,0.6335 37 | GISTEMP,1998,0.64 38 | GCAG,1997,0.5185 39 | GISTEMP,1997,0.48 40 | GCAG,1996,0.3225 41 | GISTEMP,1996,0.35 42 | GCAG,1995,0.4593 43 | GISTEMP,1995,0.46 44 | GCAG,1994,0.3420 45 | GISTEMP,1994,0.32 46 | GCAG,1993,0.2857 47 | GISTEMP,1993,0.24 48 | GCAG,1992,0.2583 49 | GISTEMP,1992,0.23 50 | GCAG,1991,0.4079 51 | GISTEMP,1991,0.43 52 | GCAG,1990,0.4350 53 | GISTEMP,1990,0.44 54 | GCAG,1989,0.2982 55 | GISTEMP,1989,0.29 56 | GCAG,1988,0.3770 57 | GISTEMP,1988,0.41 58 | GCAG,1987,0.3710 59 | GISTEMP,1987,0.33 60 | GCAG,1986,0.2308 61 | GISTEMP,1986,0.19 62 | GCAG,1985,0.1357 63 | GISTEMP,1985,0.12 64 | GCAG,1984,0.1510 65 | GISTEMP,1984,0.16 66 | GCAG,1983,0.3429 67 | GISTEMP,1983,0.3 68 | GCAG,1982,0.1836 69 | GISTEMP,1982,0.13 70 | GCAG,1981,0.3024 71 | GISTEMP,1981,0.33 72 | GCAG,1980,0.2651 73 | 
GISTEMP,1980,0.27 74 | GCAG,1979,0.2288 75 | GISTEMP,1979,0.17 76 | GCAG,1978,0.1139 77 | GISTEMP,1978,0.07 78 | GCAG,1977,0.1996 79 | GISTEMP,1977,0.18 80 | GCAG,1976,-0.0769 81 | GISTEMP,1976,-0.12 82 | GCAG,1975,0.0060 83 | GISTEMP,1975,-0.02 84 | GCAG,1974,-0.0698 85 | GISTEMP,1974,-0.08 86 | GCAG,1973,0.1654 87 | GISTEMP,1973,0.15 88 | GCAG,1972,0.0280 89 | GISTEMP,1972,0.01 90 | GCAG,1971,-0.0775 91 | GISTEMP,1971,-0.09 92 | GCAG,1970,0.0383 93 | GISTEMP,1970,0.02 94 | GCAG,1969,0.0937 95 | GISTEMP,1969,0.06 96 | GCAG,1968,-0.0282 97 | GISTEMP,1968,-0.07 98 | GCAG,1967,-0.0112 99 | GISTEMP,1967,-0.02 100 | GCAG,1966,-0.0204 101 | GISTEMP,1966,-0.05 102 | GCAG,1965,-0.0752 103 | GISTEMP,1965,-0.1 104 | GCAG,1964,-0.1461 105 | GISTEMP,1964,-0.2 106 | GCAG,1963,0.1100 107 | GISTEMP,1963,0.06 108 | GCAG,1962,0.0924 109 | GISTEMP,1962,0.03 110 | GCAG,1961,0.0818 111 | GISTEMP,1961,0.05 112 | GCAG,1960,0.0252 113 | GISTEMP,1960,-0.03 114 | GCAG,1959,0.0640 115 | GISTEMP,1959,0.03 116 | GCAG,1958,0.1145 117 | GISTEMP,1958,0.07 118 | GCAG,1957,0.0538 119 | GISTEMP,1957,0.04 120 | GCAG,1956,-0.1945 121 | GISTEMP,1956,-0.2 122 | GCAG,1955,-0.1305 123 | GISTEMP,1955,-0.15 124 | GCAG,1954,-0.1118 125 | GISTEMP,1954,-0.12 126 | GCAG,1953,0.0997 127 | GISTEMP,1953,0.08 128 | GCAG,1952,0.0288 129 | GISTEMP,1952,0.01 130 | GCAG,1951,-0.0095 131 | GISTEMP,1951,-0.06 132 | GCAG,1950,-0.1579 133 | GISTEMP,1950,-0.18 134 | GCAG,1949,-0.0550 135 | GISTEMP,1949,-0.09 136 | GCAG,1948,-0.0471 137 | GISTEMP,1948,-0.09 138 | GCAG,1947,-0.0455 139 | GISTEMP,1947,-0.05 140 | GCAG,1946,-0.0013 141 | GISTEMP,1946,-0.04 142 | GCAG,1945,0.1754 143 | GISTEMP,1945,0.12 144 | GCAG,1944,0.2948 145 | GISTEMP,1944,0.25 146 | GCAG,1943,0.1598 147 | GISTEMP,1943,0.13 148 | GCAG,1942,0.1549 149 | GISTEMP,1942,0.09 150 | GCAG,1941,0.1974 151 | GISTEMP,1941,0.13 152 | GCAG,1940,0.0927 153 | GISTEMP,1940,0.08 154 | GCAG,1939,-0.0157 155 | GISTEMP,1939,-0.03 156 | GCAG,1938,-0.0318 157 | 
GISTEMP,1938,-0.03 158 | GCAG,1937,-0.0204 159 | GISTEMP,1937,-0.03 160 | GCAG,1936,-0.1173 161 | GISTEMP,1936,-0.15 162 | GCAG,1935,-0.1445 163 | GISTEMP,1935,-0.19 164 | GCAG,1934,-0.1075 165 | GISTEMP,1934,-0.14 166 | GCAG,1933,-0.2481 167 | GISTEMP,1933,-0.29 168 | GCAG,1932,-0.1214 169 | GISTEMP,1932,-0.16 170 | GCAG,1931,-0.0714 171 | GISTEMP,1931,-0.09 172 | GCAG,1930,-0.1016 173 | GISTEMP,1930,-0.14 174 | GCAG,1929,-0.2982 175 | GISTEMP,1929,-0.35 176 | GCAG,1928,-0.1749 177 | GISTEMP,1928,-0.21 178 | GCAG,1927,-0.1506 179 | GISTEMP,1927,-0.2 180 | GCAG,1926,-0.0618 181 | GISTEMP,1926,-0.09 182 | GCAG,1925,-0.1464 183 | GISTEMP,1925,-0.2 184 | GCAG,1924,-0.2510 185 | GISTEMP,1924,-0.28 186 | GCAG,1923,-0.2161 187 | GISTEMP,1923,-0.24 188 | GCAG,1922,-0.2318 189 | GISTEMP,1922,-0.27 190 | GCAG,1921,-0.1517 191 | GISTEMP,1921,-0.21 192 | GCAG,1920,-0.2152 193 | GISTEMP,1920,-0.27 194 | GCAG,1919,-0.2082 195 | GISTEMP,1919,-0.22 196 | GCAG,1918,-0.2118 197 | GISTEMP,1918,-0.26 198 | GCAG,1917,-0.3193 199 | GISTEMP,1917,-0.4 200 | GCAG,1916,-0.2979 201 | GISTEMP,1916,-0.34 202 | GCAG,1915,-0.0747 203 | GISTEMP,1915,-0.11 204 | GCAG,1914,-0.1444 205 | GISTEMP,1914,-0.16 206 | GCAG,1913,-0.3205 207 | GISTEMP,1913,-0.34 208 | GCAG,1912,-0.3318 209 | GISTEMP,1912,-0.35 210 | GCAG,1911,-0.4367 211 | GISTEMP,1911,-0.45 212 | GCAG,1910,-0.3862 213 | GISTEMP,1910,-0.43 214 | GCAG,1909,-0.4332 215 | GISTEMP,1909,-0.48 216 | GCAG,1908,-0.4441 217 | GISTEMP,1908,-0.43 218 | GCAG,1907,-0.3767 219 | GISTEMP,1907,-0.4 220 | GCAG,1906,-0.2208 221 | GISTEMP,1906,-0.23 222 | GCAG,1905,-0.2967 223 | GISTEMP,1905,-0.28 224 | GCAG,1904,-0.4240 225 | GISTEMP,1904,-0.45 226 | GCAG,1903,-0.3442 227 | GISTEMP,1903,-0.36 228 | GCAG,1902,-0.2535 229 | GISTEMP,1902,-0.28 230 | GCAG,1901,-0.1471 231 | GISTEMP,1901,-0.15 232 | GCAG,1900,-0.0704 233 | GISTEMP,1900,-0.09 234 | GCAG,1899,-0.1172 235 | GISTEMP,1899,-0.16 236 | GCAG,1898,-0.2578 237 | GISTEMP,1898,-0.28 238 | GCAG,1897,-0.1232 
239 | GISTEMP,1897,-0.11 240 | GCAG,1896,-0.0971 241 | GISTEMP,1896,-0.15 242 | GCAG,1895,-0.2279 243 | GISTEMP,1895,-0.21 244 | GCAG,1894,-0.2828 245 | GISTEMP,1894,-0.3 246 | GCAG,1893,-0.3221 247 | GISTEMP,1893,-0.3 248 | GCAG,1892,-0.3079 249 | GISTEMP,1892,-0.26 250 | GCAG,1891,-0.2552 251 | GISTEMP,1891,-0.24 252 | GCAG,1890,-0.3233 253 | GISTEMP,1890,-0.36 254 | GCAG,1889,-0.1032 255 | GISTEMP,1889,-0.11 256 | GCAG,1888,-0.1541 257 | GISTEMP,1888,-0.2 258 | GCAG,1887,-0.2559 259 | GISTEMP,1887,-0.33 260 | GCAG,1886,-0.2101 261 | GISTEMP,1886,-0.3 262 | GCAG,1885,-0.2220 263 | GISTEMP,1885,-0.31 264 | GCAG,1884,-0.2099 265 | GISTEMP,1884,-0.28 266 | GCAG,1883,-0.1481 267 | GISTEMP,1883,-0.2 268 | GCAG,1882,-0.0710 269 | GISTEMP,1882,-0.1 270 | GCAG,1881,-0.0707 271 | GISTEMP,1881,-0.12 272 | GCAG,1880,-0.1247 273 | GISTEMP,1880,-0.2 -------------------------------------------------------------------------------- /wk1-linear-regression/brain_boday.txt: -------------------------------------------------------------------------------- 1 | Brain Body 2 | 3.385 44.500 3 | 0.480 15.500 4 | 1.350 8.100 5 | 465.000 423.000 6 | 36.330 119.500 7 | 27.660 115.000 8 | 14.830 98.200 9 | 1.040 5.500 10 | 4.190 58.000 11 | 0.425 6.400 12 | 0.101 4.000 13 | 0.920 5.700 14 | 1.000 6.600 15 | 0.005 0.140 16 | 0.060 1.000 17 | 3.500 10.800 18 | 2.000 12.300 19 | 1.700 6.300 20 | 2547.000 4603.000 21 | 0.023 0.300 22 | 187.100 419.000 23 | 521.000 655.000 24 | 0.785 3.500 25 | 10.000 115.000 26 | 3.300 25.600 27 | 0.200 5.000 28 | 1.410 17.500 29 | 529.000 680.000 30 | 207.000 406.000 31 | 85.000 325.000 32 | 0.750 12.300 33 | 62.000 1320.000 34 | 6654.000 5712.000 35 | 3.500 3.900 36 | 6.800 179.000 37 | 35.000 56.000 38 | 4.050 17.000 39 | 0.120 1.000 40 | 0.023 0.400 41 | 0.010 0.250 42 | 1.400 12.500 43 | 250.000 490.000 44 | 2.500 12.100 45 | 55.500 175.000 46 | 100.000 157.000 47 | 52.160 440.000 48 | 10.550 179.500 49 | 0.550 2.400 50 | 60.000 81.000 51 | 3.600 21.000 52 | 
4.288 39.200 53 | 0.280 1.900 54 | 0.075 1.200 55 | 0.122 3.000 56 | 0.048 0.330 57 | 192.000 180.000 58 | 3.000 25.000 59 | 160.000 169.000 60 | 0.900 2.600 61 | 1.620 11.400 62 | 0.104 2.500 63 | 4.235 50.400 -------------------------------------------------------------------------------- /wk1-linear-regression/challenge1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn import linear_model 4 | import matplotlib.pyplot as plt 5 | 6 | #read data 7 | df = pd.read_csv('challenge_dataset.txt',header=None, names=['x','y']) 8 | 9 | #print df.head() 10 | #print df.shape 11 | #print df.info() 12 | #print df.describe() 13 | 14 | x = df[['x']] 15 | y = df[['y']] 16 | #print x,y 17 | 18 | plt.scatter(x,y) 19 | 20 | # train model 21 | reg = linear_model.LinearRegression() 22 | reg.fit(x,y) 23 | pred = reg.predict(x) 24 | 25 | #Returns the coefficient of determination R^2 of the prediction 26 | r_square = reg.score(x,y) 27 | print "R^2 is ", r_square 28 | 29 | plt.plot(x, pred) 30 | plt.show() 31 | 32 | # Mean Squared Error 33 | from sklearn.metrics import mean_squared_error 34 | MSR = mean_squared_error(y,pred) 35 | print "Mean Square Error is ", MSR -------------------------------------------------------------------------------- /wk1-linear-regression/challenge_dataset.txt: -------------------------------------------------------------------------------- 1 | 6.1101,17.592 2 | 5.5277,9.1302 3 | 8.5186,13.662 4 | 7.0032,11.854 5 | 5.8598,6.8233 6 | 8.3829,11.886 7 | 7.4764,4.3483 8 | 8.5781,12 9 | 6.4862,6.5987 10 | 5.0546,3.8166 11 | 5.7107,3.2522 12 | 14.164,15.505 13 | 5.734,3.1551 14 | 8.4084,7.2258 15 | 5.6407,0.71618 16 | 5.3794,3.5129 17 | 6.3654,5.3048 18 | 5.1301,0.56077 19 | 6.4296,3.6518 20 | 7.0708,5.3893 21 | 6.1891,3.1386 22 | 20.27,21.767 23 | 5.4901,4.263 24 | 6.3261,5.1875 25 | 5.5649,3.0825 26 | 18.945,22.638 27 | 12.828,13.501 28 | 10.957,7.0467 29 | 
13.176,14.692 30 | 22.203,24.147 31 | 5.2524,-1.22 32 | 6.5894,5.9966 33 | 9.2482,12.134 34 | 5.8918,1.8495 35 | 8.2111,6.5426 36 | 7.9334,4.5623 37 | 8.0959,4.1164 38 | 5.6063,3.3928 39 | 12.836,10.117 40 | 6.3534,5.4974 41 | 5.4069,0.55657 42 | 6.8825,3.9115 43 | 11.708,5.3854 44 | 5.7737,2.4406 45 | 7.8247,6.7318 46 | 7.0931,1.0463 47 | 5.0702,5.1337 48 | 5.8014,1.844 49 | 11.7,8.0043 50 | 5.5416,1.0179 51 | 7.5402,6.7504 52 | 5.3077,1.8396 53 | 7.4239,4.2885 54 | 7.6031,4.9981 55 | 6.3328,1.4233 56 | 6.3589,-1.4211 57 | 6.2742,2.4756 58 | 5.6397,4.6042 59 | 9.3102,3.9624 60 | 9.4536,5.4141 61 | 8.8254,5.1694 62 | 5.1793,-0.74279 63 | 21.279,17.929 64 | 14.908,12.054 65 | 18.959,17.054 66 | 7.2182,4.8852 67 | 8.2951,5.7442 68 | 10.236,7.7754 69 | 5.4994,1.0173 70 | 20.341,20.992 71 | 10.136,6.6799 72 | 7.3345,4.0259 73 | 6.0062,1.2784 74 | 7.2259,3.3411 75 | 5.0269,-2.6807 76 | 6.5479,0.29678 77 | 7.5386,3.8845 78 | 5.0365,5.7014 79 | 10.274,6.7526 80 | 5.1077,2.0576 81 | 5.7292,0.47953 82 | 5.1884,0.20421 83 | 6.3557,0.67861 84 | 9.7687,7.5435 85 | 6.5159,5.3436 86 | 8.5172,4.2415 87 | 9.1802,6.7981 88 | 6.002,0.92695 89 | 5.5204,0.152 90 | 5.0594,2.8214 91 | 5.7077,1.8451 92 | 7.6366,4.2959 93 | 5.8707,7.2029 94 | 5.3054,1.9869 95 | 8.2934,0.14454 96 | 13.394,9.0551 97 | 5.4369,0.61705 98 | -------------------------------------------------------------------------------- /wk1-linear-regression/data.csv: -------------------------------------------------------------------------------- 1 | 32.502345269453031,31.70700584656992 2 | 53.426804033275019,68.77759598163891 3 | 61.530358025636438,62.562382297945803 4 | 47.475639634786098,71.546632233567777 5 | 59.813207869512318,87.230925133687393 6 | 55.142188413943821,78.211518270799232 7 | 52.211796692214001,79.64197304980874 8 | 39.299566694317065,59.171489321869508 9 | 48.10504169176825,75.331242297063056 10 | 52.550014442733818,71.300879886850353 11 | 45.419730144973755,55.165677145959123 12 | 
54.351634881228918,82.478846757497919 13 | 44.164049496773352,62.008923245725825 14 | 58.16847071685779,75.392870425994957 15 | 56.727208057096611,81.43619215887864 16 | 48.955888566093719,60.723602440673965 17 | 44.687196231480904,82.892503731453715 18 | 60.297326851333466,97.379896862166078 19 | 45.618643772955828,48.847153317355072 20 | 38.816817537445637,56.877213186268506 21 | 66.189816606752601,83.878564664602763 22 | 65.41605174513407,118.59121730252249 23 | 47.48120860786787,57.251819462268969 24 | 41.57564261748702,51.391744079832307 25 | 51.84518690563943,75.380651665312357 26 | 59.370822011089523,74.765564032151374 27 | 57.31000343834809,95.455052922574737 28 | 63.615561251453308,95.229366017555307 29 | 46.737619407976972,79.052406169565586 30 | 50.556760148547767,83.432071421323712 31 | 52.223996085553047,63.358790317497878 32 | 35.567830047746632,41.412885303700563 33 | 42.436476944055642,76.617341280074044 34 | 58.16454011019286,96.769566426108199 35 | 57.504447615341789,74.084130116602523 36 | 45.440530725319981,66.588144414228594 37 | 61.89622268029126,77.768482417793024 38 | 33.093831736163963,50.719588912312084 39 | 36.436009511386871,62.124570818071781 40 | 37.675654860850742,60.810246649902211 41 | 44.555608383275356,52.682983366387781 42 | 43.318282631865721,58.569824717692867 43 | 50.073145632289034,82.905981485070512 44 | 43.870612645218372,61.424709804339123 45 | 62.997480747553091,115.24415280079529 46 | 32.669043763467187,45.570588823376085 47 | 40.166899008703702,54.084054796223612 48 | 53.575077531673656,87.994452758110413 49 | 33.864214971778239,52.725494375900425 50 | 64.707138666121296,93.576118692658241 51 | 38.119824026822805,80.166275447370964 52 | 44.502538064645101,65.101711570560326 53 | 40.599538384552318,65.562301260400375 54 | 41.720676356341293,65.280886920822823 55 | 51.088634678336796,73.434641546324301 56 | 55.078095904923202,71.13972785861894 57 | 41.377726534895203,79.102829683549857 58 | 
62.494697427269791,86.520538440347153 59 | 49.203887540826003,84.742697807826218 60 | 41.102685187349664,59.358850248624933 61 | 41.182016105169822,61.684037524833627 62 | 50.186389494880601,69.847604158249183 63 | 52.378446219236217,86.098291205774103 64 | 50.135485486286122,59.108839267699643 65 | 33.644706006191782,69.89968164362763 66 | 39.557901222906828,44.862490711164398 67 | 56.130388816875467,85.498067778840223 68 | 57.362052133238237,95.536686846467219 69 | 60.269214393997906,70.251934419771587 70 | 35.678093889410732,52.721734964774988 71 | 31.588116998132829,50.392670135079896 72 | 53.66093226167304,63.642398775657753 73 | 46.682228649471917,72.247251068662365 74 | 43.107820219102464,57.812512976181402 75 | 70.34607561504933,104.25710158543822 76 | 44.492855880854073,86.642020318822006 77 | 57.50453330326841,91.486778000110135 78 | 36.930076609191808,55.231660886212836 79 | 55.805733357942742,79.550436678507609 80 | 38.954769073377065,44.847124242467601 81 | 56.901214702247074,80.207523139682763 82 | 56.868900661384046,83.14274979204346 83 | 34.33312470421609,55.723489260543914 84 | 59.04974121466681,77.634182511677864 85 | 57.788223993230673,99.051414841748269 86 | 54.282328705967409,79.120646274680027 87 | 51.088719898979143,69.588897851118475 88 | 50.282836348230731,69.510503311494389 89 | 44.211741752090113,73.687564318317285 90 | 38.005488008060688,61.366904537240131 91 | 32.940479942618296,67.170655768995118 92 | 53.691639571070056,85.668203145001542 93 | 68.76573426962166,114.85387123391394 94 | 46.230966498310252,90.123572069967423 95 | 68.319360818255362,97.919821035242848 96 | 50.030174340312143,81.536990783015028 97 | 49.239765342753763,72.111832469615663 98 | 50.039575939875988,85.232007342325673 99 | 48.149858891028863,66.224957888054632 100 | 25.128484647772304,53.454394214850524 -------------------------------------------------------------------------------- /wk1-linear-regression/demo.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn import linear_model 4 | import matplotlib.pyplot as plt 5 | 6 | #read data 7 | df = pd.read_fwf('brain_boday.txt') 8 | x = df[['Brain']] 9 | y = df[['Body']] 10 | #print x 11 | 12 | # train model 13 | body_reg = linear_model.LinearRegression() 14 | body_reg.fit(x,y) 15 | #a = np.array([[1],[2],[3],[4]]) # vector 16 | a = np.arange(10).reshape(10,1) 17 | #print a 18 | pred=body_reg.predict(a) 19 | #print pred 20 | 21 | plt.scatter(x,y) # independent, dependent 22 | plt.plot(x, body_reg.predict(x)) 23 | plt.show() 24 | 25 | #plt.plot(a,pred) 26 | #plt.show() 27 | -------------------------------------------------------------------------------- /wk1-linear-regression/global_co2.csv: -------------------------------------------------------------------------------- 1 | Year,Total,Gas Fuel,Liquid Fuel,Solid Fuel,Cement,Gas Flaring,Per Capita 2 | 1751,3,0,0,3,0,0, 3 | 1752,3,0,0,3,0,0, 4 | 1753,3,0,0,3,0,0, 5 | 1754,3,0,0,3,0,0, 6 | 1755,3,0,0,3,0,0, 7 | 1756,3,0,0,3,0,0, 8 | 1757,3,0,0,3,0,0, 9 | 1758,3,0,0,3,0,0, 10 | 1759,3,0,0,3,0,0, 11 | 1760,3,0,0,3,0,0, 12 | 1761,3,0,0,3,0,0, 13 | 1762,3,0,0,3,0,0, 14 | 1763,3,0,0,3,0,0, 15 | 1764,3,0,0,3,0,0, 16 | 1765,3,0,0,3,0,0, 17 | 1766,3,0,0,3,0,0, 18 | 1767,3,0,0,3,0,0, 19 | 1768,3,0,0,3,0,0, 20 | 1769,3,0,0,3,0,0, 21 | 1770,3,0,0,3,0,0, 22 | 1771,4,0,0,4,0,0, 23 | 1772,4,0,0,4,0,0, 24 | 1773,4,0,0,4,0,0, 25 | 1774,4,0,0,4,0,0, 26 | 1775,4,0,0,4,0,0, 27 | 1776,4,0,0,4,0,0, 28 | 1777,4,0,0,4,0,0, 29 | 1778,4,0,0,4,0,0, 30 | 1779,4,0,0,4,0,0, 31 | 1780,4,0,0,4,0,0, 32 | 1781,5,0,0,5,0,0, 33 | 1782,5,0,0,5,0,0, 34 | 1783,5,0,0,5,0,0, 35 | 1784,5,0,0,5,0,0, 36 | 1785,5,0,0,5,0,0, 37 | 1786,5,0,0,5,0,0, 38 | 1787,5,0,0,5,0,0, 39 | 1788,5,0,0,5,0,0, 40 | 1789,5,0,0,5,0,0, 41 | 1790,5,0,0,5,0,0, 42 | 1791,6,0,0,6,0,0, 43 | 1792,6,0,0,6,0,0, 44 | 1793,6,0,0,6,0,0, 45 | 1794,6,0,0,6,0,0, 46 | 
1795,6,0,0,6,0,0, 47 | 1796,6,0,0,6,0,0, 48 | 1797,7,0,0,7,0,0, 49 | 1798,7,0,0,7,0,0, 50 | 1799,7,0,0,7,0,0, 51 | 1800,8,0,0,8,0,0, 52 | 1801,8,0,0,8,0,0, 53 | 1802,10,0,0,10,0,0, 54 | 1803,9,0,0,9,0,0, 55 | 1804,9,0,0,9,0,0, 56 | 1805,9,0,0,9,0,0, 57 | 1806,10,0,0,10,0,0, 58 | 1807,10,0,0,10,0,0, 59 | 1808,10,0,0,10,0,0, 60 | 1809,10,0,0,10,0,0, 61 | 1810,10,0,0,10,0,0, 62 | 1811,11,0,0,11,0,0, 63 | 1812,11,0,0,11,0,0, 64 | 1813,11,0,0,11,0,0, 65 | 1814,11,0,0,11,0,0, 66 | 1815,12,0,0,12,0,0, 67 | 1816,13,0,0,13,0,0, 68 | 1817,14,0,0,14,0,0, 69 | 1818,14,0,0,14,0,0, 70 | 1819,14,0,0,14,0,0, 71 | 1820,14,0,0,14,0,0, 72 | 1821,14,0,0,14,0,0, 73 | 1822,15,0,0,15,0,0, 74 | 1823,16,0,0,16,0,0, 75 | 1824,16,0,0,16,0,0, 76 | 1825,17,0,0,17,0,0, 77 | 1826,17,0,0,17,0,0, 78 | 1827,18,0,0,18,0,0, 79 | 1828,18,0,0,18,0,0, 80 | 1829,18,0,0,18,0,0, 81 | 1830,24,0,0,24,0,0, 82 | 1831,23,0,0,23,0,0, 83 | 1832,23,0,0,23,0,0, 84 | 1833,24,0,0,24,0,0, 85 | 1834,24,0,0,24,0,0, 86 | 1835,25,0,0,25,0,0, 87 | 1836,29,0,0,29,0,0, 88 | 1837,29,0,0,29,0,0, 89 | 1838,30,0,0,30,0,0, 90 | 1839,31,0,0,31,0,0, 91 | 1840,33,0,0,33,0,0, 92 | 1841,34,0,0,34,0,0, 93 | 1842,36,0,0,36,0,0, 94 | 1843,37,0,0,37,0,0, 95 | 1844,39,0,0,39,0,0, 96 | 1845,43,0,0,43,0,0, 97 | 1846,43,0,0,43,0,0, 98 | 1847,46,0,0,46,0,0, 99 | 1848,47,0,0,47,0,0, 100 | 1849,50,0,0,50,0,0, 101 | 1850,54,0,0,54,0,0, 102 | 1851,54,0,0,54,0,0, 103 | 1852,57,0,0,57,0,0, 104 | 1853,59,0,0,59,0,0, 105 | 1854,69,0,0,69,0,0, 106 | 1855,71,0,0,71,0,0, 107 | 1856,76,0,0,76,0,0, 108 | 1857,77,0,0,77,0,0, 109 | 1858,78,0,0,78,0,0, 110 | 1859,83,0,0,83,0,0, 111 | 1860,91,0,0,91,0,0, 112 | 1861,95,0,0,95,0,0, 113 | 1862,97,0,0,96,0,0, 114 | 1863,104,0,0,103,0,0, 115 | 1864,112,0,0,112,0,0, 116 | 1865,119,0,0,119,0,0, 117 | 1866,122,0,0,122,0,0, 118 | 1867,130,0,0,130,0,0, 119 | 1868,135,0,0,134,0,0, 120 | 1869,142,0,0,142,0,0, 121 | 1870,147,0,1,146,0,0, 122 | 1871,156,0,1,156,0,0, 123 | 1872,173,0,1,173,0,0, 124 | 1873,184,0,1,183,0,0, 
125 | 1874,174,0,1,173,0,0, 126 | 1875,188,0,1,187,0,0, 127 | 1876,191,0,1,190,0,0, 128 | 1877,194,0,2,192,0,0, 129 | 1878,196,0,2,194,0,0, 130 | 1879,210,0,3,207,0,0, 131 | 1880,236,0,3,233,0,0, 132 | 1881,243,0,4,239,0,0, 133 | 1882,256,0,4,252,0,0, 134 | 1883,272,0,3,269,0,0, 135 | 1884,275,0,4,271,0,0, 136 | 1885,277,1,4,273,0,0, 137 | 1886,281,2,5,275,0,0, 138 | 1887,295,3,5,287,0,0, 139 | 1888,327,5,5,317,0,0, 140 | 1889,327,3,6,318,0,0, 141 | 1890,356,3,8,345,0,0, 142 | 1891,372,2,9,360,0,0, 143 | 1892,374,2,9,363,0,0, 144 | 1893,370,2,10,358,0,0, 145 | 1894,383,2,9,372,0,0, 146 | 1895,406,2,11,393,0,0, 147 | 1896,419,2,12,405,0,0, 148 | 1897,440,2,13,425,0,0, 149 | 1898,465,2,13,449,0,0, 150 | 1899,507,3,14,491,0,0, 151 | 1900,534,3,16,515,0,0, 152 | 1901,552,4,18,531,0,0, 153 | 1902,566,4,19,543,0,0, 154 | 1903,617,4,20,593,0,0, 155 | 1904,624,4,23,597,0,0, 156 | 1905,663,5,23,636,0,0, 157 | 1906,707,5,23,680,0,0, 158 | 1907,784,5,28,750,0,0, 159 | 1908,750,5,30,714,0,0, 160 | 1909,785,6,32,747,0,0, 161 | 1910,819,7,34,778,0,0, 162 | 1911,836,7,36,792,0,0, 163 | 1912,879,8,37,834,0,0, 164 | 1913,943,8,41,895,0,0, 165 | 1914,850,8,42,800,0,0, 166 | 1915,838,9,45,784,0,0, 167 | 1916,901,10,48,842,0,0, 168 | 1917,955,11,54,891,0,0, 169 | 1918,936,10,53,873,0,0, 170 | 1919,806,10,61,735,0,0, 171 | 1920,932,11,78,843,0,0, 172 | 1921,803,10,84,709,0,0, 173 | 1922,845,11,94,740,0,0, 174 | 1923,970,14,111,845,0,0, 175 | 1924,963,16,110,836,0,0, 176 | 1925,975,17,116,842,0,0, 177 | 1926,983,19,119,846,0,0, 178 | 1927,1062,21,136,905,0,0, 179 | 1928,1065,23,143,890,10,0, 180 | 1929,1145,28,160,947,10,0, 181 | 1930,1053,28,152,862,10,0, 182 | 1931,940,25,147,759,8,0, 183 | 1932,847,24,141,675,7,0, 184 | 1933,893,25,154,708,7,0, 185 | 1934,973,28,162,775,8,0, 186 | 1935,1027,30,176,811,9,0, 187 | 1936,1130,34,192,893,11,0, 188 | 1937,1209,38,219,941,11,0, 189 | 1938,1142,37,214,880,12,0, 190 | 1939,1192,38,222,918,13,0, 191 | 1940,1299,42,229,1017,11,0, 192 | 
1941,1334,42,236,1043,12,0, 193 | 1942,1342,45,222,1063,11,0, 194 | 1943,1391,50,239,1092,10,0, 195 | 1944,1383,54,275,1047,7,0, 196 | 1945,1160,59,275,820,7,0, 197 | 1946,1238,61,292,875,10,0, 198 | 1947,1392,67,322,992,12,0, 199 | 1948,1469,76,364,1015,14,0, 200 | 1949,1419,81,362,960,16,0, 201 | 1950,1630,97,423,1070,18,23,0.64 202 | 1951,1767,115,479,1129,20,24,0.69 203 | 1952,1795,124,504,1119,22,26,0.68 204 | 1953,1841,131,533,1125,24,27,0.69 205 | 1954,1865,138,557,1116,27,27,0.69 206 | 1955,2042,150,625,1208,30,31,0.74 207 | 1956,2177,161,679,1273,32,32,0.77 208 | 1957,2270,178,714,1309,34,35,0.79 209 | 1958,2330,192,731,1336,36,35,0.8 210 | 1959,2454,206,789,1382,40,36,0.83 211 | 1960,2569,227,849,1410,43,39,0.85 212 | 1961,2580,240,904,1349,45,42,0.84 213 | 1962,2686,263,980,1351,49,44,0.86 214 | 1963,2833,286,1052,1396,51,47,0.88 215 | 1964,2995,316,1137,1435,57,51,0.92 216 | 1965,3130,337,1219,1460,59,55,0.94 217 | 1966,3288,364,1323,1478,63,60,0.97 218 | 1967,3393,392,1423,1448,65,66,0.98 219 | 1968,3566,424,1551,1448,70,73,1.01 220 | 1969,3780,467,1673,1486,74,80,1.05 221 | 1970,4053,493,1839,1556,78,87,1.1 222 | 1971,4208,530,1947,1559,84,88,1.12 223 | 1972,4376,560,2057,1576,89,94,1.14 224 | 1973,4614,588,2241,1581,95,110,1.18 225 | 1974,4623,597,2245,1579,96,107,1.16 226 | 1975,4596,604,2132,1673,95,92,1.13 227 | 1976,4864,630,2314,1710,103,108,1.18 228 | 1977,5026,650,2398,1765,108,104,1.19 229 | 1978,5087,680,2392,1793,116,106,1.19 230 | 1979,5369,721,2544,1887,119,98,1.23 231 | 1980,5315,740,2422,1947,120,86,1.2 232 | 1981,5152,756,2289,1921,121,64,1.14 233 | 1982,5113,740,2196,1992,121,64,1.11 234 | 1983,5094,741,2176,1995,125,58,1.09 235 | 1984,5280,808,2199,2094,128,51,1.11 236 | 1985,5439,837,2186,2237,131,49,1.12 237 | 1986,5607,831,2293,2300,137,46,1.14 238 | 1987,5752,894,2306,2364,143,44,1.15 239 | 1988,5965,937,2412,2414,152,50,1.17 240 | 1989,6097,985,2459,2457,156,41,1.17 241 | 1990,6127,1019,2492,2419,157,40,1.16 242 | 
1991,6217,1063,2605,2345,161,44,1.16 243 | 1992,6164,1095,2510,2357,167,35,1.13 244 | 1993,6162,1129,2523,2298,176,36,1.11 245 | 1994,6266,1139,2546,2358,186,38,1.11 246 | 1995,6398,1157,2565,2442,197,36,1.12 247 | 1996,6542,1209,2624,2469,203,37,1.13 248 | 1997,6651,1208,2700,2495,209,38,1.13 249 | 1998,6643,1243,2766,2391,209,35,1.12 250 | 1999,6610,1270,2737,2352,217,33,1.1 251 | 2000,6765,1288,2838,2367,226,45,1.11 252 | 2001,6927,1312,2840,2492,237,46,1.12 253 | 2002,6996,1344,2831,2521,252,48,1.12 254 | 2003,7416,1391,2959,2743,276,48,1.17 255 | 2004,7807,1437,3053,2967,298,53,1.21 256 | 2005,8093,1480,3076,3157,320,60,1.24 257 | 2006,8370,1525,3089,3339,356,61,1.27 258 | 2007,8566,1572,3081,3464,382,68,1.28 259 | 2008,8783,1631,3122,3571,388,71,1.3 260 | 2009,8740,1585,3056,3620,413,66,1.28 261 | 2010,9167,1702,3114,3842,450,59,1.33 -------------------------------------------------------------------------------- /wk2-neural-Networks/.ipynb_checkpoints/NN-3layers-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 27, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "class NeuralNetwork():\n", 23 | " def __init__(self):\n", 24 | " np.random.seed(1)\n", 25 | " \n", 26 | " # setting the number of nodes \n", 27 | " l2 = 5\n", 28 | " l3 = 4\n", 29 | " \n", 30 | " # initialize 3 weights\n", 31 | " self.synaptic_weights1 = 2 * np.random.random((3,l2)) - 1\n", 32 | " self.synaptic_weights2 = 2 * np.random.random((l2,l3)) - 1\n", 33 | " self.synaptic_weights3 = 2 * np.random.random((l3,1)) - 1\n", 34 | " \n", 35 | " self.activation_function = lambda x: 1 / (1 + np.exp(-x))\n", 36 | " self.derivative = lambda x: x * 
(1-x)\n", 37 | " \n", 38 | " def train(self, X, y, iterations):\n", 39 | " # Convert inputs list to 2d array\n", 40 | " #X = np.array(X, ndmin=2)\n", 41 | " #y = np.array(y, ndmin=2)\n", 42 | " \n", 43 | " for iter in range(iterations):\n", 44 | " # feed forward\n", 45 | " a2 = self.activation_function(np.dot(X, self.synaptic_weights1))\n", 46 | " a3 = self.activation_function(np.dot(a2, self.synaptic_weights2))\n", 47 | " output = self.activation_function(np.dot(a3, self.synaptic_weights3))\n", 48 | " \n", 49 | " # error\n", 50 | " delta4 = (y - output)*self.derivative(output)\n", 51 | " delta3 = np.dot(self.synaptic_weights3,delta4.T)*self.derivative(a3).T\n", 52 | " delta2 = np.dot(self.synaptic_weights2,delta3)*self.derivative(a2).T\n", 53 | " \n", 54 | " # adjustments\n", 55 | " adjustment3 = np.dot(a3.T, delta4)\n", 56 | " adjustment2 = np.dot(a2.T, delta3.T)\n", 57 | " adjustment1 = np.dot(X.T, delta2.T)\n", 58 | " \n", 59 | " # update weights\n", 60 | " self.synaptic_weights1 += adjustment1\n", 61 | " self.synaptic_weights2 += adjustment2\n", 62 | " self.synaptic_weights3 += adjustment3\n", 63 | " \n", 64 | " def run(self, X):\n", 65 | " # forward pass\n", 66 | " a2 = self.activation_function(np.dot(X, self.synaptic_weights1))\n", 67 | " a3 = self.activation_function(np.dot(a2, self.synaptic_weights2))\n", 68 | " output = self.activation_function(np.dot(a3, self.synaptic_weights3))\n", 69 | " \n", 70 | " return output\n", 71 | " " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 28, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Random starting synaptic weights (layer 1): \n", 86 | "[[-0.16595599 0.44064899 -0.99977125 -0.39533485 -0.70648822]\n", 87 | " [-0.81532281 -0.62747958 -0.30887855 -0.20646505 0.07763347]\n", 88 | " [-0.16161097 0.370439 -0.5910955 0.75623487 -0.94522481]]\n", 89 | "Random starting synaptic weights (layer 
2): \n", 90 | "[[ 0.34093502 -0.1653904 0.11737966 -0.71922612]\n", 91 | " [-0.60379702 0.60148914 0.93652315 -0.37315164]\n", 92 | " [ 0.38464523 0.7527783 0.78921333 -0.82991158]\n", 93 | " [-0.92189043 -0.66033916 0.75628501 -0.80330633]\n", 94 | " [-0.15778475 0.91577906 0.06633057 0.38375423]]\n", 95 | "Random starting synaptic weights (layer 3): \n", 96 | "[[-0.36896874]\n", 97 | " [ 0.37300186]\n", 98 | " [ 0.66925134]\n", 99 | " [-0.96342345]]\n", 100 | "\n", 101 | "New synaptic weights (layer 1) after training: \n", 102 | "[[-0.39042717 4.02220543 -1.52322523 2.40451717 -2.77177632]\n", 103 | " [-0.86817904 -0.33659723 -0.245578 -0.31292608 0.26079733]\n", 104 | " [-0.00600591 -1.69046817 0.12647375 -0.79367455 1.04614 ]]\n", 105 | "\n", 106 | "New synaptic weights (layer 2) after training: \n", 107 | "[[ 0.9614375 -0.15372521 -0.67703076 -0.00498486]\n", 108 | " [-2.7714058 0.77362787 2.71638353 -2.4249225 ]\n", 109 | " [ 1.88550044 0.70717346 -0.71729366 0.7730995 ]\n", 110 | " [-1.59473372 -0.55756571 1.23221965 -1.28695185]\n", 111 | " [ 1.92232578 0.86077523 -2.13676866 2.54238247]]\n", 112 | "\n", 113 | "New synaptic weights (layer 3) after training: \n", 114 | "[[-4.392069 ]\n", 115 | " [ 0.66563256]\n", 116 | " [ 5.76280212]\n", 117 | " [-3.88936424]]\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "NN = NeuralNetwork()\n", 123 | "\n", 124 | "print(\"Random starting synaptic weights (layer 1): \")\n", 125 | "print(NN.synaptic_weights1)\n", 126 | "print(\"Random starting synaptic weights (layer 2): \")\n", 127 | "print(NN.synaptic_weights2)\n", 128 | "print(\"Random starting synaptic weights (layer 3): \")\n", 129 | "print(NN.synaptic_weights3)\n", 130 | "\n", 131 | "inputs = np.array([[0,0,1],[1,1,1],[1,0,1],[0,1,1]])\n", 132 | "targets = np.array([[0,1,1,0]]).T\n", 133 | "\n", 134 | "NN.train(inputs,targets, 10000)\n", 135 | "\n", 136 | "print (\"\\nNew synaptic weights (layer 1) after training: \")\n", 137 | "print (NN.synaptic_weights1)\n", 
138 | "print (\"\\nNew synaptic weights (layer 2) after training: \")\n", 139 | "print (NN.synaptic_weights2)\n", 140 | "print (\"\\nNew synaptic weights (layer 3) after training: \")\n", 141 | "print (NN.synaptic_weights3)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 29, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "\n", 156 | "Predict new value [1,0,0]: \n", 157 | "[ 0.99650838]\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# test with new input\n", 163 | "print(\"\\nPredict new value [1,0,0]: \")\n", 164 | "print(NN.run(np.array([1,0,0])))" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.5.2" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /wk2-neural-Networks/.ipynb_checkpoints/feedForwardNN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "class NeuralNetwork():\n", 23 | " def __init__(self):\n", 24 | " np.random.seed(1) # Seed the random number generator\n", 25 | " self.weights = {} # Create dict to hold 
weights\n", 26 | " self.num_layers = 1 # Set initial number of layer to one (input layer)\n", 27 | " self.adjustments = {} # Create dict to hold adjustements\n", 28 | "\n", 29 | " def add_layer(self, shape):\n", 30 | " # Create weights with shape specified + biases\n", 31 | " self.weights[self.num_layers] = np.vstack((2 * np.random.random(shape) - 1, 2 * np.random.random((1, shape[1])) - 1))\n", 32 | " # Initialize the adjustements for these weights to zero\n", 33 | " self.adjustments[self.num_layers] = np.zeros(shape)\n", 34 | " self.num_layers += 1\n", 35 | "\n", 36 | " def __sigmoid(self, x):\n", 37 | " return 1 / (1 + np.exp(-x))\n", 38 | "\n", 39 | " def __sigmoid_derivative(self, x):\n", 40 | " return x * (1 - x)\n", 41 | "\n", 42 | " def predict(self, data):\n", 43 | " # Pass data through pretrained network\n", 44 | " for layer in range(1, self.num_layers+1):\n", 45 | " data = np.dot(data, self.weights[layer-1][:, :-1]) + self.weights[layer-1][:, -1] # + self.biases[layer]\n", 46 | " data = self.__sigmoid(data)\n", 47 | " return data\n", 48 | "\n", 49 | " def __forward_propagate(self, data):\n", 50 | " # Progapagate through network and hold values for use in back-propagation\n", 51 | " activation_values = {}\n", 52 | " activation_values[1] = data\n", 53 | " for layer in range(2, self.num_layers+1):\n", 54 | " data = np.dot(data.T, self.weights[layer-1][:-1, :]) + self.weights[layer-1][-1, :].T # + self.biases[layer]\n", 55 | " data = self.__sigmoid(data).T\n", 56 | " activation_values[layer] = data\n", 57 | " return activation_values\n", 58 | "\n", 59 | " def simple_error(self, outputs, targets):\n", 60 | " return targets - outputs\n", 61 | "\n", 62 | " def sum_squared_error(self, outputs, targets):\n", 63 | " return 0.5 * np.mean(np.sum(np.power(outputs - targets, 2), axis=1))\n", 64 | "\n", 65 | " def __back_propagate(self, output, target):\n", 66 | " deltas = {}\n", 67 | " # Delta of output Layer\n", 68 | " deltas[self.num_layers] = 
output[self.num_layers] - target\n", 69 | "\n", 70 | " # Delta of hidden Layers\n", 71 | " for layer in reversed(range(2, self.num_layers)): # All layers except input/output\n", 72 | " a_val = output[layer]\n", 73 | " weights = self.weights[layer][:-1, :]\n", 74 | " prev_deltas = deltas[layer+1]\n", 75 | " deltas[layer] = np.multiply(np.dot(weights, prev_deltas), self.__sigmoid_derivative(a_val))\n", 76 | "\n", 77 | " # Caclculate total adjustements based on deltas\n", 78 | " for layer in range(1, self.num_layers):\n", 79 | " self.adjustments[layer] += np.dot(deltas[layer+1], output[layer].T).T\n", 80 | "\n", 81 | " def __gradient_descente(self, batch_size, learning_rate):\n", 82 | " # Calculate partial derivative and take a step in that direction\n", 83 | " for layer in range(1, self.num_layers):\n", 84 | " partial_d = (1/batch_size) * self.adjustments[layer]\n", 85 | " self.weights[layer][:-1, :] += learning_rate * -partial_d\n", 86 | " self.weights[layer][-1, :] += learning_rate*1e-3 * -partial_d[-1, :]\n", 87 | "\n", 88 | "\n", 89 | " def train(self, inputs, targets, num_epochs, learning_rate=1, stop_accuracy=1e-5):\n", 90 | " error = []\n", 91 | " for iteration in range(num_epochs):\n", 92 | " for i in range(len(inputs)):\n", 93 | " x = inputs[i]\n", 94 | " y = targets[i]\n", 95 | " # Pass the training set through our neural network\n", 96 | " output = self.__forward_propagate(x)\n", 97 | "\n", 98 | " # Calculate the error\n", 99 | " loss = self.sum_squared_error(output[self.num_layers], y)\n", 100 | " error.append(loss)\n", 101 | "\n", 102 | " # Calculate Adjustements\n", 103 | " self.__back_propagate(output, y)\n", 104 | "\n", 105 | " self.__gradient_descente(i, learning_rate)\n", 106 | "\n", 107 | " # Check if accuarcy criterion is satisfied\n", 108 | " if np.mean(error[-(i+1):]) < stop_accuracy and iteration > 0:\n", 109 | " break\n", 110 | "\n", 111 | " return(np.asarray(error), iteration+1)\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | 
"execution_count": 3, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Error = 7.29967091123e-06\n", 126 | "Epoches needed to train = 62\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Create instance of a neural network\n", 132 | "nn = NeuralNetwork()\n", 133 | "\n", 134 | "# Add Layers (Input layer is created by default)\n", 135 | "nn.add_layer((2, 9))\n", 136 | "nn.add_layer((9, 1))\n", 137 | "\n", 138 | "# XOR function\n", 139 | "training_data = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 2, 1)\n", 140 | "training_labels = np.asarray([[0], [1], [1], [0]])\n", 141 | "\n", 142 | "error, iteration = nn.train(training_data, training_labels, 5000)\n", 143 | "print('Error = ', np.mean(error[-4:]))\n", 144 | "print('Epoches needed to train = ', iteration)\n" 145 | ] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": "Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.5.2" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | -------------------------------------------------------------------------------- /wk2-neural-Networks/NN-3layers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 27, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "class 
NeuralNetwork():\n", 23 | " def __init__(self):\n", 24 | " np.random.seed(1)\n", 25 | " \n", 26 | " # setting the number of nodes \n", 27 | " l2 = 5\n", 28 | " l3 = 4\n", 29 | " \n", 30 | " # initialize 3 weights\n", 31 | " self.synaptic_weights1 = 2 * np.random.random((3,l2)) - 1\n", 32 | " self.synaptic_weights2 = 2 * np.random.random((l2,l3)) - 1\n", 33 | " self.synaptic_weights3 = 2 * np.random.random((l3,1)) - 1\n", 34 | " \n", 35 | " self.activation_function = lambda x: 1 / (1 + np.exp(-x))\n", 36 | " self.derivative = lambda x: x * (1-x)\n", 37 | " \n", 38 | " def train(self, X, y, iterations):\n", 39 | " # Convert inputs list to 2d array\n", 40 | " #X = np.array(X, ndmin=2)\n", 41 | " #y = np.array(y, ndmin=2)\n", 42 | " \n", 43 | " for iter in range(iterations):\n", 44 | " # feed forward\n", 45 | " a2 = self.activation_function(np.dot(X, self.synaptic_weights1))\n", 46 | " a3 = self.activation_function(np.dot(a2, self.synaptic_weights2))\n", 47 | " output = self.activation_function(np.dot(a3, self.synaptic_weights3))\n", 48 | " \n", 49 | " # error\n", 50 | " delta4 = (y - output)*self.derivative(output)\n", 51 | " delta3 = np.dot(self.synaptic_weights3,delta4.T)*self.derivative(a3).T\n", 52 | " delta2 = np.dot(self.synaptic_weights2,delta3)*self.derivative(a2).T\n", 53 | " \n", 54 | " # adjustments\n", 55 | " adjustment3 = np.dot(a3.T, delta4)\n", 56 | " adjustment2 = np.dot(a2.T, delta3.T)\n", 57 | " adjustment1 = np.dot(X.T, delta2.T)\n", 58 | " \n", 59 | " # update weights\n", 60 | " self.synaptic_weights1 += adjustment1\n", 61 | " self.synaptic_weights2 += adjustment2\n", 62 | " self.synaptic_weights3 += adjustment3\n", 63 | " \n", 64 | " def run(self, X):\n", 65 | " # forward pass\n", 66 | " a2 = self.activation_function(np.dot(X, self.synaptic_weights1))\n", 67 | " a3 = self.activation_function(np.dot(a2, self.synaptic_weights2))\n", 68 | " output = self.activation_function(np.dot(a3, self.synaptic_weights3))\n", 69 | " \n", 70 | " return output\n", 
71 | " " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 28, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Random starting synaptic weights (layer 1): \n", 86 | "[[-0.16595599 0.44064899 -0.99977125 -0.39533485 -0.70648822]\n", 87 | " [-0.81532281 -0.62747958 -0.30887855 -0.20646505 0.07763347]\n", 88 | " [-0.16161097 0.370439 -0.5910955 0.75623487 -0.94522481]]\n", 89 | "Random starting synaptic weights (layer 2): \n", 90 | "[[ 0.34093502 -0.1653904 0.11737966 -0.71922612]\n", 91 | " [-0.60379702 0.60148914 0.93652315 -0.37315164]\n", 92 | " [ 0.38464523 0.7527783 0.78921333 -0.82991158]\n", 93 | " [-0.92189043 -0.66033916 0.75628501 -0.80330633]\n", 94 | " [-0.15778475 0.91577906 0.06633057 0.38375423]]\n", 95 | "Random starting synaptic weights (layer 3): \n", 96 | "[[-0.36896874]\n", 97 | " [ 0.37300186]\n", 98 | " [ 0.66925134]\n", 99 | " [-0.96342345]]\n", 100 | "\n", 101 | "New synaptic weights (layer 1) after training: \n", 102 | "[[-0.39042717 4.02220543 -1.52322523 2.40451717 -2.77177632]\n", 103 | " [-0.86817904 -0.33659723 -0.245578 -0.31292608 0.26079733]\n", 104 | " [-0.00600591 -1.69046817 0.12647375 -0.79367455 1.04614 ]]\n", 105 | "\n", 106 | "New synaptic weights (layer 2) after training: \n", 107 | "[[ 0.9614375 -0.15372521 -0.67703076 -0.00498486]\n", 108 | " [-2.7714058 0.77362787 2.71638353 -2.4249225 ]\n", 109 | " [ 1.88550044 0.70717346 -0.71729366 0.7730995 ]\n", 110 | " [-1.59473372 -0.55756571 1.23221965 -1.28695185]\n", 111 | " [ 1.92232578 0.86077523 -2.13676866 2.54238247]]\n", 112 | "\n", 113 | "New synaptic weights (layer 3) after training: \n", 114 | "[[-4.392069 ]\n", 115 | " [ 0.66563256]\n", 116 | " [ 5.76280212]\n", 117 | " [-3.88936424]]\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "NN = NeuralNetwork()\n", 123 | "\n", 124 | "print(\"Random starting synaptic weights (layer 1): \")\n", 125 | 
"print(NN.synaptic_weights1)\n", 126 | "print(\"Random starting synaptic weights (layer 2): \")\n", 127 | "print(NN.synaptic_weights2)\n", 128 | "print(\"Random starting synaptic weights (layer 3): \")\n", 129 | "print(NN.synaptic_weights3)\n", 130 | "\n", 131 | "inputs = np.array([[0,0,1],[1,1,1],[1,0,1],[0,1,1]])\n", 132 | "targets = np.array([[0,1,1,0]]).T\n", 133 | "\n", 134 | "NN.train(inputs,targets, 10000)\n", 135 | "\n", 136 | "print (\"\\nNew synaptic weights (layer 1) after training: \")\n", 137 | "print (NN.synaptic_weights1)\n", 138 | "print (\"\\nNew synaptic weights (layer 2) after training: \")\n", 139 | "print (NN.synaptic_weights2)\n", 140 | "print (\"\\nNew synaptic weights (layer 3) after training: \")\n", 141 | "print (NN.synaptic_weights3)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 29, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "\n", 156 | "Predict new value [1,0,0]: \n", 157 | "[ 0.99650838]\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# test with new input\n", 163 | "print(\"\\nPredict new value [1,0,0]: \")\n", 164 | "print(NN.run(np.array([1,0,0])))" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.5.2" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /wk2-neural-Networks/feedForwardNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "class NeuralNetwork():\n", 23 | " def __init__(self):\n", 24 | " np.random.seed(1) # Seed the random number generator\n", 25 | " self.weights = {} # Create dict to hold weights\n", 26 | " self.num_layers = 1 # Set initial number of layer to one (input layer)\n", 27 | " self.adjustments = {} # Create dict to hold adjustements\n", 28 | "\n", 29 | " def add_layer(self, shape):\n", 30 | " # Create weights with shape specified + biases\n", 31 | " self.weights[self.num_layers] = np.vstack((2 * np.random.random(shape) - 1, 2 * np.random.random((1, shape[1])) - 1))\n", 32 | " # Initialize the adjustements for these weights to zero\n", 33 | " self.adjustments[self.num_layers] = np.zeros(shape)\n", 34 | " self.num_layers += 1\n", 35 | "\n", 36 | " def __sigmoid(self, x):\n", 37 | " return 1 / (1 + np.exp(-x))\n", 38 | "\n", 39 | " def __sigmoid_derivative(self, x):\n", 40 | " return x * (1 - x)\n", 41 | "\n", 42 | " def predict(self, data):\n", 43 | " # Pass data through pretrained network\n", 44 | " for layer in range(1, self.num_layers+1):\n", 45 | " data = np.dot(data, self.weights[layer-1][:, :-1]) + self.weights[layer-1][:, -1] # + self.biases[layer]\n", 46 | " data = self.__sigmoid(data)\n", 47 | " return data\n", 48 | "\n", 49 | " def __forward_propagate(self, data):\n", 50 | " # Progapagate through network and hold values for use in back-propagation\n", 51 | " activation_values = {}\n", 52 | " activation_values[1] = data\n", 53 | " for layer in range(2, self.num_layers+1):\n", 54 | " data = np.dot(data.T, self.weights[layer-1][:-1, :]) + self.weights[layer-1][-1, :].T # + self.biases[layer]\n", 55 | " data = 
self.__sigmoid(data).T\n", 56 | " activation_values[layer] = data\n", 57 | " return activation_values\n", 58 | "\n", 59 | " def simple_error(self, outputs, targets):\n", 60 | " return targets - outputs\n", 61 | "\n", 62 | " def sum_squared_error(self, outputs, targets):\n", 63 | " return 0.5 * np.mean(np.sum(np.power(outputs - targets, 2), axis=1))\n", 64 | "\n", 65 | " def __back_propagate(self, output, target):\n", 66 | " deltas = {}\n", 67 | " # Delta of output Layer\n", 68 | " deltas[self.num_layers] = output[self.num_layers] - target\n", 69 | "\n", 70 | " # Delta of hidden Layers\n", 71 | " for layer in reversed(range(2, self.num_layers)): # All layers except input/output\n", 72 | " a_val = output[layer]\n", 73 | " weights = self.weights[layer][:-1, :]\n", 74 | " prev_deltas = deltas[layer+1]\n", 75 | " deltas[layer] = np.multiply(np.dot(weights, prev_deltas), self.__sigmoid_derivative(a_val))\n", 76 | "\n", 77 | " # Caclculate total adjustements based on deltas\n", 78 | " for layer in range(1, self.num_layers):\n", 79 | " self.adjustments[layer] += np.dot(deltas[layer+1], output[layer].T).T\n", 80 | "\n", 81 | " def __gradient_descente(self, batch_size, learning_rate):\n", 82 | " # Calculate partial derivative and take a step in that direction\n", 83 | " for layer in range(1, self.num_layers):\n", 84 | " partial_d = (1/batch_size) * self.adjustments[layer]\n", 85 | " self.weights[layer][:-1, :] += learning_rate * -partial_d\n", 86 | " self.weights[layer][-1, :] += learning_rate*1e-3 * -partial_d[-1, :]\n", 87 | "\n", 88 | "\n", 89 | " def train(self, inputs, targets, num_epochs, learning_rate=1, stop_accuracy=1e-5):\n", 90 | " error = []\n", 91 | " for iteration in range(num_epochs):\n", 92 | " for i in range(len(inputs)):\n", 93 | " x = inputs[i]\n", 94 | " y = targets[i]\n", 95 | " # Pass the training set through our neural network\n", 96 | " output = self.__forward_propagate(x)\n", 97 | "\n", 98 | " # Calculate the error\n", 99 | " loss = 
self.sum_squared_error(output[self.num_layers], y)\n", 100 | " error.append(loss)\n", 101 | "\n", 102 | " # Calculate Adjustements\n", 103 | " self.__back_propagate(output, y)\n", 104 | "\n", 105 | " self.__gradient_descente(i, learning_rate)\n", 106 | "\n", 107 | " # Check if accuarcy criterion is satisfied\n", 108 | " if np.mean(error[-(i+1):]) < stop_accuracy and iteration > 0:\n", 109 | " break\n", 110 | "\n", 111 | " return(np.asarray(error), iteration+1)\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Error = 7.29967091123e-06\n", 126 | "Epoches needed to train = 62\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Create instance of a neural network\n", 132 | "nn = NeuralNetwork()\n", 133 | "\n", 134 | "# Add Layers (Input layer is created by default)\n", 135 | "nn.add_layer((2, 9))\n", 136 | "nn.add_layer((9, 1))\n", 137 | "\n", 138 | "# XOR function\n", 139 | "training_data = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 2, 1)\n", 140 | "training_labels = np.asarray([[0], [1], [1], [0]])\n", 141 | "\n", 142 | "error, iteration = nn.train(training_data, training_labels, 5000)\n", 143 | "print('Error = ', np.mean(error[-4:]))\n", 144 | "print('Epoches needed to train = ', iteration)\n" 145 | ] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": "Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.5.2" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | 
-------------------------------------------------------------------------------- /wk5-speed-dating/.ipynb_checkpoints/Speed dating prediction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Speed dating predicton\n", 8 | " - [Kaggle Speed dating experiment](https://www.kaggle.com/annavictoria/speed-dating-experiment)\n", 9 | " - Learning fun [Siraj's DL #5](https://www.youtube.com/watch?v=koiTTim4M-s)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
iididgenderidgcondtnwaveroundpositionpositin1order...attr3_3sinc3_3intel3_3fun3_3amb3_3attr5_3sinc5_3intel5_3fun5_3amb5_3
011.00111107NaN4...5.07.07.07.07.0NaNNaNNaNNaNNaN
111.00111107NaN3...5.07.07.07.07.0NaNNaNNaNNaNNaN
211.00111107NaN10...5.07.07.07.07.0NaNNaNNaNNaNNaN
311.00111107NaN5...5.07.07.07.07.0NaNNaNNaNNaNNaN
411.00111107NaN7...5.07.07.07.07.0NaNNaNNaNNaNNaN
\n", 187 | "

5 rows × 195 columns

\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " iid id gender idg condtn wave round position positin1 order \\\n", 192 | "0 1 1.0 0 1 1 1 10 7 NaN 4 \n", 193 | "1 1 1.0 0 1 1 1 10 7 NaN 3 \n", 194 | "2 1 1.0 0 1 1 1 10 7 NaN 10 \n", 195 | "3 1 1.0 0 1 1 1 10 7 NaN 5 \n", 196 | "4 1 1.0 0 1 1 1 10 7 NaN 7 \n", 197 | "\n", 198 | " ... attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 \\\n", 199 | "0 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 200 | "1 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 201 | "2 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 202 | "3 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 203 | "4 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 204 | "\n", 205 | " intel5_3 fun5_3 amb5_3 \n", 206 | "0 NaN NaN NaN \n", 207 | "1 NaN NaN NaN \n", 208 | "2 NaN NaN NaN \n", 209 | "3 NaN NaN NaN \n", 210 | "4 NaN NaN NaN \n", 211 | "\n", 212 | "[5 rows x 195 columns]" 213 | ] 214 | }, 215 | "execution_count": 2, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "df =pd.read_csv('Speed Dating Data.csv', encoding=\"ISO-8859-1\")\n", 222 | "df.head()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 3, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "0 0\n", 236 | "1 0\n", 237 | "2 1\n", 238 | "3 1\n", 239 | "4 1\n", 240 | "Name: match, dtype: int64" 241 | ] 242 | }, 243 | "execution_count": 3, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "df['match'].head()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "(8378, 195)" 263 | ] 264 | }, 265 | "execution_count": 4, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "df.shape" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | 
"execution_count": 5, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "\n", 286 | "RangeIndex: 8378 entries, 0 to 8377\n", 287 | "Columns: 195 entries, iid to amb5_3\n", 288 | "dtypes: float64(174), int64(13), object(8)\n", 289 | "memory usage: 12.5+ MB\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "df.info()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### First of all, let's just seperate features and labels" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 6, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "df, df_labels = df.drop(['match'], axis=1), df['match']" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "# 1. Preprocessing Data\n", 320 | " - 1.1 Cleaning\n", 321 | " - 1.2 Transformation\n", 322 | " - 1.3 Reduction by PCA" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## 1.1 Cleaning" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "### Cleaning null features\n", 337 | "If a feature has more than 30% (2513) of values are null, we just drop the whole column. 
" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 7, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "194\n", 352 | "194\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "na_sum = list(df.isnull().sum())\n", 358 | "print(len(na_sum))\n", 359 | "#na_col = list(df.isnull().sum().index)\n", 360 | "#print(len(na_col))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 8, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "We can drop 83 Columns\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "drop_col =[]\n", 380 | "for i in range(len(na_sum)):\n", 381 | " if na_sum[i] > 2523:\n", 382 | " drop_col.append(na_col[i])\n", 383 | "print(\"We can drop \",len(drop_col),\" Columns\")" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 9, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/html": [ 396 | "
\n", 397 | "\n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | "
iididgenderidgcondtnwaveroundpositionpositin1order...sinc1_2intel1_2fun1_2amb1_2shar1_2attr3_2sinc3_2intel3_2fun3_2amb3_2
011.00111107NaN4...16.6713.8922.2211.1116.676.07.08.07.06.0
111.00111107NaN3...16.6713.8922.2211.1116.676.07.08.07.06.0
211.00111107NaN10...16.6713.8922.2211.1116.676.07.08.07.06.0
311.00111107NaN5...16.6713.8922.2211.1116.676.07.08.07.06.0
411.00111107NaN7...16.6713.8922.2211.1116.676.07.08.07.06.0
\n", 547 | "

5 rows × 111 columns

\n", 548 | "
" 549 | ], 550 | "text/plain": [ 551 | " iid id gender idg condtn wave round position positin1 order \\\n", 552 | "0 1 1.0 0 1 1 1 10 7 NaN 4 \n", 553 | "1 1 1.0 0 1 1 1 10 7 NaN 3 \n", 554 | "2 1 1.0 0 1 1 1 10 7 NaN 10 \n", 555 | "3 1 1.0 0 1 1 1 10 7 NaN 5 \n", 556 | "4 1 1.0 0 1 1 1 10 7 NaN 7 \n", 557 | "\n", 558 | " ... sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr3_2 sinc3_2 \\\n", 559 | "0 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 560 | "1 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 561 | "2 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 562 | "3 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 563 | "4 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 564 | "\n", 565 | " intel3_2 fun3_2 amb3_2 \n", 566 | "0 8.0 7.0 6.0 \n", 567 | "1 8.0 7.0 6.0 \n", 568 | "2 8.0 7.0 6.0 \n", 569 | "3 8.0 7.0 6.0 \n", 570 | "4 8.0 7.0 6.0 \n", 571 | "\n", 572 | "[5 rows x 111 columns]" 573 | ] 574 | }, 575 | "execution_count": 9, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "df = df.drop(drop_col,axis=1)\n", 582 | "df.head()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "### Imputing null values with mean" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 10, 595 | "metadata": { 596 | "collapsed": false 597 | }, 598 | "outputs": [], 599 | "source": [ 600 | "df = df.fillna(df.mean())" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 11, 606 | "metadata": { 607 | "collapsed": false 608 | }, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "True" 614 | ] 615 | }, 616 | "execution_count": 11, 617 | "metadata": {}, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "#check if any NaN values\n", 623 | "df.isnull().values.any()" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "#### This means there are still 
columns which have null values. Let's further check." 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 12, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "['field', 'race', 'imprace', 'income']" 644 | ] 645 | }, 646 | "execution_count": 12, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "na_sum = list(df.isnull().sum())\n", 653 | "#na_col = list(df.isnull().sum().index)\n", 654 | "nan_col =[]\n", 655 | "for i in range(len(na_sum)):\n", 656 | " if na_sum[i] > 0:\n", 657 | " nan_col.append(na_col[i])\n", 658 | "nan_col" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 13, 664 | "metadata": { 665 | "collapsed": false 666 | }, 667 | "outputs": [ 668 | { 669 | "data": { 670 | "text/html": [ 671 | "
\n", 672 | "\n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | "
fieldfromzipcodecareer
0LawChicago60,521lawyer
1LawChicago60,521lawyer
2LawChicago60,521lawyer
3LawChicago60,521lawyer
4LawChicago60,521lawyer
\n", 720 | "
" 721 | ], 722 | "text/plain": [ 723 | " field from zipcode career\n", 724 | "0 Law Chicago 60,521 lawyer\n", 725 | "1 Law Chicago 60,521 lawyer\n", 726 | "2 Law Chicago 60,521 lawyer\n", 727 | "3 Law Chicago 60,521 lawyer\n", 728 | "4 Law Chicago 60,521 lawyer" 729 | ] 730 | }, 731 | "execution_count": 13, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "df[['field', 'from', 'zipcode', 'career']].head()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "#### These columns are values with object type. It is hard to predict null values for these variables. So, let's just drop all these variables. " 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 14, 750 | "metadata": { 751 | "collapsed": true 752 | }, 753 | "outputs": [], 754 | "source": [ 755 | "df = df.drop(['from','zipcode','field','career'], axis=1)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 15, 761 | "metadata": { 762 | "collapsed": false 763 | }, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/plain": [ 768 | "False" 769 | ] 770 | }, 771 | "execution_count": 15, 772 | "metadata": {}, 773 | "output_type": "execute_result" 774 | } 775 | ], 776 | "source": [ 777 | "df.isnull().values.any()" 778 | ] 779 | }, 780 | { 781 | "cell_type": "markdown", 782 | "metadata": {}, 783 | "source": [ 784 | "#### We have sucessfully cleaned all null variables in the dataset. 
" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## 1.2 Transformation" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "metadata": {}, 797 | "source": [ 798 | "### Normalize data" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 16, 804 | "metadata": { 805 | "collapsed": true 806 | }, 807 | "outputs": [], 808 | "source": [ 809 | "from sklearn.preprocessing import StandardScaler" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 17, 815 | "metadata": { 816 | "collapsed": false 817 | }, 818 | "outputs": [], 819 | "source": [ 820 | "X = StandardScaler().fit_transform(df)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 18, 826 | "metadata": { 827 | "collapsed": false 828 | }, 829 | "outputs": [ 830 | { 831 | "data": { 832 | "text/plain": [ 833 | "(8378, 107)" 834 | ] 835 | }, 836 | "execution_count": 18, 837 | "metadata": {}, 838 | "output_type": "execute_result" 839 | } 840 | ], 841 | "source": [ 842 | "X.shape" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "## 1.3 Reduction" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "### PCA\n", 857 | " - Find out what is fairly good value for n_components according to the Explained Variance Ratio\n", 858 | " - Reduce dimensions by the n_components" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 19, 864 | "metadata": { 865 | "collapsed": false 866 | }, 867 | "outputs": [ 868 | { 869 | "data": { 870 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYwAAAEKCAYAAAAB0GKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt4XXWd7/H3NzuXnXuae9qmbUrTlnLH2hZF5SIKeOno\noIADCs4Mwwhej+PozJyH0Tnn6DPjeI56PFQUUEaUAQStWG6DFxBo6QVoaUtLm1KaXtNbLs09+Z4/\n1gps0qRZLd3Zyc7n9Tzr2Wuv9Vt7f3+95Ju1fjdzd0REREaSkeoARERkfFDCEBGRSJQwREQkEiUM\nERGJRAlDREQiUcIQEZFIlDBERCQSJQwREYlECUNERCLJTHUAJ1N5ebnPmDEj1WGIiIwbq1ev3u/u\nFVHKplXCmDFjBqtWrUp1GCIi44aZbY9aVo+kREQkEiUMERGJRAlDREQiUcIQEZFIkpowzOxSM9tk\nZlvM7KtDnDcz+154fq2ZnZtwrsTM7jezl81so5mdl8xYRUTk2JKWMMwsBvwAuAyYB1xtZvMGFbsM\nqA+3G4BbE859F3jE3ecCZwEbkxWriIiMLJl3GAuALe7e4O7dwD3A4kFlFgN3eWA5UGJmNWZWDLwb\nuB3A3bvd/XASYxURkREkM2FMAXYkvG8Mj0UpUwc0AXea2fNm9mMzy09GkP39zv/93Sv8cXNTMj5e\nRCRtjNVG70zgXOBWdz8HOAIc1QYCYGY3mNkqM1vV1HT8P/QzMowfPtnA71/e95YCFhFJd8lMGDuB\n2oT3U8NjUco0Ao3uviI8fj9BAjmKu9/m7vPdfX5FRaTR7UepKoqzt6XzhK4VEZkokpkwVgL1ZlZn\nZtnAVcDSQWWWAp8Me0stAprdfbe77wF2mNmcsNzFwIZkBVpVlKOEISIygqTNJeXuvWZ2M/AoEAPu\ncPf1ZnZjeH4JsAy4HNgCtAPXJ3zEZ4G7w2TTMOjcSVVVGGfFtoPJ+ngRkbSQ1MkH3X0ZQVJIPLYk\nYd+Bm4a59gVgfjLjG1BZFGdfayfujpmNxleKiIw7Y7XRe1RVFubQ0+ccau9JdSgiImOWEgZBozeg\ndgwRkWNQwiBo9AbY19qV4khERMYuJQx0hyEiEoUSBlBRGN5hKGGIiAxLCQOIZ8Uoyctib4seSYmI\nDEcJI1RZqMF7IiLHooQRqiqKs1eN3iIiw1LCCFUWxmnSHYaIyLCUMEJVRTnsa+2iv99THYqIyJik\nhBGqKorT2+8cbO9OdSgiImOSEkZoYPCeGr5FRIamhBGqKAwG7+1T11oRkSEpYYR0hyEicmxKGKHX\nR3ura62IyJCUMEI5mTFK87N1hyEiMgwljATBaG/dYYiIDEUJI0FVuPKeiIgcTQkjgeaTEhEZnhJG\ngqqiOE2tXfRptLeIyFGUMBJUFeXQ73CgTe0YIiKDKWEkqAxX3lPXWhGRoylhJNBSrSIiw1PCSDAw\n2nvX4Y4URyIiMvYkNWGY2aVmtsnMtpjZV4c4b2b2vfD8WjM7N+Hcq2a2zsxeMLNVyYxzQHVRnPKC\nHNa8dng0vk5EZFzJTNYHm1kM+AFwCdAIrDSzpe6+IaHYZUB9uC0Ebg1fB1zo7vuTFeNgZsbCulKW\nNxzA3TGz0fpqEZExL5l3GAuALe7e4O7dwD3A4kFlFgN3eWA5UGJmNUmMaUSLZpayu7mTHQf1WEpE\nJFEyE8YUYEfC+8bwWNQyDvyXma02sxuSFuUgC2eWAbB824HR+koRkXFhLDd6n+/uZxM8trrJzN49\nVCEzu8HMVpnZqqamprf8pfWVBZTmZ7O8QQlDRCRRMhPGTqA24f3U8FikMu4+8LoPeJDgEddR3P02\nd5/v7vMrKirectBmxoIZpaxoOPiWP0tEJJ0kM2GsBOrNrM7MsoGrgKWDyiwFPhn2lloENLv7bjPL\nN7NCADPLB94HvJTEWN9k0cxSdh7uYMfB9tH6ShGRMS9pvaTcv
dfMbgYeBWLAHe6+3sxuDM8vAZYB\nlwNbgHbg+vDyKuDBsJdSJvBzd38kWbEONtCOsWLbQWpL80bra0VExrSkJQwAd19GkBQSjy1J2Hfg\npiGuawDOSmZsxzKnqpCSvCxWNBzgirdNTVUYIiJjylhu9E6ZjIywHWOb2jFERAYoYQxj4cwyXjvY\nrmlCRERCIyYMM5tqZg+aWZOZ7TOzX5pZ2j+nmT99EgBrGzVNiIgIRLvDuJOgN1MNMBn4TXgsrc0o\nywfQiG8RkVCUhFHh7ne6e2+4/QR46wMexrjivCyK4pm8pq61IiJAtIRxwMyuMbNYuF0DTIhh0NPK\n8pQwRERCURLGp4GPA3uA3cAVvDFeIq1NK83T4D0RkdCI4zDcfTvw4VGIZcypLc3jvzbso6/fiWVo\nqnMRmdiGTRhm9hV3/1cz+z7BzLFv4u6fS2pkY8C00jy6+/rZ29LJ5JLcVIcjIpJSx7rD2Bi+jspq\nd2PRtHBakNcOtithiMiEN2zCcPffhLvt7n5f4jkz+1hSoxojBhLGjoPtLArnlxIRmaiiNHp/LeKx\ntDO5JJcMQw3fIiIcuw3jMoKZZKeY2fcSThUBvckObCzIimVQU5yrrrUiIhy7DWMXQfvFh4HVCcdb\ngS8mM6ixZFqpxmKIiMCx2zBeBF40s5+7e88oxjSmTCvN44mX96U6DBGRlIuyHsYMM/smMA+IDxx0\n95lJi2oMmVaWx/62Ltq7e8nLTuryISIiY1rUyQdvJWi3uBC4C/hZMoMaS2pf7ymlSQhFZGKLkjBy\n3f0JwNx9u7v/M/CB5IY1diSOxRARmciiPGPpMrMM4JVwje6dQEFywxo7EsdiiIhMZFHuMD4P5AGf\nA94GXAN8KplBjSWT8rIoyNE05yIix7zDMLMYcKW7fxloY4LMUpvIzKjVrLUiIse+w3D3PuD8UYpl\nzJpWqsF7IiJR2jCeN7OlwH3AkYGD7v5A0qIaY2on5fGHTU24O2aa5lxEJqYoCSNOsMLeRQnHHJgw\nCWNaWR5dvf3sa+2iqig+8gUiImkoygJKJ9xuYWaXAt8FYsCP3f1bg85beP5yoB24zt3XJJyPEUxP\nstPdP3iicbxVp9YUAfD8a4e59PTqVIUhIpJSUXpJnZDwh/0PgMsIRolfbWbzBhW7DKgPtxsIBggm\n+jxvrMuRMmfXlpCfHePpLftTHYqISMokLWEAC4At7t7g7t3APcDiQWUWA3d5YDlQYmY1AGY2lWCA\n4I+TGGMkWbEMFs4s409KGCIygSUzYUwBdiS8bwyPRS3zf4CvAP3JCvB4vHNWOdv2H6HxkHpLicjE\nNGLCMLMqM7vdzB4O388zs79MZlBm9kFgn7uvjlD2BjNbZWarmpqakhbTu+rLAXhmy4GkfYeIyFgW\n5Q7jJ8CjwOTw/WbgCxGu2wnUJryfGh6LUuadwIfN7FWCR1kXmdmQEx66+23uPt/d51dUVEQI68TU\nVxZQUZjDU3osJSITVJSEUe7u9xI+GnL3XqAvwnUrgXozqzOzbOAqYOmgMkuBT1pgEdDs7rvd/Wvu\nPtXdZ4TX/c7dr4lYp6QwM86fVc4zW/bT3++pDEVEJCWiJIwjZlZGMPaCgR/sI10UJpabCe5ONgL3\nuvt6M7vRzG4Miy0DGoAtwI+Azxx/FUbP+bPKOXCkm417WlIdiojIqIsycO9LBHcCp5jZ00AFcEWU\nD3f3ZQRJIfHYkoR9B24a4TP+APwhyvcl2ztnBe0YT2/Zz2mTi1McjYjI6BrxDiMcSPce4B3A3wCn\nufvaZAc2FlUXx5lVWcCf1PAtIhNQlF5SNwEF7r7e3V8CCsxsTD86SqbzZ5Xz3LYDdPVGacYREUkf\nUdow/trdDw+8cfdDwF8nL6Sx7bxTyujs6Wdt44jNOCIiaSVKwohZwhSt4ZQf2ckLaWxbMKMUgOVb\n9VhKRCaWKAnjEeA/zexiM
7sY+EV4bEKalJ/N3OpCVmw7mOpQRERGVZReUn9P0Nj9t+H7xxkD8zul\n0sK6Uu5d1UhPXz9ZsWTOriIiMnZE6SXV7+63uvsV4fbDcCW+CWvRzDI6evrUjiEiE0qUXlLvNLPH\nzWyzmTWY2TYzaxiN4MaqBXVhO0aD2jFEZOKI8jzlduA7BGt7vx2YH75OWGUFOdRXFqgdQ0QmlCht\nGM3u/nDSIxlnFs4s5cE1O+nt6ydT7RgiMgFE+Un3ezP7NzM7z8zOHdiSHtkYt7CujCPdfby0S/NK\nicjEEOUOY2H4Oj/hmAMXnfxwxo+FM4N2jBUNBzi7tiTF0YiIJN+ICcPdLxyNQMabysI4MyvyWd5w\ngL95zympDkdEJOmi3GFgZh8ATgPiA8fc/RvJCmq8WDCjlIdf2oO7kzAYXkQkLUXpVrsEuBL4LGDA\nx4DpSY5rXJhbXUhzRw/7WrtSHYqISNJFafR+h7t/Ejjk7l8HzgNmJzes8WF2VSEAm/e2pjgSEZHk\ni5IwOsLXdjObDPQANckLafyYXR0kjE17lDBEJP1FacN4yMxKgH8D1hD0kJrQc0kNKC/IoTQ/m1f2\ntqU6FBGRpIvSS+pfwt1fmtlDQNzdNYlSqL6ygM37dIchIulv2IRhZhe5++/M7KNDnMPdH0huaOPD\nnOpCHlizUz2lRCTtHesO4z3A74APDXHOASUMoL6qkLauXnY1dzKlJDfV4YiIJM2wCcPdbzGzDOBh\nd793FGMaV2ZXFgBBTyklDBFJZ8fsJeXu/cBXRimWcen1rrXqKSUiaS5Kt9r/MrMvm1mtmZUObEmP\nbJyYlJ9NRWEOm9VTSkTSXJSEcSVwE/AksDrcVkX5cDO71Mw2mdkWM/vqEOfNzL4Xnl87MAuumcXN\n7Dkze9HM1pvZ16NXafTNrirgFfWUEpE0F6Vbbd2JfLCZxYAfAJcAjcBKM1vq7hsSil0G1IfbQuDW\n8LULuMjd28wsC/iTmT3s7stPJJZkm11VyD3P7aC/38nIUE8pEUlPUScfPB2Yx5snH7xrhMsWAFvc\nvSH8jHuAxUBiwlgM3OXuDiw3sxIzq3H33cDAM56scPMosabC7KpCOnr6aDzUwbSyvFSHIyKSFFEm\nH7wF+H64XQj8K/DhCJ89BdiR8L4xPBapjJnFzOwFYB/wuLuviPCdKTG76o2eUiIi6SpKG8YVwMXA\nHne/HjgLKE5qVIC797n72cBUYEF4l3MUM7vBzFaZ2aqmpqZkhzWk+rCn1CYlDBFJY5EmHwy71/aa\nWRHBb/y1Ea7bOajc1PDYcZVx98PA74FLh/oSd7/N3ee7+/yKiooIYZ18RfEsaorjvKKEISJpLErC\nWBVOPvgjgh5Sa4BnI1y3Eqg3szozywauApYOKrMU+GTYW2oR0Ozuu82sIvxOzCyXoOH85WhVSo15\nNUU8v+MwQXOMiEj6idJL6jPh7hIzewQocve1Ea7rNbObgUeBGHCHu683sxvD80uAZcDlwBagHbg+\nvLwG+GnY0yoDuNfdHzq+qo2uC+dW8sTL+3hlX9vrg/lERNLJiAnDzJYC9wC/dvdXj+fD3X0ZQVJI\nPLYkYd8JxngMvm4tcM7xfFeqXTKvin/61Us8vmGvEoaIpKUoj6T+HTgf2GBm95vZFWYWH+miiaaq\nKM5ZtSU8tn5PqkMREUmKEROGu/8xfCw1E/gh8HGChm8Z5H3zqnixsZk9zZ2pDkVE5KSLcocx0PD8\n58CNwNuBnyYzqPHqffOqAHh8494URyIicvJFGbh3L7ARuAj4v8Ap7v7ZZAc2Hs2qLKCuPF+PpUQk\nLUWZGuR24Gp370t2MOOdmXHJvCrufHobLZ09FMWzUh2SiMhJE6UN41Eli+jeN6+Knj7nD5tSM+pc\nRCRZIrVhSHTnTJtEeUEOdy/frkF8IpJWlDBOsliG8aVLZrNi20F+/txrqQ5HROSkGbYNY2A
xo+G4\n+5qTH056uHpBLb9dt4tvLnuZC+ZUaq1vEUkLx7rD+Pdw+wGwAriNYD6pFeExGYaZ8a2Pnkm/O197\nYJ0eTYlIWhg2Ybj7he5+IbAbODecEfZtBFN2DJ51VgapLc3j7y+dy5Obm/j1C7tSHY6IyFsWpQ1j\njruvG3jj7i8BpyYvpPRx7aLpzCzP5z9X7hi5sIjIGBclYaw1sx+b2QXh9iNgxNlqBTIyjA+cWcOK\nbQfY39aV6nBERN6SKAnjemA98Plw28Ab05DLCC4/o4Z+h0de0uhvERnfoqyH0WlmS4Bl7r5pFGJK\nK3OrC5lZns+ydbu5ZtH0VIcjInLCoswl9WHgBeCR8P3Z4RoZEoGZcfkZNSxv0GMpERnfojySugVY\nABwGcPcXgLpkBpVuLjujmn6Hx9ZrFlsRGb+iJIwed28edEwDC47DvJoiZpTlsWzd7lSHIiJywqIk\njPVm9gkgZmb1ZvZ94Jkkx5VWBh5LPdtwgINHulMdjojICYmSMD4LnAZ0Ab8AWoAvJDOodPSBM2vo\n63eW/HFrqkMRETkhUXpJtQP/GG5ygk6bXMwnFk7jticbOLWmkI+cMzXVIYmIHJcRE4aZzQa+DMxI\nLO/uFyUvrPT09Q+fRkNTG3//y3VML8vn3GmTUh2SiEhkUR5J3Qc8D/wT8HcJmxynrFgGt/7F26gu\ninPDXavVzVZExpUoCaPX3W919+fcffXAlvTI0tSk/GyWXPM29rd1cd+qxlSHIyISWZSE8Rsz+4yZ\n1ZhZ6cAW5cPN7FIz22RmW8zsq0OcNzP7Xnh+7cAaHGZWa2a/N7MNZrbezD5/nPUa0+ZNLuJt0yfx\n4PONmvpcRMaNKAnjUwSPoJ4BVofbqpEuMrMYwboZlwHzgKvNbN6gYpcB9eF2A3BreLwX+G/uPg9Y\nBNw0xLXj2kfOmcLmvW2s39WS6lBERCIZMWG4e90Q28wIn70A2OLuDe7eDdwDLB5UZjFwlweWAyVm\nVuPuuwdW9HP3VmAjMOW4ajbGffDMGrJjGTywRkuLiMj4MGzCMLOLwtePDrVF+OwpQOJCEI0c/UN/\nxDJmNoNg0aYVEb5z3CjJy+bCuRUsfXEXvX39qQ5HRGREx7rDeE/4+qEhtg8mOS4AzKwA+CXwBXcf\n8tmNmd1gZqvMbFVTU9NohHXSfOScqexv6+KpLftTHYqIyIiGHYfh7reErye69sVOoDbh/VSOXtp1\n2DJmlkWQLO529weOEedtBOuNM3/+/HHVgnzh3AqKc7N4YM1OLpxTmepwRESOacSBewBm9gGC6UHi\nA8fc/RsjXLYSqDezOoIkcBXwiUFllgI3m9k9wEKg2d13m5kBtwMb3f07kWoyDuVkxvjQWTXct6qR\nptYuKgpzUh2SiMiwoqyHsQS4kmBOKQM+Boy4EpC79wI3A48SNFrf6+7rzexGM7sxLLYMaAC2AD8C\nPhMefydwLXCRmb0QbpcfV83GieveMQMHvnTvC/T3j6sbJBGZYGykcQBmttbdz0x4LQAedvd3jU6I\n0c2fP99XrRqxx++Y8/MVr/EPD67j794/h5sunJXqcERkAjGz1e4+P0rZKOMwOsLXdjObDPQANSca\nnBzt6gW1fOisyfz7Y5t4btvBVIcjIjKkKAnjITMrAf4NWAO8SjDNuZwkZsb/+sjpTC/L53O/eJ7m\njp5UhyQicpQoA/f+xd0Pu/svCdou5rr7f09+aBNLYTyL7111Dk1tXfzP325IdTgiIkcZtpfUsQbn\nmRnH6uoqJ+aMqcX8zbtn8v/+sJUPnDmZ98yuSHVIIiKvO1a32g8d45wDShhJ8LmL63lsw16+9su1\nPPrFd1MYz0p1SCIiwLEH7p3ogD15C+JZMf71ijO54tZn+Mzda7hobiW1k/JYMLOUIiUPEUmhKCvu\nlQG3AOcT3Fn8CfiGux9IcmwT1rnTJvHl98/h+09s4al
XgmlD5lYX8tBnzyczFqWfgojIyRflp889\nQBPw58AV4f5/JjMogc9cMIsN33g/q/7pvXzzo2fw8p5WfrZ8e6rDEpEJLErCqAl7Sm0Lt/8BVCU7\nMAk6F5QX5HDV22s5f1Y533l8MwePdKc6LBGZoKIkjMfM7Cozywi3jxNM9yGjxMy45UPzONLdx78/\ntinV4YjIBBUlYfw18HOgK9zuAf7GzFrNTMvFjZL6qkKuXTSdnz/3Gusam1MdjohMQFEG7hW6e4a7\nZ4VbRnis0N2LRiNICXzxvbMpzcvmz299hr+/fy1b9rWlOiQRmUCizFb7l4Pex8zsluSFJMMpzsvi\nVze9kyvfXsuvXtjJJf/7j/zwj1tTHZaITBBRHkldbGbLzKzGzE4HlgOFSY5LhlFbmse//NnpPP3V\ni7j0tGq++fDLPLR2V6rDEpEJYMRxGO7+CTO7ElgHHAE+4e5PJz0yOabyghz+95Vn09S6gi/d+yKT\nS3I5d9qkVIclImksyiOpeuDzBMulbgeuNbO8ZAcmI4tnxfjhtW+juijODXet4u4V22loamOkNU5E\nRE5ElCVafwPc5O5PhEunfolg+dXTkhqZRFJWkMOd17+d6+58jn988CUAaorjXL1gGtcums6k/OwU\nRygi6SLKintF7t4y6Nhsd9+c1MhOwHhdce9kcHe27T/Csw0HeHT9Xp7c3ERuVoyrFtTylffPJTc7\nluoQRWQMOikr7pnZVwDcvcXMPjbo9HUnHp4kg5kxs6KAv1g4nbs+vYBHv/BuPnBmDT955lU+/sNn\n2dPcmeoQRWScO1YbxlUJ+18bdO7SJMQiJ9Gc6kK+/bGz+NG182loamPxD/7E6u2H1L4hIifsWG0Y\nNsz+UO9ljHrvvCru/9t38Fc/XcWf3/oMk/KyOH1KMQvrSll89hRqS9V/QUSiOVbC8GH2h3ovY9ip\nNUX85rPn89t1u3mpsZm1O5v59mOb+fZjm1kwo5T3n17NwrpSTq0pIpah3wVEZGjDNnqbWR/BuAsD\ncoH2gVNA3N3H3Go+E7nR+3jtPNzBr57fyYPP73x9ipHCnEzeM6eCD501mQvmVJCTqYZykXR3PI3e\nI/aSGk+UME7MrsMdrHz1IM9uPcDjG/Zy4Eg3hfFM3lVfznmnlHP+rHLqyvNTHaaIJMGYSRhmdinw\nXSAG/NjdvzXovIXnLye4g7nO3deE5+4APgjsc/fTo3yfEsZb19vXzzNbD/Dbtbt56pUmdoW9q957\nahX/cPlcZlYUpDhCETmZxkTCMLMYsBm4BGgkGOx3tbtvSChzOfBZgoSxEPiuuy8Mz70baAPuUsJI\nDXdn+4F2Hlq7iyV/bKCzp4+rF0zj1JoiSvKyqC6Oc9bUErV7iIxjx5Mwooz0PlELgC3u3hAGdQ+w\nGNiQUGYxQUJwYLmZlZhZjbvvdvcnzWxGEuOTEZgZM8rzufmieq58+zS+8/gm7l6xnf6E3zEqCnO4\n/PRqLphTSWVRDhUFOZTmZ2vtcZE0lMyEMQXYkfC+keAuYqQyU4DdSYxLTkBFYQ7f/OiZ3PKh0zjc\n3sPhjm42723j4XW7uWflDn767BvrjZtBWX425QU5LKgr5a/On8m0MnXfFRnvkpkwRoWZ3QDcADBt\n2rQUR5P+4lkxqotjVBfHmVtdxIfPmkxbVy8bd7ewv7WLprau8LWbPc0d/OK51/jZ8u188MzJXPn2\nWubPmKTeVyLjVDITxk6gNuH91PDY8ZY5Jne/DbgNgjaM4w9T3qqCnEzePqN0yHN7mju54+lt3L18\nO0tf3EU8K4NFM8uYV1PEjLJ8ZpTnc+bUYuJZSiIiY10yE8ZKoN7M6giSwFXAJwaVWQrcHLZvLASa\n3V2Po9JIdXGcf7j8VD5/cT3LGw7w5OYmnt56gD+9sp/esDEkJzODBXWlLJpZRm1pHpWFOdSW5jGl\nJDfF0YtIoqQlDHf
vNbObgUcJutXe4e7rzezG8PwSYBlBD6ktBN1qrx+43sx+AVwAlJtZI3CLu9+e\nrHglufJzMrn41CouPrUKCLrv7jrcyea9rTyz9QBPvdLEvz266U3X1Jbm8s5TyllQV8rc6iJOqczX\n4yyRFNLAPRkzmjt62NfSyd6WLrbsa+XprQdY3nCA1s5eAGIZxqk1hbyrvoJ31Zdzdm0JednjvhlO\nJKXGxDiMVFDCSD+9ff007D/Cy3ta2bSnhZWvHmLN9kOvP86qKMxhemke1cVxygtyKC/IZm51EWdP\nK6G8ICfF0YuMfWNlHIbIW5YZy2B2VSGzqwrhrMkAtHb2sLzhIJv2tLD9QDvbD7azYVcLTW1dr9+N\nQPBIa3ZlIadUFnBKRT5zqouYXVWguxKRE6T/OTLuFMazuGReFZfMqzrqXHt3L+t3tfD8a4d4cUcz\nW5vaeGrLfrp7+4FgjMjsykI+cu4UPnrOFCqL4qMdvsi4pUdSkvb6+p0dB9vDx1qtPPVKE6u2HyKW\nYSyYEUzrPqe6gJkVBUydlEtVYZwMTXciE4TaMERG0NDUxn2rG3l6y35e2dtGR0/f6+eyYxnUlMSZ\nUpLLlJJcTqksYE51IadWF1FVlEMwZ6ZIelDCEDkO/f3OjkPtbNt/hB2HOmg82E7j4Q52He6g8VAH\nTa1dr5etKsph/vRSzplWwtRJeVQV5VBZFGdSXha5WTElExl31OgtchwyMozpZflMLxt6zY/D7d1s\n2tPKxt0trHntMKu3H+K3644eX5qdmUFFQQ6zKguoryygpiSXmEEslsHUklwW1JWSn6P/cjJ+6Q5D\n5AQcaOtiT0sne1s62dfSxaH2Hg63d7O7uZMt+9rY2tRGV9jQPiArZpwzbRJnTClmRlke08vyqSvP\nZ0pJrtpMJGV0hyGSZGUFOZQV5HDa5OIhz/f1O22dvfS509vXz+a9bfxpy36e2bqfu1dsp7PnjWQS\nz8qgrryAU6sLObWmiHmTi5hXU8Sk/OzRqo5IJEoYIkkQyzCK895Y9r6yKM759eVAsDDVvtYutu0/\nwrb9R9i6r40tTW08s/UADzz/xtybNcVxZlcVUlaQzaS8bGqK45wxpZjTpxTr0ZakhP7ViYwyM6Oq\nKE5VUZxFM8vedO7gkW427Gphw+5m1u9qYWtTG1v2tXGovZv27qAnV4ZBVVGc3OwYedkxiuJZTMrP\npjQvm9nVhSyqK2VWZYEa4OWkU8IQGUNK87M5v7789buRRPvbuljX2MwLOw6z83AHHd19tHf30trZ\ny8ZdLexDF/n/AAAOE0lEQVRv66IlHOlemp/NrIoCJpfEqSnJJT87RnZmBvGsGIXxTIriWVQU5jC3\nuojsTK2OKNEoYYiME+UFOVw4t5IL51YOed7dee1gOyu2HWTltoNsP9jOqu2H2LN29+tzbw0Wz8rg\n7NoSzppaQmVRnMrCHKqK4tQUx6kujpOlpXYlgRKGSJowe6N78Mfnv7EumbvT2+909fbT2dNHa2cv\nLR097DzcwcpXD7Ly1YPc+fSrdPf1D/o8mJSXTUleFpPysplTXcjCulIW1pVpAOMEpW61IoK7c7i9\nh32tQXfh3Yc72NXcyYG2Lg6393DgSBfrd7bQ2hU88sqwYI2T4twsppXmMbMin+ml+RTEM19vV6kp\niTO5JJeieNYI3y6ppG61InJczIxJ+dlMyg/uJIbS29fPxt2trN5+kANHumnt7OVQezevHmjn1y/s\netNMwYlK87M5fUoxZ04pZkZ5PsW5WRTFM6kpzmXKpFxiGoMybihhiEgkmbEMzphazBlTjx574u60\ndPbS3t1Le3cfzR097D7cyc7D7WzZ18a6nS3c+set9A1qS8nOzKCuLJ/ywmwKcjIpjGcxuTjO1NI8\npk4K7k7ysmMU52ZRmp+tx2AppoQhIm+ZmVGcm0VxbsLjp2lvLtPZ08e+li5aOnto7
uhh56EOtja1\nsbXpCIfau9nf2k5rZw97WjoZqo1+Ul4Ws6sKmVmRT1E8i4KcTKqK4pw+pZj6qgI10I8CJQwRGRXx\nrBjTyvJGLNfT18+uwx3sPNRBW1dwx3LwSDev7Aump398w15aO3vfNPVKTmYG08vyKMvPoawgm+Lc\nLArimRTmZFKQk0l+ePdSWZRDTXGcysK4HoWdACUMERlTsmIZx5wMckBPXz+NhzpY23iYtY3NNB5q\n50BbN+t3tdDS0UNrV+/rC2cNFsswaorjTJ2US01xLvk5MXKzYuRmZ5KfHSMvJ5OqwhxOn1JMTXFc\nj8JCShgiMi5lxTKoKw8mcFx89pQhy3T39nOkq5e2rl6aO3rY19rJrsOd7G4O7mAaD3Xw3LaDdPb0\n0RFugzuOluVnc0plQTA6vzCH6eX5zKkqZE5V4Zumf5kIlDBEJG1lZ2aQnRn0/gpGpgw9WeSA/n6n\ns7ePtq5eGg918NLOZtY1NrP9YDvrGg/zeEvnmyaOXFhXyrXnTef9p1VPiDYUJQwRkVBGhpGXnUle\ndiaVhXHOnTbpTefdnd3NnWze28raxmbuW72Dm3/+PGX52dSFXYYn5WcHPb0mBT29akvzmFySHt2H\nNXBPROQE9fU7T25u4tcv7GRfazDI8eCRbva2dr7p0VZWzDilooDzZ5XzrtkVLKwrJZ4VS13gCcbM\nEq1mdinwXSAG/NjdvzXovIXnLwfagevcfU2Ua4eihCEiY0FPXz97mjvZcbCd7Qfb2X6gnXU7D7Py\n1UN09/YzvSyP//j0wki9xpJtTIz0NrMY8APgEqARWGlmS919Q0Kxy4D6cFsI3AosjHitiMiYlBXL\noLY0j9rSPN6RcLyju48/bm7iqw+s5Yolz3DXXy5gbnVRyuI8Xslsw1gAbHH3BgAzuwdYDCT+0F8M\n3OXBbc5yMysxsxpgRoRrRUTGldzsGJeeXs3MinyuvX0FH1/yLDdfNIuczBgZGUbMjMwMIyvTqCiI\nMzmcj2usPL5KZsKYAuxIeN9IcBcxUpkpEa8VERmXZlcVcv+N7+BTdz7H/1r28ojlszMzKMzJJC8n\nRoYZRjC6PniFsvwc7r3xvKTHPe57SZnZDcANANOmTRuhtIjI2FBbmsfjX3wPLR099LvT505/P/T2\n99Pd28++1i52He5gd3MnLZ09tHUGo977w3bnfg96bTlQOEpL9ibzW3YCtQnvp4bHopTJinAtAO5+\nG3AbBI3eby1kEZHRE8sIZgkeysyKglGOZmTJHGmyEqg3szozywauApYOKrMU+KQFFgHN7r474rUi\nIjKKknaH4e69ZnYz8ChB19g73H29md0Ynl8CLCPoUruFoFvt9ce6NlmxiojIyDRwT0RkAjuecRjp\nP/mJiIicFEoYIiISiRKGiIhEooQhIiKRKGGIiEgkadVLysyagO0neHk5sP8khjMWTYQ6wsSo50So\nI0yMeqa6jtPdvSJKwbRKGG+Fma2K2rVsvJoIdYSJUc+JUEeYGPUcT3XUIykREYlECUNERCJRwnjD\nbakOYBRMhDrCxKjnRKgjTIx6jps6qg1DREQi0R2GiIhEMuEThpldamabzGyLmX011fGcLGZWa2a/\nN7MNZrbezD4fHi81s8fN7JXwdVKqY32rzCxmZs+b2UPh+7SqY7h08f1m9rKZbTSz89KtjgBm9sXw\n3+pLZvYLM4uP93qa2R1mts/MXko4NmydzOxr4c+iTWb2/tREPbwJnTDMLAb8ALgMmAdcbWbzUhvV\nSdML/Dd3nwcsAm4K6/ZV4Al3rweeCN+Pd58HNia8T7c6fhd4xN3nAmcR1DWt6mhmU4DPAfPd/XSC\nZQ2uYvzX8yfApYOODVmn8P/nVcBp4TX/L/wZNWZM6IQBLAC2uHuDu3cD9wCLUxzTSeHuu919Tbjf\nSvBDZgpB/X4aFvsp8GepifDkMLOpwAeAHyccT
ps6mlkx8G7gdgB373b3w6RRHRNkArlmlgnkAbsY\n5/V09yeBg4MOD1enxcA97t7l7tsI1glaMCqBRjTRE8YUYEfC+8bwWFoxsxnAOcAKoCpc1RBgD1CV\norBOlv8DfAXoTziWTnWsA5qAO8PHbj82s3zSq464+07g28BrwG6C1TcfI83qGRquTmP+59FETxhp\nz8wKgF8CX3D3lsRzHnSRG7fd5Mzsg8A+d189XJnxXkeC37rPBW5193OAIwx6LJMGdSR8jr+YIEFO\nBvLN7JrEMulQz8HGW50mesLYCdQmvJ8aHksLZpZFkCzudvcHwsN7zawmPF8D7EtVfCfBO4EPm9mr\nBI8TLzKzn5FedWwEGt19Rfj+foIEkk51BHgvsM3dm9y9B3gAeAfpV08Yvk5j/ufRRE8YK4F6M6sz\ns2yCBqelKY7ppDAzI3juvdHdv5NwainwqXD/U8CvRzu2k8Xdv+buU919BsHf3e/c/RrSq457gB1m\nNic8dDGwgTSqY+g1YJGZ5YX/di8maHdLt3rC8HVaClxlZjlmVgfUA8+lIL5hTfiBe2Z2OcFz8Bhw\nh7v/zxSHdFKY2fnAU8A63ni+/w8E7Rj3AtMIZvb9uLsPbpQbd8zsAuDL7v5BMysjjepoZmcTNOpn\nAw3A9QS/7KVNHQHM7OvAlQQ9/J4H/gooYBzX08x+AVxAMCPtXuAW4FcMUycz+0fg0wR/Bl9w94dT\nEPawJnzCEBGRaCb6IykREYlICUNERCJRwhARkUiUMEREJBIlDBERiUQJQ8YcM+szsxfCWUvvM7O8\nYcotM7OSE/j8yWZ2/1uI71UzKz/R68cLM7vOzCanOg4ZO5QwZCzqcPezw1lLu4EbE09aIMPdLw8n\n4jsu7r7L3a84WcGmsesIpukQAZQwZOx7CphlZjPCNQLuAl4Cagd+0w/PbTSzH4XrKTxmZrkAZjbL\nzP7LzF40szVmdkpY/qXw/HVm9msz+0O4PsEtA19sZr8ys9XhZ94wUqAWrK2yJvyuJ8JjpeHnrDWz\n5WZ2Znj8n83sp2b2lJltN7OPmtm/mtk6M3sknNZl4G5m4PhzZjYrPD7DzH4Xfu4TZjYtPP4TM/ue\nmT1jZg1mdkVCfH9nZivDa76e8DlH/dmF180H7g7v9nLN7FsWrK+y1sy+fRL+bmW8cXdt2sbUBrSF\nr5kE0yb8LTCDYMT6ooRyrxKMoJ1BMDL27PD4vcA14f4K4CPhfpxg2uwZwEvhsesIZkctA3IJktH8\n8Fxp+DpwvCzxewfFXEEw02jdoGu/D9wS7l8EvBDu/zPwJyCLYI2LduCy8NyDwJ8lfNc/hvufBB4K\n938DfCrc/zTwq3D/J8B9BL8MziOYvh/gfQRrR1t47iGCadOP9Wf3h4Q/izJgE28M9i1J9b8TbaO/\n6Q5DxqJcM3sBWEUwx9Dt4fHt7r58mGu2ufsL4f5qYIaZFQJT3P1BAHfvdPf2Ia593N0PuHsHwaR3\n54fHP2dmLwLLCSaFqz9GzIuAJz1YxwB/Y/qK84H/CI/9Digzs6Lw3MMeTLS3jmBqmkfC4+sIfpAP\n+EXC63nh/nnAz8P9/0iIGYLk0e/uG3hj6uz3hdvzwBpgbkJ9jvqzG6J+zUAncLuZfZQgwckEk5nq\nAESG0OHuZyceCOaj48gxrulK2O8juCuIavD8OB7OTfVe4Dx3bzezPxDcoZxMXQDu3m9mPe4+EEc/\nb/6/6cPsH/NzQ5bw+k13/2FiQQvWShnxz87de81sAcGkgFcANxPcMckEojsMSVserDTYaGZ/BhDO\nAjpUj6tLwraGXILVz54GioFDYbKYS3AHcSzLgXeHs4xiZqXh8aeAvwiPXQDs90HrkkRwZcLrs+H+\nMwQz9BJ+/lMjfMajwKctWB8FM5tiZpUjXNMKFIblC4Bid18GfJHgMZpMMLrDkHR3LfBDM/sG0AN8\njDevzgfBF
NK/JFh/4GfuvsrM1gE3mtlGgmf3wz0KA8Ddm8KG8QfMLINgjYNLCNoq7jCztQSPcT41\n/KcMa1J4fRdwdXjsswSr8P0dwYp8148Q32NmdirwbHi31gZcQ3BHMZyfAEvMrINg3ftfm1mc4G7l\nSydQDxnnNFutTGhmdh1Bw+7NqY5lKBYsDjXf3fenOhYRPZISEZFIdIchIiKR6A5DREQiUcIQEZFI\nlDBERCQSJQwREYlECUNERCJRwhARkUj+P+Cuq/sp7mpkAAAAAElFTkSuQmCC\n", 871 | "text/plain": [ 872 | "" 873 | ] 874 | }, 875 | "metadata": {}, 876 | "output_type": "display_data" 877 | } 878 | ], 879 | "source": [ 880 | "#PCA In Sklearn\n", 881 | "from sklearn.decomposition import PCA\n", 882 | "\n", 883 | "pca_full = PCA(n_components = None)\n", 884 | "pca_full.fit(X)\n", 885 | "\n", 886 | "plt.plot(range(0,107), pca_full.explained_variance_ratio_)\n", 887 | "plt.ylabel('Explained variance ratio')\n", 888 | "plt.xlabel('Principal components')\n", 889 | "#plt.xticks(np.arange(0,100,5))\n", 890 | "plt.show()" 891 | ] 892 | }, 893 | { 894 | "cell_type": "markdown", 895 | "metadata": {}, 896 | "source": [ 897 | "#### From above figure, we will choose 80 as the dimensions of Principal components." 
898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 20, 903 | "metadata": { 904 | "collapsed": false 905 | }, 906 | "outputs": [ 907 | { 908 | "name": "stdout", 909 | "output_type": "stream", 910 | "text": [ 911 | "[[-0.11279008 0.43894183 -3.99012024 ..., 1.41998245 0.4333418\n", 912 | " 0.50221114]\n", 913 | " [-0.46084816 0.17902204 -3.58830904 ..., 1.60322019 0.17307902\n", 914 | " 0.3332049 ]\n", 915 | " [-2.61677928 2.52343016 -5.04277486 ..., 1.53863346 0.27665 0.10085208]\n", 916 | " ..., \n", 917 | " [ 4.29349456 3.1139787 8.86817852 ..., 1.59718786 -0.59968411\n", 918 | " -0.65606429]\n", 919 | " [ 3.06349133 4.87790257 7.91930136 ..., 0.8040673 -1.19636896\n", 920 | " 0.58418913]\n", 921 | " [ 1.87201368 5.70640416 7.50898992 ..., 1.19867262 -0.93258426\n", 922 | " 0.58532392]]\n", 923 | "\n", 924 | "[ 0.0599287 0.05130629 0.04778579 0.04120846 0.03794963 0.03442235\n", 925 | " 0.02928028 0.02648273 0.02516247 0.02253369 0.02148449 0.01909422\n", 926 | " 0.01792891 0.01660708 0.01649287 0.01499748 0.01459388 0.0135756\n", 927 | " 0.01335547 0.01295986 0.01223028 0.01205561 0.01173874 0.0115274\n", 928 | " 0.01122597 0.01099925 0.01050502 0.01044227 0.01014913 0.0100701\n", 929 | " 0.00998956 0.00958084 0.00934085 0.009218 0.00898172 0.00889459\n", 930 | " 0.00866534 0.00851304 0.008274 0.00815345 0.00807838 0.00788843\n", 931 | " 0.0077532 0.00759841 0.00726425 0.00711507 0.00700392 0.0068995\n", 932 | " 0.00674679 0.00654725 0.00645781 0.00639411 0.00625 0.00623941\n", 933 | " 0.00600632 0.00588609 0.00575109 0.00558769 0.00542205 0.00535553\n", 934 | " 0.00517615 0.00514075 0.00507192 0.00485718 0.00475494 0.00472658\n", 935 | " 0.00467574 0.00446718 0.00438275 0.00428521 0.00421698 0.00400091\n", 936 | " 0.00394904 0.0038194 0.00378123 0.00367514 0.00348699 0.0034291\n", 937 | " 0.00330351 0.00324087]\n" 938 | ] 939 | } 940 | ], 941 | "source": [ 942 | "x_scaled = StandardScaler().fit_transform(X)\n", 943 | "pca = 
PCA(n_components = 80)\n", 944 | "x_pca = pca.fit_transform(x_scaled)\n", 945 | "print(x_pca, end = '\\n\\n')\n", 946 | "print(pca.explained_variance_ratio_)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 21, 952 | "metadata": { 953 | "collapsed": false 954 | }, 955 | "outputs": [ 956 | { 957 | "data": { 958 | "text/plain": [ 959 | "0.95439228841885837" 960 | ] 961 | }, 962 | "execution_count": 21, 963 | "metadata": {}, 964 | "output_type": "execute_result" 965 | } 966 | ], 967 | "source": [ 968 | "sum(pca.explained_variance_ratio_)" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 22, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [ 978 | { 979 | "data": { 980 | "text/plain": [ 981 | "(8378, 80)" 982 | ] 983 | }, 984 | "execution_count": 22, 985 | "metadata": {}, 986 | "output_type": "execute_result" 987 | } 988 | ], 989 | "source": [ 990 | "x_pca.shape" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": {}, 996 | "source": [ 997 | "# 2. 
Model Training\n", 998 | "We'll do three models and compare the prediction results:\n", 999 | " - 2.1 Manually made neural network\n", 1000 | " - 2.2 Tensorflow\n", 1001 | " - 2.3 SVM\n", 1002 | " - 2.3 Logistic regression" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": {}, 1008 | "source": [ 1009 | "### Train and Test split" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": 23, 1015 | "metadata": { 1016 | "collapsed": true 1017 | }, 1018 | "outputs": [], 1019 | "source": [ 1020 | "from sklearn.model_selection import train_test_split\n", 1021 | "\n", 1022 | "X_train, X_test, y_train, y_test = train_test_split(x_pca, df_labels, test_size=0.2, random_state=0)" 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": 24, 1028 | "metadata": { 1029 | "collapsed": false, 1030 | "scrolled": true 1031 | }, 1032 | "outputs": [ 1033 | { 1034 | "data": { 1035 | "text/plain": [ 1036 | "(6702, 80)" 1037 | ] 1038 | }, 1039 | "execution_count": 24, 1040 | "metadata": {}, 1041 | "output_type": "execute_result" 1042 | } 1043 | ], 1044 | "source": [ 1045 | "X_train.shape" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 25, 1051 | "metadata": { 1052 | "collapsed": false 1053 | }, 1054 | "outputs": [ 1055 | { 1056 | "data": { 1057 | "text/plain": [ 1058 | "array([[-4.68464119, -2.8248629 , -1.51037084, ..., -0.08057486,\n", 1059 | " -0.71625464, -0.10307144],\n", 1060 | " [-0.74018427, 1.56521961, -0.06997587, ..., 0.01645834,\n", 1061 | " 0.45644289, -0.24637226],\n", 1062 | " [ 1.74612994, -3.12552681, 1.71795705, ..., 0.16907101,\n", 1063 | " 0.4087692 , -0.38171825],\n", 1064 | " ..., \n", 1065 | " [ 0.75099882, -2.49960586, -1.51160927, ..., -0.68449073,\n", 1066 | " 0.04991678, -1.01516311],\n", 1067 | " [-3.93462896, 2.8487166 , -1.62335803, ..., 0.61735951,\n", 1068 | " 0.13858547, 0.21935022],\n", 1069 | " [-1.56477143, -1.87060714, -0.83035874, ..., 
-0.58688186,\n", 1070 | " -0.59803885, -0.11533395]])" 1071 | ] 1072 | }, 1073 | "execution_count": 25, 1074 | "metadata": {}, 1075 | "output_type": "execute_result" 1076 | } 1077 | ], 1078 | "source": [ 1079 | "X_train" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "markdown", 1084 | "metadata": {}, 1085 | "source": [ 1086 | "## 2.1 Manual Neural Network\n", 1087 | " - 2.1.1 Build the Neural Network\n", 1088 | " - 2.1.2 Set the hyperparameters, train the NN and evaluate\n", 1089 | " - 2.1.3 Adapt SGD method to improve the accuracy" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "### 2.1.1 Build the neural network" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 96, 1102 | "metadata": { 1103 | "collapsed": false 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "class MyNeuralNetwork(object):\n", 1108 | " def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):\n", 1109 | " # Set number of nodes in input, hidden and output layers.\n", 1110 | " self.input_nodes = input_nodes\n", 1111 | " self.hidden_nodes = hidden_nodes\n", 1112 | " self.output_nodes = output_nodes\n", 1113 | "\n", 1114 | " # Initialize weights\n", 1115 | " self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))\n", 1116 | "\n", 1117 | " self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, \n", 1118 | " (self.output_nodes, self.hidden_nodes))\n", 1119 | " self.lr = learning_rate\n", 1120 | " \n", 1121 | " #### Set this to your implemented sigmoid function ####\n", 1122 | " # Activation function is the sigmoid function\n", 1123 | " self.sigmoid_activation = lambda x : 1 / (1 + np.exp(-x))\n", 1124 | " self.sigmoid_output_2_derivative = lambda x: x * (1 - x)\n", 1125 | " \n", 1126 | " def train(self, inputs_array, targets_array):\n", 1127 | " # Convert inputs list to 2d array\n", 1128 | " inputs = inputs_array.T\n", 1129 | " targets = 
np.array(targets_array, ndmin=2)\n", 1130 | " #targets = targets_array\n", 1131 | " m = inputs_array.shape[0] # number of records\n", 1132 | " \n", 1133 | " #### Implement the forward pass here ####\n", 1134 | " ### Forward pass ###\n", 1135 | " # TODO: Hidden layer\n", 1136 | " layer_1_inputs = np.dot(self.weights_0_1, inputs) # signals into hidden layer\n", 1137 | " layer_1 = layer_1_inputs # signals from hidden layer\n", 1138 | " \n", 1139 | " # TODO: Output layer\n", 1140 | " layer_2_inputs = np.dot(self.weights_1_2,layer_1) # signals into final output layer\n", 1141 | " layer_2 = self.sigmoid_activation(layer_2_inputs) # signals from final output layer\n", 1142 | " \n", 1143 | " #### Implement the backward pass here ####\n", 1144 | " ### Backward pass ###\n", 1145 | " \n", 1146 | " # TODO: Output error \n", 1147 | " layer_2_errors = targets - layer_2 # Output layer error is the difference between desired target and actual output.\n", 1148 | " layer_2_delta = layer_2_errors * self.sigmoid_output_2_derivative(layer_2)\n", 1149 | " \n", 1150 | " # TODO: Backpropagated error\n", 1151 | " layer_1_errors = np.dot(self.weights_1_2.T,layer_2_delta) # errors propagated to the hidden layer 2x128\n", 1152 | " layer_1_delta = layer_1_errors # hidden layer gradients y = x -> 1\n", 1153 | " \n", 1154 | " # TODO: Update the weights\n", 1155 | " self.weights_1_2 += self.lr*np.dot(layer_2_delta,layer_1.T)/m # update hidden-to-output weights with gradient descent step\n", 1156 | " self.weights_0_1 += self.lr*np.dot(layer_1_delta,inputs.T)/m # update input-to-hidden weights with gradient descent step\n", 1157 | " \n", 1158 | " \n", 1159 | " def run(self, inputs_list):\n", 1160 | " # Run a forward pass through the network\n", 1161 | " inputs = np.array(inputs_list, ndmin=2).T\n", 1162 | " \n", 1163 | " #### Implement the forward pass here ####\n", 1164 | " # TODO: Hidden layer\n", 1165 | " hidden_inputs = np.dot(self.weights_0_1, inputs) # signals into hidden layer\n", 1166 | " 
hidden_outputs = hidden_inputs # signals from hidden layer\n", 1167 | " \n", 1168 | " # TODO: Output layer\n", 1169 | " final_inputs = np.dot(self.weights_1_2,hidden_outputs) # signals into final output layer\n", 1170 | " final_outputs = self.sigmoid_activation(final_inputs) # signals from final output layer \n", 1171 | " \n", 1172 | " return final_outputs" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "markdown", 1177 | "metadata": {}, 1178 | "source": [ 1179 | "### 2.1.2 Train the model and evaluation" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": 97, 1185 | "metadata": { 1186 | "collapsed": false 1187 | }, 1188 | "outputs": [ 1189 | { 1190 | "name": "stdout", 1191 | "output_type": "stream", 1192 | "text": [ 1193 | "0.658711217184\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "from sklearn import metrics\n", 1199 | "### Set the hyperparameters here ###\n", 1200 | "epochs = 100 #100\n", 1201 | "learning_rate = 0.01 #0.1\n", 1202 | "hidden_nodes = 10 \n", 1203 | "output_nodes = 1\n", 1204 | "\n", 1205 | "N_i = X_train.shape[1]\n", 1206 | "network = MyNeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)\n", 1207 | "\n", 1208 | "for e in range(epochs):\n", 1209 | " network.train(X_train, y_train)\n", 1210 | " \n", 1211 | "y_pred = network.run(X_test)\n", 1212 | "y_pred = np.where(y_pred >= 0.5, 1, 0) # if probability >= 0.5, it is 1, else 0\n", 1213 | "\n", 1214 | "print(metrics.accuracy_score(y_test,y_pred[0]))" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "markdown", 1219 | "metadata": {}, 1220 | "source": [ 1221 | "### 2.1.3 SGD" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "code", 1226 | "execution_count": 98, 1227 | "metadata": { 1228 | "collapsed": false 1229 | }, 1230 | "outputs": [], 1231 | "source": [ 1232 | "#N_i = X_train.shape[1]\n", 1233 | "network = MyNeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)\n", 1234 | "\n", 1235 | "random_row_idx = np.zeros(128)\n", 1236 | "for e in 
range(epochs):\n", 1237 | " random_row_idx = np.random.choice(X_train.shape[0],size=128)\n", 1238 | " X_batch = X_train[random_row_idx,:]\n", 1239 | " y_batch = y_train[random_row_idx]\n", 1240 | " network.train(X_batch, y_batch)" 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "code", 1245 | "execution_count": 99, 1246 | "metadata": { 1247 | "collapsed": false 1248 | }, 1249 | "outputs": [ 1250 | { 1251 | "name": "stdout", 1252 | "output_type": "stream", 1253 | "text": [ 1254 | "0.839498806683\n" 1255 | ] 1256 | }, 1257 | { 1258 | "name": "stderr", 1259 | "output_type": "stream", 1260 | "text": [ 1261 | "C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\ipykernel\\__main__.py:2: RuntimeWarning: invalid value encountered in greater_equal\n", 1262 | " from ipykernel import kernelapp as app\n" 1263 | ] 1264 | } 1265 | ], 1266 | "source": [ 1267 | "y_pred = network.run(X_test)\n", 1268 | "y_pred = np.where(y_pred >= 0.5, 1, 0) # if probability >= 0.5, it is 1, else 0\n", 1269 | "print(metrics.accuracy_score(y_test,y_pred[0]))" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "markdown", 1274 | "metadata": { 1275 | "collapsed": false 1276 | }, 1277 | "source": [ 1278 | "#### Wow, SGD improves the accuracy dramatically !!!!" 
1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "markdown", 1283 | "metadata": {}, 1284 | "source": [ 1285 | "## 2.2 Tensorflow" 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": 31, 1291 | "metadata": { 1292 | "collapsed": true 1293 | }, 1294 | "outputs": [], 1295 | "source": [ 1296 | "import tensorflow as tf" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "execution_count": 32, 1302 | "metadata": { 1303 | "collapsed": false, 1304 | "scrolled": true 1305 | }, 1306 | "outputs": [ 1307 | { 1308 | "name": "stdout", 1309 | "output_type": "stream", 1310 | "text": [ 1311 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1312 | "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\n", 1313 | "INFO:tensorflow:Using default config.\n", 1314 | "INFO:tensorflow:Using config: {'keep_checkpoint_every_n_hours': 10000, 'keep_checkpoint_max': 5, '_evaluation_master': '', 'save_summary_steps': 100, '_task_id': 0, 'save_checkpoints_secs': 600, 'tf_config': gpu_options {\n", 1315 | " per_process_gpu_memory_fraction: 1\n", 1316 | "}\n", 1317 | ", '_is_chief': True, '_environment': 'local', 'save_checkpoints_steps': None, '_num_ps_replicas': 0, '_cluster_spec': , '_master': '', '_task_type': None, 'tf_random_seed': None}\n", 1318 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with x is deprecated and will be removed after 2016-12-01.\n", 1319 | "Instructions for updating:\n", 1320 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1321 | "separate class SKCompat. 
Arguments x, y and batch_size are only\n", 1322 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1323 | "Example conversion:\n", 1324 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1325 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with y is deprecated and will be removed after 2016-12-01.\n", 1326 | "Instructions for updating:\n", 1327 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1328 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1329 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1330 | "Example conversion:\n", 1331 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1332 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with batch_size is deprecated and will be removed after 2016-12-01.\n", 1333 | "Instructions for updating:\n", 1334 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1335 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1336 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1337 | "Example conversion:\n", 1338 | " est = Estimator(...) 
-> est = SKCompat(Estimator(...))\n", 1339 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1340 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:fraction_of_zero_values is illegal; using dnn/hiddenlayer_0_fraction_of_zero_values instead.\n", 1341 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:activation is illegal; using dnn/hiddenlayer_0_activation instead.\n", 1342 | "INFO:tensorflow:Summary name dnn/logits:fraction_of_zero_values is illegal; using dnn/logits_fraction_of_zero_values instead.\n", 1343 | "INFO:tensorflow:Summary name dnn/logits:activation is illegal; using dnn/logits_activation instead.\n", 1344 | "INFO:tensorflow:Create CheckpointSaverHook.\n", 1345 | "INFO:tensorflow:loss = 0.534811, step = 1\n", 1346 | "INFO:tensorflow:Saving checkpoints for 1 into C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt.\n", 1347 | "WARNING:tensorflow:*******************************************************\n", 1348 | "WARNING:tensorflow:TensorFlow's V1 checkpoint format has been deprecated.\n", 1349 | "WARNING:tensorflow:Consider switching to the more efficient V2 format:\n", 1350 | "WARNING:tensorflow: `tf.train.Saver(write_version=tf.train.SaverDef.V2)`\n", 1351 | "WARNING:tensorflow:now on by default.\n", 1352 | "WARNING:tensorflow:*******************************************************\n", 1353 | "INFO:tensorflow:loss = 0.105281, step = 101\n", 1354 | "INFO:tensorflow:global_step/sec: 10.1418\n", 1355 | "INFO:tensorflow:loss = 0.033156, step = 201\n", 1356 | "INFO:tensorflow:global_step/sec: 10.5714\n", 1357 | "INFO:tensorflow:Saving checkpoints for 300 into C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt.\n", 1358 | "WARNING:tensorflow:*******************************************************\n", 1359 | "WARNING:tensorflow:TensorFlow's V1 checkpoint format has been deprecated.\n", 1360 | "WARNING:tensorflow:Consider switching to the more efficient V2 format:\n", 1361 | 
"WARNING:tensorflow: `tf.train.Saver(write_version=tf.train.SaverDef.V2)`\n", 1362 | "WARNING:tensorflow:now on by default.\n", 1363 | "WARNING:tensorflow:*******************************************************\n", 1364 | "INFO:tensorflow:Loss for final step: 0.0162134.\n", 1365 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with x is deprecated and will be removed after 2016-12-01.\n", 1366 | "Instructions for updating:\n", 1367 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1368 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1369 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1370 | "Example conversion:\n", 1371 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1372 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with batch_size is deprecated and will be removed after 2016-12-01.\n", 1373 | "Instructions for updating:\n", 1374 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1375 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1376 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1377 | "Example conversion:\n", 1378 | " est = Estimator(...) 
-> est = SKCompat(Estimator(...))\n", 1379 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with as_iterable is deprecated and will be removed after 2016-12-01.\n", 1380 | "Instructions for updating:\n", 1381 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1382 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1383 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1384 | "Example conversion:\n", 1385 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1386 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1387 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:fraction_of_zero_values is illegal; using dnn/hiddenlayer_0_fraction_of_zero_values instead.\n", 1388 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:activation is illegal; using dnn/hiddenlayer_0_activation instead.\n", 1389 | "INFO:tensorflow:Summary name dnn/logits:fraction_of_zero_values is illegal; using dnn/logits_fraction_of_zero_values instead.\n", 1390 | "INFO:tensorflow:Summary name dnn/logits:activation is illegal; using dnn/logits_activation instead.\n", 1391 | "INFO:tensorflow:Loading model from checkpoint: C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt-300-?????-of-00001.\n" 1392 | ] 1393 | } 1394 | ], 1395 | "source": [ 1396 | "# Build one layer DNN with 40 units respectively.\n", 1397 | "feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)\n", 1398 | "classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, hidden_units=[40], n_classes=2)\n", 1399 | "\n", 1400 | "# Fit and predict.\n", 1401 | "classifier.fit(X_train, y_train, steps=300)\n", 1402 | "predictions = list(classifier.predict(X_test, 
as_iterable=True))\n", 1403 | "score = metrics.accuracy_score(y_test, predictions)" 1404 | ] 1405 | }, 1406 | { 1407 | "cell_type": "code", 1408 | "execution_count": 33, 1409 | "metadata": { 1410 | "collapsed": false 1411 | }, 1412 | "outputs": [ 1413 | { 1414 | "name": "stdout", 1415 | "output_type": "stream", 1416 | "text": [ 1417 | "TF Accuracy: 0.999403341289\n" 1418 | ] 1419 | } 1420 | ], 1421 | "source": [ 1422 | "print('TF Accuracy: ', score)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "markdown", 1427 | "metadata": {}, 1428 | "source": [ 1429 | "## 2.3 SVM" 1430 | ] 1431 | }, 1432 | { 1433 | "cell_type": "code", 1434 | "execution_count": 34, 1435 | "metadata": { 1436 | "collapsed": true 1437 | }, 1438 | "outputs": [], 1439 | "source": [ 1440 | "from sklearn.svm import SVC\n", 1441 | "svc = SVC()\n", 1442 | "svc.fit(X_train, y_train)\n", 1443 | "y_pred = svc.predict(X_test)" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": 35, 1449 | "metadata": { 1450 | "collapsed": false 1451 | }, 1452 | "outputs": [ 1453 | { 1454 | "data": { 1455 | "text/plain": [ 1456 | "array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,\n", 1457 | " 0, 0, 0, 0, 0, 0, 0], dtype=int64)" 1458 | ] 1459 | }, 1460 | "execution_count": 35, 1461 | "metadata": {}, 1462 | "output_type": "execute_result" 1463 | } 1464 | ], 1465 | "source": [ 1466 | "y_pred[:30]" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": 36, 1472 | "metadata": { 1473 | "collapsed": false 1474 | }, 1475 | "outputs": [ 1476 | { 1477 | "name": "stdout", 1478 | "output_type": "stream", 1479 | "text": [ 1480 | "SVM Accuracy: 0.994630071599\n" 1481 | ] 1482 | } 1483 | ], 1484 | "source": [ 1485 | "print('SVM Accuracy: ', metrics.accuracy_score(y_test,y_pred))" 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "markdown", 1490 | "metadata": {}, 1491 | "source": [ 1492 | "## 2.4 Logistic regression" 1493 | ] 1494 | }, 1495 | { 1496 | 
"cell_type": "code", 1497 | "execution_count": 37, 1498 | "metadata": { 1499 | "collapsed": false 1500 | }, 1501 | "outputs": [], 1502 | "source": [ 1503 | "from sklearn.linear_model import LogisticRegression\n", 1504 | "\n", 1505 | "logreg = LogisticRegression(C=1e5)\n", 1506 | "logreg.fit(X_train, y_train)\n", 1507 | "y_pred = logreg.predict(X_test)" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": 38, 1513 | "metadata": { 1514 | "collapsed": false 1515 | }, 1516 | "outputs": [ 1517 | { 1518 | "data": { 1519 | "text/plain": [ 1520 | "array([0, 0, 0, ..., 0, 0, 1], dtype=int64)" 1521 | ] 1522 | }, 1523 | "execution_count": 38, 1524 | "metadata": {}, 1525 | "output_type": "execute_result" 1526 | } 1527 | ], 1528 | "source": [ 1529 | "y_pred" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": 39, 1535 | "metadata": { 1536 | "collapsed": false 1537 | }, 1538 | "outputs": [ 1539 | { 1540 | "name": "stdout", 1541 | "output_type": "stream", 1542 | "text": [ 1543 | "Log Regression Accuracy: 1.0\n" 1544 | ] 1545 | } 1546 | ], 1547 | "source": [ 1548 | "print('Log Regression Accuracy: ', metrics.accuracy_score(y_test,y_pred))" 1549 | ] 1550 | }, 1551 | { 1552 | "cell_type": "markdown", 1553 | "metadata": {}, 1554 | "source": [ 1555 | "# 3. Summary\n", 1556 | " - Manual NN: 0.839498806683. 
\n", 1557 | " - TF Accuracy: 0.999403341289\n", 1558 | " - SVM Accuracy: 0.994630071599\n", 1559 | " - Log Regression Accuracy: 1.0" 1560 | ] 1561 | } 1562 | ], 1563 | "metadata": { 1564 | "kernelspec": { 1565 | "display_name": "Python 3", 1566 | "language": "python", 1567 | "name": "python3" 1568 | }, 1569 | "language_info": { 1570 | "codemirror_mode": { 1571 | "name": "ipython", 1572 | "version": 3 1573 | }, 1574 | "file_extension": ".py", 1575 | "mimetype": "text/x-python", 1576 | "name": "python", 1577 | "nbconvert_exporter": "python", 1578 | "pygments_lexer": "ipython3", 1579 | "version": "3.5.2" 1580 | } 1581 | }, 1582 | "nbformat": 4, 1583 | "nbformat_minor": 2 1584 | } 1585 | -------------------------------------------------------------------------------- /wk5-speed-dating/Speed Dating Data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdalai/Deep-Learning-projects/f690b3d8901e2ee7d872765815306ed09ba83a5a/wk5-speed-dating/Speed Dating Data.csv -------------------------------------------------------------------------------- /wk5-speed-dating/Speed dating prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Speed dating prediction\n", 8 | " - [Kaggle Speed dating experiment](https://www.kaggle.com/annavictoria/speed-dating-experiment)\n", 9 | " - Learning fun [Siraj's DL #5](https://www.youtube.com/watch?v=koiTTim4M-s)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 
33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
iididgenderidgcondtnwaveroundpositionpositin1order...attr3_3sinc3_3intel3_3fun3_3amb3_3attr5_3sinc5_3intel5_3fun5_3amb5_3
011.00111107NaN4...5.07.07.07.07.0NaNNaNNaNNaNNaN
111.00111107NaN3...5.07.07.07.07.0NaNNaNNaNNaNNaN
211.00111107NaN10...5.07.07.07.07.0NaNNaNNaNNaNNaN
311.00111107NaN5...5.07.07.07.07.0NaNNaNNaNNaNNaN
411.00111107NaN7...5.07.07.07.07.0NaNNaNNaNNaNNaN
\n", 187 | "

5 rows × 195 columns

\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " iid id gender idg condtn wave round position positin1 order \\\n", 192 | "0 1 1.0 0 1 1 1 10 7 NaN 4 \n", 193 | "1 1 1.0 0 1 1 1 10 7 NaN 3 \n", 194 | "2 1 1.0 0 1 1 1 10 7 NaN 10 \n", 195 | "3 1 1.0 0 1 1 1 10 7 NaN 5 \n", 196 | "4 1 1.0 0 1 1 1 10 7 NaN 7 \n", 197 | "\n", 198 | " ... attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 \\\n", 199 | "0 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 200 | "1 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 201 | "2 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 202 | "3 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 203 | "4 ... 5.0 7.0 7.0 7.0 7.0 NaN NaN \n", 204 | "\n", 205 | " intel5_3 fun5_3 amb5_3 \n", 206 | "0 NaN NaN NaN \n", 207 | "1 NaN NaN NaN \n", 208 | "2 NaN NaN NaN \n", 209 | "3 NaN NaN NaN \n", 210 | "4 NaN NaN NaN \n", 211 | "\n", 212 | "[5 rows x 195 columns]" 213 | ] 214 | }, 215 | "execution_count": 2, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "df =pd.read_csv('Speed Dating Data.csv', encoding=\"ISO-8859-1\")\n", 222 | "df.head()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 3, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "0 0\n", 236 | "1 0\n", 237 | "2 1\n", 238 | "3 1\n", 239 | "4 1\n", 240 | "Name: match, dtype: int64" 241 | ] 242 | }, 243 | "execution_count": 3, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "df['match'].head()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "(8378, 195)" 263 | ] 264 | }, 265 | "execution_count": 4, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "df.shape" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | 
"execution_count": 5, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "\n", 286 | "RangeIndex: 8378 entries, 0 to 8377\n", 287 | "Columns: 195 entries, iid to amb5_3\n", 288 | "dtypes: float64(174), int64(13), object(8)\n", 289 | "memory usage: 12.5+ MB\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "df.info()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### First of all, let's just seperate features and labels" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 6, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "df, df_labels = df.drop(['match'], axis=1), df['match']" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "# 1. Preprocessing Data\n", 320 | " - 1.1 Cleaning\n", 321 | " - 1.2 Transformation\n", 322 | " - 1.3 Reduction by PCA" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## 1.1 Cleaning" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "### Cleaning null features\n", 337 | "If a feature has more than 30% (2513) of values are null, we just drop the whole column. 
" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 7, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "194\n", 352 | "194\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "na_sum = list(df.isnull().sum())\n", 358 | "print(len(na_sum))\n", 359 | "#na_col = list(df.isnull().sum().index)\n", 360 | "#print(len(na_col))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 8, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "We can drop 83 Columns\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "drop_col =[]\n", 380 | "for i in range(len(na_sum)):\n", 381 | " if na_sum[i] > 2523:\n", 382 | " drop_col.append(na_col[i])\n", 383 | "print(\"We can drop \",len(drop_col),\" Columns\")" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 9, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/html": [ 396 | "
\n", 397 | "\n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | "
iididgenderidgcondtnwaveroundpositionpositin1order...sinc1_2intel1_2fun1_2amb1_2shar1_2attr3_2sinc3_2intel3_2fun3_2amb3_2
011.00111107NaN4...16.6713.8922.2211.1116.676.07.08.07.06.0
111.00111107NaN3...16.6713.8922.2211.1116.676.07.08.07.06.0
211.00111107NaN10...16.6713.8922.2211.1116.676.07.08.07.06.0
311.00111107NaN5...16.6713.8922.2211.1116.676.07.08.07.06.0
411.00111107NaN7...16.6713.8922.2211.1116.676.07.08.07.06.0
\n", 547 | "

5 rows × 111 columns

\n", 548 | "
" 549 | ], 550 | "text/plain": [ 551 | " iid id gender idg condtn wave round position positin1 order \\\n", 552 | "0 1 1.0 0 1 1 1 10 7 NaN 4 \n", 553 | "1 1 1.0 0 1 1 1 10 7 NaN 3 \n", 554 | "2 1 1.0 0 1 1 1 10 7 NaN 10 \n", 555 | "3 1 1.0 0 1 1 1 10 7 NaN 5 \n", 556 | "4 1 1.0 0 1 1 1 10 7 NaN 7 \n", 557 | "\n", 558 | " ... sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr3_2 sinc3_2 \\\n", 559 | "0 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 560 | "1 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 561 | "2 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 562 | "3 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 563 | "4 ... 16.67 13.89 22.22 11.11 16.67 6.0 7.0 \n", 564 | "\n", 565 | " intel3_2 fun3_2 amb3_2 \n", 566 | "0 8.0 7.0 6.0 \n", 567 | "1 8.0 7.0 6.0 \n", 568 | "2 8.0 7.0 6.0 \n", 569 | "3 8.0 7.0 6.0 \n", 570 | "4 8.0 7.0 6.0 \n", 571 | "\n", 572 | "[5 rows x 111 columns]" 573 | ] 574 | }, 575 | "execution_count": 9, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "df = df.drop(drop_col,axis=1)\n", 582 | "df.head()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "### Imputing null values with mean" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 10, 595 | "metadata": { 596 | "collapsed": false 597 | }, 598 | "outputs": [], 599 | "source": [ 600 | "df = df.fillna(df.mean())" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 11, 606 | "metadata": { 607 | "collapsed": false 608 | }, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "True" 614 | ] 615 | }, 616 | "execution_count": 11, 617 | "metadata": {}, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "#check if any NaN values\n", 623 | "df.isnull().values.any()" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "#### This means there are still 
columns which have null values. Let's further check." 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 12, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "['field', 'race', 'imprace', 'income']" 644 | ] 645 | }, 646 | "execution_count": 12, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "na_sum = list(df.isnull().sum())\n", 653 | "#na_col = list(df.isnull().sum().index)\n", 654 | "nan_col =[]\n", 655 | "for i in range(len(na_sum)):\n", 656 | " if na_sum[i] > 0:\n", 657 | " nan_col.append(na_col[i])\n", 658 | "nan_col" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 13, 664 | "metadata": { 665 | "collapsed": false 666 | }, 667 | "outputs": [ 668 | { 669 | "data": { 670 | "text/html": [ 671 | "
\n", 672 | "\n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | "
fieldfromzipcodecareer
0LawChicago60,521lawyer
1LawChicago60,521lawyer
2LawChicago60,521lawyer
3LawChicago60,521lawyer
4LawChicago60,521lawyer
\n", 720 | "
" 721 | ], 722 | "text/plain": [ 723 | " field from zipcode career\n", 724 | "0 Law Chicago 60,521 lawyer\n", 725 | "1 Law Chicago 60,521 lawyer\n", 726 | "2 Law Chicago 60,521 lawyer\n", 727 | "3 Law Chicago 60,521 lawyer\n", 728 | "4 Law Chicago 60,521 lawyer" 729 | ] 730 | }, 731 | "execution_count": 13, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "df[['field', 'from', 'zipcode', 'career']].head()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "#### These columns are values with object type. It is hard to predict null values for these variables. So, let's just drop all these variables. " 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 14, 750 | "metadata": { 751 | "collapsed": true 752 | }, 753 | "outputs": [], 754 | "source": [ 755 | "df = df.drop(['from','zipcode','field','career'], axis=1)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 15, 761 | "metadata": { 762 | "collapsed": false 763 | }, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/plain": [ 768 | "False" 769 | ] 770 | }, 771 | "execution_count": 15, 772 | "metadata": {}, 773 | "output_type": "execute_result" 774 | } 775 | ], 776 | "source": [ 777 | "df.isnull().values.any()" 778 | ] 779 | }, 780 | { 781 | "cell_type": "markdown", 782 | "metadata": {}, 783 | "source": [ 784 | "#### We have sucessfully cleaned all null variables in the dataset. 
" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## 1.2 Transformation" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "metadata": {}, 797 | "source": [ 798 | "### Normalize data" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 16, 804 | "metadata": { 805 | "collapsed": true 806 | }, 807 | "outputs": [], 808 | "source": [ 809 | "from sklearn.preprocessing import StandardScaler" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 17, 815 | "metadata": { 816 | "collapsed": false 817 | }, 818 | "outputs": [], 819 | "source": [ 820 | "X = StandardScaler().fit_transform(df)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 18, 826 | "metadata": { 827 | "collapsed": false 828 | }, 829 | "outputs": [ 830 | { 831 | "data": { 832 | "text/plain": [ 833 | "(8378, 107)" 834 | ] 835 | }, 836 | "execution_count": 18, 837 | "metadata": {}, 838 | "output_type": "execute_result" 839 | } 840 | ], 841 | "source": [ 842 | "X.shape" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "## 1.3 Reduction" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "### PCA\n", 857 | " - Find out what is fairly good value for n_components according to the Explained Variance Ratio\n", 858 | " - Reduce dimensions by the n_components" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 19, 864 | "metadata": { 865 | "collapsed": false 866 | }, 867 | "outputs": [ 868 | { 869 | "data": { 870 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYwAAAEKCAYAAAAB0GKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt4XXWd7/H3NzuXnXuae9qmbUrTlnLH2hZF5SIKeOno\noIADCs4Mwwhej+PozJyH0Tnn6DPjeI56PFQUUEaUAQStWG6DFxBo6QVoaUtLm1KaXtNbLs09+Z4/\n1gps0qRZLd3Zyc7n9Tzr2Wuv9Vt7f3+95Ju1fjdzd0REREaSkeoARERkfFDCEBGRSJQwREQkEiUM\nERGJRAlDREQiUcIQEZFIlDBERCQSJQwREYlECUNERCLJTHUAJ1N5ebnPmDEj1WGIiIwbq1ev3u/u\nFVHKplXCmDFjBqtWrUp1GCIi44aZbY9aVo+kREQkEiUMERGJRAlDREQiUcIQEZFIkpowzOxSM9tk\nZlvM7KtDnDcz+154fq2ZnZtwrsTM7jezl81so5mdl8xYRUTk2JKWMMwsBvwAuAyYB1xtZvMGFbsM\nqA+3G4BbE859F3jE3ecCZwEbkxWriIiMLJl3GAuALe7e4O7dwD3A4kFlFgN3eWA5UGJmNWZWDLwb\nuB3A3bvd/XASYxURkREkM2FMAXYkvG8Mj0UpUwc0AXea2fNm9mMzy09GkP39zv/93Sv8cXNTMj5e\nRCRtjNVG70zgXOBWdz8HOAIc1QYCYGY3mNkqM1vV1HT8P/QzMowfPtnA71/e95YCFhFJd8lMGDuB\n2oT3U8NjUco0Ao3uviI8fj9BAjmKu9/m7vPdfX5FRaTR7UepKoqzt6XzhK4VEZkokpkwVgL1ZlZn\nZtnAVcDSQWWWAp8Me0stAprdfbe77wF2mNmcsNzFwIZkBVpVlKOEISIygqTNJeXuvWZ2M/AoEAPu\ncPf1ZnZjeH4JsAy4HNgCtAPXJ3zEZ4G7w2TTMOjcSVVVGGfFtoPJ+ngRkbSQ1MkH3X0ZQVJIPLYk\nYd+Bm4a59gVgfjLjG1BZFGdfayfujpmNxleKiIw7Y7XRe1RVFubQ0+ccau9JdSgiImOWEgZBozeg\ndgwRkWNQwiBo9AbY19qV4khERMYuJQx0hyEiEoUSBlBRGN5hKGGIiAxLCQOIZ8Uoyctib4seSYmI\nDEcJI1RZqMF7IiLHooQRqiqKs1eN3iIiw1LCCFUWxmnSHYaIyLCUMEJVRTnsa+2iv99THYqIyJik\nhBGqKorT2+8cbO9OdSgiImOSEkZoYPCeGr5FRIamhBGqKAwG7+1T11oRkSEpYYR0hyEicmxKGKHX\nR3ura62IyJCUMEI5mTFK87N1hyEiMgwljATBaG/dYYiIDEUJI0FVuPKeiIgcTQkjgeaTEhEZnhJG\ngqqiOE2tXfRptLeIyFGUMBJUFeXQ73CgTe0YIiKDKWEkqAxX3lPXWhGRoylhJNBSrSIiw1PCSDAw\n2nvX4Y4URyIiMvYkNWGY2aVmtsnMtpjZV4c4b2b2vfD8WjM7N+Hcq2a2zsxeMLNVyYxzQHVRnPKC\nHNa8dng0vk5EZFzJTNYHm1kM+AFwCdAIrDSzpe6+IaHYZUB9uC0Ebg1fB1zo7vuTFeNgZsbCulKW\nNxzA3TGz0fpqEZExL5l3GAuALe7e4O7dwD3A4kFlFgN3eWA5UGJmNUmMaUSLZpayu7mTHQf1WEpE\nJFEyE8YUYEfC+8bwWNQyDvyXma02sxuSFuUgC2eWAbB824HR+koRkXFhLDd6n+/uZxM8trrJzN49\nVCEzu8HMVpnZqqamprf8pfWVBZTmZ7O8QQlDRCRRMhPGTqA24f3U8FikMu4+8LoPeJDgEddR3P02\nd5/v7vMrKirectBmxoIZpaxoOPiWP0tEJJ0kM2GsBOrNrM7MsoGrgKWDyiwFPhn2lloENLv7bjPL\nN7NCADPLB94HvJTEWN9k0cxSdh7uYMfB9tH6ShGRMS9pvaTcv
dfMbgYeBWLAHe6+3sxuDM8vAZYB\nlwNbgHbg+vDyKuDBsJdSJvBzd38kWbEONtCOsWLbQWpL80bra0VExrSkJQwAd19GkBQSjy1J2Hfg\npiGuawDOSmZsxzKnqpCSvCxWNBzgirdNTVUYIiJjylhu9E6ZjIywHWOb2jFERAYoYQxj4cwyXjvY\nrmlCRERCIyYMM5tqZg+aWZOZ7TOzX5pZ2j+nmT99EgBrGzVNiIgIRLvDuJOgN1MNMBn4TXgsrc0o\nywfQiG8RkVCUhFHh7ne6e2+4/QR46wMexrjivCyK4pm8pq61IiJAtIRxwMyuMbNYuF0DTIhh0NPK\n8pQwRERCURLGp4GPA3uA3cAVvDFeIq1NK83T4D0RkdCI4zDcfTvw4VGIZcypLc3jvzbso6/fiWVo\nqnMRmdiGTRhm9hV3/1cz+z7BzLFv4u6fS2pkY8C00jy6+/rZ29LJ5JLcVIcjIpJSx7rD2Bi+jspq\nd2PRtHBakNcOtithiMiEN2zCcPffhLvt7n5f4jkz+1hSoxojBhLGjoPtLArnlxIRmaiiNHp/LeKx\ntDO5JJcMQw3fIiIcuw3jMoKZZKeY2fcSThUBvckObCzIimVQU5yrrrUiIhy7DWMXQfvFh4HVCcdb\ngS8mM6ixZFqpxmKIiMCx2zBeBF40s5+7e88oxjSmTCvN44mX96U6DBGRlIuyHsYMM/smMA+IDxx0\n95lJi2oMmVaWx/62Ltq7e8nLTuryISIiY1rUyQdvJWi3uBC4C/hZMoMaS2pf7ymlSQhFZGKLkjBy\n3f0JwNx9u7v/M/CB5IY1diSOxRARmciiPGPpMrMM4JVwje6dQEFywxo7EsdiiIhMZFHuMD4P5AGf\nA94GXAN8KplBjSWT8rIoyNE05yIix7zDMLMYcKW7fxloY4LMUpvIzKjVrLUiIse+w3D3PuD8UYpl\nzJpWqsF7IiJR2jCeN7OlwH3AkYGD7v5A0qIaY2on5fGHTU24O2aa5lxEJqYoCSNOsMLeRQnHHJgw\nCWNaWR5dvf3sa+2iqig+8gUiImkoygJKJ9xuYWaXAt8FYsCP3f1bg85beP5yoB24zt3XJJyPEUxP\nstPdP3iicbxVp9YUAfD8a4e59PTqVIUhIpJSUXpJnZDwh/0PgMsIRolfbWbzBhW7DKgPtxsIBggm\n+jxvrMuRMmfXlpCfHePpLftTHYqISMokLWEAC4At7t7g7t3APcDiQWUWA3d5YDlQYmY1AGY2lWCA\n4I+TGGMkWbEMFs4s409KGCIygSUzYUwBdiS8bwyPRS3zf4CvAP3JCvB4vHNWOdv2H6HxkHpLicjE\nNGLCMLMqM7vdzB4O388zs79MZlBm9kFgn7uvjlD2BjNbZWarmpqakhbTu+rLAXhmy4GkfYeIyFgW\n5Q7jJ8CjwOTw/WbgCxGu2wnUJryfGh6LUuadwIfN7FWCR1kXmdmQEx66+23uPt/d51dUVEQI68TU\nVxZQUZjDU3osJSITVJSEUe7u9xI+GnL3XqAvwnUrgXozqzOzbOAqYOmgMkuBT1pgEdDs7rvd/Wvu\nPtXdZ4TX/c7dr4lYp6QwM86fVc4zW/bT3++pDEVEJCWiJIwjZlZGMPaCgR/sI10UJpabCe5ONgL3\nuvt6M7vRzG4Miy0DGoAtwI+Azxx/FUbP+bPKOXCkm417WlIdiojIqIsycO9LBHcCp5jZ00AFcEWU\nD3f3ZQRJIfHYkoR9B24a4TP+APwhyvcl2ztnBe0YT2/Zz2mTi1McjYjI6BrxDiMcSPce4B3A3wCn\nufvaZAc2FlUXx5lVWcCf1PAtIhNQlF5SNwEF7r7e3V8CCsxsTD86SqbzZ5Xz3LYDdPVGacYREUkf\nUdow/trdDw+8cfdDwF8nL6Sx7bxTyujs6Wdt44jNOCIiaSVKwohZwhSt4ZQf2ckLaWxbMKMUgOVb\n9VhKRCaWKAnjEeA/zexiM
7sY+EV4bEKalJ/N3OpCVmw7mOpQRERGVZReUn9P0Nj9t+H7xxkD8zul\n0sK6Uu5d1UhPXz9ZsWTOriIiMnZE6SXV7+63uvsV4fbDcCW+CWvRzDI6evrUjiEiE0qUXlLvNLPH\nzWyzmTWY2TYzaxiN4MaqBXVhO0aD2jFEZOKI8jzlduA7BGt7vx2YH75OWGUFOdRXFqgdQ0QmlCht\nGM3u/nDSIxlnFs4s5cE1O+nt6ydT7RgiMgFE+Un3ezP7NzM7z8zOHdiSHtkYt7CujCPdfby0S/NK\nicjEEOUOY2H4Oj/hmAMXnfxwxo+FM4N2jBUNBzi7tiTF0YiIJN+ICcPdLxyNQMabysI4MyvyWd5w\ngL95zympDkdEJOmi3GFgZh8ATgPiA8fc/RvJCmq8WDCjlIdf2oO7kzAYXkQkLUXpVrsEuBL4LGDA\nx4DpSY5rXJhbXUhzRw/7WrtSHYqISNJFafR+h7t/Ejjk7l8HzgNmJzes8WF2VSEAm/e2pjgSEZHk\ni5IwOsLXdjObDPQANckLafyYXR0kjE17lDBEJP1FacN4yMxKgH8D1hD0kJrQc0kNKC/IoTQ/m1f2\ntqU6FBGRpIvSS+pfwt1fmtlDQNzdNYlSqL6ygM37dIchIulv2IRhZhe5++/M7KNDnMPdH0huaOPD\nnOpCHlizUz2lRCTtHesO4z3A74APDXHOASUMoL6qkLauXnY1dzKlJDfV4YiIJM2wCcPdbzGzDOBh\nd793FGMaV2ZXFgBBTyklDBFJZ8fsJeXu/cBXRimWcen1rrXqKSUiaS5Kt9r/MrMvm1mtmZUObEmP\nbJyYlJ9NRWEOm9VTSkTSXJSEcSVwE/AksDrcVkX5cDO71Mw2mdkWM/vqEOfNzL4Xnl87MAuumcXN\n7Dkze9HM1pvZ16NXafTNrirgFfWUEpE0F6Vbbd2JfLCZxYAfAJcAjcBKM1vq7hsSil0G1IfbQuDW\n8LULuMjd28wsC/iTmT3s7stPJJZkm11VyD3P7aC/38nIUE8pEUlPUScfPB2Yx5snH7xrhMsWAFvc\nvSH8jHuAxUBiwlgM3OXuDiw3sxIzq3H33cDAM56scPMosabC7KpCOnr6aDzUwbSyvFSHIyKSFFEm\nH7wF+H64XQj8K/DhCJ89BdiR8L4xPBapjJnFzOwFYB/wuLuviPCdKTG76o2eUiIi6SpKG8YVwMXA\nHne/HjgLKE5qVIC797n72cBUYEF4l3MUM7vBzFaZ2aqmpqZkhzWk+rCn1CYlDBFJY5EmHwy71/aa\nWRHBb/y1Ea7bOajc1PDYcZVx98PA74FLh/oSd7/N3ee7+/yKiooIYZ18RfEsaorjvKKEISJpLErC\nWBVOPvgjgh5Sa4BnI1y3Eqg3szozywauApYOKrMU+GTYW2oR0Ozuu82sIvxOzCyXoOH85WhVSo15\nNUU8v+MwQXOMiEj6idJL6jPh7hIzewQocve1Ea7rNbObgUeBGHCHu683sxvD80uAZcDlwBagHbg+\nvLwG+GnY0yoDuNfdHzq+qo2uC+dW8sTL+3hlX9vrg/lERNLJiAnDzJYC9wC/dvdXj+fD3X0ZQVJI\nPLYkYd8JxngMvm4tcM7xfFeqXTKvin/61Us8vmGvEoaIpKUoj6T+HTgf2GBm95vZFWYWH+miiaaq\nKM5ZtSU8tn5PqkMREUmKEROGu/8xfCw1E/gh8HGChm8Z5H3zqnixsZk9zZ2pDkVE5KSLcocx0PD8\n58CNwNuBnyYzqPHqffOqAHh8494URyIicvJFGbh3L7ARuAj4v8Ap7v7ZZAc2Hs2qLKCuPF+PpUQk\nLUWZGuR24Gp370t2MOOdmXHJvCrufHobLZ09FMWzUh2SiMhJE6UN41Eli+jeN6+Knj7nD5tSM+pc\nRCRZIrVhSHTnTJtEeUEOdy/frkF8IpJWlDBOsliG8aVLZrNi20F+/txrqQ5HROSkGbYNY2A
xo+G4\n+5qTH056uHpBLb9dt4tvLnuZC+ZUaq1vEUkLx7rD+Pdw+wGwAriNYD6pFeExGYaZ8a2Pnkm/O197\nYJ0eTYlIWhg2Ybj7he5+IbAbODecEfZtBFN2DJ51VgapLc3j7y+dy5Obm/j1C7tSHY6IyFsWpQ1j\njruvG3jj7i8BpyYvpPRx7aLpzCzP5z9X7hi5sIjIGBclYaw1sx+b2QXh9iNgxNlqBTIyjA+cWcOK\nbQfY39aV6nBERN6SKAnjemA98Plw28Ab05DLCC4/o4Z+h0de0uhvERnfoqyH0WlmS4Bl7r5pFGJK\nK3OrC5lZns+ydbu5ZtH0VIcjInLCoswl9WHgBeCR8P3Z4RoZEoGZcfkZNSxv0GMpERnfojySugVY\nABwGcPcXgLpkBpVuLjujmn6Hx9ZrFlsRGb+iJIwed28edEwDC47DvJoiZpTlsWzd7lSHIiJywqIk\njPVm9gkgZmb1ZvZ94Jkkx5VWBh5LPdtwgINHulMdjojICYmSMD4LnAZ0Ab8AWoAvJDOodPSBM2vo\n63eW/HFrqkMRETkhUXpJtQP/GG5ygk6bXMwnFk7jticbOLWmkI+cMzXVIYmIHJcRE4aZzQa+DMxI\nLO/uFyUvrPT09Q+fRkNTG3//y3VML8vn3GmTUh2SiEhkUR5J3Qc8D/wT8HcJmxynrFgGt/7F26gu\ninPDXavVzVZExpUoCaPX3W919+fcffXAlvTI0tSk/GyWXPM29rd1cd+qxlSHIyISWZSE8Rsz+4yZ\n1ZhZ6cAW5cPN7FIz22RmW8zsq0OcNzP7Xnh+7cAaHGZWa2a/N7MNZrbezD5/nPUa0+ZNLuJt0yfx\n4PONmvpcRMaNKAnjUwSPoJ4BVofbqpEuMrMYwboZlwHzgKvNbN6gYpcB9eF2A3BreLwX+G/uPg9Y\nBNw0xLXj2kfOmcLmvW2s39WS6lBERCIZMWG4e90Q28wIn70A2OLuDe7eDdwDLB5UZjFwlweWAyVm\nVuPuuwdW9HP3VmAjMOW4ajbGffDMGrJjGTywRkuLiMj4MGzCMLOLwtePDrVF+OwpQOJCEI0c/UN/\nxDJmNoNg0aYVEb5z3CjJy+bCuRUsfXEXvX39qQ5HRGREx7rDeE/4+qEhtg8mOS4AzKwA+CXwBXcf\n8tmNmd1gZqvMbFVTU9NohHXSfOScqexv6+KpLftTHYqIyIiGHYfh7reErye69sVOoDbh/VSOXtp1\n2DJmlkWQLO529weOEedtBOuNM3/+/HHVgnzh3AqKc7N4YM1OLpxTmepwRESOacSBewBm9gGC6UHi\nA8fc/RsjXLYSqDezOoIkcBXwiUFllgI3m9k9wEKg2d13m5kBtwMb3f07kWoyDuVkxvjQWTXct6qR\nptYuKgpzUh2SiMiwoqyHsQS4kmBOKQM+Boy4EpC79wI3A48SNFrf6+7rzexGM7sxLLYMaAC2AD8C\nPhMefydwLXCRmb0QbpcfV83GieveMQMHvnTvC/T3j6sbJBGZYGykcQBmttbdz0x4LQAedvd3jU6I\n0c2fP99XrRqxx++Y8/MVr/EPD67j794/h5sunJXqcERkAjGz1e4+P0rZKOMwOsLXdjObDPQANSca\nnBzt6gW1fOisyfz7Y5t4btvBVIcjIjKkKAnjITMrAf4NWAO8SjDNuZwkZsb/+sjpTC/L53O/eJ7m\njp5UhyQicpQoA/f+xd0Pu/svCdou5rr7f09+aBNLYTyL7111Dk1tXfzP325IdTgiIkcZtpfUsQbn\nmRnH6uoqJ+aMqcX8zbtn8v/+sJUPnDmZ98yuSHVIIiKvO1a32g8d45wDShhJ8LmL63lsw16+9su1\nPPrFd1MYz0p1SCIiwLEH7p3ogD15C+JZMf71ijO54tZn+Mzda7hobiW1k/JYMLOUIiUPEUmhKCvu\nlQG3AOcT3Fn8CfiGux9IcmwT1rnTJvHl98/h+09s4al
XgmlD5lYX8tBnzyczFqWfgojIyRflp889\nQBPw58AV4f5/JjMogc9cMIsN33g/q/7pvXzzo2fw8p5WfrZ8e6rDEpEJLErCqAl7Sm0Lt/8BVCU7\nMAk6F5QX5HDV22s5f1Y533l8MwePdKc6LBGZoKIkjMfM7Cozywi3jxNM9yGjxMy45UPzONLdx78/\ntinV4YjIBBUlYfw18HOgK9zuAf7GzFrNTMvFjZL6qkKuXTSdnz/3Gusam1MdjohMQFEG7hW6e4a7\nZ4VbRnis0N2LRiNICXzxvbMpzcvmz299hr+/fy1b9rWlOiQRmUCizFb7l4Pex8zsluSFJMMpzsvi\nVze9kyvfXsuvXtjJJf/7j/zwj1tTHZaITBBRHkldbGbLzKzGzE4HlgOFSY5LhlFbmse//NnpPP3V\ni7j0tGq++fDLPLR2V6rDEpEJYMRxGO7+CTO7ElgHHAE+4e5PJz0yOabyghz+95Vn09S6gi/d+yKT\nS3I5d9qkVIclImksyiOpeuDzBMulbgeuNbO8ZAcmI4tnxfjhtW+juijODXet4u4V22loamOkNU5E\nRE5ElCVafwPc5O5PhEunfolg+dXTkhqZRFJWkMOd17+d6+58jn988CUAaorjXL1gGtcums6k/OwU\nRygi6SLKintF7t4y6Nhsd9+c1MhOwHhdce9kcHe27T/Csw0HeHT9Xp7c3ERuVoyrFtTylffPJTc7\nluoQRWQMOikr7pnZVwDcvcXMPjbo9HUnHp4kg5kxs6KAv1g4nbs+vYBHv/BuPnBmDT955lU+/sNn\n2dPcmeoQRWScO1YbxlUJ+18bdO7SJMQiJ9Gc6kK+/bGz+NG182loamPxD/7E6u2H1L4hIifsWG0Y\nNsz+UO9ljHrvvCru/9t38Fc/XcWf3/oMk/KyOH1KMQvrSll89hRqS9V/QUSiOVbC8GH2h3ovY9ip\nNUX85rPn89t1u3mpsZm1O5v59mOb+fZjm1kwo5T3n17NwrpSTq0pIpah3wVEZGjDNnqbWR/BuAsD\ncoH2gVNA3N3H3Go+E7nR+3jtPNzBr57fyYPP73x9ipHCnEzeM6eCD501mQvmVJCTqYZykXR3PI3e\nI/aSGk+UME7MrsMdrHz1IM9uPcDjG/Zy4Eg3hfFM3lVfznmnlHP+rHLqyvNTHaaIJMGYSRhmdinw\nXSAG/NjdvzXovIXnLye4g7nO3deE5+4APgjsc/fTo3yfEsZb19vXzzNbD/Dbtbt56pUmdoW9q957\nahX/cPlcZlYUpDhCETmZxkTCMLMYsBm4BGgkGOx3tbtvSChzOfBZgoSxEPiuuy8Mz70baAPuUsJI\nDXdn+4F2Hlq7iyV/bKCzp4+rF0zj1JoiSvKyqC6Oc9bUErV7iIxjx5Mwooz0PlELgC3u3hAGdQ+w\nGNiQUGYxQUJwYLmZlZhZjbvvdvcnzWxGEuOTEZgZM8rzufmieq58+zS+8/gm7l6xnf6E3zEqCnO4\n/PRqLphTSWVRDhUFOZTmZ2vtcZE0lMyEMQXYkfC+keAuYqQyU4DdSYxLTkBFYQ7f/OiZ3PKh0zjc\n3sPhjm42723j4XW7uWflDn767BvrjZtBWX425QU5LKgr5a/On8m0MnXfFRnvkpkwRoWZ3QDcADBt\n2rQUR5P+4lkxqotjVBfHmVtdxIfPmkxbVy8bd7ewv7WLprau8LWbPc0d/OK51/jZ8u188MzJXPn2\nWubPmKTeVyLjVDITxk6gNuH91PDY8ZY5Jne/DbgNgjaM4w9T3qqCnEzePqN0yHN7mju54+lt3L18\nO0tf3EU8K4NFM8uYV1PEjLJ8ZpTnc+bUYuJZSiIiY10yE8ZKoN7M6giSwFXAJwaVWQrcHLZvLASa\n3V2Po9JIdXGcf7j8VD5/cT3LGw7w5OYmnt56gD+9sp/esDEkJzODBXWlLJpZRm1pHpWFOdSW5jGl\nJDfF0YtIoqQlDHf
vNbObgUcJutXe4e7rzezG8PwSYBlBD6ktBN1qrx+43sx+AVwAlJtZI3CLu9+e\nrHglufJzMrn41CouPrUKCLrv7jrcyea9rTyz9QBPvdLEvz266U3X1Jbm8s5TyllQV8rc6iJOqczX\n4yyRFNLAPRkzmjt62NfSyd6WLrbsa+XprQdY3nCA1s5eAGIZxqk1hbyrvoJ31Zdzdm0JednjvhlO\nJKXGxDiMVFDCSD+9ff007D/Cy3ta2bSnhZWvHmLN9kOvP86qKMxhemke1cVxygtyKC/IZm51EWdP\nK6G8ICfF0YuMfWNlHIbIW5YZy2B2VSGzqwrhrMkAtHb2sLzhIJv2tLD9QDvbD7azYVcLTW1dr9+N\nQPBIa3ZlIadUFnBKRT5zqouYXVWguxKRE6T/OTLuFMazuGReFZfMqzrqXHt3L+t3tfD8a4d4cUcz\nW5vaeGrLfrp7+4FgjMjsykI+cu4UPnrOFCqL4qMdvsi4pUdSkvb6+p0dB9vDx1qtPPVKE6u2HyKW\nYSyYEUzrPqe6gJkVBUydlEtVYZwMTXciE4TaMERG0NDUxn2rG3l6y35e2dtGR0/f6+eyYxnUlMSZ\nUpLLlJJcTqksYE51IadWF1FVlEMwZ6ZIelDCEDkO/f3OjkPtbNt/hB2HOmg82E7j4Q52He6g8VAH\nTa1dr5etKsph/vRSzplWwtRJeVQV5VBZFGdSXha5WTElExl31OgtchwyMozpZflMLxt6zY/D7d1s\n2tPKxt0trHntMKu3H+K3644eX5qdmUFFQQ6zKguoryygpiSXmEEslsHUklwW1JWSn6P/cjJ+6Q5D\n5AQcaOtiT0sne1s62dfSxaH2Hg63d7O7uZMt+9rY2tRGV9jQPiArZpwzbRJnTClmRlke08vyqSvP\nZ0pJrtpMJGV0hyGSZGUFOZQV5HDa5OIhz/f1O22dvfS509vXz+a9bfxpy36e2bqfu1dsp7PnjWQS\nz8qgrryAU6sLObWmiHmTi5hXU8Sk/OzRqo5IJEoYIkkQyzCK895Y9r6yKM759eVAsDDVvtYutu0/\nwrb9R9i6r40tTW08s/UADzz/xtybNcVxZlcVUlaQzaS8bGqK45wxpZjTpxTr0ZakhP7ViYwyM6Oq\nKE5VUZxFM8vedO7gkW427Gphw+5m1u9qYWtTG1v2tXGovZv27qAnV4ZBVVGc3OwYedkxiuJZTMrP\npjQvm9nVhSyqK2VWZYEa4OWkU8IQGUNK87M5v7789buRRPvbuljX2MwLOw6z83AHHd19tHf30trZ\ny8ZdLexDF/n/AAAOE0lEQVRv66IlHOlemp/NrIoCJpfEqSnJJT87RnZmBvGsGIXxTIriWVQU5jC3\nuojsTK2OKNEoYYiME+UFOVw4t5IL51YOed7dee1gOyu2HWTltoNsP9jOqu2H2LN29+tzbw0Wz8rg\n7NoSzppaQmVRnMrCHKqK4tQUx6kujpOlpXYlgRKGSJowe6N78Mfnv7EumbvT2+909fbT2dNHa2cv\nLR097DzcwcpXD7Ly1YPc+fSrdPf1D/o8mJSXTUleFpPysplTXcjCulIW1pVpAOMEpW61IoK7c7i9\nh32tQXfh3Yc72NXcyYG2Lg6393DgSBfrd7bQ2hU88sqwYI2T4twsppXmMbMin+ml+RTEM19vV6kp\niTO5JJeieNYI3y6ppG61InJczIxJ+dlMyg/uJIbS29fPxt2trN5+kANHumnt7OVQezevHmjn1y/s\netNMwYlK87M5fUoxZ04pZkZ5PsW5WRTFM6kpzmXKpFxiGoMybihhiEgkmbEMzphazBlTjx574u60\ndPbS3t1Le3cfzR097D7cyc7D7WzZ18a6nS3c+set9A1qS8nOzKCuLJ/ywmwKcjIpjGcxuTjO1NI8\npk4K7k7ysmMU52ZRmp+tx2AppoQhIm+ZmVGcm0VxbsLjp2lvLtPZ08e+li5aOnto7
uhh56EOtja1\nsbXpCIfau9nf2k5rZw97WjoZqo1+Ul4Ws6sKmVmRT1E8i4KcTKqK4pw+pZj6qgI10I8CJQwRGRXx\nrBjTyvJGLNfT18+uwx3sPNRBW1dwx3LwSDev7Aump398w15aO3vfNPVKTmYG08vyKMvPoawgm+Lc\nLArimRTmZFKQk0l+ePdSWZRDTXGcysK4HoWdACUMERlTsmIZx5wMckBPXz+NhzpY23iYtY3NNB5q\n50BbN+t3tdDS0UNrV+/rC2cNFsswaorjTJ2US01xLvk5MXKzYuRmZ5KfHSMvJ5OqwhxOn1JMTXFc\nj8JCShgiMi5lxTKoKw8mcFx89pQhy3T39nOkq5e2rl6aO3rY19rJrsOd7G4O7mAaD3Xw3LaDdPb0\n0RFugzuOluVnc0plQTA6vzCH6eX5zKkqZE5V4Zumf5kIlDBEJG1lZ2aQnRn0/gpGpgw9WeSA/n6n\ns7ePtq5eGg918NLOZtY1NrP9YDvrGg/zeEvnmyaOXFhXyrXnTef9p1VPiDYUJQwRkVBGhpGXnUle\ndiaVhXHOnTbpTefdnd3NnWze28raxmbuW72Dm3/+PGX52dSFXYYn5WcHPb0mBT29akvzmFySHt2H\nNXBPROQE9fU7T25u4tcv7GRfazDI8eCRbva2dr7p0VZWzDilooDzZ5XzrtkVLKwrJZ4VS13gCcbM\nEq1mdinwXSAG/NjdvzXovIXnLwfagevcfU2Ua4eihCEiY0FPXz97mjvZcbCd7Qfb2X6gnXU7D7Py\n1UN09/YzvSyP//j0wki9xpJtTIz0NrMY8APgEqARWGlmS919Q0Kxy4D6cFsI3AosjHitiMiYlBXL\noLY0j9rSPN6RcLyju48/bm7iqw+s5Yolz3DXXy5gbnVRyuI8Xslsw1gAbHH3BgAzuwdYDCT+0F8M\n3OXBbc5yMysxsxpgRoRrRUTGldzsGJeeXs3MinyuvX0FH1/yLDdfNIuczBgZGUbMjMwMIyvTqCiI\nMzmcj2usPL5KZsKYAuxIeN9IcBcxUpkpEa8VERmXZlcVcv+N7+BTdz7H/1r28ojlszMzKMzJJC8n\nRoYZRjC6PniFsvwc7r3xvKTHPe57SZnZDcANANOmTRuhtIjI2FBbmsfjX3wPLR099LvT505/P/T2\n99Pd28++1i52He5gd3MnLZ09tHUGo977w3bnfg96bTlQOEpL9ibzW3YCtQnvp4bHopTJinAtAO5+\nG3AbBI3eby1kEZHRE8sIZgkeysyKglGOZmTJHGmyEqg3szozywauApYOKrMU+KQFFgHN7r474rUi\nIjKKknaH4e69ZnYz8ChB19g73H29md0Ynl8CLCPoUruFoFvt9ce6NlmxiojIyDRwT0RkAjuecRjp\nP/mJiIicFEoYIiISiRKGiIhEooQhIiKRKGGIiEgkadVLysyagO0neHk5sP8khjMWTYQ6wsSo50So\nI0yMeqa6jtPdvSJKwbRKGG+Fma2K2rVsvJoIdYSJUc+JUEeYGPUcT3XUIykREYlECUNERCJRwnjD\nbakOYBRMhDrCxKjnRKgjTIx6jps6qg1DREQi0R2GiIhEMuEThpldamabzGyLmX011fGcLGZWa2a/\nN7MNZrbezD4fHi81s8fN7JXwdVKqY32rzCxmZs+b2UPh+7SqY7h08f1m9rKZbTSz89KtjgBm9sXw\n3+pLZvYLM4uP93qa2R1mts/MXko4NmydzOxr4c+iTWb2/tREPbwJnTDMLAb8ALgMmAdcbWbzUhvV\nSdML/Dd3nwcsAm4K6/ZV4Al3rweeCN+Pd58HNia8T7c6fhd4xN3nAmcR1DWt6mhmU4DPAfPd/XSC\nZQ2uYvzX8yfApYOODVmn8P/nVcBp4TX/L/wZNWZM6IQBLAC2uHuDu3cD9wCLUxzTSeHuu919Tbjf\nSvBDZgpB/X4aFvsp8GepifDkMLOpwAeAHyccT
ps6mlkx8G7gdgB373b3w6RRHRNkArlmlgnkAbsY\n5/V09yeBg4MOD1enxcA97t7l7tsI1glaMCqBRjTRE8YUYEfC+8bwWFoxsxnAOcAKoCpc1RBgD1CV\norBOlv8DfAXoTziWTnWsA5qAO8PHbj82s3zSq464+07g28BrwG6C1TcfI83qGRquTmP+59FETxhp\nz8wKgF8CX3D3lsRzHnSRG7fd5Mzsg8A+d189XJnxXkeC37rPBW5193OAIwx6LJMGdSR8jr+YIEFO\nBvLN7JrEMulQz8HGW50mesLYCdQmvJ8aHksLZpZFkCzudvcHwsN7zawmPF8D7EtVfCfBO4EPm9mr\nBI8TLzKzn5FedWwEGt19Rfj+foIEkk51BHgvsM3dm9y9B3gAeAfpV08Yvk5j/ufRRE8YK4F6M6sz\ns2yCBqelKY7ppDAzI3juvdHdv5NwainwqXD/U8CvRzu2k8Xdv+buU919BsHf3e/c/RrSq457gB1m\nNic8dDGwgTSqY+g1YJGZ5YX/di8maHdLt3rC8HVaClxlZjlmVgfUA8+lIL5hTfiBe2Z2OcFz8Bhw\nh7v/zxSHdFKY2fnAU8A63ni+/w8E7Rj3AtMIZvb9uLsPbpQbd8zsAuDL7v5BMysjjepoZmcTNOpn\nAw3A9QS/7KVNHQHM7OvAlQQ9/J4H/gooYBzX08x+AVxAMCPtXuAW4FcMUycz+0fg0wR/Bl9w94dT\nEPawJnzCEBGRaCb6IykREYlICUNERCJRwhARkUiUMEREJBIlDBERiUQJQ8YcM+szsxfCWUvvM7O8\nYcotM7OSE/j8yWZ2/1uI71UzKz/R68cLM7vOzCanOg4ZO5QwZCzqcPezw1lLu4EbE09aIMPdLw8n\n4jsu7r7L3a84WcGmsesIpukQAZQwZOx7CphlZjPCNQLuAl4Cagd+0w/PbTSzH4XrKTxmZrkAZjbL\nzP7LzF40szVmdkpY/qXw/HVm9msz+0O4PsEtA19sZr8ys9XhZ94wUqAWrK2yJvyuJ8JjpeHnrDWz\n5WZ2Znj8n83sp2b2lJltN7OPmtm/mtk6M3sknNZl4G5m4PhzZjYrPD7DzH4Xfu4TZjYtPP4TM/ue\nmT1jZg1mdkVCfH9nZivDa76e8DlH/dmF180H7g7v9nLN7FsWrK+y1sy+fRL+bmW8cXdt2sbUBrSF\nr5kE0yb8LTCDYMT6ooRyrxKMoJ1BMDL27PD4vcA14f4K4CPhfpxg2uwZwEvhsesIZkctA3IJktH8\n8Fxp+DpwvCzxewfFXEEw02jdoGu/D9wS7l8EvBDu/zPwJyCLYI2LduCy8NyDwJ8lfNc/hvufBB4K\n938DfCrc/zTwq3D/J8B9BL8MziOYvh/gfQRrR1t47iGCadOP9Wf3h4Q/izJgE28M9i1J9b8TbaO/\n6Q5DxqJcM3sBWEUwx9Dt4fHt7r58mGu2ufsL4f5qYIaZFQJT3P1BAHfvdPf2Ia593N0PuHsHwaR3\n54fHP2dmLwLLCSaFqz9GzIuAJz1YxwB/Y/qK84H/CI/9Digzs6Lw3MMeTLS3jmBqmkfC4+sIfpAP\n+EXC63nh/nnAz8P9/0iIGYLk0e/uG3hj6uz3hdvzwBpgbkJ9jvqzG6J+zUAncLuZfZQgwckEk5nq\nAESG0OHuZyceCOaj48gxrulK2O8juCuIavD8OB7OTfVe4Dx3bzezPxDcoZxMXQDu3m9mPe4+EEc/\nb/6/6cPsH/NzQ5bw+k13/2FiQQvWShnxz87de81sAcGkgFcANxPcMckEojsMSVserDTYaGZ/BhDO\nAjpUj6tLwraGXILVz54GioFDYbKYS3AHcSzLgXeHs4xiZqXh8aeAvwiPXQDs90HrkkRwZcLrs+H+\nMwQz9BJ+/lMjfMajwKctWB8FM5tiZpUjXNMKFIblC4Bid18GfJHgMZpMMLrDkHR3LfBDM/sG0AN8\njDevzgfBF
NK/JFh/4GfuvsrM1gE3mtlGgmf3wz0KA8Ddm8KG8QfMLINgjYNLCNoq7jCztQSPcT41\n/KcMa1J4fRdwdXjsswSr8P0dwYp8148Q32NmdirwbHi31gZcQ3BHMZyfAEvMrINg3ftfm1mc4G7l\nSydQDxnnNFutTGhmdh1Bw+7NqY5lKBYsDjXf3fenOhYRPZISEZFIdIchIiKR6A5DREQiUcIQEZFI\nlDBERCQSJQwREYlECUNERCJRwhARkUj+P+Cuq/sp7mpkAAAAAElFTkSuQmCC\n", 871 | "text/plain": [ 872 | "" 873 | ] 874 | }, 875 | "metadata": {}, 876 | "output_type": "display_data" 877 | } 878 | ], 879 | "source": [ 880 | "#PCA In Sklearn\n", 881 | "from sklearn.decomposition import PCA\n", 882 | "\n", 883 | "pca_full = PCA(n_components = None)\n", 884 | "pca_full.fit(X)\n", 885 | "\n", 886 | "plt.plot(range(0,107), pca_full.explained_variance_ratio_)\n", 887 | "plt.ylabel('Explained variance ratio')\n", 888 | "plt.xlabel('Principal components')\n", 889 | "#plt.xticks(np.arange(0,100,5))\n", 890 | "plt.show()" 891 | ] 892 | }, 893 | { 894 | "cell_type": "markdown", 895 | "metadata": {}, 896 | "source": [ 897 | "#### From above figure, we will choose 80 as the dimensions of Principal components." 
898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 20, 903 | "metadata": { 904 | "collapsed": false 905 | }, 906 | "outputs": [ 907 | { 908 | "name": "stdout", 909 | "output_type": "stream", 910 | "text": [ 911 | "[[-0.11279008 0.43894183 -3.99012024 ..., 1.41998245 0.4333418\n", 912 | " 0.50221114]\n", 913 | " [-0.46084816 0.17902204 -3.58830904 ..., 1.60322019 0.17307902\n", 914 | " 0.3332049 ]\n", 915 | " [-2.61677928 2.52343016 -5.04277486 ..., 1.53863346 0.27665 0.10085208]\n", 916 | " ..., \n", 917 | " [ 4.29349456 3.1139787 8.86817852 ..., 1.59718786 -0.59968411\n", 918 | " -0.65606429]\n", 919 | " [ 3.06349133 4.87790257 7.91930136 ..., 0.8040673 -1.19636896\n", 920 | " 0.58418913]\n", 921 | " [ 1.87201368 5.70640416 7.50898992 ..., 1.19867262 -0.93258426\n", 922 | " 0.58532392]]\n", 923 | "\n", 924 | "[ 0.0599287 0.05130629 0.04778579 0.04120846 0.03794963 0.03442235\n", 925 | " 0.02928028 0.02648273 0.02516247 0.02253369 0.02148449 0.01909422\n", 926 | " 0.01792891 0.01660708 0.01649287 0.01499748 0.01459388 0.0135756\n", 927 | " 0.01335547 0.01295986 0.01223028 0.01205561 0.01173874 0.0115274\n", 928 | " 0.01122597 0.01099925 0.01050502 0.01044227 0.01014913 0.0100701\n", 929 | " 0.00998956 0.00958084 0.00934085 0.009218 0.00898172 0.00889459\n", 930 | " 0.00866534 0.00851304 0.008274 0.00815345 0.00807838 0.00788843\n", 931 | " 0.0077532 0.00759841 0.00726425 0.00711507 0.00700392 0.0068995\n", 932 | " 0.00674679 0.00654725 0.00645781 0.00639411 0.00625 0.00623941\n", 933 | " 0.00600632 0.00588609 0.00575109 0.00558769 0.00542205 0.00535553\n", 934 | " 0.00517615 0.00514075 0.00507192 0.00485718 0.00475494 0.00472658\n", 935 | " 0.00467574 0.00446718 0.00438275 0.00428521 0.00421698 0.00400091\n", 936 | " 0.00394904 0.0038194 0.00378123 0.00367514 0.00348699 0.0034291\n", 937 | " 0.00330351 0.00324087]\n" 938 | ] 939 | } 940 | ], 941 | "source": [ 942 | "x_scaled = StandardScaler().fit_transform(X)\n", 943 | "pca = 
PCA(n_components = 80)\n", 944 | "x_pca = pca.fit_transform(x_scaled)\n", 945 | "print(x_pca, end = '\\n\\n')\n", 946 | "print(pca.explained_variance_ratio_)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 21, 952 | "metadata": { 953 | "collapsed": false 954 | }, 955 | "outputs": [ 956 | { 957 | "data": { 958 | "text/plain": [ 959 | "0.95439228841885837" 960 | ] 961 | }, 962 | "execution_count": 21, 963 | "metadata": {}, 964 | "output_type": "execute_result" 965 | } 966 | ], 967 | "source": [ 968 | "sum(pca.explained_variance_ratio_)" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 22, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [ 978 | { 979 | "data": { 980 | "text/plain": [ 981 | "(8378, 80)" 982 | ] 983 | }, 984 | "execution_count": 22, 985 | "metadata": {}, 986 | "output_type": "execute_result" 987 | } 988 | ], 989 | "source": [ 990 | "x_pca.shape" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": {}, 996 | "source": [ 997 | "# 2. 
Model Training\n", 998 | "We'll do four models and compare the prediction results:\n", 999 | " - 2.1 Manually made neural network\n", 1000 | " - 2.2 Tensorflow\n", 1001 | " - 2.3 SVM\n", 1002 | " - 2.4 Logistic regression" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": {}, 1008 | "source": [ 1009 | "### Train and Test split" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": 23, 1015 | "metadata": { 1016 | "collapsed": true 1017 | }, 1018 | "outputs": [], 1019 | "source": [ 1020 | "from sklearn.model_selection import train_test_split\n", 1021 | "\n", 1022 | "X_train, X_test, y_train, y_test = train_test_split(x_pca, df_labels, test_size=0.2, random_state=0)" 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": 24, 1028 | "metadata": { 1029 | "collapsed": false, 1030 | "scrolled": true 1031 | }, 1032 | "outputs": [ 1033 | { 1034 | "data": { 1035 | "text/plain": [ 1036 | "(6702, 80)" 1037 | ] 1038 | }, 1039 | "execution_count": 24, 1040 | "metadata": {}, 1041 | "output_type": "execute_result" 1042 | } 1043 | ], 1044 | "source": [ 1045 | "X_train.shape" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 25, 1051 | "metadata": { 1052 | "collapsed": false 1053 | }, 1054 | "outputs": [ 1055 | { 1056 | "data": { 1057 | "text/plain": [ 1058 | "array([[-4.68464119, -2.8248629 , -1.51037084, ..., -0.08057486,\n", 1059 | " -0.71625464, -0.10307144],\n", 1060 | " [-0.74018427, 1.56521961, -0.06997587, ..., 0.01645834,\n", 1061 | " 0.45644289, -0.24637226],\n", 1062 | " [ 1.74612994, -3.12552681, 1.71795705, ..., 0.16907101,\n", 1063 | " 0.4087692 , -0.38171825],\n", 1064 | " ..., \n", 1065 | " [ 0.75099882, -2.49960586, -1.51160927, ..., -0.68449073,\n", 1066 | " 0.04991678, -1.01516311],\n", 1067 | " [-3.93462896, 2.8487166 , -1.62335803, ..., 0.61735951,\n", 1068 | " 0.13858547, 0.21935022],\n", 1069 | " [-1.56477143, -1.87060714, -0.83035874, ..., 
-0.58688186,\n", 1070 | " -0.59803885, -0.11533395]])" 1071 | ] 1072 | }, 1073 | "execution_count": 25, 1074 | "metadata": {}, 1075 | "output_type": "execute_result" 1076 | } 1077 | ], 1078 | "source": [ 1079 | "X_train" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "markdown", 1084 | "metadata": {}, 1085 | "source": [ 1086 | "## 2.1 Manual Neural Network\n", 1087 | " - 2.1.1 Build the Neural Network\n", 1088 | " - 2.1.2 Set the hyperparameters, train the NN and evaluate\n", 1089 | " - 2.1.3 Adapt SGD method to improve the accuracy" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "### 2.1.1 Build the neural network" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 96, 1102 | "metadata": { 1103 | "collapsed": false 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "class MyNeuralNetwork(object):\n", 1108 | " def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):\n", 1109 | " # Set number of nodes in input, hidden and output layers.\n", 1110 | " self.input_nodes = input_nodes\n", 1111 | " self.hidden_nodes = hidden_nodes\n", 1112 | " self.output_nodes = output_nodes\n", 1113 | "\n", 1114 | " # Initialize weights\n", 1115 | " self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))\n", 1116 | "\n", 1117 | " self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, \n", 1118 | " (self.output_nodes, self.hidden_nodes))\n", 1119 | " self.lr = learning_rate\n", 1120 | " \n", 1121 | " #### Set this to your implemented sigmoid function ####\n", 1122 | " # Activation function is the sigmoid function\n", 1123 | " self.sigmoid_activation = lambda x : 1 / (1 + np.exp(-x))\n", 1124 | " self.sigmoid_output_2_derivative = lambda x: x * (1 - x)\n", 1125 | " \n", 1126 | " def train(self, inputs_array, targets_array):\n", 1127 | " # Convert inputs list to 2d array\n", 1128 | " inputs = inputs_array.T\n", 1129 | " targets = 
np.array(targets_array, ndmin=2)\n", 1130 | " #targets = targets_array\n", 1131 | " m = inputs_array.shape[0] # number of records\n", 1132 | " \n", 1133 | " #### Implement the forward pass here ####\n", 1134 | " ### Forward pass ###\n", 1135 | " # TODO: Hidden layer\n", 1136 | " layer_1_inputs = np.dot(self.weights_0_1, inputs) # signals into hidden layer\n", 1137 | " layer_1 = layer_1_inputs # signals from hidden layer\n", 1138 | " \n", 1139 | " # TODO: Output layer\n", 1140 | " layer_2_inputs = np.dot(self.weights_1_2,layer_1) # signals into final output layer\n", 1141 | " layer_2 = self.sigmoid_activation(layer_2_inputs) # signals from final output layer\n", 1142 | " \n", 1143 | " #### Implement the backward pass here ####\n", 1144 | " ### Backward pass ###\n", 1145 | " \n", 1146 | " # TODO: Output error \n", 1147 | " layer_2_errors = targets - layer_2 # Output layer error is the difference between desired target and actual output.\n", 1148 | " layer_2_delta = layer_2_errors * self.sigmoid_output_2_derivative(layer_2)\n", 1149 | " \n", 1150 | " # TODO: Backpropagated error\n", 1151 | " layer_1_errors = np.dot(self.weights_1_2.T,layer_2_delta) # errors propagated to the hidden layer 2x128\n", 1152 | " layer_1_delta = layer_1_errors # hidden layer gradients y = x -> 1\n", 1153 | " \n", 1154 | " # TODO: Update the weights\n", 1155 | " self.weights_1_2 += self.lr*np.dot(layer_2_delta,layer_1.T)/m # update hidden-to-output weights with gradient descent step\n", 1156 | " self.weights_0_1 += self.lr*np.dot(layer_1_delta,inputs.T)/m # update input-to-hidden weights with gradient descent step\n", 1157 | " \n", 1158 | " \n", 1159 | " def run(self, inputs_list):\n", 1160 | " # Run a forward pass through the network\n", 1161 | " inputs = np.array(inputs_list, ndmin=2).T\n", 1162 | " \n", 1163 | " #### Implement the forward pass here ####\n", 1164 | " # TODO: Hidden layer\n", 1165 | " hidden_inputs = np.dot(self.weights_0_1, inputs) # signals into hidden layer\n", 1166 | " 
hidden_outputs = hidden_inputs # signals from hidden layer\n", 1167 | " \n", 1168 | " # TODO: Output layer\n", 1169 | " final_inputs = np.dot(self.weights_1_2,hidden_outputs) # signals into final output layer\n", 1170 | " final_outputs = self.sigmoid_activation(final_inputs) # signals from final output layer \n", 1171 | " \n", 1172 | " return final_outputs" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "markdown", 1177 | "metadata": {}, 1178 | "source": [ 1179 | "### 2.1.2 Train the model and evaluation" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": 97, 1185 | "metadata": { 1186 | "collapsed": false 1187 | }, 1188 | "outputs": [ 1189 | { 1190 | "name": "stdout", 1191 | "output_type": "stream", 1192 | "text": [ 1193 | "0.658711217184\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "from sklearn import metrics\n", 1199 | "### Set the hyperparameters here ###\n", 1200 | "epochs = 100 #100\n", 1201 | "learning_rate = 0.01 #0.1\n", 1202 | "hidden_nodes = 10 \n", 1203 | "output_nodes = 1\n", 1204 | "\n", 1205 | "N_i = X_train.shape[1]\n", 1206 | "network = MyNeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)\n", 1207 | "\n", 1208 | "for e in range(epochs):\n", 1209 | " network.train(X_train, y_train)\n", 1210 | " \n", 1211 | "y_pred = network.run(X_test)\n", 1212 | "y_pred = np.where(y_pred >= 0.5, 1, 0) # if probability >= 0.5, it is 1, else 0\n", 1213 | "\n", 1214 | "print(metrics.accuracy_score(y_test,y_pred[0]))" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "markdown", 1219 | "metadata": {}, 1220 | "source": [ 1221 | "### 2.1.3 SGD" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "code", 1226 | "execution_count": 98, 1227 | "metadata": { 1228 | "collapsed": false 1229 | }, 1230 | "outputs": [], 1231 | "source": [ 1232 | "#N_i = X_train.shape[1]\n", 1233 | "network = MyNeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)\n", 1234 | "\n", 1235 | "random_row_idx = np.zeros(128)\n", 1236 | "for e in 
range(epochs):\n", 1237 | " random_row_idx = np.random.choice(X_train.shape[0],size=128)\n", 1238 | " X_batch = X_train[random_row_idx,:]\n", 1239 | " y_batch = y_train[random_row_idx]\n", 1240 | " network.train(X_batch, y_batch)" 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "code", 1245 | "execution_count": 99, 1246 | "metadata": { 1247 | "collapsed": false 1248 | }, 1249 | "outputs": [ 1250 | { 1251 | "name": "stdout", 1252 | "output_type": "stream", 1253 | "text": [ 1254 | "0.839498806683\n" 1255 | ] 1256 | }, 1257 | { 1258 | "name": "stderr", 1259 | "output_type": "stream", 1260 | "text": [ 1261 | "C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\ipykernel\\__main__.py:2: RuntimeWarning: invalid value encountered in greater_equal\n", 1262 | " from ipykernel import kernelapp as app\n" 1263 | ] 1264 | } 1265 | ], 1266 | "source": [ 1267 | "y_pred = network.run(X_test)\n", 1268 | "y_pred = np.where(y_pred >= 0.5, 1, 0) # if probability >= 0.5, it is 1, else 0\n", 1269 | "print(metrics.accuracy_score(y_test,y_pred[0]))" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "markdown", 1274 | "metadata": { 1275 | "collapsed": false 1276 | }, 1277 | "source": [ 1278 | "#### Wow, SGD improves the accuracy dramatically !!!!" 
1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "markdown", 1283 | "metadata": {}, 1284 | "source": [ 1285 | "## 2.2 Tensorflow" 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": 31, 1291 | "metadata": { 1292 | "collapsed": true 1293 | }, 1294 | "outputs": [], 1295 | "source": [ 1296 | "import tensorflow as tf" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "execution_count": 32, 1302 | "metadata": { 1303 | "collapsed": false, 1304 | "scrolled": true 1305 | }, 1306 | "outputs": [ 1307 | { 1308 | "name": "stdout", 1309 | "output_type": "stream", 1310 | "text": [ 1311 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1312 | "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\n", 1313 | "INFO:tensorflow:Using default config.\n", 1314 | "INFO:tensorflow:Using config: {'keep_checkpoint_every_n_hours': 10000, 'keep_checkpoint_max': 5, '_evaluation_master': '', 'save_summary_steps': 100, '_task_id': 0, 'save_checkpoints_secs': 600, 'tf_config': gpu_options {\n", 1315 | " per_process_gpu_memory_fraction: 1\n", 1316 | "}\n", 1317 | ", '_is_chief': True, '_environment': 'local', 'save_checkpoints_steps': None, '_num_ps_replicas': 0, '_cluster_spec': , '_master': '', '_task_type': None, 'tf_random_seed': None}\n", 1318 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with x is deprecated and will be removed after 2016-12-01.\n", 1319 | "Instructions for updating:\n", 1320 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1321 | "separate class SKCompat. 
Arguments x, y and batch_size are only\n", 1322 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1323 | "Example conversion:\n", 1324 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1325 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with y is deprecated and will be removed after 2016-12-01.\n", 1326 | "Instructions for updating:\n", 1327 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1328 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1329 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1330 | "Example conversion:\n", 1331 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1332 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:315 in fit.: calling BaseEstimator.fit (from tensorflow.contrib.learn.python.learn.estimators.estimator) with batch_size is deprecated and will be removed after 2016-12-01.\n", 1333 | "Instructions for updating:\n", 1334 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1335 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1336 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1337 | "Example conversion:\n", 1338 | " est = Estimator(...) 
-> est = SKCompat(Estimator(...))\n", 1339 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1340 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:fraction_of_zero_values is illegal; using dnn/hiddenlayer_0_fraction_of_zero_values instead.\n", 1341 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:activation is illegal; using dnn/hiddenlayer_0_activation instead.\n", 1342 | "INFO:tensorflow:Summary name dnn/logits:fraction_of_zero_values is illegal; using dnn/logits_fraction_of_zero_values instead.\n", 1343 | "INFO:tensorflow:Summary name dnn/logits:activation is illegal; using dnn/logits_activation instead.\n", 1344 | "INFO:tensorflow:Create CheckpointSaverHook.\n", 1345 | "INFO:tensorflow:loss = 0.534811, step = 1\n", 1346 | "INFO:tensorflow:Saving checkpoints for 1 into C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt.\n", 1347 | "WARNING:tensorflow:*******************************************************\n", 1348 | "WARNING:tensorflow:TensorFlow's V1 checkpoint format has been deprecated.\n", 1349 | "WARNING:tensorflow:Consider switching to the more efficient V2 format:\n", 1350 | "WARNING:tensorflow: `tf.train.Saver(write_version=tf.train.SaverDef.V2)`\n", 1351 | "WARNING:tensorflow:now on by default.\n", 1352 | "WARNING:tensorflow:*******************************************************\n", 1353 | "INFO:tensorflow:loss = 0.105281, step = 101\n", 1354 | "INFO:tensorflow:global_step/sec: 10.1418\n", 1355 | "INFO:tensorflow:loss = 0.033156, step = 201\n", 1356 | "INFO:tensorflow:global_step/sec: 10.5714\n", 1357 | "INFO:tensorflow:Saving checkpoints for 300 into C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt.\n", 1358 | "WARNING:tensorflow:*******************************************************\n", 1359 | "WARNING:tensorflow:TensorFlow's V1 checkpoint format has been deprecated.\n", 1360 | "WARNING:tensorflow:Consider switching to the more efficient V2 format:\n", 1361 | 
"WARNING:tensorflow: `tf.train.Saver(write_version=tf.train.SaverDef.V2)`\n", 1362 | "WARNING:tensorflow:now on by default.\n", 1363 | "WARNING:tensorflow:*******************************************************\n", 1364 | "INFO:tensorflow:Loss for final step: 0.0162134.\n", 1365 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with x is deprecated and will be removed after 2016-12-01.\n", 1366 | "Instructions for updating:\n", 1367 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1368 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1369 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1370 | "Example conversion:\n", 1371 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1372 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with batch_size is deprecated and will be removed after 2016-12-01.\n", 1373 | "Instructions for updating:\n", 1374 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1375 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1376 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1377 | "Example conversion:\n", 1378 | " est = Estimator(...) 
-> est = SKCompat(Estimator(...))\n", 1379 | "WARNING:tensorflow:From C:\\Anaconda3\\envs\\DLNDF\\lib\\site-packages\\tensorflow\\contrib\\learn\\python\\learn\\estimators\\dnn.py:348 in predict.: calling BaseEstimator.predict (from tensorflow.contrib.learn.python.learn.estimators.estimator) with as_iterable is deprecated and will be removed after 2016-12-01.\n", 1380 | "Instructions for updating:\n", 1381 | "Estimator is decoupled from Scikit Learn interface by moving into\n", 1382 | "separate class SKCompat. Arguments x, y and batch_size are only\n", 1383 | "available in the SKCompat class, Estimator will only accept input_fn.\n", 1384 | "Example conversion:\n", 1385 | " est = Estimator(...) -> est = SKCompat(Estimator(...))\n", 1386 | "WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.\n", 1387 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:fraction_of_zero_values is illegal; using dnn/hiddenlayer_0_fraction_of_zero_values instead.\n", 1388 | "INFO:tensorflow:Summary name dnn/hiddenlayer_0:activation is illegal; using dnn/hiddenlayer_0_activation instead.\n", 1389 | "INFO:tensorflow:Summary name dnn/logits:fraction_of_zero_values is illegal; using dnn/logits_fraction_of_zero_values instead.\n", 1390 | "INFO:tensorflow:Summary name dnn/logits:activation is illegal; using dnn/logits_activation instead.\n", 1391 | "INFO:tensorflow:Loading model from checkpoint: C:\\Users\\minga\\AppData\\Local\\Temp\\tmpi4fp2htr\\model.ckpt-300-?????-of-00001.\n" 1392 | ] 1393 | } 1394 | ], 1395 | "source": [ 1396 | "# Build one layer DNN with 40 units respectively.\n", 1397 | "feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)\n", 1398 | "classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, hidden_units=[40], n_classes=2)\n", 1399 | "\n", 1400 | "# Fit and predict.\n", 1401 | "classifier.fit(X_train, y_train, steps=300)\n", 1402 | "predictions = list(classifier.predict(X_test, 
as_iterable=True))\n", 1403 | "score = metrics.accuracy_score(y_test, predictions)" 1404 | ] 1405 | }, 1406 | { 1407 | "cell_type": "code", 1408 | "execution_count": 33, 1409 | "metadata": { 1410 | "collapsed": false 1411 | }, 1412 | "outputs": [ 1413 | { 1414 | "name": "stdout", 1415 | "output_type": "stream", 1416 | "text": [ 1417 | "TF Accuracy: 0.999403341289\n" 1418 | ] 1419 | } 1420 | ], 1421 | "source": [ 1422 | "print('TF Accuracy: ', score)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "markdown", 1427 | "metadata": {}, 1428 | "source": [ 1429 | "## 2.3 SVM" 1430 | ] 1431 | }, 1432 | { 1433 | "cell_type": "code", 1434 | "execution_count": 34, 1435 | "metadata": { 1436 | "collapsed": true 1437 | }, 1438 | "outputs": [], 1439 | "source": [ 1440 | "from sklearn.svm import SVC\n", 1441 | "svc = SVC()\n", 1442 | "svc.fit(X_train, y_train)\n", 1443 | "y_pred = svc.predict(X_test)" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": 35, 1449 | "metadata": { 1450 | "collapsed": false 1451 | }, 1452 | "outputs": [ 1453 | { 1454 | "data": { 1455 | "text/plain": [ 1456 | "array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,\n", 1457 | " 0, 0, 0, 0, 0, 0, 0], dtype=int64)" 1458 | ] 1459 | }, 1460 | "execution_count": 35, 1461 | "metadata": {}, 1462 | "output_type": "execute_result" 1463 | } 1464 | ], 1465 | "source": [ 1466 | "y_pred[:30]" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": 36, 1472 | "metadata": { 1473 | "collapsed": false 1474 | }, 1475 | "outputs": [ 1476 | { 1477 | "name": "stdout", 1478 | "output_type": "stream", 1479 | "text": [ 1480 | "SVM Accuracy: 0.994630071599\n" 1481 | ] 1482 | } 1483 | ], 1484 | "source": [ 1485 | "print('SVM Accuracy: ', metrics.accuracy_score(y_test,y_pred))" 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "markdown", 1490 | "metadata": {}, 1491 | "source": [ 1492 | "## 2.4 Logistic regression" 1493 | ] 1494 | }, 1495 | { 1496 | 
"cell_type": "code", 1497 | "execution_count": 37, 1498 | "metadata": { 1499 | "collapsed": false 1500 | }, 1501 | "outputs": [], 1502 | "source": [ 1503 | "from sklearn.linear_model import LogisticRegression\n", 1504 | "\n", 1505 | "logreg = LogisticRegression(C=1e5)\n", 1506 | "logreg.fit(X_train, y_train)\n", 1507 | "y_pred = logreg.predict(X_test)" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": 38, 1513 | "metadata": { 1514 | "collapsed": false 1515 | }, 1516 | "outputs": [ 1517 | { 1518 | "data": { 1519 | "text/plain": [ 1520 | "array([0, 0, 0, ..., 0, 0, 1], dtype=int64)" 1521 | ] 1522 | }, 1523 | "execution_count": 38, 1524 | "metadata": {}, 1525 | "output_type": "execute_result" 1526 | } 1527 | ], 1528 | "source": [ 1529 | "y_pred" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": 39, 1535 | "metadata": { 1536 | "collapsed": false 1537 | }, 1538 | "outputs": [ 1539 | { 1540 | "name": "stdout", 1541 | "output_type": "stream", 1542 | "text": [ 1543 | "Log Regression Accuracy: 1.0\n" 1544 | ] 1545 | } 1546 | ], 1547 | "source": [ 1548 | "print('Log Regression Accuracy: ', metrics.accuracy_score(y_test,y_pred))" 1549 | ] 1550 | }, 1551 | { 1552 | "cell_type": "markdown", 1553 | "metadata": {}, 1554 | "source": [ 1555 | "# 3. Summary\n", 1556 | " - Manual NN: 0.839498806683. 
\n", 1557 | " - TF Accuracy: 0.999403341289\n", 1558 | " - SVM Accuracy: 0.994630071599\n", 1559 | " - Log Regression Accuracy: 1.0" 1560 | ] 1561 | } 1562 | ], 1563 | "metadata": { 1564 | "kernelspec": { 1565 | "display_name": "Python 3", 1566 | "language": "python", 1567 | "name": "python3" 1568 | }, 1569 | "language_info": { 1570 | "codemirror_mode": { 1571 | "name": "ipython", 1572 | "version": 3 1573 | }, 1574 | "file_extension": ".py", 1575 | "mimetype": "text/x-python", 1576 | "name": "python", 1577 | "nbconvert_exporter": "python", 1578 | "pygments_lexer": "ipython3", 1579 | "version": "3.5.2" 1580 | } 1581 | }, 1582 | "nbformat": 4, 1583 | "nbformat_minor": 2 1584 | } 1585 | -------------------------------------------------------------------------------- /wk8-generate-art/house.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdalai/Deep-Learning-projects/f690b3d8901e2ee7d872765815306ed09ba83a5a/wk8-generate-art/house.jpg -------------------------------------------------------------------------------- /wk8-generate-art/the_scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdalai/Deep-Learning-projects/f690b3d8901e2ee7d872765815306ed09ba83a5a/wk8-generate-art/the_scream.jpg -------------------------------------------------------------------------------- /wk8-generate-art/wave.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdalai/Deep-Learning-projects/f690b3d8901e2ee7d872765815306ed09ba83a5a/wk8-generate-art/wave.jpg --------------------------------------------------------------------------------