├── .gitignore ├── README.md ├── projects ├── boston_housing │ ├── README.md │ ├── boston_housing.ipynb │ ├── housing.csv │ └── visuals.py ├── capstone │ ├── README.md │ ├── project_report_template.md │ ├── report-example-1.pdf │ ├── report-example-2.pdf │ ├── report-example-3.pdf │ └── robot_motion_planning │ │ ├── maze.py │ │ ├── robot.py │ │ ├── showmaze.py │ │ ├── test_maze_01.txt │ │ ├── test_maze_02.txt │ │ ├── test_maze_03.txt │ │ └── tester.py ├── creating_customer_segments │ ├── README.md │ ├── customer_segments.ipynb │ ├── customers.csv │ └── renders.py ├── smartcab │ ├── README.md │ ├── images │ │ ├── car-black.png │ │ ├── car-blue.png │ │ ├── car-cyan.png │ │ ├── car-green.png │ │ ├── car-magenta.png │ │ ├── car-orange.png │ │ ├── car-red.png │ │ ├── car-white.png │ │ └── car-yellow.png │ └── smartcab │ │ ├── __init__.py │ │ ├── agent.py │ │ ├── environment.py │ │ ├── planner.py │ │ └── simulator.py ├── student_intervention │ ├── README.md │ ├── student-data.csv │ └── student_intervention.ipynb └── titanic_survival_exploration │ ├── README.md │ ├── titanic_data.csv │ ├── titanic_survival_exploration.ipynb │ └── titanic_visualizations.py └── projects_cn ├── boston_housing ├── README.md ├── boston_housing.ipynb ├── housing.csv └── visuals.py ├── creating_customer_segments ├── README.md ├── customer_segments.ipynb ├── customers.csv └── renders.py ├── student_intervention ├── README.md ├── student-data.csv └── student_intervention.ipynb └── titanic_survival_exploration ├── README.md ├── titanic_data.csv ├── titanic_survival_exploration.ipynb └── titanic_visualizations.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac OS 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | 
lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | #Ipython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # machine-learning 2 | Content for Udacity's Machine Learning curriculum 3 | 4 | Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Please refer to [Udacity Terms of Service](https://www.udacity.com/legal) for further information. 5 | -------------------------------------------------------------------------------- /projects/boston_housing/README.md: -------------------------------------------------------------------------------- 1 | # Project 1: Model Evaluation & Validation 2 | ## Predicting Boston Housing Prices 3 | 4 | ### Install 5 | 6 | This project requires **Python 2.7** and the following Python libraries installed: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [matplotlib](http://matplotlib.org/) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | 12 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 13 | 14 | Udacity recommends our students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 15 | 16 | ### Code 17 | 18 | Template code is provided in the `boston_housing.ipynb` notebook file. You will also be required to use the included `visuals.py` Python file and the `housing.csv` dataset file to complete your work. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. 19 | 20 | ### Run 21 | 22 | In a terminal or command window, navigate to the top-level project directory `boston_housing/` (that contains this README) and run one of the following commands: 23 | 24 | ```ipython notebook boston_housing.ipynb``` 25 | ```jupyter notebook boston_housing.ipynb``` 26 | 27 | This will open the iPython Notebook software and project file in your browser. 
28 | 29 | ### Data 30 | 31 | The dataset used in this project is included with the scikit-learn library ([`sklearn.datasets.load_boston`](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston)). You do not have to download it separately. You can find more information on this dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Housing) page. 32 | -------------------------------------------------------------------------------- /projects/boston_housing/housing.csv: -------------------------------------------------------------------------------- 1 | RM,LSTAT,PTRATIO,MEDV 2 | 6.575,4.98,15.3,504000.0 3 | 6.421,9.14,17.8,453600.0 4 | 7.185,4.03,17.8,728700.0 5 | 6.998,2.94,18.7,701400.0 6 | 7.147,5.33,18.7,760200.0 7 | 6.43,5.21,18.7,602700.0 8 | 6.012,12.43,15.2,480900.0 9 | 6.172,19.15,15.2,569100.0 10 | 5.631,29.93,15.2,346500.0 11 | 6.004,17.1,15.2,396900.0 12 | 6.377,20.45,15.2,315000.0 13 | 6.009,13.27,15.2,396900.0 14 | 5.889,15.71,15.2,455700.0 15 | 5.949,8.26,21.0,428400.0 16 | 6.096,10.26,21.0,382200.0 17 | 5.834,8.47,21.0,417900.0 18 | 5.935,6.58,21.0,485100.0 19 | 5.99,14.67,21.0,367500.0 20 | 5.456,11.69,21.0,424200.0 21 | 5.727,11.28,21.0,382200.0 22 | 5.57,21.02,21.0,285600.0 23 | 5.965,13.83,21.0,411600.0 24 | 6.142,18.72,21.0,319200.0 25 | 5.813,19.88,21.0,304500.0 26 | 5.924,16.3,21.0,327600.0 27 | 5.599,16.51,21.0,291900.0 28 | 5.813,14.81,21.0,348600.0 29 | 6.047,17.28,21.0,310800.0 30 | 6.495,12.8,21.0,386400.0 31 | 6.674,11.98,21.0,441000.0 32 | 5.713,22.6,21.0,266700.0 33 | 6.072,13.04,21.0,304500.0 34 | 5.95,27.71,21.0,277200.0 35 | 5.701,18.35,21.0,275100.0 36 | 6.096,20.34,21.0,283500.0 37 | 5.933,9.68,19.2,396900.0 38 | 5.841,11.41,19.2,420000.0 39 | 5.85,8.77,19.2,441000.0 40 | 5.966,10.13,19.2,518700.0 41 | 6.595,4.32,18.3,646800.0 42 | 7.024,1.98,18.3,732900.0 43 | 6.77,4.84,17.9,558600.0 44 | 6.169,5.81,17.9,531300.0 45 | 
6.211,7.44,17.9,518700.0 46 | 6.069,9.55,17.9,445200.0 47 | 5.682,10.21,17.9,405300.0 48 | 5.786,14.15,17.9,420000.0 49 | 6.03,18.8,17.9,348600.0 50 | 5.399,30.81,17.9,302400.0 51 | 5.602,16.2,17.9,407400.0 52 | 5.963,13.45,16.8,413700.0 53 | 6.115,9.43,16.8,430500.0 54 | 6.511,5.28,16.8,525000.0 55 | 5.998,8.43,16.8,491400.0 56 | 5.888,14.8,21.1,396900.0 57 | 7.249,4.81,17.9,743400.0 58 | 6.383,5.77,17.3,518700.0 59 | 6.816,3.95,15.1,663600.0 60 | 6.145,6.86,19.7,489300.0 61 | 5.927,9.22,19.7,411600.0 62 | 5.741,13.15,19.7,392700.0 63 | 5.966,14.44,19.7,336000.0 64 | 6.456,6.73,19.7,466200.0 65 | 6.762,9.5,19.7,525000.0 66 | 7.104,8.05,18.6,693000.0 67 | 6.29,4.67,16.1,493500.0 68 | 5.787,10.24,16.1,407400.0 69 | 5.878,8.1,18.9,462000.0 70 | 5.594,13.09,18.9,365400.0 71 | 5.885,8.79,18.9,438900.0 72 | 6.417,6.72,19.2,508200.0 73 | 5.961,9.88,19.2,455700.0 74 | 6.065,5.52,19.2,478800.0 75 | 6.245,7.54,19.2,491400.0 76 | 6.273,6.78,18.7,506100.0 77 | 6.286,8.94,18.7,449400.0 78 | 6.279,11.97,18.7,420000.0 79 | 6.14,10.27,18.7,436800.0 80 | 6.232,12.34,18.7,445200.0 81 | 5.874,9.1,18.7,426300.0 82 | 6.727,5.29,19.0,588000.0 83 | 6.619,7.22,19.0,501900.0 84 | 6.302,6.72,19.0,520800.0 85 | 6.167,7.51,19.0,480900.0 86 | 6.389,9.62,18.5,501900.0 87 | 6.63,6.53,18.5,558600.0 88 | 6.015,12.86,18.5,472500.0 89 | 6.121,8.44,18.5,466200.0 90 | 7.007,5.5,17.8,495600.0 91 | 7.079,5.7,17.8,602700.0 92 | 6.417,8.81,17.8,474600.0 93 | 6.405,8.2,17.8,462000.0 94 | 6.442,8.16,18.2,480900.0 95 | 6.211,6.21,18.2,525000.0 96 | 6.249,10.59,18.2,432600.0 97 | 6.625,6.65,18.0,596400.0 98 | 6.163,11.34,18.0,449400.0 99 | 8.069,4.21,18.0,812700.0 100 | 7.82,3.57,18.0,919800.0 101 | 7.416,6.19,18.0,697200.0 102 | 6.727,9.42,20.9,577500.0 103 | 6.781,7.67,20.9,556500.0 104 | 6.405,10.63,20.9,390600.0 105 | 6.137,13.44,20.9,405300.0 106 | 6.167,12.33,20.9,422100.0 107 | 5.851,16.47,20.9,409500.0 108 | 5.836,18.66,20.9,409500.0 109 | 6.127,14.09,20.9,428400.0 110 | 6.474,12.27,20.9,415800.0 111 
| 6.229,15.55,20.9,407400.0 112 | 6.195,13.0,20.9,455700.0 113 | 6.715,10.16,17.8,478800.0 114 | 5.913,16.21,17.8,394800.0 115 | 6.092,17.09,17.8,392700.0 116 | 6.254,10.45,17.8,388500.0 117 | 5.928,15.76,17.8,384300.0 118 | 6.176,12.04,17.8,445200.0 119 | 6.021,10.3,17.8,403200.0 120 | 5.872,15.37,17.8,428400.0 121 | 5.731,13.61,17.8,405300.0 122 | 5.87,14.37,19.1,462000.0 123 | 6.004,14.27,19.1,426300.0 124 | 5.961,17.93,19.1,430500.0 125 | 5.856,25.41,19.1,363300.0 126 | 5.879,17.58,19.1,394800.0 127 | 5.986,14.81,19.1,449400.0 128 | 5.613,27.26,19.1,329700.0 129 | 5.693,17.19,21.2,340200.0 130 | 6.431,15.39,21.2,378000.0 131 | 5.637,18.34,21.2,300300.0 132 | 6.458,12.6,21.2,403200.0 133 | 6.326,12.26,21.2,411600.0 134 | 6.372,11.12,21.2,483000.0 135 | 5.822,15.03,21.2,386400.0 136 | 5.757,17.31,21.2,327600.0 137 | 6.335,16.96,21.2,380100.0 138 | 5.942,16.9,21.2,365400.0 139 | 6.454,14.59,21.2,359100.0 140 | 5.857,21.32,21.2,279300.0 141 | 6.151,18.46,21.2,373800.0 142 | 6.174,24.16,21.2,294000.0 143 | 5.019,34.41,21.2,302400.0 144 | 5.403,26.82,14.7,281400.0 145 | 5.468,26.42,14.7,327600.0 146 | 4.903,29.29,14.7,247800.0 147 | 6.13,27.8,14.7,289800.0 148 | 5.628,16.65,14.7,327600.0 149 | 4.926,29.53,14.7,306600.0 150 | 5.186,28.32,14.7,373800.0 151 | 5.597,21.45,14.7,323400.0 152 | 6.122,14.1,14.7,451500.0 153 | 5.404,13.28,14.7,411600.0 154 | 5.012,12.12,14.7,321300.0 155 | 5.709,15.79,14.7,407400.0 156 | 6.129,15.12,14.7,357000.0 157 | 6.152,15.02,14.7,327600.0 158 | 5.272,16.14,14.7,275100.0 159 | 6.943,4.59,14.7,867300.0 160 | 6.066,6.43,14.7,510300.0 161 | 6.51,7.39,14.7,489300.0 162 | 6.25,5.5,14.7,567000.0 163 | 5.854,11.64,14.7,476700.0 164 | 6.101,9.81,14.7,525000.0 165 | 5.877,12.14,14.7,499800.0 166 | 6.319,11.1,14.7,499800.0 167 | 6.402,11.32,14.7,468300.0 168 | 5.875,14.43,14.7,365400.0 169 | 5.88,12.03,14.7,401100.0 170 | 5.572,14.69,16.6,485100.0 171 | 6.416,9.04,16.6,495600.0 172 | 5.859,9.64,16.6,474600.0 173 | 6.546,5.33,16.6,617400.0 174 | 
6.02,10.11,16.6,487200.0 175 | 6.315,6.29,16.6,516600.0 176 | 6.86,6.92,16.6,627900.0 177 | 6.98,5.04,17.8,781200.0 178 | 7.765,7.56,17.8,835800.0 179 | 6.144,9.45,17.8,760200.0 180 | 7.155,4.82,17.8,795900.0 181 | 6.563,5.68,17.8,682500.0 182 | 5.604,13.98,17.8,554400.0 183 | 6.153,13.15,17.8,621600.0 184 | 6.782,6.68,15.2,672000.0 185 | 6.556,4.56,15.2,625800.0 186 | 7.185,5.39,15.2,732900.0 187 | 6.951,5.1,15.2,777000.0 188 | 6.739,4.69,15.2,640500.0 189 | 7.178,2.87,15.2,764400.0 190 | 6.8,5.03,15.6,653100.0 191 | 6.604,4.38,15.6,611100.0 192 | 7.287,4.08,12.6,699300.0 193 | 7.107,8.61,12.6,636300.0 194 | 7.274,6.62,12.6,726600.0 195 | 6.975,4.56,17.0,732900.0 196 | 7.135,4.45,17.0,690900.0 197 | 6.162,7.43,14.7,506100.0 198 | 7.61,3.11,14.7,888300.0 199 | 7.853,3.81,14.7,1018500.0 200 | 5.891,10.87,18.6,474600.0 201 | 6.326,10.97,18.6,512400.0 202 | 5.783,18.06,18.6,472500.0 203 | 6.064,14.66,18.6,512400.0 204 | 5.344,23.09,18.6,420000.0 205 | 5.96,17.27,18.6,455700.0 206 | 5.404,23.98,18.6,405300.0 207 | 5.807,16.03,18.6,470400.0 208 | 6.375,9.38,18.6,590100.0 209 | 5.412,29.55,18.6,497700.0 210 | 6.182,9.47,18.6,525000.0 211 | 5.888,13.51,16.4,489300.0 212 | 6.642,9.69,16.4,602700.0 213 | 5.951,17.92,16.4,451500.0 214 | 6.373,10.5,16.4,483000.0 215 | 6.951,9.71,17.4,560700.0 216 | 6.164,21.46,17.4,455700.0 217 | 6.879,9.93,17.4,577500.0 218 | 6.618,7.6,17.4,632100.0 219 | 8.266,4.14,17.4,940800.0 220 | 8.04,3.13,17.4,789600.0 221 | 7.163,6.36,17.4,663600.0 222 | 7.686,3.92,17.4,980700.0 223 | 6.552,3.76,17.4,661500.0 224 | 5.981,11.65,17.4,510300.0 225 | 7.412,5.25,17.4,665700.0 226 | 8.337,2.47,17.4,875700.0 227 | 8.247,3.95,17.4,1014300.0 228 | 6.726,8.05,17.4,609000.0 229 | 6.086,10.88,17.4,504000.0 230 | 6.631,9.54,17.4,527100.0 231 | 7.358,4.73,17.4,661500.0 232 | 6.481,6.36,16.6,497700.0 233 | 6.606,7.37,16.6,489300.0 234 | 6.897,11.38,16.6,462000.0 235 | 6.095,12.4,16.6,422100.0 236 | 6.358,11.22,16.6,466200.0 237 | 6.393,5.19,16.6,497700.0 238 | 
5.593,12.5,19.1,369600.0 239 | 5.605,18.46,19.1,388500.0 240 | 6.108,9.16,19.1,510300.0 241 | 6.226,10.15,19.1,430500.0 242 | 6.433,9.52,19.1,514500.0 243 | 6.718,6.56,19.1,550200.0 244 | 6.487,5.9,19.1,512400.0 245 | 6.438,3.59,19.1,520800.0 246 | 6.957,3.53,19.1,621600.0 247 | 8.259,3.54,19.1,898800.0 248 | 6.108,6.57,16.4,459900.0 249 | 5.876,9.25,16.4,438900.0 250 | 7.454,3.11,15.9,924000.0 251 | 7.333,7.79,13.0,756000.0 252 | 6.842,6.9,13.0,632100.0 253 | 7.203,9.59,13.0,709800.0 254 | 7.52,7.26,13.0,905100.0 255 | 8.398,5.91,13.0,1024800.0 256 | 7.327,11.25,13.0,651000.0 257 | 7.206,8.1,13.0,766500.0 258 | 5.56,10.45,13.0,478800.0 259 | 7.014,14.79,13.0,644700.0 260 | 7.47,3.16,13.0,913500.0 261 | 5.92,13.65,18.6,434700.0 262 | 5.856,13.0,18.6,443100.0 263 | 6.24,6.59,18.6,529200.0 264 | 6.538,7.73,18.6,512400.0 265 | 7.691,6.58,18.6,739200.0 266 | 6.758,3.53,17.6,680400.0 267 | 6.854,2.98,17.6,672000.0 268 | 7.267,6.05,17.6,697200.0 269 | 6.826,4.16,17.6,695100.0 270 | 6.482,7.19,17.6,611100.0 271 | 6.812,4.85,14.9,737100.0 272 | 7.82,3.76,14.9,953400.0 273 | 6.968,4.59,14.9,743400.0 274 | 7.645,3.01,14.9,966000.0 275 | 7.088,7.85,15.3,676200.0 276 | 6.453,8.23,15.3,462000.0 277 | 6.23,12.93,18.2,422100.0 278 | 6.209,7.14,16.6,487200.0 279 | 6.315,7.6,16.6,468300.0 280 | 6.565,9.51,16.6,520800.0 281 | 6.861,3.33,19.2,598500.0 282 | 7.148,3.56,19.2,783300.0 283 | 6.63,4.7,19.2,585900.0 284 | 6.127,8.58,16.0,501900.0 285 | 6.009,10.4,16.0,455700.0 286 | 6.678,6.27,16.0,600600.0 287 | 6.549,7.39,16.0,569100.0 288 | 5.79,15.84,16.0,426300.0 289 | 6.345,4.97,14.8,472500.0 290 | 7.041,4.74,14.8,609000.0 291 | 6.871,6.07,14.8,520800.0 292 | 6.59,9.5,16.1,462000.0 293 | 6.495,8.67,16.1,554400.0 294 | 6.982,4.86,16.1,695100.0 295 | 7.236,6.93,18.4,758100.0 296 | 6.616,8.93,18.4,596400.0 297 | 7.42,6.47,18.4,701400.0 298 | 6.849,7.53,18.4,592200.0 299 | 6.635,4.54,18.4,478800.0 300 | 5.972,9.97,18.4,426300.0 301 | 4.973,12.64,18.4,338100.0 302 | 
6.122,5.98,18.4,464100.0 303 | 6.023,11.72,18.4,407400.0 304 | 6.266,7.9,18.4,453600.0 305 | 6.567,9.28,18.4,499800.0 306 | 5.705,11.5,18.4,340200.0 307 | 5.914,18.33,18.4,373800.0 308 | 5.782,15.94,18.4,415800.0 309 | 6.382,10.36,18.4,485100.0 310 | 6.113,12.73,18.4,441000.0 311 | 6.426,7.2,19.6,499800.0 312 | 6.376,6.87,19.6,485100.0 313 | 6.041,7.7,19.6,428400.0 314 | 5.708,11.74,19.6,388500.0 315 | 6.415,6.12,19.6,525000.0 316 | 6.431,5.08,19.6,516600.0 317 | 6.312,6.15,19.6,483000.0 318 | 6.083,12.79,19.6,466200.0 319 | 5.868,9.97,16.9,405300.0 320 | 6.333,7.34,16.9,474600.0 321 | 6.144,9.09,16.9,415800.0 322 | 5.706,12.43,16.9,359100.0 323 | 6.031,7.83,16.9,407400.0 324 | 6.316,5.68,20.2,466200.0 325 | 6.31,6.75,20.2,434700.0 326 | 6.037,8.01,20.2,443100.0 327 | 5.869,9.8,20.2,409500.0 328 | 5.895,10.56,20.2,388500.0 329 | 6.059,8.51,20.2,432600.0 330 | 5.985,9.74,20.2,399000.0 331 | 5.968,9.29,20.2,392700.0 332 | 7.241,5.49,15.5,686700.0 333 | 6.54,8.65,15.9,346500.0 334 | 6.696,7.18,17.6,501900.0 335 | 6.874,4.61,17.6,655200.0 336 | 6.014,10.53,18.8,367500.0 337 | 5.898,12.67,18.8,361200.0 338 | 6.516,6.36,17.9,485100.0 339 | 6.635,5.99,17.0,514500.0 340 | 6.939,5.89,19.7,558600.0 341 | 6.49,5.98,19.7,480900.0 342 | 6.579,5.49,18.3,506100.0 343 | 5.884,7.79,18.3,390600.0 344 | 6.728,4.5,17.0,632100.0 345 | 5.663,8.05,22.0,382200.0 346 | 5.936,5.57,22.0,432600.0 347 | 6.212,17.6,20.2,373800.0 348 | 6.395,13.27,20.2,455700.0 349 | 6.127,11.48,20.2,476700.0 350 | 6.112,12.67,20.2,474600.0 351 | 6.398,7.79,20.2,525000.0 352 | 6.251,14.19,20.2,417900.0 353 | 5.362,10.19,20.2,436800.0 354 | 5.803,14.64,20.2,352800.0 355 | 3.561,7.12,20.2,577500.0 356 | 4.963,14.0,20.2,459900.0 357 | 3.863,13.33,20.2,485100.0 358 | 4.906,34.77,20.2,289800.0 359 | 4.138,37.97,20.2,289800.0 360 | 7.313,13.44,20.2,315000.0 361 | 6.649,23.24,20.2,291900.0 362 | 6.794,21.24,20.2,279300.0 363 | 6.38,23.69,20.2,275100.0 364 | 6.223,21.78,20.2,214200.0 365 | 6.968,17.21,20.2,218400.0 366 
| 6.545,21.08,20.2,228900.0 367 | 5.536,23.6,20.2,237300.0 368 | 5.52,24.56,20.2,258300.0 369 | 4.368,30.63,20.2,184800.0 370 | 5.277,30.81,20.2,151200.0 371 | 4.652,28.28,20.2,220500.0 372 | 5.0,31.99,20.2,155400.0 373 | 4.88,30.62,20.2,214200.0 374 | 5.39,20.85,20.2,241500.0 375 | 5.713,17.11,20.2,317100.0 376 | 6.051,18.76,20.2,487200.0 377 | 5.036,25.68,20.2,203700.0 378 | 6.193,15.17,20.2,289800.0 379 | 5.887,16.35,20.2,266700.0 380 | 6.471,17.12,20.2,275100.0 381 | 6.405,19.37,20.2,262500.0 382 | 5.747,19.92,20.2,178500.0 383 | 5.453,30.59,20.2,105000.0 384 | 5.852,29.97,20.2,132300.0 385 | 5.987,26.77,20.2,117600.0 386 | 6.343,20.32,20.2,151200.0 387 | 6.404,20.31,20.2,254100.0 388 | 5.349,19.77,20.2,174300.0 389 | 5.531,27.38,20.2,178500.0 390 | 5.683,22.98,20.2,105000.0 391 | 4.138,23.34,20.2,249900.0 392 | 5.608,12.13,20.2,585900.0 393 | 5.617,26.4,20.2,361200.0 394 | 6.852,19.78,20.2,577500.0 395 | 5.757,10.11,20.2,315000.0 396 | 6.657,21.22,20.2,361200.0 397 | 4.628,34.37,20.2,375900.0 398 | 5.155,20.08,20.2,342300.0 399 | 4.519,36.98,20.2,147000.0 400 | 6.434,29.05,20.2,151200.0 401 | 6.782,25.79,20.2,157500.0 402 | 5.304,26.64,20.2,218400.0 403 | 5.957,20.62,20.2,184800.0 404 | 6.824,22.74,20.2,176400.0 405 | 6.411,15.02,20.2,350700.0 406 | 6.006,15.7,20.2,298200.0 407 | 5.648,14.1,20.2,436800.0 408 | 6.103,23.29,20.2,281400.0 409 | 5.565,17.16,20.2,245700.0 410 | 5.896,24.39,20.2,174300.0 411 | 5.837,15.69,20.2,214200.0 412 | 6.202,14.52,20.2,228900.0 413 | 6.193,21.52,20.2,231000.0 414 | 6.38,24.08,20.2,199500.0 415 | 6.348,17.64,20.2,304500.0 416 | 6.833,19.69,20.2,296100.0 417 | 6.425,12.03,20.2,338100.0 418 | 6.436,16.22,20.2,300300.0 419 | 6.208,15.17,20.2,245700.0 420 | 6.629,23.27,20.2,281400.0 421 | 6.461,18.05,20.2,201600.0 422 | 6.152,26.45,20.2,182700.0 423 | 5.935,34.02,20.2,176400.0 424 | 5.627,22.88,20.2,268800.0 425 | 5.818,22.11,20.2,220500.0 426 | 6.406,19.52,20.2,359100.0 427 | 6.219,16.59,20.2,386400.0 428 | 
6.485,18.85,20.2,323400.0 429 | 5.854,23.79,20.2,226800.0 430 | 6.459,23.98,20.2,247800.0 431 | 6.341,17.79,20.2,312900.0 432 | 6.251,16.44,20.2,264600.0 433 | 6.185,18.13,20.2,296100.0 434 | 6.417,19.31,20.2,273000.0 435 | 6.749,17.44,20.2,281400.0 436 | 6.655,17.73,20.2,319200.0 437 | 6.297,17.27,20.2,338100.0 438 | 7.393,16.74,20.2,373800.0 439 | 6.728,18.71,20.2,312900.0 440 | 6.525,18.13,20.2,296100.0 441 | 5.976,19.01,20.2,266700.0 442 | 5.936,16.94,20.2,283500.0 443 | 6.301,16.23,20.2,312900.0 444 | 6.081,14.7,20.2,420000.0 445 | 6.701,16.42,20.2,344400.0 446 | 6.376,14.65,20.2,371700.0 447 | 6.317,13.99,20.2,409500.0 448 | 6.513,10.29,20.2,424200.0 449 | 6.209,13.22,20.2,449400.0 450 | 5.759,14.13,20.2,417900.0 451 | 5.952,17.15,20.2,399000.0 452 | 6.003,21.32,20.2,401100.0 453 | 5.926,18.13,20.2,401100.0 454 | 5.713,14.76,20.2,422100.0 455 | 6.167,16.29,20.2,417900.0 456 | 6.229,12.87,20.2,411600.0 457 | 6.437,14.36,20.2,487200.0 458 | 6.98,11.66,20.2,625800.0 459 | 5.427,18.14,20.2,289800.0 460 | 6.162,24.1,20.2,279300.0 461 | 6.484,18.68,20.2,350700.0 462 | 5.304,24.91,20.2,252000.0 463 | 6.185,18.03,20.2,306600.0 464 | 6.229,13.11,20.2,449400.0 465 | 6.242,10.74,20.2,483000.0 466 | 6.75,7.74,20.2,497700.0 467 | 7.061,7.01,20.2,525000.0 468 | 5.762,10.42,20.2,457800.0 469 | 5.871,13.34,20.2,432600.0 470 | 6.312,10.58,20.2,445200.0 471 | 6.114,14.98,20.2,401100.0 472 | 5.905,11.45,20.2,432600.0 473 | 5.454,18.06,20.1,319200.0 474 | 5.414,23.97,20.1,147000.0 475 | 5.093,29.68,20.1,170100.0 476 | 5.983,18.07,20.1,285600.0 477 | 5.983,13.35,20.1,422100.0 478 | 5.707,12.01,19.2,457800.0 479 | 5.926,13.59,19.2,514500.0 480 | 5.67,17.6,19.2,485100.0 481 | 5.39,21.14,19.2,413700.0 482 | 5.794,14.1,19.2,384300.0 483 | 6.019,12.92,19.2,445200.0 484 | 5.569,15.1,19.2,367500.0 485 | 6.027,14.33,19.2,352800.0 486 | 6.593,9.67,21.0,470400.0 487 | 6.12,9.08,21.0,432600.0 488 | 6.976,5.64,21.0,501900.0 489 | 6.794,6.48,21.0,462000.0 490 | 6.03,7.88,21.0,249900.0 491 | 
-------------------------------------------------------------------------------- /projects/boston_housing/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | ########################################### 7 | 8 | import matplotlib.pyplot as pl 9 | import numpy as np 10 | import sklearn.learning_curve as curves 11 | from sklearn.tree import DecisionTreeRegressor 12 | from sklearn.cross_validation import ShuffleSplit, train_test_split 13 | 14 | def ModelLearning(X, y): 15 | """ Calculates the performance of several models with varying sizes of training data. 16 | The learning and testing scores for each model are then plotted. """ 17 | 18 | # Create 10 cross-validation sets for training and testing 19 | cv = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.2, random_state = 0) 20 | 21 | # Generate the training set sizes increasing by 50 22 | train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int) 23 | 24 | # Create the figure window 25 | fig = pl.figure(figsize=(10,7)) 26 | 27 | # Create three different models based on max_depth 28 | for k, depth in enumerate([1,3,6,10]): 29 | 30 | # Create a Decision tree regressor at max_depth = depth 31 | regressor = DecisionTreeRegressor(max_depth = depth) 32 | 33 | # Calculate the training and testing scores 34 | sizes, train_scores, test_scores = curves.learning_curve(regressor, X, y, \ 35 | cv = cv, train_sizes = train_sizes, scoring = 'r2') 36 | 37 | # Find the mean and standard deviation for smoothing 38 | train_std = np.std(train_scores, axis = 1) 39 | train_mean = np.mean(train_scores, axis = 1) 40 | test_std = np.std(test_scores, axis = 1) 41 | test_mean = np.mean(test_scores, axis = 1) 42 | 43 | # Subplot the learning curve 
44 | ax = fig.add_subplot(2, 2, k+1) 45 | ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score') 46 | ax.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score') 47 | ax.fill_between(sizes, train_mean - train_std, \ 48 | train_mean + train_std, alpha = 0.15, color = 'r') 49 | ax.fill_between(sizes, test_mean - test_std, \ 50 | test_mean + test_std, alpha = 0.15, color = 'g') 51 | 52 | # Labels 53 | ax.set_title('max_depth = %s'%(depth)) 54 | ax.set_xlabel('Number of Training Points') 55 | ax.set_ylabel('Score') 56 | ax.set_xlim([0, X.shape[0]*0.8]) 57 | ax.set_ylim([-0.05, 1.05]) 58 | 59 | # Visual aesthetics 60 | ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.) 61 | fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03) 62 | fig.tight_layout() 63 | fig.show() 64 | 65 | 66 | def ModelComplexity(X, y): 67 | """ Calculates the performance of the model as model complexity increases. 68 | The learning and testing errors rates are then plotted. 
""" 69 | 70 | # Create 10 cross-validation sets for training and testing 71 | cv = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.2, random_state = 0) 72 | 73 | # Vary the max_depth parameter from 1 to 10 74 | max_depth = np.arange(1,11) 75 | 76 | # Calculate the training and testing scores 77 | train_scores, test_scores = curves.validation_curve(DecisionTreeRegressor(), X, y, \ 78 | param_name = "max_depth", param_range = max_depth, cv = cv, scoring = 'r2') 79 | 80 | # Find the mean and standard deviation for smoothing 81 | train_mean = np.mean(train_scores, axis=1) 82 | train_std = np.std(train_scores, axis=1) 83 | test_mean = np.mean(test_scores, axis=1) 84 | test_std = np.std(test_scores, axis=1) 85 | 86 | # Plot the validation curve 87 | pl.figure(figsize=(7, 5)) 88 | pl.title('Decision Tree Regressor Complexity Performance') 89 | pl.plot(max_depth, train_mean, 'o-', color = 'r', label = 'Training Score') 90 | pl.plot(max_depth, test_mean, 'o-', color = 'g', label = 'Validation Score') 91 | pl.fill_between(max_depth, train_mean - train_std, \ 92 | train_mean + train_std, alpha = 0.15, color = 'r') 93 | pl.fill_between(max_depth, test_mean - test_std, \ 94 | test_mean + test_std, alpha = 0.15, color = 'g') 95 | 96 | # Visual aesthetics 97 | pl.legend(loc = 'lower right') 98 | pl.xlabel('Maximum Depth') 99 | pl.ylabel('Score') 100 | pl.ylim([-0.05,1.05]) 101 | pl.show() 102 | 103 | 104 | def PredictTrials(X, y, fitter, data): 105 | """ Performs trials of fitting and predicting data. 
""" 106 | 107 | # Store the predicted prices 108 | prices = [] 109 | 110 | for k in range(10): 111 | # Split the data 112 | X_train, X_test, y_train, y_test = train_test_split(X, y, \ 113 | test_size = 0.2, random_state = k) 114 | 115 | # Fit the data 116 | reg = fitter(X_train, y_train) 117 | 118 | # Make a prediction 119 | pred = reg.predict([data[0]])[0] 120 | prices.append(pred) 121 | 122 | # Result 123 | print "Trial {}: ${:,.2f}".format(k+1, pred) 124 | 125 | # Display price range 126 | print "\nRange in prices: ${:,.2f}".format(max(prices) - min(prices)) -------------------------------------------------------------------------------- /projects/capstone/README.md: -------------------------------------------------------------------------------- 1 | # Project 5: Capstone 2 | 3 | The Capstone Project for the Machine Learning Engineer Nanodegree does not have any requirements for code, libraries, or datasets. You are free to choose your project as you wish! For students who are unable to construct a capstone project on their own imagination, a pre-built project has been provided in `robot_motion_planning`. -------------------------------------------------------------------------------- /projects/capstone/project_report_template.md: -------------------------------------------------------------------------------- 1 | # Capstone Project 2 | ## Machine Learning Engineer Nanodegree 3 | Joe Udacity 4 | December 31st, 2050 5 | 6 | ## I. Definition 7 | _(approx. 1-2 pages)_ 8 | 9 | ### Project Overview 10 | In this section, look to provide a high-level overview of the project in layman’s terms. 
Questions to ask yourself when writing this section: 11 | - _Has an overview of the project been provided, such as the problem domain, project origin, and related datasets or input data?_ 12 | - _Has enough background information been given so that an uninformed reader would understand the problem domain and following problem statement?_ 13 | 14 | ### Problem Statement 15 | In this section, you will want to clearly define the problem that you are trying to solve, including the strategy (outline of tasks) you will use to achieve the desired solution. You should also thoroughly discuss what the intended solution will be for this problem. Questions to ask yourself when writing this section: 16 | - _Is the problem statement clearly defined? Will the reader understand what you are expecting to solve?_ 17 | - _Have you thoroughly discussed how you will attempt to solve the problem?_ 18 | - _Is an anticipated solution clearly defined? Will the reader understand what results you are looking for?_ 19 | 20 | ### Metrics 21 | In this section, you will need to clearly define the metrics or calculations you will use to measure performance of a model or result in your project. These calculations and metrics should be justified based on the characteristics of the problem and problem domain. Questions to ask yourself when writing this section: 22 | - _Are the metrics you’ve chosen to measure the performance of your models clearly discussed and defined?_ 23 | - _Have you provided reasonable justification for the metrics chosen based on the problem and solution?_ 24 | 25 | 26 | ## II. Analysis 27 | _(approx. 2-4 pages)_ 28 | 29 | ### Data Exploration 30 | In this section, you will be expected to analyze the data you are using for the problem. This data can either be in the form of a dataset (or datasets), input data (or input files), or even an environment. 
The type of data should be thoroughly described and, if possible, have basic statistics and information presented (such as discussion of input features or defining characteristics about the input or environment). Any abnormalities or interesting qualities about the data that may need to be addressed have been identified (such as features that need to be transformed or the possibility of outliers). Questions to ask yourself when writing this section: 31 | - _If a dataset is present for this problem, have you thoroughly discussed certain features about the dataset? Has a data sample been provided to the reader?_ 32 | - _If a dataset is present for this problem, are statistics about the dataset calculated and reported? Have any relevant results from this calculation been discussed?_ 33 | - _If a dataset is **not** present for this problem, has discussion been made about the input space or input data for your problem?_ 34 | - _Are there any abnormalities or characteristics about the input space or dataset that need to be addressed? (categorical variables, missing values, outliers, etc.)_ 35 | 36 | ### Exploratory Visualization 37 | In this section, you will need to provide some form of visualization that summarizes or extracts a relevant characteristic or feature about the data. The visualization should adequately support the data being used. Discuss why this visualization was chosen and how it is relevant. Questions to ask yourself when writing this section: 38 | - _Have you visualized a relevant characteristic or feature about the dataset or input data?_ 39 | - _Is the visualization thoroughly analyzed and discussed?_ 40 | - _If a plot is provided, are the axes, title, and datum clearly defined?_ 41 | 42 | ### Algorithms and Techniques 43 | In this section, you will need to discuss the algorithms and techniques you intend to use for solving the problem. You should justify the use of each one based on the characteristics of the problem and the problem domain. 
Questions to ask yourself when writing this section: 44 | - _Are the algorithms you will use, including any default variables/parameters in the project clearly defined?_ 45 | - _Are the techniques to be used thoroughly discussed and justified?_ 46 | - _Is it made clear how the input data or datasets will be handled by the algorithms and techniques chosen?_ 47 | 48 | ### Benchmark 49 | In this section, you will need to provide a clearly defined benchmark result or threshold for comparing across performances obtained by your solution. The reasoning behind the benchmark (in the case where it is not an established result) should be discussed. Questions to ask yourself when writing this section: 50 | - _Has some result or value been provided that acts as a benchmark for measuring performance?_ 51 | - _Is it clear how this result or value was obtained (whether by data or by hypothesis)?_ 52 | 53 | 54 | ## III. Methodology 55 | _(approx. 3-5 pages)_ 56 | 57 | ### Data Preprocessing 58 | In this section, all of your preprocessing steps will need to be clearly documented, if any were necessary. From the previous section, any of the abnormalities or characteristics that you identified about the dataset will be addressed and corrected here. Questions to ask yourself when writing this section: 59 | - _If the algorithms chosen require preprocessing steps like feature selection or feature transformations, have they been properly documented?_ 60 | - _Based on the **Data Exploration** section, if there were abnormalities or characteristics that needed to be addressed, have they been properly corrected?_ 61 | - _If no preprocessing is needed, has it been made clear why?_ 62 | 63 | ### Implementation 64 | In this section, the process for which metrics, algorithms, and techniques that you implemented for the given data will need to be clearly documented. 
It should be abundantly clear how the implementation was carried out, and discussion should be made regarding any complications that occurred during this process. Questions to ask yourself when writing this section: 65 | - _Is it made clear how the algorithms and techniques were implemented with the given datasets or input data?_ 66 | - _Were there any complications with the original metrics or techniques that required changing prior to acquiring a solution?_ 67 | - _Was there any part of the coding process (e.g., writing complicated functions) that should be documented?_ 68 | 69 | ### Refinement 70 | In this section, you will need to discuss the process of improvement you made upon the algorithms and techniques you used in your implementation. For example, adjusting parameters for certain models to acquire improved solutions would fall under the refinement category. Your initial and final solutions should be reported, as well as any significant intermediate results as necessary. Questions to ask yourself when writing this section: 71 | - _Has an initial solution been found and clearly reported?_ 72 | - _Is the process of improvement clearly documented, such as what techniques were used?_ 73 | - _Are intermediate and final solutions clearly reported as the process is improved?_ 74 | 75 | 76 | ## IV. Results 77 | _(approx. 2-3 pages)_ 78 | 79 | ### Model Evaluation and Validation 80 | In this section, the final model and any supporting qualities should be evaluated in detail. It should be clear how the final model was derived and why this model was chosen. In addition, some type of analysis should be used to validate the robustness of this model and its solution, such as manipulating the input data or environment to see how the model’s solution is affected (this is called sensitivity analysis). Questions to ask yourself when writing this section: 81 | - _Is the final model reasonable and aligning with solution expectations? 
Are the final parameters of the model appropriate?_ 82 | - _Has the final model been tested with various inputs to evaluate whether the model generalizes well to unseen data?_ 83 | - _Is the model robust enough for the problem? Do small perturbations (changes) in training data or the input space greatly affect the results?_ 84 | - _Can results found from the model be trusted?_ 85 | 86 | ### Justification 87 | In this section, your model’s final solution and its results should be compared to the benchmark you established earlier in the project using some type of statistical analysis. You should also justify whether these results and the solution are significant enough to have solved the problem posed in the project. Questions to ask yourself when writing this section: 88 | - _Are the final results found stronger than the benchmark result reported earlier?_ 89 | - _Have you thoroughly analyzed and discussed the final solution?_ 90 | - _Is the final solution significant enough to have solved the problem?_ 91 | 92 | 93 | ## V. Conclusion 94 | _(approx. 1-2 pages)_ 95 | 96 | ### Free-Form Visualization 97 | In this section, you will need to provide some form of visualization that emphasizes an important quality about the project. It is much more free-form, but should reasonably support a significant result or characteristic about the problem that you want to discuss. Questions to ask yourself when writing this section: 98 | - _Have you visualized a relevant or important quality about the problem, dataset, input data, or results?_ 99 | - _Is the visualization thoroughly analyzed and discussed?_ 100 | - _If a plot is provided, are the axes, title, and datum clearly defined?_ 101 | 102 | ### Reflection 103 | In this section, you will summarize the entire end-to-end problem solution and discuss one or two particular aspects of the project you found interesting or difficult. 
You are expected to reflect on the project as a whole to show that you have a firm understanding of the entire process employed in your work. Questions to ask yourself when writing this section: 104 | - _Have you thoroughly summarized the entire process you used for this project?_ 105 | - _Were there any interesting aspects of the project?_ 106 | - _Were there any difficult aspects of the project?_ 107 | - _Does the final model and solution fit your expectations for the problem, and should it be used in a general setting to solve these types of problems?_ 108 | 109 | ### Improvement 110 | In this section, you will need to provide discussion as to how one aspect of the implementation you designed could be improved. As an example, consider ways your implementation can be made more general, and what would need to be modified. You do not need to make this improvement, but the potential solutions resulting from these changes are considered and compared/contrasted to your current solution. Questions to ask yourself when writing this section: 111 | - _Are there further improvements that could be made on the algorithms or techniques you used in this project?_ 112 | - _Were there algorithms or techniques you researched that you did not know how to implement, but would consider using if you knew how?_ 113 | - _If you used your final solution as the new benchmark, do you think an even better solution exists?_ 114 | 115 | ----------- 116 | 117 | **Before submitting, ask yourself. . .** 118 | 119 | - Does the project report you’ve written follow a well-organized structure similar to that of the project template? 120 | - Is each section (particularly **Analysis** and **Methodology**) written in a clear, concise and specific fashion? Are there any ambiguous terms or phrases that need clarification? 121 | - Would the intended audience of your project be able to understand your analysis, methods, and results? 
122 | - Have you properly proof-read your project report to assure there are minimal grammatical and spelling mistakes? 123 | - Are all the resources used for this project correctly cited and referenced? 124 | - Is the code that implements your solution easily readable and properly commented? 125 | - Does the code execute without error and produce results similar to those reported? 126 | -------------------------------------------------------------------------------- /projects/capstone/report-example-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/capstone/report-example-1.pdf -------------------------------------------------------------------------------- /projects/capstone/report-example-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/capstone/report-example-2.pdf -------------------------------------------------------------------------------- /projects/capstone/report-example-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/capstone/report-example-3.pdf -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/maze.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Maze(object): 4 | def __init__(self, filename): 5 | ''' 6 | Maze objects have two main attributes: 7 | - dim: mazes should be square, with sides of even length. 
(integer) 8 | - walls: passages are coded as a 4-bit number, with a bit value taking 9 | 0 if there is a wall and 1 if there is no wall. The 1s register 10 | corresponds with a square's top edge, 2s register the right edge, 11 | 4s register the bottom edge, and 8s register the left edge. (numpy 12 | array) 13 | 14 | The initialization function also performs some consistency checks for 15 | wall positioning. 16 | ''' 17 | with open(filename, 'rb') as f_in: 18 | 19 | # First line should be an integer with the maze dimensions 20 | self.dim = int(f_in.next()) 21 | 22 | # Subsequent lines describe the permissibility of walls 23 | walls = [] 24 | for line in f_in: 25 | walls.append(map(int,line.split(','))) 26 | self.walls = np.array(walls) 27 | 28 | # Perform validation on maze 29 | # Maze dimensions 30 | if self.dim % 2: 31 | raise Exception('Maze dimensions must be even in length!') 32 | if self.walls.shape != (self.dim, self.dim): 33 | raise Exception('Maze shape does not match dimension attribute!') 34 | 35 | # Wall permeability 36 | wall_errors = [] 37 | # vertical walls 38 | for x in range(self.dim-1): 39 | for y in range(self.dim): 40 | if (self.walls[x,y] & 2 != 0) != (self.walls[x+1,y] & 8 != 0): 41 | wall_errors.append([(x,y), 'v']) 42 | # horizontal walls 43 | for y in range(self.dim-1): 44 | for x in range(self.dim): 45 | if (self.walls[x,y] & 1 != 0) != (self.walls[x,y+1] & 4 != 0): 46 | wall_errors.append([(x,y), 'h']) 47 | 48 | if wall_errors: 49 | for cell, wall_type in wall_errors: 50 | if wall_type == 'v': 51 | cell2 = (cell[0]+1, cell[1]) 52 | print 'Inconsistent vertical wall between {} and {}'.format(cell, cell2) 53 | else: 54 | cell2 = (cell[0], cell[1]+1) 55 | print 'Inconsistent horizontal wall between {} and {}'.format(cell, cell2) 56 | raise Exception('Consistency errors found in wall specifications!') 57 | 58 | 59 | def is_permissible(self, cell, direction): 60 | """ 61 | Returns a boolean designating whether or not a cell is passable in the 
62 | given direction. Cell is input as a list. Directions may be 63 | input as single letter 'u', 'r', 'd', 'l', or complete words 'up', 64 | 'right', 'down', 'left'. 65 | """ 66 | dir_int = {'u': 1, 'r': 2, 'd': 4, 'l': 8, 67 | 'up': 1, 'right': 2, 'down': 4, 'left': 8} 68 | try: 69 | return (self.walls[tuple(cell)] & dir_int[direction] != 0) 70 | except: 71 | print 'Invalid direction provided!' 72 | 73 | 74 | def dist_to_wall(self, cell, direction): 75 | """ 76 | Returns a number designating the number of open cells to the nearest 77 | wall in the indicated direction. Cell is input as a list. Directions 78 | may be input as a single letter 'u', 'r', 'd', 'l', or complete words 79 | 'up', 'right', 'down', 'left'. 80 | """ 81 | dir_move = {'u': [0, 1], 'r': [1, 0], 'd': [0, -1], 'l': [-1, 0], 82 | 'up': [0, 1], 'right': [1, 0], 'down': [0, -1], 'left': [-1, 0]} 83 | 84 | sensing = True 85 | distance = 0 86 | curr_cell = list(cell) # make copy to preserve original 87 | while sensing: 88 | if self.is_permissible(curr_cell, direction): 89 | distance += 1 90 | curr_cell[0] += dir_move[direction][0] 91 | curr_cell[1] += dir_move[direction][1] 92 | else: 93 | sensing = False 94 | return distance -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/robot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Robot(object): 4 | def __init__(self, maze_dim): 5 | ''' 6 | Use the initialization function to set up attributes that your robot 7 | will use to learn and navigate the maze. Some initial attributes are 8 | provided based on common information, including the size of the maze 9 | the robot is placed in. 
10 | ''' 11 | 12 | self.location = [0, 0] 13 | self.heading = 'up' 14 | self.maze_dim = maze_dim 15 | 16 | def next_move(self, sensors): 17 | ''' 18 | Use this function to determine the next move the robot should make, 19 | based on the input from the sensors after its previous move. Sensor 20 | inputs are a list of three distances from the robot's left, front, and 21 | right-facing sensors, in that order. 22 | 23 | Outputs should be a tuple of two values. The first value indicates 24 | robot rotation (if any), as a number: 0 for no rotation, +90 for a 25 | 90-degree rotation clockwise, and -90 for a 90-degree rotation 26 | counterclockwise. Other values will result in no rotation. The second 27 | value indicates robot movement, and the robot will attempt to move the 28 | number of indicated squares: a positive number indicates forwards 29 | movement, while a negative number indicates backwards movement. The 30 | robot may move a maximum of three units per turn. Any excess movement 31 | is ignored. 32 | 33 | If the robot wants to end a run (e.g. during the first training run in 34 | the maze) then returning the tuple ('Reset', 'Reset') will indicate to 35 | the tester to end the run and return the robot to the start. 36 | ''' 37 | 38 | rotation = 0 39 | movement = 0 40 | 41 | return rotation, movement -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/showmaze.py: -------------------------------------------------------------------------------- 1 | from maze import Maze 2 | import turtle 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | ''' 7 | This function uses Python's turtle library to draw a picture of the maze 8 | given as an argument when running the script. 9 | ''' 10 | 11 | # Create a maze based on input argument on command line. 12 | testmaze = Maze( str(sys.argv[1]) ) 13 | 14 | # Initialize the window and drawing turtle. 
15 | window = turtle.Screen() 16 | wally = turtle.Turtle() 17 | wally.speed(0) 18 | wally.hideturtle() 19 | wally.penup() 20 | 21 | # maze centered on (0,0), squares are 20 units in length. 22 | sq_size = 20 23 | origin = testmaze.dim * sq_size / -2 24 | 25 | # iterate through squares one by one to decide where to draw walls 26 | for x in range(testmaze.dim): 27 | for y in range(testmaze.dim): 28 | if not testmaze.is_permissible([x,y], 'up'): 29 | wally.goto(origin + sq_size * x, origin + sq_size * (y+1)) 30 | wally.setheading(0) 31 | wally.pendown() 32 | wally.forward(sq_size) 33 | wally.penup() 34 | 35 | if not testmaze.is_permissible([x,y], 'right'): 36 | wally.goto(origin + sq_size * (x+1), origin + sq_size * y) 37 | wally.setheading(90) 38 | wally.pendown() 39 | wally.forward(sq_size) 40 | wally.penup() 41 | 42 | # only check bottom wall if on lowest row 43 | if y == 0 and not testmaze.is_permissible([x,y], 'down'): 44 | wally.goto(origin + sq_size * x, origin) 45 | wally.setheading(0) 46 | wally.pendown() 47 | wally.forward(sq_size) 48 | wally.penup() 49 | 50 | # only check left wall if on leftmost column 51 | if x == 0 and not testmaze.is_permissible([x,y], 'left'): 52 | wally.goto(origin, origin + sq_size * y) 53 | wally.setheading(90) 54 | wally.pendown() 55 | wally.forward(sq_size) 56 | wally.penup() 57 | 58 | window.exitonclick() -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/test_maze_01.txt: -------------------------------------------------------------------------------- 1 | 12 2 | 1,5,7,5,5,5,7,5,7,5,5,6 3 | 3,5,14,3,7,5,15,4,9,5,7,12 4 | 11,6,10,10,9,7,13,6,3,5,13,4 5 | 10,9,13,12,3,13,5,12,9,5,7,6 6 | 9,5,6,3,15,5,5,7,7,4,10,10 7 | 3,5,15,14,10,3,6,10,11,6,10,10 8 | 9,7,12,11,12,9,14,9,14,11,13,14 9 | 3,13,5,12,2,3,13,6,9,14,3,14 10 | 11,4,1,7,15,13,7,13,6,9,14,10 11 | 11,5,6,10,9,7,13,5,15,7,14,8 12 | 11,5,12,10,2,9,5,6,10,8,9,6 13 | 9,5,5,13,13,5,5,12,9,5,5,12 
-------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/test_maze_02.txt: -------------------------------------------------------------------------------- 1 | 14 2 | 1,5,5,7,7,5,5,6,3,6,3,5,5,6 3 | 3,5,6,10,9,5,5,15,14,11,14,3,7,14 4 | 11,6,11,14,1,7,6,10,10,10,11,12,8,10 5 | 10,9,12,10,3,12,11,14,11,14,10,3,5,14 6 | 11,5,6,8,11,7,12,8,10,9,12,9,7,12 7 | 11,7,13,7,14,11,5,5,13,5,4,3,13,6 8 | 8,9,5,14,9,12,3,7,6,3,6,11,6,10 9 | 3,5,5,14,3,6,9,12,11,12,10,10,10,10 10 | 10,3,5,13,14,10,3,5,13,7,14,8,9,14 11 | 9,14,3,6,11,14,9,5,6,10,10,3,6,10 12 | 3,13,14,11,14,11,4,3,13,15,13,14,10,10 13 | 10,3,15,12,9,12,3,13,5,14,3,12,11,14 14 | 11,12,11,7,5,6,10,1,5,15,13,7,12,10 15 | 9,5,12,9,5,13,13,5,5,12,1,13,5,12 -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/test_maze_03.txt: -------------------------------------------------------------------------------- 1 | 16 2 | 1,5,5,6,3,7,5,5,5,5,7,5,5,5,5,6 3 | 3,5,6,10,10,9,6,3,5,5,13,7,5,5,6,10 4 | 11,6,11,15,15,5,14,8,2,3,5,13,5,6,10,10 5 | 10,10,10,10,11,5,13,5,12,9,7,6,3,15,13,14 6 | 10,10,10,9,12,3,5,6,3,6,10,11,14,11,6,10 7 | 9,14,9,4,3,13,6,11,14,10,9,12,11,12,10,10 8 | 1,13,6,3,14,3,15,12,9,15,6,3,13,7,12,10 9 | 3,6,10,10,9,14,8,3,6,8,10,9,7,13,7,12 10 | 10,10,10,10,3,13,7,13,12,3,14,3,13,7,13,6 11 | 10,10,10,11,12,3,14,3,6,10,10,10,3,15,7,14 12 | 10,9,12,9,7,14,11,14,10,8,10,10,10,10,10,10 13 | 11,5,5,6,10,11,14,11,15,6,9,13,14,10,10,10 14 | 11,7,6,10,9,14,9,14,10,10,3,7,15,14,10,10 15 | 10,10,9,12,2,9,5,15,14,10,10,10,10,11,14,10 16 | 10,11,5,5,12,3,5,12,10,11,13,12,10,10,9,14 17 | 9,13,5,5,5,13,5,5,13,13,5,5,12,9,5,12 -------------------------------------------------------------------------------- /projects/capstone/robot_motion_planning/tester.py: -------------------------------------------------------------------------------- 1 | from maze import Maze 2 | from robot import Robot 3 | 
import sys 4 | 5 | # global dictionaries for robot movement and sensing 6 | dir_sensors = {'u': ['l', 'u', 'r'], 'r': ['u', 'r', 'd'], 7 | 'd': ['r', 'd', 'l'], 'l': ['d', 'l', 'u'], 8 | 'up': ['l', 'u', 'r'], 'right': ['u', 'r', 'd'], 9 | 'down': ['r', 'd', 'l'], 'left': ['d', 'l', 'u']} 10 | dir_move = {'u': [0, 1], 'r': [1, 0], 'd': [0, -1], 'l': [-1, 0], 11 | 'up': [0, 1], 'right': [1, 0], 'down': [0, -1], 'left': [-1, 0]} 12 | dir_reverse = {'u': 'd', 'r': 'l', 'd': 'u', 'l': 'r', 13 | 'up': 'd', 'right': 'l', 'down': 'u', 'left': 'r'} 14 | 15 | # test and score parameters 16 | max_time = 1000 17 | train_score_mult = 1/30. 18 | 19 | if __name__ == '__main__': 20 | ''' 21 | This script tests a robot based on the code in robot.py on a maze given 22 | as an argument when running the script. 23 | ''' 24 | 25 | # Create a maze based on input argument on command line. 26 | testmaze = Maze( str(sys.argv[1]) ) 27 | 28 | # Initialize a robot; robot receives info about maze dimensions. 29 | testrobot = Robot(testmaze.dim) 30 | 31 | # Record robot performance over two runs. 32 | runtimes = [] 33 | total_time = 0 34 | for run in range(2): 35 | print "Starting run {}.".format(run) 36 | 37 | # Set the robot in the start position. Note that robot position 38 | # parameters are independent of the robot itself. 39 | robot_pos = {'location': [0, 0], 'heading': 'up'} 40 | 41 | run_active = True 42 | hit_goal = False 43 | while run_active: 44 | # check for end of time 45 | total_time += 1 46 | if total_time > max_time: 47 | run_active = False 48 | print "Allotted time exceeded." 
49 | break 50 | 51 | # provide robot with sensor information, get actions 52 | sensing = [testmaze.dist_to_wall(robot_pos['location'], heading) 53 | for heading in dir_sensors[robot_pos['heading']]] 54 | rotation, movement = testrobot.next_move(sensing) 55 | 56 | # check for a reset 57 | if (rotation, movement) == ('Reset', 'Reset'): 58 | if run == 0 and hit_goal: 59 | run_active = False 60 | runtimes.append(total_time) 61 | print "Ending first run. Starting next run." 62 | break 63 | elif run == 0 and not hit_goal: 64 | print "Cannot reset - robot has not hit goal yet." 65 | continue 66 | else: 67 | print "Cannot reset on runs after the first." 68 | continue 69 | 70 | # perform rotation 71 | if rotation == -90: 72 | robot_pos['heading'] = dir_sensors[robot_pos['heading']][0] 73 | elif rotation == 90: 74 | robot_pos['heading'] = dir_sensors[robot_pos['heading']][2] 75 | elif rotation == 0: 76 | pass 77 | else: 78 | print "Invalid rotation value, no rotation performed." 79 | 80 | # perform movement 81 | if abs(movement) > 3: 82 | print "Movement limited to three squares in a turn." 83 | movement = max(min(int(movement), 3), -3) # fix to range [-3, 3] 84 | while movement: 85 | if movement > 0: 86 | if testmaze.is_permissible(robot_pos['location'], robot_pos['heading']): 87 | robot_pos['location'][0] += dir_move[robot_pos['heading']][0] 88 | robot_pos['location'][1] += dir_move[robot_pos['heading']][1] 89 | movement -= 1 90 | else: 91 | print "Movement stopped by wall." 92 | movement = 0 93 | else: 94 | rev_heading = dir_reverse[robot_pos['heading']] 95 | if testmaze.is_permissible(robot_pos['location'], rev_heading): 96 | robot_pos['location'][0] += dir_move[rev_heading][0] 97 | robot_pos['location'][1] += dir_move[rev_heading][1] 98 | movement += 1 99 | else: 100 | print "Movement stopped by wall." 
101 | movement = 0 102 | 103 | # check for goal entered 104 | goal_bounds = [testmaze.dim/2 - 1, testmaze.dim/2] 105 | if robot_pos['location'][0] in goal_bounds and robot_pos['location'][1] in goal_bounds: 106 | hit_goal = True 107 | if run != 0: 108 | runtimes.append(total_time - sum(runtimes)) 109 | run_active = False 110 | print "Goal found; run {} completed!".format(run) 111 | 112 | # Report score if robot is successful. 113 | if len(runtimes) == 2: 114 | print "Task complete! Score: {:4.3f}".format(runtimes[1] + train_score_mult*runtimes[0]) -------------------------------------------------------------------------------- /projects/creating_customer_segments/README.md: -------------------------------------------------------------------------------- 1 | # Project 3: Unsupervised Learning 2 | ## Creating Customer Segments 3 | 4 | ### Install 5 | 6 | This project requires **Python 2.7** and the following Python libraries installed: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [Pandas](http://pandas.pydata.org) 10 | - [matplotlib](http://matplotlib.org/) 11 | - [scikit-learn](http://scikit-learn.org/stable/) 12 | 13 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 14 | 15 | Udacity recommends our students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 16 | 17 | ### Code 18 | 19 | Template code is provided in the notebook `customer_segments.ipynb` notebook file. Additional supporting code can be found in `renders.py`. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. 
20 | 21 | ### Run 22 | 23 | In a terminal or command window, navigate to the top-level project directory `creating_customer_segments/` (that contains this README) and run one of the following commands: 24 | 25 | ```ipython notebook customer_segments.ipynb``` 26 | ```jupyter notebook customer_segments.ipynb``` 27 | 28 | This will open the iPython Notebook software and project file in your browser. 29 | 30 | ## Data 31 | 32 | The dataset used in this project is included as `customers.csv`. You can find more information on this dataset on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Wholesale+customers) page. 33 | -------------------------------------------------------------------------------- /projects/creating_customer_segments/customers.csv: -------------------------------------------------------------------------------- 1 | Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen 2 | 2,3,12669,9656,7561,214,2674,1338 3 | 2,3,7057,9810,9568,1762,3293,1776 4 | 2,3,6353,8808,7684,2405,3516,7844 5 | 1,3,13265,1196,4221,6404,507,1788 6 | 2,3,22615,5410,7198,3915,1777,5185 7 | 2,3,9413,8259,5126,666,1795,1451 8 | 2,3,12126,3199,6975,480,3140,545 9 | 2,3,7579,4956,9426,1669,3321,2566 10 | 1,3,5963,3648,6192,425,1716,750 11 | 2,3,6006,11093,18881,1159,7425,2098 12 | 2,3,3366,5403,12974,4400,5977,1744 13 | 2,3,13146,1124,4523,1420,549,497 14 | 2,3,31714,12319,11757,287,3881,2931 15 | 2,3,21217,6208,14982,3095,6707,602 16 | 2,3,24653,9465,12091,294,5058,2168 17 | 1,3,10253,1114,3821,397,964,412 18 | 2,3,1020,8816,12121,134,4508,1080 19 | 1,3,5876,6157,2933,839,370,4478 20 | 2,3,18601,6327,10099,2205,2767,3181 21 | 1,3,7780,2495,9464,669,2518,501 22 | 2,3,17546,4519,4602,1066,2259,2124 23 | 1,3,5567,871,2010,3383,375,569 24 | 1,3,31276,1917,4469,9408,2381,4334 25 | 2,3,26373,36423,22019,5154,4337,16523 26 | 2,3,22647,9776,13792,2915,4482,5778 27 | 2,3,16165,4230,7595,201,4003,57 28 | 1,3,9898,961,2861,3151,242,833 29 | 
1,3,14276,803,3045,485,100,518 30 | 2,3,4113,20484,25957,1158,8604,5206 31 | 1,3,43088,2100,2609,1200,1107,823 32 | 1,3,18815,3610,11107,1148,2134,2963 33 | 1,3,2612,4339,3133,2088,820,985 34 | 1,3,21632,1318,2886,266,918,405 35 | 1,3,29729,4786,7326,6130,361,1083 36 | 1,3,1502,1979,2262,425,483,395 37 | 2,3,688,5491,11091,833,4239,436 38 | 1,3,29955,4362,5428,1729,862,4626 39 | 2,3,15168,10556,12477,1920,6506,714 40 | 2,3,4591,15729,16709,33,6956,433 41 | 1,3,56159,555,902,10002,212,2916 42 | 1,3,24025,4332,4757,9510,1145,5864 43 | 1,3,19176,3065,5956,2033,2575,2802 44 | 2,3,10850,7555,14961,188,6899,46 45 | 2,3,630,11095,23998,787,9529,72 46 | 2,3,9670,7027,10471,541,4618,65 47 | 2,3,5181,22044,21531,1740,7353,4985 48 | 2,3,3103,14069,21955,1668,6792,1452 49 | 2,3,44466,54259,55571,7782,24171,6465 50 | 2,3,11519,6152,10868,584,5121,1476 51 | 2,3,4967,21412,28921,1798,13583,1163 52 | 1,3,6269,1095,1980,3860,609,2162 53 | 1,3,3347,4051,6996,239,1538,301 54 | 2,3,40721,3916,5876,532,2587,1278 55 | 2,3,491,10473,11532,744,5611,224 56 | 1,3,27329,1449,1947,2436,204,1333 57 | 1,3,5264,3683,5005,1057,2024,1130 58 | 2,3,4098,29892,26866,2616,17740,1340 59 | 2,3,5417,9933,10487,38,7572,1282 60 | 1,3,13779,1970,1648,596,227,436 61 | 1,3,6137,5360,8040,129,3084,1603 62 | 2,3,8590,3045,7854,96,4095,225 63 | 2,3,35942,38369,59598,3254,26701,2017 64 | 2,3,7823,6245,6544,4154,4074,964 65 | 2,3,9396,11601,15775,2896,7677,1295 66 | 1,3,4760,1227,3250,3724,1247,1145 67 | 2,3,85,20959,45828,36,24231,1423 68 | 1,3,9,1534,7417,175,3468,27 69 | 2,3,19913,6759,13462,1256,5141,834 70 | 1,3,2446,7260,3993,5870,788,3095 71 | 1,3,8352,2820,1293,779,656,144 72 | 1,3,16705,2037,3202,10643,116,1365 73 | 1,3,18291,1266,21042,5373,4173,14472 74 | 1,3,4420,5139,2661,8872,1321,181 75 | 2,3,19899,5332,8713,8132,764,648 76 | 2,3,8190,6343,9794,1285,1901,1780 77 | 1,3,20398,1137,3,4407,3,975 78 | 1,3,717,3587,6532,7530,529,894 79 | 2,3,12205,12697,28540,869,12034,1009 80 | 
1,3,10766,1175,2067,2096,301,167 81 | 1,3,1640,3259,3655,868,1202,1653 82 | 1,3,7005,829,3009,430,610,529 83 | 2,3,219,9540,14403,283,7818,156 84 | 2,3,10362,9232,11009,737,3537,2342 85 | 1,3,20874,1563,1783,2320,550,772 86 | 2,3,11867,3327,4814,1178,3837,120 87 | 2,3,16117,46197,92780,1026,40827,2944 88 | 2,3,22925,73498,32114,987,20070,903 89 | 1,3,43265,5025,8117,6312,1579,14351 90 | 1,3,7864,542,4042,9735,165,46 91 | 1,3,24904,3836,5330,3443,454,3178 92 | 1,3,11405,596,1638,3347,69,360 93 | 1,3,12754,2762,2530,8693,627,1117 94 | 2,3,9198,27472,32034,3232,18906,5130 95 | 1,3,11314,3090,2062,35009,71,2698 96 | 2,3,5626,12220,11323,206,5038,244 97 | 1,3,3,2920,6252,440,223,709 98 | 2,3,23,2616,8118,145,3874,217 99 | 1,3,403,254,610,774,54,63 100 | 1,3,503,112,778,895,56,132 101 | 1,3,9658,2182,1909,5639,215,323 102 | 2,3,11594,7779,12144,3252,8035,3029 103 | 2,3,1420,10810,16267,1593,6766,1838 104 | 2,3,2932,6459,7677,2561,4573,1386 105 | 1,3,56082,3504,8906,18028,1480,2498 106 | 1,3,14100,2132,3445,1336,1491,548 107 | 1,3,15587,1014,3970,910,139,1378 108 | 2,3,1454,6337,10704,133,6830,1831 109 | 2,3,8797,10646,14886,2471,8969,1438 110 | 2,3,1531,8397,6981,247,2505,1236 111 | 2,3,1406,16729,28986,673,836,3 112 | 1,3,11818,1648,1694,2276,169,1647 113 | 2,3,12579,11114,17569,805,6457,1519 114 | 1,3,19046,2770,2469,8853,483,2708 115 | 1,3,14438,2295,1733,3220,585,1561 116 | 1,3,18044,1080,2000,2555,118,1266 117 | 1,3,11134,793,2988,2715,276,610 118 | 1,3,11173,2521,3355,1517,310,222 119 | 1,3,6990,3880,5380,1647,319,1160 120 | 1,3,20049,1891,2362,5343,411,933 121 | 1,3,8258,2344,2147,3896,266,635 122 | 1,3,17160,1200,3412,2417,174,1136 123 | 1,3,4020,3234,1498,2395,264,255 124 | 1,3,12212,201,245,1991,25,860 125 | 2,3,11170,10769,8814,2194,1976,143 126 | 1,3,36050,1642,2961,4787,500,1621 127 | 1,3,76237,3473,7102,16538,778,918 128 | 1,3,19219,1840,1658,8195,349,483 129 | 2,3,21465,7243,10685,880,2386,2749 130 | 1,3,140,8847,3823,142,1062,3 131 | 
1,3,42312,926,1510,1718,410,1819 132 | 1,3,7149,2428,699,6316,395,911 133 | 1,3,2101,589,314,346,70,310 134 | 1,3,14903,2032,2479,576,955,328 135 | 1,3,9434,1042,1235,436,256,396 136 | 1,3,7388,1882,2174,720,47,537 137 | 1,3,6300,1289,2591,1170,199,326 138 | 1,3,4625,8579,7030,4575,2447,1542 139 | 1,3,3087,8080,8282,661,721,36 140 | 1,3,13537,4257,5034,155,249,3271 141 | 1,3,5387,4979,3343,825,637,929 142 | 1,3,17623,4280,7305,2279,960,2616 143 | 1,3,30379,13252,5189,321,51,1450 144 | 1,3,37036,7152,8253,2995,20,3 145 | 1,3,10405,1596,1096,8425,399,318 146 | 1,3,18827,3677,1988,118,516,201 147 | 2,3,22039,8384,34792,42,12591,4430 148 | 1,3,7769,1936,2177,926,73,520 149 | 1,3,9203,3373,2707,1286,1082,526 150 | 1,3,5924,584,542,4052,283,434 151 | 1,3,31812,1433,1651,800,113,1440 152 | 1,3,16225,1825,1765,853,170,1067 153 | 1,3,1289,3328,2022,531,255,1774 154 | 1,3,18840,1371,3135,3001,352,184 155 | 1,3,3463,9250,2368,779,302,1627 156 | 1,3,622,55,137,75,7,8 157 | 2,3,1989,10690,19460,233,11577,2153 158 | 2,3,3830,5291,14855,317,6694,3182 159 | 1,3,17773,1366,2474,3378,811,418 160 | 2,3,2861,6570,9618,930,4004,1682 161 | 2,3,355,7704,14682,398,8077,303 162 | 2,3,1725,3651,12822,824,4424,2157 163 | 1,3,12434,540,283,1092,3,2233 164 | 1,3,15177,2024,3810,2665,232,610 165 | 2,3,5531,15726,26870,2367,13726,446 166 | 2,3,5224,7603,8584,2540,3674,238 167 | 2,3,15615,12653,19858,4425,7108,2379 168 | 2,3,4822,6721,9170,993,4973,3637 169 | 1,3,2926,3195,3268,405,1680,693 170 | 1,3,5809,735,803,1393,79,429 171 | 1,3,5414,717,2155,2399,69,750 172 | 2,3,260,8675,13430,1116,7015,323 173 | 2,3,200,25862,19816,651,8773,6250 174 | 1,3,955,5479,6536,333,2840,707 175 | 2,3,514,7677,19805,937,9836,716 176 | 1,3,286,1208,5241,2515,153,1442 177 | 2,3,2343,7845,11874,52,4196,1697 178 | 1,3,45640,6958,6536,7368,1532,230 179 | 1,3,12759,7330,4533,1752,20,2631 180 | 1,3,11002,7075,4945,1152,120,395 181 | 1,3,3157,4888,2500,4477,273,2165 182 | 1,3,12356,6036,8887,402,1382,2794 183 | 
1,3,112151,29627,18148,16745,4948,8550 184 | 1,3,694,8533,10518,443,6907,156 185 | 1,3,36847,43950,20170,36534,239,47943 186 | 1,3,327,918,4710,74,334,11 187 | 1,3,8170,6448,1139,2181,58,247 188 | 1,3,3009,521,854,3470,949,727 189 | 1,3,2438,8002,9819,6269,3459,3 190 | 2,3,8040,7639,11687,2758,6839,404 191 | 2,3,834,11577,11522,275,4027,1856 192 | 1,3,16936,6250,1981,7332,118,64 193 | 1,3,13624,295,1381,890,43,84 194 | 1,3,5509,1461,2251,547,187,409 195 | 2,3,180,3485,20292,959,5618,666 196 | 1,3,7107,1012,2974,806,355,1142 197 | 1,3,17023,5139,5230,7888,330,1755 198 | 1,1,30624,7209,4897,18711,763,2876 199 | 2,1,2427,7097,10391,1127,4314,1468 200 | 1,1,11686,2154,6824,3527,592,697 201 | 1,1,9670,2280,2112,520,402,347 202 | 2,1,3067,13240,23127,3941,9959,731 203 | 2,1,4484,14399,24708,3549,14235,1681 204 | 1,1,25203,11487,9490,5065,284,6854 205 | 1,1,583,685,2216,469,954,18 206 | 1,1,1956,891,5226,1383,5,1328 207 | 2,1,1107,11711,23596,955,9265,710 208 | 1,1,6373,780,950,878,288,285 209 | 2,1,2541,4737,6089,2946,5316,120 210 | 1,1,1537,3748,5838,1859,3381,806 211 | 2,1,5550,12729,16767,864,12420,797 212 | 1,1,18567,1895,1393,1801,244,2100 213 | 2,1,12119,28326,39694,4736,19410,2870 214 | 1,1,7291,1012,2062,1291,240,1775 215 | 1,1,3317,6602,6861,1329,3961,1215 216 | 2,1,2362,6551,11364,913,5957,791 217 | 1,1,2806,10765,15538,1374,5828,2388 218 | 2,1,2532,16599,36486,179,13308,674 219 | 1,1,18044,1475,2046,2532,130,1158 220 | 2,1,18,7504,15205,1285,4797,6372 221 | 1,1,4155,367,1390,2306,86,130 222 | 1,1,14755,899,1382,1765,56,749 223 | 1,1,5396,7503,10646,91,4167,239 224 | 1,1,5041,1115,2856,7496,256,375 225 | 2,1,2790,2527,5265,5612,788,1360 226 | 1,1,7274,659,1499,784,70,659 227 | 1,1,12680,3243,4157,660,761,786 228 | 2,1,20782,5921,9212,1759,2568,1553 229 | 1,1,4042,2204,1563,2286,263,689 230 | 1,1,1869,577,572,950,4762,203 231 | 1,1,8656,2746,2501,6845,694,980 232 | 2,1,11072,5989,5615,8321,955,2137 233 | 1,1,2344,10678,3828,1439,1566,490 234 | 
1,1,25962,1780,3838,638,284,834 235 | 1,1,964,4984,3316,937,409,7 236 | 1,1,15603,2703,3833,4260,325,2563 237 | 1,1,1838,6380,2824,1218,1216,295 238 | 1,1,8635,820,3047,2312,415,225 239 | 1,1,18692,3838,593,4634,28,1215 240 | 1,1,7363,475,585,1112,72,216 241 | 1,1,47493,2567,3779,5243,828,2253 242 | 1,1,22096,3575,7041,11422,343,2564 243 | 1,1,24929,1801,2475,2216,412,1047 244 | 1,1,18226,659,2914,3752,586,578 245 | 1,1,11210,3576,5119,561,1682,2398 246 | 1,1,6202,7775,10817,1183,3143,1970 247 | 2,1,3062,6154,13916,230,8933,2784 248 | 1,1,8885,2428,1777,1777,430,610 249 | 1,1,13569,346,489,2077,44,659 250 | 1,1,15671,5279,2406,559,562,572 251 | 1,1,8040,3795,2070,6340,918,291 252 | 1,1,3191,1993,1799,1730,234,710 253 | 2,1,6134,23133,33586,6746,18594,5121 254 | 1,1,6623,1860,4740,7683,205,1693 255 | 1,1,29526,7961,16966,432,363,1391 256 | 1,1,10379,17972,4748,4686,1547,3265 257 | 1,1,31614,489,1495,3242,111,615 258 | 1,1,11092,5008,5249,453,392,373 259 | 1,1,8475,1931,1883,5004,3593,987 260 | 1,1,56083,4563,2124,6422,730,3321 261 | 1,1,53205,4959,7336,3012,967,818 262 | 1,1,9193,4885,2157,327,780,548 263 | 1,1,7858,1110,1094,6818,49,287 264 | 1,1,23257,1372,1677,982,429,655 265 | 1,1,2153,1115,6684,4324,2894,411 266 | 2,1,1073,9679,15445,61,5980,1265 267 | 1,1,5909,23527,13699,10155,830,3636 268 | 2,1,572,9763,22182,2221,4882,2563 269 | 1,1,20893,1222,2576,3975,737,3628 270 | 2,1,11908,8053,19847,1069,6374,698 271 | 1,1,15218,258,1138,2516,333,204 272 | 1,1,4720,1032,975,5500,197,56 273 | 1,1,2083,5007,1563,1120,147,1550 274 | 1,1,514,8323,6869,529,93,1040 275 | 1,3,36817,3045,1493,4802,210,1824 276 | 1,3,894,1703,1841,744,759,1153 277 | 1,3,680,1610,223,862,96,379 278 | 1,3,27901,3749,6964,4479,603,2503 279 | 1,3,9061,829,683,16919,621,139 280 | 1,3,11693,2317,2543,5845,274,1409 281 | 2,3,17360,6200,9694,1293,3620,1721 282 | 1,3,3366,2884,2431,977,167,1104 283 | 2,3,12238,7108,6235,1093,2328,2079 284 | 1,3,49063,3965,4252,5970,1041,1404 285 | 
1,3,25767,3613,2013,10303,314,1384 286 | 1,3,68951,4411,12609,8692,751,2406 287 | 1,3,40254,640,3600,1042,436,18 288 | 1,3,7149,2247,1242,1619,1226,128 289 | 1,3,15354,2102,2828,8366,386,1027 290 | 1,3,16260,594,1296,848,445,258 291 | 1,3,42786,286,471,1388,32,22 292 | 1,3,2708,2160,2642,502,965,1522 293 | 1,3,6022,3354,3261,2507,212,686 294 | 1,3,2838,3086,4329,3838,825,1060 295 | 2,2,3996,11103,12469,902,5952,741 296 | 1,2,21273,2013,6550,909,811,1854 297 | 2,2,7588,1897,5234,417,2208,254 298 | 1,2,19087,1304,3643,3045,710,898 299 | 2,2,8090,3199,6986,1455,3712,531 300 | 2,2,6758,4560,9965,934,4538,1037 301 | 1,2,444,879,2060,264,290,259 302 | 2,2,16448,6243,6360,824,2662,2005 303 | 2,2,5283,13316,20399,1809,8752,172 304 | 2,2,2886,5302,9785,364,6236,555 305 | 2,2,2599,3688,13829,492,10069,59 306 | 2,2,161,7460,24773,617,11783,2410 307 | 2,2,243,12939,8852,799,3909,211 308 | 2,2,6468,12867,21570,1840,7558,1543 309 | 1,2,17327,2374,2842,1149,351,925 310 | 1,2,6987,1020,3007,416,257,656 311 | 2,2,918,20655,13567,1465,6846,806 312 | 1,2,7034,1492,2405,12569,299,1117 313 | 1,2,29635,2335,8280,3046,371,117 314 | 2,2,2137,3737,19172,1274,17120,142 315 | 1,2,9784,925,2405,4447,183,297 316 | 1,2,10617,1795,7647,1483,857,1233 317 | 2,2,1479,14982,11924,662,3891,3508 318 | 1,2,7127,1375,2201,2679,83,1059 319 | 1,2,1182,3088,6114,978,821,1637 320 | 1,2,11800,2713,3558,2121,706,51 321 | 2,2,9759,25071,17645,1128,12408,1625 322 | 1,2,1774,3696,2280,514,275,834 323 | 1,2,9155,1897,5167,2714,228,1113 324 | 1,2,15881,713,3315,3703,1470,229 325 | 1,2,13360,944,11593,915,1679,573 326 | 1,2,25977,3587,2464,2369,140,1092 327 | 1,2,32717,16784,13626,60869,1272,5609 328 | 1,2,4414,1610,1431,3498,387,834 329 | 1,2,542,899,1664,414,88,522 330 | 1,2,16933,2209,3389,7849,210,1534 331 | 1,2,5113,1486,4583,5127,492,739 332 | 1,2,9790,1786,5109,3570,182,1043 333 | 2,2,11223,14881,26839,1234,9606,1102 334 | 1,2,22321,3216,1447,2208,178,2602 335 | 2,2,8565,4980,67298,131,38102,1215 336 | 
2,2,16823,928,2743,11559,332,3486 337 | 2,2,27082,6817,10790,1365,4111,2139 338 | 1,2,13970,1511,1330,650,146,778 339 | 1,2,9351,1347,2611,8170,442,868 340 | 1,2,3,333,7021,15601,15,550 341 | 1,2,2617,1188,5332,9584,573,1942 342 | 2,3,381,4025,9670,388,7271,1371 343 | 2,3,2320,5763,11238,767,5162,2158 344 | 1,3,255,5758,5923,349,4595,1328 345 | 2,3,1689,6964,26316,1456,15469,37 346 | 1,3,3043,1172,1763,2234,217,379 347 | 1,3,1198,2602,8335,402,3843,303 348 | 2,3,2771,6939,15541,2693,6600,1115 349 | 2,3,27380,7184,12311,2809,4621,1022 350 | 1,3,3428,2380,2028,1341,1184,665 351 | 2,3,5981,14641,20521,2005,12218,445 352 | 1,3,3521,1099,1997,1796,173,995 353 | 2,3,1210,10044,22294,1741,12638,3137 354 | 1,3,608,1106,1533,830,90,195 355 | 2,3,117,6264,21203,228,8682,1111 356 | 1,3,14039,7393,2548,6386,1333,2341 357 | 1,3,190,727,2012,245,184,127 358 | 1,3,22686,134,218,3157,9,548 359 | 2,3,37,1275,22272,137,6747,110 360 | 1,3,759,18664,1660,6114,536,4100 361 | 1,3,796,5878,2109,340,232,776 362 | 1,3,19746,2872,2006,2601,468,503 363 | 1,3,4734,607,864,1206,159,405 364 | 1,3,2121,1601,2453,560,179,712 365 | 1,3,4627,997,4438,191,1335,314 366 | 1,3,2615,873,1524,1103,514,468 367 | 2,3,4692,6128,8025,1619,4515,3105 368 | 1,3,9561,2217,1664,1173,222,447 369 | 1,3,3477,894,534,1457,252,342 370 | 1,3,22335,1196,2406,2046,101,558 371 | 1,3,6211,337,683,1089,41,296 372 | 2,3,39679,3944,4955,1364,523,2235 373 | 1,3,20105,1887,1939,8164,716,790 374 | 1,3,3884,3801,1641,876,397,4829 375 | 2,3,15076,6257,7398,1504,1916,3113 376 | 1,3,6338,2256,1668,1492,311,686 377 | 1,3,5841,1450,1162,597,476,70 378 | 2,3,3136,8630,13586,5641,4666,1426 379 | 1,3,38793,3154,2648,1034,96,1242 380 | 1,3,3225,3294,1902,282,68,1114 381 | 2,3,4048,5164,10391,130,813,179 382 | 1,3,28257,944,2146,3881,600,270 383 | 1,3,17770,4591,1617,9927,246,532 384 | 1,3,34454,7435,8469,2540,1711,2893 385 | 1,3,1821,1364,3450,4006,397,361 386 | 1,3,10683,21858,15400,3635,282,5120 387 | 1,3,11635,922,1614,2583,192,1068 
388 | 1,3,1206,3620,2857,1945,353,967 389 | 1,3,20918,1916,1573,1960,231,961 390 | 1,3,9785,848,1172,1677,200,406 391 | 1,3,9385,1530,1422,3019,227,684 392 | 1,3,3352,1181,1328,5502,311,1000 393 | 1,3,2647,2761,2313,907,95,1827 394 | 1,3,518,4180,3600,659,122,654 395 | 1,3,23632,6730,3842,8620,385,819 396 | 1,3,12377,865,3204,1398,149,452 397 | 1,3,9602,1316,1263,2921,841,290 398 | 2,3,4515,11991,9345,2644,3378,2213 399 | 1,3,11535,1666,1428,6838,64,743 400 | 1,3,11442,1032,582,5390,74,247 401 | 1,3,9612,577,935,1601,469,375 402 | 1,3,4446,906,1238,3576,153,1014 403 | 1,3,27167,2801,2128,13223,92,1902 404 | 1,3,26539,4753,5091,220,10,340 405 | 1,3,25606,11006,4604,127,632,288 406 | 1,3,18073,4613,3444,4324,914,715 407 | 1,3,6884,1046,1167,2069,593,378 408 | 1,3,25066,5010,5026,9806,1092,960 409 | 2,3,7362,12844,18683,2854,7883,553 410 | 2,3,8257,3880,6407,1646,2730,344 411 | 1,3,8708,3634,6100,2349,2123,5137 412 | 1,3,6633,2096,4563,1389,1860,1892 413 | 1,3,2126,3289,3281,1535,235,4365 414 | 1,3,97,3605,12400,98,2970,62 415 | 1,3,4983,4859,6633,17866,912,2435 416 | 1,3,5969,1990,3417,5679,1135,290 417 | 2,3,7842,6046,8552,1691,3540,1874 418 | 2,3,4389,10940,10908,848,6728,993 419 | 1,3,5065,5499,11055,364,3485,1063 420 | 2,3,660,8494,18622,133,6740,776 421 | 1,3,8861,3783,2223,633,1580,1521 422 | 1,3,4456,5266,13227,25,6818,1393 423 | 2,3,17063,4847,9053,1031,3415,1784 424 | 1,3,26400,1377,4172,830,948,1218 425 | 2,3,17565,3686,4657,1059,1803,668 426 | 2,3,16980,2884,12232,874,3213,249 427 | 1,3,11243,2408,2593,15348,108,1886 428 | 1,3,13134,9347,14316,3141,5079,1894 429 | 1,3,31012,16687,5429,15082,439,1163 430 | 1,3,3047,5970,4910,2198,850,317 431 | 1,3,8607,1750,3580,47,84,2501 432 | 1,3,3097,4230,16483,575,241,2080 433 | 1,3,8533,5506,5160,13486,1377,1498 434 | 1,3,21117,1162,4754,269,1328,395 435 | 1,3,1982,3218,1493,1541,356,1449 436 | 1,3,16731,3922,7994,688,2371,838 437 | 1,3,29703,12051,16027,13135,182,2204 438 | 1,3,39228,1431,764,4510,93,2346 439 | 
2,3,14531,15488,30243,437,14841,1867 440 | 1,3,10290,1981,2232,1038,168,2125 441 | 1,3,2787,1698,2510,65,477,52 442 | -------------------------------------------------------------------------------- /projects/creating_customer_segments/renders.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.cm as cm 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.decomposition import pca 6 | 7 | def pca_results(good_data, pca): 8 | ''' 9 | Create a DataFrame of the PCA results 10 | Includes dimension feature weights and explained variance 11 | Visualizes the PCA results 12 | ''' 13 | 14 | # Dimension indexing 15 | dimensions = dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] 16 | 17 | # PCA components 18 | components = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys()) 19 | components.index = dimensions 20 | 21 | # PCA explained variance 22 | ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 23 | variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 24 | variance_ratios.index = dimensions 25 | 26 | # Create a bar plot visualization 27 | fig, ax = plt.subplots(figsize = (14,8)) 28 | 29 | # Plot the feature weights as a function of the components 30 | components.plot(ax = ax, kind = 'bar'); 31 | ax.set_ylabel("Feature Weights") 32 | ax.set_xticklabels(dimensions, rotation=0) 33 | 34 | 35 | # Display the explained variance ratios 36 | for i, ev in enumerate(pca.explained_variance_ratio_): 37 | ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) 38 | 39 | # Return a concatenated DataFrame 40 | return pd.concat([variance_ratios, components], axis = 1) 41 | 42 | def cluster_results(reduced_data, preds, centers, pca_samples): 43 | ''' 44 | Visualizes the PCA-reduced cluster data in two dimensions 45 | Adds cues for cluster centers and student-selected 
sample data 46 | ''' 47 | 48 | predictions = pd.DataFrame(preds, columns = ['Cluster']) 49 | plot_data = pd.concat([predictions, reduced_data], axis = 1) 50 | 51 | # Generate the cluster plot 52 | fig, ax = plt.subplots(figsize = (14,8)) 53 | 54 | # Color map 55 | cmap = cm.get_cmap('gist_rainbow') 56 | 57 | # Color the points based on assigned cluster 58 | for i, cluster in plot_data.groupby('Cluster'): 59 | cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 60 | color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); 61 | 62 | # Plot centers with indicators 63 | for i, c in enumerate(centers): 64 | ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ 65 | alpha = 1, linewidth = 2, marker = 'o', s=200); 66 | ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); 67 | 68 | # Plot transformed sample points 69 | ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ 70 | s = 150, linewidth = 4, color = 'black', marker = 'x'); 71 | 72 | # Set plot title 73 | ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross"); 74 | 75 | 76 | def channel_results(reduced_data, outliers, pca_samples): 77 | ''' 78 | Visualizes the PCA-reduced cluster data in two dimensions using the full dataset 79 | Data is labeled by "Channel" and cues added for student-selected sample data 80 | ''' 81 | 82 | # Check that the dataset is loadable 83 | try: 84 | full_data = pd.read_csv("customers.csv") 85 | except: 86 | print "Dataset could not be loaded. Is the file missing?" 
87 | return False 88 | 89 | # Create the Channel DataFrame 90 | channel = pd.DataFrame(full_data['Channel'], columns = ['Channel']) 91 | channel = channel.drop(channel.index[outliers]).reset_index(drop = True) 92 | labeled = pd.concat([reduced_data, channel], axis = 1) 93 | 94 | # Generate the cluster plot 95 | fig, ax = plt.subplots(figsize = (14,8)) 96 | 97 | # Color map 98 | cmap = cm.get_cmap('gist_rainbow') 99 | 100 | # Color the points based on assigned Channel 101 | labels = ['Hotel/Restaurant/Cafe', 'Retailer'] 102 | grouped = labeled.groupby('Channel') 103 | for i, channel in grouped: 104 | channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 105 | color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); 106 | 107 | # Plot transformed sample points 108 | for i, sample in enumerate(pca_samples): 109 | ax.scatter(x = sample[0], y = sample[1], \ 110 | s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); 111 | ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); 112 | 113 | # Set plot title 114 | ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled"); -------------------------------------------------------------------------------- /projects/smartcab/README.md: -------------------------------------------------------------------------------- 1 | #MLND Slack Team Walkthrough. 2 | 3 | We have seen in our experience, that many students struggle with this project, largely due to the code. 4 | This fork of the standard P4, is made to help you understand some of the complicated code used in P4. Please review the code, add your comments and understanding. 5 | 6 | The ideal situation here is to combine our knowledge and explanation of of what's going in the code. 7 | Everyone is welcome to contribute, please use the pull request system. 
8 | 9 | 10 | 11 | 12 | 13 | --- 14 | #### Original Udacity Readme 15 | #### Project 4: Reinforcement Learning 16 | #### Train a Smartcab How to Drive 17 | 18 | ### Install 19 | 20 | This project requires **Python 2.7** with the [pygame](https://www.pygame.org/wiki/GettingStarted 21 | ) library installed 22 | 23 | ### Code 24 | 25 | Template code is provided in the `smartcab/agent.py` python file. Additional supporting python code can be found in `smartcab/enviroment.py`, `smartcab/planner.py`, and `smartcab/simulator.py`. Supporting images for the graphical user interface can be found in the `images` folder. While some code has already been implemented to get you started, you will need to implement additional functionality for the `LearningAgent` class in `agent.py` when requested to successfully complete the project. 26 | 27 | ### Run 28 | 29 | In a terminal or command window, navigate to the top-level project directory `smartcab/` (that contains this README) and run one of the following commands: 30 | 31 | ```python smartcab/agent.py``` 32 | ```python -m smartcab.agent``` 33 | 34 | This will run the `agent.py` file and execute your agent code. 
35 | -------------------------------------------------------------------------------- /projects/smartcab/images/car-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/smartcab/images/car-black.png -------------------------------------------------------------------------------- /projects/smartcab/images/car-blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/smartcab/images/car-blue.png -------------------------------------------------------------------------------- /projects/smartcab/images/car-cyan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/smartcab/images/car-cyan.png -------------------------------------------------------------------------------- /projects/smartcab/images/car-green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/smartcab/images/car-green.png -------------------------------------------------------------------------------- /projects/smartcab/images/car-magenta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnanodegree/machine-learning/fbd4ebc475a5811f0f3dcbcc0c49d1be8e5b6aa9/projects/smartcab/images/car-magenta.png -------------------------------------------------------------------------------- /projects/smartcab/images/car-orange.png: 
import random
from environment import Agent, Environment
from planner import RoutePlanner
from simulator import Simulator

class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # Why super()? It invokes Agent.__init__ on this instance, which sets
        # self.env = env, state = None, next_waypoint = None, and a default
        # color -- so every agent subclass is wired to the same shared
        # Environment object without repeating that setup here.
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color

        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        # Called by Environment.reset() at the start of each trial.
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Called by Environment.step() once per simulation step; t is the step index.
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]


def run():
    """Run the agent for a finite number of trials."""

    # Set up environment and agent
    e = Environment()  # create environment (also adds some dummy traffic)
    a = e.create_agent(LearningAgent)  # create agent
    e.set_primary_agent(a, enforce_deadline=True)  # specify agent to track
    # NOTE: You can set enforce_deadline=False while debugging to allow longer trials

    # Now simulate it
    sim = Simulator(e, update_delay=0.5, display=True)  # create simulator (uses pygame when display=True, if available)
    # NOTE: To speed up simulation, reduce update_delay and/or set display=False

    sim.run(n_trials=100)  # run for a specified number of trials
    # NOTE: To quit midway, press Esc or close pygame window, or hit Ctrl+C on the command-line


if __name__ == '__main__':
    run()
import time
import random
from collections import OrderedDict

# NOTE(review): Simulator appears unused in this module and simulator.py does
# not import environment, so this looks like a leftover import -- confirm
# before removing.
from simulator import Simulator

class TrafficLight(object):
    """A traffic light that switches periodically."""

    valid_states = [True, False]  # True = NS open, False = EW open

    def __init__(self, state=None, period=None):
        # Randomize initial state and switching period when not given.
        self.state = state if state is not None else random.choice(self.valid_states)
        self.period = period if period is not None else random.choice([3, 4, 5])
        self.last_updated = 0

    def reset(self):
        # Restart the switch timer (state itself is left as-is).
        self.last_updated = 0

    def update(self, t):
        # Flip the light once `period` steps have elapsed since the last flip.
        if t - self.last_updated >= self.period:
            self.state = not self.state  # assuming state is boolean
            self.last_updated = t


class Environment(object):
    """Environment within which all agents operate."""

    valid_actions = [None, 'forward', 'left', 'right']
    valid_inputs = {'light': TrafficLight.valid_states, 'oncoming': valid_actions, 'left': valid_actions, 'right': valid_actions}
    valid_headings = [(1, 0), (0, -1), (-1, 0), (0, 1)]  # ENWS
    hard_time_limit = -100  # even if enforce_deadline is False, end trial when deadline reaches this value (to avoid deadlocks)

    def __init__(self, num_dummies=3):
        """Build the road grid, traffic lights, and dummy agents."""
        self.num_dummies = num_dummies  # no. of dummy agents

        # Initialize simulation variables
        self.done = False
        self.t = 0
        self.agent_states = OrderedDict()
        self.status_text = ""

        # Road network
        self.grid_size = (8, 6)  # (cols, rows)
        self.bounds = (1, 1, self.grid_size[0], self.grid_size[1])
        self.block_size = 100
        self.intersections = OrderedDict()
        self.roads = []
        for x in xrange(self.bounds[0], self.bounds[2] + 1):
            for y in xrange(self.bounds[1], self.bounds[3] + 1):
                self.intersections[(x, y)] = TrafficLight()  # a traffic light at each intersection

        # Connect every pair of adjacent intersections with a road segment.
        for a in self.intersections:
            for b in self.intersections:
                if a == b:
                    continue
                if (abs(a[0] - b[0]) + abs(a[1] - b[1])) == 1:  # L1 distance = 1
                    self.roads.append((a, b))

        # Dummy agents
        for i in xrange(self.num_dummies):
            self.create_agent(DummyAgent)

        # Primary agent and associated parameters
        self.primary_agent = None  # to be set explicitly
        self.enforce_deadline = False

    def create_agent(self, agent_class, *args, **kwargs):
        """Instantiate agent_class, register it at a random intersection, and return it."""
        agent = agent_class(self, *args, **kwargs)
        self.agent_states[agent] = {'location': random.choice(self.intersections.keys()), 'heading': (0, 1)}
        return agent

    def set_primary_agent(self, agent, enforce_deadline=False):
        """Mark `agent` as the tracked learner; optionally enforce its deadline."""
        self.primary_agent = agent
        self.enforce_deadline = enforce_deadline

    def reset(self):
        """Start a new trial: new start/destination/deadline, re-seed all agents."""
        self.done = False
        self.t = 0

        # Reset traffic lights
        for traffic_light in self.intersections.itervalues():
            traffic_light.reset()

        # Pick a start and a destination
        start = random.choice(self.intersections.keys())
        destination = random.choice(self.intersections.keys())

        # Ensure starting location and destination are not too close
        while self.compute_dist(start, destination) < 4:
            start = random.choice(self.intersections.keys())
            destination = random.choice(self.intersections.keys())

        start_heading = random.choice(self.valid_headings)
        deadline = self.compute_dist(start, destination) * 5  # 5 steps per unit of L1 distance
        print "Environment.reset(): Trial set up with start = {}, destination = {}, deadline = {}".format(start, destination, deadline)

        # Initialize agent(s): only the primary agent gets the chosen start,
        # destination, and deadline; dummies are scattered randomly.
        for agent in self.agent_states.iterkeys():
            self.agent_states[agent] = {
                'location': start if agent is self.primary_agent else random.choice(self.intersections.keys()),
                'heading': start_heading if agent is self.primary_agent else random.choice(self.valid_headings),
                'destination': destination if agent is self.primary_agent else None,
                'deadline': deadline if agent is self.primary_agent else None}
            agent.reset(destination=(destination if agent is self.primary_agent else None))

    def step(self):
        """Advance the simulation one step: lights, then agents, then deadline checks."""
        #print "Environment.step(): t = {}".format(self.t)  # [debug]

        # Update traffic lights
        for intersection, traffic_light in self.intersections.iteritems():
            traffic_light.update(self.t)

        # Update agents
        for agent in self.agent_states.iterkeys():
            agent.update(self.t)

        if self.done:
            return  # primary agent might have reached destination

        if self.primary_agent is not None:
            agent_deadline = self.agent_states[self.primary_agent]['deadline']
            if agent_deadline <= self.hard_time_limit:
                self.done = True
                print "Environment.step(): Primary agent hit hard time limit ({})! Trial aborted.".format(self.hard_time_limit)
            elif self.enforce_deadline and agent_deadline <= 0:
                self.done = True
                print "Environment.step(): Primary agent ran out of time! Trial aborted."
            self.agent_states[self.primary_agent]['deadline'] = agent_deadline - 1

        self.t += 1

    def sense(self, agent):
        """Return the agent's view of its intersection.

        Produces a dict with 'light' ('green'/'red' relative to the agent's
        heading) and the declared next waypoint of any 'oncoming', 'left', or
        'right' vehicle at the same intersection (None when absent).
        """
        assert agent in self.agent_states, "Unknown agent!"

        state = self.agent_states[agent]
        location = state['location']
        heading = state['heading']
        # Light is green when the open axis (NS vs EW) matches the agent's heading axis.
        light = 'green' if (self.intersections[location].state and heading[1] != 0) or ((not self.intersections[location].state) and heading[0] != 0) else 'red'

        # Populate oncoming, left, right
        oncoming = None
        left = None
        right = None
        for other_agent, other_state in self.agent_states.iteritems():
            # Skip self, agents elsewhere, and agents heading the same way.
            if agent == other_agent or location != other_state['location'] or (heading[0] == other_state['heading'][0] and heading[1] == other_state['heading'][1]):
                continue
            other_heading = other_agent.get_next_waypoint()
            # Dot product of headings == -1 means the other car faces us head-on.
            if (heading[0] * other_state['heading'][0] + heading[1] * other_state['heading'][1]) == -1:
                if oncoming != 'left':  # we don't want to override oncoming == 'left'
                    oncoming = other_heading
            elif (heading[1] == other_state['heading'][0] and -heading[0] == other_state['heading'][1]):
                if right != 'forward' and right != 'left':  # we don't want to override right == 'forward or 'left'
                    right = other_heading
            else:
                if left != 'forward':  # we don't want to override left == 'forward'
                    left = other_heading

        return {'light': light, 'oncoming': oncoming, 'left': left, 'right': right}

    def get_deadline(self, agent):
        """Remaining deadline for the primary agent; None for all other agents."""
        return self.agent_states[agent]['deadline'] if agent is self.primary_agent else None

    def act(self, agent, action):
        """Attempt `action` for `agent`, applying traffic rules, and return the reward.

        Rewards: 2.0 for a valid move matching the agent's waypoint, -0.5 for a
        valid but off-route move, 0.0 for idling, -1.0 for an illegal move, and
        +10 bonus when the primary agent reaches its destination in time.
        """
        assert agent in self.agent_states, "Unknown agent!"
        assert action in self.valid_actions, "Invalid action!"

        state = self.agent_states[agent]
        location = state['location']
        heading = state['heading']
        light = 'green' if (self.intersections[location].state and heading[1] != 0) or ((not self.intersections[location].state) and heading[0] != 0) else 'red'
        inputs = self.sense(agent)

        # Move agent if within bounds and obeys traffic rules
        reward = 0  # reward/penalty
        move_okay = True
        if action == 'forward':
            if light != 'green':
                move_okay = False
        elif action == 'left':
            # Left turn needs green and no oncoming traffic going forward/right.
            if light == 'green' and (inputs['oncoming'] == None or inputs['oncoming'] == 'left'):
                heading = (heading[1], -heading[0])
            else:
                move_okay = False
        elif action == 'right':
            # Right on red is allowed unless cross-traffic from the left goes forward.
            if light == 'green' or inputs['left'] != 'forward':
                heading = (-heading[1], heading[0])
            else:
                move_okay = False

        if move_okay:
            # Valid move (could be null)
            if action is not None:
                # Valid non-null move
                location = ((location[0] + heading[0] - self.bounds[0]) % (self.bounds[2] - self.bounds[0] + 1) + self.bounds[0],
                            (location[1] + heading[1] - self.bounds[1]) % (self.bounds[3] - self.bounds[1] + 1) + self.bounds[1])  # wrap-around
                #if self.bounds[0] <= location[0] <= self.bounds[2] and self.bounds[1] <= location[1] <= self.bounds[3]:  # bounded
                state['location'] = location
                state['heading'] = heading
                reward = 2.0 if action == agent.get_next_waypoint() else -0.5  # valid, but is it correct? (as per waypoint)
            else:
                # Valid null move
                reward = 0.0
        else:
            # Invalid move
            reward = -1.0

        if agent is self.primary_agent:
            if state['location'] == state['destination']:
                if state['deadline'] >= 0:
                    reward += 10  # bonus
                self.done = True
                print "Environment.act(): Primary agent has reached destination!"  # [debug]
            self.status_text = "state: {}\naction: {}\nreward: {}".format(agent.get_state(), action, reward)
            #print "Environment.act() [POST]: location: {}, heading: {}, action: {}, reward: {}".format(location, heading, action, reward)  # [debug]

        return reward

    def compute_dist(self, a, b):
        """L1 distance between two points."""
        return abs(b[0] - a[0]) + abs(b[1] - a[1])


class Agent(object):
    """Base class for all agents."""

    def __init__(self, env):
        self.env = env              # the shared Environment
        self.state = None           # agent-defined state (shown in status text)
        self.next_waypoint = None   # where the agent intends to go next
        self.color = 'cyan'         # default sprite color

    def reset(self, destination=None):
        # Hook called at the start of each trial; subclasses override.
        pass

    def update(self, t):
        # Hook called once per step; subclasses override.
        pass

    def get_state(self):
        return self.state

    def get_next_waypoint(self):
        return self.next_waypoint


class DummyAgent(Agent):
    """Random traffic: picks a random waypoint and follows basic right-of-way rules."""

    color_choices = ['blue', 'cyan', 'magenta', 'orange']

    def __init__(self, env):
        super(DummyAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.next_waypoint = random.choice(Environment.valid_actions[1:])
        self.color = random.choice(self.color_choices)

    def update(self, t):
        """Take the pending waypoint action if traffic rules allow, else idle."""
        inputs = self.env.sense(self)

        action_okay = True
        if self.next_waypoint == 'right':
            # Right on red blocked only by cross-traffic from the left going forward.
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            # Left turn blocked by red light or oncoming forward/right traffic.
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        action = None
        if action_okay:
            action = self.next_waypoint
            self.next_waypoint = random.choice(Environment.valid_actions[1:])  # queue a new random goal
        reward = self.env.act(self, action)
{}, inputs = {}, action = {}, reward = {}".format(t, inputs, action, reward) # [debug] 274 | #print "DummyAgent.update(): next_waypoint = {}".format(self.next_waypoint) # [debug] 275 | -------------------------------------------------------------------------------- /projects/smartcab/smartcab/planner.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class RoutePlanner(object): 4 | """Silly route planner that is meant for a perpendicular grid network.""" 5 | 6 | def __init__(self, env, agent): 7 | self.env = env 8 | self.agent = agent 9 | self.destination = None 10 | 11 | def route_to(self, destination=None): 12 | self.destination = destination if destination is not None else random.choice(self.env.intersections.keys()) 13 | print "RoutePlanner.route_to(): destination = {}".format(destination) # [debug] 14 | 15 | def next_waypoint(self): 16 | location = self.env.agent_states[self.agent]['location'] 17 | heading = self.env.agent_states[self.agent]['heading'] 18 | delta = (self.destination[0] - location[0], self.destination[1] - location[1]) 19 | if delta[0] == 0 and delta[1] == 0: 20 | return None 21 | elif delta[0] != 0: # EW difference 22 | if delta[0] * heading[0] > 0: # facing correct EW direction 23 | return 'forward' 24 | elif delta[0] * heading[0] < 0: # facing opposite EW direction 25 | return 'right' # long U-turn 26 | elif delta[0] * heading[1] > 0: 27 | return 'left' 28 | else: 29 | return 'right' 30 | elif delta[1] != 0: # NS difference (turn logic is slightly different) 31 | if delta[1] * heading[1] > 0: # facing correct NS direction 32 | return 'forward' 33 | elif delta[1] * heading[1] < 0: # facing opposite NS direction 34 | return 'right' # long U-turn 35 | elif delta[1] * heading[0] > 0: 36 | return 'right' 37 | else: 38 | return 'left' 39 | -------------------------------------------------------------------------------- /projects/smartcab/smartcab/simulator.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import importlib 5 | 6 | class Simulator(object): 7 | """Simulates agents in a dynamic smartcab environment. 8 | 9 | Uses PyGame to display GUI, if available. 10 | """ 11 | 12 | colors = { 13 | 'black' : ( 0, 0, 0), 14 | 'white' : (255, 255, 255), 15 | 'red' : (255, 0, 0), 16 | 'green' : ( 0, 255, 0), 17 | 'blue' : ( 0, 0, 255), 18 | 'cyan' : ( 0, 200, 200), 19 | 'magenta' : (200, 0, 200), 20 | 'yellow' : (255, 255, 0), 21 | 'orange' : (255, 128, 0) 22 | } 23 | 24 | def __init__(self, env, size=None, update_delay=1.0, display=True): 25 | self.env = env 26 | self.size = size if size is not None else ((self.env.grid_size[0] + 1) * self.env.block_size, (self.env.grid_size[1] + 1) * self.env.block_size) 27 | self.width, self.height = self.size 28 | 29 | self.bg_color = self.colors['white'] 30 | self.road_width = 5 31 | self.road_color = self.colors['black'] 32 | 33 | self.quit = False 34 | self.start_time = None 35 | self.current_time = 0.0 36 | self.last_updated = 0.0 37 | self.update_delay = update_delay # duration between each step (in secs) 38 | 39 | self.display = display 40 | if self.display: 41 | try: 42 | self.pygame = importlib.import_module('pygame') 43 | self.pygame.init() 44 | self.screen = self.pygame.display.set_mode(self.size) 45 | 46 | self.frame_delay = max(1, int(self.update_delay * 1000)) # delay between GUI frames in ms (min: 1) 47 | self.agent_sprite_size = (32, 32) 48 | self.agent_circle_radius = 10 # radius of circle, when using simple representation 49 | for agent in self.env.agent_states: 50 | agent._sprite = self.pygame.transform.smoothscale(self.pygame.image.load(os.path.join("images", "car-{}.png".format(agent.color))), self.agent_sprite_size) 51 | agent._sprite_size = (agent._sprite.get_width(), agent._sprite.get_height()) 52 | 53 | self.font = self.pygame.font.Font(None, 28) 54 | self.paused = False 55 | 
except ImportError as e: 56 | self.display = False 57 | print "Simulator.__init__(): Unable to import pygame; display disabled.\n{}: {}".format(e.__class__.__name__, e) 58 | except Exception as e: 59 | self.display = False 60 | print "Simulator.__init__(): Error initializing GUI objects; display disabled.\n{}: {}".format(e.__class__.__name__, e) 61 | 62 | def run(self, n_trials=1): 63 | self.quit = False 64 | for trial in xrange(n_trials): 65 | print "Simulator.run(): Trial {}".format(trial) # [debug] 66 | self.env.reset() 67 | self.current_time = 0.0 68 | self.last_updated = 0.0 69 | self.start_time = time.time() 70 | while True: 71 | try: 72 | # Update current time 73 | self.current_time = time.time() - self.start_time 74 | #print "Simulator.run(): current_time = {:.3f}".format(self.current_time) 75 | 76 | # Handle GUI events 77 | if self.display: 78 | for event in self.pygame.event.get(): 79 | if event.type == self.pygame.QUIT: 80 | self.quit = True 81 | elif event.type == self.pygame.KEYDOWN: 82 | if event.key == 27: # Esc 83 | self.quit = True 84 | elif event.unicode == u' ': 85 | self.paused = True 86 | 87 | if self.paused: 88 | self.pause() 89 | 90 | # Update environment 91 | if self.current_time - self.last_updated >= self.update_delay: 92 | self.env.step() 93 | self.last_updated = self.current_time 94 | 95 | # Render GUI and sleep 96 | if self.display: 97 | self.render() 98 | self.pygame.time.wait(self.frame_delay) 99 | except KeyboardInterrupt: 100 | self.quit = True 101 | finally: 102 | if self.quit or self.env.done: 103 | break 104 | 105 | if self.quit: 106 | break 107 | 108 | def render(self): 109 | # Clear screen 110 | self.screen.fill(self.bg_color) 111 | 112 | # Draw elements 113 | # * Static elements 114 | for road in self.env.roads: 115 | self.pygame.draw.line(self.screen, self.road_color, (road[0][0] * self.env.block_size, road[0][1] * self.env.block_size), (road[1][0] * self.env.block_size, road[1][1] * self.env.block_size), self.road_width) 116 
| 117 | for intersection, traffic_light in self.env.intersections.iteritems(): 118 | self.pygame.draw.circle(self.screen, self.road_color, (intersection[0] * self.env.block_size, intersection[1] * self.env.block_size), 10) 119 | if traffic_light.state: # North-South is open 120 | self.pygame.draw.line(self.screen, self.colors['green'], 121 | (intersection[0] * self.env.block_size, intersection[1] * self.env.block_size - 15), 122 | (intersection[0] * self.env.block_size, intersection[1] * self.env.block_size + 15), self.road_width) 123 | else: # East-West is open 124 | self.pygame.draw.line(self.screen, self.colors['green'], 125 | (intersection[0] * self.env.block_size - 15, intersection[1] * self.env.block_size), 126 | (intersection[0] * self.env.block_size + 15, intersection[1] * self.env.block_size), self.road_width) 127 | 128 | # * Dynamic elements 129 | for agent, state in self.env.agent_states.iteritems(): 130 | # Compute precise agent location here (back from the intersection some) 131 | agent_offset = (2 * state['heading'][0] * self.agent_circle_radius, 2 * state['heading'][1] * self.agent_circle_radius) 132 | agent_pos = (state['location'][0] * self.env.block_size - agent_offset[0], state['location'][1] * self.env.block_size - agent_offset[1]) 133 | agent_color = self.colors[agent.color] 134 | if hasattr(agent, '_sprite') and agent._sprite is not None: 135 | # Draw agent sprite (image), properly rotated 136 | rotated_sprite = agent._sprite if state['heading'] == (1, 0) else self.pygame.transform.rotate(agent._sprite, 180 if state['heading'][0] == -1 else state['heading'][1] * -90) 137 | self.screen.blit(rotated_sprite, 138 | self.pygame.rect.Rect(agent_pos[0] - agent._sprite_size[0] / 2, agent_pos[1] - agent._sprite_size[1] / 2, 139 | agent._sprite_size[0], agent._sprite_size[1])) 140 | else: 141 | # Draw simple agent (circle with a short line segment poking out to indicate heading) 142 | self.pygame.draw.circle(self.screen, agent_color, agent_pos, 
self.agent_circle_radius) 143 | self.pygame.draw.line(self.screen, agent_color, agent_pos, state['location'], self.road_width) 144 | if agent.get_next_waypoint() is not None: 145 | self.screen.blit(self.font.render(agent.get_next_waypoint(), True, agent_color, self.bg_color), (agent_pos[0] + 10, agent_pos[1] + 10)) 146 | if state['destination'] is not None: 147 | self.pygame.draw.circle(self.screen, agent_color, (state['destination'][0] * self.env.block_size, state['destination'][1] * self.env.block_size), 6) 148 | self.pygame.draw.circle(self.screen, agent_color, (state['destination'][0] * self.env.block_size, state['destination'][1] * self.env.block_size), 15, 2) 149 | 150 | # * Overlays 151 | text_y = 10 152 | for text in self.env.status_text.split('\n'): 153 | self.screen.blit(self.font.render(text, True, self.colors['red'], self.bg_color), (100, text_y)) 154 | text_y += 20 155 | 156 | # Flip buffers 157 | self.pygame.display.flip() 158 | 159 | def pause(self): 160 | abs_pause_time = time.time() 161 | pause_text = "[PAUSED] Press any key to continue..." 
162 | self.screen.blit(self.font.render(pause_text, True, self.colors['cyan'], self.bg_color), (100, self.height - 40)) 163 | self.pygame.display.flip() 164 | print pause_text # [debug] 165 | while self.paused: 166 | for event in self.pygame.event.get(): 167 | if event.type == self.pygame.KEYDOWN: 168 | self.paused = False 169 | self.pygame.time.wait(self.frame_delay) 170 | self.screen.blit(self.font.render(pause_text, True, self.bg_color, self.bg_color), (100, self.height - 40)) 171 | self.start_time += (time.time() - abs_pause_time) 172 | -------------------------------------------------------------------------------- /projects/student_intervention/README.md: -------------------------------------------------------------------------------- 1 | # Project 2: Supervised Learning 2 | ## Building a Student Intervention System 3 | 4 | ### Install 5 | 6 | This project requires **Python 2.7** and the following Python libraries installed: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [Pandas](http://pandas.pydata.org) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | 12 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 13 | 14 | Udacity recommends our students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 15 | 16 | ### Code 17 | 18 | Template code is provided in the notebook `student_intervention.ipynb` notebook file. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. 
19 | 20 | ### Run 21 | 22 | In a terminal or command window, navigate to the top-level project directory `student_intervention/` (that contains this README) and run one of the following commands: 23 | 24 | ```ipython notebook student_intervention.ipynb``` 25 | ```jupyter notebook student_intervention.ipynb``` 26 | 27 | This will open the iPython Notebook software and project file in your browser. 28 | 29 | ## Data 30 | 31 | The dataset used in this project is included as `student-data.csv`. This dataset has the following attributes: 32 | 33 | - `school` : student's school (binary: "GP" or "MS") 34 | - `sex` : student's sex (binary: "F" - female or "M" - male) 35 | - `age` : student's age (numeric: from 15 to 22) 36 | - `address` : student's home address type (binary: "U" - urban or "R" - rural) 37 | - `famsize` : family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3) 38 | - `Pstatus` : parent's cohabitation status (binary: "T" - living together or "A" - apart) 39 | - `Medu` : mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 40 | - `Fedu` : father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 41 | - `Mjob` : mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other") 42 | - `Fjob` : father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other") 43 | - `reason` : reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other") 44 | - `guardian` : student's guardian (nominal: "mother", "father" or "other") 45 | - `traveltime` : home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. 
to 1 hour, or 4 - >1 hour) 46 | - `studytime` : weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 47 | - `failures` : number of past class failures (numeric: n if 1<=n<3, else 4) 48 | - `schoolsup` : extra educational support (binary: yes or no) 49 | - `famsup` : family educational support (binary: yes or no) 50 | - `paid` : extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 51 | - `activities` : extra-curricular activities (binary: yes or no) 52 | - `nursery` : attended nursery school (binary: yes or no) 53 | - `higher` : wants to take higher education (binary: yes or no) 54 | - `internet` : Internet access at home (binary: yes or no) 55 | - `romantic` : with a romantic relationship (binary: yes or no) 56 | - `famrel` : quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 57 | - `freetime` : free time after school (numeric: from 1 - very low to 5 - very high) 58 | - `goout` : going out with friends (numeric: from 1 - very low to 5 - very high) 59 | - `Dalc` : workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 60 | - `Walc` : weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 61 | - `health` : current health status (numeric: from 1 - very bad to 5 - very good) 62 | - `absences` : number of school absences (numeric: from 0 to 93) 63 | - `passed` : did the student pass the final exam (binary: yes or no) 64 | -------------------------------------------------------------------------------- /projects/titanic_survival_exploration/README.md: -------------------------------------------------------------------------------- 1 | # Project 0: Introduction and Fundamentals 2 | ## Titanic Survival Exploration 3 | 4 | ### Install 5 | 6 | This project requires **Python 2.7** and the following Python libraries installed: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [Pandas](http://pandas.pydata.org) 10 | - 
[matplotlib](http://matplotlib.org/) 11 | - [scikit-learn](http://scikit-learn.org/stable/) 12 | 13 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 14 | 15 | Udacity recommends our students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 16 | 17 | ### Code 18 | 19 | Template code is provided in the notebook `titanic_survival_exploration.ipynb` notebook file. Additional supporting code can be found in `titanic_visualizations.py`. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. 20 | 21 | ### Run 22 | 23 | In a terminal or command window, navigate to the top-level project directory `titanic_survival_exploration/` (that contains this README) and run **one** of the following commands: 24 | 25 | ```bash 26 | jupyter notebook titanic_survival_exploration.ipynb 27 | ``` 28 | or 29 | ```bash 30 | ipython notebook titanic_survival_exploration.ipynb 31 | ``` 32 | 33 | This will open the iPython Notebook software and project file in your web browser. 34 | 35 | ## Data 36 | 37 | The dataset used in this project is included as `titanic_data.csv`. 
This dataset is provided by Udacity and contains the following attributes: 38 | 39 | - `survival` : Survival (0 = No; 1 = Yes) 40 | - `pclass` : Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) 41 | - `name` : Name 42 | - `sex` : Sex 43 | - `age` : Age 44 | - `sibsp` : Number of Siblings/Spouses Aboard 45 | - `parch` : Number of Parents/Children Aboard 46 | - `ticket` : Ticket Number 47 | - `fare` : Passenger Fare 48 | - `cabin` : Cabin 49 | - `embarked` : Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) 50 | -------------------------------------------------------------------------------- /projects/titanic_survival_exploration/titanic_visualizations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | def filter_data(data, condition): 6 | """ 7 | Remove elements that do not match the condition provided. 8 | Takes a data list as input and returns a filtered list. 9 | Conditions should be a list of strings of the following format: 10 | ' ' 11 | where the following operations are valid: >, <, >=, <=, ==, != 12 | 13 | Example: ["Sex == 'male'", 'Age < 18'] 14 | """ 15 | 16 | field, op, value = condition.split(" ") 17 | 18 | # convert value into number or strip excess quotes if string 19 | try: 20 | value = float(value) 21 | except: 22 | value = value.strip("\'\"") 23 | 24 | # get booleans for filtering 25 | if op == ">": 26 | matches = data[field] > value 27 | elif op == "<": 28 | matches = data[field] < value 29 | elif op == ">=": 30 | matches = data[field] >= value 31 | elif op == "<=": 32 | matches = data[field] <= value 33 | elif op == "==": 34 | matches = data[field] == value 35 | elif op == "!=": 36 | matches = data[field] != value 37 | else: # catch invalid operation codes 38 | raise Exception("Invalid comparison operator. 
Only >, <, >=, <=, ==, != allowed.") 39 | 40 | # filter data and outcomes 41 | data = data[matches].reset_index(drop = True) 42 | return data 43 | 44 | def survival_stats(data, outcomes, key, filters = []): 45 | """ 46 | Print out selected statistics regarding survival, given a feature of 47 | interest and any number of filters (including no filters) 48 | """ 49 | 50 | # Check that the key exists 51 | if key not in data.columns.values : 52 | print "'{}' is not a feature of the Titanic data. Did you spell something wrong?".format(key) 53 | return False 54 | 55 | # Return the function before visualizing if 'Cabin' or 'Ticket' 56 | # is selected: too many unique categories to display 57 | if(key == 'Cabin' or key == 'PassengerId' or key == 'Ticket'): 58 | print "'{}' has too many unique categories to display! Try a different feature.".format(key) 59 | return False 60 | 61 | # Merge data and outcomes into single dataframe 62 | all_data = pd.concat([data, outcomes], axis = 1) 63 | 64 | # Apply filters to data 65 | for condition in filters: 66 | all_data = filter_data(all_data, condition) 67 | 68 | # Create outcomes DataFrame 69 | all_data = all_data[[key, 'Survived']] 70 | 71 | # Create plotting figure 72 | plt.figure(figsize=(8,6)) 73 | 74 | # 'Numerical' features 75 | if(key == 'Age' or key == 'Fare'): 76 | 77 | # Remove NaN values from Age data 78 | all_data = all_data[~np.isnan(all_data[key])] 79 | 80 | # Divide the range of data into bins and count survival rates 81 | min_value = all_data[key].min() 82 | max_value = all_data[key].max() 83 | value_range = max_value - min_value 84 | 85 | # 'Fares' has larger range of values than 'Age' so create more bins 86 | if(key == 'Fare'): 87 | bins = np.arange(0, all_data['Fare'].max() + 20, 20) 88 | if(key == 'Age'): 89 | bins = np.arange(0, all_data['Age'].max() + 10, 10) 90 | 91 | # Overlay each bin's survival rates 92 | nonsurv_vals = all_data[all_data['Survived'] == 0][key].reset_index(drop = True) 93 | surv_vals = 
all_data[all_data['Survived'] == 1][key].reset_index(drop = True) 94 | plt.hist(nonsurv_vals, bins = bins, alpha = 0.6, 95 | color = 'red', label = 'Did not survive') 96 | plt.hist(surv_vals, bins = bins, alpha = 0.6, 97 | color = 'green', label = 'Survived') 98 | 99 | # Add legend to plot 100 | plt.xlim(0, bins.max()) 101 | plt.legend(framealpha = 0.8) 102 | 103 | # 'Categorical' features 104 | else: 105 | 106 | # Set the various categories 107 | if(key == 'Pclass'): 108 | values = np.arange(1,4) 109 | if(key == 'Parch' or key == 'SibSp'): 110 | values = np.arange(0,np.max(data[key]) + 1) 111 | if(key == 'Embarked'): 112 | values = ['C', 'Q', 'S'] 113 | if(key == 'Sex'): 114 | values = ['male', 'female'] 115 | 116 | # Create DataFrame containing categories and count of each 117 | frame = pd.DataFrame(index = np.arange(len(values)), columns=(key,'Survived','NSurvived')) 118 | for i, value in enumerate(values): 119 | frame.loc[i] = [value, \ 120 | len(all_data[(all_data['Survived'] == 1) & (all_data[key] == value)]), \ 121 | len(all_data[(all_data['Survived'] == 0) & (all_data[key] == value)])] 122 | 123 | # Set the width of each bar 124 | bar_width = 0.4 125 | 126 | # Display each category's survival rates 127 | for i in np.arange(len(frame)): 128 | nonsurv_bar = plt.bar(i-bar_width, frame.loc[i]['NSurvived'], width = bar_width, color = 'r') 129 | surv_bar = plt.bar(i, frame.loc[i]['Survived'], width = bar_width, color = 'g') 130 | 131 | plt.xticks(np.arange(len(frame)), values) 132 | plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8) 133 | 134 | # Common attributes for plot formatting 135 | plt.xlabel(key) 136 | plt.ylabel('Number of Passengers') 137 | plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key)) 138 | plt.show() 139 | 140 | # Report number of passengers with missing values 141 | if sum(pd.isnull(all_data[key])): 142 | nan_outcomes = all_data[pd.isnull(all_data[key])]['Survived'] 143 | print 
"Passengers with missing '{}' values: {} ({} survived, {} did not survive)".format( \ 144 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0)) 145 | 146 | -------------------------------------------------------------------------------- /projects_cn/boston_housing/README.md: -------------------------------------------------------------------------------- 1 | # 项目1:模型评估与验证 2 | ## 波士顿房价预测 3 | 4 | ### 准备工作 5 | 6 | 这个项目需要安装**Python 2.7**和以下的Python函数库: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [matplotlib](http://matplotlib.org/) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | 12 | 你还需要安装一个软件,以运行和编辑[ipynb](http://jupyter.org/)文件。 13 | 14 | 优达学城推荐学生安装 [Anaconda](https://www.continuum.io/downloads),这是一个常用的Python集成编译环境,且已包含了本项目中所需的全部函数库。我们在P0项目中也有讲解[如何搭建学习环境](https://github.com/udacity/machine-learning/blob/master/projects_cn/titanic_survival_exploration/README.md)。 15 | 16 | ### 编码 17 | 18 | 代码的模版已经在`boston_housing.ipynb`文件中给出。你还会用到`visuals.py`和名为`housing.csv`的数据文件来完成这个项目。我们已经为你提供了一部分代码,但还有些功能需要你来实现才能以完成这个项目。 19 | 20 | ### 运行 21 | 22 | 在终端或命令行窗口中,选定`boston_housing/`的目录下(包含此README文件),运行下方的命令: 23 | 24 | ```jupyter notebook boston_housing.ipynb``` 25 | 26 | 这样就能够启动jupyter notebook软件,并在你的浏览器中打开文件。 27 | 28 | ### 数据 29 | 30 | 31 | 本项目中使用的数据均包含在scikit-learn数据库([`sklearn.datasets.load_boston`] (http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston))中,你无需额外下载。关于数据的更多信息,你可以访问[UCI机器学习库](https://archive.ics.uci.edu/ml/datasets/Housing)。 -------------------------------------------------------------------------------- /projects_cn/boston_housing/boston_housing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 机器学习工程师纳米学位\n", 8 | "## 模型评价与验证\n", 9 | "## 项目 1: 预测波士顿房价\n", 10 | "\n", 11 | "\n", 12 | 
"欢迎来到机器学习工程师纳米学位的第一个项目!在此文件中,有些示例代码已经提供给你,但你还需要实现更多的功能来让项目成功运行。除非有明确要求,你无须修改任何已给出的代码。以**'练习'**开始的标题表示接下来的内容中有需要你必须实现的功能。每一部分都会有详细的指导,需要实现的部分也会在注释中以**'TODO'**标出。请仔细阅读所有的提示!\n", 13 | "\n", 14 | "除了实现代码外,你还**必须**回答一些与项目和实现有关的问题。每一个需要你回答的问题都会以**'问题 X'**为标题。请仔细阅读每个问题,并且在问题后的**'回答'**文字框中写出完整的答案。你的项目将会根据你对问题的回答和撰写代码所实现的功能来进行评分。\n", 15 | "\n", 16 | ">**提示:**Code 和 Markdown 区域可通过 **Shift + Enter** 快捷键运行。此外,Markdown可以通过双击进入编辑模式。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 开始\n", 24 | "在这个项目中,你将利用马萨诸塞州波士顿郊区的房屋信息数据训练和测试一个模型,并对模型的性能和预测能力进行测试。通过该数据训练后的好的模型可以被用来对房屋做特定预测---尤其是对房屋的价值。对于房地产经纪等人的日常工作来说,这样的预测模型被证明非常有价值。\n", 25 | "\n", 26 | "此项目的数据集来自[UCI机器学习知识库](https://archive.ics.uci.edu/ml/datasets/Housing)。波士顿房屋这些数据于1978年开始统计,共506个数据点,涵盖了麻省波士顿不同郊区房屋14种特征的信息。本项目对原始数据集做了以下处理:\n", 27 | "- 有16个`'MEDV'` 值为50.0的数据点被移除。 这很可能是由于这些数据点包含**遗失**或**看不到的值**。\n", 28 | "- 有1个数据点的 `'RM'` 值为8.78. 这是一个异常值,已经被移除。\n", 29 | "- 对于本项目,房屋的`'RM'`, `'LSTAT'`,`'PTRATIO'`以及`'MEDV'`特征是必要的,其余不相关特征已经被移除。\n", 30 | "- `'MEDV'`特征的值已经过必要的数学转换,可以反映35年来市场的通货膨胀效应。\n", 31 | "\n", 32 | "运行下面区域的代码以载入波士顿房屋数据集,以及一些此项目所需的Python库。如果成功返回数据集的大小,表示数据集已载入成功。" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# Import libraries necessary for this project\n", 44 | "# 载入此项目所需要的库\n", 45 | "import numpy as np\n", 46 | "import pandas as pd\n", 47 | "import visuals as vs # Supplementary code\n", 48 | "from sklearn.cross_validation import ShuffleSplit\n", 49 | "\n", 50 | "# Pretty display for notebooks\n", 51 | "# 让结果在notebook中显示\n", 52 | "%matplotlib inline\n", 53 | "\n", 54 | "# Load the Boston housing dataset\n", 55 | "# 载入波士顿房屋的数据集\n", 56 | "data = pd.read_csv('housing.csv')\n", 57 | "prices = data['MEDV']\n", 58 | "features = data.drop('MEDV', axis = 1)\n", 59 | " \n", 60 | "# Success\n", 61 | "# 完成\n", 62 | "print \"Boston housing dataset has {} data points 
with {} variables each.\".format(*data.shape)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## 分析数据\n", 70 | "在项目的第一个部分,你会对波士顿房地产数据进行初步的观察并给出你的分析。通过对数据的探索来熟悉数据可以让你更好地理解和解释你的结果。\n", 71 | "\n", 72 | "由于这个项目的最终目标是建立一个预测房屋价值的模型,我们需要将数据集分为**特征(features)**和**目标变量(target variable)**。**特征** `'RM'`, `'LSTAT'`,和 `'PTRATIO'`,给我们提供了每个数据点的数量相关的信息。**目标变量**:` 'MEDV'`,是我们希望预测的变量。他们分别被存在`features`和`prices`两个变量名中。" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 练习:基础统计运算\n", 80 | "你的第一个编程练习是计算有关波士顿房价的描述统计数据。我们已为你导入了` numpy `,你需要使用这个库来执行必要的计算。这些统计数据对于分析模型的预测结果非常重要的。\n", 81 | "在下面的代码中,你要做的是:\n", 82 | "- 计算`prices`中的`'MEDV'`的最小值、最大值、均值、中值和标准差;\n", 83 | "- 将运算结果储存在相应的变量中。" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# TODO: Minimum price of the data\n", 95 | "#目标:计算价值的最小值\n", 96 | "minimum_price = None\n", 97 | "\n", 98 | "# TODO: Maximum price of the data\n", 99 | "#目标:计算价值的最大值\n", 100 | "maximum_price = None\n", 101 | "\n", 102 | "# TODO: Mean price of the data\n", 103 | "#目标:计算价值的平均值\n", 104 | "mean_price = None\n", 105 | "\n", 106 | "# TODO: Median price of the data\n", 107 | "#目标:计算价值的中值\n", 108 | "median_price = None\n", 109 | "\n", 110 | "# TODO: Standard deviation of prices of the data\n", 111 | "#目标:计算价值的标准差\n", 112 | "std_price = None\n", 113 | "\n", 114 | "# Show the calculated statistics\n", 115 | "#目标:输出计算的结果\n", 116 | "print \"Statistics for Boston housing dataset:\\n\"\n", 117 | "print \"Minimum price: ${:,.2f}\".format(minimum_price)\n", 118 | "print \"Maximum price: ${:,.2f}\".format(maximum_price)\n", 119 | "print \"Mean price: ${:,.2f}\".format(mean_price)\n", 120 | "print \"Median price ${:,.2f}\".format(median_price)\n", 121 | "print \"Standard deviation of prices: ${:,.2f}\".format(std_price)" 122 | ] 123 | }, 124 | { 125 | "cell_type": 
"markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### 问题1 - 特征观察\n", 129 | "\n", 130 | "如前文所述,本项目中我们关注的是其中三个值:`'RM'`、`'LSTAT'` 和`'PTRATIO'`,对每一个数据点:\n", 131 | "- `'RM'` 是该地区中每个房屋的平均房间数量;\n", 132 | "- `'LSTAT'` 是指该地区有多少百分比的房东属于是低收入阶层(有工作但收入微薄);\n", 133 | "- `'PTRATIO'` 是该地区的中学和小学里,学生和老师的数目比(`学生/老师`)。\n", 134 | "\n", 135 | "_凭直觉,上述三个特征中对每一个来说,你认为增大该特征的数值,`'MEDV'`的值会是**增大**还是**减小**呢?每一个答案都需要你给出理由。_\n", 136 | "\n", 137 | "**提示:**你预期一个`'RM'` 值是6的房屋跟`'RM'` 值是7的房屋相比,价值更高还是更低呢?" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "**回答: **" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## 建模\n", 152 | "在项目的第二部分中,你需要了解必要的工具和技巧来让你的模型进行预测。用这些工具和技巧对每一个模型的表现做精确的衡量可以极大地增强你预测的信心。" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### 练习:定义衡量标准\n", 160 | "如果不能对模型的训练和测试的表现进行量化地评估,我们就很难衡量模型的好坏。通常我们会定义一些衡量标准,这些标准可以通过对某些误差或者拟合程度的计算来得到。在这个项目中,你将通过运算[*决定系数*](http://stattrek.com/statistics/dictionary.aspx?definition=coefficient_of_determination)R2 来量化模型的表现。模型的决定系数是回归分析中十分常用的统计信息,经常被当作衡量模型预测能力好坏的标准。\n", 161 | "\n", 162 | "R2的数值范围从0至1,表示**目标变量**的预测值和实际值之间的相关程度平方的百分比。一个模型的R2 值为0说明它完全无法预测目标变量;而一个R2 值为1的模型则可以对目标变量进行完美的预测。从0至1之间的数值,则表示该模型中目标变量中有百分之多少能够用**特征**来解释。_模型也可能出现负值的R2,这种情况下模型所做预测还不如直接计算目标变量的平均值。\n", 163 | "\n", 164 | "在下方代码的 `performance_metric` 函数中,你要实现:\n", 165 | "- 使用 `sklearn.metrics` 中的 `r2_score` 来计算 `y_true` 和 `y_predict`的R2值,作为对其表现的评判。\n", 166 | "- 将他们的表现评分储存到`score`变量中。" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "# TODO: Import 'r2_score'\n", 178 | "\n", 179 | "def performance_metric(y_true, y_predict):\n", 180 | " \"\"\" Calculates and returns the performance score between \n", 181 | " true and predicted values based on the metric chosen. 
\"\"\"\n", 182 | " \n", 183 | " # TODO: Calculate the performance score between 'y_true' and 'y_predict'\n", 184 | " score = None\n", 185 | " \n", 186 | " # Return the score\n", 187 | " return score" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### 问题2 - 拟合程度\n", 195 | "\n", 196 | "假设一个数据集有五个数据且一个模型做出下列目标变量的预测:\n", 197 | "\n", 198 | "| 真实数值 | 预测数值 |\n", 199 | "| :-------------: | :--------: |\n", 200 | "| 3.0 | 2.5 |\n", 201 | "| -0.5 | 0.0 |\n", 202 | "| 2.0 | 2.1 |\n", 203 | "| 7.0 | 7.8 |\n", 204 | "| 4.2 | 5.3 |\n", 205 | "*你会觉得这个模型已成功地描述了目标变量的变化吗?如果成功,请解释为什么,如果没有,也请给出原因。* \n", 206 | "\n", 207 | "运行下方的代码,使用`performance_metric`函数来计算模型的决定系数。" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "# Calculate the performance of this model\n", 219 | "score = performance_metric([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])\n", 220 | "print \"Model has a coefficient of determination, R^2, of {:.3f}.\".format(score)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "**回答:**" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### 练习: 数据分割与重排\n", 235 | "接下来,你需要把波士顿房屋数据集分成训练和测试两个子集。通常在这个过程中,数据也会被重新排序,以消除数据集中由于排序而产生的偏差。\n", 236 | "在下面的代码中,你需要:\n", 237 | "- 使用 `sklearn.cross_validation` 中的 `train_test_split`, 将`features`和`prices`的数据都分成用于训练的数据子集和用于测试的数据子集。\n", 238 | " - 分割比例为:80%的数据用于训练,20%用于测试;\n", 239 | " - 选定一个数值以设定 `train_test_split` 中的 `random_state` ,这会确保结果的一致性;\n", 240 | "- 最终分离出的子集为`X_train`,`X_test`,`y_train`,和`y_test`。" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# TODO: Import 'train_test_split'\n", 252 | "\n", 253 | "\n", 254 | "# TODO: 
Shuffle and split the data into training and testing subsets\n", 255 | "X_train, X_test, y_train, y_test = (None, None, None, None)\n", 256 | "\n", 257 | "# Success\n", 258 | "print \"Training and testing split was successful.\"" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### 问题 3- 训练及测试\n", 266 | "*将数据集按一定比例分为训练用的数据集和测试用的数据集对学习算法有什么好处?*\n", 267 | "\n", 268 | "**提示:** 如果没有数据来对模型进行测试,会出现什么问题?" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "**答案: **" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "----\n", 283 | "\n", 284 | "## 分析模型的表现\n", 285 | "在项目的第三部分,我们来看一下几个模型针对不同的数据集在学习和测试上的表现。另外,你需要专注于一个特定的算法,用全部训练集训练时,提高它的`'max_depth'` 参数,观察这一参数的变化如何影响模型的表现。把你模型的表现画出来对于分析过程十分有益。可视化可以让我们看到一些单看结果看不到的行为。" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "### 学习曲线\n", 293 | "下方区域内的代码会输出四幅图像,它们是一个决策树模型在不同最大深度下的表现。每一条曲线都直观的显示了随着训练数据量的增加,模型学习曲线的训练评分和测试评分的变化。注意,曲线的阴影区域代表的是该曲线的不确定性(用标准差衡量)。这个模型的训练和测试部分都使用决定系数R2来评分。\n", 294 | "\n", 295 | "运行下方区域中的代码,并利用输出的图形回答下面的问题。" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false, 303 | "scrolled": false 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "# Produce learning curves for varying training set sizes and maximum depths\n", 308 | "vs.ModelLearning(features, prices)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### 问题 4 - 学习数据\n", 316 | "*选择上述图像中的其中一个,并给出其最大深度。随着训练数据量的增加,训练曲线的评分有怎样的变化?测试曲线呢?如果有更多的训练数据,是否能有效提升模型的表现呢?*\n", 317 | "**提示:**学习曲线的评分是否最终会收敛到特定的值?" 
318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "**答案: **" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### 复杂度曲线\n", 332 | "下列代码内的区域会输出一幅图像,它展示了一个已经经过训练和验证的决策树模型在不同最大深度条件下的表现。这个图形将包含两条曲线,一个是训练的变化,一个是测试的变化。跟**学习曲线**相似,阴影区域代表该曲线的不确定性,模型训练和测试部分的评分都用的 `performance_metric` 函数。\n", 333 | "\n", 334 | "运行下方区域中的代码,并利用输出的图形并回答下面的两个问题。" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "vs.ModelComplexity(X_train, y_train)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "### 问题 5- 偏差与方差之间的权衡取舍\n", 353 | "*当模型以最大深度 1训练时,模型的预测是出现很大的偏差还是出现了很大的方差?当模型以最大深度10训练时,情形又如何呢?图形中的哪些特征能够支持你的结论?*\n", 354 | " \n", 355 | "**提示:** 你如何得知模型是否出现了偏差很大或者方差很大的问题?" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "**答案: **" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "### 问题 6- 最优模型的猜测\n", 370 | "*你认为最大深度是多少的模型能够最好地对未见过的数据进行预测?为什么你会得出了这个答案?*" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "**答案: **" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "-----\n", 385 | "\n", 386 | "## 评价模型表现\n", 387 | "在这个项目的最后,你将自己建立模型,并使用最优化的`fit_model`函数,基于客户房子的特征来预测该房屋的价值。" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "### 问题 7- 网格搜索(Grid Search)\n", 395 | "*什么是网格搜索法?如何用它来优化学习算法?*\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "**回答: **" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### 问题 8- 交叉验证\n", 410 | "*什么是K折交叉验证法(k-fold 
cross-validation)?优化模型时,使用这种方法对网格搜索有什么好处?* \n", 411 | "\n", 412 | "**提示:** 跟为何需要一组训练集的原因差不多,网格搜索时如果不使用交叉验证会有什么问题?" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "**答案: **" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### 练习:训练模型\n", 427 | "在最后一个练习中,你将需要将所学到的内容整合,使用**决策树演算法**训练一个模型。为了保证你得出的是一个最优模型,你需要使用网格搜索法训练模型,以找到最佳的 `'max_depth'` 参数。你可以把`'max_depth'` 参数理解为决策树算法在做出预测前,允许其对数据提出问题的数量。决策树是**监督学习算法**中的一种。\n", 428 | "\n", 429 | "在下方 `fit_model` 函数中,你需要做的是:\n", 430 | "- 使用 `sklearn.tree` 中的 [`DecisionTreeRegressor`](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html) 创建一个决策树的回归函数;\n", 431 | " - 将这个回归函数储存到 `'regressor'` 变量中;\n", 432 | "- 为 `'max_depth'` 创造一个字典,它的值是从1至10的数组,并储存到 `'params'` 变量中;\n", 433 | "- 使用 `sklearn.metrics` 中的 [`make_scorer`](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html) 创建一个评分函数;\n", 434 | " - 将 `performance_metric` 作为参数传至这个函数中;\n", 435 | " - 将评分函数储存到 `'scoring_fnc'` 变量中;\n", 436 | "- 使用 `sklearn.grid_search` 中的 [`GridSearchCV`](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) 创建一个网格搜索对象;\n", 437 | " - 将变量`'regressor'`, `'params'`, `'scoring_fnc'`, 和 `'cv_sets'` 作为参数传至这个对象中;\n", 438 | " - 将 `GridSearchCV` 存到 `'grid'` 变量中。" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "# TODO: Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'\n", 450 | "\n", 451 | "\n", 452 | "def fit_model(X, y):\n", 453 | " \"\"\" Performs grid search over the 'max_depth' parameter for a \n", 454 | " decision tree regressor trained on the input data [X, y]. 
\"\"\"\n", 455 | " \n", 456 | " # Create cross-validation sets from the training data\n", 457 | " cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)\n", 458 | "\n", 459 | " # TODO: Create a decision tree regressor object\n", 460 | " regressor = None\n", 461 | "\n", 462 | " # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10\n", 463 | " params = {}\n", 464 | "\n", 465 | " # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' \n", 466 | " scoring_fnc = None\n", 467 | "\n", 468 | " # TODO: Create the grid search object\n", 469 | " grid = None\n", 470 | "\n", 471 | " # Fit the grid search object to the data to compute the optimal model\n", 472 | " grid = grid.fit(X, y)\n", 473 | "\n", 474 | " # Return the optimal model after fitting the data\n", 475 | " return grid.best_estimator_" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "### 做出预测\n", 483 | "当我们用数据训练出一个模型,它现在就可用于对新的数据进行预测。在决策树回归函数中,模型已经学会对新输入的数据*提问*,并返回对**目标变量**的预测值。你可以用这个预测来获取数据未知目标变量的信息,这些数据必须是不包含在训练数据之内的。" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### 问题 9- 最优模型\n", 491 | "*最优模型的最大深度(maximum depth)是多少?此答案与你在**问题 6**所做的猜测是否相同?*\n", 492 | "\n", 493 | "运行下方区域内的代码,将决策树回归函数代入训练数据的集合,以得到最优化的模型。" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "collapsed": false, 501 | "scrolled": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "# Fit the training data to the model using grid search\n", 506 | "reg = fit_model(X_train, y_train)\n", 507 | "\n", 508 | "# Produce the value for 'max_depth'\n", 509 | "print \"Parameter 'max_depth' is {} for the optimal model.\".format(reg.get_params()['max_depth'])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "**Answer: **" 517 | ] 518 | }, 519 | { 520 | 
"cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "### 问题 10 - 预测销售价格\n", 524 | "想像你是一个在波士顿地区的房屋经纪人,并期待使用此模型以帮助你的客户评估他们想出售的房屋。你已经从你的三个客户收集到以下的资讯:\n", 525 | "\n", 526 | "| 特征 | 客戶 1 | 客戶 2 | 客戶 3 |\n", 527 | "| :---: | :---: | :---: | :---: |\n", 528 | "| 房屋内房间总数 | 5 间房间 | 4 间房间 | 8 间房间 |\n", 529 | "| 社区贫困指数(%被认为是贫困阶层) | 17% | 32% | 3% |\n", 530 | "| 邻近学校的学生-老师比例 | 15:1 | 22:1 | 12:1 |\n", 531 | "\n", 532 | "*你会建议每位客户的房屋销售的价格为多少?从房屋特征的数值判断,这样的价格合理吗?* \n", 533 | "\n", 534 | "**提示:**用你在**分析数据**部分计算出来的统计信息来帮助你证明你的答案。\n", 535 | "\n", 536 | "运行下列的代码区域,使用你优化的模型来为每位客户的房屋价值做出预测。" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": { 543 | "collapsed": false 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "# Produce a matrix for client data\n", 548 | "client_data = [[5, 17, 15], # Client 1\n", 549 | " [4, 32, 22], # Client 2\n", 550 | " [8, 3, 12]] # Client 3\n", 551 | "\n", 552 | "# Show predictions\n", 553 | "for i, price in enumerate(reg.predict(client_data)):\n", 554 | " print \"Predicted selling price for Client {}'s home: ${:,.2f}\".format(i+1, price)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "**答案: **" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "### 敏感度\n", 569 | "\n", 570 | "一个最优的模型不一定是一个健壮模型。有的时候模型会过于复杂或者过于简单,以致于难以泛化新增添的数据;有的时候模型采用的学习算法并不适用于特定的数据结构;有的时候样本本身可能有太多噪点或样本过少,使得模型无法准确地预测目标变量。这些情况下我们会说模型是欠拟合的。执行下方区域中的代码,采用不同的训练和测试集执行 `fit_model` 函数10次。注意观察对一个特定的客户来说,预测是如何随训练数据的变化而变化的。" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "vs.PredictTrials(features, prices, fit_model, client_data)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "### 问题 11 - 实用性探讨\n", 589 | "*简单地讨论一下你建构的模型能否在现实世界中使用?* 
\n", 590 | "\n", 591 | "**提示:** 回答几个问题:\n", 592 | "- *1978年所采集的数据,在今天是否仍然适用?*\n", 593 | "- *数据中呈现的特征是否足够描述一个房屋?*\n", 594 | "- *模型是否足够健壮来保证预测的一致性?*\n", 595 | "- *在波士顿这样的大都市采集的数据,能否应用在其它乡镇地区?*" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "**答案: **" 603 | ] 604 | } 605 | ], 606 | "metadata": { 607 | "kernelspec": { 608 | "display_name": "Python 2", 609 | "language": "python", 610 | "name": "python2" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 2 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython2", 622 | "version": "2.7.11" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 0 627 | } 628 | -------------------------------------------------------------------------------- /projects_cn/boston_housing/housing.csv: -------------------------------------------------------------------------------- 1 | RM,LSTAT,PTRATIO,MEDV 2 | 6.575,4.98,15.3,504000.0 3 | 6.421,9.14,17.8,453600.0 4 | 7.185,4.03,17.8,728700.0 5 | 6.998,2.94,18.7,701400.0 6 | 7.147,5.33,18.7,760200.0 7 | 6.43,5.21,18.7,602700.0 8 | 6.012,12.43,15.2,480900.0 9 | 6.172,19.15,15.2,569100.0 10 | 5.631,29.93,15.2,346500.0 11 | 6.004,17.1,15.2,396900.0 12 | 6.377,20.45,15.2,315000.0 13 | 6.009,13.27,15.2,396900.0 14 | 5.889,15.71,15.2,455700.0 15 | 5.949,8.26,21.0,428400.0 16 | 6.096,10.26,21.0,382200.0 17 | 5.834,8.47,21.0,417900.0 18 | 5.935,6.58,21.0,485100.0 19 | 5.99,14.67,21.0,367500.0 20 | 5.456,11.69,21.0,424200.0 21 | 5.727,11.28,21.0,382200.0 22 | 5.57,21.02,21.0,285600.0 23 | 5.965,13.83,21.0,411600.0 24 | 6.142,18.72,21.0,319200.0 25 | 5.813,19.88,21.0,304500.0 26 | 5.924,16.3,21.0,327600.0 27 | 5.599,16.51,21.0,291900.0 28 | 5.813,14.81,21.0,348600.0 29 | 6.047,17.28,21.0,310800.0 30 | 6.495,12.8,21.0,386400.0 31 | 6.674,11.98,21.0,441000.0 32 | 
5.713,22.6,21.0,266700.0 33 | 6.072,13.04,21.0,304500.0 34 | 5.95,27.71,21.0,277200.0 35 | 5.701,18.35,21.0,275100.0 36 | 6.096,20.34,21.0,283500.0 37 | 5.933,9.68,19.2,396900.0 38 | 5.841,11.41,19.2,420000.0 39 | 5.85,8.77,19.2,441000.0 40 | 5.966,10.13,19.2,518700.0 41 | 6.595,4.32,18.3,646800.0 42 | 7.024,1.98,18.3,732900.0 43 | 6.77,4.84,17.9,558600.0 44 | 6.169,5.81,17.9,531300.0 45 | 6.211,7.44,17.9,518700.0 46 | 6.069,9.55,17.9,445200.0 47 | 5.682,10.21,17.9,405300.0 48 | 5.786,14.15,17.9,420000.0 49 | 6.03,18.8,17.9,348600.0 50 | 5.399,30.81,17.9,302400.0 51 | 5.602,16.2,17.9,407400.0 52 | 5.963,13.45,16.8,413700.0 53 | 6.115,9.43,16.8,430500.0 54 | 6.511,5.28,16.8,525000.0 55 | 5.998,8.43,16.8,491400.0 56 | 5.888,14.8,21.1,396900.0 57 | 7.249,4.81,17.9,743400.0 58 | 6.383,5.77,17.3,518700.0 59 | 6.816,3.95,15.1,663600.0 60 | 6.145,6.86,19.7,489300.0 61 | 5.927,9.22,19.7,411600.0 62 | 5.741,13.15,19.7,392700.0 63 | 5.966,14.44,19.7,336000.0 64 | 6.456,6.73,19.7,466200.0 65 | 6.762,9.5,19.7,525000.0 66 | 7.104,8.05,18.6,693000.0 67 | 6.29,4.67,16.1,493500.0 68 | 5.787,10.24,16.1,407400.0 69 | 5.878,8.1,18.9,462000.0 70 | 5.594,13.09,18.9,365400.0 71 | 5.885,8.79,18.9,438900.0 72 | 6.417,6.72,19.2,508200.0 73 | 5.961,9.88,19.2,455700.0 74 | 6.065,5.52,19.2,478800.0 75 | 6.245,7.54,19.2,491400.0 76 | 6.273,6.78,18.7,506100.0 77 | 6.286,8.94,18.7,449400.0 78 | 6.279,11.97,18.7,420000.0 79 | 6.14,10.27,18.7,436800.0 80 | 6.232,12.34,18.7,445200.0 81 | 5.874,9.1,18.7,426300.0 82 | 6.727,5.29,19.0,588000.0 83 | 6.619,7.22,19.0,501900.0 84 | 6.302,6.72,19.0,520800.0 85 | 6.167,7.51,19.0,480900.0 86 | 6.389,9.62,18.5,501900.0 87 | 6.63,6.53,18.5,558600.0 88 | 6.015,12.86,18.5,472500.0 89 | 6.121,8.44,18.5,466200.0 90 | 7.007,5.5,17.8,495600.0 91 | 7.079,5.7,17.8,602700.0 92 | 6.417,8.81,17.8,474600.0 93 | 6.405,8.2,17.8,462000.0 94 | 6.442,8.16,18.2,480900.0 95 | 6.211,6.21,18.2,525000.0 96 | 6.249,10.59,18.2,432600.0 97 | 6.625,6.65,18.0,596400.0 98 | 
6.163,11.34,18.0,449400.0 99 | 8.069,4.21,18.0,812700.0 100 | 7.82,3.57,18.0,919800.0 101 | 7.416,6.19,18.0,697200.0 102 | 6.727,9.42,20.9,577500.0 103 | 6.781,7.67,20.9,556500.0 104 | 6.405,10.63,20.9,390600.0 105 | 6.137,13.44,20.9,405300.0 106 | 6.167,12.33,20.9,422100.0 107 | 5.851,16.47,20.9,409500.0 108 | 5.836,18.66,20.9,409500.0 109 | 6.127,14.09,20.9,428400.0 110 | 6.474,12.27,20.9,415800.0 111 | 6.229,15.55,20.9,407400.0 112 | 6.195,13.0,20.9,455700.0 113 | 6.715,10.16,17.8,478800.0 114 | 5.913,16.21,17.8,394800.0 115 | 6.092,17.09,17.8,392700.0 116 | 6.254,10.45,17.8,388500.0 117 | 5.928,15.76,17.8,384300.0 118 | 6.176,12.04,17.8,445200.0 119 | 6.021,10.3,17.8,403200.0 120 | 5.872,15.37,17.8,428400.0 121 | 5.731,13.61,17.8,405300.0 122 | 5.87,14.37,19.1,462000.0 123 | 6.004,14.27,19.1,426300.0 124 | 5.961,17.93,19.1,430500.0 125 | 5.856,25.41,19.1,363300.0 126 | 5.879,17.58,19.1,394800.0 127 | 5.986,14.81,19.1,449400.0 128 | 5.613,27.26,19.1,329700.0 129 | 5.693,17.19,21.2,340200.0 130 | 6.431,15.39,21.2,378000.0 131 | 5.637,18.34,21.2,300300.0 132 | 6.458,12.6,21.2,403200.0 133 | 6.326,12.26,21.2,411600.0 134 | 6.372,11.12,21.2,483000.0 135 | 5.822,15.03,21.2,386400.0 136 | 5.757,17.31,21.2,327600.0 137 | 6.335,16.96,21.2,380100.0 138 | 5.942,16.9,21.2,365400.0 139 | 6.454,14.59,21.2,359100.0 140 | 5.857,21.32,21.2,279300.0 141 | 6.151,18.46,21.2,373800.0 142 | 6.174,24.16,21.2,294000.0 143 | 5.019,34.41,21.2,302400.0 144 | 5.403,26.82,14.7,281400.0 145 | 5.468,26.42,14.7,327600.0 146 | 4.903,29.29,14.7,247800.0 147 | 6.13,27.8,14.7,289800.0 148 | 5.628,16.65,14.7,327600.0 149 | 4.926,29.53,14.7,306600.0 150 | 5.186,28.32,14.7,373800.0 151 | 5.597,21.45,14.7,323400.0 152 | 6.122,14.1,14.7,451500.0 153 | 5.404,13.28,14.7,411600.0 154 | 5.012,12.12,14.7,321300.0 155 | 5.709,15.79,14.7,407400.0 156 | 6.129,15.12,14.7,357000.0 157 | 6.152,15.02,14.7,327600.0 158 | 5.272,16.14,14.7,275100.0 159 | 6.943,4.59,14.7,867300.0 160 | 6.066,6.43,14.7,510300.0 161 | 
6.51,7.39,14.7,489300.0 162 | 6.25,5.5,14.7,567000.0 163 | 5.854,11.64,14.7,476700.0 164 | 6.101,9.81,14.7,525000.0 165 | 5.877,12.14,14.7,499800.0 166 | 6.319,11.1,14.7,499800.0 167 | 6.402,11.32,14.7,468300.0 168 | 5.875,14.43,14.7,365400.0 169 | 5.88,12.03,14.7,401100.0 170 | 5.572,14.69,16.6,485100.0 171 | 6.416,9.04,16.6,495600.0 172 | 5.859,9.64,16.6,474600.0 173 | 6.546,5.33,16.6,617400.0 174 | 6.02,10.11,16.6,487200.0 175 | 6.315,6.29,16.6,516600.0 176 | 6.86,6.92,16.6,627900.0 177 | 6.98,5.04,17.8,781200.0 178 | 7.765,7.56,17.8,835800.0 179 | 6.144,9.45,17.8,760200.0 180 | 7.155,4.82,17.8,795900.0 181 | 6.563,5.68,17.8,682500.0 182 | 5.604,13.98,17.8,554400.0 183 | 6.153,13.15,17.8,621600.0 184 | 6.782,6.68,15.2,672000.0 185 | 6.556,4.56,15.2,625800.0 186 | 7.185,5.39,15.2,732900.0 187 | 6.951,5.1,15.2,777000.0 188 | 6.739,4.69,15.2,640500.0 189 | 7.178,2.87,15.2,764400.0 190 | 6.8,5.03,15.6,653100.0 191 | 6.604,4.38,15.6,611100.0 192 | 7.287,4.08,12.6,699300.0 193 | 7.107,8.61,12.6,636300.0 194 | 7.274,6.62,12.6,726600.0 195 | 6.975,4.56,17.0,732900.0 196 | 7.135,4.45,17.0,690900.0 197 | 6.162,7.43,14.7,506100.0 198 | 7.61,3.11,14.7,888300.0 199 | 7.853,3.81,14.7,1018500.0 200 | 5.891,10.87,18.6,474600.0 201 | 6.326,10.97,18.6,512400.0 202 | 5.783,18.06,18.6,472500.0 203 | 6.064,14.66,18.6,512400.0 204 | 5.344,23.09,18.6,420000.0 205 | 5.96,17.27,18.6,455700.0 206 | 5.404,23.98,18.6,405300.0 207 | 5.807,16.03,18.6,470400.0 208 | 6.375,9.38,18.6,590100.0 209 | 5.412,29.55,18.6,497700.0 210 | 6.182,9.47,18.6,525000.0 211 | 5.888,13.51,16.4,489300.0 212 | 6.642,9.69,16.4,602700.0 213 | 5.951,17.92,16.4,451500.0 214 | 6.373,10.5,16.4,483000.0 215 | 6.951,9.71,17.4,560700.0 216 | 6.164,21.46,17.4,455700.0 217 | 6.879,9.93,17.4,577500.0 218 | 6.618,7.6,17.4,632100.0 219 | 8.266,4.14,17.4,940800.0 220 | 8.04,3.13,17.4,789600.0 221 | 7.163,6.36,17.4,663600.0 222 | 7.686,3.92,17.4,980700.0 223 | 6.552,3.76,17.4,661500.0 224 | 5.981,11.65,17.4,510300.0 225 | 
7.412,5.25,17.4,665700.0 226 | 8.337,2.47,17.4,875700.0 227 | 8.247,3.95,17.4,1014300.0 228 | 6.726,8.05,17.4,609000.0 229 | 6.086,10.88,17.4,504000.0 230 | 6.631,9.54,17.4,527100.0 231 | 7.358,4.73,17.4,661500.0 232 | 6.481,6.36,16.6,497700.0 233 | 6.606,7.37,16.6,489300.0 234 | 6.897,11.38,16.6,462000.0 235 | 6.095,12.4,16.6,422100.0 236 | 6.358,11.22,16.6,466200.0 237 | 6.393,5.19,16.6,497700.0 238 | 5.593,12.5,19.1,369600.0 239 | 5.605,18.46,19.1,388500.0 240 | 6.108,9.16,19.1,510300.0 241 | 6.226,10.15,19.1,430500.0 242 | 6.433,9.52,19.1,514500.0 243 | 6.718,6.56,19.1,550200.0 244 | 6.487,5.9,19.1,512400.0 245 | 6.438,3.59,19.1,520800.0 246 | 6.957,3.53,19.1,621600.0 247 | 8.259,3.54,19.1,898800.0 248 | 6.108,6.57,16.4,459900.0 249 | 5.876,9.25,16.4,438900.0 250 | 7.454,3.11,15.9,924000.0 251 | 7.333,7.79,13.0,756000.0 252 | 6.842,6.9,13.0,632100.0 253 | 7.203,9.59,13.0,709800.0 254 | 7.52,7.26,13.0,905100.0 255 | 8.398,5.91,13.0,1024800.0 256 | 7.327,11.25,13.0,651000.0 257 | 7.206,8.1,13.0,766500.0 258 | 5.56,10.45,13.0,478800.0 259 | 7.014,14.79,13.0,644700.0 260 | 7.47,3.16,13.0,913500.0 261 | 5.92,13.65,18.6,434700.0 262 | 5.856,13.0,18.6,443100.0 263 | 6.24,6.59,18.6,529200.0 264 | 6.538,7.73,18.6,512400.0 265 | 7.691,6.58,18.6,739200.0 266 | 6.758,3.53,17.6,680400.0 267 | 6.854,2.98,17.6,672000.0 268 | 7.267,6.05,17.6,697200.0 269 | 6.826,4.16,17.6,695100.0 270 | 6.482,7.19,17.6,611100.0 271 | 6.812,4.85,14.9,737100.0 272 | 7.82,3.76,14.9,953400.0 273 | 6.968,4.59,14.9,743400.0 274 | 7.645,3.01,14.9,966000.0 275 | 7.088,7.85,15.3,676200.0 276 | 6.453,8.23,15.3,462000.0 277 | 6.23,12.93,18.2,422100.0 278 | 6.209,7.14,16.6,487200.0 279 | 6.315,7.6,16.6,468300.0 280 | 6.565,9.51,16.6,520800.0 281 | 6.861,3.33,19.2,598500.0 282 | 7.148,3.56,19.2,783300.0 283 | 6.63,4.7,19.2,585900.0 284 | 6.127,8.58,16.0,501900.0 285 | 6.009,10.4,16.0,455700.0 286 | 6.678,6.27,16.0,600600.0 287 | 6.549,7.39,16.0,569100.0 288 | 5.79,15.84,16.0,426300.0 289 | 
6.345,4.97,14.8,472500.0 290 | 7.041,4.74,14.8,609000.0 291 | 6.871,6.07,14.8,520800.0 292 | 6.59,9.5,16.1,462000.0 293 | 6.495,8.67,16.1,554400.0 294 | 6.982,4.86,16.1,695100.0 295 | 7.236,6.93,18.4,758100.0 296 | 6.616,8.93,18.4,596400.0 297 | 7.42,6.47,18.4,701400.0 298 | 6.849,7.53,18.4,592200.0 299 | 6.635,4.54,18.4,478800.0 300 | 5.972,9.97,18.4,426300.0 301 | 4.973,12.64,18.4,338100.0 302 | 6.122,5.98,18.4,464100.0 303 | 6.023,11.72,18.4,407400.0 304 | 6.266,7.9,18.4,453600.0 305 | 6.567,9.28,18.4,499800.0 306 | 5.705,11.5,18.4,340200.0 307 | 5.914,18.33,18.4,373800.0 308 | 5.782,15.94,18.4,415800.0 309 | 6.382,10.36,18.4,485100.0 310 | 6.113,12.73,18.4,441000.0 311 | 6.426,7.2,19.6,499800.0 312 | 6.376,6.87,19.6,485100.0 313 | 6.041,7.7,19.6,428400.0 314 | 5.708,11.74,19.6,388500.0 315 | 6.415,6.12,19.6,525000.0 316 | 6.431,5.08,19.6,516600.0 317 | 6.312,6.15,19.6,483000.0 318 | 6.083,12.79,19.6,466200.0 319 | 5.868,9.97,16.9,405300.0 320 | 6.333,7.34,16.9,474600.0 321 | 6.144,9.09,16.9,415800.0 322 | 5.706,12.43,16.9,359100.0 323 | 6.031,7.83,16.9,407400.0 324 | 6.316,5.68,20.2,466200.0 325 | 6.31,6.75,20.2,434700.0 326 | 6.037,8.01,20.2,443100.0 327 | 5.869,9.8,20.2,409500.0 328 | 5.895,10.56,20.2,388500.0 329 | 6.059,8.51,20.2,432600.0 330 | 5.985,9.74,20.2,399000.0 331 | 5.968,9.29,20.2,392700.0 332 | 7.241,5.49,15.5,686700.0 333 | 6.54,8.65,15.9,346500.0 334 | 6.696,7.18,17.6,501900.0 335 | 6.874,4.61,17.6,655200.0 336 | 6.014,10.53,18.8,367500.0 337 | 5.898,12.67,18.8,361200.0 338 | 6.516,6.36,17.9,485100.0 339 | 6.635,5.99,17.0,514500.0 340 | 6.939,5.89,19.7,558600.0 341 | 6.49,5.98,19.7,480900.0 342 | 6.579,5.49,18.3,506100.0 343 | 5.884,7.79,18.3,390600.0 344 | 6.728,4.5,17.0,632100.0 345 | 5.663,8.05,22.0,382200.0 346 | 5.936,5.57,22.0,432600.0 347 | 6.212,17.6,20.2,373800.0 348 | 6.395,13.27,20.2,455700.0 349 | 6.127,11.48,20.2,476700.0 350 | 6.112,12.67,20.2,474600.0 351 | 6.398,7.79,20.2,525000.0 352 | 6.251,14.19,20.2,417900.0 353 | 
5.362,10.19,20.2,436800.0 354 | 5.803,14.64,20.2,352800.0 355 | 3.561,7.12,20.2,577500.0 356 | 4.963,14.0,20.2,459900.0 357 | 3.863,13.33,20.2,485100.0 358 | 4.906,34.77,20.2,289800.0 359 | 4.138,37.97,20.2,289800.0 360 | 7.313,13.44,20.2,315000.0 361 | 6.649,23.24,20.2,291900.0 362 | 6.794,21.24,20.2,279300.0 363 | 6.38,23.69,20.2,275100.0 364 | 6.223,21.78,20.2,214200.0 365 | 6.968,17.21,20.2,218400.0 366 | 6.545,21.08,20.2,228900.0 367 | 5.536,23.6,20.2,237300.0 368 | 5.52,24.56,20.2,258300.0 369 | 4.368,30.63,20.2,184800.0 370 | 5.277,30.81,20.2,151200.0 371 | 4.652,28.28,20.2,220500.0 372 | 5.0,31.99,20.2,155400.0 373 | 4.88,30.62,20.2,214200.0 374 | 5.39,20.85,20.2,241500.0 375 | 5.713,17.11,20.2,317100.0 376 | 6.051,18.76,20.2,487200.0 377 | 5.036,25.68,20.2,203700.0 378 | 6.193,15.17,20.2,289800.0 379 | 5.887,16.35,20.2,266700.0 380 | 6.471,17.12,20.2,275100.0 381 | 6.405,19.37,20.2,262500.0 382 | 5.747,19.92,20.2,178500.0 383 | 5.453,30.59,20.2,105000.0 384 | 5.852,29.97,20.2,132300.0 385 | 5.987,26.77,20.2,117600.0 386 | 6.343,20.32,20.2,151200.0 387 | 6.404,20.31,20.2,254100.0 388 | 5.349,19.77,20.2,174300.0 389 | 5.531,27.38,20.2,178500.0 390 | 5.683,22.98,20.2,105000.0 391 | 4.138,23.34,20.2,249900.0 392 | 5.608,12.13,20.2,585900.0 393 | 5.617,26.4,20.2,361200.0 394 | 6.852,19.78,20.2,577500.0 395 | 5.757,10.11,20.2,315000.0 396 | 6.657,21.22,20.2,361200.0 397 | 4.628,34.37,20.2,375900.0 398 | 5.155,20.08,20.2,342300.0 399 | 4.519,36.98,20.2,147000.0 400 | 6.434,29.05,20.2,151200.0 401 | 6.782,25.79,20.2,157500.0 402 | 5.304,26.64,20.2,218400.0 403 | 5.957,20.62,20.2,184800.0 404 | 6.824,22.74,20.2,176400.0 405 | 6.411,15.02,20.2,350700.0 406 | 6.006,15.7,20.2,298200.0 407 | 5.648,14.1,20.2,436800.0 408 | 6.103,23.29,20.2,281400.0 409 | 5.565,17.16,20.2,245700.0 410 | 5.896,24.39,20.2,174300.0 411 | 5.837,15.69,20.2,214200.0 412 | 6.202,14.52,20.2,228900.0 413 | 6.193,21.52,20.2,231000.0 414 | 6.38,24.08,20.2,199500.0 415 | 6.348,17.64,20.2,304500.0 
416 | 6.833,19.69,20.2,296100.0 417 | 6.425,12.03,20.2,338100.0 418 | 6.436,16.22,20.2,300300.0 419 | 6.208,15.17,20.2,245700.0 420 | 6.629,23.27,20.2,281400.0 421 | 6.461,18.05,20.2,201600.0 422 | 6.152,26.45,20.2,182700.0 423 | 5.935,34.02,20.2,176400.0 424 | 5.627,22.88,20.2,268800.0 425 | 5.818,22.11,20.2,220500.0 426 | 6.406,19.52,20.2,359100.0 427 | 6.219,16.59,20.2,386400.0 428 | 6.485,18.85,20.2,323400.0 429 | 5.854,23.79,20.2,226800.0 430 | 6.459,23.98,20.2,247800.0 431 | 6.341,17.79,20.2,312900.0 432 | 6.251,16.44,20.2,264600.0 433 | 6.185,18.13,20.2,296100.0 434 | 6.417,19.31,20.2,273000.0 435 | 6.749,17.44,20.2,281400.0 436 | 6.655,17.73,20.2,319200.0 437 | 6.297,17.27,20.2,338100.0 438 | 7.393,16.74,20.2,373800.0 439 | 6.728,18.71,20.2,312900.0 440 | 6.525,18.13,20.2,296100.0 441 | 5.976,19.01,20.2,266700.0 442 | 5.936,16.94,20.2,283500.0 443 | 6.301,16.23,20.2,312900.0 444 | 6.081,14.7,20.2,420000.0 445 | 6.701,16.42,20.2,344400.0 446 | 6.376,14.65,20.2,371700.0 447 | 6.317,13.99,20.2,409500.0 448 | 6.513,10.29,20.2,424200.0 449 | 6.209,13.22,20.2,449400.0 450 | 5.759,14.13,20.2,417900.0 451 | 5.952,17.15,20.2,399000.0 452 | 6.003,21.32,20.2,401100.0 453 | 5.926,18.13,20.2,401100.0 454 | 5.713,14.76,20.2,422100.0 455 | 6.167,16.29,20.2,417900.0 456 | 6.229,12.87,20.2,411600.0 457 | 6.437,14.36,20.2,487200.0 458 | 6.98,11.66,20.2,625800.0 459 | 5.427,18.14,20.2,289800.0 460 | 6.162,24.1,20.2,279300.0 461 | 6.484,18.68,20.2,350700.0 462 | 5.304,24.91,20.2,252000.0 463 | 6.185,18.03,20.2,306600.0 464 | 6.229,13.11,20.2,449400.0 465 | 6.242,10.74,20.2,483000.0 466 | 6.75,7.74,20.2,497700.0 467 | 7.061,7.01,20.2,525000.0 468 | 5.762,10.42,20.2,457800.0 469 | 5.871,13.34,20.2,432600.0 470 | 6.312,10.58,20.2,445200.0 471 | 6.114,14.98,20.2,401100.0 472 | 5.905,11.45,20.2,432600.0 473 | 5.454,18.06,20.1,319200.0 474 | 5.414,23.97,20.1,147000.0 475 | 5.093,29.68,20.1,170100.0 476 | 5.983,18.07,20.1,285600.0 477 | 5.983,13.35,20.1,422100.0 478 | 
5.707,12.01,19.2,457800.0 479 | 5.926,13.59,19.2,514500.0 480 | 5.67,17.6,19.2,485100.0 481 | 5.39,21.14,19.2,413700.0 482 | 5.794,14.1,19.2,384300.0 483 | 6.019,12.92,19.2,445200.0 484 | 5.569,15.1,19.2,367500.0 485 | 6.027,14.33,19.2,352800.0 486 | 6.593,9.67,21.0,470400.0 487 | 6.12,9.08,21.0,432600.0 488 | 6.976,5.64,21.0,501900.0 489 | 6.794,6.48,21.0,462000.0 490 | 6.03,7.88,21.0,249900.0 491 | -------------------------------------------------------------------------------- /projects_cn/boston_housing/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | ########################################### 7 | 8 | import matplotlib.pyplot as pl 9 | import numpy as np 10 | import sklearn.learning_curve as curves 11 | from sklearn.tree import DecisionTreeRegressor 12 | from sklearn.cross_validation import ShuffleSplit, train_test_split 13 | 14 | def ModelLearning(X, y): 15 | """ Calculates the performance of several models with varying sizes of training data. 16 | The learning and testing scores for each model are then plotted. 
""" 17 | 18 | # Create 10 cross-validation sets for training and testing 19 | cv = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.2, random_state = 0) 20 | 21 | # Generate the training set sizes increasing by 50 22 | train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int) 23 | 24 | # Create the figure window 25 | fig = pl.figure(figsize=(10,7)) 26 | 27 | # Create three different models based on max_depth 28 | for k, depth in enumerate([1,3,6,10]): 29 | 30 | # Create a Decision tree regressor at max_depth = depth 31 | regressor = DecisionTreeRegressor(max_depth = depth) 32 | 33 | # Calculate the training and testing scores 34 | sizes, train_scores, test_scores = curves.learning_curve(regressor, X, y, \ 35 | cv = cv, train_sizes = train_sizes, scoring = 'r2') 36 | 37 | # Find the mean and standard deviation for smoothing 38 | train_std = np.std(train_scores, axis = 1) 39 | train_mean = np.mean(train_scores, axis = 1) 40 | test_std = np.std(test_scores, axis = 1) 41 | test_mean = np.mean(test_scores, axis = 1) 42 | 43 | # Subplot the learning curve 44 | ax = fig.add_subplot(2, 2, k+1) 45 | ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score') 46 | ax.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score') 47 | ax.fill_between(sizes, train_mean - train_std, \ 48 | train_mean + train_std, alpha = 0.15, color = 'r') 49 | ax.fill_between(sizes, test_mean - test_std, \ 50 | test_mean + test_std, alpha = 0.15, color = 'g') 51 | 52 | # Labels 53 | ax.set_title('max_depth = %s'%(depth)) 54 | ax.set_xlabel('Number of Training Points') 55 | ax.set_ylabel('Score') 56 | ax.set_xlim([0, X.shape[0]*0.8]) 57 | ax.set_ylim([-0.05, 1.05]) 58 | 59 | # Visual aesthetics 60 | ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.) 
61 | fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03) 62 | fig.tight_layout() 63 | fig.show() 64 | 65 | 66 | def ModelComplexity(X, y): 67 | """ Calculates the performance of the model as model complexity increases. 68 | The learning and testing errors rates are then plotted. """ 69 | 70 | # Create 10 cross-validation sets for training and testing 71 | cv = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.2, random_state = 0) 72 | 73 | # Vary the max_depth parameter from 1 to 10 74 | max_depth = np.arange(1,11) 75 | 76 | # Calculate the training and testing scores 77 | train_scores, test_scores = curves.validation_curve(DecisionTreeRegressor(), X, y, \ 78 | param_name = "max_depth", param_range = max_depth, cv = cv, scoring = 'r2') 79 | 80 | # Find the mean and standard deviation for smoothing 81 | train_mean = np.mean(train_scores, axis=1) 82 | train_std = np.std(train_scores, axis=1) 83 | test_mean = np.mean(test_scores, axis=1) 84 | test_std = np.std(test_scores, axis=1) 85 | 86 | # Plot the validation curve 87 | pl.figure(figsize=(7, 5)) 88 | pl.title('Decision Tree Regressor Complexity Performance') 89 | pl.plot(max_depth, train_mean, 'o-', color = 'r', label = 'Training Score') 90 | pl.plot(max_depth, test_mean, 'o-', color = 'g', label = 'Validation Score') 91 | pl.fill_between(max_depth, train_mean - train_std, \ 92 | train_mean + train_std, alpha = 0.15, color = 'r') 93 | pl.fill_between(max_depth, test_mean - test_std, \ 94 | test_mean + test_std, alpha = 0.15, color = 'g') 95 | 96 | # Visual aesthetics 97 | pl.legend(loc = 'lower right') 98 | pl.xlabel('Maximum Depth') 99 | pl.ylabel('Score') 100 | pl.ylim([-0.05,1.05]) 101 | pl.show() 102 | 103 | 104 | def PredictTrials(X, y, fitter, data): 105 | """ Performs trials of fitting and predicting data. 
""" 106 | 107 | # Store the predicted prices 108 | prices = [] 109 | 110 | for k in range(10): 111 | # Split the data 112 | X_train, X_test, y_train, y_test = train_test_split(X, y, \ 113 | test_size = 0.2, random_state = k) 114 | 115 | # Fit the data 116 | reg = fitter(X_train, y_train) 117 | 118 | # Make a prediction 119 | pred = reg.predict([data[0]])[0] 120 | prices.append(pred) 121 | 122 | # Result 123 | print "Trial {}: ${:,.2f}".format(k+1, pred) 124 | 125 | # Display price range 126 | print "\nRange in prices: ${:,.2f}".format(max(prices) - min(prices)) -------------------------------------------------------------------------------- /projects_cn/creating_customer_segments/README.md: -------------------------------------------------------------------------------- 1 | # 项目 3: 非监督学习 2 | ## 创建用户细分 3 | 4 | ### 安装 5 | 6 | 这个项目要求使用 **Python 2.7** 并且需要安装下面这些python包: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [Pandas](http://pandas.pydata.org) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | 12 | 你同样需要安装好相应软件使之能够运行[Jupyter Notebook](http://jupyter.org/)。 13 | 14 | 优达学城推荐学生安装 [Anaconda](https://www.continuum.io/downloads), 这是一个已经打包好的python发行版,它包含了我们这个项目需要的所有的库和软件。 15 | 16 | ### 代码 17 | 18 | 初始代码包含在 `customer_segments.ipynb` 这个notebook文件中。这里面有一些代码已经实现好来帮助你开始项目,但是为了完成项目,你还需要实现附加的功能。 19 | 20 | ### 运行 21 | 22 | 在命令行中,确保当前目录为 `customer_segments.ipynb` 文件夹的最顶层(目录包含本 README 文件),运行下列命令: 23 | 24 | ```jupyter notebook customer_segments.ipynb``` 25 | 26 | ​这会启动 Jupyter Notebook 并把项目文件打开在你的浏览器中。 27 | 28 | ## 数据 29 | 30 | ​这个项目的数据包含在 `customers.csv` 文件中。你能在[UCI 机器学习信息库](https://archive.ics.uci.edu/ml/datasets/Wholesale+customers)页面中找到更多信息。 31 | -------------------------------------------------------------------------------- /projects_cn/creating_customer_segments/customers.csv: -------------------------------------------------------------------------------- 1 | Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen 2 | 
2,3,12669,9656,7561,214,2674,1338 3 | 2,3,7057,9810,9568,1762,3293,1776 4 | 2,3,6353,8808,7684,2405,3516,7844 5 | 1,3,13265,1196,4221,6404,507,1788 6 | 2,3,22615,5410,7198,3915,1777,5185 7 | 2,3,9413,8259,5126,666,1795,1451 8 | 2,3,12126,3199,6975,480,3140,545 9 | 2,3,7579,4956,9426,1669,3321,2566 10 | 1,3,5963,3648,6192,425,1716,750 11 | 2,3,6006,11093,18881,1159,7425,2098 12 | 2,3,3366,5403,12974,4400,5977,1744 13 | 2,3,13146,1124,4523,1420,549,497 14 | 2,3,31714,12319,11757,287,3881,2931 15 | 2,3,21217,6208,14982,3095,6707,602 16 | 2,3,24653,9465,12091,294,5058,2168 17 | 1,3,10253,1114,3821,397,964,412 18 | 2,3,1020,8816,12121,134,4508,1080 19 | 1,3,5876,6157,2933,839,370,4478 20 | 2,3,18601,6327,10099,2205,2767,3181 21 | 1,3,7780,2495,9464,669,2518,501 22 | 2,3,17546,4519,4602,1066,2259,2124 23 | 1,3,5567,871,2010,3383,375,569 24 | 1,3,31276,1917,4469,9408,2381,4334 25 | 2,3,26373,36423,22019,5154,4337,16523 26 | 2,3,22647,9776,13792,2915,4482,5778 27 | 2,3,16165,4230,7595,201,4003,57 28 | 1,3,9898,961,2861,3151,242,833 29 | 1,3,14276,803,3045,485,100,518 30 | 2,3,4113,20484,25957,1158,8604,5206 31 | 1,3,43088,2100,2609,1200,1107,823 32 | 1,3,18815,3610,11107,1148,2134,2963 33 | 1,3,2612,4339,3133,2088,820,985 34 | 1,3,21632,1318,2886,266,918,405 35 | 1,3,29729,4786,7326,6130,361,1083 36 | 1,3,1502,1979,2262,425,483,395 37 | 2,3,688,5491,11091,833,4239,436 38 | 1,3,29955,4362,5428,1729,862,4626 39 | 2,3,15168,10556,12477,1920,6506,714 40 | 2,3,4591,15729,16709,33,6956,433 41 | 1,3,56159,555,902,10002,212,2916 42 | 1,3,24025,4332,4757,9510,1145,5864 43 | 1,3,19176,3065,5956,2033,2575,2802 44 | 2,3,10850,7555,14961,188,6899,46 45 | 2,3,630,11095,23998,787,9529,72 46 | 2,3,9670,7027,10471,541,4618,65 47 | 2,3,5181,22044,21531,1740,7353,4985 48 | 2,3,3103,14069,21955,1668,6792,1452 49 | 2,3,44466,54259,55571,7782,24171,6465 50 | 2,3,11519,6152,10868,584,5121,1476 51 | 2,3,4967,21412,28921,1798,13583,1163 52 | 1,3,6269,1095,1980,3860,609,2162 53 | 
1,3,3347,4051,6996,239,1538,301 54 | 2,3,40721,3916,5876,532,2587,1278 55 | 2,3,491,10473,11532,744,5611,224 56 | 1,3,27329,1449,1947,2436,204,1333 57 | 1,3,5264,3683,5005,1057,2024,1130 58 | 2,3,4098,29892,26866,2616,17740,1340 59 | 2,3,5417,9933,10487,38,7572,1282 60 | 1,3,13779,1970,1648,596,227,436 61 | 1,3,6137,5360,8040,129,3084,1603 62 | 2,3,8590,3045,7854,96,4095,225 63 | 2,3,35942,38369,59598,3254,26701,2017 64 | 2,3,7823,6245,6544,4154,4074,964 65 | 2,3,9396,11601,15775,2896,7677,1295 66 | 1,3,4760,1227,3250,3724,1247,1145 67 | 2,3,85,20959,45828,36,24231,1423 68 | 1,3,9,1534,7417,175,3468,27 69 | 2,3,19913,6759,13462,1256,5141,834 70 | 1,3,2446,7260,3993,5870,788,3095 71 | 1,3,8352,2820,1293,779,656,144 72 | 1,3,16705,2037,3202,10643,116,1365 73 | 1,3,18291,1266,21042,5373,4173,14472 74 | 1,3,4420,5139,2661,8872,1321,181 75 | 2,3,19899,5332,8713,8132,764,648 76 | 2,3,8190,6343,9794,1285,1901,1780 77 | 1,3,20398,1137,3,4407,3,975 78 | 1,3,717,3587,6532,7530,529,894 79 | 2,3,12205,12697,28540,869,12034,1009 80 | 1,3,10766,1175,2067,2096,301,167 81 | 1,3,1640,3259,3655,868,1202,1653 82 | 1,3,7005,829,3009,430,610,529 83 | 2,3,219,9540,14403,283,7818,156 84 | 2,3,10362,9232,11009,737,3537,2342 85 | 1,3,20874,1563,1783,2320,550,772 86 | 2,3,11867,3327,4814,1178,3837,120 87 | 2,3,16117,46197,92780,1026,40827,2944 88 | 2,3,22925,73498,32114,987,20070,903 89 | 1,3,43265,5025,8117,6312,1579,14351 90 | 1,3,7864,542,4042,9735,165,46 91 | 1,3,24904,3836,5330,3443,454,3178 92 | 1,3,11405,596,1638,3347,69,360 93 | 1,3,12754,2762,2530,8693,627,1117 94 | 2,3,9198,27472,32034,3232,18906,5130 95 | 1,3,11314,3090,2062,35009,71,2698 96 | 2,3,5626,12220,11323,206,5038,244 97 | 1,3,3,2920,6252,440,223,709 98 | 2,3,23,2616,8118,145,3874,217 99 | 1,3,403,254,610,774,54,63 100 | 1,3,503,112,778,895,56,132 101 | 1,3,9658,2182,1909,5639,215,323 102 | 2,3,11594,7779,12144,3252,8035,3029 103 | 2,3,1420,10810,16267,1593,6766,1838 104 | 2,3,2932,6459,7677,2561,4573,1386 105 | 
1,3,56082,3504,8906,18028,1480,2498 106 | 1,3,14100,2132,3445,1336,1491,548 107 | 1,3,15587,1014,3970,910,139,1378 108 | 2,3,1454,6337,10704,133,6830,1831 109 | 2,3,8797,10646,14886,2471,8969,1438 110 | 2,3,1531,8397,6981,247,2505,1236 111 | 2,3,1406,16729,28986,673,836,3 112 | 1,3,11818,1648,1694,2276,169,1647 113 | 2,3,12579,11114,17569,805,6457,1519 114 | 1,3,19046,2770,2469,8853,483,2708 115 | 1,3,14438,2295,1733,3220,585,1561 116 | 1,3,18044,1080,2000,2555,118,1266 117 | 1,3,11134,793,2988,2715,276,610 118 | 1,3,11173,2521,3355,1517,310,222 119 | 1,3,6990,3880,5380,1647,319,1160 120 | 1,3,20049,1891,2362,5343,411,933 121 | 1,3,8258,2344,2147,3896,266,635 122 | 1,3,17160,1200,3412,2417,174,1136 123 | 1,3,4020,3234,1498,2395,264,255 124 | 1,3,12212,201,245,1991,25,860 125 | 2,3,11170,10769,8814,2194,1976,143 126 | 1,3,36050,1642,2961,4787,500,1621 127 | 1,3,76237,3473,7102,16538,778,918 128 | 1,3,19219,1840,1658,8195,349,483 129 | 2,3,21465,7243,10685,880,2386,2749 130 | 1,3,140,8847,3823,142,1062,3 131 | 1,3,42312,926,1510,1718,410,1819 132 | 1,3,7149,2428,699,6316,395,911 133 | 1,3,2101,589,314,346,70,310 134 | 1,3,14903,2032,2479,576,955,328 135 | 1,3,9434,1042,1235,436,256,396 136 | 1,3,7388,1882,2174,720,47,537 137 | 1,3,6300,1289,2591,1170,199,326 138 | 1,3,4625,8579,7030,4575,2447,1542 139 | 1,3,3087,8080,8282,661,721,36 140 | 1,3,13537,4257,5034,155,249,3271 141 | 1,3,5387,4979,3343,825,637,929 142 | 1,3,17623,4280,7305,2279,960,2616 143 | 1,3,30379,13252,5189,321,51,1450 144 | 1,3,37036,7152,8253,2995,20,3 145 | 1,3,10405,1596,1096,8425,399,318 146 | 1,3,18827,3677,1988,118,516,201 147 | 2,3,22039,8384,34792,42,12591,4430 148 | 1,3,7769,1936,2177,926,73,520 149 | 1,3,9203,3373,2707,1286,1082,526 150 | 1,3,5924,584,542,4052,283,434 151 | 1,3,31812,1433,1651,800,113,1440 152 | 1,3,16225,1825,1765,853,170,1067 153 | 1,3,1289,3328,2022,531,255,1774 154 | 1,3,18840,1371,3135,3001,352,184 155 | 1,3,3463,9250,2368,779,302,1627 156 | 1,3,622,55,137,75,7,8 157 | 
2,3,1989,10690,19460,233,11577,2153 158 | 2,3,3830,5291,14855,317,6694,3182 159 | 1,3,17773,1366,2474,3378,811,418 160 | 2,3,2861,6570,9618,930,4004,1682 161 | 2,3,355,7704,14682,398,8077,303 162 | 2,3,1725,3651,12822,824,4424,2157 163 | 1,3,12434,540,283,1092,3,2233 164 | 1,3,15177,2024,3810,2665,232,610 165 | 2,3,5531,15726,26870,2367,13726,446 166 | 2,3,5224,7603,8584,2540,3674,238 167 | 2,3,15615,12653,19858,4425,7108,2379 168 | 2,3,4822,6721,9170,993,4973,3637 169 | 1,3,2926,3195,3268,405,1680,693 170 | 1,3,5809,735,803,1393,79,429 171 | 1,3,5414,717,2155,2399,69,750 172 | 2,3,260,8675,13430,1116,7015,323 173 | 2,3,200,25862,19816,651,8773,6250 174 | 1,3,955,5479,6536,333,2840,707 175 | 2,3,514,7677,19805,937,9836,716 176 | 1,3,286,1208,5241,2515,153,1442 177 | 2,3,2343,7845,11874,52,4196,1697 178 | 1,3,45640,6958,6536,7368,1532,230 179 | 1,3,12759,7330,4533,1752,20,2631 180 | 1,3,11002,7075,4945,1152,120,395 181 | 1,3,3157,4888,2500,4477,273,2165 182 | 1,3,12356,6036,8887,402,1382,2794 183 | 1,3,112151,29627,18148,16745,4948,8550 184 | 1,3,694,8533,10518,443,6907,156 185 | 1,3,36847,43950,20170,36534,239,47943 186 | 1,3,327,918,4710,74,334,11 187 | 1,3,8170,6448,1139,2181,58,247 188 | 1,3,3009,521,854,3470,949,727 189 | 1,3,2438,8002,9819,6269,3459,3 190 | 2,3,8040,7639,11687,2758,6839,404 191 | 2,3,834,11577,11522,275,4027,1856 192 | 1,3,16936,6250,1981,7332,118,64 193 | 1,3,13624,295,1381,890,43,84 194 | 1,3,5509,1461,2251,547,187,409 195 | 2,3,180,3485,20292,959,5618,666 196 | 1,3,7107,1012,2974,806,355,1142 197 | 1,3,17023,5139,5230,7888,330,1755 198 | 1,1,30624,7209,4897,18711,763,2876 199 | 2,1,2427,7097,10391,1127,4314,1468 200 | 1,1,11686,2154,6824,3527,592,697 201 | 1,1,9670,2280,2112,520,402,347 202 | 2,1,3067,13240,23127,3941,9959,731 203 | 2,1,4484,14399,24708,3549,14235,1681 204 | 1,1,25203,11487,9490,5065,284,6854 205 | 1,1,583,685,2216,469,954,18 206 | 1,1,1956,891,5226,1383,5,1328 207 | 2,1,1107,11711,23596,955,9265,710 208 | 
1,1,6373,780,950,878,288,285 209 | 2,1,2541,4737,6089,2946,5316,120 210 | 1,1,1537,3748,5838,1859,3381,806 211 | 2,1,5550,12729,16767,864,12420,797 212 | 1,1,18567,1895,1393,1801,244,2100 213 | 2,1,12119,28326,39694,4736,19410,2870 214 | 1,1,7291,1012,2062,1291,240,1775 215 | 1,1,3317,6602,6861,1329,3961,1215 216 | 2,1,2362,6551,11364,913,5957,791 217 | 1,1,2806,10765,15538,1374,5828,2388 218 | 2,1,2532,16599,36486,179,13308,674 219 | 1,1,18044,1475,2046,2532,130,1158 220 | 2,1,18,7504,15205,1285,4797,6372 221 | 1,1,4155,367,1390,2306,86,130 222 | 1,1,14755,899,1382,1765,56,749 223 | 1,1,5396,7503,10646,91,4167,239 224 | 1,1,5041,1115,2856,7496,256,375 225 | 2,1,2790,2527,5265,5612,788,1360 226 | 1,1,7274,659,1499,784,70,659 227 | 1,1,12680,3243,4157,660,761,786 228 | 2,1,20782,5921,9212,1759,2568,1553 229 | 1,1,4042,2204,1563,2286,263,689 230 | 1,1,1869,577,572,950,4762,203 231 | 1,1,8656,2746,2501,6845,694,980 232 | 2,1,11072,5989,5615,8321,955,2137 233 | 1,1,2344,10678,3828,1439,1566,490 234 | 1,1,25962,1780,3838,638,284,834 235 | 1,1,964,4984,3316,937,409,7 236 | 1,1,15603,2703,3833,4260,325,2563 237 | 1,1,1838,6380,2824,1218,1216,295 238 | 1,1,8635,820,3047,2312,415,225 239 | 1,1,18692,3838,593,4634,28,1215 240 | 1,1,7363,475,585,1112,72,216 241 | 1,1,47493,2567,3779,5243,828,2253 242 | 1,1,22096,3575,7041,11422,343,2564 243 | 1,1,24929,1801,2475,2216,412,1047 244 | 1,1,18226,659,2914,3752,586,578 245 | 1,1,11210,3576,5119,561,1682,2398 246 | 1,1,6202,7775,10817,1183,3143,1970 247 | 2,1,3062,6154,13916,230,8933,2784 248 | 1,1,8885,2428,1777,1777,430,610 249 | 1,1,13569,346,489,2077,44,659 250 | 1,1,15671,5279,2406,559,562,572 251 | 1,1,8040,3795,2070,6340,918,291 252 | 1,1,3191,1993,1799,1730,234,710 253 | 2,1,6134,23133,33586,6746,18594,5121 254 | 1,1,6623,1860,4740,7683,205,1693 255 | 1,1,29526,7961,16966,432,363,1391 256 | 1,1,10379,17972,4748,4686,1547,3265 257 | 1,1,31614,489,1495,3242,111,615 258 | 1,1,11092,5008,5249,453,392,373 259 | 
1,1,8475,1931,1883,5004,3593,987 260 | 1,1,56083,4563,2124,6422,730,3321 261 | 1,1,53205,4959,7336,3012,967,818 262 | 1,1,9193,4885,2157,327,780,548 263 | 1,1,7858,1110,1094,6818,49,287 264 | 1,1,23257,1372,1677,982,429,655 265 | 1,1,2153,1115,6684,4324,2894,411 266 | 2,1,1073,9679,15445,61,5980,1265 267 | 1,1,5909,23527,13699,10155,830,3636 268 | 2,1,572,9763,22182,2221,4882,2563 269 | 1,1,20893,1222,2576,3975,737,3628 270 | 2,1,11908,8053,19847,1069,6374,698 271 | 1,1,15218,258,1138,2516,333,204 272 | 1,1,4720,1032,975,5500,197,56 273 | 1,1,2083,5007,1563,1120,147,1550 274 | 1,1,514,8323,6869,529,93,1040 275 | 1,3,36817,3045,1493,4802,210,1824 276 | 1,3,894,1703,1841,744,759,1153 277 | 1,3,680,1610,223,862,96,379 278 | 1,3,27901,3749,6964,4479,603,2503 279 | 1,3,9061,829,683,16919,621,139 280 | 1,3,11693,2317,2543,5845,274,1409 281 | 2,3,17360,6200,9694,1293,3620,1721 282 | 1,3,3366,2884,2431,977,167,1104 283 | 2,3,12238,7108,6235,1093,2328,2079 284 | 1,3,49063,3965,4252,5970,1041,1404 285 | 1,3,25767,3613,2013,10303,314,1384 286 | 1,3,68951,4411,12609,8692,751,2406 287 | 1,3,40254,640,3600,1042,436,18 288 | 1,3,7149,2247,1242,1619,1226,128 289 | 1,3,15354,2102,2828,8366,386,1027 290 | 1,3,16260,594,1296,848,445,258 291 | 1,3,42786,286,471,1388,32,22 292 | 1,3,2708,2160,2642,502,965,1522 293 | 1,3,6022,3354,3261,2507,212,686 294 | 1,3,2838,3086,4329,3838,825,1060 295 | 2,2,3996,11103,12469,902,5952,741 296 | 1,2,21273,2013,6550,909,811,1854 297 | 2,2,7588,1897,5234,417,2208,254 298 | 1,2,19087,1304,3643,3045,710,898 299 | 2,2,8090,3199,6986,1455,3712,531 300 | 2,2,6758,4560,9965,934,4538,1037 301 | 1,2,444,879,2060,264,290,259 302 | 2,2,16448,6243,6360,824,2662,2005 303 | 2,2,5283,13316,20399,1809,8752,172 304 | 2,2,2886,5302,9785,364,6236,555 305 | 2,2,2599,3688,13829,492,10069,59 306 | 2,2,161,7460,24773,617,11783,2410 307 | 2,2,243,12939,8852,799,3909,211 308 | 2,2,6468,12867,21570,1840,7558,1543 309 | 1,2,17327,2374,2842,1149,351,925 310 | 
1,2,6987,1020,3007,416,257,656 311 | 2,2,918,20655,13567,1465,6846,806 312 | 1,2,7034,1492,2405,12569,299,1117 313 | 1,2,29635,2335,8280,3046,371,117 314 | 2,2,2137,3737,19172,1274,17120,142 315 | 1,2,9784,925,2405,4447,183,297 316 | 1,2,10617,1795,7647,1483,857,1233 317 | 2,2,1479,14982,11924,662,3891,3508 318 | 1,2,7127,1375,2201,2679,83,1059 319 | 1,2,1182,3088,6114,978,821,1637 320 | 1,2,11800,2713,3558,2121,706,51 321 | 2,2,9759,25071,17645,1128,12408,1625 322 | 1,2,1774,3696,2280,514,275,834 323 | 1,2,9155,1897,5167,2714,228,1113 324 | 1,2,15881,713,3315,3703,1470,229 325 | 1,2,13360,944,11593,915,1679,573 326 | 1,2,25977,3587,2464,2369,140,1092 327 | 1,2,32717,16784,13626,60869,1272,5609 328 | 1,2,4414,1610,1431,3498,387,834 329 | 1,2,542,899,1664,414,88,522 330 | 1,2,16933,2209,3389,7849,210,1534 331 | 1,2,5113,1486,4583,5127,492,739 332 | 1,2,9790,1786,5109,3570,182,1043 333 | 2,2,11223,14881,26839,1234,9606,1102 334 | 1,2,22321,3216,1447,2208,178,2602 335 | 2,2,8565,4980,67298,131,38102,1215 336 | 2,2,16823,928,2743,11559,332,3486 337 | 2,2,27082,6817,10790,1365,4111,2139 338 | 1,2,13970,1511,1330,650,146,778 339 | 1,2,9351,1347,2611,8170,442,868 340 | 1,2,3,333,7021,15601,15,550 341 | 1,2,2617,1188,5332,9584,573,1942 342 | 2,3,381,4025,9670,388,7271,1371 343 | 2,3,2320,5763,11238,767,5162,2158 344 | 1,3,255,5758,5923,349,4595,1328 345 | 2,3,1689,6964,26316,1456,15469,37 346 | 1,3,3043,1172,1763,2234,217,379 347 | 1,3,1198,2602,8335,402,3843,303 348 | 2,3,2771,6939,15541,2693,6600,1115 349 | 2,3,27380,7184,12311,2809,4621,1022 350 | 1,3,3428,2380,2028,1341,1184,665 351 | 2,3,5981,14641,20521,2005,12218,445 352 | 1,3,3521,1099,1997,1796,173,995 353 | 2,3,1210,10044,22294,1741,12638,3137 354 | 1,3,608,1106,1533,830,90,195 355 | 2,3,117,6264,21203,228,8682,1111 356 | 1,3,14039,7393,2548,6386,1333,2341 357 | 1,3,190,727,2012,245,184,127 358 | 1,3,22686,134,218,3157,9,548 359 | 2,3,37,1275,22272,137,6747,110 360 | 1,3,759,18664,1660,6114,536,4100 361 | 
1,3,796,5878,2109,340,232,776 362 | 1,3,19746,2872,2006,2601,468,503 363 | 1,3,4734,607,864,1206,159,405 364 | 1,3,2121,1601,2453,560,179,712 365 | 1,3,4627,997,4438,191,1335,314 366 | 1,3,2615,873,1524,1103,514,468 367 | 2,3,4692,6128,8025,1619,4515,3105 368 | 1,3,9561,2217,1664,1173,222,447 369 | 1,3,3477,894,534,1457,252,342 370 | 1,3,22335,1196,2406,2046,101,558 371 | 1,3,6211,337,683,1089,41,296 372 | 2,3,39679,3944,4955,1364,523,2235 373 | 1,3,20105,1887,1939,8164,716,790 374 | 1,3,3884,3801,1641,876,397,4829 375 | 2,3,15076,6257,7398,1504,1916,3113 376 | 1,3,6338,2256,1668,1492,311,686 377 | 1,3,5841,1450,1162,597,476,70 378 | 2,3,3136,8630,13586,5641,4666,1426 379 | 1,3,38793,3154,2648,1034,96,1242 380 | 1,3,3225,3294,1902,282,68,1114 381 | 2,3,4048,5164,10391,130,813,179 382 | 1,3,28257,944,2146,3881,600,270 383 | 1,3,17770,4591,1617,9927,246,532 384 | 1,3,34454,7435,8469,2540,1711,2893 385 | 1,3,1821,1364,3450,4006,397,361 386 | 1,3,10683,21858,15400,3635,282,5120 387 | 1,3,11635,922,1614,2583,192,1068 388 | 1,3,1206,3620,2857,1945,353,967 389 | 1,3,20918,1916,1573,1960,231,961 390 | 1,3,9785,848,1172,1677,200,406 391 | 1,3,9385,1530,1422,3019,227,684 392 | 1,3,3352,1181,1328,5502,311,1000 393 | 1,3,2647,2761,2313,907,95,1827 394 | 1,3,518,4180,3600,659,122,654 395 | 1,3,23632,6730,3842,8620,385,819 396 | 1,3,12377,865,3204,1398,149,452 397 | 1,3,9602,1316,1263,2921,841,290 398 | 2,3,4515,11991,9345,2644,3378,2213 399 | 1,3,11535,1666,1428,6838,64,743 400 | 1,3,11442,1032,582,5390,74,247 401 | 1,3,9612,577,935,1601,469,375 402 | 1,3,4446,906,1238,3576,153,1014 403 | 1,3,27167,2801,2128,13223,92,1902 404 | 1,3,26539,4753,5091,220,10,340 405 | 1,3,25606,11006,4604,127,632,288 406 | 1,3,18073,4613,3444,4324,914,715 407 | 1,3,6884,1046,1167,2069,593,378 408 | 1,3,25066,5010,5026,9806,1092,960 409 | 2,3,7362,12844,18683,2854,7883,553 410 | 2,3,8257,3880,6407,1646,2730,344 411 | 1,3,8708,3634,6100,2349,2123,5137 412 | 1,3,6633,2096,4563,1389,1860,1892 413 | 
"""Visualization helpers for the customer-segments project: PCA weight plots
and 2-D cluster/channel scatter plots."""

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
# NOTE(review): the original `from sklearn.decomposition import pca` was removed.
# The lowercase `pca` module is not a public sklearn API (import fails on modern
# sklearn), and the name was shadowed by the `pca` parameter of `pca_results`
# anyway, so the import was dead code.


def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results.
    Includes dimension feature weights and explained variance.
    Visualizes the PCA results as a bar plot of feature weights per dimension.

    good_data : DataFrame of the (cleaned) features the PCA was fit on.
    pca       : a fitted sklearn PCA-like object exposing `components_` and
                `explained_variance_ratio_`.
    Returns a DataFrame with one row per dimension: explained variance
    followed by the rounded feature weights.
    '''

    # Dimension indexing -- one label per principal component.
    # (Fixes the original's duplicated `dimensions = dimensions = ...`.)
    dimensions = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]

    # PCA components: feature weights, rounded for readability
    components = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys())
    components.index = dimensions

    # PCA explained variance, reshaped to one column
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
    variance_ratios.index = dimensions

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize = (14, 8))

    # Plot the feature weights as a function of the components
    components.plot(ax = ax, kind = 'bar')
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation = 0)

    # Display the explained variance ratios above each group of bars
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f" % (ev))

    # Return a concatenated DataFrame
    return pd.concat([variance_ratios, components], axis = 1)


def cluster_results(reduced_data, preds, centers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions.
    Adds cues for cluster centers and student-selected sample data.

    reduced_data : DataFrame with 'Dimension 1' / 'Dimension 2' columns.
    preds        : cluster assignment per row of reduced_data.
    centers      : array-like of cluster-center coordinates.
    pca_samples  : 2-D array of transformed sample points to mark.
    '''

    predictions = pd.DataFrame(preds, columns = ['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis = 1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize = (14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned cluster.
    # max(..., 1) guards the original division by zero when only one
    # cluster center is passed in.
    denom = max(len(centers) - 1, 1)
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2',
                     color = cmap(i * 1.0 / denom), label = 'Cluster %i' % (i), s = 30)

    # Plot centers with indicators: a white disc with the cluster number on top
    for i, c in enumerate(centers):
        ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black',
                   alpha = 1, linewidth = 2, marker = 'o', s = 200)
        ax.scatter(x = c[0], y = c[1], marker = '$%d$' % (i), alpha = 1, s = 100)

    # Plot transformed sample points as black crosses
    ax.scatter(x = pca_samples[:, 0], y = pca_samples[:, 1],
               s = 150, linewidth = 4, color = 'black', marker = 'x')

    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross")


def channel_results(reduced_data, outliers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions using the full dataset.
    Data is labeled by "Channel" and cues added for student-selected sample data.

    reduced_data : DataFrame with 'Dimension 1' / 'Dimension 2' columns.
    outliers     : positional indices of rows that were removed as outliers.
    pca_samples  : 2-D array of transformed sample points to circle.
    Returns False when `customers.csv` cannot be read; otherwise None.
    '''

    # Check that the dataset is loadable.
    # `except Exception` replaces the original bare `except:` so that
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        full_data = pd.read_csv("customers.csv")
    except Exception:
        print("Dataset could not be loaded. Is the file missing?")
        return False

    # Create the Channel DataFrame, dropping the same outliers as the analysis
    channel = pd.DataFrame(full_data['Channel'], columns = ['Channel'])
    channel = channel.drop(channel.index[outliers]).reset_index(drop = True)
    labeled = pd.concat([reduced_data, channel], axis = 1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize = (14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned Channel (1 or 2 in the raw data).
    # The loop variable is named `chan_data` so it no longer shadows the
    # `channel` DataFrame above.
    labels = ['Hotel/Restaurant/Cafe', 'Retailer']
    grouped = labeled.groupby('Channel')
    for i, chan_data in grouped:
        chan_data.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2',
                       color = cmap((i - 1) * 1.0 / 2), label = labels[i - 1], s = 30)

    # Plot transformed sample points: an open circle with its index beside it
    for i, sample in enumerate(pca_samples):
        ax.scatter(x = sample[0], y = sample[1],
                   s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none')
        ax.scatter(x = sample[0] + 0.25, y = sample[1] + 0.3, marker = '$%d$' % (i), alpha = 1, s = 125)

    # Set plot title
    ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled")
-------------------------------------------------------------------------------- 1 | # 项目 2: 监督学习 2 | ## 搭建一个学生干预系统 3 | 4 | ### 安装 5 | 6 | 这个项目要求使用 **Python 2.7** 并且需要安装下面这些python包: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - [pandas](http://pandas.pydata.org) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | 12 | 你同样需要安装好相应软件使之能够运行[Jupyter Notebook](http://jupyter.org/) 13 | 14 | 优达学城推荐学生安装[Anaconda](https://www.continuum.io/downloads), 这是一个已经打包好的python发行版,它包含了我们这个项目需要的所有的库和软件。 15 | 16 | 17 | ### 代码 18 | 19 | 初始代码包含在 `student_intervention.ipynb` 这个notebook文件中。这里面有一些代码已经实现好来帮助你开始项目,但是为了完成项目,你还需要实现附加的功能。 20 | 21 | ### 运行 22 | 23 | 在命令行中,确保当前目录为 `student_intervention/` 文件夹的最顶层(目录包含本 README 文件),运行下列命令: 24 | 25 | ```jupyter notebook student_intervention.ipynb``` 26 | 27 | ​这会启动 Jupyter Notebook 并把项目文件打开在你的浏览器中。 28 | 29 | ## 数据 30 | 31 | ​这个项目的数据包含在 `student-data.csv` 文件中。这个数据集包含以下属性: ​ 32 | 33 | - `school` : 学生的学校(二元特征:值为“GP”或者是“MS”) 34 | - `sex` : 学生的性别(二元特征:“F”表示女性 或者是 “M”表示男性) 35 | - `age` : 学生的年龄(数值特征:从15到22) 36 | - `address`: 学生的家庭住址类型(二元特征:“U”表示城市 或者是 “R”表示农村) 37 | - `famsize`: 家庭大小(二元特征:“LE3”表示小于等于3 或者 “GT3”表示大于3) 38 | - `Pstatus`: 父母共同生活状态(二元特征:“T”表示共同生活 或者是 “A”表示分居) 39 | - `Medu`: 母亲的教育程度 (数值特征:0 - 未受教育, 1 - 小学教育(4年级), 2 - 5年级到9年级, 3 - 中学教育 或者 4 - 更高等级教育) 40 | - `Fedu`: 父亲的教育程度 (数值特征:0 - 未受教育, 1 - 小学教育(4年级), 2 - 5年级到9年级, 3 - 中学教育 或者 4 - 更高等级教育) 41 | - `Mjob` : 母亲的工作 (常量特征: "teacher", "health" 表示和健康看护相关的工作, "services" 表示公务员(比如:行政人员或者警察), "at_home"表示在家, "other"表示其他) 42 | - `Fjob` : 父亲的工作 (常量特征: "teacher", "health" 表示和健康看护相关的工作, "services" 表示公务员(比如:行政人员或者警察), "at_home"表示在家, "other"表示其他) 43 | - `reason` : 选择这所学校的原因 (常量特征:"home"表示离家近, "reputation"表示学校声誉, "course"表示课程偏好 或者 "other"表示其他) 44 | - `guardian` : 学生的监护人 (常量特征:"mother"表示母亲, "father"表示父亲 或者 "other"表示其他) 45 | - `traveltime` : 到学校需要的时间 (数值特征: 1 - 小于15分钟., 2 - 15到30分钟., 3 - 30分钟到1小时, 4 - 大于1小时) 46 | - `studytime`: 每周学习时间 (数值特征: 1 - 小于2个小时, 2 - 2到5个小时, 3 - 5到10个小时, 4 - 大于10个小时) 47 | - 
`failures`:过去考试失败的次数 (数值特征: n 如果 1<=n<3, 其他 4) 48 | - `schoolsup` : 额外的教育支持 (二元特征: yes 或者 no) 49 | - `famsup` : 家庭教育支持 (二元特征: yes 或者 no) 50 | - `paid` : 和课程有关的其他付费课堂 (数学或者葡萄牙语) (二值特征: yes 或者 no) 51 | - `activities` : 课外活动 (二元特征: yes 或者 no) 52 | - `nursery` : 参加托儿所 (二元特征: yes 或者 no) 53 | - `higher` : 希望得到高等教育(二元特征: yes 或者 no) 54 | - `internet` : 在家是否能够访问网络 (二元特征: yes 或者 no) 55 | - `romantic` : 有没有谈恋爱 (二元特征: yes 或者 no) 56 | - `famrel` : 与家人关系的好坏 (数值特征: 从 1 - 非常差 到 5 - 非常好) 57 | - `freetime` : 放学后的空闲时间(数值特征: 从 1 - 非常少 到 5 - 非常多) 58 | - `goout` : 和朋友出去(数值特征: 从 1 - 非常少 到 5 - 非常多) 59 | - `Dalc` : 工作日饮酒量(数值特征:从 1 - 非常少 到 5 - 非常多) 60 | - `Walc` : 周末饮酒量(数值特征:从 1 - 非常少 到 5 - 非常多) 61 | - `health` : 当前健康状况 (数值特征: 从 1 - 非常差 到 5 - 非常好) 62 | - `absences` :在学校的缺席次数 (数值特征: 从 0 到 93) 63 | - `passed` : 学生是否通过最终的考试 (二元特征: yes 或者 no) 64 | -------------------------------------------------------------------------------- /projects_cn/student_intervention/student_intervention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 机器学习工程师纳米学位\n", 8 | "## 监督学习\n", 9 | "## 项目 2: 搭建一个学生干预系统" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "欢迎来到机器学习工程师纳米学位的第二个项目!在此文件中,有些示例代码已经提供给你,但你还需要实现更多的功能让项目成功运行。除非有明确要求,你无须修改任何已给出的代码。以**'练习'**开始的标题表示接下来的代码部分中有你必须要实现的功能。每一部分都会有详细的指导,需要实现的部分也会在注释中以**'TODO'**标出。请仔细阅读所有的提示!\n", 17 | "\n", 18 | "除了实现代码外,你还**必须**回答一些与项目和你的实现有关的问题。每一个需要你回答的问题都会以**'问题 X'**为标题。请仔细阅读每个问题,并且在问题后的**'回答'**文字框中写出完整的答案。我们将根据你对问题的回答和撰写代码所实现的功能来对你提交的项目进行评分。\n", 19 | "\n", 20 | ">**提示:**Code 和 Markdown 区域可通过 **Shift + Enter** 快捷键运行。此外,Markdown可以通过双击进入编辑模式。" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### 问题 1 - 分类 vs. 
回归\n", 28 | "*在这个项目中你的任务是找出那些如果不给予帮助,最重可能无法毕业的学生。你觉得这个问题是哪种类型的监督学习问题,是分类问题还是回归问题?为什么?*" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "**答案: **" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 分析数据\n", 43 | "运行下面区域的代码以载入学生数据集,以及一些此项目所需的Python库。注意数据集的最后一列`'passed'`是我们的预测的目标(表示学生是毕业了还是没有毕业),其他的列是每个学生的属性。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# 载入所需要的库\n", 55 | "import numpy as np\n", 56 | "import pandas as pd\n", 57 | "from time import time\n", 58 | "from sklearn.metrics import f1_score\n", 59 | "\n", 60 | "# 载入学生数据集\n", 61 | "student_data = pd.read_csv(\"student-data.csv\")\n", 62 | "print \"Student data read successfully!\"" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### 练习: 分析数据\n", 70 | "我们首先通过调查数据,以确定有多少学生的信息,并了解这些学生的毕业率。在下面的代码单元中,你需要完成如下的运算:\n", 71 | "- 学生的总数, `n_students`。\n", 72 | "- 每个学生的特征总数, `n_features`。\n", 73 | "- 毕业的学生的数量, `n_passed`。\n", 74 | "- 未毕业的学生的数量, `n_failed`。\n", 75 | "- 班级的毕业率, `grad_rate`, 用百分数表示(%)。\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# TODO: 计算学生的数量\n", 87 | "n_students = None\n", 88 | "\n", 89 | "# TODO: 计算特征数量\n", 90 | "n_features = None\n", 91 | "\n", 92 | "# TODO: 计算通过的学生数\n", 93 | "n_passed = None\n", 94 | "\n", 95 | "# TODO: 计算未通过的学生数\n", 96 | "n_failed = None\n", 97 | "\n", 98 | "# TODO: 计算通过率\n", 99 | "grad_rate = None\n", 100 | "\n", 101 | "# 输出结果\n", 102 | "print \"Total number of students: {}\".format(n_students)\n", 103 | "print \"Number of features: {}\".format(n_features)\n", 104 | "print \"Number of students who passed: {}\".format(n_passed)\n", 105 | "print \"Number of students who failed: 
{}\".format(n_failed)\n", 106 | "print \"Graduation rate of the class: {:.2f}%\".format(grad_rate)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## 数据准备\n", 114 | "在这个部分中,我们将要为建模、训练和测试准备数据\n", 115 | "### 识别特征和目标列\n", 116 | "你获取的数据中通常都会包含一些非数字的特征,这会导致一些问题,因为大多数的机器学习算法都会期望输入数字特征进行计算。\n", 117 | "\n", 118 | "运行下面的代码单元将学生数据分成特征和目标列看一看他们中是否有非数字特征。" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "# 提取特征列\n", 130 | "feature_cols = list(student_data.columns[:-1])\n", 131 | "\n", 132 | "# 提取目标列 ‘passed’\n", 133 | "target_col = student_data.columns[-1] \n", 134 | "\n", 135 | "# 显示列的列表\n", 136 | "print \"Feature columns:\\n{}\".format(feature_cols)\n", 137 | "print \"\\nTarget column: {}\".format(target_col)\n", 138 | "\n", 139 | "# 将数据分割成特征数据和目标数据(即X_all 和 y_all)\n", 140 | "X_all = student_data[feature_cols]\n", 141 | "y_all = student_data[target_col]\n", 142 | "\n", 143 | "# 通过打印前5行显示特征信息\n", 144 | "print \"\\nFeature values:\"\n", 145 | "print X_all.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### 预处理特征列\n", 153 | "\n", 154 | "正如你所见,我们这里有几个非数值的列需要做一定的转换!它们中很多是简单的`yes`/`no`,比如`internet`。这些可以合理地转化为`1`/`0`(二元值,binary)值。\n", 155 | "\n", 156 | "其他的列,如`Mjob`和`Fjob`,有两个以上的值,被称为_分类变量(categorical variables)_。处理这样的列的推荐方法是创建和可能值一样多的列(如:`Fjob_teacher`,`Fjob_other`,`Fjob_services`等),然后将其中一个的值设为`1`另外的设为`0`。\n", 157 | "\n", 158 | "这些创建的列有时候叫做 _虚拟变量(dummy variables)_,我们将用[`pandas.get_dummies()`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html?highlight=get_dummies#pandas.get_dummies)函数来完成这个转换。运行下面代码单元的代码来完成这里讨论的预处理步骤。" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "def 
preprocess_features(X):\n", 170 | " ''' 预处理学生数据,将非数字的二元特征转化成二元值(0或1),将分类的变量转换成虚拟变量\n", 171 | " '''\n", 172 | " \n", 173 | " # 初始化一个用于输出的DataFrame\n", 174 | " output = pd.DataFrame(index = X.index)\n", 175 | "\n", 176 | " # 查看数据的每一个特征列\n", 177 | " for col, col_data in X.iteritems():\n", 178 | " \n", 179 | " # 如果数据是非数字类型,将所有的yes/no替换成1/0\n", 180 | " if col_data.dtype == object:\n", 181 | " col_data = col_data.replace(['yes', 'no'], [1, 0])\n", 182 | "\n", 183 | " # 如果数据类型是类别的(categorical),将它转换成虚拟变量\n", 184 | " if col_data.dtype == object:\n", 185 | " # 例子: 'school' => 'school_GP' and 'school_MS'\n", 186 | " col_data = pd.get_dummies(col_data, prefix = col) \n", 187 | " \n", 188 | " # 收集转换后的列\n", 189 | " output = output.join(col_data)\n", 190 | " \n", 191 | " return output\n", 192 | "\n", 193 | "X_all = preprocess_features(X_all)\n", 194 | "print \"Processed feature columns ({} total features):\\n{}\".format(len(X_all.columns), list(X_all.columns))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### 实现: 将数据分成训练集和测试集\n", 202 | "现在我们已经将所有的 _分类的(categorical)_ 特征转换成数值了。下一步我们将把数据(包括特征和对应的标签数据)分割成训练集和测试集。在下面的代码单元中,你需要完成下列功能:\n", 203 | "- 随机混洗(shuffle)切分数据(`X_all`, `y_all`) 为训练子集和测试子集。\n", 204 | " - 使用300个数据点作为训练集(约75%),使用95个数据点作为测试集(约25%)。\n", 205 | " - 如果可能的话,为你使用的函数设置一个`random_state`。\n", 206 | " - 将结果存储在`X_train`, `X_test`, `y_train`和 `y_test`中。" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "# TODO:在这里导入你可能需要使用的另外的功能\n", 218 | "\n", 219 | "# TODO:设置训练集的数量\n", 220 | "num_train = None\n", 221 | "\n", 222 | "# TODO:设置测试集的数量\n", 223 | "num_test = X_all.shape[0] - num_train\n", 224 | "\n", 225 | "# TODO:把数据集混洗和分割成上面定义的训练集和测试集\n", 226 | "X_train = None\n", 227 | "X_test = None\n", 228 | "y_train = None\n", 229 | "y_test = None\n", 230 | "\n", 231 | "# 显示分割的结果\n", 232 | "print \"Training 
set has {} samples.\".format(X_train.shape[0])\n", 233 | "print \"Testing set has {} samples.\".format(X_test.shape[0])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## 训练和评价模型\n", 241 | "在这个部分,你将选择3个适合这个问题并且在`scikit-learn`中已有的监督学习的模型。首先你需要说明你选择这三个模型的原因,包括这些数据集有哪些特点,每个模型的优点和缺点各是什么。然后,你需要将这些模型用不同大小的训练集(100个数据点,200个数据点,300个数据点)进行训练,并用F1的值来衡量。你需要制作三个表,每个表要显示训练集大小,训练时间,预测时间,训练集上的F1值和测试集上的F1值(每个模型一个表)。\n", 242 | "\n", 243 | "**这是目前** [`scikit-learn`](http://scikit-learn.org/stable/supervised_learning.html) **里有的监督学习模型,你可以从中选择:**\n", 244 | "- Gaussian Naive Bayes (GaussianNB) 朴素贝叶斯\n", 245 | "- Decision Trees 决策树\n", 246 | "- Ensemble Methods (Bagging, AdaBoost, Random Forest, Gradient Boosting)\n", 247 | "- K-Nearest Neighbors (KNeighbors)\n", 248 | "- Stochastic Gradient Descent (SGDC)\n", 249 | "- Support Vector Machines (SVM) 向量模型机\n", 250 | "- Logistic Regression 逻辑回归" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### 问题 2 - 应用模型\n", 258 | "*列出三个适合这个问题的监督学习算法模型。每一个你选择的模型:*\n", 259 | "\n", 260 | "- 描述一个该模型在真实世界的一个应用场景。(你需要为此做点研究,并给出你的引用出处)\n", 261 | "- 这个模型的优势是什么?他什么情况下表现最好?\n", 262 | "- 这个模型的缺点是什么?什么条件下它表现很差?\n", 263 | "- 根据我们当前数据集的特点,为什么这个模型适合这个问题。" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "**回答: **" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### 准备\n", 278 | "运行下面的代码单元以初始化三个帮助函数,这三个函数将能够帮你训练和测试你上面所选择的三个监督学习算法。这些函数是:\n", 279 | "- `train_classifier` - 输入一个分类器和训练集,用数据来训练这个分类器。\n", 280 | "- `predict_labels` - 输入一个训练好的分类器、特征以及一个目标标签,这个函数将帮你做预测并给出F1的值.\n", 281 | "- `train_predict` - 输入一个分类器以及训练集和测试集,它可以运行`train_clasifier`和`predict_labels`.\n", 282 | " - 这个函数将分别输出训练集的F1值和测试集的F1值" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 
| "source": [ 293 | "def train_classifier(clf, X_train, y_train):\n", 294 | " ''' 用训练集训练分类器 '''\n", 295 | " \n", 296 | " # 开始计时,训练分类器,然后停止计时\n", 297 | " start = time()\n", 298 | " clf.fit(X_train, y_train)\n", 299 | " end = time()\n", 300 | " \n", 301 | " # Print the results\n", 302 | " print \"Trained model in {:.4f} seconds\".format(end - start)\n", 303 | "\n", 304 | " \n", 305 | "def predict_labels(clf, features, target):\n", 306 | " ''' 用训练好的分类器做预测并输出F1值'''\n", 307 | " \n", 308 | " # 开始计时,作出预测,然后停止计时\n", 309 | " start = time()\n", 310 | " y_pred = clf.predict(features)\n", 311 | " end = time()\n", 312 | " \n", 313 | " # 输出并返回结果\n", 314 | " print \"Made predictions in {:.4f} seconds.\".format(end - start)\n", 315 | " return f1_score(target.values, y_pred, pos_label='yes')\n", 316 | "\n", 317 | "\n", 318 | "def train_predict(clf, X_train, y_train, X_test, y_test):\n", 319 | " ''' 用一个分类器训练和预测,并输出F1值 '''\n", 320 | " \n", 321 | " # 输出分类器名称和训练集大小\n", 322 | " print \"Training a {} using a training set size of {}. . 
.\".format(clf.__class__.__name__, len(X_train))\n", 323 | " \n", 324 | " # 训练一个分类器\n", 325 | " train_classifier(clf, X_train, y_train)\n", 326 | " \n", 327 | " # 输出训练和测试的预测结果\n", 328 | " print \"F1 score for training set: {:.4f}.\".format(predict_labels(clf, X_train, y_train))\n", 329 | " print \"F1 score for test set: {:.4f}.\".format(predict_labels(clf, X_test, y_test))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "### 练习: 模型评价指标\n", 337 | "借助于上面定义的函数,你现在需要导入三个你选择的监督学习模型,然后为每一个模型运行`train_predict`函数。请记住,对于每一个模型你需要在不同大小的训练集(100,200和300)上进行训练和测试。所以,你在下面应该会有9个不同的输出(每个模型都有训练集大小不同的三个输出)。在接下来的代码单元中,你将需要实现以下功能:\n", 338 | "- 引入三个你在上面讨论过的监督式学习算法模型。\n", 339 | "- 初始化三个模型并将它们存储在`clf_A`, `clf_B` 和 `clf_C`中。\n", 340 | " - 如果可能对每一个模型都设置一个`random_state`。\n", 341 | " - **注意:** 这里先使用每一个模型的默认参数,在接下来的部分中你将需要对某一个模型的参数进行调整。\n", 342 | "- 创建不同大小的训练集用来训练每一个模型。\n", 343 | " - *不要再混洗和再分割数据!新的训练集要取自`X_train`和`y_train`.*\n", 344 | "- 对于每一个模型要用不同大小的训练集来训练它,然后在测试集上做测试(总共需要9次训练测试) \n", 345 | "**注意:** 在下面的代码单元后面我们提供了三个表用来存储你的结果。" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "# TODO:从sklearn中引入三个监督学习模型\n", 357 | "# from sklearn import model_A\n", 358 | "# from sklearn import model_B\n", 359 | "# from skearln import model_C\n", 360 | "\n", 361 | "# TODO:初始化三个模型\n", 362 | "clf_A = None\n", 363 | "clf_B = None\n", 364 | "clf_C = None\n", 365 | "\n", 366 | "# TODO:设置训练集大小\n", 367 | "X_train_100 = None\n", 368 | "y_train_100 = None\n", 369 | "\n", 370 | "X_train_200 = None\n", 371 | "y_train_200 = None\n", 372 | "\n", 373 | "X_train_300 = None\n", 374 | "y_train_300 = None\n", 375 | "\n", 376 | "# TODO:对每一个分类器和每一个训练集大小运行'train_predict' \n", 377 | "# train_predict(clf, X_train, y_train, X_test, y_test)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### 
结果表格\n", 385 | "编辑下面的表格看看在[Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#tables)中如何设计一个表格。你需要把上面的结果记录在表格中。" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "** 分类器 1 - ?** \n", 393 | "\n", 394 | "| 训练集大小 | 训练时间 | 预测时间 (测试) | F1值 (训练) | F1值 (测试) |\n", 395 | "| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |\n", 396 | "| 100 | | | | |\n", 397 | "| 200 | EXAMPLE | | | |\n", 398 | "| 300 | | | | EXAMPLE |\n", 399 | "\n", 400 | "** 分类器 2 - ?** \n", 401 | "\n", 402 | "| 训练集大小 | 训练时间 | 预测时间 (测试) | F1值 (训练) | F1值 (测试) |\n", 403 | "| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |\n", 404 | "| 100 | | | | |\n", 405 | "| 200 | EXAMPLE | | | |\n", 406 | "| 300 | | | | EXAMPLE |\n", 407 | "\n", 408 | "** 分类器 3 - ?** \n", 409 | "\n", 410 | "| 训练集大小 | 训练时间 | 预测时间 (测试) | F1值 (训练) | F1值 (测试) |\n", 411 | "| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |\n", 412 | "| 100 | | | | |\n", 413 | "| 200 | EXAMPLE | | | |\n", 414 | "| 300 | | | | EXAMPLE |" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "## 选择最佳模型\n", 422 | "在最后这一部分中,你将从三个监督学习模型中选择一个用在学生数据上的最佳模型。然后你将在最佳模型上用全部的训练集(`X_train`和`y_train`)运行一个网格搜索算法,在这个过程中,你要至少调整一个参数以提高模型的F1值(相比于没有调参的模型的分值有所提高)。 " 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### 问题 3 - 选择最佳模型\n", 430 | "*给予你上面做的实验,用一到两段话,向(学校)监事会解释你将选择哪个模型作为最佳的模型。哪个模型在现有的数据,有限的资源、开支和模型表现综合来看是最好的选择?*" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "**回答: **" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "### 问题 4 - 用通俗的语言解释模型\n", 445 | 
"*用一到两段话,向(学校)监事会用外行也听得懂的话来解释最终模型是如何工作的。你需要解释所选模型的主要特点。例如,这个模型是怎样被训练的,它又是如何做出预测的。避免使用高级的数学或技术术语,不要使用公式或特定的算法名词。*" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "**回答: **" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "### 练习: 模型调参\n", 460 | "细调选择的模型的参数。使用网格搜索(`GridSearchCV`)来至少调整模型的重要参数(至少调整一个),这个参数至少需给出并尝试3个不同的值。你要使用整个训练集来完成这个过程。在接下来的代码单元中,你需要实现以下功能:\n", 461 | "- 导入 [`sklearn.grid_search.gridSearchCV`](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) 和 [`sklearn.metrics.make_scorer`](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html).\n", 462 | "- 创建一个对于这个模型你希望调整参数的字典。\n", 463 | " - 例如: `parameters = {'parameter' : [list of values]}`。\n", 464 | "- 初始化你选择的分类器,并将其存储在`clf`中。\n", 465 | "- 使用`make_scorer` 创建F1评分函数并将其存储在`f1_scorer`中。\n", 466 | " - 需正确设定参数`pos_label`的值!\n", 467 | "- 在分类器`clf`上用`f1_scorer` 作为评价函数运行网格搜索,并将结果存储在`grid_obj`中。\n", 468 | "- 用训练集(`X_train`, `y_train`)训练grid search object,并将结果存储在`grid_obj`中。" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "# TODO: 导入 'GridSearchCV' 和 'make_scorer'\n", 480 | "\n", 481 | "# TODO:创建你希望调整的参数列表\n", 482 | "parameters = None\n", 483 | "\n", 484 | "# TODO:初始化分类器\n", 485 | "clf = None\n", 486 | "\n", 487 | "# TODO:用'make_scorer'创建一个f1评分函数\n", 488 | "f1_scorer = None\n", 489 | "\n", 490 | "# TODO:在分类器上使用f1_scorer作为评分函数运行网格搜索\n", 491 | "grid_obj = None\n", 492 | "\n", 493 | "# TODO: Fit the grid search object to the training data and find the optimal parameters\n", 494 | "# TODO:用训练集训练grid search object来寻找最佳参数\n", 495 | "grid_obj = None\n", 496 | "\n", 497 | "# Get the estimator\n", 498 | "# 得到预测的结果\n", 499 | "clf = grid_obj.best_estimator_\n", 500 | "\n", 501 | "# Report the final F1 score for training and 
testing after parameter tuning\n", 502 | "# 输出经过调参之后的训练集和测试集的F1值\n", 503 | "print \"Tuned model has a training F1 score of {:.4f}.\".format(predict_labels(clf, X_train, y_train))\n", 504 | "print \"Tuned model has a testing F1 score of {:.4f}.\".format(predict_labels(clf, X_test, y_test))" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "### 问题 5 - 最终的 F1 值\n", 512 | "*最终模型的训练和测试的F1值是多少?这个值相比于没有调整过参数的模型怎么样?*" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "**回答: **" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "> **注意**: 当你写完了所有的代码,并且回答了所有的问题。你就可以把你的 iPython Notebook 导出成 HTML 文件。你可以在菜单栏,这样导出**File -> Download as -> HTML (.html)**把这个 HTML 和这个 iPython notebook 一起做为你的作业提交。 " 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "anaconda-cloud": {}, 532 | "kernelspec": { 533 | "display_name": "Python 2", 534 | "language": "python", 535 | "name": "python2" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 2 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython2", 547 | "version": "2.7.11" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 0 552 | } 553 | -------------------------------------------------------------------------------- /projects_cn/titanic_survival_exploration/README.md: -------------------------------------------------------------------------------- 1 | # 项目 0: 入门与基础 2 | ## 预测泰坦尼克号乘客幸存率 3 | 4 | ### 安装要求 5 | 这个项目要求使用 **Python 2.7** 以及安装下列python库 6 | 7 | - [NumPy](http://www.numpy.org/) 8 | - [Pandas](http://pandas.pydata.org) 9 | - [matplotlib](http://matplotlib.org/) 10 | - [scikit-learn](http://scikit-learn.org/stable/) 11 | ​ 12 | 13 | 你还需要安装和运行 [Jupyter 
Notebook](http://jupyter.readthedocs.io/en/latest/install.html#optional-for-experienced-python-developers-installing-jupyter-with-pip)。 14 | 15 | 16 | 优达学城推荐学生安装 [Anaconda](https://www.continuum.io/downloads),一个包含了项目需要的所有库和软件的 Python 发行版本。[这里](https://classroom.udacity.com/nanodegrees/nd002/parts/0021345403/modules/317671873575460/lessons/5430778793/concepts/54140889150923)介绍了如何安装Anaconda。 17 | 18 | 如果你使用macOS系统并且对命令行比较熟悉,可以安装[homebrew](http://brew.sh/),以及brew版python 19 | 20 | ```bash 21 | $ brew install python 22 | ``` 23 | 24 | 再用下列命令安装所需要的python库 25 | 26 | ```bash 27 | $ pip install numpy pandas matplotlib scikit-learn scipy jupyter 28 | ``` 29 | 30 | ### 代码 31 | ​ 32 | 事例代码在 `titanic_survival_exploration_cn.ipynb` 文件中,辅助代码在 `titanic_visualizations.py` 文件中。尽管已经提供了一些代码帮助你上手,你还是需要补充些代码使得项目要求的功能能够成功实现。 33 | 34 | ### 运行 35 | ​ 36 | 在命令行中,确保当前目录为 `titanic_survival_exploration/` 文件夹的最顶层(目录包含本 README 文件),运行下列命令: 37 | 38 | ```bash 39 | $ jupyter notebook titanic_survival_exploration.ipynb 40 | ``` 41 | ​ 42 | 这会启动 Jupyter Notebook 把项目文件打开在你的浏览器中。 43 | 44 | 对jupyter不熟悉的同学可以看一下这两个链接: 45 | 46 | - [Jupyter使用视频教程](http://cn-static.udacity.com/mlnd/how_to_use_jupyter.mp4) 47 | - [为什么使用jupyter?](https://www.zhihu.com/question/37490497) 48 | ​ 49 | ​ 50 | ​ 51 | ​ 52 | ​ 53 | ​ 54 | ​ 55 | ​ 56 | ​ 57 | ​ 58 | ​ 59 | ​ 60 | ​ 61 | ​ 62 | 63 | ### 数据 64 | ​ 65 | 这个项目的数据包含在 `titanic_data.csv` 文件中。文件包含下列特征: 66 | ​ 67 | - **Survived**:是否存活(0代表否,1代表是) 68 | - **Pclass**:社会阶级(1代表上层阶级,2代表中层阶级,3代表底层阶级) 69 | - **Name**:船上乘客的名字 70 | - **Sex**:船上乘客的性别 71 | - **Age**:船上乘客的年龄(可能存在 `NaN`) 72 | - **SibSp**:乘客在船上的兄弟姐妹和配偶的数量 73 | - **Parch**:乘客在船上的父母以及小孩的数量 74 | - **Ticket**:乘客船票的编号 75 | - **Fare**:乘客为船票支付的费用 76 | - **Cabin**:乘客所在船舱的编号(可能存在 `NaN`) 77 | - **Embarked**:乘客上船的港口(C 代表从 Cherbourg 登船,Q 代表从 Queenstown 登船,S 代表从 Southampton 登船) 78 | -------------------------------------------------------------------------------- /projects_cn/titanic_survival_exploration/titanic_survival_exploration.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 机器学习工程师纳米学位\n", 8 | "## 入门\n", 9 | "## 项目 0: 预测泰坦尼克号乘客生还率\n", 10 | "\n", 11 | "1912年,泰坦尼克号在第一次航行中就与冰山相撞沉没,导致了大部分乘客和船员身亡。在这个入门项目中,我们将探索部分泰坦尼克号旅客名单,来确定哪些特征可以最好地预测一个人是否会生还。为了完成这个项目,你将需要实现几个基于条件的预测并回答下面的问题。我们将根据代码的完成度和对问题的解答来对你提交的项目的进行评估。 \n", 12 | "\n", 13 | "> **提示**:这样的文字将会指导你如何使用 iPython Notebook 来完成项目。" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "点击[这里](https://github.com/udacity/machine-learning/blob/master/projects/titanic_survival_exploration/Titanic_Survival_Exploration.ipynb)查看本文件的英文版本。" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# 开始\n", 28 | "\n", 29 | "当我们开始处理泰坦尼克号乘客数据时,会先导入我们需要的功能模块以及将数据加载到 `pandas` DataFrame。运行下面区域中的代码加载数据,并使用 `.head()` 函数显示前几项乘客数据。 \n", 30 | "\n", 31 | "> **提示**:你可以通过单击代码区域,然后使用键盘快捷键 **Shift+Enter** 或 **Shift+ Return** 来运行代码。或者在选择代码后使用**播放**(run cell)按钮执行代码。像这样的 MarkDown 文本可以通过双击编辑,并使用这些相同的快捷键保存。[Markdown](http://daringfireball.net/projects/markdown/syntax) 允许你编写易读的纯文本并且可以转换为 HTML。" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "\n", 45 | "# RMS Titanic data visualization code \n", 46 | "# 数据可视化代码\n", 47 | "from titanic_visualizations import survival_stats\n", 48 | "from IPython.display import display\n", 49 | "%matplotlib inline\n", 50 | "\n", 51 | "# Load the dataset \n", 52 | "# 加载数据集\n", 53 | "in_file = 'titanic_data.csv'\n", 54 | "full_data = pd.read_csv(in_file)\n", 55 | "\n", 56 | "# Print the first few entries of the RMS Titanic data \n", 57 | "# 显示数据列表中的前几项乘客数据\n", 58 | "display(full_data.head())" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": 
{}, 64 | "source": [ 65 | "从泰坦尼克号的数据样本中,我们可以看到船上每位旅客的特征\n", 66 | "\n", 67 | "- **Survived**:是否存活(0代表否,1代表是)\n", 68 | "- **Pclass**:社会阶级(1代表上层阶级,2代表中层阶级,3代表底层阶级)\n", 69 | "- **Name**:船上乘客的名字\n", 70 | "- **Sex**:船上乘客的性别\n", 71 | "- **Age**:船上乘客的年龄(可能存在 `NaN`)\n", 72 | "- **SibSp**:乘客在船上的兄弟姐妹和配偶的数量\n", 73 | "- **Parch**:乘客在船上的父母以及小孩的数量\n", 74 | "- **Ticket**:乘客船票的编号\n", 75 | "- **Fare**:乘客为船票支付的费用\n", 76 | "- **Cabin**:乘客所在船舱的编号(可能存在 `NaN`)\n", 77 | "- **Embarked**:乘客上船的港口(C 代表从 Cherbourg 登船,Q 代表从 Queenstown 登船,S 代表从 Southampton 登船)\n", 78 | "\n", 79 | "因为我们感兴趣的是每个乘客或船员是否在事故中活了下来。可以将 **Survived** 这一特征从这个数据集移除,并且用一个单独的变量 `outcomes` 来存储。它也做为我们要预测的目标。\n", 80 | "\n", 81 | "运行该代码,从数据集中移除 **Survived** 这个特征,并将它存储在变量 `outcomes` 中。" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "# Store the 'Survived' feature in a new variable and remove it from the dataset \n", 93 | "# 从数据集中移除 'Survived' 这个特征,并将它存储在一个新的变量中。\n", 94 | "outcomes = full_data['Survived']\n", 95 | "data = full_data.drop('Survived', axis = 1)\n", 96 | "\n", 97 | "# Show the new dataset with 'Survived' removed\n", 98 | "# 显示已移除 'Survived' 特征的数据集\n", 99 | "display(data.head())" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "这个例子展示了如何将泰坦尼克号的 **Survived** 数据从 DataFrame 移除。注意到 `data`(乘客数据)和 `outcomes` (是否存活)现在已经匹配好。这意味着对于任何乘客的 `data.loc[i]` 都有对应的存活的结果 `outcome[i]`。\n", 107 | "\n", 108 | "为了验证我们预测的结果,我们需要一个标准来给我们的预测打分。因为我们最感兴趣的是我们预测的**准确率**,既正确预测乘客存活的比例。运行下面的代码来创建我们的 `accuracy_score` 函数以对前五名乘客的预测来做测试。\n", 109 | "\n", 110 | "**思考题**:从第六个乘客算起,如果我们预测他们全部都存活,你觉得我们预测的准确率是多少?" 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "def accuracy_score(truth, pred):\n", 122 | " \"\"\" Returns accuracy score for input truth and predictions. \"\"\"\n", 123 | " \n", 124 | " # Ensure that the number of predictions matches number of outcomes\n", 125 | " # 确保预测的数量与结果的数量一致\n", 126 | " if len(truth) == len(pred): \n", 127 | " \n", 128 | " # Calculate and return the accuracy as a percent\n", 129 | " # 计算预测准确率(百分比)\n", 130 | " return \"Predictions have an accuracy of {:.2f}%.\".format((truth == pred).mean()*100)\n", 131 | " \n", 132 | " else:\n", 133 | " return \"Number of predictions does not match number of outcomes!\"\n", 134 | " \n", 135 | "# Test the 'accuracy_score' function\n", 136 | "# 测试 'accuracy_score' 函数\n", 137 | "predictions = pd.Series(np.ones(5, dtype = int))\n", 138 | "print accuracy_score(outcomes[:5], predictions)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "> **提示**:如果你保存 iPython Notebook,代码运行的输出也将被保存。但是,一旦你重新打开项目,你的工作区将会被重置。请确保每次都从上次离开的地方运行代码来重新生成变量和函数。\n", 146 | "\n", 147 | "# 预测\n", 148 | "\n", 149 | "如果我们要预测泰坦尼克号上的乘客是否存活,但是我们又对他们一无所知,那么最好的预测就是船上的人无一幸免。这是因为,我们可以假定当船沉没的时候大多数乘客都遇难了。下面的 `predictions_0` 函数就预测船上的乘客全部遇难。 " 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "def predictions_0(data):\n", 161 | " \"\"\" Model with no features. Always predicts a passenger did not survive. 
\"\"\"\n", 162 | "\n", 163 | " predictions = []\n", 164 | " for _, passenger in data.iterrows():\n", 165 | " \n", 166 | " # Predict the survival of 'passenger'\n", 167 | " # 预测 'passenger' 的生还率\n", 168 | " predictions.append(0)\n", 169 | " \n", 170 | " # Return our predictions\n", 171 | " # 返回预测结果\n", 172 | " return pd.Series(predictions)\n", 173 | "\n", 174 | "# Make the predictions\n", 175 | "# 进行预测\n", 176 | "predictions = predictions_0(data)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### 问题1\n", 184 | "\n", 185 | "对比真实的泰坦尼克号的数据,如果我们做一个所有乘客都没有存活的预测,你认为这个预测的准确率能达到多少?\n", 186 | "\n", 187 | "**提示**:运行下面的代码来查看预测的准确率。" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "print accuracy_score(outcomes, predictions)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "**回答:** *请用上面出现的预测结果来替换掉这里的文字*" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "***\n", 213 | "我们可以使用 `survival_stats` 函数来看看 **Sex** 这一特征对乘客的存活率有多大影响。这个函数定义在名为 `titanic_visualizations.py` 的 Python 脚本文件中,我们的项目提供了这个文件。传递给函数的前两个参数分别是泰坦尼克号的乘客数据和乘客的 生还结果。第三个参数表明我们会依据哪个特征来绘制图形。\n", 214 | "\n", 215 | "运行下面的代码绘制出依据乘客性别计算存活率的柱形图。 " 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "survival_stats(data, outcomes, 'Sex')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "观察泰坦尼克号上乘客存活的数据统计,我们可以发现大部分男性乘客在船沉没的时候都遇难了。相反的,大部分女性乘客都在事故中**生还**。让我们在先前推断的基础上继续创建:如果乘客是男性,那么我们就预测他们遇难;如果乘客是女性,那么我们预测他们在事故中活了下来。\n", 234 | "\n", 235 | "将下面的代码补充完整,让函数可以进行正确预测。 \n", 236 | "\n", 237 | "**提示**:您可以用访问 dictionary(字典)的方法来访问船上乘客的每个特征对应的值。例如, `passenger['Sex']` 
返回乘客的性别。" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "def predictions_1(data):\n", 249 | " \"\"\" Model with one feature: \n", 250 | " - Predict a passenger survived if they are female. \"\"\"\n", 251 | " \n", 252 | " predictions = []\n", 253 | " for _, passenger in data.iterrows():\n", 254 | " \n", 255 | " # Remove the 'pass' statement below \n", 256 | " # 移除下方的 'pass' 声明\n", 257 | " # and write your prediction conditions here\n", 258 | " # 输入你自己的预测条件\n", 259 | " pass\n", 260 | " \n", 261 | " # Return our predictions\n", 262 | " # 返回预测结果\n", 263 | " return pd.Series(predictions)\n", 264 | "\n", 265 | "# Make the predictions\n", 266 | "# 进行预测\n", 267 | "predictions = predictions_1(data)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "### 问题2\n", 275 | "当我们预测船上女性乘客全部存活,而剩下的人全部遇难,那么我们预测的准确率会达到多少?\n", 276 | "\n", 277 | "**提示**:运行下面的代码来查看我们预测的准确率。 " 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "print accuracy_score(outcomes, predictions)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "**回答**: *用上面出现的预测结果来替换掉这里的文字*" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "***\n", 303 | "仅仅使用乘客性别(Sex)这一特征,我们预测的准确性就有了明显的提高。现在再看一下使用额外的特征能否更进一步提升我们的预测准确度。例如,综合考虑所有在泰坦尼克号上的男性乘客:我们是否找到这些乘客中的一个子集,他们的存活概率较高。让我们再次使用 `survival_stats` 函数来看看每位男性乘客的年龄(Age)。这一次,我们将使用第四个参数来限定柱形图中只有男性乘客。\n", 304 | "\n", 305 | "运行下面这段代码,把男性基于年龄的生存结果绘制出来。" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "survival_stats(data, outcomes, 'Age', [\"Sex 
== 'male'\"])" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "source": [ 325 | "仔细观察泰坦尼克号存活的数据统计,在船沉没的时候,大部分小于10岁的男孩都活着,而大多数10岁以上的男性都随着船的沉没而**遇难**。让我们继续在先前预测的基础上构建:如果乘客是女性,那么我们就预测她们全部存活;如果乘客是男性并且小于10岁,我们也会预测他们全部存活;所有其它我们就预测他们都没有幸存。 \n", 326 | "\n", 327 | "将下面缺失的代码补充完整,让我们的函数可以实现预测。 \n", 328 | "**提示**: 您可以用之前 `predictions_1` 的代码作为开始来修改代码,实现新的预测函数。" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "def predictions_2(data):\n", 340 | " \"\"\" Model with two features: \n", 341 | " - Predict a passenger survived if they are female.\n", 342 | " - Predict a passenger survived if they are male and younger than 10. \"\"\"\n", 343 | " \n", 344 | " predictions = []\n", 345 | " for _, passenger in data.iterrows():\n", 346 | " \n", 347 | " # Remove the 'pass' statement below \n", 348 | " # 移除下方的 'pass' 声明\n", 349 | " # and write your prediction conditions here\n", 350 | " # 输入你自己的预测条件\n", 351 | " pass\n", 352 | " \n", 353 | " # Return our predictions\n", 354 | " # 返回预测结果\n", 355 | " return pd.Series(predictions)\n", 356 | "\n", 357 | "# Make the predictions\n", 358 | "# 进行预测\n", 359 | "predictions = predictions_2(data)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### 问题3\n", 367 | "\n", 368 | "当预测所有女性以及小于10岁的男性都存活的时候,预测的准确率会达到多少?\n", 369 | "\n", 370 | "**提示:**运行下面的代码来查看预测的准确率。" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "print accuracy_score(outcomes, predictions)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "**回答**: *用上面出现的预测结果来替换掉这里的文字*" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | 
"collapsed": true 395 | }, 396 | "source": [ 397 | "***\n", 398 | "添加年龄(Age)特征与性别(Sex)的结合比单独使用性别(Sex)也提高了不少准确度。现在该你来做预测了:找到一系列的特征和条件来对数据进行划分,使得预测结果提高到80%以上。这可能需要多个特性和多个层次的条件语句才会成功。你可以在不同的条件下多次使用相同的特征。**Pclass**,**Sex**,**Age**,**SibSp** 和 **Parch** 是建议尝试使用的特征。 \n", 399 | "\n", 400 | "使用 `survival_stats` 函数来观测泰坦尼克号上乘客存活的数据统计。 \n", 401 | "**提示:** 要使用多个过滤条件,把每一个条件放在一个列表里作为最后一个参数传递进去。例如: `[\"Sex == 'male'\", \"Age < 18\"]`" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "survival_stats(data, outcomes, 'Age', [\"Sex == 'male'\", \"Age < 18\"])" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "当查看和研究了图形化的泰坦尼克号上乘客的数据统计后,请补全下面这段代码中缺失的部分,使得函数可以返回你的预测。 \n", 420 | "在到达最终的预测模型前请确保记录你尝试过的各种特征和条件。 \n", 421 | "**提示:** 您可以用之前 `predictions_2` 的代码作为开始来修改代码,实现新的预测函数。" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "def predictions_3(data):\n", 433 | " \"\"\" Model with multiple features. Makes a prediction with an accuracy of at least 80%. 
\"\"\"\n", 434 | " \n", 435 | " predictions = []\n", 436 | " for _, passenger in data.iterrows():\n", 437 | " \n", 438 | " # Remove the 'pass' statement below \n", 439 | " # and write your prediction conditions here\n", 440 | " pass\n", 441 | " \n", 442 | " # Return our predictions\n", 443 | " return pd.Series(predictions)\n", 444 | "\n", 445 | "# Make the predictions\n", 446 | "predictions = predictions_3(data)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "### 结论\n", 454 | "\n", 455 | "请描述你实现80%准确度的预测模型所经历的步骤。您观察过哪些特征?某些特性是否比其他特征更有帮助?你用了什么条件来预测生还结果?你最终的预测的准确率是多少?\n", 456 | "**提示:**运行下面的代码来查看你的预测准确度。" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "print accuracy_score(outcomes, predictions)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "**回答**: *用上面问题的答案来替换掉这里的文字*" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# 结论\n", 482 | "\n", 483 | "经过了数次对数据的探索和分类,你创建了一个预测泰坦尼克号乘客存活率的有用的算法。在这个项目中你手动地实现了一个简单的机器学习模型——决策树(decision tree)。决策树每次按照一个特征把数据分割成越来越小的群组(被称为 *nodes*)。每次数据的一个子集被分出来,如果分割结果的子集中的数据比之前更同质(包含近似的标签),我们的预测也就更加准确。电脑来帮助我们做这件事会比手动做更彻底,更精确。[这个链接](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/)提供了另一个使用决策树做机器学习入门的例子。 \n", 484 | "\n", 485 | "决策树是许多**监督学习**算法中的一种。在监督学习中,我们关心的是使用数据的特征并根据数据的结果标签进行预测或建模。也就是说,每一组数据都有一个真正的结果值,不论是像泰坦尼克号生存数据集一样的标签,或者是连续的房价预测。\n", 486 | "\n", 487 | "### 问题5\n", 488 | "\n", 489 | "想象一个真实世界中应用监督学习的场景,你期望预测的结果是什么?举出两个在这个场景中能够帮助你进行预测的数据集中的特征。" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "source": [ 498 | "**回答**: *用你的答案替换掉这里的文字*" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "> **注意**: 
def filter_data(data, condition):
    """
    Remove rows of `data` that do not match the condition provided.

    Takes a pandas DataFrame as input and returns the filtered DataFrame
    with its index reset.

    Parameters
    ----------
    data : pandas.DataFrame
        The data to filter.
    condition : str
        A single condition string of the format:
            '<field> <op> <value>'
        where the following operations are valid: >, <, >=, <=, ==, !=

        Example: "Sex == 'male'" or 'Age < 18'

    Raises
    ------
    Exception
        If the comparison operator is not one of the six supported ones.
    """

    # Split on the first two spaces only, so quoted string values that
    # themselves contain spaces (e.g. "Name == 'John Smith'") still parse
    # into exactly three parts.
    field, op, value = condition.split(" ", 2)

    # convert value into number or strip excess quotes if string
    try:
        value = float(value)
    except ValueError:  # not numeric -> treat as a (possibly quoted) string
        value = value.strip("\'\"")

    # get booleans for filtering
    if op == ">":
        matches = data[field] > value
    elif op == "<":
        matches = data[field] < value
    elif op == ">=":
        matches = data[field] >= value
    elif op == "<=":
        matches = data[field] <= value
    elif op == "==":
        matches = data[field] == value
    elif op == "!=":
        matches = data[field] != value
    else: # catch invalid operation codes
        raise Exception("Invalid comparison operator. Only >, <, >=, <=, ==, != allowed.")

    # filter data and reset the index so row positions stay contiguous
    data = data[matches].reset_index(drop = True)
    return data
def survival_stats(data, outcomes, key, filters = None):
    """
    Print out selected statistics regarding survival, given a feature of
    interest and any number of filters (including no filters).

    Parameters
    ----------
    data : pandas.DataFrame
        Passenger data (without the 'Survived' column).
    outcomes : pandas.Series
        Survival outcomes (0 = did not survive, 1 = survived), aligned
        row-for-row with `data`.
    key : str
        Column name of the feature to plot survival statistics for.
    filters : list of str, optional
        Filter conditions understood by `filter_data`,
        e.g. ["Sex == 'male'", "Age < 18"].

    Returns False (after printing a message) for an unknown or
    un-plottable feature; otherwise shows a matplotlib figure.
    """
    # Avoid the mutable-default-argument pitfall; behavior is unchanged
    # because the list is never mutated here.
    if filters is None:
        filters = []

    # Check that the key exists
    if key not in data.columns.values :
        print("'{}' is not a feature of the Titanic data. Did you spell something wrong?".format(key))
        return False

    # Return the function before visualizing if 'Cabin' or 'Ticket'
    # is selected: too many unique categories to display
    if(key == 'Cabin' or key == 'PassengerId' or key == 'Ticket'):
        print("'{}' has too many unique categories to display! Try a different feature.".format(key))
        return False

    # Merge data and outcomes into single dataframe
    all_data = pd.concat([data, outcomes], axis = 1)

    # Apply filters to data
    for condition in filters:
        all_data = filter_data(all_data, condition)

    # Create outcomes DataFrame
    all_data = all_data[[key, 'Survived']]

    # Record passengers with missing values BEFORE any NaN rows are
    # dropped, so the report at the end also fires for numeric features
    # such as 'Age' (previously the NaN rows were removed first, which
    # made the report unreachable for those features).
    nan_outcomes = all_data[pd.isnull(all_data[key])]['Survived']

    # Create plotting figure
    plt.figure(figsize=(8,6))

    # 'Numerical' features
    if(key == 'Age' or key == 'Fare'):

        # Remove NaN values from the feature data before binning
        all_data = all_data[~np.isnan(all_data[key])]

        # 'Fare' has a larger range of values than 'Age' so create wider bins
        if(key == 'Fare'):
            bins = np.arange(0, all_data['Fare'].max() + 20, 20)
        if(key == 'Age'):
            bins = np.arange(0, all_data['Age'].max() + 10, 10)

        # Overlay each bin's survival rates
        nonsurv_vals = all_data[all_data['Survived'] == 0][key].reset_index(drop = True)
        surv_vals = all_data[all_data['Survived'] == 1][key].reset_index(drop = True)
        plt.hist(nonsurv_vals, bins = bins, alpha = 0.6,
                 color = 'red', label = 'Did not survive')
        plt.hist(surv_vals, bins = bins, alpha = 0.6,
                 color = 'green', label = 'Survived')

        # Add legend to plot
        plt.xlim(0, bins.max())
        plt.legend(framealpha = 0.8)

    # 'Categorical' features
    else:

        # Set the various categories
        if(key == 'Pclass'):
            values = np.arange(1,4)
        if(key == 'Parch' or key == 'SibSp'):
            values = np.arange(0,np.max(data[key]) + 1)
        if(key == 'Embarked'):
            values = ['C', 'Q', 'S']
        if(key == 'Sex'):
            values = ['male', 'female']

        # Create DataFrame containing categories and count of each
        frame = pd.DataFrame(index = np.arange(len(values)), columns=(key,'Survived','NSurvived'))
        for i, value in enumerate(values):
            frame.loc[i] = [value, \
                   len(all_data[(all_data['Survived'] == 1) & (all_data[key] == value)]), \
                   len(all_data[(all_data['Survived'] == 0) & (all_data[key] == value)])]

        # Set the width of each bar
        bar_width = 0.4

        # Display each category's survival rates side by side
        for i in np.arange(len(frame)):
            nonsurv_bar = plt.bar(i-bar_width, frame.loc[i]['NSurvived'], width = bar_width, color = 'r')
            surv_bar = plt.bar(i, frame.loc[i]['Survived'], width = bar_width, color = 'g')

        plt.xticks(np.arange(len(frame)), values)
        plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8)

    # Common attributes for plot formatting
    plt.xlabel(key)
    plt.ylabel('Number of Passengers')
    plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
    plt.show()

    # Report number of passengers with missing values
    if len(nan_outcomes):
        print("Passengers with missing '{}' values: {} ({} survived, {} did not survive)".format( \
              key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0)))