├── .gitignore ├── .gitmodules ├── Content ├── DSConcepts.graffle ├── Explore │ ├── 01-Introduction.ipynb │ ├── 02-EDA.ipynb │ ├── 03-Interact.ipynb │ └── 04-Widgets.ipynb ├── Import │ ├── 01-Introduction.ipynb │ ├── 02-DataSoures.ipynb │ ├── 03-CSV.ipynb │ ├── 04-JSON.ipynb │ └── ArrowParquet.ipynb ├── Introduction │ ├── 01-Introduction.ipynb │ ├── 02-WhatIsDataScience.ipynb │ ├── 03-TheoryofData.ipynb │ └── images │ │ └── data_science_vd.png ├── Model │ ├── 01-Introduction.ipynb │ ├── 02-Probability.ipynb │ ├── 03-CommonDistributions.ipynb │ ├── 04-InformationTheory.ipynb │ ├── 05-ModellingOverview.ipynb │ ├── 06-EstimatorsBiasVariance.ipynb │ ├── 07-BootstrapResampling.ipynb │ ├── 08-MLE.ipynb │ ├── 09-LinearRegression.ipynb │ ├── 10-SpecificModels.ipynb │ ├── Integration.ipynb │ ├── Interpolation.ipynb │ ├── ODEs.ipynb │ ├── Scipy.ipynb │ └── images │ │ ├── rectangles.png │ │ └── trapz.png ├── Programming │ ├── 01-Introduction.ipynb │ ├── BasicPythonSyntax.ipynb │ ├── Iteration.ipynb │ ├── PythonPackages.ipynb │ └── StandardLibrary.ipynb ├── Retired │ ├── CustomDisplayLogic.ipynb │ ├── Matplotlib.ipynb │ ├── MatplotlibStyling.ipynb │ ├── PandasPlotting.ipynb │ ├── PlottingDistributions.ipynb │ ├── SFData.ipynb │ └── SFDeptPurchases.csv ├── Tidy │ ├── 01-Introduction.ipynb │ ├── 02-TidyData.ipynb │ └── CleanWrangle.ipynb ├── Transform │ ├── 01-Introduction.ipynb │ ├── 02-BasicDataTransformation.ipynb │ ├── 03-Numpy.ipynb │ ├── 04-Pandas.ipynb │ ├── 05-RelationalData.ipynb │ └── data │ │ └── Chinook_Sqlite.sqlite ├── Visualize │ ├── 01-Introduction.ipynb │ ├── 02-VisualizationGrammar.ipynb │ ├── 03-TidyData.ipynb │ ├── 04-ChartMarksEncodings.ipynb │ ├── 05-Transformation.ipynb │ ├── 06-CompoundCharts.ipynb │ ├── 07-Selections.ipynb │ ├── 08-SeattleWeather.ipynb │ ├── 09-TheoryAndPractice.ipynb │ └── images │ │ ├── column_syntax1.png │ │ ├── column_syntax2.png │ │ ├── encodings1.png │ │ ├── encodings2.png │ │ ├── mackinlay1.png │ │ ├── mackinlay2.png │ │ ├── 
marks.png │ │ ├── marks_encoding.png │ │ ├── measles_wsj.png │ │ ├── social_assistance_538.png │ │ └── viz_grammar.png └── Workflow │ ├── 01-Introduction.ipynb │ ├── 02-TheJupyterNotebook.ipynb │ ├── 03-NotebookBasics.ipynb │ ├── 04-RunningCode.ipynb │ ├── 05-IPythonBasics.ipynb │ ├── 06-Markdown.ipynb │ ├── 07-LaTeX.ipynb │ ├── 08-Display.ipynb │ ├── data │ ├── flare.json │ └── scrubjay.mp3 │ └── images │ ├── command_mode.png │ ├── dashboard_files_tab.png │ ├── dashboard_files_tab_btns.png │ ├── dashboard_files_tab_new.png │ ├── dashboard_files_tab_run.png │ ├── dashboard_running_tab.png │ ├── edit_mode.png │ ├── ipython-image.png │ ├── menubar_toolbar.png │ └── python-image.png ├── LICENSE.md ├── Makefile ├── README.md ├── Resources ├── Images │ └── cp_datascience_logo.png └── Notes │ ├── install.md │ └── teaching.md └── datacamp_logo.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | .ipynb_checkpoints 57 | .DS_Store 58 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Content/PythonDataScienceHandbook"] 2 | path = Content/PythonDataScienceHandbook 3 | url = https://github.com/jakevdp/PythonDataScienceHandbook.git 4 | [submodule "Content/WhirlwindTourOfPython"] 5 | path = Content/WhirlwindTourOfPython 6 | url = https://github.com/jakevdp/WhirlwindTourOfPython.git 7 | -------------------------------------------------------------------------------- /Content/Explore/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Explore" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objective:** learn how to explore data using visualization, basic data transformation and interaction." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Overview\n", 22 | "\n", 23 | "The *explore* phase of data science encompasses a couple of different topics:\n", 24 | "\n", 25 | "* Data visualization\n", 26 | "* Basic data transformation\n", 27 | "* Exploratory data analysis (EDA)\n", 28 | "* Interaction\n", 29 | "\n", 30 | "The first two of these are covered in the Visualize and Transform sections of the course. This section covers exploratory data analysis and interaction." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Outline\n", 38 | "\n", 39 | "* [Data Visualization](../Visualize/01-Introduction.ipynb)\n", 40 | "* [Basic Data Transformation](../Transform/02-BasicDataTransformation.ipynb)\n", 41 | "* [Exploratory Data Analysis (EDA)](02-EDA.ipynb)\n", 42 | "* [Interact](03-Interact.ipynb)\n", 43 | "* [Widgets](04-Widgets.ipynb)" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.5.2" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 2 68 | } 69 | -------------------------------------------------------------------------------- /Content/Explore/03-Interact.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Interact" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The `interact` function from `ipywidgets` automatically creates a graphical user interface (GUI) for exploring code and data interactively. It is the easiest way to get started using IPython's widgets." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from ipywidgets import interact, interactive, fixed\n", 26 | "import ipywidgets as widgets" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Basic `interact`" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "At the most basic level, `interact` autogenerates UI controls for function arguments, and then calls the function with those arguments when you manipulate the controls interactively. To use `interact`, you need to define a function that you want to explore. Here is a function that prints its only argument `x`." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "def f(x):\n", 52 | " print(x)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "When you pass this function as the first argument to `interact` along with an integer keyword argument (`x=10`), a slider is generated and bound to the function." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "10\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "interact(f, x=10);" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "When you move the slider, the function is called and the current value of `x` is printed.\n", 86 | "\n", 87 | "If you pass `True` or `False`, `interact` will generate a checkbox:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "True\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "interact(f, x=True);" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "If you pass a string, `interact` will generate a text area." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Hi there!\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "interact(f, x='Hi there!');" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "`interact` can also be used as a decorator. This allows you to define a function and interact with it in a single shot. As this example shows, `interact` also works with functions that have multiple arguments." 
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "True 1.1\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "@interact(x=True, y=1.0)\n", 159 | "def g(x, y):\n", 160 | " print(x, y)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Fixing arguments using `fixed`" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "There are times when you may want to explore a function using `interact`, but fix one or more of its arguments to specific values. This can be accomplished by wrapping values with the `fixed` function." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "def h(p, q):\n", 186 | " print(p, q)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "When we call `interact`, we pass `fixed(20)` for q to hold it fixed at a value of `20`." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "8 20\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "interact(h, p=5, q=fixed(20));" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Notice that a slider is only produced for `p` as the value of `q` is fixed." 
220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Widget abbreviations" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "When you pass an integer valued keyword argument (`x=10`) to `interact`, it generates an integer valued slider control with a range of $[-10,+3\\times10]$. In this case `10` is an *abbreviation* for an actual slider widget:\n", 234 | "\n", 235 | "```python\n", 236 | "IntSlider(min=-10,max=30,step=1,value=10)\n", 237 | "```\n", 238 | "\n", 239 | "In fact, we can get the same result if we pass this `IntSlider` as the keyword argument for `x`:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "14\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "interact(f, x=widgets.IntSlider(min=-10,max=30,step=1,value=10));" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "This examples clarifies how `interact` proceses its keyword arguments:\n", 266 | "\n", 267 | "1. If the keyword argument is `Widget` instance with a `value` attribute, that widget is used. Any widget with a `value` attribute can be used, even custom ones.\n", 268 | "2. Otherwise, the value is treated as a *widget abbreviation* that is converted to a widget before it is used.\n", 269 | "\n", 270 | "The following table gives an overview of different widget abbreviations:\n", 271 | "\n", 272 | "\n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
| Keyword argument | Widget |
| --- | --- |
| `True` or `False` | Checkbox |
| `'Hi there'` | Textarea |
| `value` or `(min,max)` or `(min,max,step)` if integers are passed | IntSlider |
| `value` or `(min,max)` or `(min,max,step)` if floats are passed | FloatSlider |
| `('orange','apple')` or `{'one':1,'two':2}` | Dropdown |
" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "You have seen how the checkbox and textarea widgets work above. Here, more details about the different abbreviations for sliders and dropdowns are given.\n", 287 | "\n", 288 | "If a 2-tuple of integers is passed `(min,max)` a integer valued slider is produced with those minimum and maximum (inclusive) values. In this case, the default step size of `1` is used." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 13, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "2\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "interact(f, x=(0,4));" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "If a 3-tuple of integers is passed `(min,max,step)` the step size can also be set." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 14, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [ 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "4\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "interact(f, x=(0,8,2));" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "A float valued slider is produced if the elements of the tuples are floats. Here the minimum is `0.0`, the maximum is `10.0` and step size is `0.1` (the default)." 
341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 15, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "5.0\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "interact(f, x=(0.0,10.0));" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "The step size can be changed by passing a 3rd element in the tuple." 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 16, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "4.99\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "interact(f, x=(0.0,10.0,0.01));" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "For both integer and float valued sliders, you can pick the initial value of the widget by passing a default keyword argument to the underlying Python function. Here we set the initial value of a float slider to `5.5`." 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 17, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "5.5\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "def h(x=5.5):\n", 412 | " print(x)\n", 413 | " \n", 414 | "interact(h, x=(0.0,20.0,0.5));" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "Dropdown menus can be produced by passing a tuple of strings. In this case, the strings are both used as the names in the dropdown menu UI and passed to the underlying Python function." 
422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 18, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "apples\n" 436 | ] 437 | } 438 | ], 439 | "source": [ 440 | "interact(f, x=('apples','oranges'));" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "If you want a dropdown menu that passes non-string values to the Python function, you can pass a dictionary. The keys in the dictionary are used for the names in the dropdown menu UI and the values are the arguments that are passed to the underlying Python function." 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 19, 453 | "metadata": { 454 | "collapsed": false 455 | }, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "10\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "interact(f, x={'one': 10, 'two': 20});" 467 | ] 468 | } 469 | ], 470 | "metadata": { 471 | "kernelspec": { 472 | "display_name": "Python 3", 473 | "language": "python", 474 | "name": "python3" 475 | }, 476 | "language_info": { 477 | "codemirror_mode": { 478 | "name": "ipython", 479 | "version": 3 480 | }, 481 | "file_extension": ".py", 482 | "mimetype": "text/x-python", 483 | "name": "python", 484 | "nbconvert_exporter": "python", 485 | "pygments_lexer": "ipython3", 486 | "version": "3.5.2" 487 | }, 488 | "widgets": { 489 | "state": { 490 | "12b16c68f2914a4291d51db7203555a1": { 491 | "views": [ 492 | { 493 | "cell_index": 13 494 | } 495 | ] 496 | }, 497 | "3c0af837ca2a4d5aa1cf2996908402e8": { 498 | "views": [ 499 | { 500 | "cell_index": 9 501 | } 502 | ] 503 | }, 504 | "668446ab7c9d4187b1b4ca248fd0a8b9": { 505 | "views": [ 506 | { 507 | "cell_index": 7 508 | } 509 | ] 510 | }, 511 | "6ab2cbf9a9ca424e9b8785a8371089b3": { 512 | "views": [ 513 | { 514 | "cell_index": 37 515 | } 516 
| ] 517 | }, 518 | "71eb773a464a49e5899dc65c241ad530": { 519 | "views": [ 520 | { 521 | "cell_index": 25 522 | } 523 | ] 524 | }, 525 | "872ae1f83c3845a08ea6e6153d0abef5": { 526 | "views": [ 527 | { 528 | "cell_index": 35 529 | } 530 | ] 531 | }, 532 | "8d01aaafa4d0454caa040f046733f7d2": { 533 | "views": [ 534 | { 535 | "cell_index": 27 536 | } 537 | ] 538 | }, 539 | "baaaade02e6649db88095b7033d56a7b": { 540 | "views": [ 541 | { 542 | "cell_index": 18 543 | } 544 | ] 545 | }, 546 | "bf65c7733808421b99a209e45f37aa77": { 547 | "views": [ 548 | { 549 | "cell_index": 31 550 | } 551 | ] 552 | }, 553 | "d6a7da524dfb4c3d82fb30b472cb9523": { 554 | "views": [ 555 | { 556 | "cell_index": 11 557 | } 558 | ] 559 | }, 560 | "e2c9287fd5074579be4244f08b3c12a6": { 561 | "views": [ 562 | { 563 | "cell_index": 22 564 | } 565 | ] 566 | }, 567 | "e7d9e5e1536e424f9fd1d8715f249a1b": { 568 | "views": [ 569 | { 570 | "cell_index": 33 571 | } 572 | ] 573 | }, 574 | "ebcd9f05630d4232b2da96ade64c2813": { 575 | "views": [ 576 | { 577 | "cell_index": 29 578 | } 579 | ] 580 | } 581 | }, 582 | "version": "1.2.0" 583 | } 584 | }, 585 | "nbformat": 4, 586 | "nbformat_minor": 0 587 | } 588 | -------------------------------------------------------------------------------- /Content/Explore/04-Widgets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Widgets" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Using widgets " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "slideshow": { 21 | "slide_type": "slide" 22 | } 23 | }, 24 | "source": [ 25 | "To use the widget framework, you need to **import `IPython.html.widgets`**." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "from ipywidgets import *" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "slideshow": { 43 | "slide_type": "slide" 44 | } 45 | }, 46 | "source": [ 47 | "### repr" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Widgets have their own display `repr` which allows them to be displayed using IPython's display framework. Constructing and returning an `IntSlider` automatically displays the widget (as seen below). Widgets are **displayed inside the `widget area`**, which sits between the code cell and output. **You can hide all of the widgets** in the `widget area` by clicking the grey *x* in the margin." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "IntSlider()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "slideshow": { 72 | "slide_type": "slide" 73 | } 74 | }, 75 | "source": [ 76 | "### display()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "You can also explicitly display the widget using `display(...)`." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from IPython.display import display\n", 95 | "w = IntSlider()\n", 96 | "display(w)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "### Multiple display() calls" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "If you display the same widget twice, the displayed instances in the front-end **will remain in sync** with each other." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "display(w)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Why does displaying the same widget twice work?" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": { 138 | "slideshow": { 139 | "slide_type": "slide" 140 | } 141 | }, 142 | "source": [ 143 | "Widgets are **represented in the back-end by a single object**. Each time a widget is displayed, **a new representation** of that same object is created in the front-end. These representations are called **views**." 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "slideshow": { 150 | "slide_type": "slide" 151 | } 152 | }, 153 | "source": [ 154 | "### Closing widgets" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "You can close a widget by calling its `close()` method." 
162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "display(w)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "w.close()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Widget properties" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "slide" 198 | } 199 | }, 200 | "source": [ 201 | "All of the IPython widgets **share a similar naming scheme**. To read the value of a widget, you can query its `value` property." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "w = IntSlider()\n", 213 | "display(w)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 8, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "0" 227 | ] 228 | }, 229 | "execution_count": 8, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "w.value" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "Similarly, to set a widget's value, you can set its `value` property." 
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "w.value = 100" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "slideshow": { 260 | "slide_type": "slide" 261 | } 262 | }, 263 | "source": [ 264 | "### Keys" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "In addition to `value`, most widgets share `keys`, `description`, `disabled`, and `visible`. To see the entire list of synchronized, stateful properties, of any specific widget, you can **query the `keys` property**." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 10, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "['max',\n", 285 | " 'font_style',\n", 286 | " '_view_name',\n", 287 | " 'font_size',\n", 288 | " 'continuous_update',\n", 289 | " 'readout',\n", 290 | " '_dom_classes',\n", 291 | " 'visible',\n", 292 | " '_view_module',\n", 293 | " 'msg_throttle',\n", 294 | " 'description',\n", 295 | " 'value',\n", 296 | " '_model_module',\n", 297 | " 'disabled',\n", 298 | " 'step',\n", 299 | " '_range',\n", 300 | " '_model_name',\n", 301 | " 'min',\n", 302 | " 'font_weight',\n", 303 | " 'slider_color',\n", 304 | " 'orientation',\n", 305 | " 'font_family',\n", 306 | " 'readout_format',\n", 307 | " 'background_color',\n", 308 | " 'layout',\n", 309 | " 'color']" 310 | ] 311 | }, 312 | "execution_count": 10, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "w.keys" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### Shorthand for setting the initial values of widget properties" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": { 331 | "slideshow": { 332 | "slide_type": 
"slide" 333 | } 334 | }, 335 | "source": [ 336 | "While creating a widget, you can set some or all of the initial values of that widget by **defining them as keyword arguments in the widget's constructor** (as seen below)." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "Text(value='Hello World!', disabled=True)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Linking two similar widgets" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "slide" 362 | } 363 | }, 364 | "source": [ 365 | "If you need to display the same value two different ways, you'll have to use two different widgets. Instead of **attempting to manually synchronize the values** of the two widgets, you can use the `traitlet` `link` function **to link two properties together**. Below, the values of three widgets are linked together." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 14, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "import traitlets\n", 377 | "a = FloatText()\n", 378 | "b = FloatSlider()\n", 379 | "\n", 380 | "mylink = traitlets.link((a, 'value'), (b, 'value'))\n", 381 | "display(a)\n", 382 | "display(b)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "### Unlinking widgets" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": { 395 | "slideshow": { 396 | "slide_type": "slide" 397 | } 398 | }, 399 | "source": [ 400 | "Unlinking the widgets is simple. All you have to do is call `.unlink` on the link object." 
401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 15, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "mylink.unlink()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "# Widget List" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "## Complete list" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "slideshow": { 432 | "slide_type": "slide" 433 | } 434 | }, 435 | "source": [ 436 | "For a complete list of the widgets available to you, you can list the classes in the widget namespace (as seen below). `Widget` and `DOMWidget`, not listed below, are base classes." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 17, 442 | "metadata": { 443 | "collapsed": false 444 | }, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "['Jupyter.Accordion',\n", 450 | " 'Jupyter.BoundedFloatText',\n", 451 | " 'Jupyter.BoundedIntText',\n", 452 | " 'Jupyter.Box',\n", 453 | " 'Jupyter.Button',\n", 454 | " 'Jupyter.Checkbox',\n", 455 | " 'Jupyter.ColorPicker',\n", 456 | " 'Jupyter.Controller',\n", 457 | " 'Jupyter.ControllerAxis',\n", 458 | " 'Jupyter.ControllerButton',\n", 459 | " 'Jupyter.Dropdown',\n", 460 | " 'Jupyter.FlexBox',\n", 461 | " 'Jupyter.FloatProgress',\n", 462 | " 'Jupyter.FloatRangeSlider',\n", 463 | " 'Jupyter.FloatSlider',\n", 464 | " 'Jupyter.FloatText',\n", 465 | " 'Jupyter.HTML',\n", 466 | " 'Jupyter.Image',\n", 467 | " 'Jupyter.IntProgress',\n", 468 | " 'Jupyter.IntRangeSlider',\n", 469 | " 'Jupyter.IntSlider',\n", 470 | " 'Jupyter.IntText',\n", 471 | " 'Jupyter.Label',\n", 472 | " 'Jupyter.PlaceProxy',\n", 473 | " 'Jupyter.Play',\n", 474 | " 'Jupyter.Proxy',\n", 475 | " 'Jupyter.RadioButtons',\n", 476 | " 'Jupyter.Select',\n", 477 | " 'Jupyter.SelectMultiple',\n", 478 | " 'Jupyter.SelectionSlider',\n", 
479 | " 'Jupyter.Tab',\n", 480 | " 'Jupyter.Text',\n", 481 | " 'Jupyter.Textarea',\n", 482 | " 'Jupyter.ToggleButton',\n", 483 | " 'Jupyter.ToggleButtons',\n", 484 | " 'Jupyter.Valid',\n", 485 | " 'jupyter.DirectionalLink',\n", 486 | " 'jupyter.Link']" 487 | ] 488 | }, 489 | "execution_count": 17, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "sorted(widgets.Widget.widget_types)" 496 | ] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.5.2" 516 | }, 517 | "widgets": { 518 | "state": { 519 | "59b9b0f220604d2484e6a11e9308d326": { 520 | "views": [ 521 | { 522 | "cell_index": 30 523 | } 524 | ] 525 | }, 526 | "9378cea209784112abbd9f6a6c0749f0": { 527 | "views": [ 528 | { 529 | "cell_index": 6 530 | } 531 | ] 532 | }, 533 | "a00a16bfbe104e6ba768db753d179d3d": { 534 | "views": [ 535 | { 536 | "cell_index": 33 537 | } 538 | ] 539 | }, 540 | "a2aa05af9cfd4ee2a6fe4d04c2455345": { 541 | "views": [ 542 | { 543 | "cell_index": 33 544 | } 545 | ] 546 | }, 547 | "a4ca1196f6e24ffd80adca7312637c90": { 548 | "views": [ 549 | { 550 | "cell_index": 33 551 | } 552 | ] 553 | }, 554 | "b74b0cd81ba042d28ce6a4b223891b7f": { 555 | "views": [ 556 | { 557 | "cell_index": 21 558 | } 559 | ] 560 | }, 561 | "c724840d79b14e26a253e5edcb3d8582": { 562 | "views": [ 563 | { 564 | "cell_index": 33 565 | } 566 | ] 567 | } 568 | }, 569 | "version": "1.2.0" 570 | } 571 | }, 572 | "nbformat": 4, 573 | "nbformat_minor": 0 574 | } 575 | -------------------------------------------------------------------------------- /Content/Import/01-Introduction.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.2" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /Content/Import/02-DataSoures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Interesting Datasets" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook lists some well known and interesting data sets that are free or nearly free. This list is not by any means comprehensive, but is a good start." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### usa.gov bit.ly data\n", 22 | "\n", 23 | "* http://www.usa.gov/About/developer-resources/1usagov.shtml\n", 24 | "* http://1usagov.measuredvoice.com/2013/\n", 25 | "\n", 26 | "The U.S government and https://bitly.com/ provide a data set of clicks on the different U.S government web sites." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### MovieLens\n", 34 | "\n", 35 | "http://grouplens.org/datasets/movielens/\n", 36 | "\n", 37 | "The MovieLens data set consists of large collections of movie ratings from the MovieLens web site http://movielens.org/." 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Armchair Analysis\n", 45 | "\n", 46 | "http://armchairanalysis.com/\n", 47 | "\n", 48 | "Armchair Analysis provides high quality, play-by-play data for all NFL football games." 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### SNAP (Stanford Large Network Dataset Collection)\n", 56 | "\n", 57 | "http://snap.stanford.edu/data/\n", 58 | "\n", 59 | "SNAP provides a number of large network data sets from social networks, citation networks, collaboration networks, etc." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### GDELT\n", 67 | "\n", 68 | "http://gdeltproject.org/\n", 69 | "\n", 70 | "> Supported by Google Ideas, the GDELT Project monitors the world's broadcast, \n", 71 | "> print, and web news from nearly every corner of every country in over \n", 72 | "> 100 languages and identifies the people, locations, organizations, counts, \n", 73 | "> themes, sources, and events driving our global society every second of every day, \n", 74 | "> creating a free open platform for computing on the entire world." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Popular Baby Names\n", 82 | "\n", 83 | "http://www.ssa.gov/oact/babynames/\n", 84 | "\n", 85 | "The U.S Social Security Administration maintains a database of the most popular baby names over time."
86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Quandl\n", 93 | "\n", 94 | "https://www.quandl.com/\n", 95 | "\n", 96 | "Quandl is a (mostly) free and open data aggregator. It offers over 10 million data sets on a wide range of topics in a wide range of formats (JSON, CSV, Excel, REST API, Python, R, etc.)." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Home Mortgage Disclosure Act (HMDA)\n", 104 | "\n", 105 | "http://www.ffiec.gov/hmda/hmdaproducts.htm\n", 106 | "\n", 107 | "The HMDA provides extensive data related to home mortgages in the U.S." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### ASA Data Exposition\n", 115 | "\n", 116 | "http://stat-computing.org/dataexpo/\n", 117 | "\n", 118 | "The American Statistical Association runs a bi-annual Data Exposition. Each Data Expo focuses on a single data set and the historical data sets are available. In particular the data set of historical airline flight information is interesting:\n", 119 | "\n", 120 | "http://stat-computing.org/dataexpo/2009/" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Kaggle Datasets\n", 128 | "\n", 129 | "https://www.kaggle.com/datasets\n", 130 | "\n", 131 | "Kaggle runs Data Science competitions and has a number of interesting data sets." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Yelp Data Set Challenge\n", 139 | "\n", 140 | "http://www.yelp.com/dataset_challenge\n", 141 | "\n", 142 | "Yelp provides a medium sized data set that has users, reviews, businesses, etc."
143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### US Census Data\n", 150 | "\n", 151 | "http://www.census.gov/\n", 152 | "\n", 153 | "The US Census Bureau offers access to all of the US census datasets. There is a ton of useful data here that can be combined with data from other sources. However, the data is a challenge to find, understand and work with." 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Twitter API\n", 161 | "\n", 162 | "https://dev.twitter.com/rest/public\n", 163 | "https://dev.twitter.com/streaming/overview\n", 164 | "\n", 165 | "Twitter offers a REST API for streaming and a small amount of historical data. There are limits to the amount of data you can gather for free using these APIs. While companies like GNIP (now part of Twitter) offer access to the full Twitter data set, it is horribly expensive." 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### New York Times API\n", 173 | "\n", 174 | "http://developer.nytimes.com/docs\n", 175 | "\n", 176 | "The New York Times offers a free REST API for querying all manner of data related to the New York Times media properties." 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Facebook API\n", 184 | "\n", 185 | "https://developers.facebook.com/\n", 186 | "\n", 187 | "Facebook offers a rich REST API." 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### San Francisco\n", 195 | "\n", 196 | "https://data.sfgov.org/\n", 197 | "\n", 198 | "The city of San Francisco has a large number of data sets related to the city."
199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Awesome Public Datasets\n", 206 | "\n", 207 | "This GitHub repository has an incredibly complete list of public datasets:\n", 208 | "\n", 209 | "https://github.com/caesar0301/awesome-public-datasets" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### BuzzFeedNews\n", 217 | "\n", 218 | "https://github.com/BuzzFeedNews/everything\n", 219 | "\n", 220 | "The Data Science team at [BuzzFeedNews](http://www.buzzfeed.com/news) publishes the datasets and code from all their stories on GitHub.\n", 221 | "\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### 538\n", 229 | "\n", 230 | "https://github.com/fivethirtyeight/data\n", 231 | "\n", 232 | "[538](https://github.com/fivethirtyeight/data) also publishes their datasets from many of their articles on Github." 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Stack Exchange Data Dump\n", 240 | "\n", 241 | "https://archive.org/details/stackexchange\n", 242 | "\n", 243 | "This contains a large data dump of [Stack Exchange](http://stackexchange.com/) as of 1/1/2016." 
244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.4.3" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 0 268 | } 269 | -------------------------------------------------------------------------------- /Content/Import/ArrowParquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Arrow/Parquet" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 12, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import altair as alt" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import pyarrow.parquet as pq\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "import pyarrow as pa" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Overview" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Arrow is an efficient in-memory tabular data structure.\n", 50 | "\n", 51 | "Parquet is an efficient file format for saving tables to disk.\n", 52 | "\n", 53 | "Why use them:\n", 54 | "\n", 55 | "* They are much faster and more space efficient than CSV/JSON.\n", 56 | "* They preserve the column types of DataFrames.\n", 57 | "\n", 58 | "https://arrow.apache.org/docs/python/parquet.html" 59 | ] 60 | }, 61 | { 62 | "cell_type":
"markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Write a DataFrame to a parquet file" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 14, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "cars = alt.load_dataset('cars')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 15, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | "
AccelerationCylindersDisplacementHorsepowerMiles_per_GallonNameOriginWeight_in_lbsYear
012.08307.0130.018.0chevrolet chevelle malibuUSA35041970-01-01
111.58350.0165.015.0buick skylark 320USA36931970-01-01
211.08318.0150.018.0plymouth satelliteUSA34361970-01-01
312.08304.0150.016.0amc rebel sstUSA34331970-01-01
410.58302.0140.017.0ford torinoUSA34491970-01-01
\n", 177 | "
" 178 | ], 179 | "text/plain": [ 180 | " Acceleration Cylinders Displacement Horsepower Miles_per_Gallon \\\n", 181 | "0 12.0 8 307.0 130.0 18.0 \n", 182 | "1 11.5 8 350.0 165.0 15.0 \n", 183 | "2 11.0 8 318.0 150.0 18.0 \n", 184 | "3 12.0 8 304.0 150.0 16.0 \n", 185 | "4 10.5 8 302.0 140.0 17.0 \n", 186 | "\n", 187 | " Name Origin Weight_in_lbs Year \n", 188 | "0 chevrolet chevelle malibu USA 3504 1970-01-01 \n", 189 | "1 buick skylark 320 USA 3693 1970-01-01 \n", 190 | "2 plymouth satellite USA 3436 1970-01-01 \n", 191 | "3 amc rebel sst USA 3433 1970-01-01 \n", 192 | "4 ford torino USA 3449 1970-01-01 " 193 | ] 194 | }, 195 | "execution_count": 15, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "cars.head()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 16, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "filename = '/data/ellisonbg/cars.parquet'" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 17, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "table = pa.Table.from_pandas(cars)\n", 220 | "pq.write_table(table, filename)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Read a parquet file as a DataFrame" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 18, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "table2 = pq.read_table(filename)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 19, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/html": [ 247 | "
\n", 248 | "\n", 261 | "\n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | "
AccelerationCylindersDisplacementHorsepowerMiles_per_GallonNameOriginWeight_in_lbsYear
012.08307.0130.018.0chevrolet chevelle malibuUSA35041970-01-01
111.58350.0165.015.0buick skylark 320USA36931970-01-01
211.08318.0150.018.0plymouth satelliteUSA34361970-01-01
312.08304.0150.016.0amc rebel sstUSA34331970-01-01
410.58302.0140.017.0ford torinoUSA34491970-01-01
\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " Acceleration Cylinders Displacement Horsepower Miles_per_Gallon \\\n", 343 | "0 12.0 8 307.0 130.0 18.0 \n", 344 | "1 11.5 8 350.0 165.0 15.0 \n", 345 | "2 11.0 8 318.0 150.0 18.0 \n", 346 | "3 12.0 8 304.0 150.0 16.0 \n", 347 | "4 10.5 8 302.0 140.0 17.0 \n", 348 | "\n", 349 | " Name Origin Weight_in_lbs Year \n", 350 | "0 chevrolet chevelle malibu USA 3504 1970-01-01 \n", 351 | "1 buick skylark 320 USA 3693 1970-01-01 \n", 352 | "2 plymouth satellite USA 3436 1970-01-01 \n", 353 | "3 amc rebel sst USA 3433 1970-01-01 \n", 354 | "4 ford torino USA 3449 1970-01-01 " 355 | ] 356 | }, 357 | "execution_count": 19, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "table2.to_pandas().head()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.6.3" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /Content/Introduction/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Course materials\n", 15 | "\n", 16 | "- https://github.com/jakevdp/WhirlwindTourOfPython\n", 17 | "- 
https://github.com/jakevdp/PythonDataScienceHandbook\n", 18 | "- http://r4ds.had.co.nz/index.html\n", 19 | "- https://github.com/amueller/introduction_to_ml_with_python" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Introduction to Data Science\n", 27 | "\n", 28 | "- [What is Data Science?](./02-WhatIsDataScience.ipynb)\n", 29 | "- [Theory of Data](./03-TheoryofData.ipynb)\n", 30 | "- [Programming](../Programming/01-Introduction.ipynb)\n", 31 | "- [Workflow](../Workflow/01-Introduction.ipynb)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Explore\n", 39 | "\n", 40 | "Explore = Visualize + Transform\n", 41 | "\n", 42 | "- [Data Visualisation](../Visualize/01-Introduction.ipynb)\n", 43 | "- [Basic Data Transformation](../Transform/02-BasicDataTransformation.ipynb)\n", 44 | "- [Exploratory Data Analysis (EDA)](../Explore/02-EDA.ipynb)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Wrangle\n", 52 | "\n", 53 | "Wrangle = Import + Tidy + Transform\n", 54 | "\n", 55 | "- [Import](../Import/01-Introduction.ipynb)\n", 56 | "- [Tidy](../Tidy/01-Introduction.ipynb)\n", 57 | "- [Transform](../Transform/01-Introduction.ipynb)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Model\n", 65 | "\n", 66 | "- [Model](../Model/01-Introduction.ipynb)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Communicate\n", 74 | "\n", 75 | "- GitHub and Notebooks\n", 76 | "- nbconvert\n", 77 | "- nbviewer" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 
95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.6.3" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 2 102 | } 103 | -------------------------------------------------------------------------------- /Content/Introduction/02-WhatIsDataScience.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What is Data Science?" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objective:** Understand different ways that Data Science can be defined." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Data Science as a *skill set*" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Perhaps the most common way to define Data Science is to enumerate the skills and knowledge areas used in Data Science. The best known treatment of that approach is [Drew Conway's Data Science Venn diagram](http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram), seen here:\n", 29 | "\n", 30 | "![Data Science Venn Diagram](images/data_science_vd.png)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Data Science as a *process*" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "An operational definition of Data Science answers the question, \"what do Data Scientists do?\". Over the last few years, the community of Data Scientists has been building a consensus answer to this question. The different activities involved in Data Science are linked together to form the Data Science **process** or **workflow**. 
By looking at the descriptions of the Data Science process by a few individuals, we can start to see a clear picture emerging.\n", 45 | "\n", 46 | "* [A Data Science Taxonomy](http://www.dataists.com/2010/09/a-taxonomy-of-data-science/), Hilary Mason (2010):\n", 47 | " - Obtain\n", 48 | " - Scrub\n", 49 | " - Explore\n", 50 | " - Model\n", 51 | " - Interpret\n", 52 | "* [The Data Science Process](http://columbiadatascience.com/2012/09/24/reflections-after-jakes-lecture/), Rachel Schutt (2012):\n", 53 | " - Observation and collection\n", 54 | " - Processing\n", 55 | " - Exploratory data analysis\n", 56 | " - Modeling: Stats, ML\n", 57 | " - Build data product\n", 58 | " - Communicate\n", 59 | " - Make decisions\n", 60 | "* [Introduction to Data Science 2.0](http://columbiadatascience.com/2013/09/16/introduction-to-data-science-version-2-0/), Rachel Schutt (2013):\n", 61 | " - Gather and observe\n", 62 | " - Process\n", 63 | " - Modeling: Stats, ML\n", 64 | " - Summarize, communicate, build\n", 65 | " - Decide, interact\n", 66 | "* [Data Science Workflow: Overview and Challenges](http://cacm.acm.org/blogs/blog-cacm/169199-data-science-workflow-overview-and-challenges/fulltext), Philip Guo (2014):\n", 67 | " - Preparation\n", 68 | " - Analysis\n", 69 | " - Reflection\n", 70 | " - Dissemination" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Data Science as a *set of questions*" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "The *skills* and *process* approaches to defining Data Science do have some limitations. Another approach, for which I advocate, is to enumerate the underlying questions that the field is pursuing. 
Here are some possibilities:\n", 85 | "\n", 86 | "* How/where do we get data?\n", 87 | "* What is the raw format of the data?\n", 88 | "* How much data and how often?\n", 89 | "* What variables/fields are present in the data and what are their types?\n", 90 | "* What relevant variables/fields are not present in the data?\n", 91 | "* What relationships are present in the data and how are they expressed?\n", 92 | "* Is the data observational or collected in a controlled manner?\n", 93 | "* What practical questions can we, or would we like to, answer with the data?\n", 94 | "* How is the data stored after collection and how does that relate to the\n", 95 | " practical questions we are interested in answering?\n", 96 | "* What in-memory data structures are appropriate for answering those practical\n", 97 | " questions efficiently?\n", 98 | "* What can we predict with the data?\n", 99 | "* What can we understand with the data?\n", 100 | "* What hypotheses can be supported or rejected with the data?\n", 101 | "* What statistical or machine learning methods are needed to answer these questions?\n", 102 | "* What user interfaces are required for humans to work with the data efficiently and\n", 103 | " productively?\n", 104 | "* How can the data be visualized effectively?\n", 105 | "* How can code, data and visualizations be embedded into narratives used to\n", 106 | " communicate results?\n", 107 | "* What software is needed to support the activities around these questions?\n", 108 | "* What computational infrastructure is needed?\n", 109 | "* How can organizations leverage data to meet their goals?\n", 110 | "* What organizational structures are needed to best take advantage of data?\n", 111 | "* What are the economic benefits of pursuing these questions?\n", 112 | "* What are the social benefits of pursuing these questions?\n", 113 | "* Where do these questions and the activities in pursuit of them intersect with important ethical issues?"
114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Data Science as *Science*" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "If we take the name \"Data Science\" seriously, then we have to assume that it is somehow related to science. Here is my own take:\n", 128 | "\n", 129 | "> Data Science involves the application of scientific methods and approaches to data sets that *may* lie outside the traditional fields of science (Physics, Chemistry, Biology, etc.).\n", 130 | "\n", 131 | "In other words, Data Science involves a broad application of the scientific method." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## R for Data Science\n", 139 | "\n", 140 | "In [R for Data Science](http://r4ds.had.co.nz/index.html), Garrett Grolemund and Hadley Wickham organize the data science process into the following ideas and practices:\n", 141 | "\n", 142 | "1. Import\n", 143 | "2. Tidy\n", 144 | "3. Understand:\n", 145 | " - Transform\n", 146 | " - Visualize\n", 147 | " - Model\n", 148 | "4. 
Communicate\n", 149 | "\n", 150 | "Furthermore, they overlay a couple of other composite ideas and practices:\n", 151 | "\n", 152 | "* Wrangle = Import + Tidy + Transform\n", 153 | "* Explore = Transform + Visualize\n", 154 | "\n", 155 | "Lastly, they identify two additional cross-cutting areas:\n", 156 | "\n", 157 | "* Workflow\n", 158 | "* Programming\n", 159 | "\n", 160 | "**This course will follow this conceptual model of data science.**" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Resources" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "* [Scientific Method](https://en.wikipedia.org/wiki/Scientific_method), Wikipedia (2016).\n", 175 | "* [50 Years of Data Science](http://courses.csail.mit.edu/18.337/2015/docs/50YearsDataScience.pdf), David Donoho (2015).\n", 176 | "* [Data Science Survey](https://www.oreilly.com/ideas/2015-data-science-salary-survey), O'Reilly Media (2015).\n", 177 | "* [The Emerging Role of Data Scientists on Software Development Teams](http://research.microsoft.com/apps/pubs/default.aspx?id=242286), Microsoft Research (2015)."
178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.6.3" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /Content/Introduction/03-TheoryofData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A Brief Theory of Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Types of data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "In this section, when I talk about the \"type\" of the data, I am not talking about the `dtype` (`int`, `float`, `bool`, `str`) used to represent the data in a NumPy array or Pandas DataFrame. In this context the \"type\" of the data is used in a more abstract sense." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Take 1: Data+Design" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "[Data+Design](https://infoactive.co/data-design/) is an excellent online book about the theory of data. It is very well thought out and beautiful as well. I highly recommend spending time reading it. In Chapter 1 of Data+Design, the authors cover [Basic Data Types](https://infoactive.co/data-design/ch01.html). For further details, see also the Wikipedia page on [Levels of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). 
Here is a short summary of those basic data types:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Nominal" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "* Non-numerical\n", 50 | "* Usually, but not always strings\n", 51 | "* Non-ordered\n", 52 | "* Cannot be averaged" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 18, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "['Oregon', 'California', 'Texas', 'Colorado']" 66 | ] 67 | }, 68 | "execution_count": 18, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "states = ['Oregon', 'California', 'Texas', 'Colorado']\n", 75 | "states" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 19, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "['produce', 'diary', 'frozen']" 89 | ] 90 | }, 91 | "execution_count": 19, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "grocery_sections = [\"produce\", 'diary', 'frozen']\n", 98 | "grocery_sections" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 20, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "['male', 'female']" 112 | ] 113 | }, 114 | "execution_count": 20, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "gender = ['male', 'female']\n", 121 | "gender" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Ordinal" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "* Non-numerical\n", 136 | "* Usually, but not always strings\n", 137 | "* Natural 
ordering\n", 138 | "* Sometimes can be averaged\n", 139 | "* Can assign numerical scale, but it will be arbitrary" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "['strongly disagree', 'disagre', 'neutral', 'agree', 'strongly agree']" 153 | ] 154 | }, 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "response = ['strongly disagree', 'disagre', 'neutral', 'agree', 'strongly agree']\n", 162 | "response" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 5, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "['cold', 'hot']" 176 | ] 177 | }, 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "temp = ['cold', 'hot']\n", 185 | "temp" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "['short', 'medium', 'tall']" 199 | ] 200 | }, 201 | "execution_count": 6, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "height = ['short', 'medium', 'tall']\n", 208 | "height" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "### Interval" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "* Equally spaced numerical data\n", 223 | "* Ordered\n", 224 | "* Can either be discrete (int) or continuous (float)\n", 225 | "* No meaningful zero point\n", 226 | "* Examples:\n", 227 | " - Temperature in F or C\n", 228 | " - Dates/Times" 229 | ] 230 | }, 231 | { 232 | "cell_type": 
"code", 233 | "execution_count": 22, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "[32.1, 99.4, 210.0, -76.4]" 242 | ] 243 | }, 244 | "execution_count": 22, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "temps = [32.1, 99.4, 210.0, -76.4]\n", 251 | "temps" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "### Ratio" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "* Equally spaced, ordered numerical data\n", 266 | "* Can either be discrete or continuous\n", 267 | "* Meaningful zero point that indicates an absence of the measured entity\n", 268 | "* Examples:\n", 269 | " - Age in years\n", 270 | " - Height in inches" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 12, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "[47, 76, 17, 48, 99, 53, 86, 45, 56, 38]" 284 | ] 285 | }, 286 | "execution_count": 12, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "ages = [random.randint(0,100) for i in range(10)]\n", 293 | "ages" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 16, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "[3.6024466624950744,\n", 307 | " 72.57678124421476,\n", 308 | " 28.563897505990518,\n", 309 | " 47.537636077547305,\n", 310 | " 48.56497103639384,\n", 311 | " 29.140383493314705,\n", 312 | " 71.6319961862486,\n", 313 | " 59.37821139476524,\n", 314 | " 71.10757477132888,\n", 315 | " 12.436518123166024]" 316 | ] 317 | }, 318 | "execution_count": 16, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "height = 
[76.0*random.random() for i in range(10)]\n", 325 | "height" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "### Categorical\n", 333 | "\n", 334 | "* Data is labelled by well separated categories\n", 335 | "* Often used as an umbrella for nominal and ordinal, which are unordered and ordered categorical data types respectively." 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "## Take 2: Polaris, Tableau, d3/vega" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "The data visualization community has spent a lot of time thinking carefully about fundamental data types. There is a large body of research and software projects that encode the results of that research into usable forms. Good examples of this research and software are:\n", 350 | "\n", 351 | "* [Polaris: A System for...](http://graphics.stanford.edu/papers/polaris_extended/polaris.pdf), C. Stolte, D. Tang and P. 
Hanrahan (2002).\n", 352 | "* [Tableau](http://www.tableau.com/), Tableau Software, website (2016).\n", 353 | "* [d3](http://d3js.org/), Data Driven Documents, website (2016).\n", 354 | "* [Vega](http://vega.github.io/vega/), Vega: A Visualization Grammar, website (2016).\n", 355 | "* [Vega-Lite](http://vega.github.io/vega-lite/), Vega-Lite: A High-Level Visualization Grammar, website (2016).\n", 356 | "* [polestar](http://vega.github.io/polestar/), Polestar website (2016).\n", 357 | "\n", 358 | "Here is a rough union of the different data types found in this body of work:\n", 359 | "\n", 360 | "* Ordinal (same as above)\n", 361 | "* Nominal (same as above)\n", 362 | "* Quantitative (ratio, interval)\n", 363 | "* Date/time (calendar dates and/or times)\n", 364 | "* Geographic (states, latitude/longitude)\n", 365 | "\n", 366 | "Some of these software packages also have a `text` data type that is meant for textual data that is not categorical." 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## Variables" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "* A **variable** is some quantity that is measured, such as \"age\"\n", 381 | "* A single variable can be measured in different ways that give different data types:\n", 382 | " - \"young\" or \"old\" = ordinal\n", 383 | " - Age ranges (0-9, 10-19, ...) 
= ordinal\n", 384 | " - Age in years = ratio" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## Records and data sets\n", 392 | "\n", 393 | " * A **record** or **sample** is one measurement of a set of variables\n", 394 | " * A **data set** is a set of records that measure the same set of variables in the same way" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 28, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "[{'age': 52, 'height': 0.45153746742233736},\n", 408 | " {'age': 76, 'height': 56.65900992198216},\n", 409 | " {'age': 36, 'height': 22.419785610573825},\n", 410 | " {'age': 12, 'height': 16.34630476175516},\n", 411 | " {'age': 34, 'height': 35.392522637659134},\n", 412 | " {'age': 81, 'height': 41.75690996668162},\n", 413 | " {'age': 26, 'height': 32.123243497319},\n", 414 | " {'age': 83, 'height': 11.127118329861124},\n", 415 | " {'age': 96, 'height': 4.556533241526422},\n", 416 | " {'age': 0, 'height': 30.41328942526455}]" 417 | ] 418 | }, 419 | "execution_count": 28, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "ages = [random.randint(0,100) for i in range(10)]\n", 426 | "heights = [76.0*random.random() for i in range(10)]\n", 427 | "data_set = [{'age':a, 'height':h} for a, h in zip(ages, heights)]\n", 428 | "data_set" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 29, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "{'age': 52, 'height': 0.45153746742233736}" 442 | ] 443 | }, 444 | "execution_count": 29, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "sample0 = data_set[0]\n", 451 | "sample0" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | 
"source": [ 458 | "## Resources" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "* [Data+Design](https://infoactive.co/data-design) Trina Chiasson, Dyanna Gregory, et al (2016)." 466 | ] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.5.2" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 0 490 | } 491 | -------------------------------------------------------------------------------- /Content/Introduction/images/data_science_vd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Introduction/images/data_science_vd.png -------------------------------------------------------------------------------- /Content/Model/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objectives:** Learn about the theory and practice of statistical modelling." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Outline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "* [Probability](02-Probability.ipynb)\n", 29 | "* [Common Distributions](03-CommonDistributions.ipynb)\n", 30 | "* [Information Theory](04-InformationTheory.ipynb)\n", 31 | "* [Modelling Overview](05-ModellingOverview.ipynb)\n", 32 | "* [Estimators, Bias, Variance](06-EstimatorsBiasVariance.ipynb)\n", 33 | "* [Bootstrap Resampling](07-BootstrapResampling.ipynb)\n", 34 | "* [Maximum Likelihood Estimation](08-MLE.ipynb)\n", 35 | "* [Linear Regression](09-LinearRegression.ipynb)\n", 36 | "* [Specific Models](10-SpecificModels.ipynb)" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.6.3" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /Content/Model/04-InformationTheory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Information Theory " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- Entropy\n", 15 | "- Kullback-Leibler divergence\n", 16 | "- Cross entropy" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [] 25 | } 26 | ], 27 | "metadata": { 28 | "kernelspec": { 29 | "display_name": "Python 3", 30 | "language": 
"python", 31 
| "name": "python3" 32 | }, 33 | "language_info": { 34 | "codemirror_mode": { 35 | "name": "ipython", 36 | "version": 3 37 | }, 38 | "file_extension": ".py", 39 | "mimetype": "text/x-python", 40 | "name": "python", 41 | "nbconvert_exporter": "python", 42 | "pygments_lexer": "ipython3", 43 | "version": "3.5.2" 44 | } 45 | }, 46 | "nbformat": 4, 47 | "nbformat_minor": 2 48 | } 49 | -------------------------------------------------------------------------------- /Content/Model/05-ModellingOverview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Modeling Overview" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objectives:** Get a general conceptual understanding of statistical modeling and machine learning." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Imports" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import matplotlib.pyplot as plt\n", 40 | "%matplotlib inline" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## 1 Introduction" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "**Modeling**, or **statistical modeling** is a very general approach for using data in a variety of productive ways. In other circles these same ideas go under the name **machine learning** or more trendy phrases such as **machine intelligence**. 
Some of the slippery terminology comes from the fact that research in this field has been done across different academic disciplines such as statistics, computer science, mathematics and physics. Each field has developed its own emphases and terminologies.\n", 55 | "\n", 56 | "Some of the goals of modeling include:\n", 57 | "\n", 58 | "* Predict future events based on past data.\n", 59 | "* Provide intuitive understanding of data.\n", 60 | "* Provide a mathematical model for data that lacks first principles theoretical models (as in Physics).\n", 61 | "* Quantify uncertainties.\n", 62 | "* Learn generalizable information from data.\n", 63 | "\n", 64 | "As pointed out by Goodfellow et al., Mitchell (1997) provided a nice general definition of this idea of \"learning from data\":\n", 65 | "\n", 66 | "> A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance at tasks in T, as measured by P, improves with experience E\n", 67 | "\n", 68 | "In this course, we will focus on two different ways of thinking about models:\n", 69 | "\n", 70 | "1. Forward = Generative models\n", 71 | "2. Backwards = Inference with models" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## 2 Generative models" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "The idea of a **generative model** is that we can use a model to generate data. Usually, our models will have parameters that we get to (and have to) choose. Here is a diagram that shows how this works:\n", 86 | "\n", 87 | "**Model** $+$ **Parameters** $\\rightarrow$ **Generated Data**\n", 88 | "\n", 89 | "Let's use this process to model the time between soccer goals in a soccer game. The appropriate distribution for this would be the exponential distribution. Let's say that we know the average time between goals is 20 minutes. 
Using this parameter and the exponential distribution (our model), we can create a dataset of the time between specific goals (100 of them!) in soccer games:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "array([ 4.71507829e+01, 4.22468719e+01, 1.66022228e+01,\n", 101 | " 1.95064886e+01, 5.44656709e+00, 1.01567676e+01,\n", 102 | " 3.03634966e+01, 5.90058457e+01, 5.96845511e+01,\n", 103 | " 4.56862724e+01, 7.67989738e+00, 7.38203287e+00,\n", 104 | " 3.24785943e+01, 8.85298367e+00, 1.31063440e+01,\n", 105 | " 1.19161635e+01, 4.97332313e+01, 4.41001039e+00,\n", 106 | " 3.15167607e+01, 3.67762425e+01, 3.18983132e+01,\n", 107 | " 3.78041120e+01, 2.12051012e-02, 3.46656245e+01,\n", 108 | " 2.50736875e+00, 9.33988324e+00, 3.54828328e+00,\n", 109 | " 8.97400411e+00, 3.25434878e+01, 2.35341585e+01,\n", 110 | " 8.07792154e+00, 1.15779362e+01, 1.72659522e+01,\n", 111 | " 2.02563042e+01, 9.18558896e-01, 7.24242360e+00,\n", 112 | " 2.65958441e+01, 5.50392122e+01, 2.08735129e+01,\n", 113 | " 2.10050443e+00, 1.01509222e+00, 9.25728340e-01,\n", 114 | " 7.83342215e-01, 1.27210814e+01, 2.78212012e+00,\n", 115 | " 3.44151046e+01, 7.62980429e-01, 3.54275758e+00,\n", 116 | " 1.92608673e+01, 6.14986481e+00, 3.05824946e+00,\n", 117 | " 5.62379262e+00, 1.44342878e+01, 1.42093249e+00,\n", 118 | " 1.50380526e+01, 1.37009936e+01, 1.45319686e+01,\n", 119 | " 1.63783217e+01, 8.98506500e+00, 9.53802491e+00,\n", 120 | " 4.56794033e+01, 2.84668139e+01, 1.13010953e+01,\n", 121 | " 5.11700244e+00, 4.63142783e+01, 1.23100532e+00,\n", 122 | " 1.31443132e+01, 4.40701045e+01, 1.34850432e+01,\n", 123 | " 2.32386429e+01, 7.45366566e+00, 2.60398837e+01,\n", 124 | " 1.78306323e+00, 7.43059105e+00, 6.86534103e+00,\n", 125 | " 2.30494429e+01, 9.50409199e-01, 1.68655453e+01,\n", 126 | " 3.65952907e-01, 1.95535102e+01, 7.79056167e+00,\n", 127 | " 3.16661162e+00, 2.04141580e+01, 
1.07834780e+02,\n", 128 | " 2.54883375e+01, 1.02814340e+01, 1.59914411e+01,\n", 129 | " 1.64449898e+02, 1.27400145e+00, 3.43275789e+00,\n", 130 | " 3.29461633e+01, 2.60505252e+01, 2.83368097e+01,\n", 131 | " 4.11928128e+01, 1.13617707e+01, 9.85044211e+00,\n", 132 | " 7.50329192e+00, 2.23366012e+01, 4.48704797e+01,\n", 133 | " 2.06561925e+00])" 134 | ] 135 | }, 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "β = 20 # Parameter\n", 143 | "data = np.random.exponential(β, 100) # Model\n", 144 | "data # data" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "We can then visualize this dataset:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADy5JREFUeJzt3X+sZHV9xvH30wXUKi1QrmSD0IuGGkkTF3JDSGiNxR/lRyuYtomksZuUZG0iiaS2cdU/ik2bQFslaWK0a6BuG9RalUAUWwnFGpOKveCy7HZFENcWWXevWgukDe3ip3/M2eS63tmZOz/u3PnyfiWTmfnOmZ2Hbw7PPffcc86kqpAkzb+fmnUASdJkWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRpy0kR925pln1uLi4kZ+pCTNvQceeOB7VbUwaLkNLfTFxUWWl5c38iMlae4l+fYwy7nLRZIaYaFLUiMsdElqhIUuSY2w0CWpEQMLPckLk3w1yUNJ9id5Xzf+0STfSrKnu22bflxJUj/DHLb4LHBZVT2T5GTgy0k+3732h1X1qenFkyQNa2ChV+876p7pnp7c3fzeOknaZIbah55kS5I9wBHgnqq6v3vpT5PsTXJLkhdMLaUkaaChzhStqueAbUlOA+5I8ovAu4HvAqcAu4B3AX98/HuT7AB2AJx77rkjB13c+bmR33vwpqtGfq8kzYt1HeVSVT8EvghcXlWHqudZ4K+Bi/u8Z1dVLVXV0sLCwEsRSJJGNMxRLgvdljlJXgS8Hvh6kq3dWIBrgH3TDCpJOrFhdrlsBXYn2ULvB8Anq+qzSf4pyQIQYA/we1PMKUkaYJijXPYCF64xftlUEkmSRuKZopLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNGFjoSV6Y5KtJHkqyP8n7uvHzktyf5NEkf5fklOnHlST1M8wW+rPAZVX1amAbcHmSS4CbgVu
q6nzgP4HrphdTkjTIwEKvnme6pyd3twIuAz7Vje8GrplKQknSUIbah55kS5I9wBHgHuCbwA+r6mi3yBPA2X3euyPJcpLllZWVSWSWJK1hqEKvqueqahvwMuBi4FVrLdbnvbuqaqmqlhYWFkZPKkk6oXUd5VJVPwS+CFwCnJbkpO6llwFPTjaaJGk9hjnKZSHJad3jFwGvBw4A9wG/2S22HbhzWiElSYOdNHgRtgK7k2yh9wPgk1X12ST/BnwiyZ8AXwNunWJOSdIAAwu9qvYCF64x/ji9/emSpE3AM0UlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktSIgYWe5Jwk9yU5kGR/knd04zcm+U6SPd3tyunHlST1c9IQyxwF3llVDyY5FXggyT3da7dU1V9ML54kaVgDC72qDgGHusdPJzkAnD3tYJKk9VnXPvQki8CFwP3d0PVJ9ia5LcnpE84mSVqHoQs9yUuATwM3VNVTwIeAVwDb6G3Bv7/P+3YkWU6yvLKyMoHIkqS1DFXoSU6mV+a3V9VnAKrqcFU9V1U/Aj4CXLzWe6tqV1UtVdXSwsLCpHJLko4zzFEuAW4FDlTVB1aNb1212JuBfZOPJ0ka1jBHuVwKvBV4OMmebuw9wLVJtgEFHATeNpWEkqShDHOUy5eBrPHS3ZOPI0kalWeKSlIjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNWKYy+fOvcWdnxvr/QdvumpCSSRpetxCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpEQMLPck5Se5LciDJ/iTv6MbPSHJPkke7+9OnH1eS1M8wW+hHgXdW1auAS4C3J7kA2AncW1XnA/d2zyVJMzKw0KvqUFU92D1+GjgAnA1cDezuFtsNXDOtkJKkwda1Dz3JInAhcD9wVlUdgl7pAy/t854dSZaTLK+srIyXVpLU19CFnuQlwKeBG6rqqWHfV1W7qmqpqpYWFhZGyShJGsJQhZ7kZHplfntVfaYbPpxka/f6VuDIdCJKkoYxzFEuAW4FDlTVB1a9dBewvXu8Hbhz8vEkScMa5vK5lwJvBR5Osqcbew9wE/DJJNcB/w781nQiSpKGMbDQq+rLQPq8/LrJxpEkjcozRSWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1IiBhZ7ktiRHkuxbNXZjku8k2dPdrpxuTEnSIMNsoX8UuHyN8Vuqalt3u3uysSRJ6zWw0KvqS8APNiCLJGkM4+xDvz7J3m6XzOkTSyRJGsmohf4h4BXANuAQ8P5+CybZkWQ5yfLKysqIHydJGmSkQq+qw1X1XFX9CPgIcPEJlt1VVUtVtbSwsDBqTknSACMVepKtq56+GdjXb1lJ0sY4adACST4OvBY4M8kTwB8Br02yDSjgIPC2KWaUJA1hYKFX1bVrDN86hSySpDF4pqgkNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY0YeBy6YHHn50Z+78GbrppgEknqzy10SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpEV7LZcq8DoykjeIWuiQ1YmChJ7ktyZEk+1aNnZHkniSPdvenTzemJGmQYbbQPwpcftzYTuDeqjofuLd7LkmaoYGFXlVfAn5w3PDVwO7u8W7gmgnnkiSt06j70M+qqkMA3f1LJxdJkjSKqf9RNMmOJMtJlldWVqb9cZL0vDVqoR9OshWguz/Sb8Gq2lVVS1W1tLCwMOLHSZIGGbXQ7wK
2d4+3A3dOJo4kaVTDHLb4ceBfgFcmeSLJdcBNwBuSPAq8oXsuSZqhgWeKVtW1fV563YSzSJLG4JmiktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqRF+wUWj/GIN6fnHLXRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1IixLp+b5CDwNPAccLSqliYRSpK0fpO4HvqvVNX3JvDvSJLG4C4XSWrEuFvoBXwhSQF/VVW7jl8gyQ5gB8C555475sc9v4zzrUOSnn/G3UK/tKouAq4A3p7kNccvUFW7qmqpqpYWFhbG/DhJUj9jFXpVPdndHwHuAC6eRChJ0vqNXOhJXpzk1GOPgTcC+yYVTJK0PuPsQz8LuCPJsX/nY1X1DxNJJUlat5ELvaoeB149wSySpDF42KIkNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGjGJ7xRVY2b5TUkHb7pqZp8tzTu30CWpERa6JDXCQpekRljoktQIC12SGuFRLtIcG+eIJI8oWp9xj/7aiPl2C12SGjFWoSe5PMkjSR5LsnNSoSRJ6zdyoSfZAnwQuAK4ALg2yQWTCiZJWp9xttAvBh6rqser6n+BTwBXTyaWJGm9xin0s4H/WPX8iW5MkjQD4xzlkjXG6icWSnYAO7qnzyR5ZMTPOxP43ojvnaV5zQ0zyJ6bJ/LPzOucb2juCc31Mc75AGPO988Ps9A4hf4EcM6q5y8Dnjx+oaraBewa43MASLJcVUvj/jsbbV5zw/xmN/fGm9fs85q7n3F2ufwrcH6S85KcArwFuGsysSRJ6zXyFnpVHU1yPfCPwBbgtqraP7FkkqR1GetM0aq6G7h7QlkGGXu3zYzMa26Y3+zm3njzmn1ec68pVT/xd0xJ0hzy1H9JasRcFPq8XGIgyTlJ7ktyIMn+JO/oxm9M8p0ke7rblbPOerwkB5M83OVb7sbOSHJPkke7+9NnnXO1JK9cNad7kjyV5IbNOt9JbktyJMm+VWNrznF6/rJb5/cmuWiT5f7zJF/vst2R5LRufDHJ/6ya+w9vstx9140k7+7m+5Ekvzqb1GOqqk19o/cH128CLwdOAR4CLph1rj5ZtwIXdY9PBb5B77IINwJ/MOt8A7IfBM48buzPgJ3d453AzbPOOWA9+S6943U35XwDrwEuAvYNmmPgSuDz9M73uAS4f5PlfiNwUvf45lW5F1cvtwnne811o/v/9CHgBcB5XedsmfV/w3pv87CFPjeXGKiqQ1X1YPf4aeAA83327NXA7u7xbuCaGWYZ5HXAN6vq27MO0k9VfQn4wXHD/eb4auBvqucrwGlJtm5M0h+3Vu6q+kJVHe2efoXeeSibSp/57udq4BNV9WxVfQt4jF73zJV5KPS5vMRAkkXgQuD+buj67tfT2zbbrotOAV9I8kB3di/AWVV1CHo/rICXzizdYG8BPr7q+Waf72P6zfE8rfe/S++3iWPOS/K1JP+c5JdnFeoE1lo35mm++5qHQh/qEgObSZKXAJ8Gbqiqp4APAa8AtgGHgPfPMF4/l1bVRfSunvn2JK+ZdaBhdSe2vQn4+25oHuZ7kLlY75O8FzgK3N4NHQLOraoLgd8HPpbkZ2aVbw391o25mO9B5qHQh7rEwGaR5GR6ZX57VX0GoKoOV9VzVfUj4CNswl/lqurJ7v4IcAe9jIeP/Zrf3R+ZXcITugJ4sKoOw3zM9yr95njTr/dJtgO/Bvx2dTuiu10W3+8eP0BvX/QvzC7ljzvBurHp53sY81Doc3OJgSQBbgUOVNUHVo2v3vf5ZmDf8e+dpSQvTnLqscf0/uC1j948b+8W2w7cOZuEA13Lqt0tm32+j9Nvju8Cfqc72uUS4L+O7ZrZDJJcDrwLeFNV/feq8YX0viuBJC8Hzgcen03Kn3SCdeMu4C1JXpD
kPHq5v7rR+cY267/KDnOj9xf/b9D7af/eWec5Qc5fovdr2l5gT3e7Evhb4OFu/C5g66yzHpf75fT+wv8QsP/YHAM/B9wLPNrdnzHrrGtk/2ng+8DPrhrblPNN74fOIeD/6G0RXtdvjuntAvhgt84/DCxtstyP0dvnfGw9/3C37G9069BDwIPAr2+y3H3XDeC93Xw/Alwx6/VllJtnikpSI+Zhl4skaQgWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5Jjfh/tB8YwL20I2QAAAAASUVORK5CYII=\n", 162 | "text/plain": [ 163 | "" 164 | ] 165 | }, 166 | "metadata": {}, 167 | "output_type": "display_data" 168 | } 169 | ], 170 | "source": [ 171 | "plt.hist(data, bins=20);" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "This example clarifies the choices that you have to make when building a generative model:\n", 179 | "\n", 180 | "* You have to pick a model to use\n", 181 | "* You have to pick the parameters of the model\n", 182 | "\n", 183 | "To assess if you have made good choice, you will have to perform some sort of comparison of the generated data, with actual observations from the system you are intenting to model. In general, you would like to know that the parameters of your model are choosen in a way that makes your model useful. That is exactly what **inference** provides." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## 3 Inference with models" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "**Inference** is a way of **learning from data**. In the context of generative models, inference allow you to go backwards from **observed data** to parameters that optimize how well the model works for that observed data. 
Here is a diagram of inference:\n", 198 | "\n", 199 | "**Model** $+$ **Observed Data** + **Training** $\\rightarrow$ **Best Parameters**\n", 200 | "\n", 201 | "Notice the similarities to generative modelling:\n", 202 | "\n", 203 | "* You still have to pick your model!!!\n", 204 | "\n", 205 | "However the differences are most important:\n", 206 | "\n", 207 | "* The data is not generated, it is observed\n", 208 | "* The parameters are learned, rather than guessed\n", 209 | "* A **training** step is required.\n", 210 | "\n", 211 | "The magic of inference is that once you have performed inference to find the best parameters, you can turn it around and generate predictions:\n", 212 | "\n", 213 | "**Model** $+$ **Best Parameters** $\\rightarrow$ **Predictions**\n", 214 | "\n", 215 | "If your model and parameters are good, you should be able to predict outcomes you haven't seen before.\n", 216 | "\n", 217 | "Let's see how this would work with the above soccer goal data. You have been handed a small dataset of the times (in minutes) between soccer goals. This is your observed data:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 6, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "observed_data = np.array(\n", 229 | " [ 6.57946838, 16.66471659, 52.11420679, 25.64266511,\n", 230 | " 10.90558697, 17.74796824, 8.0075313 , 3.98989899,\n", 231 | " 13.46723746, 24.90308858]\n", 232 | ")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "We are again going to pick the exponential distribution, with a parameter $\\beta$. We need to perform some type of inference to find the best value of $\\beta$ to use. We will often denote the best parameter with a hat, so let's call the best value $\\hat\\beta$. 
There are much more sophisticated ways of finding the best parameter, but for now let's find it by just taking the mean of the observed data:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "beta_hat = observed_data.mean()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Now that we have the \"best\" value of beta, we can predict the times between goals of the *next* 20 goals to happen:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "array([ 31.42502418, 6.87980742, 1.64119901, 7.45192818,\n", 267 | " 10.65510412, 8.38029071, 5.497031 , 13.21566205,\n", 268 | " 32.94057228, 28.06311202, 4.74709422, 36.54064439,\n", 269 | " 12.65607335, 3.95839439, 12.18467277, 0.32111527,\n", 270 | " 5.43256726, 6.93929714, 9.08066106, 14.51748799])" 271 | ] 272 | }, 273 | "execution_count": 8, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "new_data = np.random.exponential(beta_hat, 20)\n", 280 | "new_data" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "The obvious question to ask is then this: how did we do? To determine that, we would need to actually observe the next 20 goals and see how their times compare to the generated values. This is a very simple model, so we wouldn't expect the goals to exactly match these predictions, but we might hope that in some aggregate sense our predictions are accurate. In future notebooks, we will go into great detail about assessing how well a model works." 
288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.6.3" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /Content/Model/10-SpecificModels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Specific Models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- Feature engineering (PDSH Chapter 5.4)\n", 15 | "- Naive Bayes (PDSH Chapter 5.5)\n", 16 | "- Linear regression (5.6)\n", 17 | "- SVM (5.7)\n", 18 | "- Random forests (5.8)\n", 19 | "- PCA (5.9)\n", 20 | "- Manifold learning (5.10)\n", 21 | "- K-means (5.11)\n", 22 | "- Gaussian mixtures (5.12)\n", 23 | "- Kernel density estimation (5.13)\n", 24 | "- Neural networks (DL, Keras)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.5.2" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | 
-------------------------------------------------------------------------------- /Content/Model/Scipy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SciPy: Numerical Algorithms for Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objective:** Learn how to find and use numerical algorithms in the SciPy package." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%matplotlib inline\n", 26 | "from matplotlib import pyplot as plt\n", 27 | "import numpy as np" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Overview" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "The SciPy framework builds on top of NumPy and provides a large number of numerical algorithms for working with data. 
Some of the topics that SciPy covers are:\n", 42 | "\n", 43 | "* Special functions ([scipy.special](http://docs.scipy.org/doc/scipy/reference/special.html))\n", 44 | "* Integration/ODEs ([scipy.integrate](http://docs.scipy.org/doc/scipy/reference/integrate.html))\n", 45 | "* Optimization ([scipy.optimize](http://docs.scipy.org/doc/scipy/reference/optimize.html))\n", 46 | "* Interpolation ([scipy.interpolate](http://docs.scipy.org/doc/scipy/reference/interpolate.html))\n", 47 | "* Fourier Transforms ([scipy.fftpack](http://docs.scipy.org/doc/scipy/reference/fftpack.html))\n", 48 | "* Signal Processing ([scipy.signal](http://docs.scipy.org/doc/scipy/reference/signal.html))\n", 49 | "* Linear Algebra ([scipy.linalg](http://docs.scipy.org/doc/scipy/reference/linalg.html))\n", 50 | "* Sparse Eigenvalue Problems ([scipy.sparse](http://docs.scipy.org/doc/scipy/reference/sparse.html))\n", 51 | "* Statistics ([scipy.stats](http://docs.scipy.org/doc/scipy/reference/stats.html))\n", 52 | "* Multi-dimensional image processing ([scipy.ndimage](http://docs.scipy.org/doc/scipy/reference/ndimage.html))\n", 53 | "* File IO ([scipy.io](http://docs.scipy.org/doc/scipy/reference/io.html))\n", 54 | "\n", 55 | "This notebook is not a complete tour of SciPy. Rather it focuses on the most important parts of the package for processing data.\n", 56 | "\n", 57 | "In many cases, you will want to import specific names from `scipy` subpackages. 
However, as a start, it is helpful to do the following import:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "import scipy as sp" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Approach" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "One of the most important skills in data science is to be able to find Python functions and classes in a module and learn how to use them yourself. Here are some recommended steps on how to go about this:\n", 83 | "\n", 84 | "* Find the online documentation for the package you are using.\n", 85 | "* Try to find the subpackage or even the function that looks like it will do the job.\n", 86 | "* Import the module, function or class and use tab completion and `?` to explore it.\n", 87 | "* Try using the function or class for an extremely simple case where you know the answer.\n", 88 | "* Then try using it for your real problem." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Resources" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "* [SciPy Website](http://www.scipy.org)\n", 103 | "* [SciPy Reference Documentation](http://docs.scipy.org/doc/scipy/reference/)\n", 104 | "* [Python Scientific Lecture Notes](http://scipy-lectures.github.io/index.html), Edited by Valentin Haenel,\n", 105 | "Emmanuelle Gouillart and Gaël Varoquaux.\n", 106 | "* [Lectures on Scientific Computing with Python](https://github.com/jrjohansson/scientific-python-lectures), J.R. Johansson.\n", 107 | "* [Introduction to Scientific Computing in Python](http://nbviewer.ipython.org/github/jakevdp/2014_fall_ASTR599/tree/master/), Jake Vanderplas." 
108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.5.2" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 1 132 | } 133 | -------------------------------------------------------------------------------- /Content/Model/images/rectangles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Model/images/rectangles.png -------------------------------------------------------------------------------- /Content/Model/images/trapz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Model/images/trapz.png -------------------------------------------------------------------------------- /Content/Programming/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objective:** Learn how to use Python effectively for data science." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "A prerequisite for this course is that you already know how to program, and hopefully already in Python. However, many of you may be rusty with Python and some of you might not have worked in Python yet. 
Lastly, many of you may have learned Python outside of the context of data science. To bring everyone up to the same level with using Python for data science, we begin with a brief tour of Python for data science. For this purpose we will use Jake VanderPlas's excellent [Whirlwind Tour of Python](../WhirlwindTourOfPython/00-Introduction.ipynb).\n", 22 | "\n", 23 | "This book is available on GitHub at https://github.com/jakevdp/WhirlwindTourOfPython and is included in this repository as a Git submodule." 24 | ] 25 | } 26 | ], 27 | "metadata": { 28 | "kernelspec": { 29 | "display_name": "Python 3", 30 | "language": "python", 31 | "name": "python3" 32 | }, 33 | "language_info": { 34 | "codemirror_mode": { 35 | "name": "ipython", 36 | "version": 3 37 | }, 38 | "file_extension": ".py", 39 | "mimetype": "text/x-python", 40 | "name": "python", 41 | "nbconvert_exporter": "python", 42 | "pygments_lexer": "ipython3", 43 | "version": "3.5.2" 44 | } 45 | }, 46 | "nbformat": 4, 47 | "nbformat_minor": 2 48 | } 49 | -------------------------------------------------------------------------------- /Content/Programming/PythonPackages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Learning Objective:** Learn what a Python package is and how to import it." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Built-ins" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "The Python programming language offers a very minimal set of objects and functions. 
These are called **built-ins**" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "dir(__builtins__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "All other capabilities are shipped as **packages** that have to be **imported** before you can use them." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Packages" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "* A Python **package** is one of two things:\n", 61 | " - A single file with a `.py` (pure Python) extension or `.so` extension (compiled).\n", 62 | " - A directory of files with `.py`/`.so` extensions and `__init__.py` files.\n", 63 | "* To use a package you must first **import** it.\n", 64 | "* The files within packages are called **modules**.\n", 65 | "* Once you import a package, you can usually see what files it comes from using the `__file__` attribute.\n", 66 | "\n", 67 | "Let's import the `functools` package and see what file it is coming from:" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "import functools" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "functools.__file__" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "You can also use `from` to import specific names from a package:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "from math import cos" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | 
"execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "cos(1.0)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Packages and import statements can also be nested:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from numpy.random import rand" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "rand()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "You can also use the `as` keyword to change the name of a package. For example the `numpy` package is usually imported under the name `np`:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 3, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "import numpy as np" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "0.0641057288069139" 179 | ] 180 | }, 181 | "execution_count": 4, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "np.random.rand()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "You can also use tab completion in import statements!" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Python ships with a large set of packages, which together are often called the **standard library**. 
Any Python distribution has all of these packages, but you still have to import them before you can use them. A full list, along with documentation of the standard library, can be found in the [Python Library Documentation](https://docs.python.org/3/library/index.html). All packages that are not in the standard library are called **external packages**. Examples of external packages that we will use in this course are NumPy, SciPy, Matplotlib and Pandas." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Resources" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "* [Python Library Documentation](https://docs.python.org/3/library/index.html)" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.4.3" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 0 240 | } 241 | -------------------------------------------------------------------------------- /Content/Programming/StandardLibrary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Standard Library" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The Python *Standard Library* refers to the set of Python modules and packages that are included with Python, but not imported by default. 
The documentation for the Standard Library can be found here:\n", 15 | "\n", 16 | "https://docs.python.org/3.4/library/index.html\n", 17 | "\n", 18 | "This doesn't include external modules and packages that are developed, distributed and installed separate from Python itself, such as NumPy, SciPy, Pandas, Matplotlib, etc." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "While there are many more packages in the Standard Library, here are the ones you will run into most often when using Python for Data Science:\n", 26 | "\n", 27 | "* [re](https://docs.python.org/3.4/library/re.html)\n", 28 | "* [datetime](https://docs.python.org/3.4/library/datetime.html)\n", 29 | "* [math](https://docs.python.org/3.4/library/math.html)\n", 30 | "* [random](https://docs.python.org/3.4/library/random.html)\n", 31 | "* [itertools](https://docs.python.org/3.4/library/itertools.html)\n", 32 | "* [functools](https://docs.python.org/3.4/library/functools.html)\n", 33 | "* [glob](https://docs.python.org/3.4/library/glob.html)\n", 34 | "* [os.path](https://docs.python.org/3.4/library/os.path.html)\n", 35 | "* [pickle](https://docs.python.org/3.4/library/pickle.html)\n", 36 | "* [multiprocessing](https://docs.python.org/3.4/library/multiprocessing.html)\n", 37 | "* [json](https://docs.python.org/3.4/library/json.html)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.4.3" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 0 62 | } 63 | -------------------------------------------------------------------------------- /Content/Tidy/01-Introduction.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.2" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /Content/Tidy/02-TidyData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tidy Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.2" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /Content/Transform/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | 
{ 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Transformation\n", 8 | "\n", 9 | "**Learning Objective**: Learn how to transform data during the different stages of the data science process, from tidying a messy dataset to transforming during the visualization and modeling stages." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Overview\n", 17 | "\n", 18 | "Data transformation is one of the most important phases in the data science process. Data transformation is needed:\n", 19 | "\n", 20 | "1. To bring a raw dataset into the tidy format.\n", 21 | "2. During the visualization phase to compute groups, aggregations, statistical summaries, etc.\n", 22 | "3. During the iterative process of *Visualizing* and *Modeling*.\n", 23 | "\n", 24 | "One of the strengths of Python and its various data science packages is their power, flexibility and convenience for transforming data. Sometimes Python and its standard library will be sufficient. Other times, we will need more powerful packages such as [NumPy](http://www.numpy.org/), [Pandas](http://pandas.pydata.org/) or [SQL](https://en.wikipedia.org/wiki/SQL). In this section of the course, you will learn how to leverage these tools to transform data." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Outline\n", 32 | "\n", 33 | "Here is an outline of this section of the course:\n", 34 | "\n", 35 | "* [Basic Data Transformation](02-BasicDataTransformation.ipynb)\n", 36 | "* [NumPy](03-Numpy.ipynb)\n", 37 | "* [Pandas](04-Pandas.ipynb)\n", 38 | "* [Relational Data](05-RelationalData.ipynb)" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.5.2" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 2 63 | } 64 | -------------------------------------------------------------------------------- /Content/Transform/04-Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.2" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /Content/Transform/data/Chinook_Sqlite.sqlite: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Transform/data/Chinook_Sqlite.sqlite -------------------------------------------------------------------------------- /Content/Visualize/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Visualization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "To do just about anything with data, you need to be able to look at it. In many, if not most cases, that will mean creating a visualization. This section of the course will cover the basics of data visualization. Our approach here tries to follow the Zen of Python:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "The Zen of Python, by Tim Peters\n", 29 | "\n", 30 | "Beautiful is better than ugly.\n", 31 | "Explicit is better than implicit.\n", 32 | "Simple is better than complex.\n", 33 | "Complex is better than complicated.\n", 34 | "Flat is better than nested.\n", 35 | "Sparse is better than dense.\n", 36 | "Readability counts.\n", 37 | "Special cases aren't special enough to break the rules.\n", 38 | "Although practicality beats purity.\n", 39 | "Errors should never pass silently.\n", 40 | "Unless explicitly silenced.\n", 41 | "In the face of ambiguity, refuse the temptation to guess.\n", 42 | "There should be one-- and preferably only one --obvious way to do it.\n", 43 | "Although that way may not be obvious at first unless you're Dutch.\n", 44 | "Now is better than never.\n", 45 | "Although never is often better than *right* now.\n", 46 | 
"If the implementation is hard to explain, it's a bad idea.\n", 47 | "If the implementation is easy to explain, it may be a good idea.\n", 48 | "Namespaces are one honking great idea -- let's do more of those!\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "import this" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "On one side there is a rich body of research literature that approaches data visualization from a formal and principled perspective through empirical studies. On the other side are numerous Python libraries which all offer slightly different ways of expressing visual concepts in code. These libraries are many (Matplotlib, ggplot, Bokeh, Altair, BQPlot, Seaborn, etc.) and they all cover some subset of data visualization quite well. Herein, I will briefly cover foundational results from data visualization research, and then turn quickly to their application by covering two very different visualization libraries: Altair and Matplotlib. These two Python libraries complement each other well, with Altair focusing narrowly on formal statistical visualization and Matplotlib covering everything else." 
61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Outline\n", 68 | "\n", 69 | "* [Visualization Grammar](02-VisualizationGrammar.ipynb)\n", 70 | "* [Tidy Data](03-TidyData.ipynb)\n", 71 | "* [Chart, Marks and Encodings](04-ChartMarksEncodings.ipynb)\n", 72 | "* [Transformation](05-Transformation.ipynb)\n", 73 | "* [Seattle Weather](06-SeattleWeather.ipynb)\n", 74 | "* [Configuration](07-Configuration.ipynb)\n", 75 | "* [Layers](08-Layers.ipynb)\n", 76 | "* [Theory and Practice](09-TheoryAndPractice.ipynb)\n", 77 | "* [Matplotlib](10-Matplotlib.ipynb)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Resources\n", 85 | "\n", 86 | "The following resources are the main ones for this section:\n", 87 | "\n", 88 | "- [Jeff Heer's CSE512](http://courses.cs.washington.edu/courses/cse512/14wi/index.html)\n", 89 | "- [Altair](https://altair-viz.github.io/)\n", 90 | "- [Matplotlib](http://matplotlib.org/)\n", 91 | "- [The work of Edward Tufte](https://www.edwardtufte.com/tufte/)" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Content/Visualize/03-TidyData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tidy Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | 
"source": [ 14 | "**Learning Objective:** Understand the basics of *tidy data*." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Overview" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "In our [Theory of Data](../Introduction/03-TheoryofData.ipynb) section, we covered some basic aspects of data:\n", 29 | "\n", 30 | "- **Data types:** ordinal, nominal, quantitative, date/time, geographic\n", 31 | "- **Variables:** a single thing that is measured\n", 32 | "- **Observations:** multiple variables that are measured for a single entity\n", 33 | "- **Dataset:** a set of records\n", 34 | "\n", 35 | "The idea of *tidy data* is this:\n", 36 | "\n", 37 | "1. There are many possible ways one can organize variables and observations into a dataset;\n", 38 | "2. However, not all ways are equal; and\n", 39 | "3. A particular way of organizing a dataset, called *tidy data*, is particularly useful in working with data.\n", 40 | "\n", 41 | "The idea of tidy data was first formalized by Hadley Wickham in his [Tidy Data](https://www.jstatsoft.org/article/view/v059i10) paper from 2014. Later in this course we will describe tidy data in more detail. However, it is useful to take a short tidy data detour before diving into data visualization. The reason for our pausing to describe *tidy data* at this point is that our first rule in data visualization is this:\n", 42 | "\n", 43 | "> Start all data visualizations with a tidy dataset.\n", 44 | "\n", 45 | "Thus, if you want to visualize a dataset, your first task will be to put it into a tidy form. For now, we will be working with datasets that are already tidy; the often painful process of tidying a dataset will be covered later." 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Defining tidy data\n", 53 | "\n", 54 | "A tidy dataset has the following properties:\n", 55 | "\n", 56 | "1. 
Each variable forms a column\n", 57 | "2. Each observation forms a row\n", 58 | "3. Each type of observational unit forms a table\n", 59 | "\n", 60 | "*Messy data* is any other arrangement of the data." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Example: cars dataset" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "The cars dataset, which comes with the Altair visualization library, is an example of a tidy dataset. Let's load that dataset and look at it:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "import altair as alt\n", 86 | "alt.enable_mime_rendering()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "cars = alt.load_dataset('cars')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/html": [ 110 | "
\n", 111 | "\n", 124 | "\n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
AccelerationCylindersDisplacementHorsepowerMiles_per_GallonNameOriginWeight_in_lbsYear
012.08307.0130.018.0chevrolet chevelle malibuUSA35041970-01-01
111.58350.0165.015.0buick skylark 320USA36931970-01-01
211.08318.0150.018.0plymouth satelliteUSA34361970-01-01
312.08304.0150.016.0amc rebel sstUSA34331970-01-01
410.58302.0140.017.0ford torinoUSA34491970-01-01
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " Acceleration Cylinders Displacement Horsepower Miles_per_Gallon \\\n", 206 | "0 12.0 8 307.0 130.0 18.0 \n", 207 | "1 11.5 8 350.0 165.0 15.0 \n", 208 | "2 11.0 8 318.0 150.0 18.0 \n", 209 | "3 12.0 8 304.0 150.0 16.0 \n", 210 | "4 10.5 8 302.0 140.0 17.0 \n", 211 | "\n", 212 | " Name Origin Weight_in_lbs Year \n", 213 | "0 chevrolet chevelle malibu USA 3504 1970-01-01 \n", 214 | "1 buick skylark 320 USA 3693 1970-01-01 \n", 215 | "2 plymouth satellite USA 3436 1970-01-01 \n", 216 | "3 amc rebel sst USA 3433 1970-01-01 \n", 217 | "4 ford torino USA 3449 1970-01-01 " 218 | ] 219 | }, 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "cars.head()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "The cars dataset above is stored as a table-like object called a `DataFrame`. In Python, the [Pandas](http://pandas.pydata.org/) library provides this data structure:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 5, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "pandas.core.frame.DataFrame" 247 | ] 248 | }, 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "type(cars)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "We will learn more about Pandas and `DataFrame`s later in this course. For now, we cover a few of their commonly used attributes and methods. The `.columns` attribute returns a one dimensional sequence of the column names. 
These are the *variables* in the dataset:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 6, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "Index(['Acceleration', 'Cylinders', 'Displacement', 'Horsepower',\n", 276 | " 'Miles_per_Gallon', 'Name', 'Origin', 'Weight_in_lbs', 'Year'],\n", 277 | " dtype='object')" 278 | ] 279 | }, 280 | "execution_count": 6, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "cars.columns" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "The rows (observations) are labeled by another one dimensional sequence called the index (`.index`):" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 7, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n", 307 | " ...\n", 308 | " 396, 397, 398, 399, 400, 401, 402, 403, 404, 405],\n", 309 | " dtype='int64', length=406)" 310 | ] 311 | }, 312 | "execution_count": 7, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "cars.index" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "The length of the dataset is the number of rows:" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 8, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "406" 339 | ] 340 | }, 341 | "execution_count": 8, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "len(cars)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Lastly, the `DataFrame` acts like a specialized dictionary, 
where the keys are the column names and the values are the columns:" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 9, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "0 12.0\n", 368 | "1 11.5\n", 369 | "2 11.0\n", 370 | "3 12.0\n", 371 | "4 10.5\n", 372 | "Name: Acceleration, dtype: float64" 373 | ] 374 | }, 375 | "execution_count": 9, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "cars['Acceleration'].head()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "We will be using this cars dataset to cover the basics of data visualization with Altair." 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.6.3" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /Content/Visualize/images/column_syntax1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/column_syntax1.png -------------------------------------------------------------------------------- /Content/Visualize/images/column_syntax2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/column_syntax2.png -------------------------------------------------------------------------------- /Content/Visualize/images/encodings1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/encodings1.png -------------------------------------------------------------------------------- /Content/Visualize/images/encodings2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/encodings2.png -------------------------------------------------------------------------------- /Content/Visualize/images/mackinlay1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/mackinlay1.png -------------------------------------------------------------------------------- /Content/Visualize/images/mackinlay2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/mackinlay2.png -------------------------------------------------------------------------------- /Content/Visualize/images/marks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/marks.png -------------------------------------------------------------------------------- /Content/Visualize/images/marks_encoding.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/marks_encoding.png -------------------------------------------------------------------------------- /Content/Visualize/images/measles_wsj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/measles_wsj.png -------------------------------------------------------------------------------- /Content/Visualize/images/social_assistance_538.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/social_assistance_538.png -------------------------------------------------------------------------------- /Content/Visualize/images/viz_grammar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Visualize/images/viz_grammar.png -------------------------------------------------------------------------------- /Content/Workflow/01-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | 
"file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.2" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /Content/Workflow/02-TheJupyterNotebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# What is the Jupyter Notebook?" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Introduction" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "The Jupyter Notebook is an **interactive computing environment** that enables users to author notebook documents that include: \n", 26 | "- Live code\n", 27 | "- Interactive widgets\n", 28 | "- Plots\n", 29 | "- Narrative text\n", 30 | "- Equations\n", 31 | "- Images\n", 32 | "- Video\n", 33 | "\n", 34 | "These documents provide a **complete and self-contained record of a computation** that can be converted to various formats and shared with others using email, [Dropbox](http://dropbox.com), version control systems (like git/[GitHub](http://github.com)) or [nbviewer.jupyter.org](http://nbviewer.jupyter.org)." 
 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "slideshow": { 41 | "slide_type": "slide" 42 | } 43 | }, 44 | "source": [ 45 | "### Components" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "The Jupyter Notebook combines three components:\n", 53 | "\n", 54 | "* **The notebook web application**: An interactive web application for writing and running code interactively and authoring notebook documents.\n", 55 | "* **Kernels**: Separate processes started by the notebook web application that run users' code in a given language and return output back to the notebook web application. The kernel also handles things like computations for interactive widgets, tab completion and introspection. \n", 56 | "* **Notebook documents**: Self-contained documents that contain a representation of all content visible in the notebook web application, including inputs and outputs of the computations, narrative\n", 57 | "text, equations, images, and rich media representations of objects. Each notebook document has its own kernel."
58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "## Notebook web application" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "The notebook web application enables users to:\n", 76 | "\n", 77 | "* **Edit code in the browser**, with automatic syntax highlighting, indentation, and tab completion/introspection.\n", 78 | "* **Run code from the browser**, with the results of computations attached to the code which generated them.\n", 79 | "* See the results of computations with **rich media representations**, such as HTML, LaTeX, PNG, SVG, PDF, etc.\n", 80 | "* Create and use **interactive JavaScript widgets**, which bind interactive user interface controls and visualizations to reactive kernel side computations.\n", 81 | "* Author **narrative text** using the [Markdown](https://daringfireball.net/projects/markdown/) markup language.\n", 82 | "* Build **hierarchical documents** that are organized into sections with different levels of headings.\n", 83 | "* Include mathematical equations using **LaTeX syntax in Markdown**, which are rendered in-browser by [MathJax](http://www.mathjax.org/)." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "slide" 91 | } 92 | }, 93 | "source": [ 94 | "## Kernels" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Through Jupyter's kernel and messaging architecture, the Notebook allows code to be run in a range of different programming languages. For each notebook document that a user opens, the web application starts a kernel that runs the code for that notebook. 
Each kernel is capable of running code in a single programming language and there are kernels available in the following languages:\n", 102 | "\n", 103 | "* Python (https://github.com/ipython/ipython)\n", 104 | "* Julia (https://github.com/JuliaLang/IJulia.jl)\n", 105 | "* R (https://github.com/takluyver/IRkernel)\n", 106 | "* Ruby (https://github.com/minrk/iruby)\n", 107 | "* Haskell (https://github.com/gibiansky/IHaskell)\n", 108 | "* Scala (https://github.com/Bridgewater/scala-notebook)\n", 109 | "* node.js (https://gist.github.com/Carreau/4279371)\n", 110 | "* Go (https://github.com/takluyver/igo)\n", 111 | "\n", 112 | "The default kernel runs Python code. The notebook provides a simple way for users to pick which of these kernels is used for a given notebook. \n", 113 | "\n", 114 | "Each of these kernels communicates with the notebook web application and web browser using a JSON over ZeroMQ/WebSockets message protocol that is described [here](http://ipython.org/ipython-doc/dev/development/messaging.html). Most users don't need to know about these details, but it helps to understand that \"kernels run code.\"" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "slide" 122 | } 123 | }, 124 | "source": [ 125 | "## Notebook documents" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Notebook documents contain the **inputs and outputs** of an interactive session as well as **narrative text** that accompanies the code but is not meant for execution. **Rich output** generated by running code, including HTML, images, video, and plots, is embedded in the notebook, which makes it a complete and self-contained record of a computation. 
" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "When you run the notebook web application on your computer, notebook documents are just **files on your local filesystem with a `.ipynb` extension**. This allows you to use familiar workflows for organizing your notebooks into folders and sharing them with others." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Notebooks consist of a **linear sequence of cells**. There are four basic cell types:\n", 147 | "\n", 148 | "* **Code cells:** Input and output of live code that is run in the kernel\n", 149 | "* **Markdown cells:** Narrative text with embedded LaTeX equations\n", 150 | "* **Heading cells:** 6 levels of hierarchical organization and formatting\n", 151 | "* **Raw cells:** Unformatted text that is included, without modification, when notebooks are converted to different formats using nbconvert\n", 152 | "\n", 153 | "Internally, notebook documents are **[JSON](http://en.wikipedia.org/wiki/JSON) data** with **binary values [base64](http://en.wikipedia.org/wiki/Base64)** encoded. This allows them to be **read and manipulated programmatically** by any programming language. Because JSON is a text format, notebook documents are version control friendly.\n", 154 | "\n", 155 | "**Notebooks can be exported** to different static formats including HTML, reStructuredText, LaTeX, PDF, and slide shows ([reveal.js](http://lab.hakim.se/reveal-js/#/)) using Jupyter's `nbconvert` utility.\n", 156 | "\n", 157 | "Furthermore, any notebook document available from a **public URL or on GitHub can be shared** via [nbviewer](http://nbviewer.ipython.org). This service loads the notebook document from the URL and renders it as a static web page. The resulting web page may thus be shared with others **without their needing to install the Jupyter Notebook**."
158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.4.0" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 0 182 | } 183 | -------------------------------------------------------------------------------- /Content/Workflow/03-NotebookBasics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Notebook Basics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Running the Notebook Server" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "The Jupyter notebook server is a custom web server that runs the notebook web application. Most of the time, users run the notebook server on their local computer using the command line interface." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "
\n", 29 | "This section is only relevant if you are *not* using JupyterHub or another pre-deployed version of the Jupyter Notebook.\n", 30 | "
" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Starting the notebook server using the command line" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "You can start the notebook server from the command line (Terminal on Mac/Linux, CMD prompt on Windows) by running the following command: \n", 45 | "\n", 46 | " jupyter notebook\n", 47 | "\n", 48 | "This will print some information about the notebook server in your terminal, including the URL of the web application (by default, `http://127.0.0.1:8888`). It will then open your default web browser to this URL.\n", 49 | "\n", 50 | "When the notebook opens, you will see the **notebook dashboard**, which will show a list of the notebooks, files, and subdirectories in the directory where the notebook server was started (as seen in the next section, below). Most of the time, you will want to start a notebook server in the highest directory in your filesystem where notebooks can be found. Often this will be your home directory." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### Additional options" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "By default, the notebook server starts on port 8888. 
If port 8888 is unavailable, the notebook server searches for the next available port.\n", 65 | "\n", 66 | "You can also specify the port manually:\n", 67 | "\n", 68 | " jupyter notebook --port 9999\n", 69 | "\n", 70 | "Or start the notebook server without opening a web browser.\n", 71 | "\n", 72 | " jupyter notebook --no-browser\n", 73 | "\n", 74 | "The notebook server has a number of other command line arguments that can be displayed with the `--help` flag: \n", 75 | "\n", 76 | " jupyter notebook --help" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## The Notebook dashboard" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "When you first start the notebook server, your browser will open to the notebook dashboard. The dashboard serves as a home page for the notebook. Its main purpose is to display the notebooks and files in the current directory. For example, here is a screenshot of the dashboard page for the `examples` directory in the Jupyter repository:\n", 91 | "\n", 92 | "" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "The top of the notebook list displays clickable breadcrumbs of the current directory. By clicking on these breadcrumbs or on sub-directories in the notebook list, you can navigate your file system.\n", 100 | "\n", 101 | "To create a new notebook, click on the \"New\" button at the top of the list and select a kernel from the dropdown (as seen below). Which kernels are listed depends on what's installed on the server. 
Some of the kernels in the screenshot below may not exist as an option to you.\n", 102 | "\n", 103 | "" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Notebooks and files can be uploaded to the current directory by dragging a notebook file onto the notebook list or by the \"click here\" text above the list.\n", 111 | "\n", 112 | "The notebook list shows green \"Running\" text and a green notebook icon next to running notebooks (as seen below). Notebooks remain running until you explicitly shut them down; closing the notebook's page is not sufficient.\n", 113 | "\n", 114 | "" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "To shutdown, delete, duplicate, or rename a notebook check the checkbox next to it and an array of controls will appear at the top of the notebook list (as seen below). You can also use the same operations on directories and files when applicable.\n", 122 | "\n", 123 | "" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "To see all of your running notebooks along with their directories, click on the \"Running\" tab:\n", 131 | "\n", 132 | "\n", 133 | "\n", 134 | "This view provides a convenient way to track notebooks that you start as you navigate the file system in a long running notebook server." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## Overview of the Notebook UI" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "If you create a new notebook or open an existing one, you will be taken to the notebook user interface (UI). This UI allows you to run code and author notebook documents interactively. 
The notebook UI has the following main areas:\n", 149 | "\n", 150 | "* Menu\n", 151 | "* Toolbar\n", 152 | "* Notebook area and cells\n", 153 | "\n", 154 | "The notebook has an interactive tour of these elements that can be started in the \"Help:User Interface Tour\" menu item." 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Modal editor" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Starting with IPython 2.0, the Jupyter Notebook has a modal user interface. This means that the keyboard does different things depending on which mode the Notebook is in. There are two modes: edit mode and command mode." 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Edit mode" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Edit mode is indicated by a green cell border and a prompt showing in the editor area:\n", 183 | "\n", 184 | "\n", 185 | "\n", 186 | "When a cell is in edit mode, you can type into the cell, like a normal text editor." 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "
\n", 194 | "Enter edit mode by pressing `Enter` or using the mouse to click on a cell's editor area.\n", 195 | "
" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Command mode" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Command mode is indicated by a grey cell border with a blue left margin:\n", 210 | "\n", 211 | "\n", 212 | "\n", 213 | "When you are in command mode, you are able to edit the notebook as a whole, but not type into individual cells. Most importantly, in command mode, the keyboard is mapped to a set of shortcuts that let you perform notebook and cell actions efficiently. For example, if you are in command mode and you press `c`, you will copy the current cell - no modifier is needed." 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "
\n", 221 | "Don't try to type into a cell in command mode; unexpected things will happen!\n", 222 | "
" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "
\n", 230 | "Enter command mode by pressing `Esc` or using the mouse to click *outside* a cell's editor area.\n", 231 | "
" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## Mouse navigation" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "All navigation and actions in the Notebook are available using the mouse through the menubar and toolbar, which are both above the main Notebook area:\n", 246 | "\n", 247 | "" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The first idea of mouse based navigation is that **cells can be selected by clicking on them.** The currently selected cell gets a green or grey border depending on whether the notebook is in edit or command mode. If you click inside a cell's editor area, you will enter edit mode. If you click on the prompt or output area of a cell you will enter command mode.\n", 255 | "\n", 256 | "If you are running this notebook in a live session (not on http://nbviewer.jupyter.org) try selecting different cells and going between edit and command mode. Try typing into a cell." 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "The second idea of mouse based navigation is that **cell actions usually apply to the currently selected cell**. Thus if you want to run the code in a cell, you would select it and click the Run button in the toolbar or the \"Cell:Run\" menu item. Similarly, to copy a cell you would select it and click the Copy button in the toolbar or the \"Edit:Copy\" menu item. With this simple pattern, you should be able to do most everything you need with the mouse.\n", 264 | "\n", 265 | "Markdown and heading cells have one other state that can be modified with the mouse. These cells can either be rendered or unrendered. When they are rendered, you will see a nice formatted representation of the cell's contents. When they are unrendered, you will see the raw text source of the cell. 
To render the selected cell with the mouse, click the button in the toolbar or the \"Cell:Run\" menu item. To unrender the selected cell, double click on the cell." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## Keyboard Navigation" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "The modal user interface of the Jupyter Notebook has been optimized for efficient keyboard usage. This is made possible by having two different sets of keyboard shortcuts: one set that is active in edit mode and another in command mode.\n", 280 | "\n", 281 | "The most important keyboard shortcuts are `Enter`, which enters edit mode, and `Esc`, which enters command mode.\n", 282 | "\n", 283 | "In edit mode, most of the keyboard is dedicated to typing into the cell's editor. Thus, in edit mode there are relatively few shortcuts. In command mode, the entire keyboard is available for shortcuts, so there are many more. The `Help`->`Keyboard Shortcuts` dialog lists the available shortcuts." 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "We recommend learning the command mode shortcuts in the following rough order:\n", 291 | "\n", 292 | "1. Basic navigation: `enter`, `shift-enter`, `up/k`, `down/j`\n", 293 | "2. Saving the notebook: `s`\n", 294 | "2. Change Cell types: `y`, `m`, `1-6`, `t`\n", 295 | "3. Cell creation: `a`, `b`\n", 296 | "4. Cell editing: `x`, `c`, `v`, `d`, `z`\n", 297 | "5. 
Kernel operations: `i`, `0` (press twice)" 298 | ] 299 | } 300 | ], 301 | "metadata": { 302 | "kernelspec": { 303 | "display_name": "Python 3", 304 | "language": "python", 305 | "name": "python3" 306 | }, 307 | "language_info": { 308 | "codemirror_mode": { 309 | "name": "ipython", 310 | "version": 3 311 | }, 312 | "file_extension": ".py", 313 | "mimetype": "text/x-python", 314 | "name": "python", 315 | "nbconvert_exporter": "python", 316 | "pygments_lexer": "ipython3", 317 | "version": "3.4.3" 318 | } 319 | }, 320 | "nbformat": 4, 321 | "nbformat_minor": 0 322 | } 323 | -------------------------------------------------------------------------------- /Content/Workflow/04-RunningCode.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Running Code" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "First and foremost, the Jupyter Notebook is an interactive environment for writing and running code. The notebook is capable of running code in a wide range of languages. However, each notebook is associated with a single kernel. This notebook is associated with the IPython kernel, and therefore runs Python code."
 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Code cells allow you to enter and run code" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Run a code cell using `Shift-Enter` or pressing the Run button in the toolbar above:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "a = 10" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "print(a)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "There are two other keyboard shortcuts for running code:\n", 58 | "\n", 59 | "* `Alt-Enter` runs the current cell and inserts a new one below.\n", 60 | "* `Ctrl-Enter` runs the current cell and enters command mode." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Managing the Kernel" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "Code is run in a separate process called the Kernel. The Kernel can be interrupted or restarted. Try running the following cell and then hit the interrupt (stop) button in the toolbar above." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "import time\n", 86 | "time.sleep(10)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "If the Kernel dies you will be prompted to restart it. 
Here we call the low-level system libc.time routine with the wrong argument via\n", 94 | "ctypes to segfault the Python interpreter:" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import sys\n", 106 | "from ctypes import CDLL\n", 107 | "# This will crash the Python interpreter on a Linux or Mac system\n", 108 | "# equivalent calls can be made on Windows\n", 109 | "dll = 'dylib' if sys.platform == 'darwin' else 'so.6'\n", 110 | "libc = CDLL(\"libc.%s\" % dll) \n", 111 | "libc.time(-1) # BOOM!!" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Cell menu" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "The \"Cell\" menu has a number of menu items for running code in different ways. These include:\n", 126 | "\n", 127 | "* Run and Select Below\n", 128 | "* Run and Insert Below\n", 129 | "* Run All\n", 130 | "* Run All Above\n", 131 | "* Run All Below" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Restarting the kernels" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "The kernel maintains the state of a notebook's computations. You can reset this state by restarting the kernel. This is done by clicking on the restart button in the toolbar above." 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## sys.stdout and sys.stderr" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "The stdout and stderr streams are displayed as text in the output area."
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "print(\"hi, stdout\")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "from __future__ import print_function\n", 182 | "import sys\n", 183 | "print('hi, stderr', file=sys.stderr)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Output is asynchronous" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "All output is displayed asynchronously as it is generated in the Kernel. If you execute the next cell, you will see the output one piece at a time, not all at the end." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "import time, sys\n", 209 | "for i in range(8):\n", 210 | " print(i)\n", 211 | " time.sleep(0.5)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Large outputs" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "To better handle large outputs, the output area can be collapsed. 
Run the following cell and then single- or double- click on the active area to the left of the output:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "for i in range(50):\n", 237 | " print(i)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Beyond a certain point, output will scroll automatically:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "for i in range(500):\n", 256 | " print(2**i - 1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.4.3" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 0 290 | } 291 | -------------------------------------------------------------------------------- /Content/Workflow/06-Markdown.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Markdown Cells" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Text can be added to Jupyter Notebooks using Markdown cells. Markdown is a popular markup language that is a superset of HTML. 
Its specification can be found here:\n", 15 | "\n", 16 | "<http://daringfireball.net/projects/markdown/>" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Markdown basics" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "You can make text *italic* or **bold**." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "You can build nested itemized or enumerated lists:\n", 38 | "\n", 39 | "* One\n", 40 | " - Sublist\n", 41 | " - This\n", 42 | " - Sublist\n", 43 | " - That\n", 44 | " - The other thing\n", 45 | "* Two\n", 46 | " - Sublist\n", 47 | "* Three\n", 48 | " - Sublist\n", 49 | "\n", 50 | "Now another list:\n", 51 | "\n", 52 | "1. Here we go\n", 53 | " 1. Sublist\n", 54 | " 2. Sublist\n", 55 | "2. There we go\n", 56 | "3. Now this" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "You can add horizontal rules:\n", 64 | "\n", 65 | "---" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Here is a blockquote:\n", 73 | "\n", 74 | "> Beautiful is better than ugly.\n", 75 | "> Explicit is better than implicit.\n", 76 | "> Simple is better than complex.\n", 77 | "> Complex is better than complicated.\n", 78 | "> Flat is better than nested.\n", 79 | "> Sparse is better than dense.\n", 80 | "> Readability counts.\n", 81 | "> Special cases aren't special enough to break the rules.\n", 82 | "> Although practicality beats purity.\n", 83 | "> Errors should never pass silently.\n", 84 | "> Unless explicitly silenced.\n", 85 | "> In the face of ambiguity, refuse the temptation to guess.\n", 86 | "> There should be one-- and preferably only one --obvious way to do it.\n", 87 | "> Although that way may not be obvious at first unless you're Dutch.\n", 88 | "> Now is better than never.\n", 89 | "> Although never is often better than *right* now.\n", 90 | "> If the implementation is hard to explain, 
it's a bad idea.\n", 91 | "> If the implementation is easy to explain, it may be a good idea.\n", 92 | "> Namespaces are one honking great idea -- let's do more of those!" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "And shorthand for links:\n", 100 | "\n", 101 | "[Jupyter's website](http://jupyter.org)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Headings" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "You can add headings by starting a line with one (or multiple) `#` followed by a space, as in the following example:\n", 116 | "\n", 117 | "# Heading 1\n", 118 | "# Heading 2\n", 119 | "## Heading 2.1\n", 120 | "## Heading 2.2" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Embedded code" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "You can embed code meant for illustration instead of execution in Python:\n", 135 | "\n", 136 | " def f(x):\n", 137 | " \"\"\"a docstring\"\"\"\n", 138 | " return x**2\n", 139 | "\n", 140 | "or other languages:\n", 141 | "\n", 142 | " if (i=0; i\n", 187 | "```python\n", 188 | "print \"Hello World\"\n", 189 | "```\n", 190 | "\n", 191 | "```javascript\n", 192 | "console.log(\"Hello World\")\n", 193 | "```\n", 194 | "\n", 195 | "\n", 196 | "Gives \n", 197 | "```python\n", 198 | "print \"Hello World\"\n", 199 | "```\n", 200 | "\n", 201 | "```javascript\n", 202 | "console.log(\"Hello World\")\n", 203 | "```\n", 204 | "\n", 205 | "And a table like this : \n", 206 | "\n", 207 | "
\n",
208 |     "| This | is   |\n",
209 |     "|------|------|\n",
210 |     "|   a  | table| \n",
211 |     "
\n", 212 | "\n", 213 | "A nice Html Table\n", 214 | "\n", 215 | "| This | is |\n", 216 | "|------|------|\n", 217 | "| a | table| " 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## General HTML" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Because Markdown is a superset of HTML you can even add things like HTML tables:\n", 232 | "\n", 233 | "\n", 234 | "\n", 235 | "\n", 236 | "\n", 237 | "\n", 238 | "\n", 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "\n", 245 | "\n", 246 | "
Header 1Header 2
row 1, cell 1row 1, cell 2
row 2, cell 1row 2, cell 2
" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "## Local files" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "If you have local files in your Notebook directory, you can refer to these files in Markdown cells directly:\n", 261 | "\n", 262 | " [subdirectory/]\n", 263 | "\n", 264 | "For example, in the images folder, we have a screenshot of a cell in command mode:\n", 265 | "\n", 266 | " \n", 267 | "\n", 268 | "\n", 269 | "\n", 270 | "These do not embed the data into the notebook file, and require that the files exist when you are viewing the notebook." 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Security of local files" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Note that this means that the Jupyter notebook server also acts as a generic file server\n", 285 | "for files inside the same tree as your notebooks. Access is not granted outside the\n", 286 | "notebook folder so you have strict control over what files are visible, but for this\n", 287 | "reason it is highly recommended that you do not run the notebook server with a notebook\n", 288 | "directory at a high level in your filesystem (e.g. your home directory).\n", 289 | "\n", 290 | "When you run the notebook in a password-protected manner, local file access is restricted\n", 291 | "to authenticated users unless read-only views are active." 
292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "Python 3", 298 | "language": "python", 299 | "name": "python3" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.4.3" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 0 316 | } 317 | -------------------------------------------------------------------------------- /Content/Workflow/07-LaTeX.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using LaTeX to Typeset Equations" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "[LaTeX](http://en.wikipedia.org/wiki/LaTeX) is a markup and programming language for typesetting mathematical languages. It is used extensively in Physics, Math, Statistics and other technical fields for writing papers and other documents with equations.\n", 15 | "\n", 16 | "You can use LaTeX syntax in the Markdown cells of your Jupyter notebooks. This enables you to create notebooks that have equations in addition to code and narrative text. The Jupyter Notebook renders LaTeX using the excellent [MathJax](http://docs.mathjax.org) library." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Learning LaTeX" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "If you don't already know LaTeX syntax, a good starting place is the [Short Math Guide to LaTeX](ftp://ftp.ams.org/pub/tex/doc/amsmath/short-math-guide.pdf) by the American Mathematical Society.\n", 31 | "\n", 32 | "The rest of this document shows examples of LaTeX in the notebook." 
33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## The Lorenz equations\n", 40 | "\n", 41 | "### Source\n", 42 | "\n", 43 | "```\\begin{align}\n", 44 | "\\dot{x} & = \\sigma(y-x) \\\\\n", 45 | "\\dot{y} & = \\rho x - y - xz \\\\\n", 46 | "\\dot{z} & = -\\beta z + xy\n", 47 | "\\end{align}\n", 48 | "```\n", 49 | "\n", 50 | "### Display\n", 51 | "\n", 52 | "\\begin{align}\n", 53 | "\\dot{x} & = \\sigma(y-x) \\\\\n", 54 | "\\dot{y} & = \\rho x - y - xz \\\\\n", 55 | "\\dot{z} & = -\\beta z + xy\n", 56 | "\\end{align}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## The Cauchy-Schwarz inequality\n", 64 | "\n", 65 | "### Source\n", 66 | "\n", 67 | "```\\begin{equation*}\n", 68 | "\\left( \\sum_{k=1}^n a_k b_k \\right)^2 \\leq \\left( \\sum_{k=1}^n a_k^2 \\right) \\left( \\sum_{k=1}^n b_k^2 \\right)\n", 69 | "\\end{equation*}\n", 70 | "```\n", 71 | "### Display\n", 72 | "\n", 73 | "\\begin{equation*}\n", 74 | "\\left( \\sum_{k=1}^n a_k b_k \\right)^2 \\leq \\left( \\sum_{k=1}^n a_k^2 \\right) \\left( \\sum_{k=1}^n b_k^2 \\right)\n", 75 | "\\end{equation*}" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## A cross product formula\n", 83 | "\n", 84 | "### Source\n", 85 | "\n", 86 | "```\\begin{equation*}\n", 87 | "\\mathbf{V}_1 \\times \\mathbf{V}_2 = \\begin{vmatrix}\n", 88 | "\\mathbf{i} & \\mathbf{j} & \\mathbf{k} \\\\\n", 89 | "\\frac{\\partial X}{\\partial u} & \\frac{\\partial Y}{\\partial u} & 0 \\\\\n", 90 | "\\frac{\\partial X}{\\partial v} & \\frac{\\partial Y}{\\partial v} & 0\n", 91 | "\\end{vmatrix} \n", 92 | "\\end{equation*}\n", 93 | "```\n", 94 | "\n", 95 | "### Display\n", 96 | "\n", 97 | "\\begin{equation*}\n", 98 | "\\mathbf{V}_1 \\times \\mathbf{V}_2 = \\begin{vmatrix}\n", 99 | "\\mathbf{i} & \\mathbf{j} & \\mathbf{k} \\\\\n", 100 | "\\frac{\\partial X}{\\partial u} & \\frac{\\partial Y}{\\partial u} & 0 
\\\\\n", 101 | "\\frac{\\partial X}{\\partial v} & \\frac{\\partial Y}{\\partial v} & 0\n", 102 | "\\end{vmatrix} \n", 103 | "\\end{equation*}" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## The probability of getting \\(k\\) heads when flipping \\(n\\) coins is\n", 111 | "\n", 112 | "### Source\n", 113 | "\n", 114 | "```\\begin{equation*}\n", 115 | "P(E) = {n \\choose k} p^k (1-p)^{ n-k} \n", 116 | "\\end{equation*}\n", 117 | "```\n", 118 | "\n", 119 | "### Display\n", 120 | "\n", 121 | "\\begin{equation*}\n", 122 | "P(E) = {n \\choose k} p^k (1-p)^{ n-k} \n", 123 | "\\end{equation*}" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## An identity of Ramanujan\n", 131 | "\n", 132 | "### Source\n", 133 | "\n", 134 | "```\\begin{equation*}\n", 135 | "\\frac{1}{\\Bigl(\\sqrt{\\phi \\sqrt{5}}-\\phi\\Bigr) e^{\\frac25 \\pi}} =\n", 136 | "1+\\frac{e^{-2\\pi}} {1+\\frac{e^{-4\\pi}} {1+\\frac{e^{-6\\pi}}\n", 137 | "{1+\\frac{e^{-8\\pi}} {1+\\ldots} } } } \n", 138 | "\\end{equation*}\n", 139 | "```\n", 140 | "\n", 141 | "### Display\n", 142 | "\n", 143 | "\\begin{equation*}\n", 144 | "\\frac{1}{\\Bigl(\\sqrt{\\phi \\sqrt{5}}-\\phi\\Bigr) e^{\\frac25 \\pi}} =\n", 145 | "1+\\frac{e^{-2\\pi}} {1+\\frac{e^{-4\\pi}} {1+\\frac{e^{-6\\pi}}\n", 146 | "{1+\\frac{e^{-8\\pi}} {1+\\ldots} } } } \n", 147 | "\\end{equation*}" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## A Rogers-Ramanujan identity\n", 155 | "\n", 156 | "### Source\n", 157 | "\n", 158 | "```\\begin{equation*}\n", 159 | "1 + \\frac{q^2}{(1-q)}+\\frac{q^6}{(1-q)(1-q^2)}+\\cdots =\n", 160 | "\\prod_{j=0}^{\\infty}\\frac{1}{(1-q^{5j+2})(1-q^{5j+3})},\n", 161 | "\\quad\\quad \\text{for $|q|<1$}. 
\n", 162 | "\\end{equation*}\n", 163 | "```\n", 164 | "\n", 165 | "### Display\n", 166 | "\n", 167 | "\\begin{equation*}\n", 168 | "1 + \\frac{q^2}{(1-q)}+\\frac{q^6}{(1-q)(1-q^2)}+\\cdots =\n", 169 | "\\prod_{j=0}^{\\infty}\\frac{1}{(1-q^{5j+2})(1-q^{5j+3})},\n", 170 | "\\quad\\quad \\text{for $|q|<1$}. \n", 171 | "\\end{equation*}" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Maxwell's equations\n", 179 | "\n", 180 | "### Source\n", 181 | "\n", 182 | "```\\begin{align}\n", 183 | "\\nabla \\times \\vec{\\mathbf{B}} -\\, \\frac1c\\, \\frac{\\partial\\vec{\\mathbf{E}}}{\\partial t} & = \\frac{4\\pi}{c}\\vec{\\mathbf{j}} \\\\ \\nabla \\cdot \\vec{\\mathbf{E}} & = 4 \\pi \\rho \\\\\n", 184 | "\\nabla \\times \\vec{\\mathbf{E}}\\, +\\, \\frac1c\\, \\frac{\\partial\\vec{\\mathbf{B}}}{\\partial t} & = \\vec{\\mathbf{0}} \\\\\n", 185 | "\\nabla \\cdot \\vec{\\mathbf{B}} & = 0 \n", 186 | "\\end{align}\n", 187 | "```\n", 188 | "\n", 189 | "### Display\n", 190 | "\n", 191 | "\\begin{align}\n", 192 | "\\nabla \\times \\vec{\\mathbf{B}} -\\, \\frac1c\\, \\frac{\\partial\\vec{\\mathbf{E}}}{\\partial t} & = \\frac{4\\pi}{c}\\vec{\\mathbf{j}} \\\\ \\nabla \\cdot \\vec{\\mathbf{E}} & = 4 \\pi \\rho \\\\\n", 193 | "\\nabla \\times \\vec{\\mathbf{E}}\\, +\\, \\frac1c\\, \\frac{\\partial\\vec{\\mathbf{B}}}{\\partial t} & = \\vec{\\mathbf{0}} \\\\\n", 194 | "\\nabla \\cdot \\vec{\\mathbf{B}} & = 0 \n", 195 | "\\end{align}" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Equation Numbering and References\n", 203 | "\n", 204 | "Equation numbering and referencing will be available in a future version of the Jupyter Notebook." 
205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Inline Typesetting (Mixing Markdown and TeX)\n", 212 | "\n", 213 | "While display equations look good for a page of samples, the ability to mix math and *formatted* **text** in a paragraph is also important.\n", 214 | "\n", 215 | "## Source\n", 216 | "\n", 217 | "``` This expression $\\sqrt{3x-1}+(1+x)^2$ is an example of a TeX inline equation in a **[Markdown-formatted](http://daringfireball.net/projects/markdown/)** sentence. \n", 218 | "```\n", 219 | "\n", 220 | "## Display\n", 221 | "\n", 222 | "This expression $\\sqrt{3x-1}+(1+x)^2$ is an example of a TeX inline equation in a **[Markdown-formatted](http://daringfireball.net/projects/markdown/)** sentence. " 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Other syntax\n", 230 | "\n", 231 | "You will notice in other places on the web that `$$` are needed explicitly to begin and end MathJax typesetting. This is **not** required if you will be using TeX environments, but the IPython notebook will accept this syntax on legacy notebooks. 
\n", 232 | "\n", 233 | "### Source\n", 234 | "\n", 235 | "```\n", 236 | "$$\n", 237 | "\\begin{array}{c}\n", 238 | "y_1 \\\\\\\n", 239 | "y_2 \\mathtt{t}_i \\\\\\\n", 240 | "z_{3,4}\n", 241 | "\\end{array}\n", 242 | "$$\n", 243 | "```\n", 244 | "\n", 245 | "```\n", 246 | "$$\n", 247 | "\\begin{array}{c}\n", 248 | "y_1 \\cr\n", 249 | "y_2 \\mathtt{t}_i \\cr\n", 250 | "y_{3}\n", 251 | "\\end{array}\n", 252 | "$$\n", 253 | "```\n", 254 | "\n", 255 | "```\n", 256 | "$$\\begin{eqnarray} \n", 257 | "x' &=& &x \\sin\\phi &+& z \\cos\\phi \\\\\n", 258 | "z' &=& - &x \\cos\\phi &+& z \\sin\\phi \\\\\n", 259 | "\\end{eqnarray}$$\n", 260 | "```\n", 261 | "\n", 262 | "```\n", 263 | "$$\n", 264 | "x=4\n", 265 | "$$\n", 266 | "```\n", 267 | "\n", 268 | "### Display\n", 269 | "\n", 270 | "$$\n", 271 | "\\begin{array}{c}\n", 272 | "y_1 \\\\\\\n", 273 | "y_2 \\mathtt{t}_i \\\\\\\n", 274 | "z_{3,4}\n", 275 | "\\end{array}\n", 276 | "$$\n", 277 | "\n", 278 | "$$\n", 279 | "\\begin{array}{c}\n", 280 | "y_1 \\cr\n", 281 | "y_2 \\mathtt{t}_i \\cr\n", 282 | "y_{3}\n", 283 | "\\end{array}\n", 284 | "$$\n", 285 | "\n", 286 | "$$\\begin{eqnarray} \n", 287 | "x' &=& &x \\sin\\phi &+& z \\cos\\phi \\\\\n", 288 | "z' &=& - &x \\cos\\phi &+& z \\sin\\phi \\\\\n", 289 | "\\end{eqnarray}$$\n", 290 | "\n", 291 | "$$\n", 292 | "x=4\n", 293 | "$$" 294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.4.3" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 0 318 | } 319 | -------------------------------------------------------------------------------- /Content/Workflow/data/flare.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "flare", 3 | "children": [ 4 | { 5 | "name": "analytics", 6 | "children": [ 7 | { 8 | "name": "cluster", 9 | "children": [ 10 | {"name": "AgglomerativeCluster", "size": 3938}, 11 | {"name": "CommunityStructure", "size": 3812}, 12 | {"name": "HierarchicalCluster", "size": 6714}, 13 | {"name": "MergeEdge", "size": 743} 14 | ] 15 | }, 16 | { 17 | "name": "graph", 18 | "children": [ 19 | {"name": "BetweennessCentrality", "size": 3534}, 20 | {"name": "LinkDistance", "size": 5731}, 21 | {"name": "MaxFlowMinCut", "size": 7840}, 22 | {"name": "ShortestPaths", "size": 5914}, 23 | {"name": "SpanningTree", "size": 3416} 24 | ] 25 | }, 26 | { 27 | "name": "optimization", 28 | "children": [ 29 | {"name": "AspectRatioBanker", "size": 7074} 30 | ] 31 | } 32 | ] 33 | }, 34 | { 35 | "name": "animate", 36 | "children": [ 37 | {"name": "Easing", "size": 17010}, 38 | {"name": "FunctionSequence", "size": 5842}, 39 | { 40 | "name": "interpolate", 41 | "children": [ 42 | {"name": "ArrayInterpolator", "size": 1983}, 43 | {"name": "ColorInterpolator", "size": 2047}, 44 | {"name": "DateInterpolator", "size": 1375}, 45 | {"name": "Interpolator", "size": 8746}, 46 | {"name": "MatrixInterpolator", "size": 2202}, 47 | {"name": "NumberInterpolator", "size": 1382}, 48 | {"name": "ObjectInterpolator", "size": 1629}, 49 | {"name": "PointInterpolator", "size": 1675}, 50 | {"name": "RectangleInterpolator", "size": 2042} 51 | ] 52 | }, 53 | {"name": "ISchedulable", "size": 1041}, 54 | {"name": "Parallel", "size": 5176}, 55 | {"name": "Pause", "size": 449}, 56 | {"name": "Scheduler", "size": 5593}, 57 | {"name": "Sequence", "size": 5534}, 58 | {"name": "Transition", "size": 9201}, 59 | {"name": "Transitioner", "size": 19975}, 60 | {"name": "TransitionEvent", "size": 1116}, 61 | {"name": "Tween", "size": 6006} 62 | ] 63 | }, 64 | { 65 | "name": "data", 66 | "children": [ 67 | { 68 | "name": "converters", 
69 | "children": [ 70 | {"name": "Converters", "size": 721}, 71 | {"name": "DelimitedTextConverter", "size": 4294}, 72 | {"name": "GraphMLConverter", "size": 9800}, 73 | {"name": "IDataConverter", "size": 1314}, 74 | {"name": "JSONConverter", "size": 2220} 75 | ] 76 | }, 77 | {"name": "DataField", "size": 1759}, 78 | {"name": "DataSchema", "size": 2165}, 79 | {"name": "DataSet", "size": 586}, 80 | {"name": "DataSource", "size": 3331}, 81 | {"name": "DataTable", "size": 772}, 82 | {"name": "DataUtil", "size": 3322} 83 | ] 84 | }, 85 | { 86 | "name": "display", 87 | "children": [ 88 | {"name": "DirtySprite", "size": 8833}, 89 | {"name": "LineSprite", "size": 1732}, 90 | {"name": "RectSprite", "size": 3623}, 91 | {"name": "TextSprite", "size": 10066} 92 | ] 93 | }, 94 | { 95 | "name": "flex", 96 | "children": [ 97 | {"name": "FlareVis", "size": 4116} 98 | ] 99 | }, 100 | { 101 | "name": "physics", 102 | "children": [ 103 | {"name": "DragForce", "size": 1082}, 104 | {"name": "GravityForce", "size": 1336}, 105 | {"name": "IForce", "size": 319}, 106 | {"name": "NBodyForce", "size": 10498}, 107 | {"name": "Particle", "size": 2822}, 108 | {"name": "Simulation", "size": 9983}, 109 | {"name": "Spring", "size": 2213}, 110 | {"name": "SpringForce", "size": 1681} 111 | ] 112 | }, 113 | { 114 | "name": "query", 115 | "children": [ 116 | {"name": "AggregateExpression", "size": 1616}, 117 | {"name": "And", "size": 1027}, 118 | {"name": "Arithmetic", "size": 3891}, 119 | {"name": "Average", "size": 891}, 120 | {"name": "BinaryExpression", "size": 2893}, 121 | {"name": "Comparison", "size": 5103}, 122 | {"name": "CompositeExpression", "size": 3677}, 123 | {"name": "Count", "size": 781}, 124 | {"name": "DateUtil", "size": 4141}, 125 | {"name": "Distinct", "size": 933}, 126 | {"name": "Expression", "size": 5130}, 127 | {"name": "ExpressionIterator", "size": 3617}, 128 | {"name": "Fn", "size": 3240}, 129 | {"name": "If", "size": 2732}, 130 | {"name": "IsA", "size": 2039}, 131 | 
{"name": "Literal", "size": 1214}, 132 | {"name": "Match", "size": 3748}, 133 | {"name": "Maximum", "size": 843}, 134 | { 135 | "name": "methods", 136 | "children": [ 137 | {"name": "add", "size": 593}, 138 | {"name": "and", "size": 330}, 139 | {"name": "average", "size": 287}, 140 | {"name": "count", "size": 277}, 141 | {"name": "distinct", "size": 292}, 142 | {"name": "div", "size": 595}, 143 | {"name": "eq", "size": 594}, 144 | {"name": "fn", "size": 460}, 145 | {"name": "gt", "size": 603}, 146 | {"name": "gte", "size": 625}, 147 | {"name": "iff", "size": 748}, 148 | {"name": "isa", "size": 461}, 149 | {"name": "lt", "size": 597}, 150 | {"name": "lte", "size": 619}, 151 | {"name": "max", "size": 283}, 152 | {"name": "min", "size": 283}, 153 | {"name": "mod", "size": 591}, 154 | {"name": "mul", "size": 603}, 155 | {"name": "neq", "size": 599}, 156 | {"name": "not", "size": 386}, 157 | {"name": "or", "size": 323}, 158 | {"name": "orderby", "size": 307}, 159 | {"name": "range", "size": 772}, 160 | {"name": "select", "size": 296}, 161 | {"name": "stddev", "size": 363}, 162 | {"name": "sub", "size": 600}, 163 | {"name": "sum", "size": 280}, 164 | {"name": "update", "size": 307}, 165 | {"name": "variance", "size": 335}, 166 | {"name": "where", "size": 299}, 167 | {"name": "xor", "size": 354}, 168 | {"name": "_", "size": 264} 169 | ] 170 | }, 171 | {"name": "Minimum", "size": 843}, 172 | {"name": "Not", "size": 1554}, 173 | {"name": "Or", "size": 970}, 174 | {"name": "Query", "size": 13896}, 175 | {"name": "Range", "size": 1594}, 176 | {"name": "StringUtil", "size": 4130}, 177 | {"name": "Sum", "size": 791}, 178 | {"name": "Variable", "size": 1124}, 179 | {"name": "Variance", "size": 1876}, 180 | {"name": "Xor", "size": 1101} 181 | ] 182 | }, 183 | { 184 | "name": "scale", 185 | "children": [ 186 | {"name": "IScaleMap", "size": 2105}, 187 | {"name": "LinearScale", "size": 1316}, 188 | {"name": "LogScale", "size": 3151}, 189 | {"name": "OrdinalScale", "size": 3770}, 190 
| {"name": "QuantileScale", "size": 2435}, 191 | {"name": "QuantitativeScale", "size": 4839}, 192 | {"name": "RootScale", "size": 1756}, 193 | {"name": "Scale", "size": 4268}, 194 | {"name": "ScaleType", "size": 1821}, 195 | {"name": "TimeScale", "size": 5833} 196 | ] 197 | }, 198 | { 199 | "name": "util", 200 | "children": [ 201 | {"name": "Arrays", "size": 8258}, 202 | {"name": "Colors", "size": 10001}, 203 | {"name": "Dates", "size": 8217}, 204 | {"name": "Displays", "size": 12555}, 205 | {"name": "Filter", "size": 2324}, 206 | {"name": "Geometry", "size": 10993}, 207 | { 208 | "name": "heap", 209 | "children": [ 210 | {"name": "FibonacciHeap", "size": 9354}, 211 | {"name": "HeapNode", "size": 1233} 212 | ] 213 | }, 214 | {"name": "IEvaluable", "size": 335}, 215 | {"name": "IPredicate", "size": 383}, 216 | {"name": "IValueProxy", "size": 874}, 217 | { 218 | "name": "math", 219 | "children": [ 220 | {"name": "DenseMatrix", "size": 3165}, 221 | {"name": "IMatrix", "size": 2815}, 222 | {"name": "SparseMatrix", "size": 3366} 223 | ] 224 | }, 225 | {"name": "Maths", "size": 17705}, 226 | {"name": "Orientation", "size": 1486}, 227 | { 228 | "name": "palette", 229 | "children": [ 230 | {"name": "ColorPalette", "size": 6367}, 231 | {"name": "Palette", "size": 1229}, 232 | {"name": "ShapePalette", "size": 2059}, 233 | {"name": "SizePalette", "size": 2291} 234 | ] 235 | }, 236 | {"name": "Property", "size": 5559}, 237 | {"name": "Shapes", "size": 19118}, 238 | {"name": "Sort", "size": 6887}, 239 | {"name": "Stats", "size": 6557}, 240 | {"name": "Strings", "size": 22026} 241 | ] 242 | }, 243 | { 244 | "name": "vis", 245 | "children": [ 246 | { 247 | "name": "axis", 248 | "children": [ 249 | {"name": "Axes", "size": 1302}, 250 | {"name": "Axis", "size": 24593}, 251 | {"name": "AxisGridLine", "size": 652}, 252 | {"name": "AxisLabel", "size": 636}, 253 | {"name": "CartesianAxes", "size": 6703} 254 | ] 255 | }, 256 | { 257 | "name": "controls", 258 | "children": [ 259 | 
{"name": "AnchorControl", "size": 2138}, 260 | {"name": "ClickControl", "size": 3824}, 261 | {"name": "Control", "size": 1353}, 262 | {"name": "ControlList", "size": 4665}, 263 | {"name": "DragControl", "size": 2649}, 264 | {"name": "ExpandControl", "size": 2832}, 265 | {"name": "HoverControl", "size": 4896}, 266 | {"name": "IControl", "size": 763}, 267 | {"name": "PanZoomControl", "size": 5222}, 268 | {"name": "SelectionControl", "size": 7862}, 269 | {"name": "TooltipControl", "size": 8435} 270 | ] 271 | }, 272 | { 273 | "name": "data", 274 | "children": [ 275 | {"name": "Data", "size": 20544}, 276 | {"name": "DataList", "size": 19788}, 277 | {"name": "DataSprite", "size": 10349}, 278 | {"name": "EdgeSprite", "size": 3301}, 279 | {"name": "NodeSprite", "size": 19382}, 280 | { 281 | "name": "render", 282 | "children": [ 283 | {"name": "ArrowType", "size": 698}, 284 | {"name": "EdgeRenderer", "size": 5569}, 285 | {"name": "IRenderer", "size": 353}, 286 | {"name": "ShapeRenderer", "size": 2247} 287 | ] 288 | }, 289 | {"name": "ScaleBinding", "size": 11275}, 290 | {"name": "Tree", "size": 7147}, 291 | {"name": "TreeBuilder", "size": 9930} 292 | ] 293 | }, 294 | { 295 | "name": "events", 296 | "children": [ 297 | {"name": "DataEvent", "size": 2313}, 298 | {"name": "SelectionEvent", "size": 1880}, 299 | {"name": "TooltipEvent", "size": 1701}, 300 | {"name": "VisualizationEvent", "size": 1117} 301 | ] 302 | }, 303 | { 304 | "name": "legend", 305 | "children": [ 306 | {"name": "Legend", "size": 20859}, 307 | {"name": "LegendItem", "size": 4614}, 308 | {"name": "LegendRange", "size": 10530} 309 | ] 310 | }, 311 | { 312 | "name": "operator", 313 | "children": [ 314 | { 315 | "name": "distortion", 316 | "children": [ 317 | {"name": "BifocalDistortion", "size": 4461}, 318 | {"name": "Distortion", "size": 6314}, 319 | {"name": "FisheyeDistortion", "size": 3444} 320 | ] 321 | }, 322 | { 323 | "name": "encoder", 324 | "children": [ 325 | {"name": "ColorEncoder", "size": 3179}, 
326 | {"name": "Encoder", "size": 4060}, 327 | {"name": "PropertyEncoder", "size": 4138}, 328 | {"name": "ShapeEncoder", "size": 1690}, 329 | {"name": "SizeEncoder", "size": 1830} 330 | ] 331 | }, 332 | { 333 | "name": "filter", 334 | "children": [ 335 | {"name": "FisheyeTreeFilter", "size": 5219}, 336 | {"name": "GraphDistanceFilter", "size": 3165}, 337 | {"name": "VisibilityFilter", "size": 3509} 338 | ] 339 | }, 340 | {"name": "IOperator", "size": 1286}, 341 | { 342 | "name": "label", 343 | "children": [ 344 | {"name": "Labeler", "size": 9956}, 345 | {"name": "RadialLabeler", "size": 3899}, 346 | {"name": "StackedAreaLabeler", "size": 3202} 347 | ] 348 | }, 349 | { 350 | "name": "layout", 351 | "children": [ 352 | {"name": "AxisLayout", "size": 6725}, 353 | {"name": "BundledEdgeRouter", "size": 3727}, 354 | {"name": "CircleLayout", "size": 9317}, 355 | {"name": "CirclePackingLayout", "size": 12003}, 356 | {"name": "DendrogramLayout", "size": 4853}, 357 | {"name": "ForceDirectedLayout", "size": 8411}, 358 | {"name": "IcicleTreeLayout", "size": 4864}, 359 | {"name": "IndentedTreeLayout", "size": 3174}, 360 | {"name": "Layout", "size": 7881}, 361 | {"name": "NodeLinkTreeLayout", "size": 12870}, 362 | {"name": "PieLayout", "size": 2728}, 363 | {"name": "RadialTreeLayout", "size": 12348}, 364 | {"name": "RandomLayout", "size": 870}, 365 | {"name": "StackedAreaLayout", "size": 9121}, 366 | {"name": "TreeMapLayout", "size": 9191} 367 | ] 368 | }, 369 | {"name": "Operator", "size": 2490}, 370 | {"name": "OperatorList", "size": 5248}, 371 | {"name": "OperatorSequence", "size": 4190}, 372 | {"name": "OperatorSwitch", "size": 2581}, 373 | {"name": "SortOperator", "size": 2023} 374 | ] 375 | }, 376 | {"name": "Visualization", "size": 16540} 377 | ] 378 | } 379 | ] 380 | } 381 | -------------------------------------------------------------------------------- /Content/Workflow/data/scrubjay.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/data/scrubjay.mp3 -------------------------------------------------------------------------------- /Content/Workflow/images/command_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/command_mode.png -------------------------------------------------------------------------------- /Content/Workflow/images/dashboard_files_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/dashboard_files_tab.png -------------------------------------------------------------------------------- /Content/Workflow/images/dashboard_files_tab_btns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/dashboard_files_tab_btns.png -------------------------------------------------------------------------------- /Content/Workflow/images/dashboard_files_tab_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/dashboard_files_tab_new.png -------------------------------------------------------------------------------- /Content/Workflow/images/dashboard_files_tab_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/dashboard_files_tab_run.png 
-------------------------------------------------------------------------------- /Content/Workflow/images/dashboard_running_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/dashboard_running_tab.png -------------------------------------------------------------------------------- /Content/Workflow/images/edit_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/edit_mode.png -------------------------------------------------------------------------------- /Content/Workflow/images/ipython-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/ipython-image.png -------------------------------------------------------------------------------- /Content/Workflow/images/menubar_toolbar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/menubar_toolbar.png -------------------------------------------------------------------------------- /Content/Workflow/images/python-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Content/Workflow/images/python-image.png -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | Copyright (c) 2014 Brian E. 
Granger 4 | 5 | * Code is licensed under the MIT license 6 | * Text and other content licensed under the Creative Commons Attribution 4.0 International License 7 | 8 | ## Creative Commons License 9 | 10 | Creative Commons License
This work is licensed under a Creative Commons Attribution 4.0 International License. 11 | 12 | ## The MIT License (MIT) 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a copy 15 | of this software and associated documentation files (the "Software"), to deal 16 | in the Software without restriction, including without limitation the rights 17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the Software is 19 | furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in all 22 | copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 
31 | 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: reset update 2 | 3 | reset: 4 | git reset --hard origin/master 5 | 6 | update: reset 7 | git remote update 8 | git pull origin master 9 | 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Data Science (DATA 301) 2 | 3 | This repository contains the lecture curriculum as Jupyter Notebooks, 4 | for *Introduction to Data Science* (DATA 301) 5 | being taught at [Cal Poly](https://www.calpoly.edu) as part of the 6 | [Data Science Minor](http://catalog.calpoly.edu/collegesandprograms/collegeofsciencemathematics/statistics/crossdisciplinarystudiesminordatascience/). 7 | 8 | ## Python Data Science Handbook 9 | 10 | We are using the *Python Data Science Handbook*, 11 | by Jake VanderPlas, in this course. We highly recommend it! 12 | 13 | * [Jupyter Notebook](https://github.com/jakevdp/PythonDataScienceHandbook) 14 | * [O'Reilly Book](http://shop.oreilly.com/product/0636920034919.do) 15 | 16 | ## DataCamp 17 | 18 | ![DataCamp](./datacamp_logo.png) 19 | 20 | We are also using [DataCamp](https://www.datacamp.com) for some of this course. They are generously providing 21 | free access to DataCamp for our students during the course: 22 | 23 | > This class is supported by DataCamp, the most intuitive learning platform for data science. 24 | > Learn R, Python and SQL the way you learn best through a combination of short expert 25 | > videos and hands-on-the-keyboard exercises.
Take over 100+ courses by expert instructors 26 | > on topics such as importing data, data visualization or machine learning and learn faster 27 | > through immediate and personalised feedback on every exercise. 28 | 29 | -------------------------------------------------------------------------------- /Resources/Images/cp_datascience_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/Resources/Images/cp_datascience_logo.png -------------------------------------------------------------------------------- /Resources/Notes/install.md: -------------------------------------------------------------------------------- 1 | # Installation note 2 | 3 | ## Anaconda 4 | 5 | * Don't use the Anaconda Launcher for IPython as it runs in `\Documents\IPython Notebooks` on Windows. 6 | 7 | ## Running from IPython master 8 | 9 | It is pretty simple to have students run from IPython master. 10 | 11 | First, you need to build a wheel. Install `git` and `pip install wheel` to get the dependencies for 12 | building the wheel. Then run: 13 | 14 | ```bash 15 | $ pip wheel --no-deps git+https://github.com/ipython/ipython.git#egg=ipython[all] 16 | $ ls 17 | ipython-3.0.0_dev-py2-none-any.whl 18 | ``` 19 | 20 | You can either directly distribute the wheel file and install by running: 21 | 22 | ``` 23 | $ pip install --upgrade ipython-3.0.0_dev-py2-none-any.whl 24 | ``` 25 | 26 | Or you can post the wheel online somewhere. I have posted it as an official release binary in a GitHub 27 | repo. Then install using: 28 | 29 | ``` 30 | $ pip install --upgrade https://github.com/ellisonbg/ds4e/releases/download/0.1/ipython-3.0.0_dev-py2-none-any.whl#egg=ipython[notebook] 31 | ``` 32 | 33 | If the user already has this version installed, they may have to do `pip uninstall ipython` first. I 34 | have tested this and it works on top of a default Anaconda.
35 | 36 | But first uninstall the Cython package because mistune will try to build its C extension if Cython 37 | is installed, but that will fail because there is no compiler!!! 38 | 39 | ``` 40 | $ conda remove cython 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /Resources/Notes/teaching.md: -------------------------------------------------------------------------------- 1 | # Notes on teaching using the IPython Notebook 2 | 3 | For notebooks prepared as lectures: 4 | 5 | * Keep all main teaching examples as simple as possible. Don't do fancy examples to teach 6 | students the API and how to code. 7 | * Have separate examples that are more complex. 8 | * Use a consistent structure and organization of notebooks. 9 | * When to print, when to use Output. 10 | * Get rid of the pager so you can put help in notebook cells!!! 11 | * Maybe think about getting numpy doc rendering of docstrings working?!? 12 | * Don't use code comments for the main narrative. 13 | * Minimize visual garbage in code cells - allow the user to focus on the main 14 | parts of the code, not the extra cruft. 15 | * Link to further documentation often. 16 | * Use the magic number 3 a lot!!! 17 | -------------------------------------------------------------------------------- /datacamp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calpolydatascience/data301/5ca50db37afb0be83e9b0e294385f7bd60634179/datacamp_logo.png --------------------------------------------------------------------------------