├── .DS_Store ├── .ipynb_checkpoints ├── Data_Storage_1-Reading_and_Writing_in_Text-checkpoint.ipynb ├── Double Pendulum Animation Gravity Change-checkpoint.ipynb ├── Double Pendulum Animation-checkpoint.ipynb ├── Matplotlib_1-A_Brief_API_Primer-checkpoint.ipynb ├── Matplotlib_2-Additional_Features-checkpoint.ipynb ├── Matplotlib_3-Plotting_with_pandas_and_seaborn-checkpoint.ipynb ├── NumPy_2-Universal_Functions-Fast_Element-Wise_Array_Functions-checkpoint.ipynb ├── Pandas_1-Introduction-checkpoint.ipynb ├── Pandas_2-Essential_Functionality-checkpoint.ipynb ├── Pandas_3-Summarizing_and_Computing_Descriptive_Statistics-checkpoint.ipynb └── Untitled-checkpoint.ipynb ├── Basics_1-Data_Structures_and_Sequences.ipynb ├── Basics_2-Functions.ipynb ├── Basics_3-Files_and_Operating_System.ipynb ├── Data_Storage_1-Reading_and_Writing_in_Text.ipynb ├── Double Pendulum Animation Gravity Change.ipynb ├── Double Pendulum Animation.ipynb ├── Matplotlib_1-A_Brief_API_Primer.ipynb ├── Matplotlib_2-Additional_Features.ipynb ├── Matplotlib_3-Plotting_with_pandas_and_seaborn.ipynb ├── None0000000.png ├── NumPy_1-The_NumPy_ndarray.ipynb ├── NumPy_2-Universal_Functions-Fast_Element-Wise_Array_Functions.ipynb ├── NumPy_3-Array_Oriented_Programming_With_Arrays.ipynb ├── Pandas_1-Introduction.ipynb ├── Pandas_2-Essential_Functionality.ipynb ├── Pandas_3-Summarizing_and_Computing_Descriptive_Statistics.ipynb ├── convert_html_to_colour_pdf.rtf ├── dataexample.csv ├── htmls_and_pdfs ├── .DS_Store ├── Matplotlib_1-A_Brief_API_Primer.html ├── Matplotlib_2-Additional_Features.html ├── Matplotlib_3-Plotting_with_pandas_and_seaborn.html ├── Pandas_1-Introduction.html ├── Pandas_2-Essential_Functionality.html ├── Pandas_3-Summarizing_and_Computing_Descriptive_Statistics.html ├── html_to_colour.py └── to_pdf.py ├── sample_plot.pdf ├── sample_plot.png ├── sampledata ├── .DS_Store ├── data1.csv ├── data1.txt ├── data2.txt ├── data3.csv ├── data4.csv ├── macrodata.txt ├── tips.csv.sb-3098d7aa-T8ASyL └── tips.txt └── temp.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/.DS_Store -------------------------------------------------------------------------------- /.ipynb_checkpoints/NumPy_2-Universal_Functions-Fast_Element-Wise_Array_Functions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 2, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Univeral Function Introduction" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "A universal function or ufunc performs element wise operations on data in ndarrays. These functions include simple mathematical functions like exponentials and square roots." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([0. , 1. , 1.41421356, 1.73205081, 2. ,\n", 68 | " 2.23606798, 2.44948974, 2.64575131, 2.82842712, 3. ])" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "arr = np.arange(10)\n", 78 | "np.sqrt(arr)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "These are referred to as unary ufuncs. Other functions, such as maximum take two ndarrays and return a single array ( binary ufuncs)." 
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([ 0.26252147, -0.06023566, 1.61060357, 0.94351335, 0.37743868,\n", 97 | " 1.72642266, 0.20044704, 1.36904904, 0.09303552, 1.45390904])" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "x=np.random.randn(10)\n", 107 | "y=np.random.randn(10)\n", 108 | "np.maximum(x, y)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Returning Multiple Arrays" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "A ufunc can also return multiple arrays; 'modf' is one example - it returns the fractional and integer part of an array of numbers:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "array([ -5.7678331 , -4.74337863, 9.23923766, 1.66801359,\n", 134 | " -17.62486265, 7.25717594, -5.52891383, -1.30939992,\n", 135 | " -4.26590234, 2.31233354])" 136 | ] 137 | }, 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "arr = np.random.randn(10) * 5\n", 145 | "arr" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "array([-0.7678331 , -0.74337863, 0.23923766, 0.66801359, -0.62486265,\n", 157 | " 0.25717594, -0.52891383, -0.30939992, -0.26590234, 0.31233354])" 158 | ] 159 | }, 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "remainder, whole_part = np.modf(arr)\n", 167 | "remainder" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "array([ -5., -4., 9., 1., -17., 7., -5., -1., -4., 2.])" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "whole_part" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# Returning the New Array to a New Variable" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "ufuncs take an optional out argument that allows them to copy the new array to a new variable:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "C:\\Users\\lukep\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: RuntimeWarning: invalid value encountered in sqrt\n", 214 | " \n" 215 | ] 216 | }, 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([ nan, nan, nan, nan, nan,\n", 221 | " nan, 0.7803909 , nan, 0.25932723, 1.00560002])" 222 | ] 223 | }, 224 | "execution_count": 8, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "arr = np.random.randn(10)\n", 231 | "np.sqrt(arr, arr)\n", 232 | "\n", 233 | "arr" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | 
"metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.2" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Pandas_3-Summarizing_and_Computing_Descriptive_Statistics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.get('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.get('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Some Math Prereqs" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "source": [ 57 | "The covariance of two distributions X and Y is defined as" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "source": [ 66 | "$$cov(X,Y) = \\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\mu_x)(y_i-\\mu_y) $$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "i represents the 'i'th measurement \n", 74 | "\n", 75 | "n is the total number of measurements \n", 76 | "\n", 77 | "$x_i$ and $y_i$ are individual measurements\n", 78 | "\n", 79 | "$\\mu_x$ and $\\mu_y$ are the mean values of X and Y" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This gives us information about how the two variables deviate from their expected values (means) and if they do it at the same time. If they both deviate positively or negatively at the same time, then we get a large contribution from the sum. If one deviates positively and the other negatively then we get a large negative number. Big positive numbers mean the variables are correlated (big negative also means they're related in some way too- as one goes up the other goes down)." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "The problem is that these large or small numbers depend on the scale of the units we use for measurement. We want a quantity that we know is the same for all distributions. We can divide by their variances (related to standard deviation)." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "$$\\sigma_x \\equiv cov(X,X) = \\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\mu_x)^2 $$" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "We define the correlation (or more precisely: the linear correlation ) as follows:" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "$$corr(X,Y)=\\frac{cov(X,Y)}{\\sqrt{\\sigma_x \\sigma_y}}$$" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "The inequaltity $-1 \\leq corr(X,Y) \\leq 1$ always holds. This can be shown through the Cauchy-Schwartz inequality (quantities $x_i-\\mu_x$ are elements of a vector)." 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "If $corr(X,Y)=1$ then the values are perfectly correlated (in the vector space of measurements $x_i-\\mu_x$ and $y_i-\\mu_y$ they point in the same direction). 
If $corr(X,Y)=-1$ then the values are perfectly uncorrelated (in the vector space they point in opposite direction)." 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Such information is when comparing stock trends. Suppose that amazon always drops the day after microsoft goes up and we find a strict anticorrelation ($corr(X,Y)=-1$). In the future, when microsoft goes up, we may want to sell our amazon stock as we know its going to drop- then pick it up the next day for a discount." 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "source": [ 144 | "# Basic Mathematical Functions of Pandas" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "pandas objects are equipt to deal with a variety of mathematical and statistical functions, and can also deal with missing data." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 3, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/html": [ 162 | "
\n", 163 | "\n", 176 | "\n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
onetwo
a1.40NaN
b7.10-4.5
cNaNNaN
d0.75-1.3
\n", 207 | "
" 208 | ], 209 | "text/plain": [ 210 | " one two\n", 211 | "a 1.40 NaN\n", 212 | "b 7.10 -4.5\n", 213 | "c NaN NaN\n", 214 | "d 0.75 -1.3" 215 | ] 216 | }, 217 | "execution_count": 3, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],\n", 224 | " index=['a','b','c','d'], columns=['one','two'])\n", 225 | "df" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## The Sum Method" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "Use sum to return the column sums. " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 4, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "one 9.25\n", 251 | "two -5.80\n", 252 | "dtype: float64" 253 | ] 254 | }, 255 | "execution_count": 4, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "df.sum()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "We can also use axis='columns' to sum across the columns instead." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 5, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "a 1.40\n", 280 | "b 2.60\n", 281 | "c 0.00\n", 282 | "d -0.55\n", 283 | "dtype: float64" 284 | ] 285 | }, 286 | "execution_count": 5, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "df.sum(axis='columns')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## The Mean Method" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "We can exclude rows with NA values if we like:" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 6, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "a NaN\n", 318 | "b 1.300\n", 319 | "c NaN\n", 320 | "d -0.275\n", 321 | "dtype: float64" 322 | ] 323 | }, 324 | "execution_count": 6, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "df.mean(axis='columns', skipna=False)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## Accumulation Method (Integration of Rows/Columns)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 7, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/html": [ 348 | "
\n", 349 | "\n", 362 | "\n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | "
onetwo
a1.40NaN
b8.50-4.5
cNaNNaN
d9.25-5.8
\n", 393 | "
" 394 | ], 395 | "text/plain": [ 396 | " one two\n", 397 | "a 1.40 NaN\n", 398 | "b 8.50 -4.5\n", 399 | "c NaN NaN\n", 400 | "d 9.25 -5.8" 401 | ] 402 | }, 403 | "execution_count": 7, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "df.cumsum()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "We can use this for integration if we like." 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 8, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/html": [ 427 | "
\n", 428 | "\n", 441 | "\n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | "
onetwo
a0.00140NaN
b0.00850-0.0045
cNaNNaN
d0.00925-0.0058
\n", 472 | "
" 473 | ], 474 | "text/plain": [ 475 | " one two\n", 476 | "a 0.00140 NaN\n", 477 | "b 0.00850 -0.0045\n", 478 | "c NaN NaN\n", 479 | "d 0.00925 -0.0058" 480 | ] 481 | }, 482 | "execution_count": 8, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "dx = 0.001\n", 489 | "(df*dx).cumsum()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "## Basic Statistical Method " 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "We can use the describe method to learn about the rows and columns." 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 9, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/html": [ 514 | "
\n", 515 | "\n", 528 | "\n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | "
onetwo
count3.0000002.000000
mean3.083333-2.900000
std3.4936852.262742
min0.750000-4.500000
25%1.075000-3.700000
50%1.400000-2.900000
75%4.250000-2.100000
max7.100000-1.300000
\n", 579 | "
" 580 | ], 581 | "text/plain": [ 582 | " one two\n", 583 | "count 3.000000 2.000000\n", 584 | "mean 3.083333 -2.900000\n", 585 | "std 3.493685 2.262742\n", 586 | "min 0.750000 -4.500000\n", 587 | "25% 1.075000 -3.700000\n", 588 | "50% 1.400000 -2.900000\n", 589 | "75% 4.250000 -2.100000\n", 590 | "max 7.100000 -1.300000" 591 | ] 592 | }, 593 | "execution_count": 9, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "df.describe()" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "## Summary" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "See table 5-8 on page 160 of textbook for all simple methods." 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "# Correlation and Covariance" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Introduction to Correlation in DataFrames" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "Correlation and Covariance look at the relationship between two data sets. Below we compare stock datasets. " 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 11, 640 | "metadata": { 641 | "scrolled": true 642 | }, 643 | "outputs": [ 644 | { 645 | "ename": "ImportError", 646 | "evalue": "cannot import name 'is_list_like'", 647 | "output_type": "error", 648 | "traceback": [ 649 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 650 | "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", 651 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mpandas_datareader\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mweb\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mstocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'AMZN'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'GOOG'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'AAPL'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'TD'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'JNJ'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'IBM'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2017\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m29\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mend\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2018\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m8\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 652 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_version\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mget_versions\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m from .data import (DataReader, Options, 
get_components_yahoo,\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mget_dailysummary_iex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_enigma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_famafrench\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mget_data_fred\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_google\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_moex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mget_data_morningstar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_quandl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_stooq\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 653 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\data.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mImmediateDeprecationError\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfamafrench\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mFamaFrenchReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfred\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mFredReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgoogle\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdaily\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mGoogleDailyReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgoogle\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOptions\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mGoogleOptions\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 654 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\fred.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mis_list_like\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mconcat\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mread_csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbase\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_BaseReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 655 | "\u001b[1;31mImportError\u001b[0m: cannot import name 'is_list_like'" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "import pandas_datareader.data as web\n", 661 | "stocks = ['AMZN', 'GOOG', 'AAPL', 'TD', 'JNJ', 'IBM']\n", 662 | "\n", 663 | "start = pd.datetime(2017, 7, 29)\n", 664 | "end = pd.datetime(2018, 8, 2)\n", 665 | "f1 = web.DataReader(stocks, 'iex', start, end)\n", 666 | "f1['open'].head() #opening price for the stock on that day" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 
673 | "Lets apply some functions and see how the stock changes at the beginning and the end of the day. Recall that functions like the one below can operate on rows or columns of dataframes; in this case we choose column headers." 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "def find_change(x, stock):\n", 683 | " return x['close'][stock]-x['open'][stock]\n", 684 | "\n", 685 | "stock_day_changes = pd.DataFrame([f1.apply(find_change, axis='columns', args=(stock,)) \n", 686 | " for stock in stocks], index=stocks)\n", 687 | "stock_day_changes = stock_day_changes.transpose()\n", 688 | "stock_day_changes.head()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "Correlation for entire DataFrame:" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "stock_day_changes.corr()" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": {}, 710 | "source": [ 711 | "Or we can just select single elements:" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [ 720 | "stock_day_changes['AMZN'].corr(stock_day_changes['GOOG'])" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "Or we can select rows:" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "stock_day_changes.corrwith(stock_day_changes.AMZN)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": {}, 742 | "source": [ 743 | "# Unique Values, Counting Occurences, and Membership of Elements in a Series" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "The pandas module has even more methods for determining if elements in a series are unique. This is obviously useful for DataFrames as the rows and columns can be extracted as Series." 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "## Uniqueness" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "obj = pd.Series(['c', 'a', 'c', 'b', 'a', 'c', 'b', 'a', 'c'])\n", 767 | "uniques = obj.unique()\n", 768 | "uniques" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "## Counting Occurences" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "obj = pd.Series(['c', 'a', 'c', 'b', 'a', 'c', 'b', 'a', 'c'])\n", 785 | "obj.value_counts()" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": {}, 791 | "source": [ 792 | "The returned Series is sorted by the number of occurences. 
We can choose not to have this as well:" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [ 801 | "pd.value_counts(obj.values, sort=False)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "## Membership" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "Sometimes we want to see if an element is contained in a Series. We can use the isin method for this." 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": null, 821 | "metadata": {}, 822 | "outputs": [], 823 | "source": [ 824 | "mask = obj.isin(['b', 'c'])\n", 825 | "mask" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "Then we can use the mask to extract the elements that we want." 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "obj[mask]" 842 | ] 843 | }, 844 | { 845 | "cell_type": "markdown", 846 | "metadata": {}, 847 | "source": [ 848 | "This can often make boolean indexing DataFrames easier when one has lots of conditions." 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "Suppose we have a Series of distinct values and a Series of non-distinct values like below:" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "to_match = pd.Series(['c','b','c','a','b'])\n", 865 | "unique_vals = pd.Series(['b','a','c'])" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": {}, 871 | "source": [ 872 | "We can use the Index.get_indexer method to give an index array from the unique values:" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "metadata": {}, 879 | "outputs": [], 880 | "source": [ 881 | "pd.Index(unique_vals).get_indexer(to_match)" 882 | ] 883 | } 884 | ], 885 | "metadata": { 886 | "kernelspec": { 887 | "display_name": "Python 3", 888 | "language": "python", 889 | "name": "python3" 890 | }, 891 | "language_info": { 892 | "codemirror_mode": { 893 | "name": "ipython", 894 | "version": 3 895 | }, 896 | "file_extension": ".py", 897 | "mimetype": "text/x-python", 898 | "name": "python", 899 | "nbconvert_exporter": "python", 900 | "pygments_lexer": "ipython3", 901 | "version": "3.6.5" 902 | } 903 | }, 904 | "nbformat": 4, 905 | "nbformat_minor": 2 906 | } 907 | -------------------------------------------------------------------------------- /Basics_2-Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# Introduction to Functions" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Functions are used for code organization and code reuse. They also make code more readable by giving a name to a block of code. They are defined with the \"def\" keyword and returned with the \"return\" keyword." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "def sum_square(x, y, z=2):\n", 57 | " return (x+y)**2" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "81" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "sum_square(4,5)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Functions have positional arguments and keyword arguments. In the preceding example, x and y are positional arguments and z is a keyword argument. Python's rule is that positional arguments must come before keyword arguments. Typically keyword arguments are used for default values or optional arguments." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "The function above can be called in the following three ways:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "49" 103 | ] 104 | }, 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "sum_square(x=4, y=5, z=3)\n", 112 | "sum_square(4, 5, 3)\n", 113 | "sum_square(4, y=3)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "In the last line above, we make \"y\" a keyword argument since we use an equal sign to specify it. Sometimes, for clarity, it is nice to specify everything as a keyword argument like we did in the first line." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Namespaces, Scope, and Local Functions" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Functions can access both global variables and local variables. These two terms refer to the namespace of a python variable. The local namespace is created when the function is called and immediately populated by the functions arguments. Afterwards, the local namespace is destroyed and only the return value survives." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def func(x,y):\n", 146 | " a=[]\n", 147 | " a.append(x)\n", 148 | " a.append(y)\n", 149 | " return 0" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "This is a pointless function, but it is used to clarify things above. When the function is created, x and y are put in the local namespace. Then a is put in the local namespace (when the second line is executed). When the function is finished, x, y, and a are all deleted." 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "If need be, we can assign variables outside the functions scope (global variables) using the global keyword." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 6, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "[3, 4]\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "a = None\n", 181 | "def func(x,y):\n", 182 | " global a\n", 183 | " a=[]\n", 184 | " a.append(x)\n", 185 | " a.append(y)\n", 186 | " return None\n", 187 | "\n", 188 | "func(3,4)\n", 189 | "print(a)\n", 190 | " " 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Use of the global keyword is generally discouraged." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "# Returning Multiple Values" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "In a python function, you can return multiple values in the form of a tuple. This application alone makes Python superior to Java and C++, where you would have to return an array and the code would look messy." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "-1" 223 | ] 224 | }, 225 | "execution_count": 7, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "def func(x,y,z):\n", 232 | " return x-y, z-y\n", 233 | "\n", 234 | "a, b = func(2, 3, 4)\n", 235 | "a" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "Notice that the values get unpacked into the variables a and b. We could have also just done:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 8, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "(-1, 1)" 254 | ] 255 | }, 256 | "execution_count": 8, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "return_val = func(2, 3, 4)\n", 263 | "return_val" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "but here return_val itself is a tuple (we didn't take advantage of the unpacking feature)." 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "# Anonymous (Lambda) Functions" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Lambda functions are easy ways to express functions in one line of code. This serves multiple purposes. 
For starters, we can define very simple functions in one line of code:" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 9, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "4" 296 | ] 297 | }, 298 | "execution_count": 9, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "def long(x):\n", 305 | " return x**2\n", 306 | "\n", 307 | "short = lambda x: x**2\n", 308 | "\n", 309 | "long(2)\n", 310 | "short(2) #equivalent ^^" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "It also serves an extreme important second purpose; sometimes the arguments to functions are functions themselves. This is very prevalent in data analysis and it will show up frequently in these notes. Take the following two snippets of code as an example." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 10, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "2.0009999999996975" 329 | ] 330 | }, 331 | "execution_count": 10, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "def derivative(f, x, delta=0.001): #returns derivative of f at value x\n", 338 | " return (f(x+delta)-f(x))/delta\n", 339 | "\n", 340 | "def x_square(x):\n", 341 | " return x**2\n", 342 | "\n", 343 | "derivative(x_square, 1)\n", 344 | " \n", 345 | " " 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "Not bad. But we can cut down on the code significantly since x_square can be represented using a lambda function." 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 11, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "2.0009999999996975" 364 | ] 365 | }, 366 | "execution_count": 11, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "def derivative(f, x, delta=0.001): #returns derivative of f at value x\n", 373 | " return (f(x+delta)-f(x))/delta\n", 374 | "\n", 375 | "derivative(lambda x: x**2, 1)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "Much more concise, and now we don't have to waste space in our program defining an x squared function. Note specifically how the lambda function takes the place of x_square in the first sample of code." 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "Lambda functions can take in multiple variables and return multiple variables as well." 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "-1" 401 | ] 402 | }, 403 | "execution_count": 12, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "value = lambda x, y, z: (x-y, y-z)\n", 410 | "a, b = value(2,3,4)\n", 411 | "a" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Go crazy. There's quite literally endless possibilities." 
419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "# Currying: Partial Argument Application" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "Currying is jargon for defining new functions based on old ones. This type of coding may help with lambda functions." 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 13, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "7" 444 | ] 445 | }, 446 | "execution_count": 13, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "def add(x, y):\n", 453 | " return x+y\n", 454 | "\n", 455 | "add_five = lambda x: add(x,5)\n", 456 | "add_five(2)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "The first element to the add function is said to be curried. This will be used later in data analysis." 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "# Generators" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "collapsed": true 477 | }, 478 | "source": [ 479 | "Python has a consistent way to iterate over sequences by means of iterator protocol . This protocol is a generic way to make objects iterable." 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 14, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "a\n", 492 | "b\n", 493 | "c\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "some_dict={'a':1, 'b':2, 'c':3}\n", 499 | "for item in some_dict:\n", 500 | " print(item)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "When writing \"for item in some_dict\" the python interpretor first creates an iterator out of some_dict:" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 15, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "" 519 | ] 520 | }, 521 | "execution_count": 15, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "dict_iterator = iter(some_dict)\n", 528 | "dict_iterator" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "An iterator is an object that will yield objects when used in a context like a for loop. Most methods that accept a list or list-like-obejct will also accept iterators. This includes methods like min, max, sum, tuple, and list." 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 16, 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "['a', 'b', 'c']" 547 | ] 548 | }, 549 | "execution_count": 16, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "list(dict_iterator)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "A generator is a concise way to contruct an iterable object. Normal functions execute and return a single object (this can be a list or tuple that contains multiple things). Generators return a sequence of values, but lazily; they pause at each one untill the next one is requested. This can save computation space. To create a generator, use the \"yield\" keyword." 
563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 17, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "" 574 | ] 575 | }, 576 | "execution_count": 17, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "def squares(n=10):\n", 583 | " for i in range(1, n+1):\n", 584 | " yield i**2\n", 585 | "\n", 586 | "gen1 = squares()\n", 587 | "gen1" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "It is not until we start requesting the values of gen that it returns the values." 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 18, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "name": "stdout", 604 | "output_type": "stream", 605 | "text": [ 606 | "1 4 9 16 25 36 49 64 81 100 " 607 | ] 608 | } 609 | ], 610 | "source": [ 611 | "for x in gen1:\n", 612 | " print(x, end= ' ')" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "Here's another example." 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 19, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "Cats in the cradle and the silver spoon. Little boy blue and the man in the moon." 632 | ] 633 | } 634 | ], 635 | "source": [ 636 | "def sentence_gen():\n", 637 | " yield 'Cats in the cradle '\n", 638 | " yield 'and the silver spoon. '\n", 639 | " yield 'Little boy blue '\n", 640 | " yield 'and the man in the moon.'\n", 641 | "\n", 642 | "gen2 = sentence_gen()\n", 643 | "for x in gen2:\n", 644 | " print(x, end='')" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "## Generator Expressions" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "Like lambda functions for functions, there are more concise way to make generators as well. This involves using a generator expression like such:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 20, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | " at 0x1055ae620>" 670 | ] 671 | }, 672 | "execution_count": 20, 673 | "metadata": {}, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "gen = (x**3 for x in range(10))\n", 679 | "gen" 680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "metadata": {}, 685 | "source": [ 686 | "These generator expressions are often easier to use than lists when using functions like \"sum.\"" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 21, 692 | "metadata": {}, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | "249500250000" 698 | ] 699 | }, 700 | "execution_count": 21, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "sum(x**3 for x in range(1000))" 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "## itertools module" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "There is a standard library called \"itertools\" that has a collection of generators for many common data algorithms. 
The method \"groupby\" takes a sequence and a function and groups consecutive elements in the sequence by their return values:" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 22, 726 | "metadata": {}, 727 | "outputs": [ 728 | { 729 | "name": "stdout", 730 | "output_type": "stream", 731 | "text": [ 732 | "A ['Adam', 'Axtor']\n", 733 | "J ['Jim', 'James']\n", 734 | "A ['Asshole']\n" 735 | ] 736 | } 737 | ], 738 | "source": [ 739 | "import itertools\n", 740 | "first_letter = lambda x: x[0]\n", 741 | "male_names=['Adam', 'Axtor', 'Jim', 'James', 'Asshole']\n", 742 | "\n", 743 | "for letter, names in itertools.groupby(male_names, first_letter):\n", 744 | " print(letter, list(names))" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "There are many other useful tools which you can find by googling \"itertools.\"" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "# Errors and Exception Handling" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "In order to write elegant code, one needs to be able to handle errors or exceptions gracefully. In data analysis, many functions only work on certain kinds of data input. Take, for example, the float function." 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 23, 771 | "metadata": {}, 772 | "outputs": [ 773 | { 774 | "data": { 775 | "text/plain": [ 776 | "1.234" 777 | ] 778 | }, 779 | "execution_count": 23, 780 | "metadata": {}, 781 | "output_type": "execute_result" 782 | } 783 | ], 784 | "source": [ 785 | "float('1.234')" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 24, 791 | "metadata": {}, 792 | "outputs": [ 793 | { 794 | "ename": "ValueError", 795 | "evalue": "could not convert string to float: 'something'", 796 | "output_type": "error", 797 | "traceback": [ 798 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 799 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 800 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'something'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 801 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'something'" 802 | ] 803 | } 804 | ], 805 | "source": [ 806 | "float('something')" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "If we want a float method that fails gracefully, we can use a try and except block as such." 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 25, 819 | "metadata": {}, 820 | "outputs": [ 821 | { 822 | "data": { 823 | "text/plain": [ 824 | "'something'" 825 | ] 826 | }, 827 | "execution_count": 25, 828 | "metadata": {}, 829 | "output_type": "execute_result" 830 | } 831 | ], 832 | "source": [ 833 | "def attempt_float(x):\n", 834 | " try:\n", 835 | " return float(x)\n", 836 | " except:\n", 837 | " return x\n", 838 | " \n", 839 | "attempt_float('something')" 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "The code in the except block is only executed if the code in the try block raises an exception." 
847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": {}, 852 | "source": [ 853 | "We have freedom to decide what type of exceptions we want to permit. For example," 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 26, 859 | "metadata": {}, 860 | "outputs": [ 861 | { 862 | "ename": "TypeError", 863 | "evalue": "float() argument must be a string or a number, not 'tuple'", 864 | "output_type": "error", 865 | "traceback": [ 866 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 867 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 868 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 869 | "\u001b[0;31mTypeError\u001b[0m: float() argument must be a string or a number, not 'tuple'" 870 | ] 871 | } 872 | ], 873 | "source": [ 874 | "float((1,2))" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "institues a type error and not a value error like before. Maybe we only want the code in the except block to be completed if the try block raises a \"value exception.\" We can accomplish that as such" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 27, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "data": { 891 | "text/plain": [ 892 | "'something'" 893 | ] 894 | }, 895 | "execution_count": 27, 896 | "metadata": {}, 897 | "output_type": "execute_result" 898 | } 899 | ], 900 | "source": [ 901 | "def attempt_float(x):\n", 902 | " try:\n", 903 | " return float(x)\n", 904 | " except(ValueError):\n", 905 | " return x\n", 906 | " \n", 907 | "attempt_float('something')" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 28, 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "ename": "TypeError", 917 | "evalue": "attempt_float() takes 1 positional argument but 2 were given", 918 | "output_type": "error", 919 | "traceback": [ 920 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 921 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 922 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mattempt_float\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 923 | "\u001b[0;31mTypeError\u001b[0m: attempt_float() takes 1 positional argument but 2 were given" 924 | ] 925 | } 926 | ], 927 | "source": [ 928 | "attempt_float(1,2)" 929 | ] 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "Exactly what we wanted. In general, we specify what types of exceptions we will permit inside the brackets of except(). " 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "Consider the following most general sequence of \"try\" block code. We wil analyze it afterwards." 
943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 29, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "name": "stdout", 952 | "output_type": "stream", 953 | "text": [ 954 | "Failed\n", 955 | "The value of a was 4 and the value of b was 0\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "a = 4\n", 961 | "b = 0\n", 962 | "\n", 963 | "try:\n", 964 | " print(a/b)\n", 965 | "except:\n", 966 | " print(\"Failed\")\n", 967 | "else:\n", 968 | " print(\"Succeeded\")\n", 969 | "finally:\n", 970 | " print(\"The value of a was {0} and the value of b was {1}\".format(a,b))" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "Firstly the try block is attempted. If it doesn't succeed then the except block is executed. If is does succeed then the else block is executed. Regardless of whether or not the try block succeeds, the finally block is then executed. This becomes a nice way to organize code." 978 | ] 979 | } 980 | ], 981 | "metadata": { 982 | "kernelspec": { 983 | "display_name": "Python 3", 984 | "language": "python", 985 | "name": "python3" 986 | }, 987 | "language_info": { 988 | "codemirror_mode": { 989 | "name": "ipython", 990 | "version": 3 991 | }, 992 | "file_extension": ".py", 993 | "mimetype": "text/x-python", 994 | "name": "python", 995 | "nbconvert_exporter": "python", 996 | "pygments_lexer": "ipython3", 997 | "version": "3.6.2" 998 | } 999 | }, 1000 | "nbformat": 4, 1001 | "nbformat_minor": 2 1002 | } 1003 | -------------------------------------------------------------------------------- /Basics_3-Files_and_Operating_System.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Reading from and writing to files is typically performed with the pandas module in python. Regardless, it is still very important to understand the basics of file reading and writing (which is very easy in python)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 19, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "path='example/lesson3textfile.txt'\n", 19 | "f = open(path)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "By default, the file is in read-only mode. We can treat f like an iterator with the lines being the objects to iterate over." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 20, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "(These violent delights have violent ends\n", 39 | ")\n", 40 | "(And in their triumph die, like fire and powder\n", 41 | ")\n", 42 | "(Which, as they kiss, consume.)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "for line in f:\n", 48 | " print('('+line+')')\n", 49 | " pass" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Lines come out of text files with end-of-line (EOL) markers intact, and we often need to strip these blank markers. Notice in the example above that the right bracket is placed on the line below. 
We can remove these end-of-line markers as follows:" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 23, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "['These violent delights have violent ends',\n", 68 | " 'And in their triumph die, like fire and powder',\n", 69 | " 'Which, as they kiss, consume.']" 70 | ] 71 | }, 72 | "execution_count": 23, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "lines = [x.rstrip() for x in open(path)]\n", 79 | "lines" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "It is very important that you close a file when you are finished working with it. This releases the resources back to the operating system." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 24, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "f.close()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "The with statement removes the need to remember to call f.close(): it automatically closes the file once the with block has finished executing." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 26, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "with open(path) as f:\n", 116 | " lines = [x.rstrip() for x in f] # iterate over the handle opened by the with statement" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "If we had typed f = open(path, 'w') then a new file would have been created in place of the old one (thus overwriting the old file). One thus needs to be very careful when using the write mode 'w'." 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "The most common methods for readable files are read(i), which advances the file handle's position by i bytes; tell(), which tells you your current position in the file; and seek(i), which changes the file position to the ith byte in the file. 
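As a small supplementary note (not from the original notebook): read() called with no argument reads from the current handle position to the end of the file, and seek(0) rewinds the handle back to the start, which is useful after a partial read. A minimal sketch, assuming the same path variable used above; the cells that follow demonstrate read, seek, and tell on the actual file:

    with open(path) as f:
        first_chunk = f.read(5)   # consume the first 5 characters
        remainder = f.read()      # no argument: read everything after position 5
        f.seek(0)                 # rewind to the beginning of the file
        full_text = f.read()      # now reads the entire file again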
" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 28, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "'These'" 142 | ] 143 | }, 144 | "execution_count": 28, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "f=open(path)\n", 151 | "f.read(5)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 29, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "11" 163 | ] 164 | }, 165 | "execution_count": 29, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "f.seek(11)\n", 172 | "f.tell()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 30, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "'nt delights have vio'" 184 | ] 185 | }, 186 | "execution_count": 30, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "f.read(20)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 31, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "f.close()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Bytes and Unicode with Files" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "There are fancy ways to open files in binary mode (that express symbols like ç, √, Ω, ...) in the common UTF-8 characters. To do this simply open the file as f = open(path, 'rb'). We contrast binary with the standard way of opening a file below:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "The square root symbol: √ \n", 230 | "\n", 231 | "The symbol Omega: Ω\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "f=open('example/lesson3textfile2.txt')\n", 237 | "for line in f:\n", 238 | " print(line)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 9, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "b'The square root symbol: \\xe2\\x88\\x9a \\n'\n", 251 | "b'The symbol Omega: \\xce\\xa9'\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "f=open('example/lesson3textfile2.txt', 'rb')\n", 257 | "for line in f:\n", 258 | " print(line)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "The 'b' at the beginning of the sentence lets one know that we are reading in binary. Here we can see the \\n marker at the end of the sentence (the end of line marker). The fancy symbols are defined using the UTF-8 characters." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "We can convert these binary phrases to utf8 sentences using the following methods." 
273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 20, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "b'The square root symbol: \\xe2\\x88\\x9a \\nThe symbol '" 284 | ] 285 | }, 286 | "execution_count": 20, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "f=open('example/lesson3textfile2.txt', 'rb')\n", 293 | "line = f.read(40)\n", 294 | "line" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "'The square root symbol: √ \\nThe symbol '" 306 | ] 307 | }, 308 | "execution_count": 21, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "line.decode('utf8')" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Our line is now in the standard utf8 format. Note that the end of line tag \"\\n\" still remains." 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Writing to Files" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "We can also write text to new files. To do this, we use file's write or writelines method. Lets copy the text from our shakespeare passage to a new file." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "['These violent delights have violent ends\\n',\n", 347 | " 'And in their triumph die, like fire and powder\\n',\n", 348 | " 'Which, as they kiss, consume.']" 349 | ] 350 | }, 351 | "execution_count": 12, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "with open('temp.txt', 'w') as f:\n", 358 | " f.writelines(x for x in open('example/lesson3textfile.txt') if len(x)>1)\n", 359 | " \n", 360 | "with open('temp.txt') as f:\n", 361 | " lines = f.readlines()\n", 362 | " \n", 363 | "lines" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Firstly, we open (and overwrite any other file) 'temp.txt' and assign this to the variable 'f'. Then we use the files writelines method to write a number of lines to the file 'temp.txt' from the file 'lesson3textfile.txt'." 
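For completeness, a hedged sketch of the closely related write method and of append mode: write() writes a single string (newlines must be added explicitly), and opening a file with 'a' adds to the end of an existing file instead of overwriting it. The file name below is purely illustrative:

    # 'w' creates the file (or overwrites an existing one)
    with open('temp2.txt', 'w') as f:
        f.write('First line\n')    # write() takes one string; add '\n' yourself
        f.write('Second line\n')

    # 'a' appends to the end rather than overwriting
    with open('temp2.txt', 'a') as f:
        f.write('Third line\n')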
371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "# Bytes and Unicode with Files" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.6.2" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /Data_Storage_1-Reading_and_Writing_in_Text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np\n", 43 | "import matplotlib.pyplot as plt \n", 44 | "import pandas as pd" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "# Introduction" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "pandas has a number of functions to read tabular data and convert it into a DataFrame. The most popular ones are read_csv and read_table. csv uses comma as a delimiter and table uses tab as a delimiter." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "These functions essentially convert text into a DataFrame. The optional arguments for these functions fall into the following categories." 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | " Indexing - Can treat one or more columns as the returned DataFrame and how to get the column names.\n", 73 | "\n", 74 | " Type inference and data conversion - User defined value conversions and custom list of missing value markers.\n", 75 | "\n", 76 | " Datetime parsing - Allows one to combine date and time information spread over multiple columns into a single column.\n", 77 | "\n", 78 | " Iterating - Support for iterating over large files.\n", 79 | "\n", 80 | " Unclean data issues - Skipping rows or a footer, comments, or other minor things like numeric data with thousands seperated by column (French People)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Comma Seperated (CSV) Text Files" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/html": [ 98 | "
\n", 99 | "\n", 112 | "\n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
MonthPeopleSales
0January12001678
1March600567
2June23002400
3August23334000
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Month People Sales\n", 152 | "0 January 1200 1678\n", 153 | "1 March 600 567\n", 154 | "2 June 2300 2400\n", 155 | "3 August 2333 4000" 156 | ] 157 | }, 158 | "execution_count": 3, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "df = pd.read_csv('sampledata/data1.csv')\n", 165 | "df" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "We could have also used the read_table function and specified the delimiter." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 4, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/html": [ 183 | "
\n", 184 | "\n", 197 | "\n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
MonthPeopleSales
0January12001678
1March600567
2June23002400
3August23334000
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " Month People Sales\n", 237 | "0 January 1200 1678\n", 238 | "1 March 600 567\n", 239 | "2 June 2300 2400\n", 240 | "3 August 2333 4000" 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "pd.read_table('sampledata/data1.csv', sep=',')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "We can also specify the column names ourselves." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 5, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/html": [ 267 | "
\n", 268 | "\n", 281 | "\n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | "
monthpeoplesales
0MonthPeopleSales
1January12001678
2March600567
3June23002400
4August23334000
\n", 323 | "
" 324 | ], 325 | "text/plain": [ 326 | " month people sales\n", 327 | "0 Month People Sales\n", 328 | "1 January 1200 1678\n", 329 | "2 March 600 567\n", 330 | "3 June 2300 2400\n", 331 | "4 August 2333 4000" 332 | ] 333 | }, 334 | "execution_count": 5, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "pd.read_csv('sampledata/data1.csv', names=['month', 'people', 'sales'])" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Note that this data frame already has column header specified, so using this function is required." 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Choose a Specific Column to use as Row Indices" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "We can select one of the column to use as a row index as follows:" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 6, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/html": [ 372 | "
\n", 373 | "\n", 386 | "\n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | "
PeopleSales
Month
January12001678
March600567
June23002400
August23334000
\n", 422 | "
" 423 | ], 424 | "text/plain": [ 425 | " People Sales\n", 426 | "Month \n", 427 | "January 1200 1678\n", 428 | "March 600 567\n", 429 | "June 2300 2400\n", 430 | "August 2333 4000" 431 | ] 432 | }, 433 | "execution_count": 6, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "pd.read_csv('sampledata/data1.csv', index_col='Month')" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Text Files of Bizzare Formats" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Sometimes data is stored in text files of weird format. They may have variable whitespace and no comma seperated values. Consider the following file." 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 7, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "[' People Sales\\n',\n", 465 | " 'Jan 12 1200\\n',\n", 466 | " 'Feb 23 2100\\n',\n", 467 | " 'Mar 45 4500']" 468 | ] 469 | }, 470 | "execution_count": 7, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "list(open('sampledata/data2.txt'))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "While we could scrape together some code to try and open this and store it in a DataFrame, the read_table function has ways of opening files like this in a simple manner:" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 8, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/html": [ 494 | "
\n", 495 | "\n", 508 | "\n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | "
PeopleSales
Jan121200
Feb232100
Mar454500
\n", 534 | "
" 535 | ], 536 | "text/plain": [ 537 | " People Sales\n", 538 | "Jan 12 1200\n", 539 | "Feb 23 2100\n", 540 | "Mar 45 4500" 541 | ] 542 | }, 543 | "execution_count": 8, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "pd.read_table('sampledata/data2.txt', sep='\\s+')" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "In this case, we have passed a regular expression \\s+ as a delimiter for the variable whitespace in this text file." 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "## Skipping Rows and Columns" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "We can also skip rows and columns of text file data." 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 9, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/html": [ 581 | "
\n", 582 | "\n", 595 | "\n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
PeopleSales
Month
March600567
August23334000
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " People Sales\n", 625 | "Month \n", 626 | "March 600 567\n", 627 | "August 2333 4000" 628 | ] 629 | }, 630 | "execution_count": 9, 631 | "metadata": {}, 632 | "output_type": "execute_result" 633 | } 634 | ], 635 | "source": [ 636 | "pd.read_csv('sampledata/data1.csv', index_col='Month', skiprows=[1,3])" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 10, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/html": [ 647 | "
\n", 648 | "\n", 661 | "\n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | "
People
Month
January1200
March600
June2300
August2333
\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " People\n", 695 | "Month \n", 696 | "January 1200\n", 697 | "March 600\n", 698 | "June 2300\n", 699 | "August 2333" 700 | ] 701 | }, 702 | "execution_count": 10, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | } 706 | ], 707 | "source": [ 708 | "pd.read_csv('sampledata/data1.csv', index_col='Month', usecols=['Month','People'])" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "## Handling Missing Values" 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "Suppose our comma delimted data looks something like this (ignore the strange symbols at the beginning). Whenever pandas sees an 'NA' or a blank cell, the corresponding cell in the DataFrame will be NaN." 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 11, 728 | "metadata": {}, 729 | "outputs": [ 730 | { 731 | "data": { 732 | "text/plain": [ 733 | "['something,a,b,c,message\\n',\n", 734 | " 'one,12,23,NA,hello\\n',\n", 735 | " 'two,23,,12,world\\n',\n", 736 | " 'three,3,,5,foo\\n']" 737 | ] 738 | }, 739 | "execution_count": 11, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "list(open('sampledata/data3.csv'))" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "Opened as a DataFrame:" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 12, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/html": [ 763 | "
\n", 764 | "\n", 777 | "\n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | "
somethingabcmessage
0one1223.0NaNhello
1two23NaN12.0world
2three3NaN5.0foo
\n", 815 | "
" 816 | ], 817 | "text/plain": [ 818 | " something a b c message\n", 819 | "0 one 12 23.0 NaN hello\n", 820 | "1 two 23 NaN 12.0 world\n", 821 | "2 three 3 NaN 5.0 foo" 822 | ] 823 | }, 824 | "execution_count": 12, 825 | "metadata": {}, 826 | "output_type": "execute_result" 827 | } 828 | ], 829 | "source": [ 830 | "pd.read_csv('sampledata/data3.csv')" 831 | ] 832 | }, 833 | { 834 | "cell_type": "markdown", 835 | "metadata": {}, 836 | "source": [ 837 | "Different NA sentinels can be specified for each column when searching for NA values." 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 13, 843 | "metadata": {}, 844 | "outputs": [ 845 | { 846 | "data": { 847 | "text/html": [ 848 | "
\n", 849 | "\n", 862 | "\n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | "
somethingabcmessage
0one1223.0NaNhello
1NaN23NaN12.0NaN
2three3NaN5.0NaN
\n", 900 | "
" 901 | ], 902 | "text/plain": [ 903 | " something a b c message\n", 904 | "0 one 12 23.0 NaN hello\n", 905 | "1 NaN 23 NaN 12.0 NaN\n", 906 | "2 three 3 NaN 5.0 NaN" 907 | ] 908 | }, 909 | "execution_count": 13, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "sentinels = {'message': ['foo', 'world'], 'something': ['two']}\n", 916 | "pd.read_csv('sampledata/data3.csv', na_values = sentinels)" 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "# Reading Text Files in Pieces" 924 | ] 925 | }, 926 | { 927 | "cell_type": "markdown", 928 | "metadata": {}, 929 | "source": [ 930 | "Sometimes when processing very large files OR when you want to figure out the proper arguments to correctly process a large file, we may only want to read in a smaller part of the file." 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 14, 936 | "metadata": { 937 | "collapsed": true 938 | }, 939 | "outputs": [], 940 | "source": [ 941 | "pd.options.display.max_rows = 10" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": 15, 947 | "metadata": {}, 948 | "outputs": [ 949 | { 950 | "data": { 951 | "text/html": [ 952 | "
\n", 953 | "\n", 966 | "\n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | "
onetwothreefour
01231235461
11245456234
2143523445634
334516523521
41234232334
...............
2201231235461
2211245456234
222143523445634
22334516523521
2241234232334
\n", 1056 | "

225 rows × 4 columns

\n", 1057 | "
" 1058 | ], 1059 | "text/plain": [ 1060 | " one two three four\n", 1061 | "0 123 123 546 1\n", 1062 | "1 12 45 456 234\n", 1063 | "2 1435 234 456 34\n", 1064 | "3 3451 65 235 21\n", 1065 | "4 123 423 23 34\n", 1066 | ".. ... ... ... ...\n", 1067 | "220 123 123 546 1\n", 1068 | "221 12 45 456 234\n", 1069 | "222 1435 234 456 34\n", 1070 | "223 3451 65 235 21\n", 1071 | "224 123 423 23 34\n", 1072 | "\n", 1073 | "[225 rows x 4 columns]" 1074 | ] 1075 | }, 1076 | "execution_count": 15, 1077 | "metadata": {}, 1078 | "output_type": "execute_result" 1079 | } 1080 | ], 1081 | "source": [ 1082 | "pd.read_csv('sampledata/data4.csv')" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "If we only want to read a small number of rows, we can specify that with the nrows argument." 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": 16, 1095 | "metadata": {}, 1096 | "outputs": [ 1097 | { 1098 | "data": { 1099 | "text/html": [ 1100 | "
\n", 1101 | "\n", 1114 | "\n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | "
onetwothreefour
01231235461
11245456234
2143523445634
334516523521
41234232334
\n", 1162 | "
" 1163 | ], 1164 | "text/plain": [ 1165 | " one two three four\n", 1166 | "0 123 123 546 1\n", 1167 | "1 12 45 456 234\n", 1168 | "2 1435 234 456 34\n", 1169 | "3 3451 65 235 21\n", 1170 | "4 123 423 23 34" 1171 | ] 1172 | }, 1173 | "execution_count": 16, 1174 | "metadata": {}, 1175 | "output_type": "execute_result" 1176 | } 1177 | ], 1178 | "source": [ 1179 | "pd.read_csv('sampledata/data4.csv', nrows = 5)" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "markdown", 1184 | "metadata": {}, 1185 | "source": [ 1186 | "We can also use TextParser objects to iterate through parts of the file according to a certain chunksize. " 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 17, 1192 | "metadata": {}, 1193 | "outputs": [ 1194 | { 1195 | "data": { 1196 | "text/plain": [ 1197 | "" 1198 | ] 1199 | }, 1200 | "execution_count": 17, 1201 | "metadata": {}, 1202 | "output_type": "execute_result" 1203 | } 1204 | ], 1205 | "source": [ 1206 | "chunker = pd.read_csv('sampledata/data4.csv', nrows=10, chunksize = 5)\n", 1207 | "chunker" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "markdown", 1212 | "metadata": {}, 1213 | "source": [ 1214 | "Rather than a DataFrame. the new type of object returned is a TextParser Object. Notice that the file opened above containing 10 rows is being iterated through in chunks of 5 rows at a time." 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "code", 1219 | "execution_count": 18, 1220 | "metadata": {}, 1221 | "outputs": [ 1222 | { 1223 | "name": "stdout", 1224 | "output_type": "stream", 1225 | "text": [ 1226 | " one two three four\n", 1227 | "0 123 123 546 1\n", 1228 | "1 12 45 456 234\n", 1229 | "2 1435 234 456 34\n", 1230 | "3 3451 65 235 21\n", 1231 | "4 123 423 23 34\n", 1232 | "---------------------------\n", 1233 | " one two three four\n", 1234 | "5 123 123 546 1\n", 1235 | "6 12 45 456 234\n", 1236 | "7 1435 234 456 34\n", 1237 | "8 3451 65 235 21\n", 1238 | "9 123 423 23 34\n", 1239 | "---------------------------\n" 1240 | ] 1241 | } 1242 | ], 1243 | "source": [ 1244 | "for piece in chunker:\n", 1245 | " print(piece)\n", 1246 | " print('---------------------------')" 1247 | ] 1248 | } 1249 | ], 1250 | "metadata": { 1251 | "kernelspec": { 1252 | "display_name": "Python 3", 1253 | "language": "python", 1254 | "name": "python3" 1255 | }, 1256 | "language_info": { 1257 | "codemirror_mode": { 1258 | "name": "ipython", 1259 | "version": 3 1260 | }, 1261 | "file_extension": ".py", 1262 | "mimetype": "text/x-python", 1263 | "name": "python", 1264 | "nbconvert_exporter": "python", 1265 | "pygments_lexer": "ipython3", 1266 | "version": "3.6.2" 1267 | } 1268 | }, 1269 | "nbformat": 4, 1270 | "nbformat_minor": 2 1271 | } 1272 | -------------------------------------------------------------------------------- /None0000000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/None0000000.png -------------------------------------------------------------------------------- /NumPy_1-The_NumPy_ndarray.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# Introduction" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "NumPy is short for Numerical Python. It is a foundational package for numerical computation in python. It includes a number of things including\n", 46 | "\n", 47 | "(i) ndarray; a multidimensional array providing fast arithmethic operations\n", 48 | "\n", 49 | "(ii) Mathematical functions for fast operations on entire arrays\n", 50 | "\n", 51 | "(iii) Tools for writing and reading array data to disk and working with memory wrapped files\n", 52 | "\n", 53 | "(iv) Linear algebra, random number generation, Fourier transform capabilities\n", 54 | "\n", 55 | "(v) A C API for connecting NumPy with libraries written in C, C++, and Fortran\n", 56 | "\n", 57 | "The C API maked it easy to pass data to external libraries written in a low level language and also for external libraries to return data to Python as NumPy arrays. This feature often makes Python a choice for wrapping legacy C/C++/Fortran codebases and giving them an easy to use interface.\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "# ndarray: The Multidimensional Array Object" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "ndarray is a fast and flexible container for large data sets in python. 
Consider some of the simple operations: " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "array([[ 1.75067497, -0.55541901, -2.29155033],\n", 83 | " [-0.66385816, 0.3124406 , -0.3907257 ]])" 84 | ] 85 | }, 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "import numpy as np\n", 93 | "\n", 94 | "data = np.random.randn(2,3)\n", 95 | "data" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "array([[ 17.50674974, -5.5541901 , -22.91550326],\n", 107 | " [ -6.63858164, 3.12440602, -3.90725697]])" 108 | ] 109 | }, 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "data*10" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "array([[ 3.50134995, -1.11083802, -4.58310065],\n", 128 | " [-1.32771633, 0.6248812 , -0.78145139]])" 129 | ] 130 | }, 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "data+data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Note that the random.randn(n,m) function returns an nxm array where each entry is a random number from a Gaussian distribution with mean 0 and variance 1." 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Every array has a shape and a datatype ." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "(2, 3)" 163 | ] 164 | }, 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "data.shape" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 6, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "dtype('float64')" 183 | ] 184 | }, 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "data.dtype" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Creating ndarrays" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "We can convert regular old lists to nparrays as follows:" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 7, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "array([3, 4, 5, 6])" 217 | ] 218 | }, 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "data1=[3, 4, 5, 6]\n", 226 | "arr1 = np.array(data1)\n", 227 | "arr1" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Nested sequences (like a list of lists) can be converted into a multidimensional numpy array." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "array([[1, 2, 3],\n", 246 | " [4, 5, 6]])" 247 | ] 248 | }, 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "nested_list=[[1, 2, 3],[4, 5, 6]]\n", 256 | "arr1 = np.array(nested_list)\n", 257 | "arr1" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "This array has a few properties, including its number of dimensions and its shape attributes." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 9, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "2" 276 | ] 277 | }, 278 | "execution_count": 9, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "arr1.ndim" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 10, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "(2, 3)" 296 | ] 297 | }, 298 | "execution_count": 10, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "arr1.shape" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "In addition, when the array is created, np.array tries to infer a good data type from the data it receives. The datatypw is stored in a special dtype metadata object." 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 11, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "dtype('int64')" 323 | ] 324 | }, 325 | "execution_count": 11, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "arr1.dtype" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "There are a number of other ways to create arrays. If we want to create an empty numpy array, for example, we can simply just fill it with zeros." 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 12, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" 350 | ] 351 | }, 352 | "execution_count": 12, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "np.zeros(10)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 13, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "array([[ 0., 0., 0., 0., 0.],\n", 370 | " [ 0., 0., 0., 0., 0.],\n", 371 | " [ 0., 0., 0., 0., 0.]])" 372 | ] 373 | }, 374 | "execution_count": 13, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "np.zeros((3,5))" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## Data Types for ndarrays" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "We can specify what type of data is contained in an ndarray by invoking a keyword argument." 
395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 14, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "dtype('float64')" 406 | ] 407 | }, 408 | "execution_count": 14, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "arr1 = np.array([1, 2, 3], dtype = np.float64)\n", 415 | "arr1.dtype" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "dtypes are essentially a way in which numpy interacts with other systems- this involves writing and reading data to a disk. This allows numpy to interact and connect with code written in C++ or Fortran." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "We can cast an array from one dtype to another using the astype method." 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "dtype('int64')" 441 | ] 442 | }, 443 | "execution_count": 15, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "arr = np.array([1,2,3,4,5])\n", 450 | "arr.dtype" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 16, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "dtype('float64')" 462 | ] 463 | }, 464 | "execution_count": 16, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "float_arr = arr.astype(np.float64)\n", 471 | "float_arr.dtype" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "In this example, integers were cast to floating point. If we cast floating point to integer, the decimal part is trunctated ( not rounded ). " 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "We can also cast strings representing numbers to their numeric form." 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "data": { 495 | "text/plain": [ 496 | "array([ 1.23, 4.56, 5.67])" 497 | ] 498 | }, 499 | "execution_count": 17, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "numeric_strings = np.array(['1.23', '4.56', '5.67'], dtype=np.string_)\n", 506 | "numeric_strings.astype(float)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "Here we wrote 'float' instead of 'np.float64'. NumPy automatically aliases the Python data types to its own equivalent data types:" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 18, 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "dtype('float64')" 525 | ] 526 | }, 527 | "execution_count": 18, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "numeric_strings.astype(float).dtype" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "If casting fails somehow, then a ValueError is raised. 
" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 19, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "ename": "ValueError", 550 | "evalue": "could not convert string to float: 'word'", 551 | "output_type": "error", 552 | "traceback": [ 553 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 554 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 555 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mbad_strings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'1.23'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'word'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mbad_strings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 556 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'word'" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "bad_strings = np.array(['1.23','word'])\n", 562 | "bad_strings.astype(float)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "Finally, we can make the data type of one array become the datatype of another array. It is often useful to deal with data of the same datatype." 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 20, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "array([ 1., 2., 3., 4., 5.])" 581 | ] 582 | }, 583 | "execution_count": 20, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "int_array = np.array([1, 2, 3, 4, 5])\n", 590 | "decimal_array = np.array([1.45, 8.32, 3.45])\n", 591 | "\n", 592 | "int_array.astype(decimal_array.dtype)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "# Arithmetic with NumPy Arrays" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "Arrays are important because we can do operations on data without requiring for loops. NumPy users call this vectorization. Any arithematic between equally sized NumPy arrays occurs element wise." 
607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 21, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": [ 617 | "array([[ 1., 2., 3.],\n", 618 | " [ 4., 5., 6.]])" 619 | ] 620 | }, 621 | "execution_count": 21, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "arr = np.array([[1, 2, 3],[4, 5, 6]], dtype=float)\n", 628 | "arr" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 22, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "array([[ 2., 4., 6.],\n", 640 | " [ 8., 10., 12.]])" 641 | ] 642 | }, 643 | "execution_count": 22, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "arr+arr" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 23, 655 | "metadata": {}, 656 | "outputs": [ 657 | { 658 | "data": { 659 | "text/plain": [ 660 | "array([[ 1., 4., 9.],\n", 661 | " [ 16., 25., 36.]])" 662 | ] 663 | }, 664 | "execution_count": 23, 665 | "metadata": {}, 666 | "output_type": "execute_result" 667 | } 668 | ], 669 | "source": [ 670 | "arr*arr" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "You get the point. Arithmetic opertation with scalars propagate the scalar argument to each element in the array." 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 24, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "array([[ 7., 9., 11.],\n", 689 | " [ 13., 15., 17.]])" 690 | ] 691 | }, 692 | "execution_count": 24, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "2*arr+5" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "Comparisons between arrays of the same size yield boolean arrays." 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 25, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "array([[ True, False, False],\n", 717 | " [False, False, False]], dtype=bool)" 718 | ] 719 | }, 720 | "execution_count": 25, 721 | "metadata": {}, 722 | "output_type": "execute_result" 723 | } 724 | ], 725 | "source": [ 726 | "2/arr > 0.5*arr+1" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "Operation between different arrays is called broadcasting and is discussed in the appendix of the textbook." 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "# Basic Indexing and Slicing" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "One dimensional arrays are simple to index: they are exactly the same as a Python list." 
748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 26, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": [ 758 | "array([5, 6, 7])" 759 | ] 760 | }, 761 | "execution_count": 26, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "arr = np.arange(10)\n", 768 | "arr[5:8]" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "We can also change these values like before:" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 27, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "data": { 785 | "text/plain": [ 786 | "array([ 0, 1, 2, 3, 4, 42, 42, 42, 8, 9])" 787 | ] 788 | }, 789 | "execution_count": 27, 790 | "metadata": {}, 791 | "output_type": "execute_result" 792 | } 793 | ], 794 | "source": [ 795 | "arr[5:8] = 42\n", 796 | "arr" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "There is an extremely important distinction from python lists: array slices are viewed and not copied. That means that any changes to the array slice are reflected in the original array." 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 28, 809 | "metadata": {}, 810 | "outputs": [ 811 | { 812 | "data": { 813 | "text/plain": [ 814 | "array([ 0, 1, 2, 3, 4, 42, 12334, 42, 8, 9])" 815 | ] 816 | }, 817 | "execution_count": 28, 818 | "metadata": {}, 819 | "output_type": "execute_result" 820 | } 821 | ], 822 | "source": [ 823 | "arr_slice = arr[5:8]\n", 824 | "arr_slice[1] = 12334\n", 825 | "\n", 826 | "arr" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "The array slice \"points to\" the original values in the NumPy array. This saves space when dealing with large data sets." 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "The \"bare\" slice [:] assigns to all values in an array." 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 29, 846 | "metadata": {}, 847 | "outputs": [ 848 | { 849 | "data": { 850 | "text/plain": [ 851 | "array([ 0, 1, 2, 3, 4, 64, 64, 64, 8, 9])" 852 | ] 853 | }, 854 | "execution_count": 29, 855 | "metadata": {}, 856 | "output_type": "execute_result" 857 | } 858 | ], 859 | "source": [ 860 | "arr_slice[:] = 64\n", 861 | "arr" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": {}, 867 | "source": [ 868 | "If we really want, we can copy subsections of arrays to new arrays using, in this case, something like arr[5:8].copy()" 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "## Indexing in Higher Dimensional Arrays" 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "metadata": {}, 881 | "source": [ 882 | "Higher dimensional array indexing is slightly more complicated. 
Elements at each index are arrays themselves:" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": 30, 888 | "metadata": {}, 889 | "outputs": [ 890 | { 891 | "data": { 892 | "text/plain": [ 893 | "array([4, 5, 6])" 894 | ] 895 | }, 896 | "execution_count": 30, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "arr2d = np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]])\n", 903 | "arr2d[1]" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "Since arr2d[1] is a list, we just need to index this list to get specific elements." 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 33, 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "data": { 920 | "text/plain": [ 921 | "4" 922 | ] 923 | }, 924 | "execution_count": 33, 925 | "metadata": {}, 926 | "output_type": "execute_result" 927 | } 928 | ], 929 | "source": [ 930 | "arr2d[1][0]\n", 931 | "arr2d[1, 0] #equivalent" 932 | ] 933 | }, 934 | { 935 | "cell_type": "markdown", 936 | "metadata": {}, 937 | "source": [ 938 | "In multidimensional arrays, if you omit later indices then the object returned will be a lower dimensional array consisting of all the data along the higher dimensions:" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 34, 944 | "metadata": {}, 945 | "outputs": [ 946 | { 947 | "data": { 948 | "text/plain": [ 949 | "array([[1, 2, 3],\n", 950 | " [4, 5, 6]])" 951 | ] 952 | }, 953 | "execution_count": 34, 954 | "metadata": {}, 955 | "output_type": "execute_result" 956 | } 957 | ], 958 | "source": [ 959 | "arr3d = np.array([[[1,2,3],[4,5,6],[7,8,9]], [[7,8,9], [10,11,12]]])\n", 960 | "\n", 961 | "arr3d[0]" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Indexing with Slices" 969 | ] 970 | }, 971 | { 972 | "cell_type": "markdown", 973 | "metadata": {}, 974 | "source": [ 975 | "Indexing can be extended to work with multi-dimesional arrays. Note that in 2d arrays, slicing always occurs in the 'row'-'column' format." 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": 37, 981 | "metadata": {}, 982 | "outputs": [ 983 | { 984 | "data": { 985 | "text/plain": [ 986 | "array([[2, 3],\n", 987 | " [5, 6]])" 988 | ] 989 | }, 990 | "execution_count": 37, 991 | "metadata": {}, 992 | "output_type": "execute_result" 993 | } 994 | ], 995 | "source": [ 996 | "arr3d[0][:2, 1:]" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": {}, 1002 | "source": [ 1003 | "# Boolean Indexing" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "## Introduction" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "markdown", 1015 | "metadata": {}, 1016 | "source": [ 1017 | "Boolean indexing in Numpy is similar to that in pandas. 
Consider the following data:" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "code", 1022 | "execution_count": 52, 1023 | "metadata": { 1024 | "collapsed": true 1025 | }, 1026 | "outputs": [], 1027 | "source": [ 1028 | "names = np.array(['Bob','Joe','Will','Joe','Bob','Will','Will'])\n", 1029 | "data = np.random.randn(7,4)" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "If each name corresponds to a row in the numpy array and we want to select the rows corresponding to 'Bob', then we can do the following:" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 53, 1042 | "metadata": {}, 1043 | "outputs": [ 1044 | { 1045 | "data": { 1046 | "text/plain": [ 1047 | "array([[ 0.53167396, 0.0629875 , -0.26724699, -0.68922298],\n", 1048 | " [ 0.9609578 , 0.40406404, 0.33030467, -1.01213574]])" 1049 | ] 1050 | }, 1051 | "execution_count": 53, 1052 | "metadata": {}, 1053 | "output_type": "execute_result" 1054 | } 1055 | ], 1056 | "source": [ 1057 | "data[names=='Bob']" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "markdown", 1062 | "metadata": {}, 1063 | "source": [ 1064 | "The boolean array based into data[...] must be the same length as the axis it's indexing." 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "markdown", 1069 | "metadata": {}, 1070 | "source": [ 1071 | "We can mix boolean indexing and regular indexing for concise code:" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 54, 1077 | "metadata": {}, 1078 | "outputs": [ 1079 | { 1080 | "data": { 1081 | "text/plain": [ 1082 | "array([[ 0.21120775, 0.52717437],\n", 1083 | " [ 1.98533752, -2.58838035]])" 1084 | ] 1085 | }, 1086 | "execution_count": 54, 1087 | "metadata": {}, 1088 | "output_type": "execute_result" 1089 | } 1090 | ], 1091 | "source": [ 1092 | "data[names=='Joe', 1:3]" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "markdown", 1097 | "metadata": {}, 1098 | "source": [ 1099 | "Negations can be obtained using ~ or ~=. The ~ is particularily useful when you have a general (perhaps very large) logical condition which you must enforce." 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 55, 1105 | "metadata": {}, 1106 | "outputs": [ 1107 | { 1108 | "data": { 1109 | "text/plain": [ 1110 | "array([False, True, True, True, False, True, True], dtype=bool)" 1111 | ] 1112 | }, 1113 | "execution_count": 55, 1114 | "metadata": {}, 1115 | "output_type": "execute_result" 1116 | } 1117 | ], 1118 | "source": [ 1119 | "~(names=='Bob')\n", 1120 | "names!='Bob' #equivalent" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "markdown", 1125 | "metadata": {}, 1126 | "source": [ 1127 | "## Creating Masks for Complex Logical Statements" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "markdown", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "We can create logical masks for complex statements." 
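A quick aside before the mask example below (an illustrative sketch reusing the names array defined above; the commented output is what NumPy would produce): masks are combined with the element-wise operators & and |, each condition wrapped in parentheses, because the Python keywords and/or do not work on arrays.

import numpy as np

names = np.array(['Bob', 'Joe', 'Will', 'Joe', 'Bob', 'Will', 'Will'])

# Element-wise OR of two boolean arrays; the parentheses are required because
# & and | bind more tightly than ==.
mask = (names == 'Bob') | (names == 'Will')
mask
# array([ True, False,  True, False,  True,  True,  True])

# Writing (names == 'Bob') or (names == 'Will') would instead raise
# "ValueError: The truth value of an array with more than one element is ambiguous."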
1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": 56, 1140 | "metadata": {}, 1141 | "outputs": [ 1142 | { 1143 | "data": { 1144 | "text/plain": [ 1145 | "array([[ 0.53167396, 0.0629875 , -0.26724699, -0.68922298],\n", 1146 | " [-0.45073639, 0.68042297, 2.27010218, 1.24061411],\n", 1147 | " [ 0.9609578 , 0.40406404, 0.33030467, -1.01213574],\n", 1148 | " [ 0.36954954, 0.12929251, -2.02457337, -1.38915124],\n", 1149 | " [-0.03669644, -0.09884574, 0.30501663, 0.08721723]])" 1150 | ] 1151 | }, 1152 | "execution_count": 56, 1153 | "metadata": {}, 1154 | "output_type": "execute_result" 1155 | } 1156 | ], 1157 | "source": [ 1158 | "mask = (names == 'Bob') | (names == 'Will')\n", 1159 | "\n", 1160 | "data[mask]" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "markdown", 1165 | "metadata": {}, 1166 | "source": [ 1167 | " IMPORTANT: Logical keywords 'and' and 'or' do not work for boolean indexing or creating masks. Use & and | instead." 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "markdown", 1172 | "metadata": {}, 1173 | "source": [ 1174 | "Boolean indexing can be useful to change or get rid of unwanted values." 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "code", 1179 | "execution_count": 57, 1180 | "metadata": {}, 1181 | "outputs": [ 1182 | { 1183 | "data": { 1184 | "text/plain": [ 1185 | "array([[ 0.53167396, 0.0629875 , 0. , 0. ],\n", 1186 | " [ 0. , 0.21120775, 0.52717437, 1.25553763],\n", 1187 | " [ 0. , 0.68042297, 2.27010218, 1.24061411],\n", 1188 | " [ 1.32118295, 1.98533752, 0. , 0.60246169],\n", 1189 | " [ 0.9609578 , 0.40406404, 0.33030467, 0. ],\n", 1190 | " [ 0.36954954, 0.12929251, 0. , 0. ],\n", 1191 | " [ 0. , 0. , 0.30501663, 0.08721723]])" 1192 | ] 1193 | }, 1194 | "execution_count": 57, 1195 | "metadata": {}, 1196 | "output_type": "execute_result" 1197 | } 1198 | ], 1199 | "source": [ 1200 | "data[data<0] = 0\n", 1201 | "data" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "markdown", 1206 | "metadata": {}, 1207 | "source": [ 1208 | "# Transposing Arrays and Swapping Axes" 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "markdown", 1213 | "metadata": {}, 1214 | "source": [ 1215 | "Transposing arrays is particularily useful for matrix algebra. 
" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": 59, 1221 | "metadata": {}, 1222 | "outputs": [ 1223 | { 1224 | "data": { 1225 | "text/plain": [ 1226 | "array([[ 0, 1, 2, 3, 4],\n", 1227 | " [ 5, 6, 7, 8, 9],\n", 1228 | " [10, 11, 12, 13, 14]])" 1229 | ] 1230 | }, 1231 | "execution_count": 59, 1232 | "metadata": {}, 1233 | "output_type": "execute_result" 1234 | } 1235 | ], 1236 | "source": [ 1237 | "arr = np.arange(15).reshape((3,5))\n", 1238 | "arr" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": 60, 1244 | "metadata": {}, 1245 | "outputs": [ 1246 | { 1247 | "data": { 1248 | "text/plain": [ 1249 | "array([[ 0, 5, 10],\n", 1250 | " [ 1, 6, 11],\n", 1251 | " [ 2, 7, 12],\n", 1252 | " [ 3, 8, 13],\n", 1253 | " [ 4, 9, 14]])" 1254 | ] 1255 | }, 1256 | "execution_count": 60, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "arr.T" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "markdown", 1267 | "metadata": {}, 1268 | "source": [ 1269 | "Calculating the inner product of two matrices:" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": 61, 1275 | "metadata": {}, 1276 | "outputs": [ 1277 | { 1278 | "data": { 1279 | "text/plain": [ 1280 | "array([[ 30, 80, 130],\n", 1281 | " [ 80, 255, 430],\n", 1282 | " [130, 430, 730]])" 1283 | ] 1284 | }, 1285 | "execution_count": 61, 1286 | "metadata": {}, 1287 | "output_type": "execute_result" 1288 | } 1289 | ], 1290 | "source": [ 1291 | "np.dot(arr,arr.T)" 1292 | ] 1293 | } 1294 | ], 1295 | "metadata": { 1296 | "kernelspec": { 1297 | "display_name": "Python 3", 1298 | "language": "python", 1299 | "name": "python3" 1300 | }, 1301 | "language_info": { 1302 | "codemirror_mode": { 1303 | "name": "ipython", 1304 | "version": 3 1305 | }, 1306 | "file_extension": ".py", 1307 | "mimetype": "text/x-python", 1308 | "name": "python", 1309 | "nbconvert_exporter": "python", 1310 | "pygments_lexer": "ipython3", 1311 | "version": "3.6.2" 1312 | } 1313 | }, 1314 | "nbformat": 4, 1315 | "nbformat_minor": 2 1316 | } 1317 | -------------------------------------------------------------------------------- /NumPy_2-Universal_Functions-Fast_Element-Wise_Array_Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 2, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Univeral Function Introduction" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "A universal function or ufunc performs element wise operations on data in ndarrays. These functions include simple mathematical functions like exponentials and square roots." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([0. , 1. , 1.41421356, 1.73205081, 2. ,\n", 68 | " 2.23606798, 2.44948974, 2.64575131, 2.82842712, 3. ])" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "arr = np.arange(10)\n", 78 | "np.sqrt(arr)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "These are referred to as unary ufuncs. Other functions, such as maximum take two ndarrays and return a single array ( binary ufuncs)." 
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([ 0.26252147, -0.06023566, 1.61060357, 0.94351335, 0.37743868,\n", 97 | " 1.72642266, 0.20044704, 1.36904904, 0.09303552, 1.45390904])" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "x=np.random.randn(10)\n", 107 | "y=np.random.randn(10)\n", 108 | "np.maximum(x, y)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Returning Multiple Arrays" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "A ufunc can also return multiple arrays; 'modf' is one example - it returns the fractional and integer part of an array of numbers:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "array([ -5.7678331 , -4.74337863, 9.23923766, 1.66801359,\n", 134 | " -17.62486265, 7.25717594, -5.52891383, -1.30939992,\n", 135 | " -4.26590234, 2.31233354])" 136 | ] 137 | }, 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "arr = np.random.randn(10) * 5\n", 145 | "arr" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "array([-0.7678331 , -0.74337863, 0.23923766, 0.66801359, -0.62486265,\n", 157 | " 0.25717594, -0.52891383, -0.30939992, -0.26590234, 0.31233354])" 158 | ] 159 | }, 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "remainder, whole_part = np.modf(arr)\n", 167 | "remainder" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "array([ -5., -4., 9., 1., -17., 7., -5., -1., -4., 2.])" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "whole_part" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# Returning the New Array to a New Variable" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "ufuncs take an optional out argument that allows them to copy the new array to a new variable:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "C:\\Users\\lukep\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: RuntimeWarning: invalid value encountered in sqrt\n", 214 | " \n" 215 | ] 216 | }, 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([ nan, nan, nan, nan, nan,\n", 221 | " nan, 0.7803909 , nan, 0.25932723, 1.00560002])" 222 | ] 223 | }, 224 | "execution_count": 8, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "arr = np.random.randn(10)\n", 231 | "np.sqrt(arr, arr)\n", 232 | "\n", 233 | "arr" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | 
"metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.2" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /Pandas_3-Summarizing_and_Computing_Descriptive_Statistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Table of Contents and Notebook Setup

\n", 8 | "
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "data": { 18 | "application/javascript": [ 19 | "$.get('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 20 | ], 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "output_type": "display_data" 27 | } 28 | ], 29 | "source": [ 30 | "%%javascript\n", 31 | "$.get('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Some Math Prereqs" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "source": [ 59 | "The covariance of two distributions X and Y is defined as" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "source": [ 68 | "$$cov(X,Y) = \\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\mu_x)(y_i-\\mu_y) $$" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "i represents the 'i'th measurement \n", 76 | "\n", 77 | "n is the total number of measurements \n", 78 | "\n", 79 | "$x_i$ and $y_i$ are individual measurements\n", 80 | "\n", 81 | "$\\mu_x$ and $\\mu_y$ are the mean values of X and Y" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "This gives us information about how the two variables deviate from their expected values (means) and if they do it at the same time. If they both deviate positively or negatively at the same time, then we get a large contribution from the sum. If one deviates positively and the other negatively then we get a large negative number. Big positive numbers mean the variables are correlated (big negative also means they're related in some way too- as one goes up the other goes down)." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "The problem is that these large or small numbers depend on the scale of the units we use for measurement. We want a quantity that we know is the same for all distributions. We can divide by their variances (related to standard deviation)." 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "$$\\sigma_x \\equiv cov(X,X) = \\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\mu_x)^2 $$" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "We define the correlation (or more precisely: the linear correlation ) as follows:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "$$corr(X,Y)=\\frac{cov(X,Y)}{\\sqrt{\\sigma_x \\sigma_y}}$$" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "The inequaltity $-1 \\leq corr(X,Y) \\leq 1$ always holds. This can be shown through the Cauchy-Schwartz inequality (quantities $x_i-\\mu_x$ are elements of a vector)." 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "If $corr(X,Y)=1$ then the values are perfectly correlated (in the vector space of measurements $x_i-\\mu_x$ and $y_i-\\mu_y$ they point in the same direction). 
If $corr(X,Y)=-1$ then the values are perfectly anticorrelated (in the vector space they point in opposite directions)." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Such information is useful when comparing stock trends. Suppose that Amazon always drops the day after Microsoft goes up and we find a strict anticorrelation ($corr(X,Y)=-1$). In the future, when Microsoft goes up, we may want to sell our Amazon stock as we know it's going to drop, then pick it up the next day for a discount." 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "source": [ 146 | "# Basic Mathematical Functions of Pandas" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "pandas objects are equipped to deal with a variety of mathematical and statistical functions, and can also deal with missing data." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 3, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/html": [ 164 | "
\n", 165 | "\n", 178 | "\n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
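One point the following cells rely on (shown here as a minimal sketch that simply rebuilds the DataFrame above): the pandas reductions skip missing values by default, which is why the column sums below come out as 9.25 and -5.80; passing skipna=False lets a single NaN make the whole result NaN.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'], columns=['one', 'two'])

df.sum()               # NaNs skipped: one 9.25, two -5.80
df.sum(skipna=False)   # both columns contain a NaN, so both sums become NaN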
\n", 209 | "
" 210 | ], 211 | "text/plain": [ 212 | " one two\n", 213 | "a 1.40 NaN\n", 214 | "b 7.10 -4.5\n", 215 | "c NaN NaN\n", 216 | "d 0.75 -1.3" 217 | ] 218 | }, 219 | "execution_count": 3, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],\n", 226 | " index=['a','b','c','d'], columns=['one','two'])\n", 227 | "df" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## The Sum Method" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "Use sum to return the column sums. " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 4, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "one 9.25\n", 253 | "two -5.80\n", 254 | "dtype: float64" 255 | ] 256 | }, 257 | "execution_count": 4, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "df.sum()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "We can also use axis='columns' to sum across the columns instead." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 5, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "a 1.40\n", 282 | "b 2.60\n", 283 | "c 0.00\n", 284 | "d -0.55\n", 285 | "dtype: float64" 286 | ] 287 | }, 288 | "execution_count": 5, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "df.sum(axis='columns')" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## The Mean Method" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "We can exclude rows with NA values if we like:" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 6, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "a NaN\n", 320 | "b 1.300\n", 321 | "c NaN\n", 322 | "d -0.275\n", 323 | "dtype: float64" 324 | ] 325 | }, 326 | "execution_count": 6, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df.mean(axis='columns', skipna=False)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Accumulation Method (Integration of Rows/Columns)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 7, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/html": [ 350 | "
\n", 351 | "\n", 364 | "\n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | "
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
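Note (a small sketch using the values from column 'one' above): cumsum leaves NaN in place but does not let it poison the running total, so the accumulation simply resumes after the gap.

import numpy as np
import pandas as pd

pd.Series([1.4, 7.1, np.nan, 0.75], index=['a', 'b', 'c', 'd']).cumsum()
# a    1.40
# b    8.50
# c     NaN
# d    9.25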
\n", 395 | "
" 396 | ], 397 | "text/plain": [ 398 | " one two\n", 399 | "a 1.40 NaN\n", 400 | "b 8.50 -4.5\n", 401 | "c NaN NaN\n", 402 | "d 9.25 -5.8" 403 | ] 404 | }, 405 | "execution_count": 7, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "df.cumsum()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "We can use this for integration if we like." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 8, 424 | "metadata": {}, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/html": [ 429 | "
\n", 430 | "\n", 443 | "\n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | "
       one     two
a  0.00140     NaN
b  0.00850 -0.0045
c      NaN     NaN
d  0.00925 -0.0058
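To make the integration claim concrete, here is a minimal sketch; the integrand f(x) = x and the step size are chosen purely for illustration and are not part of the original notebook. The cumulative sum of f(x)*dx is a Riemann-sum approximation of the integral.

import numpy as np
import pandas as pd

dx = 0.001
x = np.arange(0, 1, dx)
s = pd.Series(x, index=x)      # f(x) = x sampled on [0, 1)

approx = (s * dx).cumsum()     # running Riemann sum of f(x) dx
approx.iloc[-1]                # ~0.4995, close to the exact integral 1/2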
\n", 474 | "
" 475 | ], 476 | "text/plain": [ 477 | " one two\n", 478 | "a 0.00140 NaN\n", 479 | "b 0.00850 -0.0045\n", 480 | "c NaN NaN\n", 481 | "d 0.00925 -0.0058" 482 | ] 483 | }, 484 | "execution_count": 8, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "dx = 0.001\n", 491 | "(df*dx).cumsum()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "## Basic Statistical Method " 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "We can use the describe method to learn about the rows and columns." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 9, 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "data": { 515 | "text/html": [ 516 | "
\n", 517 | "\n", 530 | "\n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | "
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
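The describe() output is a convenience summary; as a quick sketch (rebuilding the same df, with NaNs skipped as usual), each of its rows can be reproduced with the individual reduction methods:

import numpy as np
import pandas as pd

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'], columns=['one', 'two'])

df['one'].count()          # 3
df['one'].mean()           # 3.083333
df['one'].std()            # 3.493685
df['one'].quantile(0.25)   # 1.075
df['one'].max()            # 7.1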
\n", 581 | "
" 582 | ], 583 | "text/plain": [ 584 | " one two\n", 585 | "count 3.000000 2.000000\n", 586 | "mean 3.083333 -2.900000\n", 587 | "std 3.493685 2.262742\n", 588 | "min 0.750000 -4.500000\n", 589 | "25% 1.075000 -3.700000\n", 590 | "50% 1.400000 -2.900000\n", 591 | "75% 4.250000 -2.100000\n", 592 | "max 7.100000 -1.300000" 593 | ] 594 | }, 595 | "execution_count": 9, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "df.describe()" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "## Summary" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "See table 5-8 on page 160 of textbook for all simple methods." 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "# Correlation and Covariance" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "## Introduction to Correlation in DataFrames" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": {}, 635 | "source": [ 636 | "Correlation and Covariance look at the relationship between two data sets. Below we compare stock datasets. " 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 10, 642 | "metadata": { 643 | "scrolled": true 644 | }, 645 | "outputs": [ 646 | { 647 | "ename": "ImportError", 648 | "evalue": "cannot import name 'is_list_like'", 649 | "output_type": "error", 650 | "traceback": [ 651 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 652 | "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", 653 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mweb\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mstocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'AMZN'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'GOOG'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'AAPL'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'TD'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'JNJ'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'IBM'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2017\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m29\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mend\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2018\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m8\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 654 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_version\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mget_versions\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m 
from .data import (DataReader, Options, get_components_yahoo,\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mget_dailysummary_iex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_enigma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_famafrench\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mget_data_fred\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_google\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_moex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mget_data_morningstar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_quandl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mget_data_stooq\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 655 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\data.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mImmediateDeprecationError\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfamafrench\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mFamaFrenchReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfred\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mFredReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgoogle\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdaily\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mGoogleDailyReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgoogle\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOptions\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mGoogleOptions\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 656 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas_datareader\\fred.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mis_list_like\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mconcat\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mread_csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas_datareader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbase\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_BaseReader\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 657 | "\u001b[1;31mImportError\u001b[0m: cannot import name 'is_list_like'" 658 | ] 659 | } 660 | ], 661 | "source": [ 662 | "import pandas_datareader.data as web\n", 663 | "stocks = ['AMZN', 'GOOG', 'AAPL', 'TD', 'JNJ', 'IBM']\n", 664 | "\n", 665 | "start = pd.datetime(2017, 7, 29)\n", 666 | "end = pd.datetime(2018, 8, 2)\n", 667 | "f1 = web.DataReader(stocks, 'iex', start, end)\n", 668 | "f1['open'].head() #opening price for the stock on that day" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 
673 | "metadata": {}, 674 | "source": [ 675 | "Lets apply some functions and see how the stock changes at the beginning and the end of the day. Recall that functions like the one below can operate on rows or columns of dataframes; in this case we choose column headers." 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": { 682 | "collapsed": true 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "def find_change(x, stock):\n", 687 | " return x['close'][stock]-x['open'][stock]\n", 688 | "\n", 689 | "stock_day_changes = pd.DataFrame([f1.apply(find_change, axis='columns', args=(stock,)) \n", 690 | " for stock in stocks], index=stocks)\n", 691 | "stock_day_changes = stock_day_changes.transpose()\n", 692 | "stock_day_changes.head()" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "Correlation for entire DataFrame:" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": { 706 | "collapsed": true 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "stock_day_changes.corr()" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "Or we can just select single elements:" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": { 724 | "collapsed": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "stock_day_changes['AMZN'].corr(stock_day_changes['GOOG'])" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "Or we can select rows:" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "collapsed": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "stock_day_changes.corrwith(stock_day_changes.AMZN)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "# Unique Values, Counting Occurences, and Membership of Elements in a Series" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "The pandas module has even more methods for determining if elements in a series are unique. This is obviously useful for DataFrames as the rows and columns can be extracted as Series." 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "## Uniqueness" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": { 774 | "collapsed": true 775 | }, 776 | "outputs": [], 777 | "source": [ 778 | "obj = pd.Series(['c', 'a', 'c', 'b', 'a', 'c', 'b', 'a', 'c'])\n", 779 | "uniques = obj.unique()\n", 780 | "uniques" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "## Counting Occurences" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": { 794 | "collapsed": true 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "obj = pd.Series(['c', 'a', 'c', 'b', 'a', 'c', 'b', 'a', 'c'])\n", 799 | "obj.value_counts()" 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "The returned Series is sorted by the number of occurences. 
We can choose not to have this as well:" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": true 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "pd.value_counts(obj.values, sort=False)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "## Membership" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "Sometimes we want to see if an element is contained in a Series. We can use the isin method for this." 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": { 838 | "collapsed": true 839 | }, 840 | "outputs": [], 841 | "source": [ 842 | "mask = obj.isin(['b', 'c'])\n", 843 | "mask" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "Then we can use the mask to extract the elements that we want." 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": { 857 | "collapsed": true 858 | }, 859 | "outputs": [], 860 | "source": [ 861 | "obj[mask]" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": {}, 867 | "source": [ 868 | "This can often make boolean indexing DataFrames easier when one has lots of conditions." 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "Suppose we have a Series of distinct values and a Series of non-distinct values like below:" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": { 882 | "collapsed": true 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "to_match = pd.Series(['c','b','c','a','b'])\n", 887 | "unique_vals = pd.Series(['b','a','c'])" 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": {}, 893 | "source": [ 894 | "We can use the Index.get_indexer method to give an index array from the unique values:" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": null, 900 | "metadata": { 901 | "collapsed": true 902 | }, 903 | "outputs": [], 904 | "source": [ 905 | "pd.Index(unique_vals).get_indexer(to_match)" 906 | ] 907 | } 908 | ], 909 | "metadata": { 910 | "kernelspec": { 911 | "display_name": "Python 3", 912 | "language": "python", 913 | "name": "python3" 914 | }, 915 | "language_info": { 916 | "codemirror_mode": { 917 | "name": "ipython", 918 | "version": 3 919 | }, 920 | "file_extension": ".py", 921 | "mimetype": "text/x-python", 922 | "name": "python", 923 | "nbconvert_exporter": "python", 924 | "pygments_lexer": "ipython3", 925 | "version": "3.6.2" 926 | } 927 | }, 928 | "nbformat": 4, 929 | "nbformat_minor": 2 930 | } 931 | -------------------------------------------------------------------------------- /convert_html_to_colour_pdf.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf400 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset0 Menlo-Regular;} 3 | {\colortbl;\red255\green255\blue255;\red27\green29\blue31;\red235\green236\blue237;} 4 | {\*\expandedcolortbl;;\cssrgb\c14118\c15294\c16078;\cssrgb\c93725\c94118\c94510;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural\partightenfactor0 7 | 8 | \f0\fs24 \cf0 TO CONVERT .ipynb TO .pdf FILES:\ 9 | \ 10 | 1) In 
Jupyter Notebook save as html file\ 11 | 2) Open html in text editor, search for \'93@media print\'94 and delete \'93color: #000 !important;\'94\ 12 | 3) Open html file and save as pdf (enable print in color)\ 13 | \ 14 | 15 | \f1\fs26 \cf2 \cb3 \expnd0\expndtw0\kerning0 16 | \ 17 | 18 | \f0\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 19 | } -------------------------------------------------------------------------------- /dataexample.csv: -------------------------------------------------------------------------------- 1 | Month,People,Sales 2 | January,1200,1678 3 | March,600,567 4 | June,2300,2400 5 | August,2333,4000 6 | -------------------------------------------------------------------------------- /htmls_and_pdfs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/htmls_and_pdfs/.DS_Store -------------------------------------------------------------------------------- /htmls_and_pdfs/html_to_colour.py: -------------------------------------------------------------------------------- 1 | import os 2 | delete_list = ["color: #000 !important;"] 3 | 4 | for filename in os.listdir('/Users/lukepolson/Documents/Jupyter Tutorials/python/htmls'): 5 | if filename.endswith(".html"): 6 | infile = filename 7 | outfile = filename+'temp' 8 | 9 | fin = open(infile) 10 | fout = open(outfile, "w+") 11 | 12 | for line in fin: 13 | for word in delete_list: 14 | line = line.replace(word, "") 15 | fout.write(line) 16 | 17 | fin.close() 18 | fout.close() 19 | os.rename(filename+'temp', filename) 20 | 21 | continue 22 | else: 23 | continue 24 | -------------------------------------------------------------------------------- /htmls_and_pdfs/to_pdf.py: -------------------------------------------------------------------------------- 1 | import weasyprint 2 | import glob 3 | import os 4 | 5 | #pdfkit.from_file('Pandas_1-Introduction.html', 'ye.pdf') 6 | 7 | 8 | for filename in glob.glob("*.html"): 9 | #pdfkit.from_file(filename, filename[:-5]+'.pdf') 10 | weasyprint.HTML(filename).write_pdf(filename[:-5]+'.pdf') 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sample_plot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/sample_plot.pdf -------------------------------------------------------------------------------- /sample_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/sample_plot.png -------------------------------------------------------------------------------- /sampledata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/sampledata/.DS_Store -------------------------------------------------------------------------------- /sampledata/data1.csv: -------------------------------------------------------------------------------- 1 | Month,People,Sales 2 | January,1200,1678 3 | March,600,567 4 | June,2300,2400 5 | August,2333,4000 6 | -------------------------------------------------------------------------------- /sampledata/data1.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/sampledata/data1.txt -------------------------------------------------------------------------------- /sampledata/data2.txt: -------------------------------------------------------------------------------- 1 | People Sales 2 | Jan 12 1200 3 | Feb 23 2100 4 | Mar 45 4500 -------------------------------------------------------------------------------- /sampledata/data3.csv: -------------------------------------------------------------------------------- 1 | something,a,b,c,message 2 | one,12,23,NA,hello 3 | two,23,,12,world 4 | three,3,,5,foo 5 | -------------------------------------------------------------------------------- /sampledata/data4.csv: -------------------------------------------------------------------------------- 1 | one,two,three,four 2 | 123,123,546,1 3 | 12,45,456,234 4 | 1435,234,456,34 5 | 3451,65,235,21 6 | 123,423,23,34 7 | 123,123,546,1 8 | 12,45,456,234 9 | 1435,234,456,34 10 | 3451,65,235,21 11 | 123,423,23,34 12 | 123,123,546,1 13 | 12,45,456,234 14 | 1435,234,456,34 15 | 3451,65,235,21 16 | 123,423,23,34 17 | 123,123,546,1 18 | 12,45,456,234 19 | 1435,234,456,34 20 | 3451,65,235,21 21 | 123,423,23,34 22 | 123,123,546,1 23 | 12,45,456,234 24 | 1435,234,456,34 25 | 3451,65,235,21 26 | 123,423,23,34 27 | 123,123,546,1 28 | 12,45,456,234 29 | 1435,234,456,34 30 | 3451,65,235,21 31 | 123,423,23,34 32 | 123,123,546,1 33 | 12,45,456,234 34 | 1435,234,456,34 35 | 3451,65,235,21 36 | 123,423,23,34 37 | 123,123,546,1 38 | 12,45,456,234 39 | 1435,234,456,34 40 | 3451,65,235,21 41 | 123,423,23,34 42 | 123,123,546,1 43 | 12,45,456,234 44 | 1435,234,456,34 45 | 3451,65,235,21 46 | 123,423,23,34 47 | 123,123,546,1 48 | 12,45,456,234 49 | 1435,234,456,34 50 | 3451,65,235,21 51 | 123,423,23,34 52 | 123,123,546,1 53 | 12,45,456,234 54 | 1435,234,456,34 55 | 3451,65,235,21 56 | 123,423,23,34 57 | 123,123,546,1 58 | 12,45,456,234 59 | 1435,234,456,34 60 | 3451,65,235,21 61 | 123,423,23,34 62 | 123,123,546,1 63 | 12,45,456,234 64 | 1435,234,456,34 65 | 3451,65,235,21 66 | 123,423,23,34 67 | 123,123,546,1 68 | 12,45,456,234 69 | 1435,234,456,34 70 | 3451,65,235,21 71 | 123,423,23,34 72 | 123,123,546,1 73 | 12,45,456,234 74 | 1435,234,456,34 75 | 3451,65,235,21 76 | 123,423,23,34 77 | 123,123,546,1 78 | 12,45,456,234 79 | 1435,234,456,34 80 | 3451,65,235,21 81 | 123,423,23,34 82 | 123,123,546,1 83 | 12,45,456,234 84 | 1435,234,456,34 85 | 3451,65,235,21 86 | 123,423,23,34 87 | 123,123,546,1 88 | 12,45,456,234 89 | 1435,234,456,34 90 | 3451,65,235,21 91 | 123,423,23,34 92 | 123,123,546,1 93 | 12,45,456,234 94 | 1435,234,456,34 95 | 3451,65,235,21 96 | 123,423,23,34 97 | 123,123,546,1 98 | 12,45,456,234 99 | 1435,234,456,34 100 | 3451,65,235,21 101 | 123,423,23,34 102 | 123,123,546,1 103 | 12,45,456,234 104 | 1435,234,456,34 105 | 3451,65,235,21 106 | 123,423,23,34 107 | 123,123,546,1 108 | 12,45,456,234 109 | 1435,234,456,34 110 | 3451,65,235,21 111 | 123,423,23,34 112 | 123,123,546,1 113 | 12,45,456,234 114 | 1435,234,456,34 115 | 3451,65,235,21 116 | 123,423,23,34 117 | 123,123,546,1 118 | 12,45,456,234 119 | 1435,234,456,34 120 | 3451,65,235,21 121 | 123,423,23,34 122 | 123,123,546,1 123 | 12,45,456,234 124 | 1435,234,456,34 125 | 3451,65,235,21 126 | 123,423,23,34 127 | 123,123,546,1 128 | 12,45,456,234 129 | 1435,234,456,34 130 | 3451,65,235,21 131 | 123,423,23,34 132 | 123,123,546,1 133 | 
12,45,456,234 134 | 1435,234,456,34 135 | 3451,65,235,21 136 | 123,423,23,34 137 | 123,123,546,1 138 | 12,45,456,234 139 | 1435,234,456,34 140 | 3451,65,235,21 141 | 123,423,23,34 142 | 123,123,546,1 143 | 12,45,456,234 144 | 1435,234,456,34 145 | 3451,65,235,21 146 | 123,423,23,34 147 | 123,123,546,1 148 | 12,45,456,234 149 | 1435,234,456,34 150 | 3451,65,235,21 151 | 123,423,23,34 152 | 123,123,546,1 153 | 12,45,456,234 154 | 1435,234,456,34 155 | 3451,65,235,21 156 | 123,423,23,34 157 | 123,123,546,1 158 | 12,45,456,234 159 | 1435,234,456,34 160 | 3451,65,235,21 161 | 123,423,23,34 162 | 123,123,546,1 163 | 12,45,456,234 164 | 1435,234,456,34 165 | 3451,65,235,21 166 | 123,423,23,34 167 | 123,123,546,1 168 | 12,45,456,234 169 | 1435,234,456,34 170 | 3451,65,235,21 171 | 123,423,23,34 172 | 123,123,546,1 173 | 12,45,456,234 174 | 1435,234,456,34 175 | 3451,65,235,21 176 | 123,423,23,34 177 | 123,123,546,1 178 | 12,45,456,234 179 | 1435,234,456,34 180 | 3451,65,235,21 181 | 123,423,23,34 182 | 123,123,546,1 183 | 12,45,456,234 184 | 1435,234,456,34 185 | 3451,65,235,21 186 | 123,423,23,34 187 | 123,123,546,1 188 | 12,45,456,234 189 | 1435,234,456,34 190 | 3451,65,235,21 191 | 123,423,23,34 192 | 123,123,546,1 193 | 12,45,456,234 194 | 1435,234,456,34 195 | 3451,65,235,21 196 | 123,423,23,34 197 | 123,123,546,1 198 | 12,45,456,234 199 | 1435,234,456,34 200 | 3451,65,235,21 201 | 123,423,23,34 202 | 123,123,546,1 203 | 12,45,456,234 204 | 1435,234,456,34 205 | 3451,65,235,21 206 | 123,423,23,34 207 | 123,123,546,1 208 | 12,45,456,234 209 | 1435,234,456,34 210 | 3451,65,235,21 211 | 123,423,23,34 212 | 123,123,546,1 213 | 12,45,456,234 214 | 1435,234,456,34 215 | 3451,65,235,21 216 | 123,423,23,34 217 | 123,123,546,1 218 | 12,45,456,234 219 | 1435,234,456,34 220 | 3451,65,235,21 221 | 123,423,23,34 222 | 123,123,546,1 223 | 12,45,456,234 224 | 1435,234,456,34 225 | 3451,65,235,21 226 | 123,423,23,34 227 | -------------------------------------------------------------------------------- /sampledata/macrodata.txt: -------------------------------------------------------------------------------- 1 | year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint 2 | 1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0 3 | 1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74 4 | 1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09 5 | 1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06 6 | 1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19 7 | 1960.0,2.0,2834.39,1792.9,298.152,460.4,1966.1,29.55,140.2,2.68,5.2,180.671,0.14,2.55 8 | 1960.0,3.0,2839.022,1785.8,296.375,474.676,1967.8,29.75,140.9,2.36,5.6,181.528,2.7,-0.34 9 | 1960.0,4.0,2802.616,1788.2,259.764,476.434,1966.6,29.84,141.1,2.29,6.3,182.287,1.21,1.08 10 | 1961.0,1.0,2819.264,1787.7,266.405,475.854,1984.5,29.81,142.1,2.37,6.8,182.992,-0.4,2.77 11 | 1961.0,2.0,2872.005,1814.3,286.246,480.328,2014.4,29.92,142.9,2.29,7.0,183.691,1.47,0.81 12 | 1961.0,3.0,2918.419,1823.1,310.227,493.828,2041.9,29.98,144.1,2.32,6.8,184.524,0.8,1.52 13 | 1961.0,4.0,2977.83,1859.6,315.463,502.521,2082.0,30.04,145.2,2.6,6.2,185.242,0.8,1.8 14 | 1962.0,1.0,3031.241,1879.4,334.271,520.96,2101.7,30.21,146.4,2.73,5.6,185.874,2.26,0.47 15 | 1962.0,2.0,3064.709,1902.5,331.039,523.066,2125.2,30.22,146.5,2.78,5.5,186.538,0.13,2.65 16 | 
1962.0,3.0,3093.047,1917.9,336.962,538.838,2137.0,30.38,146.7,2.78,5.6,187.323,2.11,0.67 17 | 1962.0,4.0,3100.563,1945.1,325.65,535.912,2154.6,30.44,148.3,2.87,5.5,188.013,0.79,2.08 18 | 1963.0,1.0,3141.087,1958.2,343.721,522.917,2172.5,30.48,149.7,2.9,5.8,188.58,0.53,2.38 19 | 1963.0,2.0,3180.447,1976.9,348.73,518.108,2193.1,30.69,151.3,3.03,5.7,189.242,2.75,0.29 20 | 1963.0,3.0,3240.332,2003.8,360.102,546.893,2217.9,30.75,152.6,3.38,5.5,190.028,0.78,2.6 21 | 1963.0,4.0,3264.967,2020.6,364.534,532.383,2254.6,30.94,153.7,3.52,5.6,190.668,2.46,1.06 22 | 1964.0,1.0,3338.246,2060.5,379.523,529.686,2299.6,30.95,154.8,3.51,5.5,191.245,0.13,3.38 23 | 1964.0,2.0,3376.587,2096.7,377.778,526.175,2362.1,31.02,156.8,3.47,5.2,191.889,0.9,2.57 24 | 1964.0,3.0,3422.469,2135.2,386.754,522.008,2392.7,31.12,159.2,3.53,5.0,192.631,1.29,2.25 25 | 1964.0,4.0,3431.957,2141.2,389.91,514.603,2420.4,31.28,160.7,3.76,5.0,193.223,2.05,1.71 26 | 1965.0,1.0,3516.251,2188.8,429.145,508.006,2447.4,31.38,162.0,3.93,4.9,193.709,1.28,2.65 27 | 1965.0,2.0,3563.96,2213.0,429.119,508.931,2474.5,31.58,163.1,3.84,4.7,194.303,2.54,1.3 28 | 1965.0,3.0,3636.285,2251.0,444.444,529.446,2542.6,31.65,166.0,3.93,4.4,194.997,0.89,3.04 29 | 1965.0,4.0,3724.014,2314.3,446.493,544.121,2594.1,31.88,169.1,4.35,4.1,195.539,2.9,1.46 30 | 1966.0,1.0,3815.423,2348.5,484.244,556.593,2618.4,32.28,171.8,4.62,3.9,195.999,4.99,-0.37 31 | 1966.0,2.0,3828.124,2354.5,475.408,571.371,2624.7,32.45,170.3,4.65,3.8,196.56,2.1,2.55 32 | 1966.0,3.0,3853.301,2381.5,470.697,594.514,2657.8,32.85,171.2,5.23,3.8,197.207,4.9,0.33 33 | 1966.0,4.0,3884.52,2391.4,472.957,599.528,2688.2,32.9,171.9,5.0,3.7,197.736,0.61,4.39 34 | 1967.0,1.0,3918.74,2405.3,460.007,640.682,2728.4,33.1,174.2,4.22,3.8,198.206,2.42,1.8 35 | 1967.0,2.0,3919.556,2438.1,440.393,631.43,2750.8,33.4,178.1,3.78,3.8,198.712,3.61,0.17 36 | 1967.0,3.0,3950.826,2450.6,453.033,641.504,2777.1,33.7,181.6,4.42,3.8,199.311,3.58,0.84 37 | 1967.0,4.0,3980.97,2465.7,462.834,640.234,2797.4,34.1,184.3,4.9,3.9,199.808,4.72,0.18 38 | 1968.0,1.0,4063.013,2524.6,472.907,651.378,2846.2,34.4,186.6,5.18,3.7,200.208,3.5,1.67 39 | 1968.0,2.0,4131.998,2563.3,492.026,646.145,2893.5,34.9,190.5,5.5,3.5,200.706,5.77,-0.28 40 | 1968.0,3.0,4160.267,2611.5,476.053,640.615,2899.3,35.3,194.0,5.21,3.5,201.29,4.56,0.65 41 | 1968.0,4.0,4178.293,2623.5,480.998,636.729,2918.4,35.7,198.7,5.85,3.4,201.76,4.51,1.34 42 | 1969.0,1.0,4244.1,2652.9,512.686,633.224,2923.4,36.3,200.7,6.08,3.4,202.161,6.67,-0.58 43 | 1969.0,2.0,4256.46,2669.8,508.601,623.16,2952.9,36.8,201.7,6.49,3.4,202.677,5.47,1.02 44 | 1969.0,3.0,4283.378,2682.7,520.36,623.613,3012.9,37.3,202.9,7.02,3.6,203.302,5.4,1.63 45 | 1969.0,4.0,4263.261,2704.1,492.334,606.9,3034.9,37.9,206.2,7.64,3.6,203.849,6.38,1.26 46 | 1970.0,1.0,4256.573,2720.7,476.925,594.888,3050.1,38.5,206.7,6.76,4.2,204.401,6.28,0.47 47 | 1970.0,2.0,4264.289,2733.2,478.419,576.257,3103.5,38.9,208.0,6.66,4.8,205.052,4.13,2.52 48 | 1970.0,3.0,4302.259,2757.1,486.594,567.743,3145.4,39.4,212.9,6.15,5.2,205.788,5.11,1.04 49 | 1970.0,4.0,4256.637,2749.6,458.406,564.666,3135.1,39.9,215.5,4.86,5.8,206.466,5.04,-0.18 50 | 1971.0,1.0,4374.016,2802.2,517.935,542.709,3197.3,40.1,220.0,3.65,5.9,207.065,2.0,1.65 51 | 1971.0,2.0,4398.829,2827.9,533.986,534.905,3245.3,40.6,224.9,4.76,5.9,207.661,4.96,-0.19 52 | 1971.0,3.0,4433.943,2850.4,541.01,532.646,3259.7,40.9,227.2,4.7,6.0,208.345,2.94,1.75 53 | 1971.0,4.0,4446.264,2897.8,524.085,516.14,3294.2,41.2,230.1,3.87,6.0,208.917,2.92,0.95 54 | 
1972.0,1.0,4525.769,2936.5,561.147,518.192,3314.9,41.5,235.6,3.55,5.8,209.386,2.9,0.64 55 | 1972.0,2.0,4633.101,2992.6,595.495,526.473,3346.1,41.8,238.8,3.86,5.7,209.896,2.88,0.98 56 | 1972.0,3.0,4677.503,3038.8,603.97,498.116,3414.6,42.2,245.0,4.47,5.6,210.479,3.81,0.66 57 | 1972.0,4.0,4754.546,3110.1,607.104,496.54,3550.5,42.7,251.5,5.09,5.3,210.985,4.71,0.38 58 | 1973.0,1.0,4876.166,3167.0,645.654,504.838,3590.7,43.7,252.7,5.98,5.0,211.42,9.26,-3.28 59 | 1973.0,2.0,4932.571,3165.4,675.837,497.033,3626.2,44.2,257.5,7.19,4.9,211.909,4.55,2.64 60 | 1973.0,3.0,4906.252,3176.7,649.412,475.897,3644.4,45.6,259.0,8.06,4.8,212.475,12.47,-4.41 61 | 1973.0,4.0,4953.05,3167.4,674.253,476.174,3688.9,46.8,263.8,7.68,4.8,212.932,10.39,-2.71 62 | 1974.0,1.0,4909.617,3139.7,631.23,491.043,3632.3,48.1,267.2,7.8,5.1,213.361,10.96,-3.16 63 | 1974.0,2.0,4922.188,3150.6,628.102,490.177,3601.1,49.3,269.3,7.89,5.2,213.854,9.86,-1.96 64 | 1974.0,3.0,4873.52,3163.6,592.672,492.586,3612.4,51.0,272.3,8.16,5.6,214.451,13.56,-5.4 65 | 1974.0,4.0,4854.34,3117.3,598.306,496.176,3596.0,52.3,273.9,6.96,6.6,214.931,10.07,-3.11 66 | 1975.0,1.0,4795.295,3143.4,493.212,490.603,3581.9,53.0,276.2,5.53,8.2,215.353,5.32,0.22 67 | 1975.0,2.0,4831.942,3195.8,476.085,486.679,3749.3,54.0,283.7,5.57,8.9,215.973,7.48,-1.91 68 | 1975.0,3.0,4913.328,3241.4,516.402,498.836,3698.6,54.9,285.4,6.27,8.5,216.587,6.61,-0.34 69 | 1975.0,4.0,4977.511,3275.7,530.596,500.141,3736.0,55.8,288.4,5.26,8.3,217.095,6.5,-1.24 70 | 1976.0,1.0,5090.663,3341.2,585.541,495.568,3791.0,56.1,294.7,4.91,7.7,217.528,2.14,2.77 71 | 1976.0,2.0,5128.947,3371.8,610.513,494.532,3822.2,57.0,297.2,5.28,7.6,218.035,6.37,-1.09 72 | 1976.0,3.0,5154.072,3407.5,611.646,493.141,3856.7,57.9,302.0,5.05,7.7,218.644,6.27,-1.22 73 | 1976.0,4.0,5191.499,3451.8,615.898,494.415,3884.4,58.7,308.3,4.57,7.8,219.179,5.49,-0.92 74 | 1977.0,1.0,5251.762,3491.3,646.198,498.509,3887.5,60.0,316.0,4.6,7.5,219.684,8.76,-4.16 75 | 1977.0,2.0,5356.131,3510.6,696.141,506.695,3931.8,60.8,320.2,5.06,7.1,220.239,5.3,-0.24 76 | 1977.0,3.0,5451.921,3544.1,734.078,509.605,3990.8,61.6,326.4,5.82,6.9,220.904,5.23,0.59 77 | 1977.0,4.0,5450.793,3597.5,713.356,504.584,4071.2,62.7,334.4,6.2,6.6,221.477,7.08,-0.88 78 | 1978.0,1.0,5469.405,3618.5,727.504,506.314,4096.4,63.9,339.9,6.34,6.3,221.991,7.58,-1.24 79 | 1978.0,2.0,5684.569,3695.9,777.454,518.366,4143.4,65.5,347.6,6.72,6.0,222.585,9.89,-3.18 80 | 1978.0,3.0,5740.3,3711.4,801.452,520.199,4177.1,67.1,353.3,7.64,6.0,223.271,9.65,-2.01 81 | 1978.0,4.0,5816.222,3741.3,819.689,524.782,4209.8,68.5,358.6,9.02,5.9,223.865,8.26,0.76 82 | 1979.0,1.0,5825.949,3760.2,819.556,525.524,4255.9,70.6,368.0,9.42,5.9,224.438,12.08,-2.66 83 | 1979.0,2.0,5831.418,3758.0,817.66,532.04,4226.1,73.0,377.2,9.3,5.7,225.055,13.37,-4.07 84 | 1979.0,3.0,5873.335,3794.9,801.742,531.232,4250.3,75.2,380.8,10.49,5.9,225.801,11.88,-1.38 85 | 1979.0,4.0,5889.495,3805.0,786.817,531.126,4284.3,78.0,385.8,11.94,5.9,226.451,14.62,-2.68 86 | 1980.0,1.0,5908.467,3798.4,781.114,548.115,4296.2,80.9,383.8,13.75,6.3,227.061,14.6,-0.85 87 | 1980.0,2.0,5787.373,3712.2,710.64,561.895,4236.1,82.6,394.0,7.9,7.3,227.726,8.32,-0.42 88 | 1980.0,3.0,5776.617,3752.0,656.477,554.292,4279.7,84.7,409.0,10.34,7.7,228.417,10.04,0.3 89 | 1980.0,4.0,5883.46,3802.0,723.22,556.13,4368.1,87.2,411.3,14.75,7.4,228.937,11.64,3.11 90 | 1981.0,1.0,6005.717,3822.8,795.091,567.618,4358.1,89.1,427.4,13.95,7.4,229.403,8.62,5.32 91 | 1981.0,2.0,5957.795,3822.8,757.24,584.54,4358.6,91.5,426.9,15.33,7.4,229.966,10.63,4.69 92 | 
1981.0,3.0,6030.184,3838.3,804.242,583.89,4455.4,93.4,428.4,14.58,7.4,230.641,8.22,6.36 93 | 1981.0,4.0,5955.062,3809.3,773.053,590.125,4464.4,94.4,442.7,11.33,8.2,231.157,4.26,7.07 94 | 1982.0,1.0,5857.333,3833.9,692.514,591.043,4469.6,95.0,447.1,12.95,8.8,231.645,2.53,10.42 95 | 1982.0,2.0,5889.074,3847.7,691.9,596.403,4500.8,97.5,448.0,11.97,9.4,232.188,10.39,1.58 96 | 1982.0,3.0,5866.37,3877.2,683.825,605.37,4520.6,98.1,464.5,8.1,9.9,232.816,2.45,5.65 97 | 1982.0,4.0,5871.001,3947.9,622.93,623.307,4536.4,97.9,477.2,7.96,10.7,233.322,-0.82,8.77 98 | 1983.0,1.0,5944.02,3986.6,645.11,630.873,4572.2,98.8,493.2,8.22,10.4,233.781,3.66,4.56 99 | 1983.0,2.0,6077.619,4065.7,707.372,644.322,4605.5,99.8,507.8,8.69,10.1,234.307,4.03,4.66 100 | 1983.0,3.0,6197.468,4137.6,754.937,662.412,4674.7,100.8,517.2,8.99,9.4,234.907,3.99,5.01 101 | 1983.0,4.0,6325.574,4203.2,834.427,639.197,4771.1,102.1,525.1,8.89,8.5,235.385,5.13,3.76 102 | 1984.0,1.0,6448.264,4239.2,921.763,644.635,4875.4,103.3,535.0,9.43,7.9,235.839,4.67,4.76 103 | 1984.0,2.0,6559.594,4299.9,952.841,664.839,4959.4,104.1,540.9,9.94,7.5,236.348,3.09,6.85 104 | 1984.0,3.0,6623.343,4333.0,974.989,662.294,5036.6,105.1,543.7,10.19,7.4,236.976,3.82,6.37 105 | 1984.0,4.0,6677.264,4390.1,958.993,684.282,5084.5,105.7,557.0,8.14,7.3,237.468,2.28,5.87 106 | 1985.0,1.0,6740.275,4464.6,927.375,691.613,5072.0,107.0,570.4,8.25,7.3,237.9,4.89,3.36 107 | 1985.0,2.0,6797.344,4505.2,943.383,708.524,5172.7,107.7,589.1,7.17,7.3,238.466,2.61,4.56 108 | 1985.0,3.0,6903.523,4590.8,932.959,732.305,5140.7,108.5,607.8,7.13,7.2,239.113,2.96,4.17 109 | 1985.0,4.0,6955.918,4600.9,969.434,732.026,5193.9,109.9,621.4,7.14,7.0,239.638,5.13,2.01 110 | 1986.0,1.0,7022.757,4639.3,967.442,728.125,5255.8,108.7,641.0,6.56,7.0,240.094,-4.39,10.95 111 | 1986.0,2.0,7050.969,4688.7,945.972,751.334,5315.5,109.5,670.3,6.06,7.2,240.651,2.93,3.13 112 | 1986.0,3.0,7118.95,4770.7,916.315,779.77,5343.3,110.2,694.9,5.31,7.0,241.274,2.55,2.76 113 | 1986.0,4.0,7153.359,4799.4,917.736,767.671,5346.5,111.4,730.2,5.44,6.8,241.784,4.33,1.1 114 | 1987.0,1.0,7193.019,4792.1,945.776,772.247,5379.4,112.7,743.9,5.61,6.6,242.252,4.64,0.97 115 | 1987.0,2.0,7269.51,4856.3,947.1,782.962,5321.0,113.8,743.0,5.67,6.3,242.804,3.89,1.79 116 | 1987.0,3.0,7332.558,4910.4,948.055,783.804,5416.2,115.0,756.2,6.19,6.0,243.446,4.2,1.99 117 | 1987.0,4.0,7458.022,4922.2,1021.98,795.467,5493.1,116.0,756.2,5.76,5.9,243.981,3.46,2.29 118 | 1988.0,1.0,7496.6,5004.4,964.398,773.851,5562.1,117.2,768.1,5.76,5.7,244.445,4.12,1.64 119 | 1988.0,2.0,7592.881,5040.8,987.858,765.98,5614.3,118.5,781.4,6.48,5.5,245.021,4.41,2.07 120 | 1988.0,3.0,7632.082,5080.6,994.204,760.245,5657.5,119.9,783.3,7.22,5.5,245.693,4.7,2.52 121 | 1988.0,4.0,7733.991,5140.4,1007.371,783.065,5708.5,121.2,785.7,8.03,5.3,246.224,4.31,3.72 122 | 1989.0,1.0,7806.603,5159.3,1045.975,767.024,5773.4,123.1,779.2,8.67,5.2,246.721,6.22,2.44 123 | 1989.0,2.0,7865.016,5182.4,1033.753,784.275,5749.8,124.5,777.8,8.15,5.2,247.342,4.52,3.63 124 | 1989.0,3.0,7927.393,5236.1,1021.604,791.819,5787.0,125.4,786.6,7.76,5.3,248.067,2.88,4.88 125 | 1989.0,4.0,7944.697,5261.7,1011.119,787.844,5831.3,127.5,795.4,7.65,5.4,248.659,6.64,1.01 126 | 1990.0,1.0,8027.693,5303.3,1021.07,799.681,5875.1,128.9,806.2,7.8,5.3,249.306,4.37,3.44 127 | 1990.0,2.0,8059.598,5320.8,1021.36,800.639,5913.9,130.5,810.1,7.7,5.3,250.132,4.93,2.76 128 | 1990.0,3.0,8059.476,5341.0,997.319,793.513,5918.1,133.4,819.8,7.33,5.7,251.057,8.79,-1.46 129 | 
1990.0,4.0,7988.864,5299.5,934.248,800.525,5878.2,134.7,827.2,6.67,6.1,251.889,3.88,2.79 130 | 1991.0,1.0,7950.164,5284.4,896.21,806.775,5896.3,135.1,843.2,5.83,6.6,252.643,1.19,4.65 131 | 1991.0,2.0,8003.822,5324.7,891.704,809.081,5941.1,136.2,861.5,5.54,6.8,253.493,3.24,2.29 132 | 1991.0,3.0,8037.538,5345.0,913.904,793.987,5953.6,137.2,878.0,5.18,6.9,254.435,2.93,2.25 133 | 1991.0,4.0,8069.046,5342.6,948.891,778.378,5992.4,138.3,910.4,4.14,7.1,255.214,3.19,0.95 134 | 1992.0,1.0,8157.616,5434.5,927.796,778.568,6082.9,139.4,943.8,3.88,7.4,255.992,3.17,0.71 135 | 1992.0,2.0,8244.294,5466.7,988.912,777.762,6129.5,140.5,963.2,3.5,7.6,256.894,3.14,0.36 136 | 1992.0,3.0,8329.361,5527.1,999.135,786.639,6160.6,141.7,1003.8,2.97,7.6,257.861,3.4,-0.44 137 | 1992.0,4.0,8417.016,5594.6,1030.758,787.064,6248.2,142.8,1030.4,3.12,7.4,258.679,3.09,0.02 138 | 1993.0,1.0,8432.485,5617.2,1054.979,762.901,6156.5,143.8,1047.6,2.92,7.2,259.414,2.79,0.13 139 | 1993.0,2.0,8486.435,5671.1,1063.263,752.158,6252.3,144.5,1084.5,3.02,7.1,260.255,1.94,1.08 140 | 1993.0,3.0,8531.108,5732.7,1062.514,744.227,6265.7,145.6,1113.0,3.0,6.8,261.163,3.03,-0.04 141 | 1993.0,4.0,8643.769,5783.7,1118.583,748.102,6358.1,146.3,1131.6,3.05,6.6,261.919,1.92,1.13 142 | 1994.0,1.0,8727.919,5848.1,1166.845,721.288,6332.6,147.2,1141.1,3.48,6.6,262.631,2.45,1.02 143 | 1994.0,2.0,8847.303,5891.5,1234.855,717.197,6440.6,148.4,1150.5,4.2,6.2,263.436,3.25,0.96 144 | 1994.0,3.0,8904.289,5938.7,1212.655,736.89,6487.9,149.4,1150.1,4.68,6.0,264.301,2.69,2.0 145 | 1994.0,4.0,9003.18,5997.3,1269.19,716.702,6574.0,150.5,1151.4,5.53,5.6,265.044,2.93,2.6 146 | 1995.0,1.0,9025.267,6004.3,1282.09,715.326,6616.6,151.8,1149.3,5.72,5.5,265.755,3.44,2.28 147 | 1995.0,2.0,9044.668,6053.5,1247.61,712.492,6617.2,152.6,1145.4,5.52,5.7,266.557,2.1,3.42 148 | 1995.0,3.0,9120.684,6107.6,1235.601,707.649,6666.8,153.5,1137.3,5.32,5.7,267.456,2.35,2.97 149 | 1995.0,4.0,9184.275,6150.6,1270.392,681.081,6706.2,154.7,1123.5,5.17,5.6,268.151,3.11,2.05 150 | 1996.0,1.0,9247.188,6206.9,1287.128,695.265,6777.7,156.1,1124.8,4.91,5.5,268.853,3.6,1.31 151 | 1996.0,2.0,9407.052,6277.1,1353.795,705.172,6850.6,157.0,1112.4,5.09,5.5,269.667,2.3,2.79 152 | 1996.0,3.0,9488.879,6314.6,1422.059,692.741,6908.9,158.2,1086.1,5.04,5.3,270.581,3.05,2.0 153 | 1996.0,4.0,9592.458,6366.1,1418.193,690.744,6946.8,159.4,1081.5,4.99,5.3,271.36,3.02,1.97 154 | 1997.0,1.0,9666.235,6430.2,1451.304,681.445,7008.9,159.9,1063.8,5.1,5.2,272.083,1.25,3.85 155 | 1997.0,2.0,9809.551,6456.2,1543.976,693.525,7061.5,160.4,1066.2,5.01,5.0,272.912,1.25,3.76 156 | 1997.0,3.0,9932.672,6566.0,1571.426,691.261,7142.4,161.5,1065.5,5.02,4.9,273.852,2.73,2.29 157 | 1997.0,4.0,10008.874,6641.1,1596.523,690.311,7241.5,162.0,1074.4,5.11,4.7,274.626,1.24,3.88 158 | 1998.0,1.0,10103.425,6707.2,1672.732,668.783,7406.2,162.2,1076.1,5.02,4.6,275.304,0.49,4.53 159 | 1998.0,2.0,10194.277,6822.6,1652.716,687.184,7512.0,163.2,1075.0,4.98,4.4,276.115,2.46,2.52 160 | 1998.0,3.0,10328.787,6913.1,1700.071,681.472,7591.0,163.9,1086.0,4.49,4.5,277.003,1.71,2.78 161 | 1998.0,4.0,10507.575,7019.1,1754.743,688.147,7646.5,164.7,1097.8,4.38,4.4,277.79,1.95,2.43 162 | 1999.0,1.0,10601.179,7088.3,1809.993,683.601,7698.4,165.9,1101.9,4.39,4.3,278.451,2.9,1.49 163 | 1999.0,2.0,10684.049,7199.9,1803.674,683.594,7716.0,166.7,1098.7,4.54,4.3,279.295,1.92,2.62 164 | 1999.0,3.0,10819.914,7286.4,1848.949,697.936,7765.9,168.1,1102.3,4.75,4.2,280.203,3.35,1.41 165 | 
1999.0,4.0,11014.254,7389.2,1914.567,713.445,7887.7,169.3,1121.9,5.2,4.1,280.976,2.85,2.35 166 | 2000.0,1.0,11043.044,7501.3,1887.836,685.216,8053.4,170.9,1113.5,5.63,4.0,281.653,3.76,1.87 167 | 2000.0,2.0,11258.454,7571.8,2018.529,712.641,8135.9,172.7,1103.0,5.81,3.9,282.385,4.19,1.62 168 | 2000.0,3.0,11267.867,7645.9,1986.956,698.827,8222.3,173.9,1098.7,6.07,4.0,283.19,2.77,3.3 169 | 2000.0,4.0,11334.544,7713.5,1987.845,695.597,8234.6,175.6,1097.7,5.7,3.9,283.9,3.89,1.81 170 | 2001.0,1.0,11297.171,7744.3,1882.691,710.403,8296.5,176.4,1114.9,4.39,4.2,284.55,1.82,2.57 171 | 2001.0,2.0,11371.251,7773.5,1876.65,725.623,8273.7,177.4,1139.7,3.54,4.4,285.267,2.26,1.28 172 | 2001.0,3.0,11340.075,7807.7,1837.074,730.493,8484.5,177.6,1166.0,2.72,4.8,286.047,0.45,2.27 173 | 2001.0,4.0,11380.128,7930.0,1731.189,739.318,8385.5,177.7,1190.9,1.74,5.5,286.728,0.23,1.51 174 | 2002.0,1.0,11477.868,7957.3,1789.327,756.915,8611.6,179.3,1185.9,1.75,5.7,287.328,3.59,-1.84 175 | 2002.0,2.0,11538.77,7997.8,1810.779,774.408,8658.9,180.0,1199.5,1.7,5.8,288.028,1.56,0.14 176 | 2002.0,3.0,11596.43,8052.0,1814.531,786.673,8629.2,181.2,1204.0,1.61,5.7,288.783,2.66,-1.05 177 | 2002.0,4.0,11598.824,8080.6,1813.219,799.967,8649.6,182.6,1226.8,1.2,5.8,289.421,3.08,-1.88 178 | 2003.0,1.0,11645.819,8122.3,1813.141,800.196,8681.3,183.2,1248.4,1.14,5.9,290.019,1.31,-0.17 179 | 2003.0,2.0,11738.706,8197.8,1823.698,838.775,8812.5,183.7,1287.9,0.96,6.2,290.704,1.09,-0.13 180 | 2003.0,3.0,11935.461,8312.1,1889.883,839.598,8935.4,184.9,1297.3,0.94,6.1,291.449,2.6,-1.67 181 | 2003.0,4.0,12042.817,8358.0,1959.783,845.722,8986.4,186.3,1306.1,0.9,5.8,292.057,3.02,-2.11 182 | 2004.0,1.0,12127.623,8437.6,1970.015,856.57,9025.9,187.4,1332.1,0.94,5.7,292.635,2.35,-1.42 183 | 2004.0,2.0,12213.818,8483.2,2055.58,861.44,9115.0,189.1,1340.5,1.21,5.6,293.31,3.61,-2.41 184 | 2004.0,3.0,12303.533,8555.8,2082.231,876.385,9175.9,190.8,1361.0,1.63,5.4,294.066,3.58,-1.95 185 | 2004.0,4.0,12410.282,8654.2,2125.152,865.596,9303.4,191.8,1366.6,2.2,5.4,294.741,2.09,0.11 186 | 2005.0,1.0,12534.113,8719.0,2170.299,869.204,9189.6,193.8,1357.8,2.69,5.3,295.308,4.15,-1.46 187 | 2005.0,2.0,12587.535,8802.9,2131.468,870.044,9253.0,194.7,1366.6,3.01,5.1,295.994,1.85,1.16 188 | 2005.0,3.0,12683.153,8865.6,2154.949,890.394,9308.0,199.2,1375.0,3.52,5.0,296.77,9.14,-5.62 189 | 2005.0,4.0,12748.699,8888.5,2232.193,875.557,9358.7,199.4,1380.6,4.0,4.9,297.435,0.4,3.6 190 | 2006.0,1.0,12915.938,8986.6,2264.721,900.511,9533.8,200.7,1380.5,4.51,4.7,298.061,2.6,1.91 191 | 2006.0,2.0,12962.462,9035.0,2261.247,892.839,9617.3,202.7,1369.2,4.82,4.7,298.766,3.97,0.85 192 | 2006.0,3.0,12965.916,9090.7,2229.636,892.002,9662.5,201.9,1369.4,4.9,4.7,299.593,-1.58,6.48 193 | 2006.0,4.0,13060.679,9181.6,2165.966,894.404,9788.8,203.574,1373.6,4.92,4.4,300.32,3.3,1.62 194 | 2007.0,1.0,13099.901,9265.1,2132.609,882.766,9830.2,205.92,1379.7,4.95,4.5,300.977,4.58,0.36 195 | 2007.0,2.0,13203.977,9291.5,2162.214,898.713,9842.7,207.338,1370.0,4.72,4.5,301.714,2.75,1.97 196 | 2007.0,3.0,13321.109,9335.6,2166.491,918.983,9883.9,209.133,1379.2,4.0,4.7,302.509,3.45,0.55 197 | 2007.0,4.0,13391.249,9363.6,2123.426,925.11,9886.2,212.495,1377.4,3.01,4.8,303.204,6.38,-3.37 198 | 2008.0,1.0,13366.865,9349.6,2082.886,943.372,9826.8,213.997,1384.0,1.56,4.9,303.803,2.82,-1.26 199 | 2008.0,2.0,13415.266,9351.0,2026.518,961.28,10059.0,218.61,1409.3,1.74,5.4,304.483,8.53,-6.79 200 | 2008.0,3.0,13324.6,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6.0,305.27,-3.16,4.33 201 | 
2008.0,4.0,13141.92,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91 202 | 2009.0,1.0,12925.41,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71 203 | 2009.0,2.0,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19 204 | 2009.0,3.0,12990.341,9256.0,1486.398,1044.088,10040.6,216.385,1673.9,0.12,9.6,308.013,3.56,-3.44 205 | -------------------------------------------------------------------------------- /sampledata/tips.csv.sb-3098d7aa-T8ASyL: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukepolson/Python-Self-Learning/a8c3a7772a3bc33bf329cfeb6e003bb5c26b7372/sampledata/tips.csv.sb-3098d7aa-T8ASyL -------------------------------------------------------------------------------- /sampledata/tips.txt: -------------------------------------------------------------------------------- 1 | total_bill,tip,smoker,day,time,size 2 | 16.99,1.01,No,Sun,Dinner,2 3 | 10.34,1.66,No,Sun,Dinner,3 4 | 21.01,3.5,No,Sun,Dinner,3 5 | 23.68,3.31,No,Sun,Dinner,2 6 | 24.59,3.61,No,Sun,Dinner,4 7 | 25.29,4.71,No,Sun,Dinner,4 8 | 8.77,2.0,No,Sun,Dinner,2 9 | 26.88,3.12,No,Sun,Dinner,4 10 | 15.04,1.96,No,Sun,Dinner,2 11 | 14.78,3.23,No,Sun,Dinner,2 12 | 10.27,1.71,No,Sun,Dinner,2 13 | 35.26,5.0,No,Sun,Dinner,4 14 | 15.42,1.57,No,Sun,Dinner,2 15 | 18.43,3.0,No,Sun,Dinner,4 16 | 14.83,3.02,No,Sun,Dinner,2 17 | 21.58,3.92,No,Sun,Dinner,2 18 | 10.33,1.67,No,Sun,Dinner,3 19 | 16.29,3.71,No,Sun,Dinner,3 20 | 16.97,3.5,No,Sun,Dinner,3 21 | 20.65,3.35,No,Sat,Dinner,3 22 | 17.92,4.08,No,Sat,Dinner,2 23 | 20.29,2.75,No,Sat,Dinner,2 24 | 15.77,2.23,No,Sat,Dinner,2 25 | 39.42,7.58,No,Sat,Dinner,4 26 | 19.82,3.18,No,Sat,Dinner,2 27 | 17.81,2.34,No,Sat,Dinner,4 28 | 13.37,2.0,No,Sat,Dinner,2 29 | 12.69,2.0,No,Sat,Dinner,2 30 | 21.7,4.3,No,Sat,Dinner,2 31 | 19.65,3.0,No,Sat,Dinner,2 32 | 9.55,1.45,No,Sat,Dinner,2 33 | 18.35,2.5,No,Sat,Dinner,4 34 | 15.06,3.0,No,Sat,Dinner,2 35 | 20.69,2.45,No,Sat,Dinner,4 36 | 17.78,3.27,No,Sat,Dinner,2 37 | 24.06,3.6,No,Sat,Dinner,3 38 | 16.31,2.0,No,Sat,Dinner,3 39 | 16.93,3.07,No,Sat,Dinner,3 40 | 18.69,2.31,No,Sat,Dinner,3 41 | 31.27,5.0,No,Sat,Dinner,3 42 | 16.04,2.24,No,Sat,Dinner,3 43 | 17.46,2.54,No,Sun,Dinner,2 44 | 13.94,3.06,No,Sun,Dinner,2 45 | 9.68,1.32,No,Sun,Dinner,2 46 | 30.4,5.6,No,Sun,Dinner,4 47 | 18.29,3.0,No,Sun,Dinner,2 48 | 22.23,5.0,No,Sun,Dinner,2 49 | 32.4,6.0,No,Sun,Dinner,4 50 | 28.55,2.05,No,Sun,Dinner,3 51 | 18.04,3.0,No,Sun,Dinner,2 52 | 12.54,2.5,No,Sun,Dinner,2 53 | 10.29,2.6,No,Sun,Dinner,2 54 | 34.81,5.2,No,Sun,Dinner,4 55 | 9.94,1.56,No,Sun,Dinner,2 56 | 25.56,4.34,No,Sun,Dinner,4 57 | 19.49,3.51,No,Sun,Dinner,2 58 | 38.01,3.0,Yes,Sat,Dinner,4 59 | 26.41,1.5,No,Sat,Dinner,2 60 | 11.24,1.76,Yes,Sat,Dinner,2 61 | 48.27,6.73,No,Sat,Dinner,4 62 | 20.29,3.21,Yes,Sat,Dinner,2 63 | 13.81,2.0,Yes,Sat,Dinner,2 64 | 11.02,1.98,Yes,Sat,Dinner,2 65 | 18.29,3.76,Yes,Sat,Dinner,4 66 | 17.59,2.64,No,Sat,Dinner,3 67 | 20.08,3.15,No,Sat,Dinner,3 68 | 16.45,2.47,No,Sat,Dinner,2 69 | 3.07,1.0,Yes,Sat,Dinner,1 70 | 20.23,2.01,No,Sat,Dinner,2 71 | 15.01,2.09,Yes,Sat,Dinner,2 72 | 12.02,1.97,No,Sat,Dinner,2 73 | 17.07,3.0,No,Sat,Dinner,3 74 | 26.86,3.14,Yes,Sat,Dinner,2 75 | 25.28,5.0,Yes,Sat,Dinner,2 76 | 14.73,2.2,No,Sat,Dinner,2 77 | 10.51,1.25,No,Sat,Dinner,2 78 | 17.92,3.08,Yes,Sat,Dinner,2 79 | 27.2,4.0,No,Thur,Lunch,4 80 | 22.76,3.0,No,Thur,Lunch,2 81 | 17.29,2.71,No,Thur,Lunch,2 82 | 19.44,3.0,Yes,Thur,Lunch,2 83 | 
16.66,3.4,No,Thur,Lunch,2 84 | 10.07,1.83,No,Thur,Lunch,1 85 | 32.68,5.0,Yes,Thur,Lunch,2 86 | 15.98,2.03,No,Thur,Lunch,2 87 | 34.83,5.17,No,Thur,Lunch,4 88 | 13.03,2.0,No,Thur,Lunch,2 89 | 18.28,4.0,No,Thur,Lunch,2 90 | 24.71,5.85,No,Thur,Lunch,2 91 | 21.16,3.0,No,Thur,Lunch,2 92 | 28.97,3.0,Yes,Fri,Dinner,2 93 | 22.49,3.5,No,Fri,Dinner,2 94 | 5.75,1.0,Yes,Fri,Dinner,2 95 | 16.32,4.3,Yes,Fri,Dinner,2 96 | 22.75,3.25,No,Fri,Dinner,2 97 | 40.17,4.73,Yes,Fri,Dinner,4 98 | 27.28,4.0,Yes,Fri,Dinner,2 99 | 12.03,1.5,Yes,Fri,Dinner,2 100 | 21.01,3.0,Yes,Fri,Dinner,2 101 | 12.46,1.5,No,Fri,Dinner,2 102 | 11.35,2.5,Yes,Fri,Dinner,2 103 | 15.38,3.0,Yes,Fri,Dinner,2 104 | 44.3,2.5,Yes,Sat,Dinner,3 105 | 22.42,3.48,Yes,Sat,Dinner,2 106 | 20.92,4.08,No,Sat,Dinner,2 107 | 15.36,1.64,Yes,Sat,Dinner,2 108 | 20.49,4.06,Yes,Sat,Dinner,2 109 | 25.21,4.29,Yes,Sat,Dinner,2 110 | 18.24,3.76,No,Sat,Dinner,2 111 | 14.31,4.0,Yes,Sat,Dinner,2 112 | 14.0,3.0,No,Sat,Dinner,2 113 | 7.25,1.0,No,Sat,Dinner,1 114 | 38.07,4.0,No,Sun,Dinner,3 115 | 23.95,2.55,No,Sun,Dinner,2 116 | 25.71,4.0,No,Sun,Dinner,3 117 | 17.31,3.5,No,Sun,Dinner,2 118 | 29.93,5.07,No,Sun,Dinner,4 119 | 10.65,1.5,No,Thur,Lunch,2 120 | 12.43,1.8,No,Thur,Lunch,2 121 | 24.08,2.92,No,Thur,Lunch,4 122 | 11.69,2.31,No,Thur,Lunch,2 123 | 13.42,1.68,No,Thur,Lunch,2 124 | 14.26,2.5,No,Thur,Lunch,2 125 | 15.95,2.0,No,Thur,Lunch,2 126 | 12.48,2.52,No,Thur,Lunch,2 127 | 29.8,4.2,No,Thur,Lunch,6 128 | 8.52,1.48,No,Thur,Lunch,2 129 | 14.52,2.0,No,Thur,Lunch,2 130 | 11.38,2.0,No,Thur,Lunch,2 131 | 22.82,2.18,No,Thur,Lunch,3 132 | 19.08,1.5,No,Thur,Lunch,2 133 | 20.27,2.83,No,Thur,Lunch,2 134 | 11.17,1.5,No,Thur,Lunch,2 135 | 12.26,2.0,No,Thur,Lunch,2 136 | 18.26,3.25,No,Thur,Lunch,2 137 | 8.51,1.25,No,Thur,Lunch,2 138 | 10.33,2.0,No,Thur,Lunch,2 139 | 14.15,2.0,No,Thur,Lunch,2 140 | 16.0,2.0,Yes,Thur,Lunch,2 141 | 13.16,2.75,No,Thur,Lunch,2 142 | 17.47,3.5,No,Thur,Lunch,2 143 | 34.3,6.7,No,Thur,Lunch,6 144 | 41.19,5.0,No,Thur,Lunch,5 145 | 27.05,5.0,No,Thur,Lunch,6 146 | 16.43,2.3,No,Thur,Lunch,2 147 | 8.35,1.5,No,Thur,Lunch,2 148 | 18.64,1.36,No,Thur,Lunch,3 149 | 11.87,1.63,No,Thur,Lunch,2 150 | 9.78,1.73,No,Thur,Lunch,2 151 | 7.51,2.0,No,Thur,Lunch,2 152 | 14.07,2.5,No,Sun,Dinner,2 153 | 13.13,2.0,No,Sun,Dinner,2 154 | 17.26,2.74,No,Sun,Dinner,3 155 | 24.55,2.0,No,Sun,Dinner,4 156 | 19.77,2.0,No,Sun,Dinner,4 157 | 29.85,5.14,No,Sun,Dinner,5 158 | 48.17,5.0,No,Sun,Dinner,6 159 | 25.0,3.75,No,Sun,Dinner,4 160 | 13.39,2.61,No,Sun,Dinner,2 161 | 16.49,2.0,No,Sun,Dinner,4 162 | 21.5,3.5,No,Sun,Dinner,4 163 | 12.66,2.5,No,Sun,Dinner,2 164 | 16.21,2.0,No,Sun,Dinner,3 165 | 13.81,2.0,No,Sun,Dinner,2 166 | 17.51,3.0,Yes,Sun,Dinner,2 167 | 24.52,3.48,No,Sun,Dinner,3 168 | 20.76,2.24,No,Sun,Dinner,2 169 | 31.71,4.5,No,Sun,Dinner,4 170 | 10.59,1.61,Yes,Sat,Dinner,2 171 | 10.63,2.0,Yes,Sat,Dinner,2 172 | 50.81,10.0,Yes,Sat,Dinner,3 173 | 15.81,3.16,Yes,Sat,Dinner,2 174 | 7.25,5.15,Yes,Sun,Dinner,2 175 | 31.85,3.18,Yes,Sun,Dinner,2 176 | 16.82,4.0,Yes,Sun,Dinner,2 177 | 32.9,3.11,Yes,Sun,Dinner,2 178 | 17.89,2.0,Yes,Sun,Dinner,2 179 | 14.48,2.0,Yes,Sun,Dinner,2 180 | 9.6,4.0,Yes,Sun,Dinner,2 181 | 34.63,3.55,Yes,Sun,Dinner,2 182 | 34.65,3.68,Yes,Sun,Dinner,4 183 | 23.33,5.65,Yes,Sun,Dinner,2 184 | 45.35,3.5,Yes,Sun,Dinner,3 185 | 23.17,6.5,Yes,Sun,Dinner,4 186 | 40.55,3.0,Yes,Sun,Dinner,2 187 | 20.69,5.0,No,Sun,Dinner,5 188 | 20.9,3.5,Yes,Sun,Dinner,3 189 | 30.46,2.0,Yes,Sun,Dinner,5 190 | 18.15,3.5,Yes,Sun,Dinner,3 191 | 23.1,4.0,Yes,Sun,Dinner,3 192 | 
15.69,1.5,Yes,Sun,Dinner,2 193 | 19.81,4.19,Yes,Thur,Lunch,2 194 | 28.44,2.56,Yes,Thur,Lunch,2 195 | 15.48,2.02,Yes,Thur,Lunch,2 196 | 16.58,4.0,Yes,Thur,Lunch,2 197 | 7.56,1.44,No,Thur,Lunch,2 198 | 10.34,2.0,Yes,Thur,Lunch,2 199 | 43.11,5.0,Yes,Thur,Lunch,4 200 | 13.0,2.0,Yes,Thur,Lunch,2 201 | 13.51,2.0,Yes,Thur,Lunch,2 202 | 18.71,4.0,Yes,Thur,Lunch,3 203 | 12.74,2.01,Yes,Thur,Lunch,2 204 | 13.0,2.0,Yes,Thur,Lunch,2 205 | 16.4,2.5,Yes,Thur,Lunch,2 206 | 20.53,4.0,Yes,Thur,Lunch,4 207 | 16.47,3.23,Yes,Thur,Lunch,3 208 | 26.59,3.41,Yes,Sat,Dinner,3 209 | 38.73,3.0,Yes,Sat,Dinner,4 210 | 24.27,2.03,Yes,Sat,Dinner,2 211 | 12.76,2.23,Yes,Sat,Dinner,2 212 | 30.06,2.0,Yes,Sat,Dinner,3 213 | 25.89,5.16,Yes,Sat,Dinner,4 214 | 48.33,9.0,No,Sat,Dinner,4 215 | 13.27,2.5,Yes,Sat,Dinner,2 216 | 28.17,6.5,Yes,Sat,Dinner,3 217 | 12.9,1.1,Yes,Sat,Dinner,2 218 | 28.15,3.0,Yes,Sat,Dinner,5 219 | 11.59,1.5,Yes,Sat,Dinner,2 220 | 7.74,1.44,Yes,Sat,Dinner,2 221 | 30.14,3.09,Yes,Sat,Dinner,4 222 | 12.16,2.2,Yes,Fri,Lunch,2 223 | 13.42,3.48,Yes,Fri,Lunch,2 224 | 8.58,1.92,Yes,Fri,Lunch,1 225 | 15.98,3.0,No,Fri,Lunch,3 226 | 13.42,1.58,Yes,Fri,Lunch,2 227 | 16.27,2.5,Yes,Fri,Lunch,2 228 | 10.09,2.0,Yes,Fri,Lunch,2 229 | 20.45,3.0,No,Sat,Dinner,4 230 | 13.28,2.72,No,Sat,Dinner,2 231 | 22.12,2.88,Yes,Sat,Dinner,2 232 | 24.01,2.0,Yes,Sat,Dinner,4 233 | 15.69,3.0,Yes,Sat,Dinner,3 234 | 11.61,3.39,No,Sat,Dinner,2 235 | 10.77,1.47,No,Sat,Dinner,2 236 | 15.53,3.0,Yes,Sat,Dinner,2 237 | 10.07,1.25,No,Sat,Dinner,2 238 | 12.6,1.0,Yes,Sat,Dinner,2 239 | 32.83,1.17,Yes,Sat,Dinner,2 240 | 35.83,4.67,No,Sat,Dinner,3 241 | 29.03,5.92,No,Sat,Dinner,3 242 | 27.18,2.0,Yes,Sat,Dinner,2 243 | 22.67,2.0,Yes,Sat,Dinner,2 244 | 17.82,1.75,No,Sat,Dinner,2 245 | 18.78,3.0,No,Thur,Dinner,2 246 | -------------------------------------------------------------------------------- /temp.txt: -------------------------------------------------------------------------------- 1 | These violent delights have violent ends 2 | And in their triumph die, like fire and powder 3 | Which, as they kiss, consume.sdf --------------------------------------------------------------------------------
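The raw comma-separated files reproduced above are the sample datasets the notebooks in this repository read from the sampledata directory. As a minimal sketch (assuming the script is run from the repository root; the column names shown are taken from the tips.txt header, and the aggregation shown is only an illustrative example, not something the notebooks themselves compute here):

```python
import pandas as pd

# tips.txt is comma-separated with a header row:
# total_bill,tip,smoker,day,time,size
tips = pd.read_csv('sampledata/tips.txt')

print(tips.head())           # inspect the first few rows
print(tips['tip'].mean())    # e.g. the average tip across all rows
```

The macroeconomic quarterly data file dumped earlier in this section can presumably be loaded the same way with read_csv, assuming it also begins with a header row.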