├── .gitignore ├── OPEN_ME_FIRST.ipynb ├── README.md ├── clear_output.txt ├── datasets ├── credit-card-customers │ ├── BankChurners.zip │ └── hwotoget.md ├── deaths-in-gameofthrones │ ├── game-of-thrones-deaths-data.csv │ ├── get_data.sh │ └── howtoget.md ├── life-expectancy │ ├── howtoget.md │ └── life-expectancy-who.zip2 ├── movielens │ ├── README.txt │ └── howtoget.txt ├── names │ ├── female.txt │ ├── get_data.sh │ ├── howtoget.txt │ └── male.txt ├── shakespeare │ ├── get_data.sh │ ├── howtoget.txt │ └── shakespeare.txt.gz ├── startbucks_locations │ └── url.txt.txt ├── taxi-trips │ └── howtoget.txt └── worldbank-dev-ind │ └── howtoget.txt ├── environment.yml ├── lectures ├── .gitattributes ├── 005_intro_to_consoles │ ├── images │ │ ├── 03_two_pages_at_once.png │ │ ├── console.jpg │ │ ├── dosprompt.jpg │ │ └── macterminal.png │ └── intro_to_consoles.ipynb ├── 010_programming_vs_calculator │ ├── images │ │ ├── HR-10RC_large.png │ │ ├── SL300VC-PK_large.png │ │ ├── calculator_2p3.jpg │ │ ├── calculator_blank.jpg │ │ ├── calculator_expand_chars.jpg │ │ ├── calculator_expand_date.jpg │ │ ├── calculator_expand_math.jpg │ │ ├── calculator_expand_mem.jpg │ │ ├── calculator_memory.jpg │ │ ├── calculator_numbers.jpg │ │ ├── calculator_operators.jpg │ │ └── calculator_screen.jpg │ └── programming_vs_calculator.ipynb ├── 015_first_programs │ └── first_programs.ipynb ├── 020_intro_to_jupyter │ ├── 10 - Intro To Jupyter (not technical).ipynb │ ├── 20 - Intro To Jupyter (technical).ipynb │ ├── 30 - Intro To Widgets.ipynb │ └── images │ │ ├── blank_jupyter_header.png │ │ ├── command_mode.png │ │ ├── edit_mode.png │ │ ├── jupyter_celltype.png │ │ ├── jupyter_create_notebook.png │ │ ├── jupyter_files.png │ │ ├── jupyter_kernel.png │ │ ├── jupyter_restart.png │ │ ├── jupyter_run.png │ │ ├── mathematica.png │ │ └── python_repl.png ├── 025_all_of_python_basics │ ├── 090-roadmap.ipynb │ ├── 095-all_of_python_faster_basics.ipynb │ ├── 100-all_of_python_basics.ipynb │ ├── 
110-variables.ipynb │ ├── 120-basic_functions.ipynb │ ├── 130-numbers.ipynb │ ├── 140-strings.ipynb │ ├── 145-boolean_algebra.ipynb │ ├── 150-basic_plotting.ipynb │ ├── 170-dictionaries.ipynb │ ├── 180-lists.ipynb │ ├── 190-comprehensions_and_generators.ipynb │ ├── 200-classes.ipynb │ ├── 210-loops.ipynb │ ├── 220-conditonals_and_None.ipynb │ ├── 230-functions_argument_types.ipynb │ ├── 240-functions_lambda_and_higherorder.ipynb │ ├── 245-functions-decorators.ipynb │ ├── 250-functions_recursion.ipynb │ ├── 260-all_of_python_regexes.ipynb │ ├── 280-exceptions.ipynb │ ├── 290-context_managers.ipynb │ ├── 300-type_safety.ipynb │ ├── 310-async.ipynb │ ├── badly_typed_code.py │ └── images │ │ ├── best-mommy-ever-jewelry.jpg │ │ ├── binary_tree.svg │ │ ├── class_diff.png │ │ ├── clock.jpg │ │ ├── david_chang.jpg │ │ ├── how-to-control-feedback-in-a-sound-system_header.jpg │ │ ├── ifelse_diagram.png │ │ ├── inception.jpg │ │ ├── infinitemirror.jpg │ │ ├── listcomprehension.png │ │ ├── loop_diagram.png │ │ └── y_combinator.jpg ├── 030_intro_to_pandas │ ├── 100-pandas_quick_start.ipynb │ ├── 110-pandas-overview-series.ipynb │ ├── 120-pandas-overview-dataframes.ipynb │ ├── 130-pandas-dataframes-operations.ipynb │ ├── 140-pandas-dataframes-combining.ipynb │ ├── 150-pandas-groupby.ipynb │ ├── 160-pandas-index.ipynb │ ├── 170-pandas-reshape-with-pivot-melt-stack.ipynb │ ├── 180-pandas-operations_str_dt_apply.ipynb │ └── images │ │ ├── dataframes.jpg │ │ ├── series.jpg │ │ └── splitapplycombine.png ├── 035_how_to_read_technical_docs │ └── how_to_read_technical_docs.ipynb ├── 040_basic_computer_architecture │ ├── basic_computer_architecture.ipynb │ └── images │ │ ├── EBMotherboard.jpg │ │ ├── How_to_stress_test_your_CPU-Hero.jpg │ │ ├── Laptop-hard-drive-exposed.jpg │ │ ├── RAM-Modules.jpg │ │ ├── Supermicro-X12SCA-F-Overview.jpg │ │ ├── ascii.png │ │ ├── calc.png │ │ ├── overview-fig1.png │ │ └── unicode_sample.png ├── 045_intro_to_numpy │ ├── images │ │ ├── chicago.jpeg │ │ ├── 
chicago.png │ │ ├── chicagobw.jpeg │ │ └── chicagobw.png │ ├── intro_to_numpy.ipynb │ └── linear_regression_with_numpy.ipynb ├── 050_git_version_control │ ├── Why do you need version control.pptx │ ├── assets │ │ ├── copy_to_dropbox.png │ │ ├── folder_versions.png │ │ ├── github-desktop-screenshot-windows.png │ │ ├── macgit-03-open.png │ │ └── share_code_email.png │ ├── readme.txt │ ├── understanding_version_control.html │ └── understanding_version_control.md ├── 055_bigger_data_pandas │ ├── 050 - Work With Taxi Trips - Get to know the file.ipynb │ ├── 100 - Work With Taxi Trips - memory_map.ipynb │ ├── 110 - Work With Taxi Trips - compression.ipynb │ ├── 120 - Work With Taxi Trips - feather format.ipynb │ ├── 130 - Work With Taxi Trips - chunking and tqdm.ipynb │ ├── 135 - Work With Taxi Trips - skip columns pre-req.ipynb │ ├── 140 - Work With Taxi Trips - skip columns.ipynb │ ├── 140B - Work With Taxi Trips - skip columns.ipynb │ ├── 150 - Work With Taxi Trips - c_parser.ipynb │ ├── 160 - Work With Taxi Trips - Chunk to parquet.ipynb │ ├── 160B - Work With Taxi Trips - Read from parquet files.ipynb │ └── lecture.md ├── 060_learn_command_line │ ├── learn_command_line.md │ └── learn_command_line_2.md ├── 065_secret_lives_of_text_files │ ├── Secret Lives of Text Files.ipynb │ ├── howareyou_english.txt │ ├── howareyou_english_multiple_lines.txt │ └── howareyou_not_english.txt ├── 070_scikit_learn │ ├── 100-scikit-learn-method_behind_the_madness.ipynb │ ├── 110-scikit-learn-run_saved_model.ipynb │ └── model_server.py ├── 075_web_services │ ├── 120-bank_churners_classifier_model.ipynb │ ├── 120-bank_churners_classifier_model.py │ ├── 130-load_test.ipynb │ ├── The web, under the hood.pdf │ ├── The web, under the hood.pptx │ ├── consume_json.py │ ├── consume_services.ipynb │ ├── decorator.pyx │ ├── post_client_streamlit_app.py │ ├── serve_json.py │ ├── serve_post_json.py │ ├── serve_text.py │ └── streamlit_app.py ├── 080_env_pkg_management │ ├── 
010-package_management.ipynb │ └── Python environment and package management.pptx ├── 090_python_tools │ ├── .coverage │ ├── 010-jupyter-debugger.ipynb │ ├── 020-python-bytecode.ipynb │ ├── 030-python-logging.ipynb │ ├── 040-python-unit-tests.ipynb │ ├── __init__.py │ ├── logging.json │ ├── logging_fancy.json │ ├── logs │ │ ├── training-stats.log │ │ └── uvicorn.log │ ├── name_reverser.py │ ├── python_logging_01.py │ ├── python_logging_02.py │ ├── python_logging_03.py │ └── tests │ │ ├── __init__.py │ │ ├── test_name_reverser.py │ │ └── test_name_reverser_part_deux.py ├── 100_design_patterns │ └── 100-design-patterns.ipynb ├── 110_python_py_files │ ├── 100-program-inputs.ipynb │ ├── 110-program-run-type.ipynb │ ├── 120-clean-code.ipynb │ ├── messy.py │ ├── program1.py │ ├── program10.py │ ├── program11.py │ ├── program12.py │ ├── program13.py │ ├── program14.py │ ├── program14b.py │ ├── program15.py │ ├── program2.py │ ├── program3.py │ ├── program4.py │ ├── program5.py │ ├── program6.py │ ├── program7.py │ ├── program8.py │ └── program9.py ├── 120_dockerize_python_app │ ├── 100_minimal_27 │ │ ├── Dockerfile │ │ └── app.py │ ├── 110_minimal_server │ │ ├── Dockerfile │ │ ├── app.py │ │ └── requirements.txt │ ├── Docker – an introduction.pptx │ └── example1 │ │ ├── Dockerfile │ │ └── main.py ├── 130-distributed_python │ ├── 100-simulate_a_cluster_docker.ipynb │ ├── 110-ray_intro.ipynb │ ├── 120-ray-serve.ipynb │ ├── dask-image │ │ ├── 110-dask-cluster.ipynb │ │ └── Dockerfile │ ├── ray-image │ │ ├── 110-ray-cluster.ipynb │ │ └── Dockerfile │ ├── simple_api_ray.py │ └── simple_api_ray2.py ├── 140-algorithms_datastructs │ ├── 100-data_structures.ipynb │ └── 110-algorithms.ipynb ├── Data Science in Python.ipynb ├── gradient_descent │ └── Gradient Descent.ipynb ├── misc │ ├── prettyprint_numpy.ipynb │ ├── scratchspace.ipynb │ └── test_themes.ipynb ├── r_basics │ ├── Base R.ipynb │ ├── Tidyverse.ipynb │ ├── images │ │ ├── rfordatascience.jpg │ │ ├── rfordatascience.png │ 
│ └── rinaction.jpg │ ├── rmarkdown_tutorial.html │ └── rmarkdown_tutorial.rmd └── readme.md ├── postcell.conf.bak ├── programs ├── calc-sum-pd.py ├── calc-sum.py ├── gen-shakespeare.py ├── killings_per_season.py ├── logging.json ├── maximum_bad_debug.py ├── maximum_bad_logging.log ├── maximum_bad_logging.py └── svm_or_logreg_strategy.py ├── python_tableofcontents.xlsx └── src ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── utils.cpython-36.pyc └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.joblib 3 | .ipynb_checkpoints 4 | .DS_Store 5 | .vscode 6 | postcell.conf 7 | postcell.log 8 | __pycache__/ 9 | -------------------------------------------------------------------------------- /OPEN_ME_FIRST.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Let's set up your environment" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Import necessary libraries \n", 15 | "*(click the *play* button in the toolbar above to execute a cell)*" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import json\n", 27 | "import os\n", 28 | "import shutil\n", 29 | "import datetime\n", 30 | "import pandas as pd\n", 31 | "\n", 32 | "import ipywidgets as widgets" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "tags": [] 39 | }, 40 | "source": [ 41 | "### 1. Which section are you in?" 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "student_section = widgets.Dropdown(\n", 53 | " options=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],\n", 54 | " value='Monday',\n", 55 | " description='Section:',\n", 56 | " disabled=False,\n", 57 | ")\n", 58 | "student_section" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "student_section.value = \"Wednesday\"" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "student_section.value" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### 2. What is your name?" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "tags": [] 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "student_name = \"Your Name\"" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### 3. 
Set up postcell" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# class_name = \"pythonfordatascience\"\n", 111 | "class_name = \"pythonformlengineering\"\n", 112 | "class_time_period = \"2025_quarter2\"" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "tags": [] 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "#Set up template\n", 124 | "if not os.path.isfile('postcell.conf'):\n", 125 | " shutil.copyfile('postcell.conf.bak', 'postcell.conf')\n", 126 | "\n", 127 | "#Open config file\n", 128 | "with open('postcell.conf', 'rt') as conf:\n", 129 | " parsedj = json.load(conf)\n", 130 | " #print(parsedj['student_id'])\n", 131 | " #print(parsedj['class_id'])\n", 132 | " \n", 133 | "#Set student name\n", 134 | "parsedj['student_id'] = student_name.strip().replace(' ', '_')\n", 135 | "\n", 136 | "#Set class name\n", 137 | "class_id = f\"{class_time_period}_{student_section.value.lower()}_{class_name}\"\n", 138 | "parsedj['class_id'] = class_id\n", 139 | "\n", 140 | "#Write config file\n", 141 | "#TODO: change should_send_to_server to true, set default to be false in .bak file\n", 142 | "with open('postcell.conf', 'wt') as conf:\n", 143 | " json.dump(parsedj, conf, indent=4, sort_keys=True)\n", 144 | " \n", 145 | "# Confirm your config\n", 146 | "with open('postcell.conf', 'rt') as conf:\n", 147 | " parsedj = json.load(conf)\n", 148 | " print(parsedj['student_id'])\n", 149 | " print(parsedj['class_id'])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### 4. 
Install the postcell magic command" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "!pip install postcell -U --quiet" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "tags": [] 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "%reload_ext postcell\n", 177 | "%postcell register" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "%%postcell OPEN_ME_FIRST_HELLO \n", 189 | "\"Hello\"" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "%%postcell OPEN_ME_FIRST_EXPERIENCE \n", 199 | "\"How much programming experience do you have?\"" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Fun test, how fast is your machine?" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "tags": [] 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "%%timeit\n", 218 | "sum(range(1_000_000))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Paste the numeric value from the previous cell, along with a general description of your machine, on the next line" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "%%postcell OPEN_ME_FIRST_MACHINE_PERFORMANCE \n", 235 | "\"73.1 ms, dell xps, 2 years ago\"" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### 5. 
Install other packages\n", 243 | "Since this is a very introductory course, students will install some required packages here, rather than via a proper environments file." 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# download: download files, only if they don't already exist\n", 253 | "# lolviz: visualize python datastructures\n", 254 | "# jax and jaxlib: similar to numpy with autograd (don't yet run on windows)\n", 255 | "\n", 256 | "!pip install download lolviz ipywidgets pylint black mypy networkx[default,extra]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# nb_conda: load conda environments in jupyter\n", 266 | "# tqdm: Add progress bars to Jupyter \n", 267 | "# seaborn: A very popular charting library\n", 268 | "\n", 269 | "!conda install bokeh tqdm seaborn altair-all -y" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "# Add kernel\n", 279 | "\n", 280 | "#conda activate py310\n", 281 | "#python -m ipykernel install --user --name=\"py310\"" 282 | ] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python [conda env:conda-mleng_env]", 288 | "language": "python", 289 | "name": "conda-env-conda-mleng_env-py" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.11.11" 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 4 306 | } 307 | -------------------------------------------------------------------------------- /clear_output.txt: 
-------------------------------------------------------------------------------- 1 | jupyter nbconvert *.ipynb --to notebook --ClearOutputPreprocessor.enabled=True --inplace -------------------------------------------------------------------------------- /datasets/credit-card-customers/BankChurners.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/credit-card-customers/BankChurners.zip -------------------------------------------------------------------------------- /datasets/credit-card-customers/hwotoget.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers?trk=article-ssr-frontend-pulse_little-text-block 2 | License: CC0: Public Domain -------------------------------------------------------------------------------- /datasets/deaths-in-gameofthrones/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl https://raw.githubusercontent.com/washingtonpost/data-game-of-thrones-deaths/master/game-of-thrones-deaths-data.csv -o game-of-thrones-deaths-data.csv 3 | -------------------------------------------------------------------------------- /datasets/deaths-in-gameofthrones/howtoget.md: -------------------------------------------------------------------------------- 1 | Source: 2 | https://github.com/washingtonpost/data-game-of-thrones-deaths 3 | -------------------------------------------------------------------------------- /datasets/life-expectancy/howtoget.md: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/kumarajarshi/life-expectancy-who/downloads/life-expectancy-who.zip/1 2 | 3 | Generate a small version by doing `head -n 10 "Life Expectancy Data.csv" > life_expectancy_10.csv` 4 | 
-------------------------------------------------------------------------------- /datasets/life-expectancy/life-expectancy-who.zip2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/life-expectancy/life-expectancy-who.zip2 -------------------------------------------------------------------------------- /datasets/movielens/README.txt: -------------------------------------------------------------------------------- 1 | Summary 2 | ======= 3 | 4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018. 5 | 6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided. 7 | 8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows. 9 | 10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent. 11 | 12 | This and other GroupLens data sets are publicly available for download at . 13 | 14 | 15 | Usage License 16 | ============= 17 | 18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. 
The data set may be used for any research purposes under the following conditions: 19 | 20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group. 21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information). 22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions. 23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota. 24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. 25 | 26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate). 27 | 28 | If you have any further questions or comments, please email 29 | 30 | 31 | Citation 32 | ======== 33 | 34 | To acknowledge use of the dataset in publications, please cite the following paper: 35 | 36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. 37 | 38 | 39 | Further Information About GroupLens 40 | =================================== 41 | 42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. 
Since its inception in 1992, GroupLens's research projects have explored a variety of fields including: 43 | 44 | * recommender systems 45 | * online communities 46 | * mobile and ubiquitous technologies 47 | * digital libraries 48 | * local geographic information systems 49 | 50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators. 51 | 52 | 53 | Content and Use of Files 54 | ======================== 55 | 56 | Formatting and Encoding 57 | ----------------------- 58 | 59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8. 60 | 61 | 62 | User Ids 63 | -------- 64 | 65 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files). 66 | 67 | 68 | Movie Ids 69 | --------- 70 | 71 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files). 
72 | 73 | 74 | Ratings Data File Structure (ratings.csv) 75 | ----------------------------------------- 76 | 77 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format: 78 | 79 | userId,movieId,rating,timestamp 80 | 81 | The lines within this file are ordered first by userId, then, within user, by movieId. 82 | 83 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars). 84 | 85 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 86 | 87 | 88 | Tags Data File Structure (tags.csv) 89 | ----------------------------------- 90 | 91 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format: 92 | 93 | userId,movieId,tag,timestamp 94 | 95 | The lines within this file are ordered first by userId, then, within user, by movieId. 96 | 97 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user. 98 | 99 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 100 | 101 | 102 | Movies Data File Structure (movies.csv) 103 | --------------------------------------- 104 | 105 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format: 106 | 107 | movieId,title,genres 108 | 109 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles. 
110 | 111 | Genres are a pipe-separated list, and are selected from the following: 112 | 113 | * Action 114 | * Adventure 115 | * Animation 116 | * Children's 117 | * Comedy 118 | * Crime 119 | * Documentary 120 | * Drama 121 | * Fantasy 122 | * Film-Noir 123 | * Horror 124 | * Musical 125 | * Mystery 126 | * Romance 127 | * Sci-Fi 128 | * Thriller 129 | * War 130 | * Western 131 | * (no genres listed) 132 | 133 | 134 | Links Data File Structure (links.csv) 135 | --------------------------------------- 136 | 137 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format: 138 | 139 | movieId,imdbId,tmdbId 140 | 141 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link . 142 | 143 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 144 | 145 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 146 | 147 | Use of the resources listed above is subject to the terms of each provider. 148 | 149 | 150 | Cross-Validation 151 | ---------------- 152 | 153 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples. 
154 | -------------------------------------------------------------------------------- /datasets/movielens/howtoget.txt: -------------------------------------------------------------------------------- 1 | http://files.grouplens.org/datasets/movielens/ml-latest-small.zip 2 | -------------------------------------------------------------------------------- /datasets/names/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt -o female.txt 3 | curl https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/male.txt -o male.txt 4 | 5 | -------------------------------------------------------------------------------- /datasets/names/howtoget.txt: -------------------------------------------------------------------------------- 1 | Source: 2 | https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt 3 | https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/male.txt 4 | -------------------------------------------------------------------------------- /datasets/shakespeare/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt -o shakespeare.txt 3 | -------------------------------------------------------------------------------- /datasets/shakespeare/howtoget.txt: -------------------------------------------------------------------------------- 1 | Source: 2 | https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt 3 | -------------------------------------------------------------------------------- /datasets/shakespeare/shakespeare.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/shakespeare/shakespeare.txt.gz -------------------------------------------------------------------------------- /datasets/startbucks_locations/url.txt.txt: -------------------------------------------------------------------------------- 1 | https://gist.github.com/dankohn/09e5446feb4a8faea24f -------------------------------------------------------------------------------- /datasets/taxi-trips/howtoget.txt: -------------------------------------------------------------------------------- 1 | Data is available from https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/ 2 | 3 | >time zcat Taxi_Trips.csv.gz |wc -l 4 | 113115259 5 | 6 | real 6m9.941s 7 | user 6m12.653s 8 | sys 0m45.163s 9 | 10 | Size of Taxi_Trips.csv.gz: 14G 11 | Size of taxi_trips_small.csv.gz: 1G 12 | Size of taxi_trips_smaller.csv.gz 154M 13 | 14 | Size of Taxi_Trips.csv: ???? 15 | Size of taxi_trips_small.csv: 3.4G 16 | Size of taxi_trips_smaller.csv: 456M 17 | 18 | Lines in Taxi_Trips.csv.gz: 113,115,259 (100 million) 19 | Lines in taxi_trips_small.csv.gz: 11,311,525 (11 million) 20 | Lines in taxi_trips_smaller.csv.gz: 1,131,152 (1 million) 21 | 22 | -------------------------------------------------------------------------------- /datasets/worldbank-dev-ind/howtoget.txt: -------------------------------------------------------------------------------- 1 | http://databank.worldbank.org/data/download/WDI_csv.zip 2 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | #conda env create -f environment.yml 2 | #conda env list 3 | #conda env remove --name PythonForAnalytics 4 | #conda env update --file environment.yml 5 | #conda activate PythonForAnalytics 6 | #conda update --all # update all packages 7 | name: msca 8 | channels: 9 | - defaults 
10 | - conda-forge 11 | dependencies: 12 | - python=3.6 13 | - pylint 14 | - pip # a more general python package installer 15 | - openssl # technical infrastructure 16 | - flask # web/api framework 17 | - pyarrow # fast alternative to csv 18 | # Data science libraries 19 | - pandas # dataframe library 20 | - scikit-learn # collection of machine learning algos 21 | # Visualization libraries 22 | - seaborn # charting package which is built on top of matplotlib 23 | - altair # charting package which provides a python interface to the vega-lite library 24 | - bokeh # charting package 25 | - bqplot # charting package from bloomberg 26 | #- pyviz # related to holoviz? 27 | #- holoviz # meta package which installs holoviews, panel, hvplot, etc. 28 | # Notebook extensions 29 | - nb_conda # load conda environments in jupyter 30 | - rise # presentations in jupyter 31 | - tqdm 32 | - nbtutor # brings pythontutor into jupyter, execute `%load_ext nbtutor` to load extension, `%%nbtutor` to evaluate cell (`%%nbtutor -r` to reset variables) 33 | - jupyter_contrib_nbextensions #used to be under pip, under conda, doesn't require `jupyter contrib nbextension install ...` 34 | - jupyter_nbextensions_configurator #enable nb_conda to allow use of custom conda envs 35 | - pip: 36 | - wget # easy way to download files 37 | #- https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tarball/master 38 | - Flask-Testing 39 | - nbdime # makes it easy to diff rendered notebooks 40 | - pixiedust 41 | #- google-cloud-firestore # to enable firestore for publishing student cell contents 42 | #- lolviz # visualize data structures (only useful for teaching and learning) 43 | #- kedro #McKinsey data science template 44 | -------------------------------------------------------------------------------- /lectures/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=strip-notebook-output 2 | 
-------------------------------------------------------------------------------- /lectures/005_intro_to_consoles/images/03_two_pages_at_once.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/03_two_pages_at_once.png -------------------------------------------------------------------------------- /lectures/005_intro_to_consoles/images/console.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/console.jpg -------------------------------------------------------------------------------- /lectures/005_intro_to_consoles/images/dosprompt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/dosprompt.jpg -------------------------------------------------------------------------------- /lectures/005_intro_to_consoles/images/macterminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/macterminal.png -------------------------------------------------------------------------------- /lectures/005_intro_to_consoles/intro_to_consoles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# In the beginning, there was the console" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 
16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "### Modern operating systems offer a visually stimulating **Graphical** User Interface\n", 23 | "\n", 24 | "![](./images/03_two_pages_at_once.png)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "### However, there had to be a simpler beginning\n", 36 | "\n", 37 | "![](./images/console.jpg)\n", 38 | "\n", 39 | "Kids react to old computers: https://www.youtube.com/watch?v=PF7EpEnglgk" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "slideshow": { 46 | "slide_type": "slide" 47 | } 48 | }, 49 | "source": [ 50 | "### Print to screen, which screen?\n", 51 | "\n", 52 | "```print(1+2)```\n", 53 | "\n", 54 | "There only used to be _one_ screen (aka console)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "slideshow": { 61 | "slide_type": "slide" 62 | } 63 | }, 64 | "source": [ 65 | "### In modern computers, such consoles are still available\n", 66 | "\n", 67 | "In Apple computers, console is called the **Terminal**\n", 68 | "![](./images/macterminal.png)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "slide" 76 | } 77 | }, 78 | "source": [ 79 | "### In modern computers, such consoles are still available\n", 80 | "\n", 81 | "In Windows, it is called the **Command Prompt**\n", 82 | "![](./images/dosprompt.jpg)\n", 83 | "\n", 84 | "We will use Git Bash, which will simulate an environment similar to Apple or Linux Terminal" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "slideshow": { 91 | "slide_type": "slide" 92 | } 93 | }, 94 | "source": [ 95 | "# References\n", 96 | "MS Word screenshot: https://www.howtogeek.com/215187/how-to-view-multiple-pages-at-once-in-word/\n", 97 | "\n", 98 | "Mac Terminal screenshot: 
https://thenextweb.com/lifehacks/2010/11/19/keep-your-macbook-from-waking-up-in-your-bag-with-a-simple-command/" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "slideshow": { 106 | "slide_type": "slide" 107 | } 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "celltoolbar": "Slideshow", 115 | "kernelspec": { 116 | "display_name": "Python [conda env:PythonForAnalytics] *", 117 | "language": "python", 118 | "name": "conda-env-PythonForAnalytics-py" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.6.9" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/HR-10RC_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/HR-10RC_large.png -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/SL300VC-PK_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/SL300VC-PK_large.png -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_2p3.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_2p3.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_blank.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_blank.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_expand_chars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_chars.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_expand_date.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_date.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_expand_math.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_math.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_expand_mem.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_mem.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_memory.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_memory.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_numbers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_numbers.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_operators.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_operators.jpg -------------------------------------------------------------------------------- /lectures/010_programming_vs_calculator/images/calculator_screen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_screen.jpg -------------------------------------------------------------------------------- 
/lectures/010_programming_vs_calculator/programming_vs_calculator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "## Programming vs a calculator" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "### Let's start with a calculator\n", 23 | "![](./images/SL300VC-PK_large.png)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "2+2" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "slideshow": { 43 | "slide_type": "slide" 44 | } 45 | }, 46 | "source": [ 47 | "### Numbers, nouns or _things_\n", 48 | "![](./images/calculator_numbers.jpg) " 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "slideshow": { 55 | "slide_type": "slide" 56 | } 57 | }, 58 | "source": [ 59 | "### Operators, functions, verbs or ways in which you operate on _things_\n", 60 | "![](./images/calculator_operators.jpg)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "slideshow": { 67 | "slide_type": "slide" 68 | } 69 | }, 70 | "source": [ 71 | "### Operations related to the screen\n", 72 | "![](./images/calculator_screen.jpg) " 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "slide" 80 | } 81 | }, 82 | "source": [ 83 | "### Operations related to memory\n", 84 | "![](./images/calculator_memory.jpg)\n", 85 | "\n", 86 | "**Exercise**: On a calculator, find the proportion of total represented by 35, 45, 55\n", 87 | "1. Calculate total\n", 88 | "2. Save that total in memory\n", 89 | "3. 
Divide each number by the total in memory" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "slideshow": { 96 | "slide_type": "slide" 97 | } 98 | }, 99 | "source": [ 100 | "### An imaginary box to remember things\n", 101 | "![](./images/calculator_2p3.jpg) " 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "fragment" 110 | } 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "x=135\n", 115 | "2+x" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "slideshow": { 122 | "slide_type": "slide" 123 | } 124 | }, 125 | "source": [ 126 | "### How can we expand the functionality of this calculator?\n", 127 | "\n", 128 | "What if we could snap on more operators (aka verbs or functions)\n", 129 | "\n", 130 | "![](./images/calculator_expand_math.jpg) \n", 131 | "\n", 132 | "This way, product designer and engineers could builid a nice calculator, math majors could write fancy expansion sets and consumers could add on the functionality they needed." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "slideshow": { 140 | "slide_type": "fragment" 141 | } 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import math\n", 146 | "math.log(100)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "slideshow": { 153 | "slide_type": "slide" 154 | } 155 | }, 156 | "source": [ 157 | "### How can we expand the functionality of this calculator?\n", 158 | "\n", 159 | "What about adding nouns or data types beyond numbers, such as dates and times?\n", 160 | "\n", 161 | "![](./images/calculator_expand_date.jpg) " 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "slideshow": { 169 | "slide_type": "fragment" 170 | } 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "import datetime\n", 175 | "datetime.date.today() + datetime.timedelta(weeks=1) + datetime.timedelta(days=1)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "slideshow": { 182 | "slide_type": "slide" 183 | } 184 | }, 185 | "source": [ 186 | "### How can we expand the functionality of this calculator?\n", 187 | "\n", 188 | "What about adding nouns or data types beyond numbers, such as dates, times and **English characters** ?\n", 189 | "\n", 190 | "![](./images/calculator_expand_chars.jpg) " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "print(dir(\"hello\"))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "'a' + 'b'" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "slideshow": { 215 | "slide_type": "slide" 216 | } 217 | }, 218 | "source": [ 219 | "### How can we expand the functionality of this calculator?\n", 220 | "\n", 221 | "What about 
adding more than one memory slot? This will also require that we name our memory slots, or variables.\n", 222 | "\n", 223 | "![](./images/calculator_expand_mem.jpg) " 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "x=5\n", 233 | "y=10\n", 234 | "x+y" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "slideshow": { 241 | "slide_type": "slide" 242 | } 243 | }, 244 | "source": [ 245 | "### How can we expand the functionality of this calculator?\n", 246 | "\n", 247 | "What if, insead of typing out calculations, we could record the calculations, and run them at a later time?\n", 248 | "```\n", 249 | "1+1\n", 250 | "2+2\n", 251 | "...\n", 252 | "```\n", 253 | "\n", 254 | "![](./images/HR-10RC_large.png) " 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "slideshow": { 261 | "slide_type": "slide" 262 | } 263 | }, 264 | "source": [ 265 | "### A tape will replay commands **exactly**\n", 266 | "Not very useful. \n", 267 | "We should be able to record a bunch of commands, but vary one or two things." 
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "slideshow": { 275 | "slide_type": "fragment" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "x = 19.99 #<= Price of a meal\n", 281 | "y = x * 0.0625 #<= Sales tax\n", 282 | "y * 1.20 #<= 20% Tip" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "slideshow": { 289 | "slide_type": "slide" 290 | } 291 | }, 292 | "source": [ 293 | "# Sources\n", 294 | "Pink calculator: https://www.casio.com/products/calculators/basic/sl300vc-pk\n", 295 | "\n", 296 | "Calculator with printer: https://www.casio.com/products/calculators/printing/hr-10rc" 297 | ] 298 | } 299 | ], 300 | "metadata": { 301 | "celltoolbar": "Slideshow", 302 | "kernelspec": { 303 | "display_name": "Python 3 (ipykernel)", 304 | "language": "python", 305 | "name": "python3" 306 | }, 307 | "language_info": { 308 | "codemirror_mode": { 309 | "name": "ipython", 310 | "version": 3 311 | }, 312 | "file_extension": ".py", 313 | "mimetype": "text/x-python", 314 | "name": "python", 315 | "nbconvert_exporter": "python", 316 | "pygments_lexer": "ipython3", 317 | "version": "3.9.13" 318 | }, 319 | "vscode": { 320 | "interpreter": { 321 | "hash": "108a1df64039728e69f178110a6e255a10aba8514903b770571642a02940d2ba" 322 | } 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 4 327 | } 328 | -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/10 - Intro To Jupyter (not technical).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# This is Jupyter" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Standard method of running programs: command line\n", 15 | "\n", 16 | "Normally you 'run' a program by double clicking on it. 
For our purpose, we will consider this to be the same as running it from the command line. In both cases, a fully written program is run.\n", 17 | "\n", 18 | "For example, you may run a python program in the following manner:\n", 19 | "\n", 20 | "`python myprogram.py`\n", 21 | "\n", 22 | "In this case, myprogram.py contains your code and you are telling python to execute the whole thing for you.\n", 23 | "\n", 24 | "This is in contrast to running a program small batches or one line at a time:\n", 25 | "![](images\\python_repl.png)\n", 26 | "\n", 27 | "## REPL\n", 28 | "This method of executing code is called a *REPL*, aka *R*ead, *E*val, *P*rint *L*oop. This of a REPL as a program:\n", 29 | "```python\n", 30 | "while True:\n", 31 | " user_input = input() # Get input from user\n", 32 | " result = parse_and_evaluate(use_input)\n", 33 | " print(result)\n", 34 | "```\n", 35 | "\n", 36 | "A program which reads your programs and executes them, very *meta*!\n", 37 | "\n", 38 | "This method of executing code was the norm in the Lisp programming language - one of the oldest, continuously used programming languages, created for the purpose of Artificial Intelligence experimentation\n", 39 | "\n", 40 | "## Literate Programming: Comments in code vs code inside comments\n", 41 | "\n", 42 | "Don Knuth, a high priest of Computer Science, introduced and advocated for the idea of *literate programming.* Knuth wanted to move programs from technical code, written to satisfy compilers or interpreters to a living document, to be consumed by human readers. 
He wanted code, mixed with narrative explanations, describing what the code was doing.\n", 43 | "\n", 44 | "Take this example from an earlier lecture:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "jon = 0 #variable containing Jon's score\n", 54 | "arya = 0 #variable containing Arya's score\n", 55 | "\n", 56 | "#Open file\n", 57 | "file = open(\"../../datasets/deaths-in-gameofthrones/game-of-thrones-deaths-data.csv\", encoding='utf8')\n", 58 | "\n", 59 | "#Go through each line in file\n", 60 | "for line in file:\n", 61 | " tokens = line.split(',') #separate line into columns\n", 62 | " if tokens[4]==\"Arya Stark\": arya = arya + 1\n", 63 | " if tokens[4]==\"Jon Snow\": \n", 64 | " jon = jon + 1\n", 65 | "\n", 66 | "file.close()\n", 67 | "print(\"Arya killed\", arya, \"people\")\n", 68 | "print(\"Jon killed\", jon, \"people\")\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "You can see comments in the program.\n", 76 | "\n", 77 | "On the other hand, this notebook is a document, to be read by human students. Yet it contains code which can be executed. But the purpose of this document is not to have a CPU execute a set of instructions. It is to explain a process or a concept to students, where the code is executed in service of that purpose.\n", 78 | "\n", 79 | "## Notebooks\n", 80 | "One of the first, mainstream, uses of a notebook interface is from Mathematica\n", 81 | "![](images/mathematica.png)\n", 82 | "\n", 83 | "This interface combines **REPL** and **Literate Programming**. These notebooks display well formatted code, the results of executing that code, as well as formatted narrative text (English or technical math formulas) along with visual charts, etc. Jupyter notebooks are an recent, open source iteration of this concept." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Jupyter Notebooks\n", 91 | "\n", 92 | "**Starting Jupyter**\n", 93 | "Jupyter can be started from Anaconda Navigator. If you are comfortable, you can start Jupyter from the command line:\n", 94 | "\n", 95 | "```bash\n", 96 | "jupyter notebook\n", 97 | "```\n", 98 | "\n", 99 | "When using the command line, if you are at `c:\\Users\\shahbaz\\proj\\stock_market_prediction` and execute the command `jupyter notebook`, the notebook will start and display files in that directory. \n", 100 | "\n", 101 | "Once Jupyter is running, you will see a set of files, like this:\n", 102 | "![](images/jupyter_files.png)\n", 103 | "\n", 104 | "Create a new notebook by selecting 'new' in the upper right hand corner:\n", 105 | "![](images/jupyter_create_notebook.png)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "**References**\n", 120 | "\n", 121 | "Mathematica screenshot is from https://www.wolfram.com/language/fast-introduction-for-math-students/en/notebook-documents/" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 3 (ipykernel)", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.12.4" 149 | }, 150 | "vscode": { 151 | "interpreter": { 152 | "hash": "108a1df64039728e69f178110a6e255a10aba8514903b770571642a02940d2ba" 153 | 
} 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/blank_jupyter_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/blank_jupyter_header.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/command_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/command_mode.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/edit_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/edit_mode.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_celltype.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_celltype.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_create_notebook.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_create_notebook.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_files.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_kernel.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_restart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_restart.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/jupyter_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_run.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/mathematica.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/mathematica.png -------------------------------------------------------------------------------- /lectures/020_intro_to_jupyter/images/python_repl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/python_repl.png -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/090-roadmap.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "44ab0477-e109-4dc1-83d4-8b8071136645", 6 | "metadata": {}, 7 | "source": [ 8 | "# What we learn, when we are learning programming languages" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "2c64bc0b-78c7-4b95-a212-34491d0bb610", 14 | "metadata": {}, 15 | "source": [ 16 | "Before anything else, let's look at a map of the journey we are about to undertake. Porgramming langauges are used for data analysis, complex machine learning models, creating video games where you can explore whole galaxies, all of youtube, google, controls of cars and space shuttles...[almost everything in the world](https://a16z.com/2011/08/20/why-software-is-eating-the-world/)." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "dfc17055-21a9-4a5c-910e-c4206a853906", 22 | "metadata": {}, 23 | "source": [ 24 | "Suprisingly, the components which make up a programming language are not very complex. In this class, we will learn about these components and how to put them together to create useful programs." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "c608b572-29c9-44bc-893a-ad022e05d952", 30 | "metadata": {}, 31 | "source": [ 32 | "This is a broad (and necessarily incomplete) outline:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "5ab75289-470f-4fcc-801c-90bde8b998ac", 38 | "metadata": {}, 39 | "source": [ 40 | "### Data types\n", 41 | "Calculators can only deal with numbers. Programming languages understand \n", 42 | "1. numbers\n", 43 | "2. text (called \"strings\")\n", 44 | "3. truthfulness and falsehood of logical statements (called \"boolean\")\n", 45 | "4. infinite variety, made up of combining the basic elements shown here" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "8d301b4f-73bc-4bbd-8ec4-8caddebc5d56", 51 | "metadata": {}, 52 | "source": [ 53 | "### Container types\n", 54 | "It is one thing to deal with single numbers or individual strings, programming languages provide \"data structures\" which let us deal with a collection of objects, such as:\n", 55 | "1. lists\n", 56 | "2. dictionaries\n", 57 | "3. sets\n", 58 | "4. infinite variety, made up of combining the basic elements shown here" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "144e7f6c-23c2-4fe4-9251-b6d6b1a0d16b", 64 | "metadata": {}, 65 | "source": [ 66 | "### Control flow\n", 67 | "Much of the power of computers come from programming computers to repeat tasks as many times as we want or to have programs decide, while running, which path to take. As such, the control flow methods we will study are:\n", 68 | "1. if/else statements (called \"conditionals\")\n", 69 | "2. 
for and while loops" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "4628dd8f-887b-46b7-b3b0-97422126a9bb", 75 | "metadata": {}, 76 | "source": [ 77 | "### Operations\n", 78 | "Just like a calculator contains numbers, and operations on those numbers, such as plus, minus, divide, multiply, programming languages are a collection of a very large number of operations. Languages provide functions which operate on\n", 79 | "1. data types, such as numbers, strings, booleans, etc.\n", 80 | "2. container types, such as lists, dictionaries, sets, etc.\n", 81 | "3. infinite variety...you get the point" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "a925b147-936b-45cf-8c86-3e54dce080de", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.8.10" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 5 114 | } 115 | -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/150-basic_plotting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Plotting (with matplotlib.pyplot)\n", 8 | "\n", 9 | "As a data scientist, plotting is extremely import and part of your daily workflow. However, Python doesn't come with a built-in plotting library. We will be using `matplotlib`. 
Like many data science packages, it is maintained by a community of programmers who (mostly) work on it for free in their spare time.\n", 10 | "\n", 11 | "On the first day of class, we downloaded this package and installed it in our Python environment. If you need to do this again, here are two methods:\n", 12 | "\n", 13 | "1. Open your computer's terminal window (_terminal_ in mac and _anaconda prompt_ in windows). Run this line `conda install matplotlib`\n", 14 | "2. Right here, in your jupyter notebook, create a new cell and run this command `!conda install --yes matplotlib`. Once this command runs successfully, you may have to restart the kernel.\n", 15 | "\n", 16 | "In order to use the plotting library, you need to import it (like we do with many other packages). However, you need to execute an additional line of code: `%matplotlib inline`, which tells matplotlib that it needs to render its visual charts in the present notebook. \n", 17 | "\n", 18 | "Note that `inline` is not the only option available. While `inline` causes matplotlib to create a static image, there are other options, such as `notebook` which can provide a richer experience. 
To experiment with other renderers, get a full list via `%matplotlib --list`" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import matplotlib.pyplot as plt" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%matplotlib inline" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Simple plot types" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "You can remove the text `[]` by adding a semicolon at the end of the plot function (this is a bit of a hack, you don't need to do this elsewhere)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10]);" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Any chart you create for others must be labeled correctly. Here is how you can add labels:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])\n", 85 | "plt.title(\"Test chart\")\n", 86 | "plt.xlabel('Value index')\n", 87 | "plt.ylabel('Value'); # notice only the last line has a semi-colon" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "Matplotlib generally draws figures which are too small. 
You can change their size using figsize:" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "plt.figure(figsize=(10,5)) # This only needs to be executed once, and can be done at the top of the notebook\n", 104 | "\n", 105 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])\n", 106 | "plt.title(\"Test chart\")\n", 107 | "plt.xlabel('Value index')\n", 108 | "plt.ylabel('Value'); " 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "The above charts visualize a single dimension of numbers. What if you had two dimensions, such as house prices and number of rooms?" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "prices = [95, 150, 200, 200, 400]\n", 125 | "rooms = [1, 2, 2, 1.5, 3]\n", 126 | "\n", 127 | "plt.plot(prices, rooms)\n", 128 | "plt.title(\"Test chart\")\n", 129 | "plt.xlabel('House price')\n", 130 | "plt.ylabel('Num of rooms'); " 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Such two dimensional data is better represented as a scatter chart:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "prices = [95, 150, 200, 200, 400]\n", 147 | "rooms = [1, 2, 2, 1.5, 3]\n", 148 | "\n", 149 | "plt.scatter(prices, rooms)\n", 150 | "plt.title(\"Test chart\")\n", 151 | "plt.xlabel('House price')\n", 152 | "plt.ylabel('Num of rooms'); " 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "Same data with bar plot" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "prices = [95, 150, 200, 200, 400]\n", 169 | "rooms = [1, 2, 2, 1.5, 
3]\n", 170 | "\n", 171 | "plt.bar(prices, rooms, width=10)\n", 172 | "plt.title(\"Test chart\")\n", 173 | "plt.xlabel('House price')\n", 174 | "plt.ylabel('Num of rooms'); " 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "prices = [95, 150, 200, 200, 400]\n", 191 | "rooms = [1, 2, 2, 1.5, 3]\n", 192 | "\n", 193 | "plt.hist(prices)\n", 194 | "plt.title(\"Histogram of prices\")" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Reference:\n", 202 | "Some examples taken from the official tutorial at https://matplotlib.org/tutorials/introductory/pyplot.html" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "**Exercise**\n", 210 | "Explain Python's built-in random library. Use charts such as these. 
" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.8.5" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 4 242 | } 243 | -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/260-all_of_python_regexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regular Expressions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Regular expressions are a mini-language, used to parse and extract information from strings.\n", 15 | "\n", 16 | "### Motivation: slicing vs split vs regex\n", 17 | "\n", 18 | "Given a strings, such as:\n", 19 | "\n", 20 | "\"01/09/2008\", \"05/12/2012\"\n", 21 | "\n", 22 | "we know we can get extract the year this way:" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "dates = [\"01/09/2008\", \"05/12/2012\"]\n", 32 | "\n", 33 | "for d in dates:\n", 34 | " print(d[-4:]) # use normal indexing" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "If we had a strings, such as:\n", 42 | "\n", 43 | "\"In the year 2008 we did such as such\"\n", 44 | "\"After the year 2009 we continued something else\"\n", 45 | "\n", 46 | "We can no longer use 
slicing, but we can just split the string and get the 4th value to get the year:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "sentences = [\"In the year 2008 we did such as such\"\n", 56 | " , \"After the year 2009 we continued something else\"]\n", 57 | "\n", 58 | "for s in sentences:\n", 59 | " print(s.split(\" \")[3])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "How do we extract dates in the following sentences?\n", 67 | "\n", 68 | "\"2019: After the Fall of New York\"\n", 69 | "\n", 70 | "\"The exterminators of the year 3000\"\n", 71 | "\n", 72 | "\"1990: The Bronx Warriors\"\n", 73 | "\n", 74 | "The first inclination of novice programmers would be to split the movie title above, go through each title and check to see if it is just numbers. If it is, extract that token as the year.\n", 75 | "\n", 76 | "This pattern of coding comes up so often that there is a special way of extracting such information: regular expressions!" 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import re # <= regular expression library\n", 86 | "\n", 87 | "movies = [\"2019: After the Fall of New York\"\n", 88 | " , \"The exterminators of the year 3000\"\n", 89 | " , \"1990: The Bronx Warriors\"]\n", 90 | "\n", 91 | "for m in movies:\n", 92 | " print(re.search(\"(\\d\\d\\d\\d)\", m).group(0))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "**...what??**\n", 100 | "\n", 101 | "Some people don't like regular expressions:\n", 102 | "\n", 103 | "> Some people, when confronted with a problem, think\n", 104 | "“I know, I'll use regular expressions.” Now they have two problems.\n", 105 | "\n", 106 | "\n", 107 | "- Jamie Zawinski" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Regular expressions in context\n", 115 | "\n", 116 | "Regular expressions were invented, in their modern form, in 1951 by Stephen Kleene. They have their roots in theoretical computer science, although they are extremely useful as a text parsing tool.\n", 117 | "\n", 118 | "Practically every language has regular expressions built-in. They are often super optimized and always expressed in an archaic syntax.\n", 119 | "\n", 120 | "Regular expressions allow you to use basic components to parse a language. 
Here are some pseudo-code examples of regex expressions:\n", 121 | "\n", 122 | "Find all characters which are digits\n", 123 | "\n", 124 | "Find all characters which are digits, followed by another digit\n", 125 | "\n", 126 | "Find all characters which are at the beginning of a line, are of one of the following characters: [,.!;:], followed by 3 digits, followed by a comma, followed by three characters which are NOT digits" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Sample regular expressions" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "ages = \"Homer is 38 years old, Marge is 36 years old, Bart is 10 years old, Lisa is 8 years old and Maggie is 3.\"\n", 143 | "\n", 144 | "# Task: Extract all ages\n", 145 | "# Thinking: Find all numbers\n", 146 | "# Regex pseudo code: find digits\n", 147 | "\n", 148 | "regex_attempt1 = \"(\\d)\" # <= Find digits\n", 149 | "\n", 150 | "for m in re.finditer(regex_attempt1, ages): \n", 151 | " print(\"Match starts at\",m.start(), \"ends at\", m.end(), \"and contains\", m.group())" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# Task: Extract all ages\n", 161 | "# Thinking: Find all numbers\n", 162 | "# Regex pseudo code: find digits, clump consecutive digits together\n", 163 | "\n", 164 | "regex_attempt1 = \"(\\d+)\" # <= Find digits and 1 or more repetitions\n", 165 | "\n", 166 | "for m in re.finditer(regex_attempt1, ages): \n", 167 | " print(\"Match starts at\",m.start(), \"ends at\", m.end(), \"and contains\", m.group())" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Just use http://www.pyregex.com/ or https://www.debuggex.com/" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": 
{}, 180 | "source": [ 181 | "**Exercise** Extract area codes from the following phone numbers. _Must_ write a single regex which is able to extract the area codes from the following numbers (in a loop):\n", 182 | "\n", 183 | "1-201-123-1234\n", 184 | "\n", 185 | "98-708-567-7890\n", 186 | "\n", 187 | "0-708-333-4444\n", 188 | "\n", 189 | "In the above numbers, the area codes are 201, 708 and 708, respectively." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "area_code_regex = r\"type_correct_regex_expression_here\"\n", 199 | "\n", 200 | "for ac in [\"1-201-123-1234\", \"98-708-567-7890\", \"0-708-333-4444\"]:\n", 201 | " print(re.findall(area_code_regex, ac))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Hint: Look for the start of string, then one or more digits, then a dash, THEN the digits which contain our area code. Ignore the rest.\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## What regular expressions can't do\n", 216 | "\n", 217 | "Regular expressions are part of a theoretical framework which defines languages. There are languages which are less or more powerful than regular expressions.\n", 218 | "\n", 219 | "For example, regular expressions are not able to correctly parse this expression:\n", 220 | "\n", 221 | "`1 + (2 * (3 + 8))`\n", 222 | "\n", 223 | "In order to parse the expression above, after each left parenthesis, we would have to use recursion. Regular expressions are not designed to parse such recursive expressions.\n", 224 | "\n", 225 | "Practically speaking, although _many_ people attempt it, regular expressions are not the correct choice to parse html (web) pages or xml documents.\n", 226 | "\n", 227 | "Computer science students often learn about context free grammars. 
CFGs _can_ parse recursive strings and are often used to parse programming languages. Unfortunately, CFGs are out of scope for this course." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | } 237 | ], 238 | "metadata": { 239 | "kernelspec": { 240 | "display_name": "Python 3", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.8.5" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 4 259 | } 260 | -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/badly_typed_code.py: -------------------------------------------------------------------------------- 1 | 2 | def calc_grade(grade:str) -> str: 3 | 4 | print(grade.capitalize()) 5 | 6 | if grade > 3.5: return 'Pass' 7 | else: return "Fail" 8 | -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/best-mommy-ever-jewelry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/best-mommy-ever-jewelry.jpg -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/class_diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/class_diff.png 
-------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/clock.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/clock.jpg -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/david_chang.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/david_chang.jpg -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/how-to-control-feedback-in-a-sound-system_header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/how-to-control-feedback-in-a-sound-system_header.jpg -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/ifelse_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/ifelse_diagram.png -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/inception.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/inception.jpg 
-------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/infinitemirror.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/infinitemirror.jpg -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/listcomprehension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/listcomprehension.png -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/loop_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/loop_diagram.png -------------------------------------------------------------------------------- /lectures/025_all_of_python_basics/images/y_combinator.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/y_combinator.jpg -------------------------------------------------------------------------------- /lectures/030_intro_to_pandas/180-pandas-operations_str_dt_apply.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | 
"import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "%reload_ext postcell\n", 22 | "%postcell register" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "tags": [] 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "names_ages_df = pd.DataFrame({'Name':['george washington', 'john adams', 'thomas jefferson', 'james madison', 'james monroe', 'andrew jackson', 'john quincy adams']\n", 34 | " , 'DOB':['2/22/1732', '10/30/1735', '4/13/1743', '3/16/1751', '4/28/1758', '3/15/1767', '7/11/1767']})\n", 35 | "names_ages_df" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "names_ages_df.dtypes" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "tags": [] 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "names_ages_df.DOB = pd.to_datetime(names_ages_df.DOB)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "tags": [] 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "names_ages_df.dtypes" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "names_ages_df.head()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Using `str` to do string operations on Pandas columns" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Given a string, such as 'george washington', normal Python will let you change case (`capitalize`, `lower`, `upper`), `split` it into tokens and do countless other operations:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "tags": [] 
101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "'george washington'.title()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "But how can you do the same thing with pandas columns?" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "names_ages_df.Name" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "names_ages_df.Name.title()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "The answer is `df.col.str`. Calling the `str` property will bring back all of core python's string functions:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "tags": [] 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "names_ages_df.Name.str.title()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "# Using `dt` to do datetime operations on Pandas columns\n", 159 | "\n", 160 | "Similar to `str`, datetime operations can be done on pandas columns via `df.col.dt`:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "names_ages_df.DOB.dt.year" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Using `apply` for any transformation\n", 179 | "\n", 180 | "Although pandas provides type specific methods via `str` and `dt`, using the `apply` function gives you much greater control.\n", 181 | "\n", 182 | "You can pass in a function (including `lambda` functions, see relevant lecture if you are not familiar) so each cell appears as a single input. 
You can then apply any transformation you like.\n", 183 | "\n", 184 | "#### Example: Change names from \"first last\" to \"Last, First\"" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "names_ages_df" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "tags": [] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def last_first(name):\n", 207 | " tokens = name.split()\n", 208 | " return f'{tokens[1].capitalize()}, {tokens[0].capitalize()}'\n", 209 | "\n", 210 | "last_first('homer simpson')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "tags": [] 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "\n", 222 | "names_ages_df.Name.apply(last_first)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "tags": [] 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "names_ages_df['Normalzied Name'] = names_ages_df.Name.apply(last_first)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "tags": [] 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "names_ages_df" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "tags": [] 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "names_ages_df.Name.apply(lambda x: f'{x.split()[1].capitalize()}')" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 3 (ipykernel)", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 
276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.12.4" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 4 287 | } 288 | -------------------------------------------------------------------------------- /lectures/030_intro_to_pandas/images/dataframes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/dataframes.jpg -------------------------------------------------------------------------------- /lectures/030_intro_to_pandas/images/series.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/series.jpg -------------------------------------------------------------------------------- /lectures/030_intro_to_pandas/images/splitapplycombine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/splitapplycombine.png -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/EBMotherboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/EBMotherboard.jpg -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/How_to_stress_test_your_CPU-Hero.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/How_to_stress_test_your_CPU-Hero.jpg -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/Laptop-hard-drive-exposed.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/Laptop-hard-drive-exposed.jpg -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/RAM-Modules.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/RAM-Modules.jpg -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/Supermicro-X12SCA-F-Overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/Supermicro-X12SCA-F-Overview.jpg -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/ascii.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/ascii.png 
-------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/calc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/calc.png -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/overview-fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/overview-fig1.png -------------------------------------------------------------------------------- /lectures/040_basic_computer_architecture/images/unicode_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/unicode_sample.png -------------------------------------------------------------------------------- /lectures/045_intro_to_numpy/images/chicago.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicago.jpeg -------------------------------------------------------------------------------- /lectures/045_intro_to_numpy/images/chicago.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicago.png 
-------------------------------------------------------------------------------- /lectures/045_intro_to_numpy/images/chicagobw.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicagobw.jpeg -------------------------------------------------------------------------------- /lectures/045_intro_to_numpy/images/chicagobw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicagobw.png -------------------------------------------------------------------------------- /lectures/050_git_version_control/Why do you need version control.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/Why do you need version control.pptx -------------------------------------------------------------------------------- /lectures/050_git_version_control/assets/copy_to_dropbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/copy_to_dropbox.png -------------------------------------------------------------------------------- /lectures/050_git_version_control/assets/folder_versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/folder_versions.png 
-------------------------------------------------------------------------------- /lectures/050_git_version_control/assets/github-desktop-screenshot-windows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/github-desktop-screenshot-windows.png -------------------------------------------------------------------------------- /lectures/050_git_version_control/assets/macgit-03-open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/macgit-03-open.png -------------------------------------------------------------------------------- /lectures/050_git_version_control/assets/share_code_email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/share_code_email.png -------------------------------------------------------------------------------- /lectures/050_git_version_control/readme.txt: -------------------------------------------------------------------------------- 1 | Convert marp presentation to pdf/power point via: 2 | https://github.com/marp-team/marp-cli -------------------------------------------------------------------------------- /lectures/050_git_version_control/understanding_version_control.md: -------------------------------------------------------------------------------- 1 | --- 2 | theme: "white" 3 | transition: "fade" 4 | highlightTheme: "dracula" 5 | center: false 6 | marp: true 7 | --- 8 | 9 | # Understanding Version Control 10 | 11 | by Shahbaz Chaudhary 12 | 13 | --- 14 | 15 | # GitHub as a code repository 16 | 
https://github.com/pandas-dev/pandas 17 | 18 | * Check out recent commits, notice the files changed and their diffs 19 | * Take a look at a user who recently submitted a commit 20 | * Check out their projects, as if you were looking to hire them 21 | 22 | Top contributors 23 | https://github.com/pandas-dev/pandas/graphs/contributors 24 | 25 | Branching off and doing work 26 | https://github.com/pandas-dev/pandas/network 27 | 28 | --- 29 | 30 | # GitHub as a way to track issues 31 | 32 | https://github.com/pandas-dev/pandas/issues 33 | 34 | * Take a look at the open issues 35 | * Take a look at closed issues and how they connect to commits 36 | 37 | --- 38 | 39 | # GitHub as your resume? 40 | 41 | * https://github.com/wesm 42 | * https://github.com/hadley 43 | 44 | --- 45 | 46 | # [Task] Let's get Git installed 47 | 48 | https://git-scm.com/downloads 49 | 50 | **Mac Users** Download git file, _right click_ and select open (don't double click) 51 | 52 | ![](assets/macgit-03-open.png) 53 | Then follow prompts and keep selecting default options 54 | 55 | Step by step instructions: https://www.linode.com/docs/development/version-control/how-to-install-git-on-linux-mac-and-windows/ 56 | 57 | --- 58 | 59 | # [Task] Configure Git (Optional) 60 | Once installed, run the following at the command line: 61 | 62 | ```git config --global user.name examplename``` 63 | 64 | ```git config --global user.email user@example.com``` 65 | 66 | --- 67 | 68 | # [Task] Install GitHub's software 69 | 70 | https://desktop.github.com/ 71 | 72 | ![](assets/github-desktop-screenshot-windows.png) 73 | 74 | --- 75 | 76 | # [Task] Install VS Code 77 | 78 | https://code.visualstudio.com/ 79 | 80 | * Install extension "python" (from Microsoft) 81 | 82 | --- 83 | 84 | # What is the difference between Git and GitHub? 85 | 86 | CVS -> SVN -> Git 87 | 88 | --- 89 | 90 | # [Task] Create a GitHub account 91 | 92 | https://github.com/ 93 | 94 | This will be your resume for many _years_, pick a good name. 
This is an example of a very bad name: 95 | https://github.com/falconair 96 | 97 | --- 98 | 99 | # [Task] Send me your GitHub user names (NOT passwords!) 100 | 101 | https://forms.gle/wgwV1ztzFqD1Bz9x7 102 | 103 | --- 104 | 105 | # Show Leadership 106 | 107 | Data science departments are 5-15 years behind software engineering in professional infrastructure. Be a leader in your field by learning from programmers. 108 | 109 | --- 110 | 111 | # Professional work vs hacking 112 | 113 | Professional work requires more than coding and building models. Here are some _extra_ things you need to do as a professional (which are within scope of this presentation): 114 | 115 | * Your work must be backed up. Losing your laptop must not mean losing your work 116 | * You must be able to go back to an older version of your work 117 | * You must be able to collaborate with your team-mates, without stepping on each other's toes 118 | 119 | --- 120 | 121 | # How do you keep track of working model vs experiments? 122 | 123 | ![](assets/folder_versions.png) 124 | 125 | --- 126 | 127 | # How do you protect against a broken laptop? 128 | 129 | ![](assets/copy_to_dropbox.png) 130 | (src: https://www.labnol.org/software/send-files-to-dropbox/18438/) 131 | 132 | --- 133 | 134 | # How do you collaborate with your colleagues? 
135 | 136 | ![](assets/share_code_email.png) 137 | 138 | --- 139 | 140 | # [Task] Create a new "repo" (repository) 141 | * Create a new GitHub repository (upper right hand corner) 142 | * Call it "PfA_test" 143 | * Keep it public 144 | * Check "Initialize this repository with a README" 145 | * Click "Create repository" 146 | * Use command line or Desktop app to "clone" the repo to your disk 147 | 148 | ```git clone https://github.com/<username>/PfA_test.git``` 149 | 150 | --- 151 | 152 | # [Task] Update something in your project 153 | 154 | Add this to readme\.md (try using vs code) 155 | ```csv 156 | # My name is Shahbaz 157 | 158 | ## This is a git experiment 159 | 160 | This is some random text 161 | ``` 162 | 163 | --- 164 | 165 | # [Task] _Commit_ your code to your _local_ repo 166 | 167 | Using VS Code, GitHub Desktop or command line, "commit" your code 168 | 169 | To "commit" your code means to tell Git to start keeping track of it. 170 | 171 | Command line 172 | 173 | ```git commit -m "Adds content to readme file"``` 174 | 175 | --- 176 | 177 | Your code is now "saved," along with a text describing the change. No need for multiple folders. 
178 | 179 | --- 180 | 181 | # [Task] _Push_ this change to GitHub 182 | 183 | Using VS Code, GitHub Desktop or command line, "push" your code to GitHub 184 | 185 | Command line 186 | 187 | ```git push``` 188 | 189 | --- 190 | 191 | Your code is now 'backed-up' at a remote location 192 | 193 | --- 194 | 195 | # [Task] Update my readme.md file and add your name 196 | 197 | * Clone my repo `<repo URL>` 198 | * Open my readme.md file using VS Code 199 | * Add your name to the list (don't remove anyone else's name) 200 | * Commit code and push it 201 | 202 | (You may get merge conflicts, VS Code makes it easier) 203 | 204 | --- 205 | 206 | You have now learned how to 207 | * save various versions of your code 208 | * back up your code 209 | * collaborate with your colleagues 210 | 211 | --- 212 | 213 | A couple of tools you should be aware of: 214 | * Diff 215 | * nb_diff 216 | 217 | --- 218 | 219 | # References 220 | 221 | * Mac open screenshot from https://www.linode.com/docs/development/version-control/how-to-install-git-on-linux-mac-and-windows/ -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/100 - Work With Taxi Trips - memory_map.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# memory_map test" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "7.32 s ± 186 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "7.17 s ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False, memory_map=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "59.2 s ± 88.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "59.6 s ± 180 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False, memory_map=True)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.7.3" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/110 - Work With Taxi Trips - compression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# compression test" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "7.22 s ± 25.4 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "9.97 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv.gz\", low_memory=False)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "1min 2s ± 770 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "1min 18s ± 679 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv.gz\", low_memory=False)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.7.3" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/120 - Work With Taxi Trips - feather format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chunking and feather format test" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Feather format, and associated technology called _arrow_ is created by Wes McKiney and Hadley Wickham. *LOTS* of new and interesting infrastructure is being built around it. However, it is very new and lots of tooling is missing." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### You may have to install pyarrow to access to_feather and read_feather functionality" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#!conda install --y pyarrow -c conda-forge" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import pyarrow as pa\n", 41 | "from pyarrow import csv" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Notice that read_feather is _MUCH_ faster than read_csv" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "CPU times: user 56.7 s, sys: 5.26 s, total: 1min 1s\n", 61 | "Wall time: 1min 1s\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "%time data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "CPU times: user 5.52 s, sys: 7.2 s, total: 12.7 s\n", 79 | "Wall time: 17 s\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%time data_df.to_feather(\"../../datasets/taxi-trips/taxi_trips_small.feather\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "CPU times: user 6.67 s, sys: 2.49 s, total: 9.16 s\n", 97 | "Wall time: 9.34 s\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "%time data_df = pd.read_feather(\"../../datasets/taxi-trips/taxi_trips_small.feather\")" 103 | ] 104 
| }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.7.3" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/135 - Work With Taxi Trips - skip columns pre-req.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### How big is your computer?" 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Memory:\t 31.2348 Gigabytes\n", 31 | "Disk:\t 111.4335 Gigabytes\n", 32 | "Cpus:\t 4\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "from psutil import virtual_memory, disk_usage, cpu_count\n", 38 | "\n", 39 | "bytes_in_gb = 1024**3\n", 40 | "\n", 41 | "print(\"Memory:\\t\",round(virtual_memory().total/bytes_in_gb,4), \"Gigabytes\")\n", 42 | "print(\"Disk:\\t\",round(disk_usage(os.path.abspath(os.sep)).total/bytes_in_gb,4), \"Gigabytes\")\n", 43 | "print(\"Cpus:\\t\", cpu_count())" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Let's test the speed of your computer" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "14.2 ms ± 86.7 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "%%timeit\n", 68 | "sum(range(1_000_000))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Why you shouldn't keep your integers as strings" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 27, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "dtype('int64')" 87 | ] 88 | }, 89 | "execution_count": 27, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "np.array([1]).dtype" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 18, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "(8, 4)" 107 | ] 108 | }, 109 | "execution_count": 18, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "np.array([1]).itemsize, np.array([\"1\"]).itemsize" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 19, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "(8, 8)" 127 | ] 128 | }, 129 | "execution_count": 19, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "np.array([10]).itemsize, np.array([\"10\"]).itemsize" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 20, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "(8, 12)" 147 | ] 148 | }, 149 | "execution_count": 20, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "np.array([100]).itemsize, np.array([\"100\"]).itemsize" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 21, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "(8, 16)" 167 | ] 168 | }, 169 | "execution_count": 21, 170 | 
"metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "np.array([1000]).itemsize, np.array([\"1000\"]).itemsize" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 22, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "(8, 40)" 187 | ] 188 | }, 189 | "execution_count": 22, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "#Seconds since epoch\n", 196 | "np.array([1565232961]).itemsize, np.array([\"1565232961\"]).itemsize" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.7.3" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 2 228 | } 229 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/140B - Work With Taxi Trips - skip columns.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 35, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "CPU times: user 41min 4s, sys: 4.24 s, total: 41min 9s\n", 23 | "Wall time: 41min 8s\n" 24 | ] 25 | } 26 | ], 
27 | "source": [ 28 | "schema = {'Trip ID': 'str',\n", 29 | " 'Taxi ID': 'category',\n", 30 | " 'Trip Start Timestamp': 'str',\n", 31 | " 'Trip End Timestamp': 'str',\n", 32 | " 'Trip Seconds': 'float64',\n", 33 | " 'Trip Miles': 'float64',\n", 34 | " 'Pickup Census Tract': 'float64',\n", 35 | " 'Dropoff Census Tract': 'float64',\n", 36 | " 'Pickup Community Area': 'float64',\n", 37 | " 'Dropoff Community Area': 'float64',\n", 38 | " 'Fare': 'float64',\n", 39 | " 'Tips': 'float64',\n", 40 | " 'Tolls': 'float64',\n", 41 | " 'Extras': 'float64',\n", 42 | " 'Trip Total': 'float64',\n", 43 | " 'Payment Type': 'category',\n", 44 | " 'Company': 'category',\n", 45 | " 'Pickup Centroid Latitude': 'float64',\n", 46 | " 'Pickup Centroid Longitude': 'float64',\n", 47 | " 'Pickup Centroid Location': 'str',\n", 48 | " 'Dropoff Centroid Latitude': 'float64',\n", 49 | " 'Dropoff Centroid Longitude': 'float64',\n", 50 | " 'Dropoff Centroid Location': 'str',\n", 51 | " 'Community Areas': 'float64'}\n", 52 | "\n", 53 | "usecols = ['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp',\\\n", 54 | " 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract',\\\n", 55 | " 'Dropoff Census Tract', 'Pickup Community Area',\\\n", 56 | " 'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras',\\\n", 57 | " 'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude',\\\n", 58 | " 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',\\\n", 59 | " 'Dropoff Centroid Longitude', 'Community Areas']\n", 60 | "\n", 61 | "%time data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\" \\\n", 62 | " , dtype = schema \\\n", 63 | " , usecols = usecols \\\n", 64 | " , parse_dates = ['Trip Start Timestamp', 'Trip End Timestamp'])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 36, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Total memory usage: 2.59 GB\n", 77 | "CPU 
times: user 5.27 s, sys: 3 µs, total: 5.27 s\n", 78 | "Wall time: 5.29 s\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%time print(\"Total memory usage:\", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), \"GB\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.7.3" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/150 - Work With Taxi Trips - c_parser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# C parser test" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "7.62 s ± 42.5 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "7.55 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False, engine='c')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "1min 2s ± 50.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "1min 2s ± 60.3 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False, engine='c')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.7.3" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/160 - Work With Taxi Trips - Chunk to parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "from tqdm import tqdm, tqdm_notebook, tnrange\n", 12 | "tqdm.pandas()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "### Go through a large csv and convert to parquet" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "schema = {'Trip ID': 'str',\n", 29 | " 'Taxi ID': 'category',\n", 30 | " 'Trip Start Timestamp': 'str',\n", 31 | " 'Trip End Timestamp': 'str',\n", 32 | " 'Trip Seconds': 'float64',\n", 33 | " 'Trip Miles': 'float64',\n", 34 | " 'Pickup Census Tract': 'float64',\n", 35 | " 'Dropoff Census Tract': 'float64',\n", 36 | " 'Pickup Community Area': 'float64',\n", 37 | " 
'Dropoff Community Area': 'float64',\n", 38 | " 'Fare': 'float64',\n", 39 | " 'Tips': 'float64',\n", 40 | " 'Tolls': 'float64',\n", 41 | " 'Extras': 'float64',\n", 42 | " 'Trip Total': 'float64',\n", 43 | " 'Payment Type': 'category',\n", 44 | " 'Company': 'category',\n", 45 | " 'Pickup Centroid Latitude': 'float64',\n", 46 | " 'Pickup Centroid Longitude': 'float64',\n", 47 | " 'Pickup Centroid Location': 'str',\n", 48 | " 'Dropoff Centroid Latitude': 'float64',\n", 49 | " 'Dropoff Centroid Longitude': 'float64',\n", 50 | " 'Dropoff Centroid Location': 'str',\n", 51 | " 'Community Areas': 'float64'}\n", 52 | "\n", 53 | "usecols = ['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp',\\\n", 54 | " 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract',\\\n", 55 | " 'Dropoff Census Tract', 'Pickup Community Area',\\\n", 56 | " 'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras',\\\n", 57 | " 'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude',\\\n", 58 | " 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',\\\n", 59 | " 'Dropoff Centroid Longitude', 'Community Areas']\n", 60 | "\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": { 67 | "scrolled": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "114it [7:06:22, 163.28s/it]" 75 | ] 76 | }, 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "CPU times: user 7h 6min 33s, sys: 1min 3s, total: 7h 7min 36s\n", 82 | "Wall time: 7h 6min 22s\n" 83 | ] 84 | }, 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "%%time\n", 95 | "chunk_size = 1_000_000\n", 96 | "for chunk_counter, chunk_df in enumerate(tqdm(pd.read_csv(\"../../datasets/taxi-trips/Taxi_Trips.csv.gz\" \\\n", 97 | " , dtype = schema \\\n", 98 | " , usecols = usecols \\\n", 99 | " , parse_dates = ['Trip Start 
Timestamp', 'Trip End Timestamp'] \\\n", 100 | " , compression = \"gzip\"\\\n", 101 | " , chunksize=chunk_size))):\n", 102 | " OUTFILE = \"../../datasets/taxi-trips/taxi_trips_parquet/taxi_trips_\"+str(chunk_counter)+\".parquet\"\n", 103 | " chunk_df.to_parquet(OUTFILE, compression='gzip')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.3" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /lectures/055_bigger_data_pandas/lecture.md: -------------------------------------------------------------------------------- 1 | --- 2 | theme: "white" 3 | transition: "fade" 4 | marp: true 5 | highlightTheme: "dracula" 6 | --- 7 | 8 | # Handing bigger data with Pandas 9 | 10 | --- 11 | 12 | ## General guidelines for exploring big data tool: 13 | * If data fits in memory, use Pandas/R/Excel 14 | * If data fits on disk, use a database 15 | * If data is bigger than a disk drive, use Hadoop 16 | 17 | --- 18 | 19 | ## Well known maxims in computer science: 20 | *"Premature optimization is the root of all evil"* - Don Knuth 21 | 22 | *"Developers themselves highlight the fact that those doing research should exercise caution when using such microbenchmarks"* - Wikipedia article about the benchmark game 23 | 24 | 25 | *"You don't have to be an engineer to be be a racing driver, but you do have to have Mechanical Sympathy."* Jackie Stewart, racing 
driver - Quoted by Martin Thompson 26 | 27 | --- 28 | 29 | ## A very important insight into understanding perfrmance issues: 30 | The pyramid of latency varies by orders of magnitude 31 | 32 | --- 33 | 34 | ## Latency numbers every programmer should know 35 | (source: https://gist.github.com/hellerbarde/2843375) 36 | (originally by Jeff Dean) 37 | 38 | ```a 39 | L1 cache reference ......................... 0.5 ns 40 | Branch mispredict ............................ 5 ns 41 | L2 cache reference ........................... 7 ns 42 | Mutex lock/unlock ........................... 25 ns 43 | Main memory reference ...................... 100 ns 44 | Compress 1K bytes with Zippy ........ 3,000 ns = 3 µs 45 | Send 2K bytes over 1 Gbps network .. 20,000 ns = 20 µs 46 | SSD random read ................... 150,000 ns = 150 µs 47 | Read 1 MB sequentially from memory 250,000 ns = 250 µs 48 | Round trip within same datacenter . 500,000 ns = 0.5 ms 49 | Read 1 MB sequentially from SSD* 1,000,000 ns = 1 ms 50 | Disk seek ...................... 10,000,000 ns = 10 ms 51 | Read 1 MB sequentially from disk 20,000,000 ns = 20 ms 52 | Send packet CA->Holland->CA ... 
150,000,000 ns = 150 ms 53 | ``` 54 | 55 | --- 56 | 57 | #### In human terms (multiply above numbers by a billion) 58 | 59 | ```a 60 | 61 | -L1 cache reference 0.5 s 62 | One heart beat (0.5 s) 63 | -Branch mispredict 5 s 64 | Yawn 65 | -L2 cache reference 7 s 66 | Long yawn 67 | -Mutex lock/unlock 25 s 68 | Making a coffee 69 | -Main memory reference 100 s 70 | Brushing your teeth 71 | -Compress 1K bytes with Zippy 50 min 72 | One episode of a TV show (including ad breaks) 73 | -Send 2K bytes over 1 Gbps network 5.5 hr 74 | From lunch to end of work day 75 | -SSD random read 1.7 days 76 | A normal weekend 77 | -Read 1 MB sequentially from memory 2.9 days 78 | A long weekend 79 | -Round trip within same datacenter 5.8 days 80 | A medium vacation 81 | -Read 1 MB sequentially from SSD 11.6 days 82 | Waiting for almost 2 weeks for a delivery 83 | -Disk seek 16.5 weeks 84 | A semester in university 85 | -Read 1 MB sequentially from disk 7.8 months 86 | Almost producing a new human being 87 | -The above 2 together 1 year 88 | -Send packet CA->Netherlands->CA 4.8 years 89 | Average time it takes to complete a bachelor's degree 90 | ``` 91 | 92 | --- 93 | 94 | Luckily _amortization_ saves us. You don't actually look up individual integers from disk each time, you read a chunk of of data on disk and read it into memory. Thereby _amortizing_ the cost of a disk access over many, many reads. 95 | 96 | --- 97 | 98 | Check memory prices 99 | 100 | --- 101 | 102 | ## Let's look at a real-world file: Chicago Taxi Data 103 | https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/ 104 | 105 | Size of full file...no idea, it is simply too big! 106 | 107 | --- 108 | 109 | ### My email to data providers: 110 | 111 | Subject: *Can you please, for the love of God, show how big a file is before we download it?* 112 | 113 | ... 114 | I’m sitting at 30 gigs right now for the taxi data and I have no idea how much more needs to be downloaded. 115 | ... 
116 | 117 | Response: 118 | There is a bit of complexity to this but, in the end, these would have to be feature changes in the software we use. ... 119 | 120 | --- 121 | 122 | ### Size of (partially downloaded) file (and subsets): 123 | 124 | ```a 125 | Compressed: 126 | Size of Taxi_Trips.csv.gz: 14G 127 | Size of taxi_trips_small.csv.gz: 1G 128 | Size of taxi_trips_smaller.csv.gz 154M 129 | 130 | Uncompressed: 131 | Size of Taxi_Trips.csv: ???? 132 | Size of taxi_trips_small.csv: 3.4G 133 | Size of taxi_trips_smaller.csv: 456M 134 | 135 | Lines in Taxi_Trips.csv.gz: 113,115,259 (100 million) 136 | Lines in taxi_trips_small.csv.gz: 11,311,525 (11 million) 137 | Lines in taxi_trips_smaller.csv.gz: 1,131,152 (1 million) 138 | ``` 139 | 140 | --- 141 | 142 | ### Notebooks 143 | 144 | * 050 - Work With Taxi Trips - Get to know the file.ipynb 145 | * 100 - Work With Taxi Trips - memory_map.ipynb 146 | * 110 - Work With Taxi Trips - compression.ipynb 147 | * 150 - Work With Taxi Trips - c_parser.ipynb 148 | 149 | * 120 - Work With Taxi Trips - feather format.ipynb 150 | * 130 - Work With Taxi Trips - chunking and tqdm.ipynb 151 | * 160 - Work With Taxi Trips - Chunk to parquet.ipynb 152 | * 160B - Work With Taxi Trips - Read from parquet files.ipynb 153 | 154 | * 135 - Work With Taxi Trips - skip columns pre-req.ipynb 155 | * 140 - Work With Taxi Trips - skip columns.ipynb 156 | * 140B - Work With Taxi Trips - skip columns.ipynb 157 | 158 | --- 159 | 160 | ## Bonus material: Bash commandline as a super-power 161 | * `ls -ltrhc` to get the size of the file 162 | * `cat` to see the contents of a file 163 | * `zcat` to see the contents of a compressed file 164 | * `head -n` or `tail` to see the first or last few lines 165 | * `wc -l` to count the number of lines in a text file 166 | * `cut -d -f` to retrieve specific columns 167 | * `tr` or `sed` to replace one character or string with another 168 | 
-------------------------------------------------------------------------------- /lectures/060_learn_command_line/learn_command_line_2.md: -------------------------------------------------------------------------------- 1 | 2 | # **Introduction to the Linux Command Line** 3 | 4 | Your computer has a file system, organized in a heirarchy. 5 | 6 | ### Windows file system looks something like this: 7 | 8 | ``` 9 | c:\ 10 | ... 11 | Program Files 12 | Program Files (x86) 13 | Windows 14 | Users 15 | Public 16 | shahbaz 17 | Desktop 18 | Documents 19 | Pictures 20 | .gitconfig 21 | ... 22 | ``` 23 | 24 | ### Mac users' file system looks something like this 25 | 26 | ``` 27 | / 28 | ... 29 | tmp/ 30 | Applications/ 31 | Library/ 32 | Users/ 33 | shahbaz/ 34 | Applications/ 35 | Desktop/ 36 | Downloads/ 37 | ... 38 | ``` 39 | 40 | ### Linux users' file system looks something like this 41 | 42 | ``` 43 | / 44 | ... 45 | tmp/ 46 | bin/ 47 | opt/ 48 | home/ 49 | shahbaz/ 50 | ``` 51 | 52 | 53 | ## **Week 1: Exploring the Command Line** 54 | ### **1. Connecting to a Remote Machine** 55 | - Check network connectivity with `ping`: 56 | ```bash 57 | ping google.com 58 | ``` 59 | 60 | - Use `ssh` to log in: 61 | ```bash 62 | ssh -l 63 | ``` 64 | - **Example:** `ssh -l student 192.168.1.10` 65 | 66 | ### **2. Basic Navigation** 67 | 68 | A path tells your a location in a file system: `/home/shahbaz/myfile.txt` tells you that you can access the file `myfile.txt` by navigating to the "top" `/`, then changing directory to `home`, then to `shahbaz`. 69 | 70 | Notice that there is similar to a web url: `https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/` 71 | 72 | 73 | 74 | - **Commands**: 75 | - `pwd` – Print working directory. 76 | - `ls` – List files and directories. 77 | - `cd` – Change directory. 78 | - `tree` – Show directory structure (if installed). 79 | - Explore: 80 | - Use `ls -a` to see hidden files (`.bashrc`, `.profile`). 
81 | - Understand paths: 82 | - `.` – Current directory 83 | - `..` – Parent directory 84 | - `~` – Home directory 85 | - Absolute vs. Relative paths. 86 | 87 | ### **3. Viewing File Contents** 88 | - **Commands**: 89 | - `head ` – View the first few lines. 90 | - `cat ` – Show entire file (be cautious with large files). 91 | - `less ` – Paginated view with navigation. 92 | 93 | ### **Exercise** 94 | - Navigate to `/opt/mleng_class/datasets/names` 95 | - What files do you see in that directory? 96 | - Take a look at the first few male and female names 97 | - Count the number of lines in `male.txt` and `female.txt` files (hint, use `wc -l`) 98 | 99 | ### **4. Getting Help** 100 | - **Options**: 101 | - `--help` or `-h` with most commands. 102 | - `man ` for the manual page. 103 | 104 | ### **5. Searching and Filtering** 105 | - **Commands**: 106 | - `which ` – Find command location. 107 | - `find -name ` – Search for a file called `` "under" the path ``. 108 | - `grep ` – Look for the text (or pattern) `` in a file called ``. 109 | - Use `grep -i` to ignore case. 110 | - Combine with regex for advanced filtering. 111 | - Combine commands with pipes (`|`): <== A SUPER POWER! 112 | ```bash 113 | cat file.txt | grep "pattern" | head 114 | ``` 115 | 116 | ### **Exercise** 117 | - Navigate to `/opt/mleng_class/datasets/names` 118 | - In the file `male.txt`, find all names which contains "Quin" 119 | - Make sure the list includes 'Joaquin' and "Quincy" (among others) 120 | 121 | --- 122 | 123 | ## **Week 2: Moving and Editing Files** 124 | ### **1. File Operations** 125 | - **Commands**: 126 | - `mkdir` – Create directories. 127 | - `cp` – Copy files. 128 | - `mv` – Move or rename files. 129 | - `rm` – Remove files or directories (`rm -rf` for directories). 130 | - `wget`/`curl` – Download files. 131 | - `tar`/`gzip`/`unzip` – Compress and extract files. 132 | 133 | ### **2. 
Editing Files** 134 | - Use `nano` for simple edits: 135 | - Save: `Ctrl + O` 136 | - Exit: `Ctrl + X` 137 | 138 | ### **3. Environment and Configuration** 139 | - View or edit: 140 | - `.bashrc`, `.profile` 141 | - Add `export PS1="[\u@\h \W]\$ "` to modify the prompt. 142 | - **Environment Variables**: 143 | - View with `printenv`. 144 | - Common examples: 145 | - `$HOME`, `$PATH` 146 | 147 | ### **4. Diffing Files** 148 | - Compare files: 149 | ```bash 150 | diff file1 file2 151 | ``` 152 | 153 | ### **5. `find` files** 154 | - Compare files: 155 | ```bash 156 | find . | grep file_pattern 157 | ``` 158 | 159 | ### **6. Automating with Loops** 160 | - Process multiple files: 161 | ```bash 162 | for file in *.txt; do echo $file; done 163 | ``` 164 | 165 | --- 166 | 167 | ## **Week 3: Managing Processes** 168 | ### **1. System Info** 169 | - **Commands**: 170 | - `uname -a` – OS info. 171 | - `cat /etc/os-release` – Linux distribution. 172 | - `free -h` – Memory. 173 | - `df -h` – Disk space. 174 | 175 | ### **2. Monitoring Processes** 176 | - **Commands**: 177 | - `ps` – View processes. 178 | - `top` or `htop` – Interactive monitoring. 179 | 180 | ### **3. Managing Processes** 181 | - Start a background process: 182 | ```bash 183 | long_running_command & 184 | ``` 185 | - Control processes: 186 | - `Ctrl+C` – Terminate. 187 | - `Ctrl+Z` – Suspend. 188 | - `bg` – Resume in the background. 189 | - `fg` – Bring back to the foreground. 190 | - `kill -9 ` – Force kill. 191 | 192 | ### **4. Redirecting Output** 193 | - Save output to a file: 194 | ```bash 195 | command > output.txt 196 | ``` 197 | - Append: 198 | ```bash 199 | command >> output.txt 200 | ``` 201 | 202 | --- 203 | 204 | ## **Week 4: Advanced Concepts** 205 | ### **1. Advanced Navigation** 206 | - `cd -` – Return to the last directory. 207 | - Use `tree` for visualization. 208 | 209 | ### **2. Job Management** 210 | - `jobs` – View running background jobs. 
211 | - Use `nohup` to ensure long-running jobs persist: 212 | ```bash 213 | nohup command & 214 | ``` 215 | 216 | ### **3. Regular Expressions** 217 | - Basics: 218 | - `^` – Start of a line. 219 | - `$` – End of a line. 220 | - `.` – Any character. 221 | - `*` – Zero or more occurrences. 222 | - Combine with `grep`, `sed`, or `awk`. 223 | 224 | --- 225 | 226 | ## **Week 5: Capstone** 227 | ### **Scenario**: 228 | 1. SSH into the remote machine. 229 | 2. Use `find`, `grep`, and `nano` to edit configuration files. 230 | 3. Start a long-running job with output redirected to a log file. 231 | 4. Monitor resources with `top` or `htop`. 232 | 5. Schedule periodic tasks with `crontab`: 233 | ```bash 234 | crontab -e 235 | ``` 236 | Example entry: 237 | ```bash 238 | 0 3 * * * /path/to/script.sh 239 | ``` 240 | 241 | --- 242 | With help from ChatGPT (free version). -------------------------------------------------------------------------------- /lectures/065_secret_lives_of_text_files/howareyou_english.txt: -------------------------------------------------------------------------------- 1 | How are you? -------------------------------------------------------------------------------- /lectures/065_secret_lives_of_text_files/howareyou_english_multiple_lines.txt: -------------------------------------------------------------------------------- 1 | How are 2 | you? -------------------------------------------------------------------------------- /lectures/065_secret_lives_of_text_files/howareyou_not_english.txt: -------------------------------------------------------------------------------- 1 | How âre you? 
-------------------------------------------------------------------------------- /lectures/070_scikit_learn/110-scikit-learn-run_saved_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading scikit-learn's saved model from disk" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The following code shows how to load a saved model from disk. In a real-world scenario, such model will be loaded from flask (or more production grade server)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "#### Create Test data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "X_test = pd.DataFrame(np.array([[1, 15.0, 0, 1, 211.3375, False, False],\n", 41 | " [3, 7.0, 4, 1, 29.125, False, False],\n", 42 | " [2, 33.0, 0, 2, 26.0, False, False],\n", 43 | " [2, 14.0, 1, 0, 30.0708, False, False],\n", 44 | " [3, 21.0, 0, 0, 8.05, True, True]]), columns=['pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male', 'alone'])\n", 45 | "\n", 46 | "y_test = np.array([1, 0, 1, 1, 0])" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "X_test" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "X_test.iloc[0:1,:].values" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "y_test" 74 | ] 75 | }, 76 | { 77 | "cell_type": 
"markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "#### Load model from disk" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from joblib import dump, load" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "trained_model = load('model.joblib')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "trained_model.predict(X_test.iloc[0:1,:])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "trained_model.score(X_test, y_test)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.8.5" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /lectures/070_scikit_learn/model_server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from joblib import load 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # Load trained model 8 | trained_model = load('model.joblib') 9 | 10 | 11 | # Flask server setup 12 | app = Flask(__name__) 13 | 14 | @app.route('/') 15 | def hello_world(): 16 | 
return 'Hello, World!' 17 | 18 | @app.route('/model') 19 | def serve_model(): 20 | args = request.args 21 | 22 | print(args) 23 | 24 | pclass, age, sibsp, parch, fare, adult_male, alone = float(args.get('pclass')), float(args.get('age')), float(args.get('sibsp')), float(args.get('parch')), float(args.get('fare')), float(args.get('adult_male')), float(args.get('alone')) 25 | 26 | print(pclass, age, sibsp, parch, fare, adult_male, alone) 27 | 28 | input_array = np.array([[pclass, age, sibsp, parch, fare, adult_male, alone]]) 29 | 30 | print(input_array) 31 | 32 | predicted = trained_model.predict(input_array) 33 | return args, predicted 34 | 35 | if __name__ == "__main__": 36 | app.run() 37 | 38 | # Test as: 39 | # http://localhost:5000/model?pclass=1&age=15&sibsp=0&parch=1&fare=211.3375&adult_male=0&alone=0 40 | # 1. , 15. , 0. , 1. , 211.3375, 0. ,0. -------------------------------------------------------------------------------- /lectures/075_web_services/120-bank_churners_classifier_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Bank Churners Classifier Model 5 | 6 | # In[3]: 7 | 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | #pre-training 15 | from sklearn.model_selection import train_test_split 16 | from sklearn.preprocessing import OneHotEncoder 17 | from sklearn.compose import make_column_transformer 18 | 19 | #training 20 | from sklearn import ensemble 21 | from sklearn import pipeline 22 | 23 | 24 | #post training 25 | from sklearn.metrics import accuracy_score 26 | from joblib import dump 27 | 28 | 29 | # #### Read data 30 | 31 | # In[ ]: 32 | 33 | 34 | data_df = pd.read_csv('../../datasets/credit-card-customers/BankChurners.zip') 35 | data_df.shape 36 | 37 | 38 | # In[ ]: 39 | 40 | 41 | data_df.head() 42 | 43 | 44 | # In[ ]: 45 | 46 | 47 | data_df.columns 48 | 49 | 50 | # In[ ]: 51 | 52 | 53 | 
data_df.isna().sum() 54 | 55 | 56 | # #### Remove columns which should not go into the model 57 | 58 | # In[ ]: 59 | 60 | 61 | data_df.drop([ 62 | 'CLIENTNUM', 63 | 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 64 | 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2' 65 | ], axis=1, inplace=True) 66 | 67 | 68 | # #### Convert categorical columns 69 | 70 | # In[ ]: 71 | 72 | 73 | #https://medium.com/@sami.yousuf.azad/one-hot-encoding-with-pandas-dataframe-49a304e8507a 74 | CATEGORICAL_COLS = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', ] 75 | col_transformer = make_column_transformer( 76 | (OneHotEncoder(), CATEGORICAL_COLS), 77 | remainder='passthrough') 78 | 79 | transformed = col_transformer.fit_transform(data_df) 80 | 81 | transformed_df = pd.DataFrame(transformed, columns=col_transformer.get_feature_names_out()) 82 | 83 | 84 | # In[ ]: 85 | 86 | 87 | transformed_df.head() 88 | 89 | 90 | # In[ ]: 91 | 92 | 93 | transformed_df.columns 94 | 95 | 96 | # #### Build model 97 | 98 | # In[ ]: 99 | 100 | 101 | X_train, X_test, y_train, y_test = train_test_split( 102 | data_df.drop(['Attrition_Flag'], axis=1) 103 | , data_df.Attrition_Flag 104 | , random_state=1) 105 | 106 | 107 | # In[ ]: 108 | 109 | 110 | pipe = pipeline.make_pipeline( 111 | col_transformer 112 | ,ensemble.RandomForestClassifier(n_estimators=100, min_samples_split=2) # <== Classifier 113 | ) 114 | 115 | 116 | # In[ ]: 117 | 118 | 119 | #%%time 120 | pipe.fit(X_train, y_train) 121 | 122 | y_predict = pipe.predict(X_test) 123 | pipe.score(X_test, y_test) 124 | 125 | 126 | # In[ ]: 127 | 128 | 129 | pipe 130 | 131 | 132 | # In[ ]: 133 | 134 | 135 | #pd.DataFrame({'feature':X_train.columns, 'importance':pipe.feature_importances_}).sort_values(by='importance') 136 | 137 | 138 | # #### Save model 139 | 
140 | # In[ ]: 141 | 142 | 143 | #%%time 144 | dump(pipe, 'bank_churners_classifier_model.joblib') 145 | 146 | 147 | # In[ ]: 148 | 149 | 150 | #%ls 151 | 152 | 153 | # #### Read model 154 | 155 | # In[ ]: 156 | 157 | 158 | from joblib import load 159 | 160 | 161 | # In[ ]: 162 | 163 | 164 | trained_model = load('bank_churners_classifier_model.joblib') 165 | 166 | 167 | # In[ ]: 168 | 169 | 170 | trained_model.feature_names_in_ 171 | 172 | 173 | # The following columns are categorical 174 | 175 | # In[ ]: 176 | 177 | 178 | CATEGORICAL_COLS 179 | 180 | 181 | # In[ ]: 182 | 183 | 184 | for col in CATEGORICAL_COLS: 185 | print(col, data_df[col].unique()) 186 | 187 | 188 | # In[ ]: 189 | 190 | 191 | test_data_df = pd.Series({ 192 | 'Customer_Age' : 30, 193 | 'Gender' : 'M', 194 | 'Dependent_count': 3, 195 | 'Education_Level': 'Graduate', 196 | 'Marital_Status' : 'Single', 197 | 'Income_Category': '$40K - $60K', 198 | 'Card_Category' : 'Blue', 199 | 'Months_on_book' : 5, 200 | 'Total_Relationship_Count' : 3, 201 | 'Months_Inactive_12_mon' : 1, 202 | 'Contacts_Count_12_mon' : 2, 203 | 'Credit_Limit' : 34000, 204 | 'Total_Revolving_Bal' : 40000, 205 | 'Avg_Open_To_Buy' : 200, 206 | 'Total_Amt_Chng_Q4_Q1' : 34, 207 | 'Total_Trans_Amt' : 500, 208 | 'Total_Trans_Ct' : 3, 209 | 'Total_Ct_Chng_Q4_Q1' : 23, 210 | 'Avg_Utilization_Ratio' : .1 211 | }).to_frame().T 212 | 213 | 214 | # In[ ]: 215 | 216 | 217 | test_data_df 218 | 219 | 220 | # In[ ]: 221 | 222 | 223 | test_data_df.columns 224 | 225 | 226 | # In[ ]: 227 | 228 | 229 | trained_model.predict(test_data_df) 230 | 231 | 232 | # In[ ]: 233 | 234 | 235 | trained_model.classes_ 236 | 237 | 238 | # In[ ]: 239 | 240 | 241 | trained_model.predict_proba(test_data_df) 242 | 243 | 244 | # ### Convert this notebook to .py 245 | # Some students having trouble reading the model so they can run a .py file in their own enviornment and generate the model file using the same env as their web services code 246 | 247 | # In[ ]: 248 | 249 | 
250 | get_ipython().system('jupyter nbconvert --to python 120-bank_churners_classifier_model.ipynb') 251 | 252 | 253 | # In[ ]: 254 | 255 | 256 | 257 | 258 | -------------------------------------------------------------------------------- /lectures/075_web_services/130-load_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3372311b-1834-4898-af06-ec7721242ba8", 6 | "metadata": {}, 7 | "source": [ 8 | "# Load test\n", 9 | "Simulate a large number of clients accessing your web site or service" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "515ab2bb-9962-46ee-9172-bfe05efaed3c", 15 | "metadata": {}, 16 | "source": [ 17 | "### A sample web service\n", 18 | "Ray serve is a component which makes it easy to spread the serving of an API across several machines. Let's jump into code." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "4d8d94d5-d873-49ac-98d3-a85abb296315", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "%%writefile simple_api.py\n", 29 | "\n", 30 | "from fastapi import FastAPI\n", 31 | "from typing import Dict\n", 32 | "\n", 33 | "app = FastAPI()\n", 34 | "\n", 35 | "@app.get(\"/status\")\n", 36 | "def status() -> Dict[str, str]:\n", 37 | " \"\"\"Simple health check endpoint.\"\"\"\n", 38 | " return {\"status\": \"ok\"}\n", 39 | "\n", 40 | "\n", 41 | "@app.get(\"/compute\")\n", 42 | "def fibonacci(n: int):\n", 43 | " \"\"\"Compute Fibonacci sequence up to n (inclusive).\"\"\"\n", 44 | " if n <= 0:\n", 45 | " return []\n", 46 | " fib = [0, 1]\n", 47 | " while fib[-1] + fib[-2] <= n:\n", 48 | " fib.append(fib[-1] + fib[-2])\n", 49 | " return fib\n", 50 | "\n", 51 | "# fastapi run simple_api.py\n", 52 | "# http://localhost:8000/compute?n=10" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "573491b0-7790-49b4-8553-1635f4ae341e", 58 | "metadata": {}, 59 | "source": [ 
60 | "Normally you run the code above as:\n", 61 | "\n", 62 | "```python\n", 63 | "fastapi run simple_apy.py\n", 64 | "```\n", 65 | "\n", 66 | "This will run the API on a single machine. \n", 67 | "\n", 68 | "However, is your startup grows, how do you make sure you can continue to serve clients?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "6254f39b-1ad4-4782-847c-1195a685e91f", 74 | "metadata": {}, 75 | "source": [ 76 | "### Is your current setup going to scale when you go viral? \n", 77 | "Test it with https://locust.io/" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "9690dbd7-f1b4-4cc0-be49-1c6cefc71e81", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "!pip install locust" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "bb2f6289-db40-49c7-9fac-31361617c431", 93 | "metadata": {}, 94 | "source": [ 95 | "Create a virtual users who will hit your API" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "7a72fdd9-bc85-4507-970a-7a4f29bb1568", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "%%writefile locustfile.py\n", 106 | "\n", 107 | "from locust import HttpUser, TaskSet, task, between\n", 108 | "\n", 109 | "class APIUser(HttpUser):\n", 110 | " wait_time = between(1, 3)\n", 111 | " host = \"http://127.0.0.1:8000\"\n", 112 | "\n", 113 | " @task\n", 114 | " class UserTasks(TaskSet):\n", 115 | " @task\n", 116 | " def get_status(self):\n", 117 | " self.client.get(\"/status/\") \n", 118 | "\n", 119 | " @task\n", 120 | " def do_compute(self):\n", 121 | " self.client.get(\"/compute?n=100\") " 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "7aff6e5f-a280-43ab-a6bc-04d537b8ff3d", 127 | "metadata": {}, 128 | "source": [ 129 | "Run it as `locust` at the command line.\n", 130 | "This will refer you to a web page, which will let you control the test." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "fc6841c8-8e34-48a2-a299-1a907d639454", 136 | "metadata": {}, 137 | "source": [ 138 | "#### Things to note\n", 139 | "1. Any filures on the locust dashboard?\n", 140 | "2. Monitor the logs of your application\n", 141 | "3. What is the median execution time?\n", 142 | "4. **What is the tail execution time**?\n", 143 | "5. What is the RPS (requests per second)?" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python [conda env:conda-mleng_env]", 150 | "language": "python", 151 | "name": "conda-env-conda-mleng_env-py" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.11.11" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 5 168 | } 169 | -------------------------------------------------------------------------------- /lectures/075_web_services/The web, under the hood.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/075_web_services/The web, under the hood.pdf -------------------------------------------------------------------------------- /lectures/075_web_services/The web, under the hood.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/075_web_services/The web, under the hood.pptx -------------------------------------------------------------------------------- /lectures/075_web_services/consume_json.py: -------------------------------------------------------------------------------- 1 | #serve json 
2 | from datetime import datetime 3 | import requests 4 | 5 | HOST = 'localhost' 6 | PORT = 5000 7 | 8 | # Request something which doesn't exist 9 | response = requests.get(url=f'http://{HOST}:{PORT}/') 10 | print(f"Result of doing a GET request from http://{HOST}:{PORT}/") 11 | print(response) 12 | 13 | print("-------------") 14 | 15 | # Request the time 16 | response = requests.get(url=f'http://{HOST}:{PORT}/get_time') 17 | print(f"Result of doing a GET request from http://{HOST}:{PORT}/get_time") 18 | print(response) 19 | print("Response raw content:" + dir(response)) 20 | 21 | if __name__=='__main__': 22 | pass -------------------------------------------------------------------------------- /lectures/075_web_services/consume_services.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fff7f325-d2b9-445e-b9fd-caa241da0bf3", 6 | "metadata": {}, 7 | "source": [ 8 | "# Test services via code (instead of via the browser)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "1c7ba53a-7dc5-4d8b-b1c0-7e0c1cb437f5", 15 | "metadata": { 16 | "tags": [] 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import requests" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "89691c81-b664-48ca-be50-16d57ec9ebe2", 27 | "metadata": { 28 | "tags": [] 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "HOST = 'localhost'\n", 33 | "PORT = 8000" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "e7d20138-aed8-4e2c-916c-e526791af90f", 39 | "metadata": {}, 40 | "source": [ 41 | "### Request something whcih doesn't exist" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "81504c9e-50b3-4dbd-8522-da42f3fb70c0", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "%%time\n", 54 | "response = 
requests.get(url=f'http://{HOST}:{PORT}/')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "6127c67e-ad1b-4296-8428-ff8385d5f206", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "response" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "d46a726a-5f50-41ed-b4df-9785153122c8", 72 | "metadata": {}, 73 | "source": [ 74 | "### Request the 'get_time' service (recall that it is a GET service)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "4c48e246-317c-4150-b72c-e8a1e0ab917f", 81 | "metadata": { 82 | "tags": [] 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "%%time\n", 87 | "response = requests.get(url=f'http://{HOST}:{PORT}/get_time')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "874cac25-011e-4fc6-a887-2e80d1023025", 94 | "metadata": { 95 | "tags": [] 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "response" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "54429e71-b92d-4f6d-8466-afe7f1df7299", 106 | "metadata": { 107 | "tags": [] 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "response.text" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "7609100d-0b86-448d-9bf7-d6baa3644a16", 118 | "metadata": { 119 | "tags": [] 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "j = response.json()\n", 124 | "j" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "b10fb44d-f9fa-4290-8ef7-090849746437", 131 | "metadata": { 132 | "tags": [] 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "type(j) # <= Proper Python dictionary is returned!" 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "916681a0-c917-4c07-9d49-61a90f7e1828", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "j['current_time']" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "81338c44-ad31-473f-a9d2-b1c736f5b664", 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "source": [ 156 | "### Request the 'get_churn_probability' service (recall that it is a POST service)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "d3a249da-6d16-4599-9bbb-c8c1fac5aade", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "import requests" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "5e0b8617-a64f-4c0c-b518-20b668241688", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "HOST = 'localhost'\n", 177 | "PORT = 8000" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "69909596-66e9-4152-b906-bc8a2c9ee0c8", 184 | "metadata": { 185 | "tags": [] 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "good_client = {'gender':'male', 'age':23, 'uc_grad':False}\n", 190 | "great_client = {'gender':'male', 'age':23, 'uc_grad':True}" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "58868043-f2e5-40e0-902c-39e8a9031f2c", 197 | "metadata": { 198 | "tags": [] 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "%%time\n", 203 | "\n", 204 | "response = requests.post(url=f'http://{HOST}:{PORT}/get_churn_probability', json=good_client)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "d44f9cb6-2716-4c51-98cb-94018db012da", 211 | "metadata": { 212 | "tags": [] 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "response" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": 
"40677b01-a94f-43d4-a97d-bd2384bd1258", 223 | "metadata": { 224 | "tags": [] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "response.json()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "37de25b7-6711-4035-a16b-4fb490573f67", 235 | "metadata": { 236 | "tags": [] 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "requests.post(url=f'http://{HOST}:{PORT}/get_churn_probability', json=great_client).json()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "41d76aa9-cc22-42cb-84fd-68aa4118a6d0", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python [conda env:conda-mleng_env]", 255 | "language": "python", 256 | "name": "conda-env-conda-mleng_env-py" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.11.11" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 5 273 | } 274 | -------------------------------------------------------------------------------- /lectures/075_web_services/decorator.pyx: -------------------------------------------------------------------------------- 1 | def logger2(f): 2 | print("Just ran logger2") 3 | def inner_func(*args, **kwargs): 4 | print(f"Starting execution of function {f.__name__}") 5 | rslt = f(*args, **kwargs) 6 | print(f"Finished execution of function {f.__name__}") 7 | return rslt 8 | return inner_func 9 | 10 | @logger2 11 | def say_bye(name): 12 | return f"Good bye {name}" 13 | 14 | #print("==============") 15 | #print(say_bye("Shahbaz")) -------------------------------------------------------------------------------- /lectures/075_web_services/post_client_streamlit_app.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import streamlit as st 3 | 4 | st.markdown("# Churn probability test") 5 | 6 | st.markdown("#### Input parameters") 7 | 8 | uc_grad = st.toggle("UC Graduate?") 9 | age = st.slider("Age", 18, 100) 10 | 11 | response = requests.post(url=f"http://localhost:5000/get_churn_probability", json = {'gender':'male', 'age':age, 'uc_grad':uc_grad}) 12 | st.write(response.json()) 13 | -------------------------------------------------------------------------------- /lectures/075_web_services/serve_json.py: -------------------------------------------------------------------------------- 1 | #serve json 2 | from datetime import datetime 3 | from fastapi import FastAPI 4 | 5 | app = FastAPI() 6 | 7 | @app.get('/status') 8 | def status(): 9 | d = {'status': 'OK'} 10 | return d 11 | 12 | @app.get('/get_time') 13 | def get_time(): 14 | d = {'current_time':datetime.now().strftime("%H:%M")} 15 | return d 16 | 17 | # fastapi dev serve_json.py -------------------------------------------------------------------------------- /lectures/075_web_services/serve_post_json.py: -------------------------------------------------------------------------------- 1 | #serve post json 2 | from datetime import datetime 3 | from fastapi import FastAPI 4 | from pydantic import BaseModel 5 | 6 | 7 | app = FastAPI() 8 | 9 | @app.get('/status') 10 | def status(): 11 | d = {'status': 'OK'} 12 | return d 13 | 14 | @app.get('/get_time') 15 | def get_time(): 16 | d = {'current_time':datetime.now().strftime("%H:%M")} 17 | return d 18 | 19 | class ClientData(BaseModel): 20 | gender: str 21 | age: int 22 | uc_grad: bool 23 | 24 | @app.post('/get_churn_probability') 25 | def get_churn_probability(client_props: ClientData): 26 | 27 | #if 'UC_GRAD' not in client_properties: 28 | # pass throw error 29 | 30 | # Our churn model is fake, we don't actually use an ML model :( 31 | if client_props.uc_grad == "true": 32 | 
return {'churn_prob':0.34} 33 | else: 34 | return {'churn_prob':0.87} 35 | 36 | # fastapi dev serve_post_json.py -------------------------------------------------------------------------------- /lectures/075_web_services/serve_text.py: -------------------------------------------------------------------------------- 1 | #serve text 2 | from datetime import datetime 3 | from fastapi import FastAPI 4 | 5 | app = FastAPI() 6 | 7 | @app.get('/status') 8 | def status(): 9 | return "OK" 10 | 11 | 12 | @app.get('/get_time') 13 | def get_time(): 14 | t = f'current time is {datetime.now().strftime("%H:%M")}' 15 | return t 16 | 17 | # fastapi dev serve_text.py -------------------------------------------------------------------------------- /lectures/075_web_services/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | st.markdown("# Hello") 4 | 5 | name = st.text_input("What is your name?") 6 | st.write(f"Hello {name}") 7 | 8 | if len(name.strip()) == 0: 9 | st.warning("You haven't entered your name yet :(") -------------------------------------------------------------------------------- /lectures/080_env_pkg_management/Python environment and package management.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/080_env_pkg_management/Python environment and package management.pptx -------------------------------------------------------------------------------- /lectures/090_python_tools/.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/.coverage -------------------------------------------------------------------------------- 
/lectures/090_python_tools/020-python-bytecode.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "648ac726-3cb0-4744-a463-fb8dd32c87f4", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import dis" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "92d3ec11-5bda-4686-8feb-6550132f02dc", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def loop_f(x):\n", 25 | " for i in range(x):\n", 26 | " print(i)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "32c3e112-2562-4f32-8135-5d0956316fd6", 33 | "metadata": { 34 | "tags": [] 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "def listcomp_f(x):\n", 39 | " [print(i) for i in range(x)]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "id": "4991ba06-181a-4ad1-8303-f685a33b4051", 46 | "metadata": { 47 | "tags": [] 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "0\n", 55 | "1\n", 56 | "2\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "loop_f(3)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "id": "d7f695f5-f595-4958-a551-b760f933f866", 68 | "metadata": { 69 | "tags": [] 70 | }, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "0\n", 77 | "1\n", 78 | "2\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "listcomp_f(3)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "id": "7aa3fee5-f68f-4215-9bc9-27ff8aee72d2", 90 | "metadata": { 91 | "tags": [] 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | " 1 0 RESUME 0\n", 99 | "\n", 100 | " 2 2 LOAD_GLOBAL 1 (NULL + range)\n", 101 | " 14 LOAD_FAST 0 (x)\n", 102 | " 
16 PRECALL 1\n", 103 | " 20 CALL 1\n", 104 | " 30 GET_ITER\n", 105 | " >> 32 FOR_ITER 17 (to 68)\n", 106 | " 34 STORE_FAST 1 (i)\n", 107 | "\n", 108 | " 3 36 LOAD_GLOBAL 3 (NULL + print)\n", 109 | " 48 LOAD_FAST 1 (i)\n", 110 | " 50 PRECALL 1\n", 111 | " 54 CALL 1\n", 112 | " 64 POP_TOP\n", 113 | " 66 JUMP_BACKWARD 18 (to 32)\n", 114 | "\n", 115 | " 2 >> 68 LOAD_CONST 0 (None)\n", 116 | " 70 RETURN_VALUE\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "dis.dis(loop_f)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 8, 127 | "id": "935916ff-0f64-4f6e-85c9-76448ad45229", 128 | "metadata": { 129 | "tags": [] 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | " 1 0 RESUME 0\n", 137 | "\n", 138 | " 2 2 LOAD_CONST 1 ( at 0x000001B9993E6880, file \"C:\\Users\\shahb\\AppData\\Local\\Temp\\ipykernel_4252\\1272453071.py\", line 2>)\n", 139 | " 4 MAKE_FUNCTION 0\n", 140 | " 6 LOAD_GLOBAL 1 (NULL + range)\n", 141 | " 18 LOAD_FAST 0 (x)\n", 142 | " 20 PRECALL 1\n", 143 | " 24 CALL 1\n", 144 | " 34 GET_ITER\n", 145 | " 36 PRECALL 0\n", 146 | " 40 CALL 0\n", 147 | " 50 POP_TOP\n", 148 | " 52 LOAD_CONST 0 (None)\n", 149 | " 54 RETURN_VALUE\n", 150 | "\n", 151 | "Disassembly of at 0x000001B9993E6880, file \"C:\\Users\\shahb\\AppData\\Local\\Temp\\ipykernel_4252\\1272453071.py\", line 2>:\n", 152 | " 2 0 RESUME 0\n", 153 | " 2 BUILD_LIST 0\n", 154 | " 4 LOAD_FAST 0 (.0)\n", 155 | " >> 6 FOR_ITER 17 (to 42)\n", 156 | " 8 STORE_FAST 1 (i)\n", 157 | " 10 LOAD_GLOBAL 1 (NULL + print)\n", 158 | " 22 LOAD_FAST 1 (i)\n", 159 | " 24 PRECALL 1\n", 160 | " 28 CALL 1\n", 161 | " 38 LIST_APPEND 2\n", 162 | " 40 JUMP_BACKWARD 18 (to 6)\n", 163 | " >> 42 RETURN_VALUE\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "dis.dis(listcomp_f)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "9e5524e9-31ee-4c37-85ad-3aced5b7ed5f", 175 | "metadata": {}, 176 | 
"outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.5" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 5 201 | } 202 | -------------------------------------------------------------------------------- /lectures/090_python_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/__init__.py -------------------------------------------------------------------------------- /lectures/090_python_tools/logging.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "disable_existing_loggers": false, 4 | 5 | "handlers": { 6 | "console": { 7 | "class": "logging.StreamHandler", 8 | "level": "DEBUG", 9 | "stream": "ext://sys.stdout" 10 | }, 11 | "file": { 12 | "class": "logging.handlers.TimedRotatingFileHandler", 13 | "level": "DEBUG", 14 | "when": "D", 15 | "backupCount": 0, 16 | "filename": "./logs/training-stats.log" 17 | } 18 | }, 19 | "loggers": { 20 | "root": { 21 | "level": "DEBUG", 22 | "handlers": ["console"] 23 | }, 24 | "app": { 25 | "level": "DEBUG", 26 | "handlers": ["file"], 27 | "propagate": true, 28 | "qualname": "app" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /lectures/090_python_tools/logging_fancy.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | 
"disable_existing_loggers": false, 4 | "formatters": { 5 | "simple": { 6 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 7 | }, 8 | "verbose": { 9 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s" 10 | } 11 | }, 12 | "handlers": { 13 | "console": { 14 | "class": "logging.StreamHandler", 15 | "level": "DEBUG", 16 | "formatter": "simple", 17 | "stream": "ext://sys.stdout" 18 | }, 19 | "file": { 20 | "class": "logging.handlers.TimedRotatingFileHandler", 21 | "level": "DEBUG", 22 | "formatter": "verbose", 23 | "when": "D", 24 | "backupCount": 0, 25 | "filename": "./logs/training-stats.log" 26 | }, 27 | "uvicorn": { 28 | "class": "logging.handlers.TimedRotatingFileHandler", 29 | "level": "DEBUG", 30 | "formatter": "verbose", 31 | "when": "D", 32 | "backupCount": 0, 33 | "filename": "./logs/uvicorn.log" 34 | } 35 | }, 36 | "loggers": { 37 | "root": { 38 | "level": "DEBUG", 39 | "handlers": ["console"] 40 | }, 41 | "app": { 42 | "level": "DEBUG", 43 | "handlers": ["file"], 44 | "propagate": true, 45 | "qualname": "app" 46 | }, 47 | "uvicorn": { 48 | "level": "DEBUG", 49 | "handlers": ["uvicorn"], 50 | "propagate": true, 51 | "qualname": "uvicorn" 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /lectures/090_python_tools/logs/training-stats.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/logs/training-stats.log -------------------------------------------------------------------------------- /lectures/090_python_tools/logs/uvicorn.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/logs/uvicorn.log 
-------------------------------------------------------------------------------- /lectures/090_python_tools/name_reverser.py: -------------------------------------------------------------------------------- 1 | # This library parses names and presents them in a professional, reverse name order 2 | 3 | def name_reverse_order(full_name): 4 | if full_name == "": # Handle the case where an empty string is passed 5 | return "" 6 | else: 7 | first, last = full_name.split(' ') 8 | return f'{last}, {first}' 9 | -------------------------------------------------------------------------------- /lectures/090_python_tools/python_logging_01.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | 4 | logging.basicConfig(level=logging.INFO) 5 | 6 | 7 | def maximum(numbers): 8 | #pass # pass means "do nothing", add your code here 9 | max_value = 0 10 | for num in numbers: 11 | logging.debug(f"num:{num}, max_value:{max_value}") 12 | if num > max_value: 13 | logging.debug("max > max_value branch taken. 
Setting new max_value") 14 | max_value = num 15 | return max_value 16 | 17 | def main(): 18 | logging.info("Let us find the maximum value from the following list:") 19 | list_of_nums = [1,2,3] 20 | max_value = maximum(list_of_nums) 21 | logging.info(max_value) 22 | 23 | logging.info("Let us find the maximum value from another list:") 24 | list_of_nums = [-1, -2, -3] 25 | max_value = maximum(list_of_nums) 26 | logging.info(max_value) 27 | 28 | if __name__ == "__main__": 29 | logging.warning(f"This program is being run from the command line") 30 | main() 31 | -------------------------------------------------------------------------------- /lectures/090_python_tools/python_logging_02.py: -------------------------------------------------------------------------------- 1 | 2 | import logging.config 3 | import json 4 | 5 | with open("logging.json", "r") as f: 6 | json_config = json.load(f) 7 | logging.config.dictConfig(json_config) 8 | 9 | def maximum(numbers): 10 | #pass # pass means "do nothing", add your code here 11 | max_value = 0 12 | for num in numbers: 13 | logging.debug(f"num:{num}, max_value:{max_value}") 14 | if num > max_value: 15 | logging.debug("max > max_value branch taken. 
Setting new max_value") 16 | max_value = num 17 | return max_value 18 | 19 | def main(): 20 | logging.info("Let us find the maximum value from the following list:") 21 | list_of_nums = [1,2,3] 22 | max_value = maximum(list_of_nums) 23 | logging.info(max_value) 24 | 25 | logging.info("Let us find the maximum value from another list:") 26 | list_of_nums = [-1, -2, -3] 27 | max_value = maximum(list_of_nums) 28 | logging.info(max_value) 29 | 30 | if __name__ == "__main__": 31 | logging.warning(f"This program is being run from the command line") 32 | main() 33 | -------------------------------------------------------------------------------- /lectures/090_python_tools/python_logging_03.py: -------------------------------------------------------------------------------- 1 | 2 | import logging.config 3 | import json 4 | import os 5 | 6 | with open("logging_fancy.json", "r") as f: 7 | json_config = json.load(f) 8 | logging.config.dictConfig(json_config) 9 | 10 | fname = os.path.basename(__file__) 11 | log = logging.getLogger(fname) # <= This lines makes the logger name more useful 12 | 13 | def maximum(numbers): 14 | #pass # pass means "do nothing", add your code here 15 | max_value = 0 16 | for num in numbers: 17 | log.debug(f"num:{num}, max_value:{max_value}") 18 | if num > max_value: 19 | log.debug("max > max_value branch taken. 
Setting new max_value") 20 | max_value = num 21 | return max_value 22 | 23 | def main(): 24 | log.info("Let us find the maximum value from the following list:") 25 | list_of_nums = [1,2,3] 26 | max_value = maximum(list_of_nums) 27 | log.info(max_value) 28 | 29 | log.info("Let us find the maximum value from another list:") 30 | list_of_nums = [-1, -2, -3] 31 | max_value = maximum(list_of_nums) 32 | log.info(max_value) 33 | 34 | if __name__ == "__main__": 35 | log.warning(f"This program is being run from the command line") 36 | main() 37 | -------------------------------------------------------------------------------- /lectures/090_python_tools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/tests/__init__.py -------------------------------------------------------------------------------- /lectures/090_python_tools/tests/test_name_reverser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from name_reverser import name_reverse_order 3 | 4 | def test_name_reverse_order_normal(): 5 | rslt = name_reverse_order("Michael Jordan") 6 | assert rslt == "Jordan, Michael" 7 | 8 | rslt = name_reverse_order("Lebron James") 9 | assert rslt == "James, Lebron" 10 | 11 | -------------------------------------------------------------------------------- /lectures/090_python_tools/tests/test_name_reverser_part_deux.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from name_reverser import name_reverse_order 3 | 4 | def test_name_reverse_order_normal(): 5 | rslt = name_reverse_order("Michael Jordan") 6 | assert rslt == "Jordan, Michael" 7 | 8 | rslt = name_reverse_order("Lebron James") 9 | assert rslt == "James, Lebron" 10 | 11 | def test_name_reverse_order_bad_inputs(): 12 | 13 | # Empty 
string 14 | with pytest.raises(ValueError): 15 | rslt = name_reverse_order("") 16 | 17 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/messy.py: -------------------------------------------------------------------------------- 1 | # messy.py 2 | import pandas as pd,numpy as np 3 | from typing import List,Dict,Any 4 | import matplotlib.pyplot as plt 5 | 6 | class dataProcessor: 7 | def __init__(self,input_file:str, output_file:str='processed.csv'): 8 | self.input=input_file 9 | self.output_file=output_file 10 | self.data=None 11 | 12 | def Load_data(self): 13 | """loads data from csv file""" 14 | self.data=pd.read_csv(self.input) 15 | return self.data 16 | 17 | def process(self,columns_to_process:List[str]=[],aggfunc:str='mean')->pd.DataFrame: 18 | if len(columns_to_process)==0: return self.data 19 | processed_data={} 20 | for col in columns_to_process: 21 | if col in self.data.columns: 22 | processed_data[col]=getattr(self.data[col],aggfunc)() 23 | else: 24 | print(f"Warning: Column {col} not found") 25 | return pd.DataFrame(processed_data,index=[0]) 26 | 27 | def visualize_data(self, column:str, PlotType:str='bar' )->None: 28 | if self.data is None:raise ValueError('No data loaded') 29 | plt.figure(figsize=(10, 5)) 30 | if PlotType=='bar': 31 | self.data[column].value_counts().plot(kind='bar') 32 | elif PlotType=='hist': 33 | self.data[column].hist() 34 | plt.title(f'Visualization of {column}') 35 | plt.show() 36 | 37 | def main(): 38 | processor=dataProcessor('data.csv') 39 | df = processor.Load_data() 40 | processed=processor.process(['age','salary'],aggfunc='mean') 41 | processor.visualize_data('age','hist') 42 | 43 | if __name__=='__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program1.py: -------------------------------------------------------------------------------- 1 | 2 | x = 10 3 | print(f"The number 
is {x}") 4 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program10.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | print("Hello world") 3 | 4 | main() 5 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program11.py: -------------------------------------------------------------------------------- 1 | 2 | import program10 3 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program12.py: -------------------------------------------------------------------------------- 1 | 2 | print(f"__name__ in program12 is set to {__name__}") 3 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program13.py: -------------------------------------------------------------------------------- 1 | import program12 2 | 3 | print(f"__name__ in program13 is set to {__name__}") 4 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program14.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | print("Hello world") 3 | 4 | if __name__ == "__main__": 5 | main() 6 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program14b.py: -------------------------------------------------------------------------------- 1 | import program14 2 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program15.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | def main(): 4 | print("Hello world") 5 | 6 | if __name__ == "__main__": 7 | main() 8 | 
-------------------------------------------------------------------------------- /lectures/110_python_py_files/program2.py: -------------------------------------------------------------------------------- 1 | 2 | x = 10 3 | y = x + x 4 | 5 | name = "Shahbaz" 6 | name 7 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program3.py: -------------------------------------------------------------------------------- 1 | 2 | BATCH_DATE = input("Please provide a batch date (mm/dd/yyyy): ") 3 | print(f"This program is run on {BATCH_DATE}") 4 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program4.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | BATCH_DATE = sys.argv[1] #<= Here is the magic 5 | print(f"This program is run on {BATCH_DATE}") 6 | #print(f"What is at location 0? {sys.argv[0]}") 7 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program5.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if len(sys.argv) < 2: 5 | print("Error: Missing BATCH_DATE. 
Please run as 'python program5.py `") 6 | sys.exit(1) 7 | 8 | BATCH_DATE = sys.argv[1] #<= Here is the magic 9 | print(f"This program is run on {BATCH_DATE}") 10 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program6.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | 4 | # Create the argument parser 5 | parser = argparse.ArgumentParser(description="Scores sales transactions to predict returns.") 6 | 7 | # Add an argument 8 | parser.add_argument("BATCH_DATE", type=str, help="The date of sales transactions") 9 | 10 | # Parse the arguments 11 | args = parser.parse_args() 12 | 13 | # Print the argument 14 | print(f"This program is run on {args.BATCH_DATE}") 15 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program7.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | 5 | if len(sys.argv) < 2: 6 | print("Error: Missing enviornment variable name. 
Please run as 'python program5.py `") 7 | sys.exit(1) 8 | 9 | ENV = sys.argv[1] 10 | ENV_VAL = os.environ.get(ENV, "") 11 | 12 | print(f"Environment variable {ENV} has value {ENV_VAL}") 13 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program8.py: -------------------------------------------------------------------------------- 1 | 2 | print("Hello world") 3 | -------------------------------------------------------------------------------- /lectures/110_python_py_files/program9.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | print("Hello world") 3 | -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/100_minimal_27/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a small base image .... dangerously old! 2 | FROM python:2.7-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the application code 8 | COPY app.py . 9 | 10 | # Run the application 11 | CMD ["python", "app.py"] 12 | 13 | # docker build -t hello_app . 14 | # docker run hello_app -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/100_minimal_27/app.py: -------------------------------------------------------------------------------- 1 | print "Hi from many years ago :)" -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/110_minimal_server/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python image as a base 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the requirements file and install dependencies 8 | COPY requirements.txt . 
9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | # Copy the application code 12 | COPY . . 13 | 14 | # Expose the FastAPI default port 15 | EXPOSE 8000 16 | 17 | # Command to run the application 18 | CMD ["fastapi", "run", "app.py", "--host", "0.0.0.0", "--port", "8000"] 19 | 20 | 21 | # docker build -t minimal_server . 22 | # docker run -d -p 8000:8000 minimal_server 23 | # curl http://localhost:8000 -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/110_minimal_server/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | 3 | app = FastAPI() 4 | 5 | @app.get("/") 6 | def read_root(): 7 | return {"message": "Hello, FastAPI in Docker!"} 8 | -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/110_minimal_server/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi[standard] -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/Docker – an introduction.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/120_dockerize_python_app/Docker – an introduction.pptx -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/example1/Dockerfile: -------------------------------------------------------------------------------- 1 | # dockerfile, Image, Container 2 | FROM python:3.9 3 | 4 | ADD main.py . 
5 | 6 | RUN pip install requests beautifulsoup4 7 | 8 | CMD ["python", "./main.py"] -------------------------------------------------------------------------------- /lectures/120_dockerize_python_app/example1/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | print("Hello app") -------------------------------------------------------------------------------- /lectures/130-distributed_python/120-ray-serve.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3372311b-1834-4898-af06-ec7721242ba8", 6 | "metadata": {}, 7 | "source": [ 8 | "# Ray Serve - scale deployed models" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "3d7c1968-e07d-4d64-9f4f-bf49f5cf1512", 14 | "metadata": {}, 15 | "source": [ 16 | "### Ray serve\n", 17 | "Ray serve is a component which makes it easy to spread the serving of an API across several machines. Let's jump into code." 
    "Normally you run the code above as:\n",
    "\n",
    "```python\n",
    "fastapi run simple_api.py\n",
    "```\n",
    "\n",
    "This will run the API on a single machine. \n",
    "\n",
    "However, as your startup grows, how do you make sure you can continue to serve clients?"
78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "1c538cc3-efaa-44f2-bd32-b7771d28e7f8", 83 | "metadata": {}, 84 | "source": [ 85 | "#### Install Ray" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "fed64382-d746-4056-a27f-a6099d9aba1b", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "!pip install ray[all]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "21a104de-f563-4055-b377-29b2b8cc144d", 101 | "metadata": {}, 102 | "source": [ 103 | "#### Deploy FastAPI on a cluster (via Ray)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "f9e6a4d1-bba6-4f4d-bfba-ffae5b2d9716", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "%%writefile simple_api_ray.py\n", 114 | "\n", 115 | "from fastapi import FastAPI\n", 116 | "from typing import Dict\n", 117 | "from ray import serve\n", 118 | "import ray\n", 119 | "\n", 120 | "#ray.init(address=\"192.168.12.239:10001\") \n", 121 | "#ray.init(ignore_reinit_error=True)\n", 122 | "\n", 123 | "app = FastAPI()\n", 124 | "\n", 125 | "@app.get(\"/status\")\n", 126 | "def status() -> Dict[str, str]:\n", 127 | " \"\"\"Simple health check endpoint.\"\"\"\n", 128 | " return {\"status\": \"ok\"}\n", 129 | "\n", 130 | "\n", 131 | "@app.get(\"/compute\")\n", 132 | "def fibonacci(n: int):\n", 133 | " \"\"\"Compute Fibonacci sequence up to n (inclusive).\"\"\"\n", 134 | " if n <= 0:\n", 135 | " return []\n", 136 | " fib = [0, 1]\n", 137 | " while fib[-1] + fib[-2] <= n:\n", 138 | " fib.append(fib[-1] + fib[-2])\n", 139 | " return fib\n", 140 | "\n", 141 | "@serve.deployment\n", 142 | "@serve.ingress(app)\n", 143 | "class FastAPIWrapper:\n", 144 | " pass\n", 145 | "\n", 146 | "serve.run(FastAPIWrapper.bind(), route_prefix=\"/\")\n", 147 | "\n", 148 | "# python simple_api_ray.py\n", 149 | "# http://localhost:8000/compute?n=10" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | 
"execution_count": null, 155 | "id": "6a193854-70d5-4ca0-8c88-0b4684675a4b", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "%%writefile simple_api_ray2.py\n", 160 | "\n", 161 | "import requests\n", 162 | "from fastapi import FastAPI\n", 163 | "from ray import serve\n", 164 | "\n", 165 | "# 1: Define a FastAPI app and wrap it in a deployment with a route handler.\n", 166 | "app = FastAPI()\n", 167 | "\n", 168 | "\n", 169 | "@serve.deployment\n", 170 | "@serve.ingress(app)\n", 171 | "class FastAPIDeployment:\n", 172 | " # FastAPI will automatically parse the HTTP request for us.\n", 173 | " @app.get(\"/hello\")\n", 174 | " def say_hello(self, name: str) -> str:\n", 175 | " return f\"Hello {name}!\"\n", 176 | "\n", 177 | "\n", 178 | "# 2: Deploy the deployment.\n", 179 | "serve.run(FastAPIDeployment.bind(), route_prefix=\"/\", )\n", 180 | "\n", 181 | "# 3: Query the deployment and print the result.\n", 182 | "# print(requests.get(\"http://localhost:8000/hello\", params={\"name\": \"Theodore\"}).json())\n", 183 | "# \"Hello Theodore!\"" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "5555ce34-cc1e-449c-9de3-77e1ea441f61", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python [conda env:conda-mleng_env]", 198 | "language": "python", 199 | "name": "conda-env-conda-mleng_env-py" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.11.11" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /lectures/130-distributed_python/dask-image/Dockerfile: 
import requests
from fastapi import FastAPI
import ray
from ray import serve

# 1: Define a FastAPI app and wrap it in a deployment with a route handler.
app = FastAPI()


@serve.deployment
@serve.ingress(app)
class FastAPIDeployment:
    # FastAPI will automatically parse the HTTP request for us.
    @app.get("/hello")
    def say_hello(self, name: str) -> str:
        return f"Hello {name}!"


# 2: Deploy the deployment.
# BUG FIX: ray.init() has no 'detached' keyword argument, so the previous
# ray.init(detached=True) raised a TypeError before anything was served.
# serve.run() initializes Ray itself when no cluster is attached, exactly
# as the companion simple_api_ray.py does.
serve.run(FastAPIDeployment.bind(), route_prefix="/", )

# 3: Query the deployment and print the result.
# print(requests.get("http://localhost:8000/hello", params={"name": "Theodore"}).json())
# "Hello Theodore!"
    "    #For each inner dictionary, go through it as well and normalize numbers\n",
"version": "3.8.5" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 5 151 | } 152 | -------------------------------------------------------------------------------- /lectures/r_basics/images/rfordatascience.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rfordatascience.jpg -------------------------------------------------------------------------------- /lectures/r_basics/images/rfordatascience.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rfordatascience.png -------------------------------------------------------------------------------- /lectures/r_basics/images/rinaction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rinaction.jpg -------------------------------------------------------------------------------- /lectures/r_basics/rmarkdown_tutorial.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Markdown Tutorial" 3 | author: "Shahbaz Chaudhary" 4 | date: "11/27/2021" 5 | output: html_document 6 | --- 7 | 8 | # Yet another reason why you should be familiar with _Markdown_ 9 | 10 | While Jupyter is the standard tool for Python data scientists, R Studio and R Markdown are the most popular ways R data scientists develop and share their code. 11 | 12 | Markdown allows a simple way to format text. For example, you can have _italics_, **bold** and ~~crossed out~~ text. 
13 | 14 | You can write lists: 15 | 16 | - item 1 17 | - item 2 18 | - item 3 19 | 20 | You can number your bullets 21 | 22 | 1. item 1 23 | 2. item 2 24 | 3. item 3 25 | 26 | Or create tables 27 | 28 | | column A | Column B | Column C | 29 | | --- | --- | --- | 30 | | value 1| value 2| value 3| 31 | | value 4| value 5| value 6| 32 | | value 7| value 8| value 9| 33 | 34 | ### But there is more 35 | 36 | You can display inline calculations, such as 2 + 2 = `r 2 + 2`! 37 | 38 | In fact, you can organize larger chunks of code: 39 | 40 | ```{r} 41 | seq(10) 42 | ``` 43 | 44 | ```{r} 45 | plot(seq(10)) 46 | ``` 47 | 48 | You can control what gets displayed in your final document: 49 | 50 | ```{r echo=FALSE} 51 | plot(seq(10)) 52 | ``` 53 | 54 | ### Here is a surprise, you can even run Python in RMarkdown (but you need the 'reticulate' library)! 55 | 56 | Notice that the following line is not being executed, since there is no '{r}' text. 57 | 58 | ``` 59 | install.packages('reticulate') 60 | ``` 61 | 62 | ```{python} 63 | list(range(10)) 64 | ``` 65 | 66 | ### Better tables 67 | By default, R doesn't do a great job of rendering tables 68 | 69 | ```{r} 70 | mtcars 71 | ``` 72 | 73 | The tidyverse set of packages promises to do better 74 | 75 | ```{r} 76 | library(tidyverse) 77 | tibble(mtcars) 78 | ``` 79 | 80 | However, the `knitr` package does a much better job 81 | 82 | ```{r} 83 | library(knitr) 84 | kable(mtcars) 85 | ``` 86 | -------------------------------------------------------------------------------- /lectures/readme.md: -------------------------------------------------------------------------------- 1 | # Lectures 2 | This directory contains lectures notes, slides, notebooks, etc. 3 | 4 | ## Introductory Lectures 5 | 6 | ### intro_to_consoles 7 | Introduces the console, which may look intimidating and out of place on a modern computer. 8 | 9 | ### programming_vs_calculator 10 | Starts with a calculator, familiar and easy to understand. 
Uses all basic operators, including the modulus and power operators. Shows how mod can be used in algorithms.
Introduce enough inheritance so that students get how functionality can be shared (such as measuring accuracy on any scikit-learn model).
import gzip
import argparse


def count_kills(lines):
    """Count kills attributed to Arya Stark and Jon Snow.

    Parameters
    ----------
    lines : iterable of str
        CSV rows; the killer's name is expected in column index 4.

    Returns
    -------
    tuple[int, int]
        (arya_count, jon_count)
    """
    arya = 0  # variable containing Arya's score
    jon = 0   # variable containing Jon's score
    for line in lines:
        tokens = line.split(',')  # separate line into columns
        if len(tokens) <= 4:
            continue  # skip blank/malformed rows instead of raising IndexError
        if tokens[4] == "Arya Stark":
            arya += 1
        elif tokens[4] == "Jon Snow":
            jon += 1
    return arya, jon


def main():
    """Parse the input-file argument, count kills, and print the tallies."""
    parser = argparse.ArgumentParser()
    parser.add_argument("infile")
    args = parser.parse_args()

    # 'with' guarantees the file is closed even if counting raises.
    with open(args.infile, encoding='utf8') as file:
        arya, jon = count_kills(file)

    print("Arya killed", arya, "people")
    print("Jon killed", jon, "people")


if __name__ == "__main__":
    main()
# This program is used to demonstrate the debugger, shouldn't be used for anything else!
import gzip
import random


def get_word_dist(dist):
    """Split a {word: frequency} dict into parallel population/weight lists."""
    pop = list(dist.keys())
    # BUG FIX: this previously read the module-level 'distribution' variable
    # instead of the 'dist' parameter, which only worked by accident because
    # the caller happened to use that global name.
    weights = list(dist.values())
    return pop, weights


def main():
    """Build a word-bigram frequency table from Shakespeare and emit 100 sampled words."""
    src_dst = {}
    prev_word = ""

    with gzip.open('../datasets/shakespeare/shakespeare.txt.gz', 'rt') as infile:
        for line in infile:
            toks = line.strip().lower().split()
            for tok in toks:
                if prev_word not in src_dst:
                    src_dst[prev_word] = {}
                # Increment the frequency of 'tok' following 'prev_word'.
                src_dst[prev_word][tok] = src_dst[prev_word].get(tok, 0) + 1
                prev_word = tok

    # Randomly pick a starting word from the data
    chosen_word = random.choice(list(src_dst.keys()))

    # For each word, pick the following word with probability proportional
    # to its frequency in the src_dst dictionary
    for _ in range(100):
        pop, weights = get_word_dist(src_dst[chosen_word])
        next_word = random.choices(pop, weights=weights)[0]
        chosen_word = next_word

        print(next_word, end=' ')
        if next_word.endswith("."):
            print("")


if __name__ == "__main__":
    main()
def maximum(numbers):
    """Return the largest value in *numbers*.

    Raises
    ------
    ValueError
        If *numbers* is empty.
    """
    # BUG FIX: seeding the running maximum with 0 returned 0 for any
    # all-negative input (e.g. [-1, -2, -3]).  Seed from the data instead.
    max_value = None
    for num in numbers:
        if max_value is None or num > max_value:
            max_value = num
    if max_value is None:
        raise ValueError("maximum() arg is an empty sequence")
    return max_value


def main():
    """Demo driver: print the maximum of two sample lists."""
    print("Let us find the maximum value from the following list:")
    list_of_nums = [1, 2, 3]
    max_value = maximum(list_of_nums)
    print(max_value)

    print("Let us find the maximum value from another list:")
    list_of_nums = [-1, -2, -3]
    max_value = maximum(list_of_nums)
    print(max_value)


print(f"Value of __name__ is {__name__}")

if __name__ == "__main__":
    main()
DEBUG - num:1, max_value:0 4 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value 5 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - num:2, max_value:1 6 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value 7 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - DEBUG - num:3, max_value:2 8 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value 9 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - INFO - 3 10 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - INFO - Let us find the maximum value from another list: 11 | 2024-03-01 21:39:37,205 - maximum_bad_logging.py - DEBUG - num:-1, max_value:0 12 | 2024-03-01 21:39:37,205 - maximum_bad_logging.py - DEBUG - num:-2, max_value:0 13 | 2024-03-01 21:39:37,206 - maximum_bad_logging.py - DEBUG - num:-3, max_value:0 14 | 2024-03-01 21:39:37,206 - maximum_bad_logging.py - INFO - 0 15 | -------------------------------------------------------------------------------- /programs/maximum_bad_logging.py: -------------------------------------------------------------------------------- 1 | import logging.config 2 | import json 3 | import os 4 | 5 | with open("logging.json", "r") as f: 6 | json_config = json.load(f) 7 | logging.config.dictConfig(json_config) 8 | 9 | fname = os.path.basename(__file__) 10 | log = logging.getLogger(fname) # <= This lines makes the logger name more useful 11 | 12 | def maximum(numbers): 13 | #pass # pass means "do nothing", add your code here 14 | max_value = 0 15 | for num in numbers: 16 | log.debug(f"num:{num}, max_value:{max_value}") 17 | if num > max_value: 18 | log.debug("max > max_value branch taken. 
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


def build_classifier(args):
    """Train an SVM or logistic-regression classifier chosen at runtime.

    Parameters
    ----------
    args : argparse.Namespace
        Expects .csv (input path), .target_col (target column name),
        .ignore_cols (list of columns to drop) and .model ("svm"/"logreg").

    Returns
    -------
    numpy.ndarray
        Confusion matrix evaluated on a 20% held-out test split.
    """
    # Read CSV file
    df = pd.read_csv(args.csv)

    # Ignore specified columns
    if args.ignore_cols:
        df = df.drop(columns=args.ignore_cols, errors='ignore')

    # Convert categorical columns to one-hot encoding
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        df = pd.get_dummies(df, columns=categorical_cols)

    # Encode target column
    label_encoder = LabelEncoder()
    df[args.target_col] = label_encoder.fit_transform(df[args.target_col])

    # Split data into features and target
    X = df.drop(columns=[args.target_col])
    y = df[args.target_col]

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Choose classifier model
    if args.model == "svm":  # <== We are able to pick the right model or 'strategy' at runtime
        clf = SVC()
    elif args.model == "logreg":
        clf = LogisticRegression()
    else:
        # Defensive: argparse 'choices' already restricts this from the CLI,
        # but fail loudly (not with UnboundLocalError) if called directly.
        raise ValueError(f"unknown model {args.model!r}")

    # Train classifier
    clf.fit(X_train, y_train)  # <== This code is the same, svm or logreg

    # Predict on test set
    y_pred = clf.predict(X_test)  # <== This code is the same, svm or logreg

    # Calculate confusion matrix
    return confusion_matrix(y_test, y_pred)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a classifier")
    parser.add_argument("--model", choices=["svm", "logreg"], required=True, help="Type of classifier model")
    parser.add_argument("--csv", required=True, help="CSV input file")
    parser.add_argument("--target-col", required=True, help="Target column name")
    parser.add_argument("--ignore-cols", nargs='+', default=[], help="Columns to ignore")

    args = parser.parse_args()
    # BUG FIX: the result used to be stored in a variable named
    # 'confusion_matrix', shadowing the sklearn function imported above.
    cm = build_classifier(args)
    print("Confusion Matrix:")
    print(cm)
# https://techoverflow.net/2017/02/26/requests-download-file-if-it-doesnt-exist/

import os.path


def download_file(filename, url):
    """
    Download an URL to a file.

    The response is streamed to disk in 4 KiB blocks.  The HTTP status is
    checked *before* the output file is created, so a failed request no
    longer leaves behind an empty file, and both the connection and the
    file handle are released deterministically by the context managers.
    """
    # Imported lazily so the module (e.g. download_if_not_exists on cached
    # files) can be used even where requests is not installed.
    import requests

    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as fout:
            # Write response data to file
            for block in response.iter_content(4096):
                fout.write(block)


def download_if_not_exists(filename, url):
    """
    Download a URL to a file if the file
    does not exist already.

    Returns
    -------
    True if the file was downloaded,
    False if it already existed
    """
    if not os.path.exists(filename):
        download_file(filename, url)
        return True
    return False