├── .gitignore
├── OPEN_ME_FIRST.ipynb
├── README.md
├── clear_output.txt
├── datasets
├── credit-card-customers
│ ├── BankChurners.zip
│ └── hwotoget.md
├── deaths-in-gameofthrones
│ ├── game-of-thrones-deaths-data.csv
│ ├── get_data.sh
│ └── howtoget.md
├── life-expectancy
│ ├── howtoget.md
│ └── life-expectancy-who.zip2
├── movielens
│ ├── README.txt
│ └── howtoget.txt
├── names
│ ├── female.txt
│ ├── get_data.sh
│ ├── howtoget.txt
│ └── male.txt
├── shakespeare
│ ├── get_data.sh
│ ├── howtoget.txt
│ └── shakespeare.txt.gz
├── startbucks_locations
│ └── url.txt.txt
├── taxi-trips
│ └── howtoget.txt
└── worldbank-dev-ind
│ └── howtoget.txt
├── environment.yml
├── lectures
├── .gitattributes
├── 005_intro_to_consoles
│ ├── images
│ │ ├── 03_two_pages_at_once.png
│ │ ├── console.jpg
│ │ ├── dosprompt.jpg
│ │ └── macterminal.png
│ └── intro_to_consoles.ipynb
├── 010_programming_vs_calculator
│ ├── images
│ │ ├── HR-10RC_large.png
│ │ ├── SL300VC-PK_large.png
│ │ ├── calculator_2p3.jpg
│ │ ├── calculator_blank.jpg
│ │ ├── calculator_expand_chars.jpg
│ │ ├── calculator_expand_date.jpg
│ │ ├── calculator_expand_math.jpg
│ │ ├── calculator_expand_mem.jpg
│ │ ├── calculator_memory.jpg
│ │ ├── calculator_numbers.jpg
│ │ ├── calculator_operators.jpg
│ │ └── calculator_screen.jpg
│ └── programming_vs_calculator.ipynb
├── 015_first_programs
│ └── first_programs.ipynb
├── 020_intro_to_jupyter
│ ├── 10 - Intro To Jupyter (not technical).ipynb
│ ├── 20 - Intro To Jupyter (technical).ipynb
│ ├── 30 - Intro To Widgets.ipynb
│ └── images
│ │ ├── blank_jupyter_header.png
│ │ ├── command_mode.png
│ │ ├── edit_mode.png
│ │ ├── jupyter_celltype.png
│ │ ├── jupyter_create_notebook.png
│ │ ├── jupyter_files.png
│ │ ├── jupyter_kernel.png
│ │ ├── jupyter_restart.png
│ │ ├── jupyter_run.png
│ │ ├── mathematica.png
│ │ └── python_repl.png
├── 025_all_of_python_basics
│ ├── 090-roadmap.ipynb
│ ├── 095-all_of_python_faster_basics.ipynb
│ ├── 100-all_of_python_basics.ipynb
│ ├── 110-variables.ipynb
│ ├── 120-basic_functions.ipynb
│ ├── 130-numbers.ipynb
│ ├── 140-strings.ipynb
│ ├── 145-boolean_algebra.ipynb
│ ├── 150-basic_plotting.ipynb
│ ├── 170-dictionaries.ipynb
│ ├── 180-lists.ipynb
│ ├── 190-comprehensions_and_generators.ipynb
│ ├── 200-classes.ipynb
│ ├── 210-loops.ipynb
│ ├── 220-conditonals_and_None.ipynb
│ ├── 230-functions_argument_types.ipynb
│ ├── 240-functions_lambda_and_higherorder.ipynb
│ ├── 245-functions-decorators.ipynb
│ ├── 250-functions_recursion.ipynb
│ ├── 260-all_of_python_regexes.ipynb
│ ├── 280-exceptions.ipynb
│ ├── 290-context_managers.ipynb
│ ├── 300-type_safety.ipynb
│ ├── 310-async.ipynb
│ ├── badly_typed_code.py
│ └── images
│ │ ├── best-mommy-ever-jewelry.jpg
│ │ ├── binary_tree.svg
│ │ ├── class_diff.png
│ │ ├── clock.jpg
│ │ ├── david_chang.jpg
│ │ ├── how-to-control-feedback-in-a-sound-system_header.jpg
│ │ ├── ifelse_diagram.png
│ │ ├── inception.jpg
│ │ ├── infinitemirror.jpg
│ │ ├── listcomprehension.png
│ │ ├── loop_diagram.png
│ │ └── y_combinator.jpg
├── 030_intro_to_pandas
│ ├── 100-pandas_quick_start.ipynb
│ ├── 110-pandas-overview-series.ipynb
│ ├── 120-pandas-overview-dataframes.ipynb
│ ├── 130-pandas-dataframes-operations.ipynb
│ ├── 140-pandas-dataframes-combining.ipynb
│ ├── 150-pandas-groupby.ipynb
│ ├── 160-pandas-index.ipynb
│ ├── 170-pandas-reshape-with-pivot-melt-stack.ipynb
│ ├── 180-pandas-operations_str_dt_apply.ipynb
│ └── images
│ │ ├── dataframes.jpg
│ │ ├── series.jpg
│ │ └── splitapplycombine.png
├── 035_how_to_read_technical_docs
│ └── how_to_read_technical_docs.ipynb
├── 040_basic_computer_architecture
│ ├── basic_computer_architecture.ipynb
│ └── images
│ │ ├── EBMotherboard.jpg
│ │ ├── How_to_stress_test_your_CPU-Hero.jpg
│ │ ├── Laptop-hard-drive-exposed.jpg
│ │ ├── RAM-Modules.jpg
│ │ ├── Supermicro-X12SCA-F-Overview.jpg
│ │ ├── ascii.png
│ │ ├── calc.png
│ │ ├── overview-fig1.png
│ │ └── unicode_sample.png
├── 045_intro_to_numpy
│ ├── images
│ │ ├── chicago.jpeg
│ │ ├── chicago.png
│ │ ├── chicagobw.jpeg
│ │ └── chicagobw.png
│ ├── intro_to_numpy.ipynb
│ └── linear_regression_with_numpy.ipynb
├── 050_git_version_control
│ ├── Why do you need version control.pptx
│ ├── assets
│ │ ├── copy_to_dropbox.png
│ │ ├── folder_versions.png
│ │ ├── github-desktop-screenshot-windows.png
│ │ ├── macgit-03-open.png
│ │ └── share_code_email.png
│ ├── readme.txt
│ ├── understanding_version_control.html
│ └── understanding_version_control.md
├── 055_bigger_data_pandas
│ ├── 050 - Work With Taxi Trips - Get to know the file.ipynb
│ ├── 100 - Work With Taxi Trips - memory_map.ipynb
│ ├── 110 - Work With Taxi Trips - compression.ipynb
│ ├── 120 - Work With Taxi Trips - feather format.ipynb
│ ├── 130 - Work With Taxi Trips - chunking and tqdm.ipynb
│ ├── 135 - Work With Taxi Trips - skip columns pre-req.ipynb
│ ├── 140 - Work With Taxi Trips - skip columns.ipynb
│ ├── 140B - Work With Taxi Trips - skip columns.ipynb
│ ├── 150 - Work With Taxi Trips - c_parser.ipynb
│ ├── 160 - Work With Taxi Trips - Chunk to parquet.ipynb
│ ├── 160B - Work With Taxi Trips - Read from parquet files.ipynb
│ └── lecture.md
├── 060_learn_command_line
│ ├── learn_command_line.md
│ └── learn_command_line_2.md
├── 065_secret_lives_of_text_files
│ ├── Secret Lives of Text Files.ipynb
│ ├── howareyou_english.txt
│ ├── howareyou_english_multiple_lines.txt
│ └── howareyou_not_english.txt
├── 070_scikit_learn
│ ├── 100-scikit-learn-method_behind_the_madness.ipynb
│ ├── 110-scikit-learn-run_saved_model.ipynb
│ └── model_server.py
├── 075_web_services
│ ├── 120-bank_churners_classifier_model.ipynb
│ ├── 120-bank_churners_classifier_model.py
│ ├── 130-load_test.ipynb
│ ├── The web, under the hood.pdf
│ ├── The web, under the hood.pptx
│ ├── consume_json.py
│ ├── consume_services.ipynb
│ ├── decorator.pyx
│ ├── post_client_streamlit_app.py
│ ├── serve_json.py
│ ├── serve_post_json.py
│ ├── serve_text.py
│ └── streamlit_app.py
├── 080_env_pkg_management
│ ├── 010-package_management.ipynb
│ └── Python environment and package management.pptx
├── 090_python_tools
│ ├── .coverage
│ ├── 010-jupyter-debugger.ipynb
│ ├── 020-python-bytecode.ipynb
│ ├── 030-python-logging.ipynb
│ ├── 040-python-unit-tests.ipynb
│ ├── __init__.py
│ ├── logging.json
│ ├── logging_fancy.json
│ ├── logs
│ │ ├── training-stats.log
│ │ └── uvicorn.log
│ ├── name_reverser.py
│ ├── python_logging_01.py
│ ├── python_logging_02.py
│ ├── python_logging_03.py
│ └── tests
│ │ ├── __init__.py
│ │ ├── test_name_reverser.py
│ │ └── test_name_reverser_part_deux.py
├── 100_design_patterns
│ └── 100-design-patterns.ipynb
├── 110_python_py_files
│ ├── 100-program-inputs.ipynb
│ ├── 110-program-run-type.ipynb
│ ├── 120-clean-code.ipynb
│ ├── messy.py
│ ├── program1.py
│ ├── program10.py
│ ├── program11.py
│ ├── program12.py
│ ├── program13.py
│ ├── program14.py
│ ├── program14b.py
│ ├── program15.py
│ ├── program2.py
│ ├── program3.py
│ ├── program4.py
│ ├── program5.py
│ ├── program6.py
│ ├── program7.py
│ ├── program8.py
│ └── program9.py
├── 120_dockerize_python_app
│ ├── 100_minimal_27
│ │ ├── Dockerfile
│ │ └── app.py
│ ├── 110_minimal_server
│ │ ├── Dockerfile
│ │ ├── app.py
│ │ └── requirements.txt
│ ├── Docker – an introduction.pptx
│ └── example1
│ │ ├── Dockerfile
│ │ └── main.py
├── 130-distributed_python
│ ├── 100-simulate_a_cluster_docker.ipynb
│ ├── 110-ray_intro.ipynb
│ ├── 120-ray-serve.ipynb
│ ├── dask-image
│ │ ├── 110-dask-cluster.ipynb
│ │ └── Dockerfile
│ ├── ray-image
│ │ ├── 110-ray-cluster.ipynb
│ │ └── Dockerfile
│ ├── simple_api_ray.py
│ └── simple_api_ray2.py
├── 140-algorithms_datastructs
│ ├── 100-data_structures.ipynb
│ └── 110-algorithms.ipynb
├── Data Science in Python.ipynb
├── gradient_descent
│ └── Gradient Descent.ipynb
├── misc
│ ├── prettyprint_numpy.ipynb
│ ├── scratchspace.ipynb
│ └── test_themes.ipynb
├── r_basics
│ ├── Base R.ipynb
│ ├── Tidyverse.ipynb
│ ├── images
│ │ ├── rfordatascience.jpg
│ │ ├── rfordatascience.png
│ │ └── rinaction.jpg
│ ├── rmarkdown_tutorial.html
│ └── rmarkdown_tutorial.rmd
└── readme.md
├── postcell.conf.bak
├── programs
├── calc-sum-pd.py
├── calc-sum.py
├── gen-shakespeare.py
├── killings_per_season.py
├── logging.json
├── maximum_bad_debug.py
├── maximum_bad_logging.log
├── maximum_bad_logging.py
└── svm_or_logreg_strategy.py
├── python_tableofcontents.xlsx
└── src
├── __init__.py
├── __pycache__
├── __init__.cpython-36.pyc
└── utils.cpython-36.pyc
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.joblib
3 | .ipynb_checkpoints
4 | .DS_Store
5 | .vscode
6 | postcell.conf
7 | postcell.log
8 | __pycache__/
9 |
--------------------------------------------------------------------------------
/OPEN_ME_FIRST.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
      7 |     "# Let's set up your environment"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Import necessary libraries \n",
15 | "*(click the *play* button in the toolbar above to execute a cell)*"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {
22 | "tags": []
23 | },
24 | "outputs": [],
25 | "source": [
26 | "import json\n",
27 | "import os\n",
28 | "import shutil\n",
29 | "import datetime\n",
30 | "import pandas as pd\n",
31 | "\n",
32 | "import ipywidgets as widgets"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {
38 | "tags": []
39 | },
40 | "source": [
41 | "### 1. Which section are you in?"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "tags": []
49 | },
50 | "outputs": [],
51 | "source": [
52 | "student_section = widgets.Dropdown(\n",
53 | " options=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],\n",
54 | " value='Monday',\n",
55 | " description='Section:',\n",
56 | " disabled=False,\n",
57 | ")\n",
58 | "student_section"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "student_section.value = \"Wednesday\""
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "student_section.value"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "### 2. What is your name?"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "tags": []
91 | },
92 | "outputs": [],
93 | "source": [
94 | "student_name = \"Your Name\""
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### 3. Set up postcell"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# class_name = \"pythonfordatascience\"\n",
111 | "class_name = \"pythonformlengineering\"\n",
112 | "class_time_period = \"2025_quarter2\""
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "tags": []
120 | },
121 | "outputs": [],
122 | "source": [
123 | "#Set up template\n",
124 | "if not os.path.isfile('postcell.conf'):\n",
125 | " shutil.copyfile('postcell.conf.bak', 'postcell.conf')\n",
126 | "\n",
127 | "#Open config file\n",
128 | "with open('postcell.conf', 'rt') as conf:\n",
129 | " parsedj = json.load(conf)\n",
130 | " #print(parsedj['student_id'])\n",
131 | " #print(parsedj['class_id'])\n",
132 | " \n",
133 | "#Set student name\n",
134 | "parsedj['student_id'] = student_name.strip().replace(' ', '_')\n",
135 | "\n",
136 | "#Set class name\n",
137 | "class_id = f\"{class_time_period}_{student_section.value.lower()}_{class_name}\"\n",
138 | "parsedj['class_id'] = class_id\n",
139 | "\n",
140 | "#Write config file\n",
141 | "#TODO: change should_send_to_server to true, set default to be false in .bak file\n",
142 | "with open('postcell.conf', 'wt') as conf:\n",
143 | " json.dump(parsedj, conf, indent=4, sort_keys=True)\n",
144 | " \n",
145 | "# Confirm your config\n",
146 | "with open('postcell.conf', 'rt') as conf:\n",
147 | " parsedj = json.load(conf)\n",
148 | " print(parsedj['student_id'])\n",
149 | " print(parsedj['class_id'])"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### 4. Install the postcell magic command"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "!pip install postcell -U --quiet"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "tags": []
173 | },
174 | "outputs": [],
175 | "source": [
176 | "%reload_ext postcell\n",
177 | "%postcell register"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {
184 | "tags": []
185 | },
186 | "outputs": [],
187 | "source": [
188 | "%%postcell OPEN_ME_FIRST_HELLO \n",
189 | "\"Hello\""
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "%%postcell OPEN_ME_FIRST_EXPERIENCE \n",
199 | "\"How much programming experience do you have?\""
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "### Fun test, how fast is your machine?"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "tags": []
214 | },
215 | "outputs": [],
216 | "source": [
217 | "%%timeit\n",
218 | "sum(range(1_000_000))"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
    225 |     "### Paste the numeric value from the previous cell, along with a general description of your machine, on the next line"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "%%postcell OPEN_ME_FIRST_MACHINE_PERFORMANCE \n",
235 | "\"73.1 ms, dell xps, 2 years ago\""
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### 5. Install other packages\n",
243 | "Since this is a very introductory course, students will install some required packages here, rather than via a proper environments file."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# download: download files, only if they don't already exist\n",
253 | "# lolviz: visualize python datastructures\n",
254 | "# jax and jaxlib: similar to numpy with autograd (don't yet run on windows)\n",
255 | "\n",
256 | "!pip install download lolviz ipywidgets pylint black mypy networkx[default,extra]"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# nb_conda: load conda environments in jupyter\n",
266 | "# tqdm: Add progress bars to Jupyter \n",
267 | "# seaborn: A very popular charting library\n",
268 | "\n",
269 | "!conda install bokeh tqdm seaborn altair-all -y"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "# Add kernel\n",
279 | "\n",
280 | "#conda activate py310\n",
281 | "#python -m ipykernel install --user --name=\"py310\""
282 | ]
283 | }
284 | ],
285 | "metadata": {
286 | "kernelspec": {
287 | "display_name": "Python [conda env:conda-mleng_env]",
288 | "language": "python",
289 | "name": "conda-env-conda-mleng_env-py"
290 | },
291 | "language_info": {
292 | "codemirror_mode": {
293 | "name": "ipython",
294 | "version": 3
295 | },
296 | "file_extension": ".py",
297 | "mimetype": "text/x-python",
298 | "name": "python",
299 | "nbconvert_exporter": "python",
300 | "pygments_lexer": "ipython3",
301 | "version": "3.11.11"
302 | }
303 | },
304 | "nbformat": 4,
305 | "nbformat_minor": 4
306 | }
307 |
--------------------------------------------------------------------------------
/clear_output.txt:
--------------------------------------------------------------------------------
1 | jupyter nbconvert *.ipynb --to notebook --ClearOutputPreprocessor.enabled=True --inplace
--------------------------------------------------------------------------------
/datasets/credit-card-customers/BankChurners.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/credit-card-customers/BankChurners.zip
--------------------------------------------------------------------------------
/datasets/credit-card-customers/hwotoget.md:
--------------------------------------------------------------------------------
1 | https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers?trk=article-ssr-frontend-pulse_little-text-block
2 | License: CC0: Public Domain
--------------------------------------------------------------------------------
/datasets/deaths-in-gameofthrones/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl https://raw.githubusercontent.com/washingtonpost/data-game-of-thrones-deaths/master/game-of-thrones-deaths-data.csv -o game-of-thrones-deaths-data.csv
3 |
--------------------------------------------------------------------------------
/datasets/deaths-in-gameofthrones/howtoget.md:
--------------------------------------------------------------------------------
1 | Source:
2 | https://github.com/washingtonpost/data-game-of-thrones-deaths
3 |
--------------------------------------------------------------------------------
/datasets/life-expectancy/howtoget.md:
--------------------------------------------------------------------------------
1 | https://www.kaggle.com/kumarajarshi/life-expectancy-who/downloads/life-expectancy-who.zip/1
2 |
3 | Generate a small version by doing `head -n 10 "Life Expectancy Data.csv" > life_expectancy_10.csv`
4 |
--------------------------------------------------------------------------------
/datasets/life-expectancy/life-expectancy-who.zip2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/life-expectancy/life-expectancy-who.zip2
--------------------------------------------------------------------------------
/datasets/movielens/README.txt:
--------------------------------------------------------------------------------
1 | Summary
2 | =======
3 |
4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.
5 |
6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.
7 |
8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.
9 |
10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.
11 |
12 | This and other GroupLens data sets are publicly available for download at .
13 |
14 |
15 | Usage License
16 | =============
17 |
18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:
19 |
20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.
25 |
26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).
27 |
28 | If you have any further questions or comments, please email
29 |
30 |
31 | Citation
32 | ========
33 |
34 | To acknowledge use of the dataset in publications, please cite the following paper:
35 |
36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.
37 |
38 |
39 | Further Information About GroupLens
40 | ===================================
41 |
42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:
43 |
44 | * recommender systems
45 | * online communities
    46 | * mobile and ubiquitous technologies
47 | * digital libraries
48 | * local geographic information systems
49 |
50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators.
51 |
52 |
53 | Content and Use of Files
54 | ========================
55 |
56 | Formatting and Encoding
57 | -----------------------
58 |
59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.
60 |
61 |
62 | User Ids
63 | --------
64 |
65 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).
66 |
67 |
68 | Movie Ids
69 | ---------
70 |
71 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).
72 |
73 |
74 | Ratings Data File Structure (ratings.csv)
75 | -----------------------------------------
76 |
77 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:
78 |
79 | userId,movieId,rating,timestamp
80 |
81 | The lines within this file are ordered first by userId, then, within user, by movieId.
82 |
83 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
84 |
85 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
86 |
87 |
88 | Tags Data File Structure (tags.csv)
89 | -----------------------------------
90 |
91 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:
92 |
93 | userId,movieId,tag,timestamp
94 |
95 | The lines within this file are ordered first by userId, then, within user, by movieId.
96 |
97 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.
98 |
99 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
100 |
101 |
102 | Movies Data File Structure (movies.csv)
103 | ---------------------------------------
104 |
105 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:
106 |
107 | movieId,title,genres
108 |
109 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.
110 |
111 | Genres are a pipe-separated list, and are selected from the following:
112 |
113 | * Action
114 | * Adventure
115 | * Animation
116 | * Children's
117 | * Comedy
118 | * Crime
119 | * Documentary
120 | * Drama
121 | * Fantasy
122 | * Film-Noir
123 | * Horror
124 | * Musical
125 | * Mystery
126 | * Romance
127 | * Sci-Fi
128 | * Thriller
129 | * War
130 | * Western
131 | * (no genres listed)
132 |
133 |
134 | Links Data File Structure (links.csv)
135 | ---------------------------------------
136 |
137 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:
138 |
139 | movieId,imdbId,tmdbId
140 |
141 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link .
142 |
143 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link .
144 |
145 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link .
146 |
147 | Use of the resources listed above is subject to the terms of each provider.
148 |
149 |
150 | Cross-Validation
151 | ----------------
152 |
153 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
154 |
--------------------------------------------------------------------------------
/datasets/movielens/howtoget.txt:
--------------------------------------------------------------------------------
1 | http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
2 |
--------------------------------------------------------------------------------
/datasets/names/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt -o female.txt
3 | curl https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/male.txt -o male.txt
4 |
5 |
--------------------------------------------------------------------------------
/datasets/names/howtoget.txt:
--------------------------------------------------------------------------------
1 | Source:
2 | https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt
3 | https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/male.txt
4 |
--------------------------------------------------------------------------------
/datasets/shakespeare/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt -o shakespeare.txt
3 |
--------------------------------------------------------------------------------
/datasets/shakespeare/howtoget.txt:
--------------------------------------------------------------------------------
1 | Source:
2 | https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt
3 |
--------------------------------------------------------------------------------
/datasets/shakespeare/shakespeare.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/datasets/shakespeare/shakespeare.txt.gz
--------------------------------------------------------------------------------
/datasets/startbucks_locations/url.txt.txt:
--------------------------------------------------------------------------------
1 | https://gist.github.com/dankohn/09e5446feb4a8faea24f
--------------------------------------------------------------------------------
/datasets/taxi-trips/howtoget.txt:
--------------------------------------------------------------------------------
1 | Data is available from https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/
2 |
3 | >time zcat Taxi_Trips.csv.gz |wc -l
4 | 113115259
5 |
6 | real 6m9.941s
7 | user 6m12.653s
8 | sys 0m45.163s
9 |
10 | Size of Taxi_Trips.csv.gz: 14G
11 | Size of taxi_trips_small.csv.gz: 1G
12 | Size of taxi_trips_smaller.csv.gz 154M
13 |
14 | Size of Taxi_Trips.csv: ????
15 | Size of taxi_trips_small.csv: 3.4G
16 | Size of taxi_trips_smaller.csv: 456M
17 |
18 | Lines in Taxi_Trips.csv.gz: 113,115,259 (100 million)
19 | Lines in taxi_trips_small.csv.gz: 11,311,525 (11 million)
20 | Lines in taxi_trips_smaller.csv.gz: 1,131,152 (1 million)
21 |
22 |
--------------------------------------------------------------------------------
/datasets/worldbank-dev-ind/howtoget.txt:
--------------------------------------------------------------------------------
1 | http://databank.worldbank.org/data/download/WDI_csv.zip
2 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | #conda env create -f environment.yml
2 | #conda env list
3 | #conda env remove --name PythonForAnalytics
4 | #conda env update --file environment.yml
5 | #conda activate PythonForAnalytics
6 | #conda update --all # update all packages
7 | name: msca
8 | channels:
9 | - defaults
10 | - conda-forge
11 | dependencies:
12 | - python=3.6
13 | - pylint
14 | - pip # a more general python package installer
15 | - openssl # technical infrastructure
16 | - flask # web/api framework
17 | - pyarrow # fast alternative to csv
18 | # Data science libraries
19 | - pandas # dataframe library
20 | - scikit-learn # collection of machine learning algos
21 | # Visualization libraries
22 | - seaborn # charting package which is built on top of matplotlib
23 | - altair # charting package which provides a python interface to the vega-lite library
24 | - bokeh # charting package
25 | - bqplot # charting package from bloomberg
26 | #- pyviz # related to holoviz?
27 | #- holoviz # meta package which installs holoviews, panel, hvplot, etc.
28 | # Notebook extensions
29 | - nb_conda # load conda environments in jupyter
30 | - rise # presentations in jupyter
31 | - tqdm
32 | - nbtutor # brings pythontutor into jupyter, execute `%load_ext nbtutor` to load extension, `%%nbtutor` to evaluate cell (`%%nbtutor -r` to reset variables)
33 | - jupyter_contrib_nbextensions #used to be under pip, under conda, doesn't require `jupyter contrib nbextension install ...`
34 | - jupyter_nbextensions_configurator #enable nb_conda to allow use of custom conda envs
35 | - pip:
36 | - wget # easy way to download files
37 | #- https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tarball/master
38 | - Flask-Testing
39 | - nbdime # makes it easy to diff rendered notebooks
40 | - pixiedust
41 | #- google-cloud-firestore # to enable firestore for pusblishing student cell contents
42 | #- lolviz # visualize data structures (only useful for teaching and learning)
43 | #- kedro #McKinsey data science template
44 |
--------------------------------------------------------------------------------
/lectures/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=strip-notebook-output
2 |
--------------------------------------------------------------------------------
/lectures/005_intro_to_consoles/images/03_two_pages_at_once.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/03_two_pages_at_once.png
--------------------------------------------------------------------------------
/lectures/005_intro_to_consoles/images/console.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/console.jpg
--------------------------------------------------------------------------------
/lectures/005_intro_to_consoles/images/dosprompt.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/dosprompt.jpg
--------------------------------------------------------------------------------
/lectures/005_intro_to_consoles/images/macterminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/005_intro_to_consoles/images/macterminal.png
--------------------------------------------------------------------------------
/lectures/005_intro_to_consoles/intro_to_consoles.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "# In the beginning, there was the console"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {
17 | "slideshow": {
18 | "slide_type": "slide"
19 | }
20 | },
21 | "source": [
22 | "### Modern operating systems offer a visually stimulating **Graphical** User Interface\n",
23 | "\n",
24 | ""
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "slideshow": {
31 | "slide_type": "slide"
32 | }
33 | },
34 | "source": [
35 | "### However, there had to be a simpler beginning\n",
36 | "\n",
37 | "\n",
38 | "\n",
39 | "Kids react to old computers: https://www.youtube.com/watch?v=PF7EpEnglgk"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "slideshow": {
46 | "slide_type": "slide"
47 | }
48 | },
49 | "source": [
50 | "### Print to screen, which screen?\n",
51 | "\n",
52 | "```print(1+2)```\n",
53 | "\n",
54 | "There only used to be _one_ screen (aka console)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "slideshow": {
61 | "slide_type": "slide"
62 | }
63 | },
64 | "source": [
65 | "### In modern computers, such consoles are still available\n",
66 | "\n",
67 | "In Apple computers, console is called the **Terminal**\n",
68 | ""
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "slideshow": {
75 | "slide_type": "slide"
76 | }
77 | },
78 | "source": [
79 | "### In modern computers, such consoles are still available\n",
80 | "\n",
81 | "In Windows, it is called the **Command Prompt**\n",
82 | "\n",
83 | "\n",
84 | "We will use Git Bash, which will simulate an environment similar to Apple or Linux Terminal"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {
90 | "slideshow": {
91 | "slide_type": "slide"
92 | }
93 | },
94 | "source": [
95 | "# References\n",
96 | "MS Word screenshot: https://www.howtogeek.com/215187/how-to-view-multiple-pages-at-once-in-word/\n",
97 | "\n",
98 | "Mac Terminal screenshot: https://thenextweb.com/lifehacks/2010/11/19/keep-your-macbook-from-waking-up-in-your-bag-with-a-simple-command/"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {
105 | "slideshow": {
106 | "slide_type": "slide"
107 | }
108 | },
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "celltoolbar": "Slideshow",
115 | "kernelspec": {
116 | "display_name": "Python [conda env:PythonForAnalytics] *",
117 | "language": "python",
118 | "name": "conda-env-PythonForAnalytics-py"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.6.9"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 2
135 | }
136 |
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/HR-10RC_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/HR-10RC_large.png
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/SL300VC-PK_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/SL300VC-PK_large.png
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_2p3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_2p3.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_blank.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_blank.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_expand_chars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_chars.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_expand_date.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_date.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_expand_math.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_math.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_expand_mem.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_expand_mem.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_memory.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_memory.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_numbers.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_numbers.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_operators.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_operators.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/images/calculator_screen.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/010_programming_vs_calculator/images/calculator_screen.jpg
--------------------------------------------------------------------------------
/lectures/010_programming_vs_calculator/programming_vs_calculator.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "## Programming vs a calculator"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {
17 | "slideshow": {
18 | "slide_type": "slide"
19 | }
20 | },
21 | "source": [
22 | "### Let's start with a calculator\n",
23 | ""
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "slideshow": {
31 | "slide_type": "fragment"
32 | }
33 | },
34 | "outputs": [],
35 | "source": [
36 | "2+2"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "slideshow": {
43 | "slide_type": "slide"
44 | }
45 | },
46 | "source": [
47 | "### Numbers, nouns or _things_\n",
48 | " "
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {
54 | "slideshow": {
55 | "slide_type": "slide"
56 | }
57 | },
58 | "source": [
59 | "### Operators, functions, verbs or ways in which you operate on _things_\n",
60 | ""
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {
66 | "slideshow": {
67 | "slide_type": "slide"
68 | }
69 | },
70 | "source": [
71 | "### Operations related to the screen\n",
72 | " "
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {
78 | "slideshow": {
79 | "slide_type": "slide"
80 | }
81 | },
82 | "source": [
83 | "### Operations related to memory\n",
84 | "\n",
85 | "\n",
86 | "**Exercise**: On a calculator, find the proportion of total represented by 35, 45, 55\n",
87 | "1. Calculate total\n",
88 | "2. Save that total in memory\n",
89 | "3. Divide each number by the total in memory"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {
95 | "slideshow": {
96 | "slide_type": "slide"
97 | }
98 | },
99 | "source": [
100 | "### An imaginary box to remember things\n",
101 | " "
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "slideshow": {
109 | "slide_type": "fragment"
110 | }
111 | },
112 | "outputs": [],
113 | "source": [
114 | "x=135\n",
115 | "2+x"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "slideshow": {
122 | "slide_type": "slide"
123 | }
124 | },
125 | "source": [
126 | "### How can we expand the functionality of this calculator?\n",
127 | "\n",
128 | "What if we could snap on more operators (aka verbs or functions)\n",
129 | "\n",
130 | " \n",
131 | "\n",
    132 |     "This way, product designers and engineers could build a nice calculator, math majors could write fancy expansion sets and consumers could add on the functionality they needed."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "slideshow": {
140 | "slide_type": "fragment"
141 | }
142 | },
143 | "outputs": [],
144 | "source": [
145 | "import math\n",
146 | "math.log(100)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {
152 | "slideshow": {
153 | "slide_type": "slide"
154 | }
155 | },
156 | "source": [
157 | "### How can we expand the functionality of this calculator?\n",
158 | "\n",
159 | "What about adding nouns or data types beyond numbers, such as dates and times?\n",
160 | "\n",
161 | " "
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "slideshow": {
169 | "slide_type": "fragment"
170 | }
171 | },
172 | "outputs": [],
173 | "source": [
174 | "import datetime\n",
175 | "datetime.date.today() + datetime.timedelta(weeks=1) + datetime.timedelta(days=1)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {
181 | "slideshow": {
182 | "slide_type": "slide"
183 | }
184 | },
185 | "source": [
186 | "### How can we expand the functionality of this calculator?\n",
187 | "\n",
188 | "What about adding nouns or data types beyond numbers, such as dates, times and **English characters** ?\n",
189 | "\n",
190 | " "
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "print(dir(\"hello\"))"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "'a' + 'b'"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {
214 | "slideshow": {
215 | "slide_type": "slide"
216 | }
217 | },
218 | "source": [
219 | "### How can we expand the functionality of this calculator?\n",
220 | "\n",
221 | "What about adding more than one memory slot? This will also require that we name our memory slots, or variables.\n",
222 | "\n",
223 | " "
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "x=5\n",
233 | "y=10\n",
234 | "x+y"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {
240 | "slideshow": {
241 | "slide_type": "slide"
242 | }
243 | },
244 | "source": [
245 | "### How can we expand the functionality of this calculator?\n",
246 | "\n",
    247 |     "What if, instead of typing out calculations, we could record the calculations, and run them at a later time?\n",
248 | "```\n",
249 | "1+1\n",
250 | "2+2\n",
251 | "...\n",
252 | "```\n",
253 | "\n",
254 | " "
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {
260 | "slideshow": {
261 | "slide_type": "slide"
262 | }
263 | },
264 | "source": [
265 | "### A tape will replay commands **exactly**\n",
266 | "Not very useful. \n",
267 | "We should be able to record a bunch of commands, but vary one or two things."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {
274 | "slideshow": {
275 | "slide_type": "fragment"
276 | }
277 | },
278 | "outputs": [],
279 | "source": [
280 | "x = 19.99 #<= Price of a meal\n",
281 | "y = x * 0.0625 #<= Sales tax\n",
282 | "y * 1.20 #<= 20% Tip"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {
288 | "slideshow": {
289 | "slide_type": "slide"
290 | }
291 | },
292 | "source": [
293 | "# Sources\n",
294 | "Pink calculator: https://www.casio.com/products/calculators/basic/sl300vc-pk\n",
295 | "\n",
296 | "Calculator with printer: https://www.casio.com/products/calculators/printing/hr-10rc"
297 | ]
298 | }
299 | ],
300 | "metadata": {
301 | "celltoolbar": "Slideshow",
302 | "kernelspec": {
303 | "display_name": "Python 3 (ipykernel)",
304 | "language": "python",
305 | "name": "python3"
306 | },
307 | "language_info": {
308 | "codemirror_mode": {
309 | "name": "ipython",
310 | "version": 3
311 | },
312 | "file_extension": ".py",
313 | "mimetype": "text/x-python",
314 | "name": "python",
315 | "nbconvert_exporter": "python",
316 | "pygments_lexer": "ipython3",
317 | "version": "3.9.13"
318 | },
319 | "vscode": {
320 | "interpreter": {
321 | "hash": "108a1df64039728e69f178110a6e255a10aba8514903b770571642a02940d2ba"
322 | }
323 | }
324 | },
325 | "nbformat": 4,
326 | "nbformat_minor": 4
327 | }
328 |
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/10 - Intro To Jupyter (not technical).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# This is Jupyter"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Standard method of running programs: command line\n",
15 | "\n",
16 | "Normally you 'run' a program by double clicking on it. For our purpose, we will consider this to be the same as running it from the command line. In both cases, a fully written program is run.\n",
17 | "\n",
18 | "For example, you may run a python program in the following manner:\n",
19 | "\n",
20 | "`python myprogram.py`\n",
21 | "\n",
22 | "In this case, myprogram.py contains your code and you are telling python to execute the whole thing for you.\n",
23 | "\n",
     24 |     "This is in contrast to running a program in small batches or one line at a time:\n",
25 | "\n",
26 | "\n",
27 | "## REPL\n",
     28 |     "This method of executing code is called a *REPL*, aka *R*ead, *E*val, *P*rint *L*oop. Think of a REPL as a program:\n",
29 | "```python\n",
30 | "while True:\n",
31 | " user_input = input() # Get input from user\n",
     32 |     "    result = parse_and_evaluate(user_input)\n",
33 | " print(result)\n",
34 | "```\n",
35 | "\n",
36 | "A program which reads your programs and executes them, very *meta*!\n",
37 | "\n",
38 | "This method of executing code was the norm in the Lisp programming language - one of the oldest, continuously used programming languages, created for the purpose of Artificial Intelligence experimentation\n",
39 | "\n",
40 | "## Literate Programming: Comments in code vs code inside comments\n",
41 | "\n",
42 | "Don Knuth, a high priest of Computer Science, introduced and advocated for the idea of *literate programming.* Knuth wanted to move programs from technical code, written to satisfy compilers or interpreters to a living document, to be consumed by human readers. He wanted code, mixed with narrative explanations, describing what the code was doing.\n",
43 | "\n",
44 | "Take this example from an earlier lecture:"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "jon = 0 #variable containing Jon's score\n",
54 | "arya = 0 #variable containing Arya's score\n",
55 | "\n",
56 | "#Open file\n",
57 | "file = open(\"../../datasets/deaths-in-gameofthrones/game-of-thrones-deaths-data.csv\", encoding='utf8')\n",
58 | "\n",
59 | "#Go through each line in file\n",
60 | "for line in file:\n",
61 | " tokens = line.split(',') #separate line into columns\n",
62 | " if tokens[4]==\"Arya Stark\": arya = arya + 1\n",
63 | " if tokens[4]==\"Jon Snow\": \n",
64 | " jon = jon + 1\n",
65 | "\n",
66 | "file.close()\n",
67 | "print(\"Arya killed\", arya, \"people\")\n",
68 | "print(\"Jon killed\", jon, \"people\")\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "You can see comments in the program.\n",
76 | "\n",
77 | "On the other hand, this notebook is a document, to be read by human students. Yet it contains code which can be executed. But the purpose of this document is not to have a CPU execute a set of instructions. It is to explain a process or a concept to students, where the code is executed in service of that purpose.\n",
78 | "\n",
79 | "## Notebooks\n",
80 | "One of the first, mainstream, uses of a notebook interface is from Mathematica\n",
81 | "\n",
82 | "\n",
     83 |     "This interface combines **REPL** and **Literate Programming**. These notebooks display well formatted code, the results of executing that code, as well as formatted narrative text (English or technical math formulas) along with visual charts, etc. Jupyter notebooks are a recent, open source iteration of this concept."
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "## Jupyter Notebooks\n",
91 | "\n",
92 | "**Starting Jupyter**\n",
93 | "Jupyter can be started from Anaconda Navigator. If you are comfortable, you can start Jupyter from the command line:\n",
94 | "\n",
95 | "```bash\n",
96 | "jupyter notebook\n",
97 | "```\n",
98 | "\n",
99 | "When using the command line, if you are at `c:\\Users\\shahbaz\\proj\\stock_market_prediction` and execute the command `jupyter notebook`, the notebook will start and display files in that directory. \n",
100 | "\n",
101 | "Once Jupyter is running, you will see a set of files, like this:\n",
102 | "\n",
103 | "\n",
104 | "Create a new notebook by selecting 'new' in the upper right hand corner:\n",
105 | ""
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "**References**\n",
120 | "\n",
121 | "Mathematica screenshot is from https://www.wolfram.com/language/fast-introduction-for-math-students/en/notebook-documents/"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": []
130 | }
131 | ],
132 | "metadata": {
133 | "kernelspec": {
134 | "display_name": "Python 3 (ipykernel)",
135 | "language": "python",
136 | "name": "python3"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 3
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython3",
148 | "version": "3.12.4"
149 | },
150 | "vscode": {
151 | "interpreter": {
152 | "hash": "108a1df64039728e69f178110a6e255a10aba8514903b770571642a02940d2ba"
153 | }
154 | }
155 | },
156 | "nbformat": 4,
157 | "nbformat_minor": 4
158 | }
159 |
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/blank_jupyter_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/blank_jupyter_header.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/command_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/command_mode.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/edit_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/edit_mode.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_celltype.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_celltype.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_create_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_create_notebook.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_files.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_kernel.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_restart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_restart.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/jupyter_run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/jupyter_run.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/mathematica.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/mathematica.png
--------------------------------------------------------------------------------
/lectures/020_intro_to_jupyter/images/python_repl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/020_intro_to_jupyter/images/python_repl.png
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/090-roadmap.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "44ab0477-e109-4dc1-83d4-8b8071136645",
6 | "metadata": {},
7 | "source": [
8 | "# What we learn, when we are learning programming languages"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "2c64bc0b-78c7-4b95-a212-34491d0bb610",
14 | "metadata": {},
15 | "source": [
     16 |     "Before anything else, let's look at a map of the journey we are about to undertake. Programming languages are used for data analysis, complex machine learning models, creating video games where you can explore whole galaxies, all of youtube, google, controls of cars and space shuttles...[almost everything in the world](https://a16z.com/2011/08/20/why-software-is-eating-the-world/)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "dfc17055-21a9-4a5c-910e-c4206a853906",
22 | "metadata": {},
23 | "source": [
     24 |     "Surprisingly, the components which make up a programming language are not very complex. In this class, we will learn about these components and how to put them together to create useful programs."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "c608b572-29c9-44bc-893a-ad022e05d952",
30 | "metadata": {},
31 | "source": [
32 | "This is a broad (and necessarily incomplete) outline:"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "5ab75289-470f-4fcc-801c-90bde8b998ac",
38 | "metadata": {},
39 | "source": [
40 | "### Data types\n",
41 | "Calculators can only deal with numbers. Programming languages understand \n",
42 | "1. numbers\n",
43 | "2. text (called \"strings\")\n",
44 | "3. truthfulness and falsehood of logical statements (called \"boolean\")\n",
45 | "4. infinite variety, made up of combining the basic elements shown here"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "8d301b4f-73bc-4bbd-8ec4-8caddebc5d56",
51 | "metadata": {},
52 | "source": [
53 | "### Container types\n",
54 | "It is one thing to deal with single numbers or individual strings, programming languages provide \"data structures\" which let us deal with a collection of objects, such as:\n",
55 | "1. lists\n",
56 | "2. dictionaries\n",
57 | "3. sets\n",
58 | "4. infinite variety, made up of combining the basic elements shown here"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "id": "144e7f6c-23c2-4fe4-9251-b6d6b1a0d16b",
64 | "metadata": {},
65 | "source": [
66 | "### Control flow\n",
67 | "Much of the power of computers come from programming computers to repeat tasks as many times as we want or to have programs decide, while running, which path to take. As such, the control flow methods we will study are:\n",
68 | "1. if/else statements (called \"conditionals\")\n",
69 | "2. for and while loops"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "4628dd8f-887b-46b7-b3b0-97422126a9bb",
75 | "metadata": {},
76 | "source": [
77 | "### Operations\n",
78 | "Just like a calculator contains numbers, and operations on those numbers, such as plus, minus, divide, multiply, programming languages are a collection of a very large number of operations. Languages provide functions which operate on\n",
79 | "1. data types, such as numbers, strings, booleans, etc.\n",
80 | "2. container types, such as lists, dictionaries, sets, etc.\n",
81 | "3. infinite variety...you get the point"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "id": "a925b147-936b-45cf-8c86-3e54dce080de",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": []
91 | }
92 | ],
93 | "metadata": {
94 | "kernelspec": {
95 | "display_name": "Python 3 (ipykernel)",
96 | "language": "python",
97 | "name": "python3"
98 | },
99 | "language_info": {
100 | "codemirror_mode": {
101 | "name": "ipython",
102 | "version": 3
103 | },
104 | "file_extension": ".py",
105 | "mimetype": "text/x-python",
106 | "name": "python",
107 | "nbconvert_exporter": "python",
108 | "pygments_lexer": "ipython3",
109 | "version": "3.8.10"
110 | }
111 | },
112 | "nbformat": 4,
113 | "nbformat_minor": 5
114 | }
115 |
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/150-basic_plotting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Plotting (with matplotlib.pyplot)\n",
8 | "\n",
9 | "As a data scientist, plotting is extremely important and part of your daily workflow. However, Python doesn't come with a built-in plotting library. We will be using `matplotlib`. Like many data science packages, it is maintained by a community of programmers who (mostly) work on it for free in their spare time.\n",
10 | "\n",
11 | "On the first day of class, we downloaded this package and installed it in our Python environment. If you need to do this again, here are two methods:\n",
12 | "\n",
13 | "1. Open your computer's terminal window (_terminal_ in mac and _anaconda prompt_ in windows). Run this line `conda install matplotlib`\n",
14 | "2. Right here, in your jupyter notebook, create a new cell and run this command `!conda install --yes matplotlib`. Once this command runs successfully, you may have to restart the kernel.\n",
15 | "\n",
16 | "In order to use the plotting library, you need to import it (like we do with many other packages). However, you need to execute an additional line of code: `%matplotlib inline`, which tells matplotlib that it needs to render its visual charts in the present notebook. \n",
17 | "\n",
18 | "Note that `inline` is not the only option available. While `inline` causes matplotlib to create a static image, there are other options, such as `notebook` which can provide a richer experience. To experiment with other renderers, get a full list via `%matplotlib --list`"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import matplotlib.pyplot as plt"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "%matplotlib inline"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Simple plot types"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "You can remove the text `[]` by adding a semicolon at the end of the plot function (this is a bit of a hack, you don't need to do this elsewhere)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10]);"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "Any chart you create for others must be labeled correctly. Here is how you can add labels:"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])\n",
85 | "plt.title(\"Test chart\")\n",
86 | "plt.xlabel('Value index')\n",
87 | "plt.ylabel('Value'); # notice only the last line has a semi-colon"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "Matplotlib generally draws figures which are too small. You can change their size using figsize:"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "plt.figure(figsize=(10,5)) # This only needs to be executed once, and can be done at the top of the notebook\n",
104 | "\n",
105 | "plt.plot([1, 2, 3, 4, 3, 2, 1, 4, 7, 10])\n",
106 | "plt.title(\"Test chart\")\n",
107 | "plt.xlabel('Value index')\n",
108 | "plt.ylabel('Value'); "
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "The above charts visualize a single dimension of numbers. What if you had two dimensions, such as house prices and number of rooms"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "prices = [95, 150, 200, 200, 400]\n",
125 | "rooms = [1, 2, 2, 1.5, 3]\n",
126 | "\n",
127 | "plt.plot(prices, rooms)\n",
128 | "plt.title(\"Test chart\")\n",
129 | "plt.xlabel('Hosue price')\n",
130 | "plt.ylabel('Num of rooms'); "
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "Such two dimensional data is better represented as a scatter chart:"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "prices = [95, 150, 200, 200, 400]\n",
147 | "rooms = [1, 2, 2, 1.5, 3]\n",
148 | "\n",
149 | "plt.scatter(prices, rooms)\n",
150 | "plt.title(\"Test chart\")\n",
151 | "plt.xlabel('Hosue price')\n",
152 | "plt.ylabel('Num of rooms'); "
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "Same data with bar plot"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "prices = [95, 150, 200, 200, 400]\n",
169 | "rooms = [1, 2, 2, 1.5, 3]\n",
170 | "\n",
171 | "plt.bar(prices, rooms, width=10)\n",
172 | "plt.title(\"Test chart\")\n",
173 | "plt.xlabel('Hosue price')\n",
174 | "plt.ylabel('Num of rooms'); "
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": []
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "prices = [95, 150, 200, 200, 400]\n",
191 | "rooms = [1, 2, 2, 1.5, 3]\n",
192 | "\n",
193 | "plt.hist(prices)\n",
194 | "plt.title(\"Histogram of prices\")"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "Reference:\n",
202 | "Some examples taken from the official tutorial at https://matplotlib.org/tutorials/introductory/pyplot.html"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "**Exercise**\n",
210 | "Explain Python's built-in random library. Use charts such as these. "
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": []
219 | }
220 | ],
221 | "metadata": {
222 | "kernelspec": {
223 | "display_name": "Python 3",
224 | "language": "python",
225 | "name": "python3"
226 | },
227 | "language_info": {
228 | "codemirror_mode": {
229 | "name": "ipython",
230 | "version": 3
231 | },
232 | "file_extension": ".py",
233 | "mimetype": "text/x-python",
234 | "name": "python",
235 | "nbconvert_exporter": "python",
236 | "pygments_lexer": "ipython3",
237 | "version": "3.8.5"
238 | }
239 | },
240 | "nbformat": 4,
241 | "nbformat_minor": 4
242 | }
243 |
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/260-all_of_python_regexes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Regular Expressions"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Regular expressions are a mini-language, used to parse and extract information from strings.\n",
15 | "\n",
16 | "### Motivation: slicing vs split vs regex\n",
17 | "\n",
18 | "Given strings, such as:\n",
19 | "\n",
20 | "\"01/09/2008\", \"05/12/2012\"\n",
21 | "\n",
22 | "we know we can extract the year this way:"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "dates = [\"01/09/2008\", \"05/12/2012\"]\n",
32 | "\n",
33 | "for d in dates:\n",
34 | " print(d[-4:]) # use normal indexing"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "If we had strings, such as:\n",
42 | "\n",
43 | "\"In the year 2008 we did such as such\"\n",
44 | "\"After the year 2009 we continued something else\"\n",
45 | "\n",
46 | "We can no longer use slicing, but we can just split the string and get the 4th value to get the year:"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "sentences = [\"In the year 2008 we did such as such\"\n",
56 | " , \"After the year 2009 we continued something else\"]\n",
57 | "\n",
58 | "for s in sentences:\n",
59 | " print(s.split(\" \")[3])"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "How do we extract dates in the following sentences?\n",
67 | "\n",
68 | "\"2019: After the Fall of New York\"\n",
69 | "\n",
70 | "\"The exterminators of the year 3000\"\n",
71 | "\n",
72 | "\"1990: The Bronx Warriors\"\n",
73 | "\n",
74 | "The first inclination of novice programmers would be to split the movie title above, go through each title and check to see if it is just numbers. If it is, extract that token as the year.\n",
75 | "\n",
76 | "This pattern of coding comes up so often that there is a special way of extracting such information: regular expressions!"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "import re # <= regular expression library\n",
86 | "\n",
87 | "movies = [\"2019: After the Fall of New York\"\n",
88 | " , \"The exterminators of the year 3000\"\n",
89 | " , \"1990: The Bronx Warriors\"]\n",
90 | "\n",
91 | "for m in movies:\n",
92 | " print(re.search(\"(\\d\\d\\d\\d)\", m).group(0))"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "**...what??**\n",
100 | "\n",
101 | "Some people don't like regular expressions:\n",
102 | "\n",
103 | "> Some people, when confronted with a problem, think\n",
104 | "“I know, I'll use regular expressions.” Now they have two problems.\n",
105 | "\n",
106 | "\n",
107 | "- Jamie Zawinski"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Regular expressions in context\n",
115 | "\n",
116 | "Regular expressions were invented, in their modern form, in 1951 by Stephen Kleene. They have their roots in theoretical computer science, although they are extremely useful as a text parsing tool.\n",
117 | "\n",
118 | "Practically every language has regular expressions built-in. They are often super optimized and always expressed in an archaic syntax.\n",
119 | "\n",
120 | "Regular expressions allow you to use basic components to parse a language. Here are some pseudo-code examples of regex expressions:\n",
121 | "\n",
122 | "Find all characters which are digits\n",
123 | "\n",
124 | "Find all characters which are digits, followed by another digit\n",
125 | "\n",
126 | "Find all characters which are at the beginning of a line, are of one of the following characters: [,.!;:], followed by 3 digits, followed by a comma, followed by three characters which are NOT digits"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Sample regular expressions"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "ages = \"Homer is 38 years old, Marge is 36 years old, Bart is 10 years old, Lisa is 8 years old and Maggie is 3.\"\n",
143 | "\n",
144 | "# Task: Extract all ages\n",
145 | "# Thinking: Find all numbers\n",
146 | "# Regex pseudo code: find digits\n",
147 | "\n",
148 | "regex_attempt1 = \"(\\d)\" # <= Find digits\n",
149 | "\n",
150 | "for m in re.finditer(regex_attempt1, ages): \n",
151 | " print(\"Match starts at\",m.start(), \"ends at\", m.end(), \"and contains\", m.group())"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "# Task: Extract all ages\n",
161 | "# Thinking: Find all numbers\n",
162 | "# Regex pseudo code: find digits, clump consecutive digits together\n",
163 | "\n",
164 | "regex_attempt1 = \"(\\d+)\" # <= Find digits and 1 or more repetitions\n",
165 | "\n",
166 | "for m in re.finditer(regex_attempt1, ages): \n",
167 | " print(\"Match starts at\",m.start(), \"ends at\", m.end(), \"and contains\", m.group())"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "## Just use http://www.pyregex.com/ or https://www.debuggex.com/"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "**Exercise** Extract area codes from the following phone numbers. _Must_ write a single regex which is able to extract the area codes from the following numbers (in a loop):\n",
182 | "\n",
183 | "1-201-123-1234\n",
184 | "\n",
185 | "98-708-567-7890\n",
186 | "\n",
187 | "0-708-333-4444\n",
188 | "\n",
189 | "In the above numbers, the area codes are 201, 708 and 708, respectively."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "area_code_regex = r\"type_correct_regex_expression_here\"\n",
199 | "\n",
200 | "for ac in [\"1-201-123-1234\", \"98-708-567-7890\", \"0-708-333-4444\"]:\n",
201 | " print(re.findall(area_code_regex, ac))"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "Hint: Look for the start of string, then one or more digits, then a dash, THEN the digits which contain our area code. Ignore the rest.\n"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## What regular expressions can't do\n",
216 | "\n",
217 | "Regular expressions are part of a theoretical framework which define languages. There are languages which are less or more powerful than regular expressions.\n",
218 | "\n",
219 | "For example, regular expressions are not able to correctly parse this expression:\n",
220 | "\n",
221 | "`1 + (2 * (3 + 8))`\n",
222 | "\n",
223 | "In order to parse the expression above, after each left parenthesis, we would have to use recursion. Regular expressions are not designed to parse such recursive expressions.\n",
224 | "\n",
225 | "Practically speaking, although _many_ people attempt it, regular expressions are not the correct choice to parse html (web) pages or xml documents.\n",
226 | "\n",
227 | "Computer science students often learn about context free grammars. CFGs _can_ parse recursive strings and are often used to parse programming languages. Unfortunately, CFGs are out of scope for this course."
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": []
236 | }
237 | ],
238 | "metadata": {
239 | "kernelspec": {
240 | "display_name": "Python 3",
241 | "language": "python",
242 | "name": "python3"
243 | },
244 | "language_info": {
245 | "codemirror_mode": {
246 | "name": "ipython",
247 | "version": 3
248 | },
249 | "file_extension": ".py",
250 | "mimetype": "text/x-python",
251 | "name": "python",
252 | "nbconvert_exporter": "python",
253 | "pygments_lexer": "ipython3",
254 | "version": "3.8.5"
255 | }
256 | },
257 | "nbformat": 4,
258 | "nbformat_minor": 4
259 | }
260 |
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/badly_typed_code.py:
--------------------------------------------------------------------------------
1 |
2 | def calc_grade(grade:str) -> str:  # NOTE(review): annotation says str, but the comparison below requires a number — this looks deliberately mistyped (filename: badly_typed_code.py), likely a type-checking teaching example; confirm before "fixing"
3 | 
4 |     print(grade.capitalize())  # only valid if grade really is a str
5 | 
6 |     if grade > 3.5: return 'Pass'  # raises TypeError at runtime when grade is a str ('str' vs 'float' comparison)
7 |     else: return "Fail"
8 |
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/best-mommy-ever-jewelry.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/best-mommy-ever-jewelry.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/class_diff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/class_diff.png
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/clock.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/clock.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/david_chang.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/david_chang.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/how-to-control-feedback-in-a-sound-system_header.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/how-to-control-feedback-in-a-sound-system_header.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/ifelse_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/ifelse_diagram.png
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/inception.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/inception.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/infinitemirror.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/infinitemirror.jpg
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/listcomprehension.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/listcomprehension.png
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/loop_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/loop_diagram.png
--------------------------------------------------------------------------------
/lectures/025_all_of_python_basics/images/y_combinator.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/025_all_of_python_basics/images/y_combinator.jpg
--------------------------------------------------------------------------------
/lectures/030_intro_to_pandas/180-pandas-operations_str_dt_apply.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "tags": []
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "%reload_ext postcell\n",
22 | "%postcell register"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "tags": []
30 | },
31 | "outputs": [],
32 | "source": [
33 | "names_ages_df = pd.DataFrame({'Name':['george washington', 'john adams', 'thomas jefferson', 'james madison', 'james monroe', 'andrew jackson', 'john quincy adams']\n",
34 | " , 'DOB':['2/22/1732', '10/30/1735', '4/13/1743', '3/16/1751', '4/28/1758', '3/15/1767', '7/11/1767']})\n",
35 | "names_ages_df"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "tags": []
43 | },
44 | "outputs": [],
45 | "source": [
46 | "names_ages_df.dtypes"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "tags": []
54 | },
55 | "outputs": [],
56 | "source": [
57 | "names_ages_df.DOB = pd.to_datetime(names_ages_df.DOB)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "tags": []
65 | },
66 | "outputs": [],
67 | "source": [
68 | "names_ages_df.dtypes"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {
75 | "tags": []
76 | },
77 | "outputs": [],
78 | "source": [
79 | "names_ages_df.head()"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "# Using `str` to do string operations on Pandas columns"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Given a string, such as 'george washington', normal Python will let you change case (`capitalize`, `lower`, `upper`), `split` it into tokens and do countless other operations:"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "tags": []
101 | },
102 | "outputs": [],
103 | "source": [
104 | "'george washington'.title()"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "But how can you do the same thing with pandas columns?"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "tags": []
119 | },
120 | "outputs": [],
121 | "source": [
122 | "names_ages_df.Name"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "tags": []
130 | },
131 | "outputs": [],
132 | "source": [
133 | "names_ages_df.Name.title()"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "The answer is `df.col.str`. Calling the `str` property will bring back all of core python's string functions:"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "tags": []
148 | },
149 | "outputs": [],
150 | "source": [
151 | "names_ages_df.Name.str.title()"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "# Using `dt` to do datetime operations on Pandas columns\n",
159 | "\n",
160 | "Similar to `str`, datetime operations can be done on pandas columns via `df.col.dt`:"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {
167 | "tags": []
168 | },
169 | "outputs": [],
170 | "source": [
171 | "names_ages_df.DOB.dt.year"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "# Using `apply` for any transformation\n",
179 | "\n",
180 | "Although pandas provides type specific methods via `str` and `dt`, using the `apply` function gives you much greater control.\n",
181 | "\n",
182 | "You can pass in a function (including `lambda` functions, see relevant lecture if you are not familiar) so each cell appears as a single input. You can then apply any transformation you like.\n",
183 | "\n",
184 | "#### Example: Change names from \"first last\" to \"Last, First\""
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "tags": []
192 | },
193 | "outputs": [],
194 | "source": [
195 | "names_ages_df"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {
202 | "tags": []
203 | },
204 | "outputs": [],
205 | "source": [
206 | "def last_first(name):\n",
207 | " tokens = name.split()\n",
208 | " return f'{tokens[1].capitalize()}, {tokens[0].capitalize()}'\n",
209 | "\n",
210 | "last_first('homer simpson')"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "tags": []
218 | },
219 | "outputs": [],
220 | "source": [
221 | "\n",
222 | "names_ages_df.Name.apply(last_first)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {
229 | "tags": []
230 | },
231 | "outputs": [],
232 | "source": [
233 | "names_ages_df['Normalzied Name'] = names_ages_df.Name.apply(last_first)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {
240 | "tags": []
241 | },
242 | "outputs": [],
243 | "source": [
244 | "names_ages_df"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "tags": []
252 | },
253 | "outputs": [],
254 | "source": [
255 | "names_ages_df.Name.apply(lambda x: f'{x.split()[1].capitalize()}')"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": []
264 | }
265 | ],
266 | "metadata": {
267 | "kernelspec": {
268 | "display_name": "Python 3 (ipykernel)",
269 | "language": "python",
270 | "name": "python3"
271 | },
272 | "language_info": {
273 | "codemirror_mode": {
274 | "name": "ipython",
275 | "version": 3
276 | },
277 | "file_extension": ".py",
278 | "mimetype": "text/x-python",
279 | "name": "python",
280 | "nbconvert_exporter": "python",
281 | "pygments_lexer": "ipython3",
282 | "version": "3.12.4"
283 | }
284 | },
285 | "nbformat": 4,
286 | "nbformat_minor": 4
287 | }
288 |
--------------------------------------------------------------------------------
/lectures/030_intro_to_pandas/images/dataframes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/dataframes.jpg
--------------------------------------------------------------------------------
/lectures/030_intro_to_pandas/images/series.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/series.jpg
--------------------------------------------------------------------------------
/lectures/030_intro_to_pandas/images/splitapplycombine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/030_intro_to_pandas/images/splitapplycombine.png
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/EBMotherboard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/EBMotherboard.jpg
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/How_to_stress_test_your_CPU-Hero.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/How_to_stress_test_your_CPU-Hero.jpg
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/Laptop-hard-drive-exposed.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/Laptop-hard-drive-exposed.jpg
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/RAM-Modules.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/RAM-Modules.jpg
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/Supermicro-X12SCA-F-Overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/Supermicro-X12SCA-F-Overview.jpg
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/ascii.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/ascii.png
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/calc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/calc.png
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/overview-fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/overview-fig1.png
--------------------------------------------------------------------------------
/lectures/040_basic_computer_architecture/images/unicode_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/040_basic_computer_architecture/images/unicode_sample.png
--------------------------------------------------------------------------------
/lectures/045_intro_to_numpy/images/chicago.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicago.jpeg
--------------------------------------------------------------------------------
/lectures/045_intro_to_numpy/images/chicago.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicago.png
--------------------------------------------------------------------------------
/lectures/045_intro_to_numpy/images/chicagobw.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicagobw.jpeg
--------------------------------------------------------------------------------
/lectures/045_intro_to_numpy/images/chicagobw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/045_intro_to_numpy/images/chicagobw.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/Why do you need version control.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/Why do you need version control.pptx
--------------------------------------------------------------------------------
/lectures/050_git_version_control/assets/copy_to_dropbox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/copy_to_dropbox.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/assets/folder_versions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/folder_versions.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/assets/github-desktop-screenshot-windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/github-desktop-screenshot-windows.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/assets/macgit-03-open.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/macgit-03-open.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/assets/share_code_email.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/050_git_version_control/assets/share_code_email.png
--------------------------------------------------------------------------------
/lectures/050_git_version_control/readme.txt:
--------------------------------------------------------------------------------
1 | Convert marp presentation to pdf/power point via:
2 | https://github.com/marp-team/marp-cli
--------------------------------------------------------------------------------
/lectures/050_git_version_control/understanding_version_control.md:
--------------------------------------------------------------------------------
1 | ---
2 | theme: "white"
3 | transition: "fade"
4 | highlightTheme: "dracula"
5 | center: false
6 | marp: true
7 | ---
8 |
9 | # Understanding Version Control
10 |
11 | by Shahbaz Chaudhary
12 |
13 | ---
14 |
15 | # GitHub as a code repository
16 | https://github.com/pandas-dev/pandas
17 |
18 | * Check out recent commits, notice the files changed and their diffs
19 | * Take a look at a user who recently submitted a commit
20 | * Check out their projects, as if you were looking to hire them
21 |
22 | Top contributors
23 | https://github.com/pandas-dev/pandas/graphs/contributors
24 |
25 | Branching off and doing work
26 | https://github.com/pandas-dev/pandas/network
27 |
28 | ---
29 |
30 | # GitHub as a way to track issues
31 |
32 | https://github.com/pandas-dev/pandas/issues
33 |
34 | * Take a look at the open issues
35 | * Take a look at closed issues and how they connect to commits
36 |
37 | ---
38 |
39 | # GitHub as your resume?
40 |
41 | * https://github.com/wesm
42 | * https://github.com/hadley
43 |
44 | ---
45 |
46 | # [Task] Let's get Git installed
47 |
48 | https://git-scm.com/downloads
49 |
50 | **Mac Users** Download git file, _right click_ and select open (don't double click)
51 |
52 | 
53 | Then follow prompts and keep selecting default options
54 |
55 | Step by step instructions: https://www.linode.com/docs/development/version-control/how-to-install-git-on-linux-mac-and-windows/
56 |
57 | ---
58 |
59 | # [Task] Configure Git (Optional)
60 | Once installed, run the following at the command line:
61 |
62 | ```git config --global user.name examplename```
63 |
64 | ```git config --global user.email user@example.com```
65 |
66 | ---
67 |
68 | # [Task] Install GitHub's software
69 |
70 | https://desktop.github.com/
71 |
72 | 
73 |
74 | ---
75 |
76 | # [Task] Install VS Code
77 |
78 | https://code.visualstudio.com/
79 |
80 | * Install extension "python" (from Microsoft)
81 |
82 | ---
83 |
84 | # What is the difference between Git and GitHub?
85 |
86 | CVS -> SVN -> Git
87 |
88 | ---
89 |
90 | # [Task] Create a GitHub account
91 |
92 | https://github.com/
93 |
94 | This will be your resume for many _years_, pick a good name. This is an example of a very bad name:
95 | https://github.com/falconair
96 |
97 | ---
98 |
99 | # [Task] Send me your GitHub user names (NOT passwords!)
100 |
101 | https://forms.gle/wgwV1ztzFqD1Bz9x7
102 |
103 | ---
104 |
105 | # Show Leadership
106 |
107 | Data science departments are 5-15 years behind software engineering in professional infrastructure. Be a leader in your field by learning from programmers.
108 |
109 | ---
110 |
111 | # Professional work vs hacking
112 |
113 | Professional work requires more than coding and building models. Here are some _extra_ things you need to do as a professional (which are within scope of this presentation):
114 |
115 | * Your work must be backed up. Losing your laptop must not mean losing your work
116 | * You must be able to go back to an older version of your work
117 | * You must be able to collaborate with your team-mates, without stepping on each other's toes
118 |
119 | ---
120 |
121 | # How do you keep track of working model vs experiments?
122 |
123 | 
124 |
125 | ---
126 |
127 | # How do you protect against a broken laptop?
128 |
129 | 
130 | (src: https://www.labnol.org/software/send-files-to-dropbox/18438/)
131 |
132 | ---
133 |
134 | # How do you collaborate with your colleagues?
135 |
136 | 
137 |
138 | ---
139 |
140 | # [Task] Create a new "repo" (repository)
141 | * Create a new GitHub repository (upper right hand corner)
142 | * Call it "PfA_test"
143 | * Keep it public
144 | * Check "Initialize this repository with a README"
145 | * Click "Create repository"
146 | * Use command line or Desktop app to "clone" the repo to your disk
147 |
148 | ```git clone https://github.com/<your-username>/PfA_test.git```
149 |
150 | ---
151 |
152 | # [Task] Update something in your project
153 |
154 | Add this to readme\.md (try using vs code)
155 | ```markdown
156 | # My name is Shahbaz
157 |
158 | ## This is a git experiment
159 |
160 | This is some random text
161 | ```
162 |
163 | ---
164 |
165 | # [Task] _Commit_ your code to your _local_ repo
166 |
167 | Using VS Code, GitHub Desktop or command line, "commit" your code
168 |
169 | To "commit" your code means to tell Git to start keeping track of it.
170 |
171 | Command line
172 |
173 | ```git commit -m "Adds content to readme file"```
174 |
175 | ---
176 |
177 | Your code is now "saved," along with a text describing the change. No need for multiple folders.
178 |
179 | ---
180 |
181 | # [Task] _Push_ this change to GitHub
182 |
183 | Using VS Code, GitHub Desktop or command line, "push" your code to GitHub
184 |
185 | Command line
186 |
187 | ```git push```
188 |
189 | ---
190 |
191 | Your code is now 'backed-up' at a remote location
192 |
193 | ---
194 |
195 | # [Task] Update my readme.md file and add your name
196 |
197 | * Clone my repo: `git clone <instructor's repo URL>`
198 | * Open my readme.md file using VS Code
199 | * Add your name to the list (don't remove anyone else's name)
200 | * Commit code and push it
201 |
202 | (You may get merge conflicts; VS Code makes them easier to resolve)
203 |
204 | ---
205 |
206 | You have now learned how to
207 | * save various versions of your code
208 | * back up your code
209 | * collaborate with your colleagues
210 |
211 | ---
212 |
213 | A couple of tools you should be aware of:
214 | * Diff
215 | * nbdiff (from nbdime, for diffing Jupyter notebooks)
216 |
217 | ---
218 |
219 | # References
220 |
221 | * Mac open screenshot from https://www.linode.com/docs/development/version-control/how-to-install-git-on-linux-mac-and-windows/
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/100 - Work With Taxi Trips - memory_map.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# memory_map test"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "7.32 s ± 186 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "7.17 s ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False, memory_map=True)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": []
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "59.2 s ± 88.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "59.6 s ± 180 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False, memory_map=True)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.7.3"
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 2
123 | }
124 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/110 - Work With Taxi Trips - compression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# compression test"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "7.22 s ± 25.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 4,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "9.97 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv.gz\", low_memory=False)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": []
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 6,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "1min 2s ± 770 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 7,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "1min 18s ± 679 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv.gz\", low_memory=False)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.7.3"
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 2
123 | }
124 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/120 - Work With Taxi Trips - feather format.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Chunking and feather format test"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 |     "The Feather format, and an associated technology called _Arrow_, were created by Wes McKinney and Hadley Wickham. *LOTS* of new and interesting infrastructure is being built around it. However, it is very new and lots of tooling is missing."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### You may have to install pyarrow to access to_feather and read_feather functionality"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#!conda install --y pyarrow -c conda-forge"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import pyarrow as pa\n",
41 | "from pyarrow import csv"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "### Notice that read_feather is _MUCH_ faster than read_csv"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "CPU times: user 56.7 s, sys: 5.26 s, total: 1min 1s\n",
61 | "Wall time: 1min 1s\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "%time data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 4,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "name": "stdout",
76 | "output_type": "stream",
77 | "text": [
78 | "CPU times: user 5.52 s, sys: 7.2 s, total: 12.7 s\n",
79 | "Wall time: 17 s\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "%time data_df.to_feather(\"../../datasets/taxi-trips/taxi_trips_small.feather\")"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 4,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "CPU times: user 6.67 s, sys: 2.49 s, total: 9.16 s\n",
97 | "Wall time: 9.34 s\n"
98 | ]
99 | }
100 | ],
101 | "source": [
102 | "%time data_df = pd.read_feather(\"../../datasets/taxi-trips/taxi_trips_small.feather\")"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "kernelspec": {
115 | "display_name": "Python 3",
116 | "language": "python",
117 | "name": "python3"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 3
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython3",
129 | "version": "3.7.3"
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 2
134 | }
135 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/135 - Work With Taxi Trips - skip columns pre-req.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import sys\n",
11 | "import numpy as np"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "### How big is your computer?"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
30 | "Memory:\t 31.2348 Gigabytes\n",
31 | "Disk:\t 111.4335 Gigabytes\n",
32 | "Cpus:\t 4\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "from psutil import virtual_memory, disk_usage, cpu_count\n",
38 | "\n",
39 | "bytes_in_gb = 1024**3\n",
40 | "\n",
41 | "print(\"Memory:\\t\",round(virtual_memory().total/bytes_in_gb,4), \"Gigabytes\")\n",
42 | "print(\"Disk:\\t\",round(disk_usage(os.path.abspath(os.sep)).total/bytes_in_gb,4), \"Gigabytes\")\n",
43 | "print(\"Cpus:\\t\", cpu_count())"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 |     "### Let's test the speed of your computer"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "14.2 ms ± 86.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "%%timeit\n",
68 | "sum(range(1_000_000))"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### Why you shouldn't keep your integers as strings"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 27,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "dtype('int64')"
87 | ]
88 | },
89 | "execution_count": 27,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | }
93 | ],
94 | "source": [
95 | "np.array([1]).dtype"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 18,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "(8, 4)"
107 | ]
108 | },
109 | "execution_count": 18,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "np.array([1]).itemsize, np.array([\"1\"]).itemsize"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 19,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/plain": [
126 | "(8, 8)"
127 | ]
128 | },
129 | "execution_count": 19,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "np.array([10]).itemsize, np.array([\"10\"]).itemsize"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 20,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/plain": [
146 | "(8, 12)"
147 | ]
148 | },
149 | "execution_count": 20,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "np.array([100]).itemsize, np.array([\"100\"]).itemsize"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 21,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "(8, 16)"
167 | ]
168 | },
169 | "execution_count": 21,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "np.array([1000]).itemsize, np.array([\"1000\"]).itemsize"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 22,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "data": {
185 | "text/plain": [
186 | "(8, 40)"
187 | ]
188 | },
189 | "execution_count": 22,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "#Seconds since epoch\n",
196 | "np.array([1565232961]).itemsize, np.array([\"1565232961\"]).itemsize"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": []
205 | }
206 | ],
207 | "metadata": {
208 | "kernelspec": {
209 | "display_name": "Python 3",
210 | "language": "python",
211 | "name": "python3"
212 | },
213 | "language_info": {
214 | "codemirror_mode": {
215 | "name": "ipython",
216 | "version": 3
217 | },
218 | "file_extension": ".py",
219 | "mimetype": "text/x-python",
220 | "name": "python",
221 | "nbconvert_exporter": "python",
222 | "pygments_lexer": "ipython3",
223 | "version": "3.7.3"
224 | }
225 | },
226 | "nbformat": 4,
227 | "nbformat_minor": 2
228 | }
229 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/140B - Work With Taxi Trips - skip columns.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 35,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "CPU times: user 41min 4s, sys: 4.24 s, total: 41min 9s\n",
23 | "Wall time: 41min 8s\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "schema = {'Trip ID': 'str',\n",
29 | " 'Taxi ID': 'category',\n",
30 | " 'Trip Start Timestamp': 'str',\n",
31 | " 'Trip End Timestamp': 'str',\n",
32 | " 'Trip Seconds': 'float64',\n",
33 | " 'Trip Miles': 'float64',\n",
34 | " 'Pickup Census Tract': 'float64',\n",
35 | " 'Dropoff Census Tract': 'float64',\n",
36 | " 'Pickup Community Area': 'float64',\n",
37 | " 'Dropoff Community Area': 'float64',\n",
38 | " 'Fare': 'float64',\n",
39 | " 'Tips': 'float64',\n",
40 | " 'Tolls': 'float64',\n",
41 | " 'Extras': 'float64',\n",
42 | " 'Trip Total': 'float64',\n",
43 | " 'Payment Type': 'category',\n",
44 | " 'Company': 'category',\n",
45 | " 'Pickup Centroid Latitude': 'float64',\n",
46 | " 'Pickup Centroid Longitude': 'float64',\n",
47 | " 'Pickup Centroid Location': 'str',\n",
48 | " 'Dropoff Centroid Latitude': 'float64',\n",
49 | " 'Dropoff Centroid Longitude': 'float64',\n",
50 | " 'Dropoff Centroid Location': 'str',\n",
51 | " 'Community Areas': 'float64'}\n",
52 | "\n",
53 | "usecols = ['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp',\\\n",
54 | " 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract',\\\n",
55 | " 'Dropoff Census Tract', 'Pickup Community Area',\\\n",
56 | " 'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras',\\\n",
57 | " 'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude',\\\n",
58 | " 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',\\\n",
59 | " 'Dropoff Centroid Longitude', 'Community Areas']\n",
60 | "\n",
61 | "%time data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\" \\\n",
62 | " , dtype = schema \\\n",
63 | " , usecols = usecols \\\n",
64 | " , parse_dates = ['Trip Start Timestamp', 'Trip End Timestamp'])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 36,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "Total memory usage: 2.59 GB\n",
77 | "CPU times: user 5.27 s, sys: 3 µs, total: 5.27 s\n",
78 | "Wall time: 5.29 s\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "%time print(\"Total memory usage:\", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), \"GB\")"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": []
92 | }
93 | ],
94 | "metadata": {
95 | "kernelspec": {
96 | "display_name": "Python 3",
97 | "language": "python",
98 | "name": "python3"
99 | },
100 | "language_info": {
101 | "codemirror_mode": {
102 | "name": "ipython",
103 | "version": 3
104 | },
105 | "file_extension": ".py",
106 | "mimetype": "text/x-python",
107 | "name": "python",
108 | "nbconvert_exporter": "python",
109 | "pygments_lexer": "ipython3",
110 | "version": "3.7.3"
111 | }
112 | },
113 | "nbformat": 4,
114 | "nbformat_minor": 2
115 | }
116 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/150 - Work With Taxi Trips - c_parser.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# C parser test"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "7.62 s ± 42.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "7.55 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_smaller.csv\", low_memory=False, engine='c')"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": []
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "1min 2s ± 50.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 6,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "1min 2s ± 60.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "%timeit data_df = pd.read_csv(\"../../datasets/taxi-trips/taxi_trips_small.csv\", low_memory=False, engine='c')"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": []
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.7.3"
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 2
123 | }
124 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/160 - Work With Taxi Trips - Chunk to parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "\n",
11 | "from tqdm import tqdm, tqdm_notebook, tnrange\n",
12 | "tqdm.pandas()"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "### Go through a large csv and convert to parquet"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "schema = {'Trip ID': 'str',\n",
29 | " 'Taxi ID': 'category',\n",
30 | " 'Trip Start Timestamp': 'str',\n",
31 | " 'Trip End Timestamp': 'str',\n",
32 | " 'Trip Seconds': 'float64',\n",
33 | " 'Trip Miles': 'float64',\n",
34 | " 'Pickup Census Tract': 'float64',\n",
35 | " 'Dropoff Census Tract': 'float64',\n",
36 | " 'Pickup Community Area': 'float64',\n",
37 | " 'Dropoff Community Area': 'float64',\n",
38 | " 'Fare': 'float64',\n",
39 | " 'Tips': 'float64',\n",
40 | " 'Tolls': 'float64',\n",
41 | " 'Extras': 'float64',\n",
42 | " 'Trip Total': 'float64',\n",
43 | " 'Payment Type': 'category',\n",
44 | " 'Company': 'category',\n",
45 | " 'Pickup Centroid Latitude': 'float64',\n",
46 | " 'Pickup Centroid Longitude': 'float64',\n",
47 | " 'Pickup Centroid Location': 'str',\n",
48 | " 'Dropoff Centroid Latitude': 'float64',\n",
49 | " 'Dropoff Centroid Longitude': 'float64',\n",
50 | " 'Dropoff Centroid Location': 'str',\n",
51 | " 'Community Areas': 'float64'}\n",
52 | "\n",
53 | "usecols = ['Trip ID', 'Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp',\\\n",
54 | " 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract',\\\n",
55 | " 'Dropoff Census Tract', 'Pickup Community Area',\\\n",
56 | " 'Dropoff Community Area', 'Fare', 'Tips', 'Tolls', 'Extras',\\\n",
57 | " 'Trip Total', 'Payment Type', 'Company', 'Pickup Centroid Latitude',\\\n",
58 | " 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',\\\n",
59 | " 'Dropoff Centroid Longitude', 'Community Areas']\n",
60 | "\n"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {
67 | "scrolled": false
68 | },
69 | "outputs": [
70 | {
71 | "name": "stderr",
72 | "output_type": "stream",
73 | "text": [
74 | "114it [7:06:22, 163.28s/it]"
75 | ]
76 | },
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "CPU times: user 7h 6min 33s, sys: 1min 3s, total: 7h 7min 36s\n",
82 | "Wall time: 7h 6min 22s\n"
83 | ]
84 | },
85 | {
86 | "name": "stderr",
87 | "output_type": "stream",
88 | "text": [
89 | "\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "%%time\n",
95 | "chunk_size = 1_000_000\n",
96 | "for chunk_counter, chunk_df in enumerate(tqdm(pd.read_csv(\"../../datasets/taxi-trips/Taxi_Trips.csv.gz\" \\\n",
97 | " , dtype = schema \\\n",
98 | " , usecols = usecols \\\n",
99 | " , parse_dates = ['Trip Start Timestamp', 'Trip End Timestamp'] \\\n",
100 | " , compression = \"gzip\"\\\n",
101 | " , chunksize=chunk_size))):\n",
102 | " OUTFILE = \"../../datasets/taxi-trips/taxi_trips_parquet/taxi_trips_\"+str(chunk_counter)+\".parquet\"\n",
103 | " chunk_df.to_parquet(OUTFILE, compression='gzip')"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.7.3"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 2
135 | }
136 |
--------------------------------------------------------------------------------
/lectures/055_bigger_data_pandas/lecture.md:
--------------------------------------------------------------------------------
1 | ---
2 | theme: "white"
3 | transition: "fade"
4 | marp: true
5 | highlightTheme: "dracula"
6 | ---
7 |
8 | # Handling bigger data with Pandas
9 |
10 | ---
11 |
12 | ## General guidelines for choosing a big data tool:
13 | * If data fits in memory, use Pandas/R/Excel
14 | * If data fits on disk, use a database
15 | * If data is bigger than a disk drive, use Hadoop
16 |
17 | ---
18 |
19 | ## Well known maxims in computer science:
20 | *"Premature optimization is the root of all evil"* - Don Knuth
21 |
22 | *"Developers themselves highlight the fact that those doing research should exercise caution when using such microbenchmarks"* - Wikipedia article about the benchmark game
23 |
24 |
25 | *"You don't have to be an engineer to be a racing driver, but you do have to have Mechanical Sympathy."* Jackie Stewart, racing driver - Quoted by Martin Thompson
26 |
27 | ---
28 |
29 | ## A very important insight into understanding performance issues:
30 | The pyramid of latency varies by orders of magnitude
31 |
32 | ---
33 |
34 | ## Latency numbers every programmer should know
35 | (source: https://gist.github.com/hellerbarde/2843375)
36 | (originally by Jeff Dean)
37 |
38 | ```a
39 | L1 cache reference ......................... 0.5 ns
40 | Branch mispredict ............................ 5 ns
41 | L2 cache reference ........................... 7 ns
42 | Mutex lock/unlock ........................... 25 ns
43 | Main memory reference ...................... 100 ns
44 | Compress 1K bytes with Zippy ........ 3,000 ns = 3 µs
45 | Send 2K bytes over 1 Gbps network .. 20,000 ns = 20 µs
46 | SSD random read ................... 150,000 ns = 150 µs
47 | Read 1 MB sequentially from memory 250,000 ns = 250 µs
48 | Round trip within same datacenter . 500,000 ns = 0.5 ms
49 | Read 1 MB sequentially from SSD* 1,000,000 ns = 1 ms
50 | Disk seek ...................... 10,000,000 ns = 10 ms
51 | Read 1 MB sequentially from disk 20,000,000 ns = 20 ms
52 | Send packet CA->Holland->CA ... 150,000,000 ns = 150 ms
53 | ```
54 |
55 | ---
56 |
57 | #### In human terms (multiply above numbers by a billion)
58 |
59 | ```a
60 |
61 | -L1 cache reference 0.5 s
62 | One heart beat (0.5 s)
63 | -Branch mispredict 5 s
64 | Yawn
65 | -L2 cache reference 7 s
66 | Long yawn
67 | -Mutex lock/unlock 25 s
68 | Making a coffee
69 | -Main memory reference 100 s
70 | Brushing your teeth
71 | -Compress 1K bytes with Zippy 50 min
72 | One episode of a TV show (including ad breaks)
73 | -Send 2K bytes over 1 Gbps network 5.5 hr
74 | From lunch to end of work day
75 | -SSD random read 1.7 days
76 | A normal weekend
77 | -Read 1 MB sequentially from memory 2.9 days
78 | A long weekend
79 | -Round trip within same datacenter 5.8 days
80 | A medium vacation
81 | -Read 1 MB sequentially from SSD 11.6 days
82 | Waiting for almost 2 weeks for a delivery
83 | -Disk seek 16.5 weeks
84 | A semester in university
85 | -Read 1 MB sequentially from disk 7.8 months
86 | Almost producing a new human being
87 | -The above 2 together 1 year
88 | -Send packet CA->Netherlands->CA 4.8 years
89 | Average time it takes to complete a bachelor's degree
90 | ```
91 |
92 | ---
93 |
94 | Luckily _amortization_ saves us. You don't actually look up individual integers from disk each time; you read a chunk of data from disk into memory, thereby _amortizing_ the cost of a disk access over many, many reads.
95 |
96 | ---
97 |
98 | Check memory prices
99 |
100 | ---
101 |
102 | ## Let's look at a real-world file: Chicago Taxi Data
103 | https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/
104 |
105 | Size of full file...no idea, it is simply too big!
106 |
107 | ---
108 |
109 | ### My email to data providers:
110 |
111 | Subject: *Can you please, for the love of God, show how big a file is before we download it?*
112 |
113 | ...
114 | I’m sitting at 30 gigs right now for the taxi data and I have no idea how much more needs to be downloaded.
115 | ...
116 |
117 | Response:
118 | There is a bit of complexity to this but, in the end, these would have to be feature changes in the software we use. ...
119 |
120 | ---
121 |
122 | ### Size of (partially downloaded) file (and subsets):
123 |
124 | ```a
125 | Compressed:
126 | Size of Taxi_Trips.csv.gz: 14G
127 | Size of taxi_trips_small.csv.gz: 1G
128 | Size of taxi_trips_smaller.csv.gz 154M
129 |
130 | Uncompressed:
131 | Size of Taxi_Trips.csv: ????
132 | Size of taxi_trips_small.csv: 3.4G
133 | Size of taxi_trips_smaller.csv: 456M
134 |
135 | Lines in Taxi_Trips.csv.gz: 113,115,259 (100 million)
136 | Lines in taxi_trips_small.csv.gz: 11,311,525 (11 million)
137 | Lines in taxi_trips_smaller.csv.gz: 1,131,152 (1 million)
138 | ```
139 |
140 | ---
141 |
142 | ### Notebooks
143 |
144 | * 050 - Work With Taxi Trips - Get to know the file.ipynb
145 | * 100 - Work With Taxi Trips - memory_map.ipynb
146 | * 110 - Work With Taxi Trips - compression.ipynb
147 | * 150 - Work With Taxi Trips - c_parser.ipynb
148 |
149 | * 120 - Work With Taxi Trips - feather format.ipynb
150 | * 130 - Work With Taxi Trips - chunking and tqdm.ipynb
151 | * 160 - Work With Taxi Trips - Chunk to parquet.ipynb
152 | * 160B - Work With Taxi Trips - Read from parquet files.ipynb
153 |
154 | * 135 - Work With Taxi Trips - skip columns pre-req.ipynb
155 | * 140 - Work With Taxi Trips - skip columns.ipynb
156 | * 140B - Work With Taxi Trips - skip columns.ipynb
157 |
158 | ---
159 |
160 | ## Bonus material: Bash commandline as a super-power
161 | * `ls -ltrhc` to get the size of the file
162 | * `cat` to see the contents of a file
163 | * `zcat` to see the contents of a compressed file
164 | * `head -n` or `tail` to see the first or last few lines
165 | * `wc -l` to count the number of lines in a text file
166 | * `cut -d -f` to retrieve specific columns
167 | * `tr` or `sed` to replace one character or string with another
168 |
--------------------------------------------------------------------------------
/lectures/060_learn_command_line/learn_command_line_2.md:
--------------------------------------------------------------------------------
1 |
2 | # **Introduction to the Linux Command Line**
3 |
4 | Your computer has a file system, organized in a hierarchy.
5 |
6 | ### Windows file system looks something like this:
7 |
8 | ```
9 | c:\
10 | ...
11 | Program Files
12 | Program Files (x86)
13 | Windows
14 | Users
15 | Public
16 | shahbaz
17 | Desktop
18 | Documents
19 | Pictures
20 | .gitconfig
21 | ...
22 | ```
23 |
24 | ### Mac users' file system looks something like this
25 |
26 | ```
27 | /
28 | ...
29 | tmp/
30 | Applications/
31 | Library/
32 | Users/
33 | shahbaz/
34 | Applications/
35 | Desktop/
36 | Downloads/
37 | ...
38 | ```
39 |
40 | ### Linux users' file system looks something like this
41 |
42 | ```
43 | /
44 | ...
45 | tmp/
46 | bin/
47 | opt/
48 | home/
49 | shahbaz/
50 | ```
51 |
52 |
53 | ## **Week 1: Exploring the Command Line**
54 | ### **1. Connecting to a Remote Machine**
55 | - Check network connectivity with `ping`:
56 | ```bash
57 | ping google.com
58 | ```
59 |
60 | - Use `ssh` to log in:
61 | ```bash
62 | ssh -l <username> <host>
63 | ```
64 | - **Example:** `ssh -l student 192.168.1.10`
65 |
66 | ### **2. Basic Navigation**
67 |
68 | A path tells you a location in a file system: `/home/shahbaz/myfile.txt` tells you that you can access the file `myfile.txt` by navigating to the "top" `/`, then changing directory to `home`, then to `shahbaz`.
69 |
70 | Notice that this is similar to a web URL: `https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/`
71 |
72 |
73 |
74 | - **Commands**:
75 | - `pwd` – Print working directory.
76 | - `ls` – List files and directories.
77 | - `cd` – Change directory.
78 | - `tree` – Show directory structure (if installed).
79 | - Explore:
80 | - Use `ls -a` to see hidden files (`.bashrc`, `.profile`).
81 | - Understand paths:
82 | - `.` – Current directory
83 | - `..` – Parent directory
84 | - `~` – Home directory
85 | - Absolute vs. Relative paths.
86 |
87 | ### **3. Viewing File Contents**
88 | - **Commands**:
89 | - `head <file>` – View the first few lines.
90 | - `cat <file>` – Show entire file (be cautious with large files).
91 | - `less <file>` – Paginated view with navigation.
92 |
93 | ### **Exercise**
94 | - Navigate to `/opt/mleng_class/datasets/names`
95 | - What files do you see in that directory?
96 | - Take a look at the first few male and female names
97 | - Count the number of lines in `male.txt` and `female.txt` files (hint, use `wc -l`)
98 |
99 | ### **4. Getting Help**
100 | - **Options**:
101 | - `--help` or `-h` with most commands.
102 | - `man <command>` for the manual page.
103 |
104 | ### **5. Searching and Filtering**
105 | - **Commands**:
106 | - `which <command>` – Find command location.
107 | - `find <path> -name <filename>` – Search for a file called `<filename>` "under" the path `<path>`.
108 | - `grep <pattern> <file>` – Look for the text (or pattern) `<pattern>` in a file called `<file>`.
109 | - Use `grep -i` to ignore case.
110 | - Combine with regex for advanced filtering.
111 | - Combine commands with pipes (`|`): <== A SUPER POWER!
112 | ```bash
113 | cat file.txt | grep "pattern" | head
114 | ```
115 |
116 | ### **Exercise**
117 | - Navigate to `/opt/mleng_class/datasets/names`
118 | - In the file `male.txt`, find all names which contain "Quin"
119 | - Make sure the list includes 'Joaquin' and "Quincy" (among others)
120 |
121 | ---
122 |
123 | ## **Week 2: Moving and Editing Files**
124 | ### **1. File Operations**
125 | - **Commands**:
126 | - `mkdir` – Create directories.
127 | - `cp` – Copy files.
128 | - `mv` – Move or rename files.
129 | - `rm` – Remove files or directories (`rm -rf` for directories).
130 | - `wget`/`curl` – Download files.
131 | - `tar`/`gzip`/`unzip` – Compress and extract files.
132 |
133 | ### **2. Editing Files**
134 | - Use `nano` for simple edits:
135 | - Save: `Ctrl + O`
136 | - Exit: `Ctrl + X`
137 |
138 | ### **3. Environment and Configuration**
139 | - View or edit:
140 | - `.bashrc`, `.profile`
141 | - Add `export PS1="[\u@\h \W]\$ "` to modify the prompt.
142 | - **Environment Variables**:
143 | - View with `printenv`.
144 | - Common examples:
145 | - `$HOME`, `$PATH`
146 |
147 | ### **4. Diffing Files**
148 | - Compare files:
149 | ```bash
150 | diff file1 file2
151 | ```
152 |
153 | ### **5. `find` files**
154 | - Find files:
155 | ```bash
156 | find . | grep file_pattern
157 | ```
158 |
159 | ### **6. Automating with Loops**
160 | - Process multiple files:
161 | ```bash
162 | for file in *.txt; do echo $file; done
163 | ```
164 |
165 | ---
166 |
167 | ## **Week 3: Managing Processes**
168 | ### **1. System Info**
169 | - **Commands**:
170 | - `uname -a` – OS info.
171 | - `cat /etc/os-release` – Linux distribution.
172 | - `free -h` – Memory.
173 | - `df -h` – Disk space.
174 |
175 | ### **2. Monitoring Processes**
176 | - **Commands**:
177 | - `ps` – View processes.
178 | - `top` or `htop` – Interactive monitoring.
179 |
180 | ### **3. Managing Processes**
181 | - Start a background process:
182 | ```bash
183 | long_running_command &
184 | ```
185 | - Control processes:
186 | - `Ctrl+C` – Terminate.
187 | - `Ctrl+Z` – Suspend.
188 | - `bg` – Resume in the background.
189 | - `fg` – Bring back to the foreground.
190 | - `kill -9 <pid>` – Force kill.
191 |
192 | ### **4. Redirecting Output**
193 | - Save output to a file:
194 | ```bash
195 | command > output.txt
196 | ```
197 | - Append:
198 | ```bash
199 | command >> output.txt
200 | ```
201 |
202 | ---
203 |
204 | ## **Week 4: Advanced Concepts**
205 | ### **1. Advanced Navigation**
206 | - `cd -` – Return to the last directory.
207 | - Use `tree` for visualization.
208 |
209 | ### **2. Job Management**
210 | - `jobs` – View running background jobs.
211 | - Use `nohup` to ensure long-running jobs persist:
212 | ```bash
213 | nohup command &
214 | ```
215 |
216 | ### **3. Regular Expressions**
217 | - Basics:
218 | - `^` – Start of a line.
219 | - `$` – End of a line.
220 | - `.` – Any character.
221 | - `*` – Zero or more occurrences.
222 | - Combine with `grep`, `sed`, or `awk`.
223 |
224 | ---
225 |
226 | ## **Week 5: Capstone**
227 | ### **Scenario**:
228 | 1. SSH into the remote machine.
229 | 2. Use `find`, `grep`, and `nano` to edit configuration files.
230 | 3. Start a long-running job with output redirected to a log file.
231 | 4. Monitor resources with `top` or `htop`.
232 | 5. Schedule periodic tasks with `crontab`:
233 | ```bash
234 | crontab -e
235 | ```
236 | Example entry:
237 | ```bash
238 | 0 3 * * * /path/to/script.sh
239 | ```
240 |
241 | ---
242 | With help from ChatGPT (free version).
--------------------------------------------------------------------------------
/lectures/065_secret_lives_of_text_files/howareyou_english.txt:
--------------------------------------------------------------------------------
1 | How are you?
--------------------------------------------------------------------------------
/lectures/065_secret_lives_of_text_files/howareyou_english_multiple_lines.txt:
--------------------------------------------------------------------------------
1 | How are
2 | you?
--------------------------------------------------------------------------------
/lectures/065_secret_lives_of_text_files/howareyou_not_english.txt:
--------------------------------------------------------------------------------
1 | How âre you?
--------------------------------------------------------------------------------
/lectures/070_scikit_learn/110-scikit-learn-run_saved_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Loading scikit-learn's saved model from disk"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "The following code shows how to load a saved model from disk. In a real-world scenario, such a model would be loaded by Flask (or a more production-grade server)."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "#### Create Test data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "X_test = pd.DataFrame(np.array([[1, 15.0, 0, 1, 211.3375, False, False],\n",
41 | " [3, 7.0, 4, 1, 29.125, False, False],\n",
42 | " [2, 33.0, 0, 2, 26.0, False, False],\n",
43 | " [2, 14.0, 1, 0, 30.0708, False, False],\n",
44 | " [3, 21.0, 0, 0, 8.05, True, True]]), columns=['pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male', 'alone'])\n",
45 | "\n",
46 | "y_test = np.array([1, 0, 1, 1, 0])"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "X_test"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "X_test.iloc[0:1,:].values"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "y_test"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "#### Load model from disk"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from joblib import dump, load"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "trained_model = load('model.joblib')"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "trained_model.predict(X_test.iloc[0:1,:])"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "trained_model.score(X_test, y_test)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": []
125 | }
126 | ],
127 | "metadata": {
128 | "kernelspec": {
129 | "display_name": "Python 3",
130 | "language": "python",
131 | "name": "python3"
132 | },
133 | "language_info": {
134 | "codemirror_mode": {
135 | "name": "ipython",
136 | "version": 3
137 | },
138 | "file_extension": ".py",
139 | "mimetype": "text/x-python",
140 | "name": "python",
141 | "nbconvert_exporter": "python",
142 | "pygments_lexer": "ipython3",
143 | "version": "3.8.5"
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 4
148 | }
149 |
--------------------------------------------------------------------------------
/lectures/070_scikit_learn/model_server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request
2 | from joblib import load
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
# Load the trained scikit-learn model once at import time so every request
# reuses the same in-memory model instead of re-reading it from disk.
# NOTE(review): 'model.joblib' must already exist in the working directory
# (produced by the companion training notebook) — confirm deployment layout.
trained_model = load('model.joblib')


# Flask server setup
app = Flask(__name__)
13 |
@app.route('/')
def hello_world():
    """Root endpoint; a quick liveness check for the server."""
    greeting = 'Hello, World!'
    return greeting
17 |
@app.route('/model')
def serve_model():
    """Predict Titanic survival from query-string parameters.

    Expected query parameters (all numeric; booleans passed as 0/1):
    pclass, age, sibsp, parch, fare, adult_male, alone.

    Returns a JSON object echoing the inputs plus the model's prediction,
    or a 400 JSON error when any parameter is missing.
    """
    args = request.args

    print(args)

    feature_names = ['pclass', 'age', 'sibsp', 'parch', 'fare',
                     'adult_male', 'alone']

    # float(None) raises TypeError, so report missing parameters with a
    # 400 instead of crashing the request handler.
    missing = [name for name in feature_names if args.get(name) is None]
    if missing:
        return {'error': 'missing query parameters', 'missing': missing}, 400

    pclass, age, sibsp, parch, fare, adult_male, alone = \
        (float(args.get(name)) for name in feature_names)

    print(pclass, age, sibsp, parch, fare, adult_male, alone)

    input_array = np.array([[pclass, age, sibsp, parch, fare, adult_male, alone]])

    print(input_array)

    predicted = trained_model.predict(input_array)

    # BUG FIX: the original `return args, predicted` made Flask interpret the
    # numpy array as the HTTP status element of a (body, status) tuple, which
    # fails. Convert everything to plain JSON-serializable Python types.
    return {'inputs': dict(args), 'prediction': predicted.tolist()}
34 |
if __name__ == "__main__":
    # Start Flask's built-in development server (not for production use).
    # NOTE(review): the test URL below assumes port 5000 — confirm the
    # default for your Flask version.
    app.run()

# Test as:
# http://localhost:5000/model?pclass=1&age=15&sibsp=0&parch=1&fare=211.3375&adult_male=0&alone=0
# 1. , 15. , 0. , 1. , 211.3375, 0. ,0.
--------------------------------------------------------------------------------
/lectures/075_web_services/120-bank_churners_classifier_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Bank Churners Classifier Model
5 |
6 | # In[3]:
7 |
8 |
9 | import numpy as np
10 | import pandas as pd
11 | import matplotlib.pyplot as plt
12 |
13 |
14 | #pre-training
15 | from sklearn.model_selection import train_test_split
16 | from sklearn.preprocessing import OneHotEncoder
17 | from sklearn.compose import make_column_transformer
18 |
19 | #training
20 | from sklearn import ensemble
21 | from sklearn import pipeline
22 |
23 |
24 | #post training
25 | from sklearn.metrics import accuracy_score
26 | from joblib import dump
27 |
28 |
# #### Read data

# In[ ]:


# pandas can read a CSV straight out of a zip archive — no manual
# decompression needed.
data_df = pd.read_csv('../../datasets/credit-card-customers/BankChurners.zip')
data_df.shape


# In[ ]:


data_df.head()


# In[ ]:


data_df.columns


# In[ ]:


# Count missing values per column.
data_df.isna().sum()


# #### Remove columns which should not go into the model

# In[ ]:


# CLIENTNUM is an identifier, and the two Naive_Bayes_* columns are outputs
# of a previously-fitted model shipped with the dataset — none of them are
# legitimate features.
data_df.drop([
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'
], axis=1, inplace=True)


# #### Convert categorical columns

# In[ ]:


#https://medium.com/@sami.yousuf.azad/one-hot-encoding-with-pandas-dataframe-49a304e8507a
# One-hot encode the categorical columns; every other column passes
# through unchanged.
CATEGORICAL_COLS = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', ]
col_transformer = make_column_transformer(
    (OneHotEncoder(), CATEGORICAL_COLS),
    remainder='passthrough')

# NOTE(review): this fit_transform is exploratory only — the pipeline below
# refits the transformer on the training split. Also, OneHotEncoder may
# return a sparse matrix on some sklearn versions; confirm pd.DataFrame
# accepts it in your environment.
transformed = col_transformer.fit_transform(data_df)

transformed_df = pd.DataFrame(transformed, columns=col_transformer.get_feature_names_out())


# In[ ]:


transformed_df.head()


# In[ ]:


transformed_df.columns
94 |
95 |
# #### Build model

# In[ ]:


# Hold out a test split; random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(['Attrition_Flag'], axis=1)
    , data_df.Attrition_Flag
    , random_state=1)


# In[ ]:


# Chain the one-hot encoder and the classifier so that predict() on raw,
# un-encoded rows applies both steps. pipeline.fit() re-fits the
# transformer on X_train only.
pipe = pipeline.make_pipeline(
    col_transformer
    ,ensemble.RandomForestClassifier(n_estimators=100, min_samples_split=2) # <== Classifier
)


# In[ ]:


#%%time
pipe.fit(X_train, y_train)

y_predict = pipe.predict(X_test)
pipe.score(X_test, y_test)  # mean accuracy on the held-out split


# In[ ]:


pipe


# In[ ]:


#pd.DataFrame({'feature':X_train.columns, 'importance':pipe.feature_importances_}).sort_values(by='importance')


# #### Save model

# In[ ]:


#%%time
# Persist the whole pipeline (encoder + classifier) so the serving side can
# feed it raw, un-encoded rows.
dump(pipe, 'bank_churners_classifier_model.joblib')


# In[ ]:


#%ls


# #### Read model

# In[ ]:


from joblib import load


# In[ ]:


# Round-trip check: reload the model we just wrote to disk.
trained_model = load('bank_churners_classifier_model.joblib')
165 |
166 |
# In[ ]:


# Column names the fitted pipeline expects, in order.
trained_model.feature_names_in_


# The following columns are categorical

# In[ ]:


CATEGORICAL_COLS


# In[ ]:


# Show the valid category values — useful when hand-building test rows.
for col in CATEGORICAL_COLS:
    print(col, data_df[col].unique())


# In[ ]:


# Build a single-row DataFrame (Series -> frame -> transpose) shaped like
# the training data, with plausible hand-picked values.
test_data_df = pd.Series({
    'Customer_Age' : 30,
    'Gender' : 'M',
    'Dependent_count': 3,
    'Education_Level': 'Graduate',
    'Marital_Status' : 'Single',
    'Income_Category': '$40K - $60K',
    'Card_Category' : 'Blue',
    'Months_on_book' : 5,
    'Total_Relationship_Count' : 3,
    'Months_Inactive_12_mon' : 1,
    'Contacts_Count_12_mon' : 2,
    'Credit_Limit' : 34000,
    'Total_Revolving_Bal' : 40000,
    'Avg_Open_To_Buy' : 200,
    'Total_Amt_Chng_Q4_Q1' : 34,
    'Total_Trans_Amt' : 500,
    'Total_Trans_Ct' : 3,
    'Total_Ct_Chng_Q4_Q1' : 23,
    'Avg_Utilization_Ratio' : .1
}).to_frame().T


# In[ ]:


test_data_df


# In[ ]:


test_data_df.columns


# In[ ]:


trained_model.predict(test_data_df)


# In[ ]:


# Class labels in the order used by predict_proba's columns.
trained_model.classes_


# In[ ]:


trained_model.predict_proba(test_data_df)


# ### Convert this notebook to .py
# Some students were having trouble reading the model, so they can run a .py
# file in their own environment and generate the model file using the same
# env as their web services code.

# In[ ]:


# Only works when executed inside IPython/Jupyter, where get_ipython() is
# defined.
get_ipython().system('jupyter nbconvert --to python 120-bank_churners_classifier_model.ipynb')
251 |
252 |
253 | # In[ ]:
254 |
255 |
256 |
257 |
258 |
--------------------------------------------------------------------------------
/lectures/075_web_services/130-load_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3372311b-1834-4898-af06-ec7721242ba8",
6 | "metadata": {},
7 | "source": [
8 | "# Load test\n",
9 | "Simulate a large number of clients accessing your web site or service"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "515ab2bb-9962-46ee-9172-bfe05efaed3c",
15 | "metadata": {},
16 | "source": [
17 | "### A sample web service\n",
18 | "Ray serve is a component which makes it easy to spread the serving of an API across several machines. Let's jump into code."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "4d8d94d5-d873-49ac-98d3-a85abb296315",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "%%writefile simple_api.py\n",
29 | "\n",
30 | "from fastapi import FastAPI\n",
31 | "from typing import Dict\n",
32 | "\n",
33 | "app = FastAPI()\n",
34 | "\n",
35 | "@app.get(\"/status\")\n",
36 | "def status() -> Dict[str, str]:\n",
37 | " \"\"\"Simple health check endpoint.\"\"\"\n",
38 | " return {\"status\": \"ok\"}\n",
39 | "\n",
40 | "\n",
41 | "@app.get(\"/compute\")\n",
42 | "def fibonacci(n: int):\n",
43 | " \"\"\"Compute Fibonacci sequence up to n (inclusive).\"\"\"\n",
44 | " if n <= 0:\n",
45 | " return []\n",
46 | " fib = [0, 1]\n",
47 | " while fib[-1] + fib[-2] <= n:\n",
48 | " fib.append(fib[-1] + fib[-2])\n",
49 | " return fib\n",
50 | "\n",
51 | "# fastapi run simple_api.py\n",
52 | "# http://localhost:8000/compute?n=10"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "id": "573491b0-7790-49b4-8553-1635f4ae341e",
58 | "metadata": {},
59 | "source": [
60 | "Normally you run the code above as:\n",
61 | "\n",
62 | "```python\n",
63 | "fastapi run simple_api.py\n",
64 | "```\n",
65 | "\n",
66 | "This will run the API on a single machine. \n",
67 | "\n",
68 | "However, as your startup grows, how do you make sure you can continue to serve clients?"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "id": "6254f39b-1ad4-4782-847c-1195a685e91f",
74 | "metadata": {},
75 | "source": [
76 | "### Is your current setup going to scale when you go viral? \n",
77 | "Test it with https://locust.io/"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "9690dbd7-f1b4-4cc0-be49-1c6cefc71e81",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "!pip install locust"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "bb2f6289-db40-49c7-9fac-31361617c431",
93 | "metadata": {},
94 | "source": [
95 | "Create a virtual users who will hit your API"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "7a72fdd9-bc85-4507-970a-7a4f29bb1568",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "%%writefile locustfile.py\n",
106 | "\n",
107 | "from locust import HttpUser, TaskSet, task, between\n",
108 | "\n",
109 | "class APIUser(HttpUser):\n",
110 | " wait_time = between(1, 3)\n",
111 | " host = \"http://127.0.0.1:8000\"\n",
112 | "\n",
113 | " @task\n",
114 | " class UserTasks(TaskSet):\n",
115 | " @task\n",
116 | " def get_status(self):\n",
117 | " self.client.get(\"/status/\") \n",
118 | "\n",
119 | " @task\n",
120 | " def do_compute(self):\n",
121 | " self.client.get(\"/compute?n=100\") "
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "id": "7aff6e5f-a280-43ab-a6bc-04d537b8ff3d",
127 | "metadata": {},
128 | "source": [
129 | "Run it as `locust` at the command line.\n",
130 | "This will refer you to a web page, which will let you control the test."
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "fc6841c8-8e34-48a2-a299-1a907d639454",
136 | "metadata": {},
137 | "source": [
138 | "#### Things to note\n",
139 | "1. Any failures on the locust dashboard?\n",
140 | "2. Monitor the logs of your application\n",
141 | "3. What is the median execution time?\n",
142 | "4. **What is the tail execution time**?\n",
143 | "5. What is the RPS (requests per second)?"
144 | ]
145 | }
146 | ],
147 | "metadata": {
148 | "kernelspec": {
149 | "display_name": "Python [conda env:conda-mleng_env]",
150 | "language": "python",
151 | "name": "conda-env-conda-mleng_env-py"
152 | },
153 | "language_info": {
154 | "codemirror_mode": {
155 | "name": "ipython",
156 | "version": 3
157 | },
158 | "file_extension": ".py",
159 | "mimetype": "text/x-python",
160 | "name": "python",
161 | "nbconvert_exporter": "python",
162 | "pygments_lexer": "ipython3",
163 | "version": "3.11.11"
164 | }
165 | },
166 | "nbformat": 4,
167 | "nbformat_minor": 5
168 | }
169 |
--------------------------------------------------------------------------------
/lectures/075_web_services/The web, under the hood.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/075_web_services/The web, under the hood.pdf
--------------------------------------------------------------------------------
/lectures/075_web_services/The web, under the hood.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/075_web_services/The web, under the hood.pptx
--------------------------------------------------------------------------------
/lectures/075_web_services/consume_json.py:
--------------------------------------------------------------------------------
1 | #serve json
2 | from datetime import datetime
3 | import requests
4 |
5 | HOST = 'localhost'
6 | PORT = 5000
7 |
8 | # Request something which doesn't exist
9 | response = requests.get(url=f'http://{HOST}:{PORT}/')
10 | print(f"Result of doing a GET request from http://{HOST}:{PORT}/")
11 | print(response)
12 |
13 | print("-------------")
14 |
15 | # Request the time
16 | response = requests.get(url=f'http://{HOST}:{PORT}/get_time')
17 | print(f"Result of doing a GET request from http://{HOST}:{PORT}/get_time")
18 | print(response)
19 | print("Response raw content:" + dir(response))
20 |
21 | if __name__=='__main__':
22 | pass
--------------------------------------------------------------------------------
/lectures/075_web_services/consume_services.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "fff7f325-d2b9-445e-b9fd-caa241da0bf3",
6 | "metadata": {},
7 | "source": [
8 | "# Test services via code (instead of via the browser)"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "1c7ba53a-7dc5-4d8b-b1c0-7e0c1cb437f5",
15 | "metadata": {
16 | "tags": []
17 | },
18 | "outputs": [],
19 | "source": [
20 | "import requests"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "89691c81-b664-48ca-be50-16d57ec9ebe2",
27 | "metadata": {
28 | "tags": []
29 | },
30 | "outputs": [],
31 | "source": [
32 | "HOST = 'localhost'\n",
33 | "PORT = 8000"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "id": "e7d20138-aed8-4e2c-916c-e526791af90f",
39 | "metadata": {},
40 | "source": [
 41 | "### Request something which doesn't exist"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "81504c9e-50b3-4dbd-8522-da42f3fb70c0",
48 | "metadata": {
49 | "tags": []
50 | },
51 | "outputs": [],
52 | "source": [
53 | "%%time\n",
54 | "response = requests.get(url=f'http://{HOST}:{PORT}/')"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "6127c67e-ad1b-4296-8428-ff8385d5f206",
61 | "metadata": {
62 | "tags": []
63 | },
64 | "outputs": [],
65 | "source": [
66 | "response"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "id": "d46a726a-5f50-41ed-b4df-9785153122c8",
72 | "metadata": {},
73 | "source": [
74 | "### Request the 'get_time' service (recall that it is a GET service)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "4c48e246-317c-4150-b72c-e8a1e0ab917f",
81 | "metadata": {
82 | "tags": []
83 | },
84 | "outputs": [],
85 | "source": [
86 | "%%time\n",
87 | "response = requests.get(url=f'http://{HOST}:{PORT}/get_time')"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "id": "874cac25-011e-4fc6-a887-2e80d1023025",
94 | "metadata": {
95 | "tags": []
96 | },
97 | "outputs": [],
98 | "source": [
99 | "response"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "54429e71-b92d-4f6d-8466-afe7f1df7299",
106 | "metadata": {
107 | "tags": []
108 | },
109 | "outputs": [],
110 | "source": [
111 | "response.text"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "id": "7609100d-0b86-448d-9bf7-d6baa3644a16",
118 | "metadata": {
119 | "tags": []
120 | },
121 | "outputs": [],
122 | "source": [
123 | "j = response.json()\n",
124 | "j"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "id": "b10fb44d-f9fa-4290-8ef7-090849746437",
131 | "metadata": {
132 | "tags": []
133 | },
134 | "outputs": [],
135 | "source": [
136 | "type(j) # <= Proper Python dictionary is returned!"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "id": "916681a0-c917-4c07-9d49-61a90f7e1828",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "j['current_time']"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "id": "81338c44-ad31-473f-a9d2-b1c736f5b664",
152 | "metadata": {
153 | "tags": []
154 | },
155 | "source": [
156 | "### Request the 'get_churn_probability' service (recall that it is a POST service)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "id": "d3a249da-6d16-4599-9bbb-c8c1fac5aade",
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "import requests"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "id": "5e0b8617-a64f-4c0c-b518-20b668241688",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "HOST = 'localhost'\n",
177 | "PORT = 8000"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "id": "69909596-66e9-4152-b906-bc8a2c9ee0c8",
184 | "metadata": {
185 | "tags": []
186 | },
187 | "outputs": [],
188 | "source": [
189 | "good_client = {'gender':'male', 'age':23, 'uc_grad':False}\n",
190 | "great_client = {'gender':'male', 'age':23, 'uc_grad':True}"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "id": "58868043-f2e5-40e0-902c-39e8a9031f2c",
197 | "metadata": {
198 | "tags": []
199 | },
200 | "outputs": [],
201 | "source": [
202 | "%%time\n",
203 | "\n",
204 | "response = requests.post(url=f'http://{HOST}:{PORT}/get_churn_probability', json=good_client)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "d44f9cb6-2716-4c51-98cb-94018db012da",
211 | "metadata": {
212 | "tags": []
213 | },
214 | "outputs": [],
215 | "source": [
216 | "response"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "id": "40677b01-a94f-43d4-a97d-bd2384bd1258",
223 | "metadata": {
224 | "tags": []
225 | },
226 | "outputs": [],
227 | "source": [
228 | "response.json()"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "id": "37de25b7-6711-4035-a16b-4fb490573f67",
235 | "metadata": {
236 | "tags": []
237 | },
238 | "outputs": [],
239 | "source": [
240 | "requests.post(url=f'http://{HOST}:{PORT}/get_churn_probability', json=great_client).json()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "id": "41d76aa9-cc22-42cb-84fd-68aa4118a6d0",
247 | "metadata": {},
248 | "outputs": [],
249 | "source": []
250 | }
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python [conda env:conda-mleng_env]",
255 | "language": "python",
256 | "name": "conda-env-conda-mleng_env-py"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 3
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython3",
268 | "version": "3.11.11"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 5
273 | }
274 |
--------------------------------------------------------------------------------
/lectures/075_web_services/decorator.pyx:
--------------------------------------------------------------------------------
1 | def logger2(f):
2 |     """Decorator: wrap f so entry and exit are printed around every call."""
3 |     print("Just ran logger2")  # runs ONCE, at decoration time - not per call
4 |     def inner_func(*args, **kwargs):
5 |         # Replacement function: forwards all arguments and the return value
6 |         print(f"Starting execution of function {f.__name__}")
7 |         rslt = f(*args, **kwargs)
8 |         print(f"Finished execution of function {f.__name__}")
9 |         return rslt
10 |     return inner_func
11 |
12 | @logger2
13 | def say_bye(name):
14 |     # After decoration, the name say_bye is bound to inner_func
15 |     return f"Good bye {name}"
16 |
17 | #print("==============")
18 | #print(say_bye("Shahbaz"))
--------------------------------------------------------------------------------
/lectures/075_web_services/post_client_streamlit_app.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import streamlit as st
3 |
4 | st.markdown("# Churn probability test")
5 |
6 | st.markdown("#### Input parameters")
7 |
8 | uc_grad = st.toggle("UC Graduate?")
9 | age = st.slider("Age", 18, 100)
10 |
11 | response = requests.post(url=f"http://localhost:5000/get_churn_probability", json = {'gender':'male', 'age':age, 'uc_grad':uc_grad})
12 | st.write(response.json())
13 |
--------------------------------------------------------------------------------
/lectures/075_web_services/serve_json.py:
--------------------------------------------------------------------------------
1 | # serve json - minimal FastAPI service whose endpoints return dicts (serialized to JSON)
2 | from datetime import datetime
3 | from fastapi import FastAPI
4 |
5 | app = FastAPI()
6 |
7 | @app.get('/status')
8 | def status():
9 |     """Health-check endpoint; FastAPI turns the dict into a JSON response."""
10 |     d = {'status': 'OK'}
11 |     return d
12 |
13 | @app.get('/get_time')
14 | def get_time():
15 |     """Return the current server time as JSON, formatted HH:MM (24-hour)."""
16 |     d = {'current_time':datetime.now().strftime("%H:%M")}
17 |     return d
18 |
19 | # Run with: fastapi dev serve_json.py
--------------------------------------------------------------------------------
/lectures/075_web_services/serve_post_json.py:
--------------------------------------------------------------------------------
1 | #serve post json
2 | from datetime import datetime
3 | from fastapi import FastAPI
4 | from pydantic import BaseModel
5 |
6 |
7 | app = FastAPI()
8 |
9 | @app.get('/status')
10 | def status():
11 | d = {'status': 'OK'}
12 | return d
13 |
14 | @app.get('/get_time')
15 | def get_time():
16 | d = {'current_time':datetime.now().strftime("%H:%M")}
17 | return d
18 |
19 | class ClientData(BaseModel):
20 | gender: str
21 | age: int
22 | uc_grad: bool
23 |
24 | @app.post('/get_churn_probability')
25 | def get_churn_probability(client_props: ClientData):
26 |
27 | #if 'UC_GRAD' not in client_properties:
28 | # pass throw error
29 |
30 | # Our churn model is fake, we don't actually use an ML model :(
31 | if client_props.uc_grad == "true":
32 | return {'churn_prob':0.34}
33 | else:
34 | return {'churn_prob':0.87}
35 |
36 | # fastapi dev serve_post_json.py
--------------------------------------------------------------------------------
/lectures/075_web_services/serve_text.py:
--------------------------------------------------------------------------------
1 | # serve text - minimal FastAPI service whose endpoints return plain strings (contrast with serve_json.py)
2 | from datetime import datetime
3 | from fastapi import FastAPI
4 |
5 | app = FastAPI()
6 |
7 | @app.get('/status')
8 | def status():
9 |     """Health-check endpoint; returns a bare string, not JSON."""
10 |     return "OK"
11 |
12 |
13 | @app.get('/get_time')
14 | def get_time():
15 |     """Return the current time as a human-readable sentence (HH:MM, 24-hour)."""
16 |     t = f'current time is {datetime.now().strftime("%H:%M")}'
17 |     return t
18 |
19 | # Run with: fastapi dev serve_text.py
--------------------------------------------------------------------------------
/lectures/075_web_services/streamlit_app.py:
--------------------------------------------------------------------------------
1 | # Minimal streamlit demo: echo a typed name, warn while the input is empty
2 | import streamlit as st
3 |
4 | st.markdown("# Hello")
5 |
6 | name = st.text_input("What is your name?")
7 | st.write(f"Hello {name}")
8 |
9 | # text_input returns "" until the user types something
10 | if len(name.strip()) == 0:
11 |     st.warning("You haven't entered your name yet :(")
--------------------------------------------------------------------------------
/lectures/080_env_pkg_management/Python environment and package management.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/080_env_pkg_management/Python environment and package management.pptx
--------------------------------------------------------------------------------
/lectures/090_python_tools/.coverage:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/.coverage
--------------------------------------------------------------------------------
/lectures/090_python_tools/020-python-bytecode.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "648ac726-3cb0-4744-a463-fb8dd32c87f4",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import dis"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "id": "92d3ec11-5bda-4686-8feb-6550132f02dc",
19 | "metadata": {
20 | "tags": []
21 | },
22 | "outputs": [],
23 | "source": [
24 | "def loop_f(x):\n",
25 | " for i in range(x):\n",
26 | " print(i)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "id": "32c3e112-2562-4f32-8135-5d0956316fd6",
33 | "metadata": {
34 | "tags": []
35 | },
36 | "outputs": [],
37 | "source": [
38 | "def listcomp_f(x):\n",
39 | " [print(i) for i in range(x)]"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "id": "4991ba06-181a-4ad1-8303-f685a33b4051",
46 | "metadata": {
47 | "tags": []
48 | },
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "0\n",
55 | "1\n",
56 | "2\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "loop_f(3)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 5,
67 | "id": "d7f695f5-f595-4958-a551-b760f933f866",
68 | "metadata": {
69 | "tags": []
70 | },
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "0\n",
77 | "1\n",
78 | "2\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "listcomp_f(3)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 6,
89 | "id": "7aa3fee5-f68f-4215-9bc9-27ff8aee72d2",
90 | "metadata": {
91 | "tags": []
92 | },
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | " 1 0 RESUME 0\n",
99 | "\n",
100 | " 2 2 LOAD_GLOBAL 1 (NULL + range)\n",
101 | " 14 LOAD_FAST 0 (x)\n",
102 | " 16 PRECALL 1\n",
103 | " 20 CALL 1\n",
104 | " 30 GET_ITER\n",
105 | " >> 32 FOR_ITER 17 (to 68)\n",
106 | " 34 STORE_FAST 1 (i)\n",
107 | "\n",
108 | " 3 36 LOAD_GLOBAL 3 (NULL + print)\n",
109 | " 48 LOAD_FAST 1 (i)\n",
110 | " 50 PRECALL 1\n",
111 | " 54 CALL 1\n",
112 | " 64 POP_TOP\n",
113 | " 66 JUMP_BACKWARD 18 (to 32)\n",
114 | "\n",
115 | " 2 >> 68 LOAD_CONST 0 (None)\n",
116 | " 70 RETURN_VALUE\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "dis.dis(loop_f)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 8,
127 | "id": "935916ff-0f64-4f6e-85c9-76448ad45229",
128 | "metadata": {
129 | "tags": []
130 | },
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | " 1 0 RESUME 0\n",
137 | "\n",
138 | " 2 2 LOAD_CONST 1 ( at 0x000001B9993E6880, file \"C:\\Users\\shahb\\AppData\\Local\\Temp\\ipykernel_4252\\1272453071.py\", line 2>)\n",
139 | " 4 MAKE_FUNCTION 0\n",
140 | " 6 LOAD_GLOBAL 1 (NULL + range)\n",
141 | " 18 LOAD_FAST 0 (x)\n",
142 | " 20 PRECALL 1\n",
143 | " 24 CALL 1\n",
144 | " 34 GET_ITER\n",
145 | " 36 PRECALL 0\n",
146 | " 40 CALL 0\n",
147 | " 50 POP_TOP\n",
148 | " 52 LOAD_CONST 0 (None)\n",
149 | " 54 RETURN_VALUE\n",
150 | "\n",
151 | "Disassembly of at 0x000001B9993E6880, file \"C:\\Users\\shahb\\AppData\\Local\\Temp\\ipykernel_4252\\1272453071.py\", line 2>:\n",
152 | " 2 0 RESUME 0\n",
153 | " 2 BUILD_LIST 0\n",
154 | " 4 LOAD_FAST 0 (.0)\n",
155 | " >> 6 FOR_ITER 17 (to 42)\n",
156 | " 8 STORE_FAST 1 (i)\n",
157 | " 10 LOAD_GLOBAL 1 (NULL + print)\n",
158 | " 22 LOAD_FAST 1 (i)\n",
159 | " 24 PRECALL 1\n",
160 | " 28 CALL 1\n",
161 | " 38 LIST_APPEND 2\n",
162 | " 40 JUMP_BACKWARD 18 (to 6)\n",
163 | " >> 42 RETURN_VALUE\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "dis.dis(listcomp_f)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "9e5524e9-31ee-4c37-85ad-3aced5b7ed5f",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": []
178 | }
179 | ],
180 | "metadata": {
181 | "kernelspec": {
182 | "display_name": "Python 3 (ipykernel)",
183 | "language": "python",
184 | "name": "python3"
185 | },
186 | "language_info": {
187 | "codemirror_mode": {
188 | "name": "ipython",
189 | "version": 3
190 | },
191 | "file_extension": ".py",
192 | "mimetype": "text/x-python",
193 | "name": "python",
194 | "nbconvert_exporter": "python",
195 | "pygments_lexer": "ipython3",
196 | "version": "3.11.5"
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 5
201 | }
202 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/__init__.py
--------------------------------------------------------------------------------
/lectures/090_python_tools/logging.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "disable_existing_loggers": false,
4 |
5 | "handlers": {
6 | "console": {
7 | "class": "logging.StreamHandler",
8 | "level": "DEBUG",
9 | "stream": "ext://sys.stdout"
10 | },
11 | "file": {
12 | "class": "logging.handlers.TimedRotatingFileHandler",
13 | "level": "DEBUG",
14 | "when": "D",
15 | "backupCount": 0,
16 | "filename": "./logs/training-stats.log"
17 | }
18 | },
19 | "loggers": {
20 | "root": {
21 | "level": "DEBUG",
22 | "handlers": ["console"]
23 | },
24 | "app": {
25 | "level": "DEBUG",
26 | "handlers": ["file"],
27 | "propagate": true,
28 | "qualname": "app"
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/logging_fancy.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "disable_existing_loggers": false,
4 | "formatters": {
5 | "simple": {
6 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
7 | },
8 | "verbose": {
9 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s"
10 | }
11 | },
12 | "handlers": {
13 | "console": {
14 | "class": "logging.StreamHandler",
15 | "level": "DEBUG",
16 | "formatter": "simple",
17 | "stream": "ext://sys.stdout"
18 | },
19 | "file": {
20 | "class": "logging.handlers.TimedRotatingFileHandler",
21 | "level": "DEBUG",
22 | "formatter": "verbose",
23 | "when": "D",
24 | "backupCount": 0,
25 | "filename": "./logs/training-stats.log"
26 | },
27 | "uvicorn": {
28 | "class": "logging.handlers.TimedRotatingFileHandler",
29 | "level": "DEBUG",
30 | "formatter": "verbose",
31 | "when": "D",
32 | "backupCount": 0,
33 | "filename": "./logs/uvicorn.log"
34 | }
35 | },
36 | "loggers": {
37 | "root": {
38 | "level": "DEBUG",
39 | "handlers": ["console"]
40 | },
41 | "app": {
42 | "level": "DEBUG",
43 | "handlers": ["file"],
44 | "propagate": true,
45 | "qualname": "app"
46 | },
47 | "uvicorn": {
48 | "level": "DEBUG",
49 | "handlers": ["uvicorn"],
50 | "propagate": true,
51 | "qualname": "uvicorn"
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/logs/training-stats.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/logs/training-stats.log
--------------------------------------------------------------------------------
/lectures/090_python_tools/logs/uvicorn.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/logs/uvicorn.log
--------------------------------------------------------------------------------
/lectures/090_python_tools/name_reverser.py:
--------------------------------------------------------------------------------
1 | # This library parses names and presents them in a professional, reverse name order
2 |
3 | def name_reverse_order(full_name):
4 | if full_name == "": # Handle the case where an empty string is passed
5 | return ""
6 | else:
7 | first, last = full_name.split(' ')
8 | return f'{last}, {first}'
9 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/python_logging_01.py:
--------------------------------------------------------------------------------
1 |
2 | import logging
3 |
4 | logging.basicConfig(level=logging.INFO)
5 |
6 |
7 | def maximum(numbers):
8 | #pass # pass means "do nothing", add your code here
9 | max_value = 0
10 | for num in numbers:
11 | logging.debug(f"num:{num}, max_value:{max_value}")
12 | if num > max_value:
13 | logging.debug("max > max_value branch taken. Setting new max_value")
14 | max_value = num
15 | return max_value
16 |
17 | def main():
18 | logging.info("Let us find the maximum value from the following list:")
19 | list_of_nums = [1,2,3]
20 | max_value = maximum(list_of_nums)
21 | logging.info(max_value)
22 |
23 | logging.info("Let us find the maximum value from another list:")
24 | list_of_nums = [-1, -2, -3]
25 | max_value = maximum(list_of_nums)
26 | logging.info(max_value)
27 |
28 | if __name__ == "__main__":
29 | logging.warning(f"This program is being run from the command line")
30 | main()
31 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/python_logging_02.py:
--------------------------------------------------------------------------------
1 |
2 | import logging.config
3 | import json
4 |
5 | with open("logging.json", "r") as f:
6 | json_config = json.load(f)
7 | logging.config.dictConfig(json_config)
8 |
9 | def maximum(numbers):
10 | #pass # pass means "do nothing", add your code here
11 | max_value = 0
12 | for num in numbers:
13 | logging.debug(f"num:{num}, max_value:{max_value}")
14 | if num > max_value:
15 | logging.debug("max > max_value branch taken. Setting new max_value")
16 | max_value = num
17 | return max_value
18 |
19 | def main():
20 | logging.info("Let us find the maximum value from the following list:")
21 | list_of_nums = [1,2,3]
22 | max_value = maximum(list_of_nums)
23 | logging.info(max_value)
24 |
25 | logging.info("Let us find the maximum value from another list:")
26 | list_of_nums = [-1, -2, -3]
27 | max_value = maximum(list_of_nums)
28 | logging.info(max_value)
29 |
30 | if __name__ == "__main__":
31 | logging.warning(f"This program is being run from the command line")
32 | main()
33 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/python_logging_03.py:
--------------------------------------------------------------------------------
1 |
2 | import logging.config
3 | import json
4 | import os
5 |
6 | with open("logging_fancy.json", "r") as f:
7 | json_config = json.load(f)
8 | logging.config.dictConfig(json_config)
9 |
10 | fname = os.path.basename(__file__)
11 | log = logging.getLogger(fname) # <= This lines makes the logger name more useful
12 |
13 | def maximum(numbers):
14 | #pass # pass means "do nothing", add your code here
15 | max_value = 0
16 | for num in numbers:
17 | log.debug(f"num:{num}, max_value:{max_value}")
18 | if num > max_value:
19 | log.debug("max > max_value branch taken. Setting new max_value")
20 | max_value = num
21 | return max_value
22 |
23 | def main():
24 | log.info("Let us find the maximum value from the following list:")
25 | list_of_nums = [1,2,3]
26 | max_value = maximum(list_of_nums)
27 | log.info(max_value)
28 |
29 | log.info("Let us find the maximum value from another list:")
30 | list_of_nums = [-1, -2, -3]
31 | max_value = maximum(list_of_nums)
32 | log.info(max_value)
33 |
34 | if __name__ == "__main__":
35 | log.warning(f"This program is being run from the command line")
36 | main()
37 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/090_python_tools/tests/__init__.py
--------------------------------------------------------------------------------
/lectures/090_python_tools/tests/test_name_reverser.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from name_reverser import name_reverse_order
3 |
4 | def test_name_reverse_order_normal():
5 | rslt = name_reverse_order("Michael Jordan")
6 | assert rslt == "Jordan, Michael"
7 |
8 | rslt = name_reverse_order("Lebron James")
9 | assert rslt == "James, Lebron"
10 |
11 |
--------------------------------------------------------------------------------
/lectures/090_python_tools/tests/test_name_reverser_part_deux.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from name_reverser import name_reverse_order
3 |
4 | def test_name_reverse_order_normal():
5 | rslt = name_reverse_order("Michael Jordan")
6 | assert rslt == "Jordan, Michael"
7 |
8 | rslt = name_reverse_order("Lebron James")
9 | assert rslt == "James, Lebron"
10 |
11 | def test_name_reverse_order_bad_inputs():
12 |
13 | # Empty string
14 | with pytest.raises(ValueError):
15 | rslt = name_reverse_order("")
16 |
17 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/messy.py:
--------------------------------------------------------------------------------
1 | # messy.py
2 | import pandas as pd,numpy as np
3 | from typing import List,Dict,Any
4 | import matplotlib.pyplot as plt
5 |
6 | class dataProcessor:
7 | def __init__(self,input_file:str, output_file:str='processed.csv'):
8 | self.input=input_file
9 | self.output_file=output_file
10 | self.data=None
11 |
12 | def Load_data(self):
13 | """loads data from csv file"""
14 | self.data=pd.read_csv(self.input)
15 | return self.data
16 |
17 | def process(self,columns_to_process:List[str]=[],aggfunc:str='mean')->pd.DataFrame:
18 | if len(columns_to_process)==0: return self.data
19 | processed_data={}
20 | for col in columns_to_process:
21 | if col in self.data.columns:
22 | processed_data[col]=getattr(self.data[col],aggfunc)()
23 | else:
24 | print(f"Warning: Column {col} not found")
25 | return pd.DataFrame(processed_data,index=[0])
26 |
27 | def visualize_data(self, column:str, PlotType:str='bar' )->None:
28 | if self.data is None:raise ValueError('No data loaded')
29 | plt.figure(figsize=(10, 5))
30 | if PlotType=='bar':
31 | self.data[column].value_counts().plot(kind='bar')
32 | elif PlotType=='hist':
33 | self.data[column].hist()
34 | plt.title(f'Visualization of {column}')
35 | plt.show()
36 |
37 | def main():
38 | processor=dataProcessor('data.csv')
39 | df = processor.Load_data()
40 | processed=processor.process(['age','salary'],aggfunc='mean')
41 | processor.visualize_data('age','hist')
42 |
43 | if __name__=='__main__':
44 | main()
45 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program1.py:
--------------------------------------------------------------------------------
1 |
2 | x = 10
3 | print(f"The number is {x}")
4 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program10.py:
--------------------------------------------------------------------------------
1 | def main():
2 | print("Hello world")
3 |
4 | main()
5 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program11.py:
--------------------------------------------------------------------------------
1 |
2 | import program10
3 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program12.py:
--------------------------------------------------------------------------------
1 |
2 | print(f"__name__ in program12 is set to {__name__}")
3 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program13.py:
--------------------------------------------------------------------------------
1 | import program12
2 |
3 | print(f"__name__ in program13 is set to {__name__}")
4 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program14.py:
--------------------------------------------------------------------------------
1 | def main():
2 | print("Hello world")
3 |
4 | if __name__ == "__main__":
5 | main()
6 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program14b.py:
--------------------------------------------------------------------------------
1 | import program14
2 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program15.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | def main():
4 | print("Hello world")
5 |
6 | if __name__ == "__main__":
7 | main()
8 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program2.py:
--------------------------------------------------------------------------------
1 |
2 | x = 10
3 | y = x + x
4 |
5 | name = "Shahbaz"
6 | name
7 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program3.py:
--------------------------------------------------------------------------------
1 |
2 | BATCH_DATE = input("Please provide a batch date (mm/dd/yyyy): ")
3 | print(f"This program is run on {BATCH_DATE}")
4 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program4.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 |
4 | BATCH_DATE = sys.argv[1] #<= Here is the magic
5 | print(f"This program is run on {BATCH_DATE}")
6 | #print(f"What is at location 0? {sys.argv[0]}")
7 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program5.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 |
4 | if len(sys.argv) < 2:
5 | print("Error: Missing BATCH_DATE. Please run as 'python program5.py `")
6 | sys.exit(1)
7 |
8 | BATCH_DATE = sys.argv[1] #<= Here is the magic
9 | print(f"This program is run on {BATCH_DATE}")
10 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program6.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 |
4 | # Create the argument parser
5 | parser = argparse.ArgumentParser(description="Scores sales transactions to predict returns.")
6 |
7 | # Add an argument
8 | parser.add_argument("BATCH_DATE", type=str, help="The date of sales transactions")
9 |
10 | # Parse the arguments
11 | args = parser.parse_args()
12 |
13 | # Print the argument
14 | print(f"This program is run on {args.BATCH_DATE}")
15 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program7.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import sys
4 |
5 | if len(sys.argv) < 2:
6 | print("Error: Missing enviornment variable name. Please run as 'python program5.py `")
7 | sys.exit(1)
8 |
9 | ENV = sys.argv[1]
10 | ENV_VAL = os.environ.get(ENV, "")
11 |
12 | print(f"Environment variable {ENV} has value {ENV_VAL}")
13 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program8.py:
--------------------------------------------------------------------------------
1 |
2 | print("Hello world")
3 |
--------------------------------------------------------------------------------
/lectures/110_python_py_files/program9.py:
--------------------------------------------------------------------------------
1 | def main():
2 | print("Hello world")
3 |
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/100_minimal_27/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use a small base image .... dangerously old!
2 | FROM python:2.7-slim
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Copy the application code
8 | COPY app.py .
9 |
10 | # Run the application
11 | CMD ["python", "app.py"]
12 |
13 | # docker build -t hello_app .
14 | # docker run hello_app
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/100_minimal_27/app.py:
--------------------------------------------------------------------------------
1 | print "Hi from many years ago :)"
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/110_minimal_server/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the official Python image as a base
2 | FROM python:3.11-slim
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Copy the requirements file and install dependencies
8 | COPY requirements.txt .
9 | RUN pip install --no-cache-dir -r requirements.txt
10 |
11 | # Copy the application code
12 | COPY . .
13 |
14 | # Expose the FastAPI default port
15 | EXPOSE 8000
16 |
17 | # Command to run the application
18 | CMD ["fastapi", "run", "app.py", "--host", "0.0.0.0", "--port", "8000"]
19 |
20 |
21 | # docker build -t minimal_server .
22 | # docker run -d -p 8000:8000 minimal_server
23 | # curl http://localhost:8000
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/110_minimal_server/app.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI
2 |
3 | app = FastAPI()
4 |
5 | @app.get("/")
6 | def read_root():
7 | return {"message": "Hello, FastAPI in Docker!"}
8 |
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/110_minimal_server/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi[standard]
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/Docker – an introduction.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/120_dockerize_python_app/Docker – an introduction.pptx
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/example1/Dockerfile:
--------------------------------------------------------------------------------
1 | # dockerfile, Image, Container
2 | FROM python:3.9
3 |
4 | ADD main.py .
5 |
6 | RUN pip install requests beautifulsoup4
7 |
8 | CMD ["python", "./main.py"]
--------------------------------------------------------------------------------
/lectures/120_dockerize_python_app/example1/main.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 |
4 | print("Hello app")
--------------------------------------------------------------------------------
/lectures/130-distributed_python/120-ray-serve.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3372311b-1834-4898-af06-ec7721242ba8",
6 | "metadata": {},
7 | "source": [
8 | "# Ray Serve - scale deployed models"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "3d7c1968-e07d-4d64-9f4f-bf49f5cf1512",
14 | "metadata": {},
15 | "source": [
16 | "### Ray serve\n",
17 | "Ray serve is a component which makes it easy to spread the serving of an API across several machines. Let's jump into code."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "id": "da5c4292-7419-4184-bccf-4594761abfe8",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "%%writefile simple_api.py\n",
28 | "\n",
29 | "from fastapi import FastAPI\n",
30 | "from typing import Dict\n",
31 | "\n",
32 | "app = FastAPI()\n",
33 | "\n",
34 | "@app.get(\"/status\")\n",
35 | "def status() -> Dict[str, str]:\n",
36 | " \"\"\"Simple health check endpoint.\"\"\"\n",
37 | " return {\"status\": \"ok\"}\n",
38 | "\n",
39 | "\n",
40 | "@app.get(\"/compute\")\n",
41 | "def fibonacci(n: int):\n",
42 | " \"\"\"Compute Fibonacci sequence up to n (inclusive).\"\"\"\n",
43 | " if n <= 0:\n",
44 | " return []\n",
45 | " fib = [0, 1]\n",
46 | " while fib[-1] + fib[-2] <= n:\n",
47 | " fib.append(fib[-1] + fib[-2])\n",
48 | " return fib\n",
49 | "\n",
50 | "# fastapi run simple_api.py\n",
51 | "# http://localhost:8000/compute?n=10"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "id": "490f1988-10a3-4c6b-ba99-641a9e61db49",
57 | "metadata": {},
58 | "source": [
59 | "Normally you run the code above as:\n",
60 | "\n",
61 | "```python\n",
62 | "fastapi run simple_api.py\n",
63 | "```\n",
64 | "\n",
65 | "This will run the API on a single machine. \n",
66 | "\n",
67 | "However, as your startup grows, how do you make sure you can continue to serve clients?"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "id": "26632f27-3e6a-4030-8adf-bc8fd87e7256",
73 | "metadata": {},
74 | "source": [
75 | "### Let's try to scale this across several machines\n",
76 | "\n",
77 | "If we are not on the same network, use Tailscale to hop on the same VPN."
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "1c538cc3-efaa-44f2-bd32-b7771d28e7f8",
83 | "metadata": {},
84 | "source": [
85 | "#### Install Ray"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "fed64382-d746-4056-a27f-a6099d9aba1b",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "!pip install ray[all]"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "id": "21a104de-f563-4055-b377-29b2b8cc144d",
101 | "metadata": {},
102 | "source": [
103 | "#### Deploy FastAPI on a cluster (via Ray)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "f9e6a4d1-bba6-4f4d-bfba-ffae5b2d9716",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "%%writefile simple_api_ray.py\n",
114 | "\n",
115 | "from fastapi import FastAPI\n",
116 | "from typing import Dict\n",
117 | "from ray import serve\n",
118 | "import ray\n",
119 | "\n",
120 | "#ray.init(address=\"192.168.12.239:10001\") \n",
121 | "#ray.init(ignore_reinit_error=True)\n",
122 | "\n",
123 | "app = FastAPI()\n",
124 | "\n",
125 | "@app.get(\"/status\")\n",
126 | "def status() -> Dict[str, str]:\n",
127 | " \"\"\"Simple health check endpoint.\"\"\"\n",
128 | " return {\"status\": \"ok\"}\n",
129 | "\n",
130 | "\n",
131 | "@app.get(\"/compute\")\n",
132 | "def fibonacci(n: int):\n",
133 | " \"\"\"Compute Fibonacci sequence up to n (inclusive).\"\"\"\n",
134 | " if n <= 0:\n",
135 | " return []\n",
136 | " fib = [0, 1]\n",
137 | " while fib[-1] + fib[-2] <= n:\n",
138 | " fib.append(fib[-1] + fib[-2])\n",
139 | " return fib\n",
140 | "\n",
141 | "@serve.deployment\n",
142 | "@serve.ingress(app)\n",
143 | "class FastAPIWrapper:\n",
144 | " pass\n",
145 | "\n",
146 | "serve.run(FastAPIWrapper.bind(), route_prefix=\"/\")\n",
147 | "\n",
148 | "# python simple_api_ray.py\n",
149 | "# http://localhost:8000/compute?n=10"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "6a193854-70d5-4ca0-8c88-0b4684675a4b",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "%%writefile simple_api_ray2.py\n",
160 | "\n",
161 | "import requests\n",
162 | "from fastapi import FastAPI\n",
163 | "from ray import serve\n",
164 | "\n",
165 | "# 1: Define a FastAPI app and wrap it in a deployment with a route handler.\n",
166 | "app = FastAPI()\n",
167 | "\n",
168 | "\n",
169 | "@serve.deployment\n",
170 | "@serve.ingress(app)\n",
171 | "class FastAPIDeployment:\n",
172 | " # FastAPI will automatically parse the HTTP request for us.\n",
173 | " @app.get(\"/hello\")\n",
174 | " def say_hello(self, name: str) -> str:\n",
175 | " return f\"Hello {name}!\"\n",
176 | "\n",
177 | "\n",
178 | "# 2: Deploy the deployment.\n",
179 | "serve.run(FastAPIDeployment.bind(), route_prefix=\"/\", )\n",
180 | "\n",
181 | "# 3: Query the deployment and print the result.\n",
182 | "# print(requests.get(\"http://localhost:8000/hello\", params={\"name\": \"Theodore\"}).json())\n",
183 | "# \"Hello Theodore!\""
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "id": "5555ce34-cc1e-449c-9de3-77e1ea441f61",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": []
193 | }
194 | ],
195 | "metadata": {
196 | "kernelspec": {
197 | "display_name": "Python [conda env:conda-mleng_env]",
198 | "language": "python",
199 | "name": "conda-env-conda-mleng_env-py"
200 | },
201 | "language_info": {
202 | "codemirror_mode": {
203 | "name": "ipython",
204 | "version": 3
205 | },
206 | "file_extension": ".py",
207 | "mimetype": "text/x-python",
208 | "name": "python",
209 | "nbconvert_exporter": "python",
210 | "pygments_lexer": "ipython3",
211 | "version": "3.11.11"
212 | }
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 5
216 | }
217 |
--------------------------------------------------------------------------------
/lectures/130-distributed_python/dask-image/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/miniconda3
2 | RUN apt update && apt install -y iputils-ping iproute2
3 | RUN pip install "dask[complete]"
--------------------------------------------------------------------------------
/lectures/130-distributed_python/ray-image/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/miniconda3
2 | RUN apt update && apt install -y iputils-ping iproute2
3 | RUN pip install "ray[all]"
--------------------------------------------------------------------------------
/lectures/130-distributed_python/simple_api_ray.py:
--------------------------------------------------------------------------------
1 |
2 | from fastapi import FastAPI
3 | from typing import Dict
4 | from ray import serve
5 | import ray
6 |
7 | #ray.init(address="192.168.12.239:10001")
8 | #ray.init(ignore_reinit_error=True)
9 |
10 | app = FastAPI()
11 |
12 | @app.get("/status")
13 | def status() -> Dict[str, str]:
14 | """Simple health check endpoint."""
15 | return {"status": "ok"}
16 |
17 |
18 | @app.get("/compute")
19 | def fibonacci(n: int):
20 | """Compute Fibonacci sequence up to n (inclusive)."""
21 | if n <= 0:
22 | return []
23 | fib = [0, 1]
24 | while fib[-1] + fib[-2] <= n:
25 | fib.append(fib[-1] + fib[-2])
26 | return fib
27 |
28 | @serve.deployment
29 | @serve.ingress(app)
30 | class FastAPIWrapper:
31 | pass
32 |
33 | serve.run(FastAPIWrapper.bind(), route_prefix="/")
34 |
35 | # python simple_api_ray.py
36 | # http://localhost:8000/compute?n=10
37 |
--------------------------------------------------------------------------------
/lectures/130-distributed_python/simple_api_ray2.py:
--------------------------------------------------------------------------------
1 |
2 | import requests
3 | from fastapi import FastAPI
4 | import ray
5 | from ray import serve
6 |
7 | # 1: Define a FastAPI app and wrap it in a deployment with a route handler.
8 | app = FastAPI()
9 |
10 |
11 | @serve.deployment
12 | @serve.ingress(app)
13 | class FastAPIDeployment:
14 | # FastAPI will automatically parse the HTTP request for us.
15 | @app.get("/hello")
16 | def say_hello(self, name: str) -> str:
17 | return f"Hello {name}!"
18 |
19 |
20 | # 2: Deploy the deployment.
21 | ray.init(detached=True)
22 | serve.run(FastAPIDeployment.bind(), route_prefix="/", )
23 |
24 | # 3: Query the deployment and print the result.
25 | # print(requests.get("http://localhost:8000/hello", params={"name": "Theodore"}).json())
26 | # "Hello Theodore!"
27 |
--------------------------------------------------------------------------------
/lectures/misc/scratchspace.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "unnecessary-locator",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "d = {'a':{'x':1, 'y':2, 'z':3}, 'b':{'x':1, 'y':2, 'z':3}, 'c':{'x':1, 'y':2, 'z':3}}"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "id": "sharing-thermal",
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "a\n",
24 | "b\n",
25 | "c\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "for i in d: print(i)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 9,
36 | "id": "closing-hybrid",
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "a {'x': 1, 'y': 2, 'z': 3}\n",
44 | " x 1\n",
45 | " y 2\n",
46 | " z 3\n",
47 | " x 0.16666666666666666\n",
48 | " y 0.3333333333333333\n",
49 | " z 0.5\n",
50 | " 6\n",
51 | "b {'x': 1, 'y': 2, 'z': 3}\n",
52 | " x 1\n",
53 | " y 2\n",
54 | " z 3\n",
55 | " x 0.16666666666666666\n",
56 | " y 0.3333333333333333\n",
57 | " z 0.5\n",
58 | " 6\n",
59 | "c {'x': 1, 'y': 2, 'z': 3}\n",
60 | " x 1\n",
61 | " y 2\n",
62 | " z 3\n",
63 | " x 0.16666666666666666\n",
64 | " y 0.3333333333333333\n",
65 | " z 0.5\n",
66 | " 6\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "#Outer loop, go through the top dictionary\n",
72 | "for key, val in d.items():\n",
73 | " print(key, val)\n",
74 | " \n",
75 | " #For each inner dicitonary, go through it as well and normalize numbers\n",
76 | " #calculate total\n",
77 | " total = 0\n",
78 | " for key2, val2 in val.items():\n",
79 | " print(' ', key2, val2)\n",
80 | " total += val2\n",
81 | " \n",
82 | " #divide each value by total to convert it to a probability\n",
83 | " for key2, val2 in val.items():\n",
84 | " val[key2] = val2/total\n",
85 | " print(' ', key2, val[key2])\n",
86 | " \n",
87 | " print(' ', total)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 10,
93 | "id": "confident-radar",
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "data": {
98 | "text/plain": [
99 | "Counter({(0, 1): 1,\n",
100 | " (1, 2): 1,\n",
101 | " (2, 3): 1,\n",
102 | " (3, 4): 1,\n",
103 | " (4, 5): 1,\n",
104 | " (5, 6): 1,\n",
105 | " (6, 7): 1,\n",
106 | " (7, 8): 1,\n",
107 | " (8, 9): 1})"
108 | ]
109 | },
110 | "execution_count": 10,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "import collections\n",
117 | "r = range(10)\n",
118 | "collections.Counter(zip(r, r[1:]))"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "id": "dental-robertson",
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 3",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.8.5"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 5
151 | }
152 |
--------------------------------------------------------------------------------
/lectures/r_basics/images/rfordatascience.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rfordatascience.jpg
--------------------------------------------------------------------------------
/lectures/r_basics/images/rfordatascience.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rfordatascience.png
--------------------------------------------------------------------------------
/lectures/r_basics/images/rinaction.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/lectures/r_basics/images/rinaction.jpg
--------------------------------------------------------------------------------
/lectures/r_basics/rmarkdown_tutorial.rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "R Markdown Tutorial"
3 | author: "Shahbaz Chaudhary"
4 | date: "11/27/2021"
5 | output: html_document
6 | ---
7 |
8 | # Yet another reason why you should be familiar with _Markdown_
9 |
10 | While Jupyter is the standard tool for Python data scientists, R Studio and R Markdown are the most popular ways R data scientists develop and share their code.
11 |
12 | Markdown allows a simple way to format text. For example, you can have _italics_, **bold** and ~~crossed out~~ text.
13 |
14 | You can write lists:
15 |
16 | - item 1
17 | - item 2
18 | - item 3
19 |
20 | You can number your bullets
21 |
22 | 1. item 1
23 | 2. item 2
24 | 3. item 3
25 |
26 | Or create tables
27 |
28 | | column A | Column B | Column C |
29 | | --- | --- | --- |
30 | | value 1| value 2| value 3|
31 | | value 4| value 5| value 6|
32 | | value 7| value 8| value 9|
33 |
34 | ### But there is more
35 |
36 | You can display inline calculations, such as 2 + 2 = `r 2 + 2`!
37 |
38 | In fact, you can organize larger chunks of code:
39 |
40 | ```{r}
41 | seq(10)
42 | ```
43 |
44 | ```{r}
45 | plot(seq(10))
46 | ```
47 |
48 | You can control what gets displayed in your final document:
49 |
50 | ```{r echo=FALSE}
51 | plot(seq(10))
52 | ```
53 |
54 | ### Here is a surprise, you can even run Python in RMarkdown (but you need the 'reticulate' library)!
55 |
56 | Notice that the following line is not being executed, since there is no '{r}' text.
57 |
58 | ```
59 | install.packages('reticulate')
60 | ```
61 |
62 | ```{python}
63 | list(range(10))
64 | ```
65 |
66 | ### Better tables
67 | By default, R doesn't do a great job of rendering tables
68 |
69 | ```{r}
70 | mtcars
71 | ```
72 |
73 | The tidyverse set of packages promises to do better
74 |
75 | ```{r}
76 | library(tidyverse)
77 | tibble(mtcars)
78 | ```
79 |
80 | However, the `knitr` package does a much better job
81 |
82 | ```{r}
83 | library(knitr)
84 | kable(mtcars)
85 | ```
86 |
--------------------------------------------------------------------------------
/lectures/readme.md:
--------------------------------------------------------------------------------
1 | # Lectures
2 | This directory contains lectures notes, slides, notebooks, etc.
3 |
4 | ## Introductory Lectures
5 |
6 | ### intro_to_consoles
7 | Introduces the console, which may look intimidating and out of place on a modern computer.
8 |
9 | ### programming_vs_calculator
10 | Starts with a calculator, familiar and easy to understand. Then identifies various parts of it which most of us overlook. Then asks, how can it be expanded into a programming language?
11 |
12 | ### first_programs
13 | Starts to introduce Python. Shows several complete programs, each introducing new concepts and slightly more complexity
14 |
15 | ### intro_to_jupyter
16 | A very basic and brief introduction to Jupyter notebooks
17 |
18 | ### all_of_python_basics
19 | First introduction to much of Python and basic programming language constructs. This material should be enough for students to write basic programs, understand important terminology (to help in searching for answers on the web).
20 |
21 | ##### all_of_python_variable_assignment_and_tuples
22 | Introduces multiple assignment and tuples
23 |
24 | ##### all_of_python_basic_functions
25 | Deeper dive into function creation. Also learn about asserts and test first methodology. A short tutorial on using debuggers.
26 |
27 | ##### all_of_python_numbers
28 | Uses all basic operators, including the modulus and power operators. Shows how mod can be used in algorithms.
29 |
30 | ##### all_of_python_strings
31 | Shows how to use many popular functions and string formatters.
32 |
33 | ##### all_of_python_basic_plotting
34 | Very basic introduction to plotting. Needed for assignment on random numbers.
35 |
36 | ##### all_of_python_libraries_random
37 | Students will write this as a tutorial for others who know random numbers and the basics of Python, but not this library.
38 |
39 | ##### all_of_python_basic_dictionaries
40 | Deeper dive into dictionaries, including .get(,), defaultdict, etc.
41 |
42 | ##### all_of_python_basic_lists
43 | Much deeper dive into lists. Separate list functions from stack functions.
44 |
45 | ##### all_of_python_basic_list_comprehensions
46 | Introduces list comprehensions.
47 |
48 | ##### all_of_python_basic_classes
49 | First introduction to classes so object.method() notation makes sense. Introduce enough inheritance so that students see how functionality can be shared (such as measuring accuracy on any scikit-learn model). ~~Introduce how operators are encoded as functions so numpy/pandas don't seem magical.~~
50 |
51 | ##### all_of_python_loops
52 | Introduce the while loop and give extended examples of the for loop. Introduce breaking out of loops and skipping iterations.
53 |
54 | ##### all_of_python_conditionals_and_None
55 | Introduce None. Introduce the elif keyword, describe short-circuiting. Mention case statements, which don't exist in Python.
56 |
57 | ~~##### all_of_python_iterators~~
58 | Introduce iterators and itertools.
59 |
60 | ~~##### all_of_python_more_functions~~
61 | Introduce keyword arguments, optional arguments, notation to pass through arguments, docstrings. Introduce lambdas and map/reduce/filter functions. Introduce higher order functions.
62 |
63 | ~~##### all_of_python_libraries_filesystem~~
64 |
65 | ### Explore distribution characters in male and female names
66 | In-class exercise to download files containing male and female names, write code to draw histograms of characters in male and female names.
67 |
68 | ### Intro to Pandas
69 | Introduces the data science library Pandas. It shows how to read csv files, how to view a summary of data, how to explore data using charting libraries, etc.
70 |
71 | ### Intro to Numpy
72 | Introduces the data science library Numpy.
73 |
74 | ### Gradient Descent
75 | In-class, assisted, exercise to write the gradient descent algorithm.
76 |
77 | ## Lectures appropriate for second course in programming
78 |
79 | ### git_version_control
80 | Introduces Git and includes several exercises. This lecture was written for students who have already taken an introductory programming class.
81 |
82 | ### bigger_data_pandas
83 | Once students are comfortable with Pandas, this course shows how data measuring gigabytes can be handled in Python.
84 |
85 | ## Miscellaneous lectures
86 |
87 | ### Data Science in Python
88 | An overview of Python, numpy, pandas and scikit-learn (one hour long)
--------------------------------------------------------------------------------
/postcell.conf.bak:
--------------------------------------------------------------------------------
1 | {
2 | "url" : "https://postcell.io/post_cell",
3 | "student_id" : "YOUR_STUDENTS_ENTER_THEIR_NAME_OR_ID_HERE",
4 | "instructor_id": "doJIH2jibYWOOJyGm6zsDNe93722",
5 | "class_id": "CLASS_ID_HERE",
6 | "should_send_to_server" : "true"
7 | }
--------------------------------------------------------------------------------
/programs/calc-sum-pd.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("infile")
6 | args = parser.parse_args()
7 |
8 | infile = args.infile
9 |
10 | data_df = pd.read_csv(infile)
11 | print(data_df[data_df.killer.isin(["Arya Stark", "Jon Snow"])].killer.value_counts())
12 |
--------------------------------------------------------------------------------
/programs/calc-sum.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser()
5 | parser.add_argument("infile")
6 | args = parser.parse_args()
7 |
8 | infile = args.infile
9 |
10 | jon = 0 #variable containing Jon's score
11 | arya = 0 #variable containing Arya's score
12 |
13 | #Open file
14 | file = open(infile, encoding='utf8')
15 |
16 | #Go through each line in file
17 | for line in file:
18 | tokens = line.split(',') #separate line into columns
19 | if tokens[4]=="Arya Stark": arya = arya + 1
20 | if tokens[4]=="Jon Snow":
21 | jon = jon + 1
22 |
23 | file.close()
24 | print("Arya killed", arya, "people")
25 | print("Jon killed", jon, "people")
--------------------------------------------------------------------------------
/programs/gen-shakespeare.py:
--------------------------------------------------------------------------------
1 | # This program is used to demonstrate the debugger, shouldn't be used for anything else!
2 | import gzip
3 | import random
4 |
5 | src_dst = {}
6 | prev_word = ""
7 |
8 | with gzip.open('../datasets/shakespeare/shakespeare.txt.gz','rt') as infile:
9 | for line in infile:
10 | toks = line\
11 | .strip()\
12 | .lower()\
13 | .split()
14 | for tok in toks:
15 | if prev_word not in src_dst: src_dst[prev_word] = {}
16 | dst_dict = src_dst[prev_word]
17 | dst_freq = dst_dict.get(tok, 0) + 1
18 | src_dst[prev_word][tok] = dst_freq
19 | prev_word = tok
20 |
21 | #Randomly pick a word from data
22 | chosen_word = random.choice(list(src_dst.keys()))
23 |
24 | #Get words and thier distributions
25 | def get_word_dist(dist):
26 | pop = list(dist.keys())
27 | weights = list(distribution.values())
28 | return pop, weights
29 |
30 | #For each word, pick the following word with probability propotional to its frequency in the src_dst dictionary
31 | for i in range(100):
32 | distribution = src_dst[chosen_word]
33 | pop, weights = get_word_dist(distribution)
34 |
35 | next_word = random.choices(pop, weights=weights)[0]
36 | chosen_word = next_word
37 |
38 | print(next_word, end=' ')
39 | if(next_word.endswith(".")): print("")
40 |
--------------------------------------------------------------------------------
/programs/killings_per_season.py:
--------------------------------------------------------------------------------
1 | data_file_location = "../datasets/deaths-in-gameofthrones/game-of-thrones-deaths-data.csv"
2 |
3 | season_d = dict()
4 |
5 | #Open file
6 | file = open(data_file_location, encoding='utf8')
7 |
8 | #Go through each line in file
9 | for line in file:
10 | tokens = line.split(',') #separate line into columns
11 | s = tokens[1]
12 | if s not in season_d: season_d[s] = 1
13 | else: season_d[s] += 1
14 |
15 | file.close()
16 |
17 | print(season_d)
--------------------------------------------------------------------------------
/programs/logging.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "formatters": {
4 | "simple": {
5 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
6 | }
7 | },
8 | "handlers": {
9 | "console": {
10 | "class": "logging.StreamHandler",
11 | "level": "DEBUG",
12 | "formatter": "simple",
13 | "stream": "ext://sys.stdout"
14 | }
15 | },
16 | "loggers": {
17 | "root": {
18 | "level": "DEBUG",
19 | "handlers": ["console"]
20 | }
21 | }
22 | }
--------------------------------------------------------------------------------
/programs/maximum_bad_debug.py:
--------------------------------------------------------------------------------
1 | def maximum(numbers):
2 | #pass # pass means "do nothing", add your code here
3 | max_value = 0
4 | for num in numbers:
5 | if num > max_value:
6 | max_value = num
7 | return max_value
8 |
9 | def main():
10 | print("Let us find the maximum value from the following list:")
11 | list_of_nums = [1,2,3]
12 | max_value = maximum(list_of_nums)
13 | print(max_value)
14 |
15 | print("Let us find the maximum value from another list:")
16 | list_of_nums = [-1, -2, -3]
17 | max_value = maximum(list_of_nums)
18 | print(max_value)
19 |
20 | print(f"Value of __name__ is {__name__}")
21 |
22 | if __name__ == "__main__":
23 | main()
24 |
--------------------------------------------------------------------------------
/programs/maximum_bad_logging.log:
--------------------------------------------------------------------------------
1 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - WARNING - This program is being run from the command line
2 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - INFO - Let us find the maximum value from the following list:
3 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - num:1, max_value:0
4 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value
5 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - num:2, max_value:1
6 | 2024-03-01 21:39:37,189 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value
7 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - DEBUG - num:3, max_value:2
8 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - DEBUG - max > max_value branch taken. Setting new max_value
9 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - INFO - 3
10 | 2024-03-01 21:39:37,201 - maximum_bad_logging.py - INFO - Let us find the maximum value from another list:
11 | 2024-03-01 21:39:37,205 - maximum_bad_logging.py - DEBUG - num:-1, max_value:0
12 | 2024-03-01 21:39:37,205 - maximum_bad_logging.py - DEBUG - num:-2, max_value:0
13 | 2024-03-01 21:39:37,206 - maximum_bad_logging.py - DEBUG - num:-3, max_value:0
14 | 2024-03-01 21:39:37,206 - maximum_bad_logging.py - INFO - 0
15 |
--------------------------------------------------------------------------------
/programs/maximum_bad_logging.py:
--------------------------------------------------------------------------------
1 | import logging.config
2 | import json
3 | import os
4 |
# Load the logging configuration (dictConfig schema) from logging.json
# and apply it before any log records are emitted.
with open("logging.json", "r") as f:
    json_config = json.load(f)
logging.config.dictConfig(json_config)

# Name the logger after this file so every record identifies its source.
fname = os.path.basename(__file__)
log = logging.getLogger(fname) # <= This line makes the logger name more useful
11 |
def maximum(numbers):
    """Return the largest value in `numbers`.

    The running maximum is seeded with the first element rather than 0:
    seeding with 0 wrongly returns 0 for lists of only negative numbers
    (the adjacent log file shows exactly this failure for [-1, -2, -3]).

    Parameters
    ----------
    numbers : non-empty sequence of mutually comparable values

    Returns
    -------
    The largest element of `numbers`.

    Raises
    ------
    ValueError
        If `numbers` is empty (mirrors the built-in max()).
    """
    if not numbers:
        raise ValueError("maximum() arg is an empty sequence")
    max_value = numbers[0]  # seed with a real element, never a magic 0
    for num in numbers:
        log.debug(f"num:{num}, max_value:{max_value}")
        if num > max_value:
            log.debug("max > max_value branch taken. Setting new max_value")
            max_value = num
    return max_value
21 |
def main():
    """Run maximum() over two sample lists, logging prompts and results."""
    demos = [
        ("Let us find the maximum value from the following list:", [1,2,3]),
        ("Let us find the maximum value from another list:", [-1, -2, -3]),
    ]
    for prompt, list_of_nums in demos:
        log.info(prompt)
        log.info(maximum(list_of_nums))
32 |
if __name__ == "__main__":
    # Runs only when executed as a script, not when imported as a module.
    # Plain string literal: the original f-prefix had no placeholders.
    log.warning("This program is being run from the command line")
    main()
36 |
--------------------------------------------------------------------------------
/programs/svm_or_logreg_strategy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder
5 | from sklearn.svm import SVC
6 | from sklearn.linear_model import LogisticRegression
7 | from sklearn.metrics import confusion_matrix
8 |
def build_classifier(args):
    """Train the selected classifier on a CSV dataset and return the
    confusion matrix computed on a held-out 20% test split.

    Parameters
    ----------
    args : argparse.Namespace with attributes:
        csv         -- path to the input CSV file
        target_col  -- name of the target column
        ignore_cols -- list of column names to drop (may be empty)
        model       -- "svm" or "logreg"

    Returns
    -------
    numpy.ndarray
        Confusion matrix of predictions on the test set.

    Raises
    ------
    ValueError
        If args.model is not a recognized model name.
    """
    # Read CSV file
    df = pd.read_csv(args.csv)

    # Ignore specified columns
    if args.ignore_cols:
        df = df.drop(columns=args.ignore_cols, errors='ignore')

    # One-hot encode categorical FEATURE columns only. The target column
    # must be excluded: if it is string-typed, get_dummies would explode
    # it into dummy columns and the label-encoding step below would then
    # fail with a KeyError on the vanished column name.
    categorical_cols = df.select_dtypes(include=['object']).columns.difference([args.target_col])
    if len(categorical_cols) > 0:
        df = pd.get_dummies(df, columns=list(categorical_cols))

    # Encode target column as integer class labels
    label_encoder = LabelEncoder()
    df[args.target_col] = label_encoder.fit_transform(df[args.target_col])

    # Split data into features and target
    X = df.drop(columns=[args.target_col])
    y = df[args.target_col]

    # Split data into train and test sets (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Choose classifier model: we pick the right 'strategy' at runtime
    if args.model == "svm":
        clf = SVC()
    elif args.model == "logreg":
        clf = LogisticRegression()
    else:
        # Guard against callers that bypass the argparse `choices` check;
        # without this, `clf` would be unbound (NameError) below.
        raise ValueError(f"Unknown model: {args.model!r}")

    # Train classifier -- identical code for either strategy
    clf.fit(X_train, y_train)

    # Predict on test set -- identical code for either strategy
    y_pred = clf.predict(X_test)

    # Calculate and return confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    return cm
49 |
if __name__ == "__main__":
    # Command-line entry point: parse arguments, train, and print the
    # resulting confusion matrix.
    parser = argparse.ArgumentParser(description="Build a classifier")
    parser.add_argument("--model", choices=["svm", "logreg"], required=True, help="Type of classifier model")
    parser.add_argument("--csv", required=True, help="CSV input file")
    parser.add_argument("--target-col", required=True, help="Target column name")
    parser.add_argument("--ignore-cols", nargs='+', default=[], help="Columns to ignore")

    args = parser.parse_args()
    # Named `cm`, not `confusion_matrix`: the original assignment shadowed
    # the sklearn confusion_matrix function imported at the top of the file.
    cm = build_classifier(args)
    print("Confusion Matrix:")
    print(cm)
61 |
--------------------------------------------------------------------------------
/python_tableofcontents.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/python_tableofcontents.xlsx
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/src/__init__.py
--------------------------------------------------------------------------------
/src/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/src/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/src/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/falconair/ProgrammingForAnalytics/eb1773cc2d19500e08938dfebd084b5ced5b6877/src/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | # https://techoverflow.net/2017/02/26/requests-download-file-if-it-doesnt-exist/
2 |
3 | import requests
4 | import os.path
5 |
def download_file(filename, url):
    """
    Download a URL to a file.

    The HTTP request is issued and validated BEFORE the destination file
    is opened, so a failed request (non-2xx status, connection error)
    no longer leaves an empty or truncated file behind on disk.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as fout:
        # Write response data to file in fixed-size chunks
        for block in response.iter_content(4096):
            fout.write(block)
16 |
def download_if_not_exists(filename, url):
    """
    Download a URL to a file if the file
    does not exist already.

    Returns
    -------
    True if the file was downloaded,
    False if it already existed
    """
    # Guard clause: nothing to do when the file is already on disk.
    if os.path.exists(filename):
        return False
    download_file(filename, url)
    return True
31 |
32 |
--------------------------------------------------------------------------------