├── .gitignore ├── LICENSE ├── README.md ├── chapter_10_examples ├── jupyter_notebooks │ ├── a_humble_pie.ipynb │ ├── covid_FDI_impact.ipynb │ ├── refined_covid_barchart.ipynb │ ├── retirement_age.ipynb │ └── schools_that_work.ipynb └── standalone_files │ ├── a_humble_pie.py │ ├── covid_FDI_impact.py │ ├── refined_covid_barchart-bak.py │ ├── refined_covid_barchart.py │ ├── retirement_age.py │ └── schools_that_work.py ├── chapter_1_examples ├── jupyter_notebooks │ └── hello_world.ipynb └── standalone_files │ └── hello_world.py ├── chapter_2_examples ├── jupyter_notebooks │ ├── basic_greeting.ipynb │ ├── greet_me.ipynb │ ├── hitting_the_road_with_citibike.ipynb │ ├── method_madness.ipynb │ ├── noun_examples.ipynb │ ├── page_count_conditional.ipynb │ ├── page_count_custom_function.ipynb │ ├── page_count_loop.ipynb │ ├── page_count_printout.ipynb │ └── parts_of_speech.ipynb └── standalone_files │ ├── basic_greeting.py │ ├── greet_me.py │ ├── hitting_the_road_with_citibike.py │ ├── method_madness.py │ ├── noun_examples.py │ ├── page_count_conditional.py │ ├── page_count_custom_function.py │ ├── page_count_loop.py │ ├── page_count_printout.py │ └── parts_of_speech.py ├── chapter_4_examples ├── jupyter_notebooks │ ├── csv_parsing.ipynb │ ├── fixed_width_parsing.ipynb │ ├── json_parsing.ipynb │ ├── ods_parsing.ipynb │ ├── pdf_parsing.ipynb │ ├── rss_parsing.ipynb │ ├── tsv_parsing.ipynb │ ├── txt_parsing.ipynb │ ├── xls_parsing.ipynb │ ├── xlsx_parsing.ipynb │ └── xml_parsing.ipynb └── standalone_files │ ├── csv_parsing.py │ ├── fixed_width_parsing.py │ ├── json_parsing.py │ ├── ods_parsing.py │ ├── pdf_parsing.py │ ├── rss_parsing.py │ ├── tsv_parsing.py │ ├── txt_parsing.py │ ├── xls_parsing.py │ ├── xlsx_parsing.py │ └── xml_parsing.py ├── chapter_5_examples ├── .gitignore ├── jupyter_notebooks │ ├── FRED_API_example.ipynb │ ├── MTA_turnstiles_data_download.ipynb │ ├── MTA_turnstiles_index.ipynb │ ├── MTA_turnstiles_parsing.ipynb │ ├── Twitter_data_download.ipynb │ └── data_download.ipynb └── standalone_files │ ├── FRED_API_example.py │ ├── MTA_turnstile_index.py │ ├── MTA_turnstiles_data_download.py │ ├── MTA_turnstiles_parsing.py │ ├── Twitter_data_download.py │ └── data_download.py ├── chapter_6_examples ├── jupyter_notebooks │ ├── ppp_columns_review.ipynb │ ├── ppp_columns_summary.ipynb │ ├── ppp_data_join.ipynb │ ├── ppp_data_samples.ipynb │ ├── ppp_date_range.ipynb │ ├── ppp_find_waterford.ipynb │ ├── ppp_lender_names.ipynb │ ├── ppp_loan_status.ipynb │ ├── ppp_loan_uses.ipynb │ ├── ppp_min_max_loan.ipynb │ └── ppp_numrows.ipynb └── standalone_files │ ├── ppp_columns_review.py │ ├── ppp_columns_summary.py │ ├── ppp_data_join.py │ ├── ppp_data_samples.py │ ├── ppp_date_range.py │ ├── ppp_find_waterford.py │ ├── ppp_lender_names.py │ ├── ppp_loan_status.py │ ├── ppp_loan_uses.py │ ├── ppp_min_max_loan.py │ └── ppp_numrows.py ├── chapter_7_examples ├── jupyter_notebooks │ ├── citibike_september1_rides.ipynb │ ├── fixed_width_strip_parsing.ipynb │ ├── ppp_add_fingerprints.ipynb │ ├── ppp_adding_naics.ipynb │ ├── regex_tests.ipynb │ ├── weekday_rides.ipynb │ ├── xls_meta_and_date_parsing.ipynb │ └── xls_meta_parsing.ipynb └── standalone_files │ ├── citibike_september1_rides.py │ ├── fixed_width_strip_parsing.py │ ├── ppp_add_fingerprints.py │ ├── ppp_adding_naics.py │ ├── regex_tests.py │ ├── weekday_rides.py │ ├── xls_meta_and_date_parsing.py │ └── xls_meta_parsing.py ├── chapter_8_examples ├── jupyter_notebooks │ ├── fixed_width_string_parsing_refactored.ipynb │ ├── greet_me_options.ipynb │ 
├── greet_me_revisited.ipynb │ ├── make_greeting.ipynb │ ├── make_greeting_no_vars.ipynb │ ├── webpage_saver.ipynb │ ├── weekday_rides_refactored.ipynb │ ├── xls_meta_and_date_parsing_refactored.ipynb │ └── xls_meta_and_date_parsing_refactored_again.ipynb └── standalone_files │ ├── fixed_width_strip_parsing_refactored.py │ ├── greet_me_options.py │ ├── greet_me_revisited.py │ ├── make_greeting.py │ ├── make_greeting_no_vars.py │ ├── webpage_saver.py │ ├── weekday_rides_refactored.py │ ├── xls_meta_and_date_parsing_refactored.py │ └── xls_meta_and_date_parsing_refactored_again.py └── chapter_9_examples ├── jupyter_notebooks ├── dollars_per_job_2M_rnd2.ipynb ├── ppp_loan_central_and_dist.ipynb ├── ppp_loan_central_measures.ipynb ├── who_got_2M_with_viz.ipynb ├── who_got_2_loans_by_date.ipynb └── wing_length_with_sd.ipynb └── standalone_files ├── dollars_per_job_2M_rnd2.py ├── ppp_fingerprint_borrowers.py ├── ppp_loan_central_and_dist.py ├── ppp_loan_central_measures.py ├── who_got_2M_with_viz.py ├── who_got_2_loans_by_date.py └── wing_length_with_sd.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | *checkpoint.ipynb 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical Python: Data Wrangling and Data Quality 2 | 3 | This repo contains draft coding exercises for the early-release version of the book _Practical Python: Data Wrangling and Data Quality_ to be published by O'Reilly Media in 2021. 4 | 5 | ## Before You Begin 6 | 7 | Below you will find an overview of this repo's contents, as well as important tips and information on how to use these files. In general, all exercises are accessible as standalone `.py` files, and as Jupyter Notebooks. The notebooks can either be downloaded to your device and run locally, or opened and run in Google Colab (https://colab.research.google.com/). The draft text of Chapter 1 includes basic instructions on how to get started with some of these tools; this text will be updated/completed before final publication. 8 | 9 | ### Working with data files 10 | 11 | Because data sets can often be quite large, the data sets for these exercises are available for download [here](https://drive.google.com/drive/folders/1q_dkJxfsCjeZjWH3Hs2WKWYFSTa7MsBn?usp=sharing). 12 | 13 | #### If you are working locally 14 | Data sets should be downloaded/copied in the same folder as the Python file or notebook, unless otherwise indicated. 
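For example, if an exercise expects a file named `example_data.csv` (a placeholder name used here purely for illustration, not one of the book's actual data sets), place that file next to the script or notebook and open it with a simple relative path. A minimal sketch of what that looks like, using the same `csv` library the exercises rely on:

```python
# minimal sketch: open a data file that sits in the same folder as this script
# "example_data.csv" is a placeholder filename -- use the file named in each exercise
import csv

source_file = open("example_data.csv", "r")

# read the file row by row, using the header row for the column names
reader = csv.DictReader(source_file)
print(reader.fieldnames)

source_file.close()
```

If Python reports a `FileNotFoundError`, the data file is most likely not in the same folder as the code that is trying to read it.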
15 | -------------------------------------------------------------------------------- /chapter_10_examples/jupyter_notebooks/a_humble_pie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import matplotlib.pyplot as plt" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# matplotlib works counterclockwise, so we need to essentially reverse\n", 19 | "# the order of our pie-value \"slices\"\n", 20 | "candidate_names = ['Adams', 'Wiley', 'Garcia', 'Yang', 'Others']\n", 21 | "\n", 22 | "candidate_names.reverse()\n", 23 | "\n", 24 | "vote_pct = [30.8, 21.3, 19.6, 12.2, 16.1]\n", 25 | "\n", 26 | "vote_pct.reverse()\n", 27 | "\n", 28 | "colors = ['#006d2c','#006d2c', '#006d2c', '#31a354','#74c476']\n", 29 | "\n", 30 | "colors.reverse()\n", 31 | "\n", 32 | "fig1, ax1 = plt.subplots()\n", 33 | "\n", 34 | "# by default, the starting axis is the x-axis; making this value 90 ensures\n", 35 | "# that it is a vertical line instead\n", 36 | "ax1.pie(vote_pct, labels=candidate_names, autopct='%.1f%%', startangle=90,\n", 37 | " colors=colors)\n", 38 | "\n", 39 | "ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 40 | "\n", 41 | "# show the plot!\n", 42 | "plt.show()" 43 | ] 44 | } 45 | ], 46 | "metadata": { 47 | "kernelspec": { 48 | "display_name": "Python 3 (ipykernel)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.9.5" 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 4 67 | } 68 | -------------------------------------------------------------------------------- /chapter_10_examples/jupyter_notebooks/covid_FDI_impact.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import matplotlib.pyplot as plt\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# each individual array is a row of data\n", 22 | "FDI = np.array([[0.8, 0.7], [0.3, 0.6]])\n", 23 | "\n", 24 | "fdi_data = pd.DataFrame(data=FDI,\n", 25 | " columns=['Developed', 'Developing'])" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "ax = sns.lineplot(data=fdi_data)\n", 35 | "\n", 36 | "# show the plot!\n", 37 | "plt.show()" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3 (ipykernel)", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.9.5" 58 | } 59 | }, 60 | "nbformat": 4, 61 | 
"nbformat_minor": 4 62 | } 63 | -------------------------------------------------------------------------------- /chapter_10_examples/jupyter_notebooks/retirement_age.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import matplotlib.pyplot as plt\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# (abbreviated) list of countries\n", 22 | "countries = ['Japan', 'Iceland', 'Switzerland', 'France', 'Ireland', 'Germany',\n", 23 | " 'Italy', 'Belgium']\n", 24 | "\n", 25 | "# difference in years between official and actual retirement age\n", 26 | "retirement_gap = [9, 2, 2, -1, -2, -2, -7, -8]\n", 27 | "\n", 28 | "# zip the two lists together, and specify the column names as we make the DataFrame\n", 29 | "retirement_data = pd.DataFrame(list(zip(countries, retirement_gap)),\n", 30 | " columns =['country', 'retirement_gap'])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# in practice, we might prefer to write a function that generates this list,\n", 40 | "# based on our data values\n", 41 | "bar_colors = ['#d01c8b', '#d01c8b', '#d01c8b', '#4dac26','#4dac26','#4dac26',\n", 42 | " '#4dac26','#4dac26']\n", 43 | "\n", 44 | "# pass our data and palette to the `seaborn` `barplot()` function\n", 45 | "ax = sns.barplot(x=\"retirement_gap\", y=\"country\", data=retirement_data, palette=bar_colors)\n", 46 | "\n", 47 | "# show the plot!\n", 48 | "plt.show()" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3 (ipykernel)", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.9.5" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 4 73 | } 74 | -------------------------------------------------------------------------------- /chapter_10_examples/jupyter_notebooks/schools_that_work.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5c1e35d0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import matplotlib.pyplot as plt\n", 11 | "import seaborn as sns\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "bd7e1932", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 23 | "# # Import PyDrive and associated libraries.\n", 24 | "# # This only needs to be done once per notebook.\n", 25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 26 | "# from pydrive.auth import GoogleAuth\n", 27 | "# from pydrive.drive import GoogleDrive\n", 28 | "# from google.colab import auth\n", 29 | "# from oauth2client.client import GoogleCredentials\n", 30 | "\n", 31 | "# # Authenticate and create the PyDrive client.\n", 32 | "# # 
This only needs to be done once per notebook.\n", 33 | "# auth.authenticate_user()\n", 34 | "# gauth = GoogleAuth()\n", 35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 36 | "# drive = GoogleDrive(gauth)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "655aebfe", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1_Kd6AUWyLirPpneW0kkeA_5WmEKRXnfl/view?usp=sharing\n", 48 | "# file_id = '1_Kd6AUWyLirPpneW0kkeA_5WmEKRXnfl' # notice where this string comes from in link above\n", 49 | "\n", 50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 51 | "# print(imported_file['title']) # it should print the title of desired file\n", 52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "74239aff", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# import the school test data\n", 63 | "school_data = pd.read_csv(\"apib12tx.csv\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "c7b98a9c", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# plot test scores against the percentage of students receiving meal support\n", 74 | "sns.scatterplot(data=school_data, x=\"MEALS\", y=\"API12B\", alpha=0.6, linewidth=0)\n", 75 | "\n", 76 | "# highlight a high-performing school\n", 77 | "highlight_school = school_data[school_data['SNAME'] == \"Chin (John Yehall) Elementary\"]\n", 78 | "plt.scatter(highlight_school['MEALS'], highlight_school['API12B'],\n", 79 | " color='orange', alpha=1.0)\n", 80 | "\n", 81 | "# show the plot!\n", 82 | "plt.show()" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.9.5" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 5 107 | } 108 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/a_humble_pie.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | # matplotlib works counterclockwise, so we need to essentially reverse 4 | # the order of our pie-value "slices" 5 | candidate_names = ['Adams', 'Wiley', 'Garcia', 'Yang', 'Others'] 6 | 7 | candidate_names.reverse() 8 | 9 | vote_pct = [30.8, 21.3, 19.6, 12.2, 16.1] 10 | 11 | vote_pct.reverse() 12 | 13 | colors = ['#006d2c','#006d2c', '#006d2c', '#31a354','#74c476'] 14 | 15 | colors.reverse() 16 | 17 | fig1, ax1 = plt.subplots() 18 | 19 | # by default, the starting axis is the x-axis; making this value 90 ensures 20 | # that it is a vertical line instead 21 | ax1.pie(vote_pct, labels=candidate_names, autopct='%.1f%%', startangle=90, 22 | colors=colors) 23 | 24 | ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. 25 | 26 | # show the plot! 
27 | plt.show() 28 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/covid_FDI_impact.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import seaborn as sns 4 | import numpy as np 5 | 6 | # each individual array is a row of data 7 | FDI = np.array([[0.8, 0.7], [0.3, 0.6]]) 8 | 9 | fdi_data = pd.DataFrame(data=FDI, 10 | columns=['Developed', 'Developing']) 11 | 12 | ax = sns.lineplot(data=fdi_data) 13 | 14 | # show the plot! 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/refined_covid_barchart-bak.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from matplotlib.ticker import FuncFormatter 5 | from matplotlib.dates import DateFormatter 6 | from datetime import datetime 7 | import numpy as np 8 | 9 | vaccine_data = pd.read_csv('owid-covid-data.csv') 10 | vaccine_data['date']= pd.to_datetime(vaccine_data['date']) 11 | country_and_month = vaccine_data.groupby('iso_code').resample('M', on='date').sum() 12 | country_and_month_update = country_and_month.reset_index() 13 | just_USA = country_and_month_update[country_and_month_update['iso_code']=='USA'] 14 | 15 | ax = sns.barplot(x="date", y="new_cases", palette=['grey'], data=just_USA) 16 | plt.show() 17 | 18 | def millions(val, pos): 19 | # the two arguments are the value and tick position 20 | modified_val = val*1e-6 21 | formatted_val = str(modified_val) 22 | if val == ax.get_ylim()[1]: 23 | formatted_val = formatted_val+'M' 24 | if val == 0: 25 | formatted_val = "0" 26 | return formatted_val 27 | #return '$%1.1fM' % (val*1e-6) 28 | 29 | def custom_dates(val,pos): 30 | dates_list = just_USA.date.tolist() 31 | current_value = dates_list[pos] 32 | current_month = datetime.strftime(current_value, '%b') 33 | date_label = current_month 34 | if date_label == 'Jan': 35 | date_label = date_label + " '"+ datetime.strftime(current_value, '%y') 36 | return date_label 37 | 38 | y_formatter = FuncFormatter(millions) 39 | x_formatter = FuncFormatter(custom_dates) 40 | 41 | # using a seaborn theme will make customization harder, so skip it 42 | #sns.set_theme(style="whitegrid") 43 | # make a barplot 44 | ax = sns.barplot(x="date", y="new_cases", palette=['grey'], data=just_USA) 45 | 46 | for i,bar in enumerate(ax.patches): 47 | if i == 6: 48 | bar.set_color('red') 49 | 50 | ax.set_ylim(0,7000000) 51 | 52 | # setting axis labels 53 | plt.xlabel('Month') 54 | plt.ylabel('New cases (M)') 55 | 56 | # if you want to use rcParams, you need to use them *before* tick_params 57 | # rcParams is the interactive version of a matplotlib stylesheet 58 | # https://matplotlib.org/stable/tutorials/introductory/customizing.html 59 | 60 | plt.rcParams['xtick.bottom'] = False 61 | 62 | # manipulate the axis attributes 63 | # https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.tick_params.html 64 | 65 | ax.tick_params(direction='out', length=10, width=1, color='black', colors='black',pad=4, grid_color='black', grid_alpha=1, rotation=45) 66 | 67 | # apply custom number formatter to y axis 68 | ax.yaxis.set_major_formatter(y_formatter) 69 | ax.xaxis.set_major_formatter(x_formatter) 70 | 71 | 72 | # by default, this is in "data coordinates"; e.g. 
a value of 1 will left-align the start 73 | # of the text with the center point of the first (in this case) column. 74 | # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.text.html 75 | # also, the "y" value is the bottom of the text, including multi-line text 76 | ax.text(4,3000000, "Confirmed cases\noften lag infection\nby several weeks."); 77 | 78 | bar_value = just_USA.new_cases.tolist() 79 | ax.vlines( x = 6, color='black', linewidth=1, alpha=.7, 80 | ymin = bar_value[6]+100000, ymax = 3000000-100000); 81 | 82 | # ha! It uses LaTeX for text layout and mainpulation 83 | # https://matplotlib.org/2.0.2/users/usetex.html 84 | # plt.rc('text', usetex=True) 85 | # plt.title(r"\textbf{Something}, but then also\\ something else") 86 | # the following titles overwrite each other - seaborn uses matplotlib under the hood 87 | plt.title("COVID-19 cases spike following relaxed restrictions\nin the spring of 2020", fontweight="bold") 88 | # ax.set_title('COVID-19 cases spike following relaxed restrictions in the spring of 2020'); 89 | 90 | plt.show() 91 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/refined_covid_barchart.py: -------------------------------------------------------------------------------- 1 | # `pandas` for data loading; `seaborn` and `matplotlib` for visuals 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | # `FuncFormatter` to format axis labels 7 | from matplotlib.ticker import FuncFormatter 8 | 9 | # `datetime` to interpret and customize dates 10 | from datetime import datetime 11 | 12 | # load the data 13 | vaccine_data = pd.read_csv('owid-covid-data.csv') 14 | 15 | # convert the `date` column to a "real" date 16 | vaccine_data['date']= pd.to_datetime(vaccine_data['date']) 17 | 18 | # group the data by country and month 19 | country_and_month = vaccine_data.groupby('iso_code').resample('M', 20 | on='date').sum() 21 | 22 | # use `reset_index()` to "flatten" the DataFrame headers 23 | country_and_month_update = country_and_month.reset_index() 24 | 25 | # select just the United States' data 26 | just_USA = country_and_month_update[country_and_month_update['iso_code']=='USA'] 27 | 28 | # make the foundational barplot with `seaborn` 29 | ax = sns.barplot(x="date", y="new_cases", palette=['#bababa'], data=just_USA) 30 | 31 | # loop through the bars rectangles and set the color for the July, 2020 32 | # bar to red 33 | for i, bar in enumerate(ax.patches): 34 | if i == 6: 35 | bar.set_color('#ca0020') 36 | 37 | # set the maximum y-axis value to 7M 38 | ax.set_ylim(0,7000000) 39 | 40 | # setting the axis labels 41 | plt.xlabel('Month') 42 | plt.ylabel('New cases (M)') 43 | 44 | # modify the color, placement and orientation of the "tick labels" 45 | ax.tick_params(direction='out', length=5, width=1, color='#404040', 46 | colors='#404040',pad=4, grid_color='#404040', grid_alpha=1, 47 | rotation=45) 48 | 49 | # functions for formatting the axis "tick labels" 50 | # `millions()` will convert the scientific notation to millions of cases 51 | def millions(val, pos): 52 | modified_val = val*1e-6 53 | formatted_val = str(modified_val) 54 | if val == ax.get_ylim()[1]: 55 | formatted_val = formatted_val+'M' 56 | if val == 0: 57 | formatted_val = "0" 58 | return formatted_val 59 | 60 | 61 | # `custom_dates()` will abbreviate the dates to be more readable 62 | def custom_dates(val, pos): 63 | dates_list = just_USA.date.tolist() 64 | date_label = "" 65 | if pos is not 
None: 66 | current_value = dates_list[pos] 67 | current_month = datetime.strftime(current_value, '%b') 68 | date_label = current_month 69 | if date_label == 'Jan': 70 | date_label = date_label + " '"+ datetime.strftime(current_value, 71 | '%y') 72 | return date_label 73 | 74 | 75 | # assign formatter functions 76 | y_formatter = FuncFormatter(millions) 77 | x_formatter = FuncFormatter(custom_dates) 78 | 79 | # apply the formatter functions to the appropriate axis 80 | ax.yaxis.set_major_formatter(y_formatter) 81 | ax.xaxis.set_major_formatter(x_formatter) 82 | 83 | # create and position the annotation text 84 | ax.text(4, 3000000, "Confirmed cases\noften lag infection\nby several weeks.") 85 | 86 | # get the value of all bars as a list 87 | bar_value = just_USA.new_cases.tolist() 88 | 89 | # create the leader line 90 | ax.vlines( x = 6, color='#404040', linewidth=1, alpha=.7, 91 | ymin = bar_value[6]+100000, ymax = 3000000-100000) 92 | 93 | # set the title of the chart 94 | plt.title("COVID-19 cases spike following relaxed restrictions\n" + \ 95 | "in the spring of 2020", fontweight="bold") 96 | 97 | # show the chart! 98 | plt.show() 99 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/retirement_age.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import seaborn as sns 4 | import numpy as np 5 | 6 | # (abbreviated) list of countries 7 | countries = ['Japan', 'Iceland', 'Switzerland', 'France', 'Ireland', 'Germany', 8 | 'Italy', 'Belgium'] 9 | 10 | # difference in years between official and actual retirement age 11 | retirement_gap = [9, 2, 2, -1, -2, -2, -7, -8] 12 | 13 | # zip the two lists together, and specify the column names as we make the DataFrame 14 | retirement_data = pd.DataFrame(list(zip(countries, retirement_gap)), 15 | columns =['country', 'retirement_gap']) 16 | 17 | # in practice, we might prefer to write a function that generates this list, 18 | # based on our data values 19 | bar_colors = ['#d01c8b', '#d01c8b', '#d01c8b', '#4dac26','#4dac26','#4dac26', 20 | '#4dac26','#4dac26'] 21 | 22 | # pass our data and palette to the `seaborn` `barplot()` function 23 | ax = sns.barplot(x="retirement_gap", y="country", data=retirement_data, palette=bar_colors) 24 | 25 | # show the plot! 26 | plt.show() 27 | -------------------------------------------------------------------------------- /chapter_10_examples/standalone_files/schools_that_work.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | 5 | # import the school test data 6 | school_data = pd.read_csv("apib12tx.csv") 7 | 8 | # plot test scores against the percentage of students receiving meal support 9 | sns.scatterplot(data=school_data, x="MEALS", y="API12B", alpha=0.6, linewidth=0) 10 | 11 | # highlight a high-performing school 12 | highlight_school = school_data[school_data['SNAME'] == "Chin (John Yehall) Elementary"] 13 | plt.scatter(highlight_school['MEALS'], highlight_school['API12B'], 14 | color='orange', alpha=1.0) 15 | 16 | # show the plot! 
17 | plt.show() 18 | -------------------------------------------------------------------------------- /chapter_1_examples/jupyter_notebooks/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "id": "vm-eoxO_wKZi" 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# The code below should print \"Hello World!\"\n", 22 | "print(\"Hello World!\")" 23 | ] 24 | } 25 | ], 26 | "metadata": { 27 | "colab": { 28 | "authorship_tag": "ABX9TyPewIxW4Coe6EfnEBAylqJX", 29 | "collapsed_sections": [], 30 | "include_colab_link": true, 31 | "name": "HelloWorld.ipynb", 32 | "provenance": [] 33 | }, 34 | "kernelspec": { 35 | "display_name": "Python 3 (ipykernel)", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.9.5" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 1 54 | } 55 | -------------------------------------------------------------------------------- /chapter_1_examples/standalone_files/hello_world.py: -------------------------------------------------------------------------------- 1 | # The code below should print "Hello World!" 2 | print("Hello World!") 3 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/basic_greeting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Creating custom functions\n", 8 | "\n", 9 | "One of the potentially tricky things about Jupyter Notebooks is that it's possible to run the code \"out of order\", which can sometimes cause problems. For example, select `Kernel -> Restart & Clear Output`, and then try running the last cell below _before_ running the second to last cell - you'll get an error.\n", 10 | "\n", 11 | "Recall that computers read code top-to-bottom, and left-to-right. Jupyter Notebooks let us break that rule, which can make it easier to test and troubleshoot small bits of code. At the same time, accidentally running cells out of order can generate errors even if the code--when \"read\" correctly--works properly. For example, if you run the last cell _again_ after running the third cell, it will now work: the `greet_me` function is defined in the third cell, and the computer still \"remembers\" it when it is referenced in the fourth cell--even though they are in separate cells.\n", 12 | "\n", 13 | "If you're having problems with code in a Jupyter Notebook that you're fairly sure should work (such as examples from this book), try choosing `Kernel -> Restart & Run All` to see if that solves it. You can also use the `Kernel -> Restart & Clear Output` command above to clear the numbering to the left of the cells if it has gotten out of order and you want to start again." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Susan E. McGregor\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "# create a variable named author, and set its contents to \"Susan E. McGregor\"\n", 31 | "# using the assignment operator, '='\n", 32 | "author = \"Susan E. McGregor\" \n", 33 | "\n", 34 | "# confirm that the computer \"remembers\" what's in the `author` variable\n", 35 | "# by using the built-in 'print' function\n", 36 | "print(author)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Hello Susan E. McGregor\n", 49 | "Hello Jeff Bleiel\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "# create a variable named author\n", 55 | "author = \"Susan E. McGregor\" \n", 56 | "\n", 57 | "# create another variable named editor\n", 58 | "editor = \"Jeff Bleiel\"\n", 59 | "\n", 60 | "# use the built-in print function to output \"Hello\" messages to each person\n", 61 | "print(\"Hello \"+author)\n", 62 | "print(\"Hello \"+editor)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3 (ipykernel)", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.9.5" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 4 87 | } 88 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/greet_me.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "852ff27e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# create a function that prints out a greeting\n", 11 | "# to any name passed to the function\n", 12 | "def greet_me(a_name):\n", 13 | " print(\"Hello \"+a_name)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "c74beb6b", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# create a variable named author\n", 24 | "author = \"Susan E. 
McGregor\"\n", 25 | "\n", 26 | "# create another variable named editor\n", 27 | "editor = \"Jeff Bleiel\"" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ee3022f1", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# use my custom function, `greet_me` to output \"Hello\" messages to each person\n", 38 | "greet_me(author)\n", 39 | "greet_me(editor)" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 | "display_name": "Python 3 (ipykernel)", 46 | "language": "python", 47 | "name": "python3" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.9.5" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 5 64 | } 65 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/hitting_the_road_with_citibike.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Question: How many Citi Bike rides each day are taken by\n", 10 | "# \"subscribers\" versus \"customers\"?\n", 11 | "\n", 12 | "# Answer: Choose a single day of rides to examine.\n", 13 | "\n", 14 | "# The dataset used for this exercise was generated from the original\n", 15 | "# Citi Bike system data found here: https://s3.amazonaws.com/tripdata/index.html\n", 16 | "# Filename: 202009-citibike-tripdata.csv.zip\n", 17 | "# Program Outline:\n", 18 | "# 1. Read in the data file: 202009CtibikeTripdataExample.csv\n", 19 | "# 2. Create variables to count: subscribers, customers, and other\n", 20 | "# 3. For each row in the file:\n", 21 | "# a. If the \"User Type\" is \"Subscriber,\" add 1 to \"subscriber_count\"\n", 22 | "# b. If the \"User Type\" is \"Customer,\" add 1 to \"customer_count\"\n", 23 | "# c. Otherwise, add 1 to the \"other\" variable\n", 24 | "# 4. 
Print out my results" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# import the `csv` library\n", 34 | "import csv\n", 35 | "\n", 36 | "# open the `202009CitibikeTripdataExample.csv` file in read (\"r\") mode\n", 37 | "# this file should be in the same folder as our Python script or notebook\n", 38 | "source_file = open(\"202009CitibikeTripdataExample.csv\",\"r\")\n", 39 | "\n", 40 | "# pass our `source_file` as an ingredient to the `csv` library's\n", 41 | "# DictReader \"recipe\".\n", 42 | "# Store the result in a variable called `citibike_reader`\n", 43 | "citibike_reader = csv.DictReader(source_file)\n", 44 | "\n", 45 | "# the DictReader method has added some useful information to our data,\n", 46 | "# like a `fieldnames` property that lets us access all the values\n", 47 | "# in the first or \"header\" row\n", 48 | "print(citibike_reader.fieldnames)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# create a variable to hold the count of each type of Citi Bike user\n", 58 | "# assign or \"initialize\" each with a value of zero (0)\n", 59 | "subscriber_count = 0\n", 60 | "customer_count = 0\n", 61 | "other_user_count = 0" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Step 3: Loop through every row of our data\n", 71 | "for a_row in citibike_reader:\n", 72 | "\n", 73 | " # Step 3a: if the value in the `usertype` column\n", 74 | " # of the current row is \"Subscriber\"\n", 75 | " if a_row[\"usertype\"] == \"Subscriber\":\n", 76 | "\n", 77 | " # add 1 to `subscriber_count`\n", 78 | " subscriber_count = subscriber_count +1\n", 79 | "\n", 80 | " # Step 3b: otherwise (else), if the value in the `usertype` column\n", 81 | " # of the current row is \"Customer\"\n", 82 | " elif a_row[\"usertype\"] == \"Customer\":\n", 83 | "\n", 84 | " # add 1 to `customer_count`\n", 85 | " customer_count = customer_count + 1\n", 86 | "\n", 87 | " # Step 3c: the `usertype` value is _neither_ \"Subscriber\" nor \"Customer\",\n", 88 | " # so we'll add 1 to our catch-all `other_user_count` variable\n", 89 | " else:\n", 90 | " other_user_count = other_user_count + 1" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Step 4: Print out our results, being sure to include \"labels\" in the process:\n", 100 | "print(\"Number of subscribers:\")\n", 101 | "print(subscriber_count)\n", 102 | "print(\"Number of customers:\")\n", 103 | "print(customer_count)\n", 104 | "print(\"Number of 'other' users:\")\n", 105 | "print(other_user_count)" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "colab": { 111 | "name": "hitting_the_road_with_citibike.ipynb", 112 | "provenance": [] 113 | }, 114 | "kernelspec": { 115 | "display_name": "Python 3 (ipykernel)", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.9.5" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 1 134 | } 135 | 
-------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/method_madness.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a21dc1b6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# splitting a string \"literal\" and then printing the result\n", 11 | "split_world = \"Hello World!\".split()\n", 12 | "print(split_world)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "f0ba92b0", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# assigning a string to a variable\n", 23 | "# then printing the result of calling the `split()` method on it\n", 24 | "world_msg = \"Hello World!\"\n", 25 | "print(world_msg.split())" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "id": "f75166e6", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# the following will produce an error because\n", 36 | "# the `split()` method must be called on a string in order to work!\n", 37 | "split(\"Hello World!\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "6273c7bc", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# the following will produce an error because\n", 48 | "# there is no `split()` method for numbers!\n", 49 | "print(5.split())" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "bdf09244", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3 (ipykernel)", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.9.5" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 5 82 | } 83 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/noun_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Working with variables\n", 8 | "\n", 9 | "Note that, unlike our last example, when we assign a value to a variable, Jupyter Notebook _doesn't_ print out its value automatically when we run that cell.\n", 10 | "\n", 11 | "The subsequent cells, however, work just as they would in an standalone `.py` file." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# create a variable named author, set its contents to \"Susan E. McGregor\"\n", 21 | "author = \"Susan E. McGregor\"\n", 22 | "\n", 23 | "# confirm that the computer \"remembers\" what's in the `author` variable\n", 24 | "print(author)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# create a variable named nyc_resident, set its contents to \"Susan E. McGregor\"\n", 34 | "nyc_resident = \"Susan E. 
McGregor\"\n", 35 | "\n", 36 | "# confirm that the computer \"remembers\" what's in the `nyc_resident` variable\n", 37 | "print(nyc_resident)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# create a variable named fuzzyPinkBunny, set its contents to \"Susan E. McGregor\"\n", 47 | "fuzzyPinkBunny = \"Susan E. McGregor\"\n", 48 | "\n", 49 | "# confirm that the computer \"remembers\" what's in the `fuzzyPinkBunny` variable\n", 50 | "print(fuzzyPinkBunny)\n", 51 | "\n", 52 | "# but correct capitalization matters!\n", 53 | "# the following line will produce an error\n", 54 | "print(fuzzypinkbunny)" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3 (ipykernel)", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.9.5" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/page_count_conditional.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# fictional list of chapter page counts\n", 10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n", 11 | "\n", 12 | "# create variables to keep track of:\n", 13 | "# the total pages in the book\n", 14 | "total_pages = 0\n", 15 | "\n", 16 | "# the number of chapters with more than 30 pages,\n", 17 | "under_30 = 0\n", 18 | "\n", 19 | "# the number of chapters with fewer than 30 pages\n", 20 | "over_30 = 0\n", 21 | "\n", 22 | "# for every item in the page_counts list:\n", 23 | "for a_number in page_counts:\n", 24 | " # add the current number of pages to our total_pages count\n", 25 | " total_pages = total_pages + a_number\n", 26 | " # check if the current number of pages is more than 30\n", 27 | " if a_number > 30:\n", 28 | " # if so, add 1 to our over_30 counter\n", 29 | " over_30 = over_30 + 1\n", 30 | " # otherwise...\n", 31 | " else:\n", 32 | " # add 1 to our under_30 counter\n", 33 | " under_30 = under_30 + 1\n", 34 | "\n", 35 | "# print our various results\n", 36 | "print(total_pages)\n", 37 | "print(\"Number of chapters over 30 pages:\")\n", 38 | "print(over_30)\n", 39 | "print(\"Number of chapters under 30 pages:\")\n", 40 | "print(under_30)" 41 | ] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3 (ipykernel)", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.9.5" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/page_count_custom_function.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "code", 5 | "execution_count": null, 6 | "id": "9cbb94aa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# fictional list of chapter page counts\n", 11 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "93f3c196", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# define a new `count_pages()` function that takes one ingredient/argument:\n", 22 | "# a list of numbers\n", 23 | "def count_pages(page_count_list):\n", 24 | "\n", 25 | " # create variables to keep track of:\n", 26 | " # the total pages in the book\n", 27 | " total_pages = 0\n", 28 | "\n", 29 | " # the number of chapters with more than 30 pages,\n", 30 | " under_30 = 0\n", 31 | "\n", 32 | " # the number of chapters with fewer than 30 pages\n", 33 | " over_30 = 0\n", 34 | "\n", 35 | " # for every item in the page_count_list:\n", 36 | " for a_number in page_count_list:\n", 37 | "\n", 38 | " # add the current number of pages to our total_pages count\n", 39 | " total_pages = total_pages + a_number\n", 40 | "\n", 41 | " # check if the current number of pages is more than 30\n", 42 | " if a_number > 30:\n", 43 | "\n", 44 | " # if so, add 1 to our over_30 counter\n", 45 | " over_30 = over_30 + 1\n", 46 | "\n", 47 | " # otherwise...\n", 48 | " else:\n", 49 | "\n", 50 | " # add 1 to our under_30 counter\n", 51 | " under_30 = under_30 + 1\n", 52 | "\n", 53 | " # print our various results\n", 54 | " print(total_pages)\n", 55 | " print(\"Number of chapters over 30 pages:\")\n", 56 | " print(over_30)\n", 57 | " print(\"Number of chapters under 30 pages:\")\n", 58 | " print(under_30)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "1baf12a0", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# call/execute this \"recipe\", being sure to pass in our\n", 69 | "# actual list as an argument/ingredient\n", 70 | "count_pages(page_counts)" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3 (ipykernel)", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.9.5" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/page_count_loop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# fictional list of chapter page counts\n", 10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n", 11 | "\n", 12 | "# variable for tracking total page count; starting value is 0\n", 13 | "total_pages = 0\n", 14 | "\n", 15 | "# for every item in the list, perform some action\n", 16 | "for a_number in page_counts:\n", 17 | "\n", 18 | " # in this case, add the number to our \"total_pages\" variable\n", 19 | " total_pages = total_pages + a_number\n", 20 | "\n", 21 | "print(total_pages)" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3 (ipykernel)", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | 
"language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.9.5" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 4 46 | } 47 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/page_count_printout.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# fictional list of chapter page counts\n", 10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n", 11 | "\n", 12 | "# variable for tracking total page count; starting value is 0\n", 13 | "total_pages = 0\n", 14 | "\n", 15 | "# for every item in the list, perform some action\n", 16 | "\n", 17 | "for a_number in page_counts:\n", 18 | " print(\"Top of loop!\")\n", 19 | " print(\"The current item is:\")\n", 20 | " print(a_number)\n", 21 | " total_pages = total_pages + a_number\n", 22 | " print(\"The running total is:\")\n", 23 | " print(total_pages)\n", 24 | " print(\"Bottom of loop!\")\n", 25 | "\n", 26 | "print(total_pages)" 27 | ] 28 | } 29 | ], 30 | "metadata": { 31 | "kernelspec": { 32 | "display_name": "Python 3 (ipykernel)", 33 | "language": "python", 34 | "name": "python3" 35 | }, 36 | "language_info": { 37 | "codemirror_mode": { 38 | "name": "ipython", 39 | "version": 3 40 | }, 41 | "file_extension": ".py", 42 | "mimetype": "text/x-python", 43 | "name": "python", 44 | "nbconvert_exporter": "python", 45 | "pygments_lexer": "ipython3", 46 | "version": "3.9.5" 47 | } 48 | }, 49 | "nbformat": 4, 50 | "nbformat_minor": 4 51 | } 52 | -------------------------------------------------------------------------------- /chapter_2_examples/jupyter_notebooks/parts_of_speech.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## About Jupyter Notebooks\n", 8 | "\n", 9 | "Jupyter Notebooks have two relevant types of \"cells\": \n", 10 | "\n", 11 | "This is a \"markdown\" cell, which is useful for adding lightly-formatted context and documentation to your notebook. You can learn more about markdown here: https://www.markdownguide.org/cheat-sheet/\n", 12 | "\n", 13 | "The cells below are code cells. When you hit the \"play\" button next to a code cell, it essentially does a combination of two things:\n", 14 | "\n", 15 | "1. Runs the code\n", 16 | "2. Prints out the \"results\"\n", 17 | "\n", 18 | "This means that Jupyter Notebooks typically contain fewer `print` statements than standalone `.py` files, if any. In this book, I have kept the `print` statements from example code for consistency and clarity.\n", 19 | "\n", 20 | "Note that if you were to run any of the code snippets below in a standalone `.py` file, you would not see any output. Because these code statements are just the literal values, Jupyter Notebook prints the same thing that was entered in the cell originally, but this would require an explicit `print` statement in a standalone file. \n", 21 | "\n", 22 | "Also notice that the comments are _not_ printed, as we would expect!" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# A number is just digits\n", 32 | "25" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# A string is anything surrounded by matching quotation marks\n", 42 | "\"Hello World\"" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# A list is surrounded by square brackets, with commas between items\n", 52 | "# Note that in Python, the first item in a list is considered to be\n", 53 | "# in position `0`, the next in position `1` and so on\n", 54 | "[\"this\",\"is\",1,\"list\"]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# A dict is a set of key:value pairs, separated by commas and surrounded\n", 64 | "# by curly braces\n", 65 | "{\"title\":\"Practical Python for Data Wrangling and Data Quality\",\n", 66 | " \"format\": \"book\",\n", 67 | " \"author\": \"Susan E. McGregor\"\n", 68 | "}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# A Boolean is a data type that has only two values, true and false.\n", 78 | "True" 79 | ] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "Python 3 (ipykernel)", 85 | "language": "python", 86 | "name": "python3" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.9.5" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 4 103 | } 104 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/basic_greeting.py: -------------------------------------------------------------------------------- 1 | # create a variable named author 2 | author = "Susan E. McGregor" 3 | 4 | # create another variable named editor 5 | editor = "Jeff Bleiel" 6 | 7 | # use the built-in print function to output "Hello" messages to each person 8 | print("Hello "+author) 9 | print("Hello "+editor) 10 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/greet_me.py: -------------------------------------------------------------------------------- 1 | # create a function that prints out a greeting 2 | # to any name passed to the function 3 | def greet_me(a_name): 4 | print("Hello "+a_name) 5 | 6 | # create a variable named author 7 | author = "Susan E. McGregor" 8 | 9 | # create another variable named editor 10 | editor = "Jeff Bleiel" 11 | 12 | # use my custom function, `greet_me` to output "Hello" messages to each person 13 | greet_me(author) 14 | greet_me(editor) 15 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/hitting_the_road_with_citibike.py: -------------------------------------------------------------------------------- 1 | # Question: How many Citi Bike rides each day are taken by 2 | # "subscribers" versus "customers"? 3 | 4 | # Answer: Choose a single day of rides to examine. 
5 | 6 | # The dataset used for this exercise was generated from the original 7 | # Citi Bike system data found here: https://s3.amazonaws.com/tripdata/index.html 8 | # Filename: 202009-citibike-tripdata.csv.zip 9 | # Program Outline: 10 | # 1. Read in the data file: 202009CitibikeTripdataExample.csv 11 | # 2. Create variables to count: subscribers, customers, and other 12 | # 3. For each row in the file: 13 | # a. If the "User Type" is "Subscriber," add 1 to "subscriber_count" 14 | # b. If the "User Type" is "Customer," add 1 to "customer_count" 15 | # c. Otherwise, add 1 to the "other" variable 16 | # 4. Print out my results 17 | 18 | # import the `csv` library 19 | import csv 20 | 21 | # open the `202009CitibikeTripdataExample.csv` file in read ("r") mode 22 | # this file should be in the same folder as our Python script or notebook 23 | source_file = open("202009CitibikeTripdataExample.csv","r") 24 | 25 | # pass our `source_file` as an ingredient to the `csv` library's 26 | # DictReader "recipe". 27 | # Store the result in a variable called `citibike_reader` 28 | citibike_reader = csv.DictReader(source_file) 29 | 30 | # the DictReader method has added some useful information to our data, 31 | # like a `fieldnames` property that lets us access all the values 32 | # in the first or "header" row 33 | print(citibike_reader.fieldnames) 34 | 35 | # create a variable to hold the count of each type of Citi Bike user 36 | # assign or "initialize" each with a value of zero (0) 37 | subscriber_count = 0 38 | customer_count = 0 39 | other_user_count = 0 40 | 41 | # Step 3: Loop through every row of our data 42 | for a_row in citibike_reader: 43 | 44 | # Step 3a: if the value in the `usertype` column 45 | # of the current row is "Subscriber" 46 | if a_row["usertype"] == "Subscriber": 47 | 48 | # add 1 to `subscriber_count` 49 | subscriber_count = subscriber_count +1 50 | 51 | # Step 3b: otherwise (else), if the value in the `usertype` column 52 | # of the current row is "Customer" 53 | elif a_row["usertype"] == "Customer": 54 | 55 | # add 1 to `customer_count` 56 | customer_count = customer_count + 1 57 | 58 | # Step 3c: the `usertype` value is _neither_ "Subscriber" nor "Customer", 59 | # so we'll add 1 to our catch-all `other_user_count` variable 60 | else: 61 | other_user_count = other_user_count + 1 62 | 63 | # Step 4: Print out our results, being sure to include "labels" in the process: 64 | print("Number of subscribers:") 65 | print(subscriber_count) 66 | print("Number of customers:") 67 | print(customer_count) 68 | print("Number of 'other' users:") 69 | print(other_user_count) 70 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/method_madness.py: -------------------------------------------------------------------------------- 1 | # splitting a string "literal" and then printing the result 2 | split_world = "Hello World!".split() 3 | print(split_world) 4 | 5 | # assigning a string to a variable 6 | # then printing the result of calling the `split()` method on it 7 | world_msg = "Hello World!" 8 | print(world_msg.split()) 9 | 10 | # the following will produce an error because 11 | # the `split()` method must be called on a string in order to work!
12 | split("Hello World!") 13 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/noun_examples.py: -------------------------------------------------------------------------------- 1 | # create a variable named author, set its contents to "Susan E. McGregor" 2 | author = "Susan E. McGregor" 3 |  4 | # confirm that the computer "remembers" what's in the `author` variable 5 | print(author) 6 |  7 | # create a variable named nyc_resident, set its contents to "Susan E. McGregor" 8 | nyc_resident = "Susan E. McGregor" 9 |  10 | # confirm that the computer "remembers" what's in the `nyc_resident` variable 11 | print(nyc_resident) 12 |  13 | # create a variable named fuzzyPinkBunny, set its contents to "Susan E. McGregor" 14 | fuzzyPinkBunny = "Susan E. McGregor" 15 |  16 | # confirm that the computer "remembers" what's in the `fuzzyPinkBunny` variable 17 | print(fuzzyPinkBunny) 18 |  19 | # but correct capitalization matters! 20 | # the following line will produce an error 21 | print(fuzzypinkbunny) 22 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/page_count_conditional.py: -------------------------------------------------------------------------------- 1 | # fictional list of chapter page counts 2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30] 3 |  4 | # create variables to keep track of: 5 | # the total pages in the book 6 | total_pages = 0 7 |  8 | # the number of chapters with 30 or fewer pages, 9 | under_30 = 0 10 |  11 | # the number of chapters with more than 30 pages 12 | over_30 = 0 13 |  14 | # for every item in the page_counts list: 15 | for a_number in page_counts: 16 |     # add the current number of pages to our total_pages count 17 |     total_pages = total_pages + a_number 18 |     # check if the current number of pages is more than 30 19 |     if a_number > 30: 20 |         # if so, add 1 to our over_30 counter 21 |         over_30 = over_30 + 1 22 |     # otherwise... 23 |     else: 24 |         # add 1 to our under_30 counter 25 |         under_30 = under_30 + 1 26 |  27 | # print our various results 28 | print(total_pages) 29 | print("Number of chapters over 30 pages:") 30 | print(over_30) 31 | print("Number of chapters under 30 pages:") 32 | print(under_30) 33 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/page_count_custom_function.py: -------------------------------------------------------------------------------- 1 | # fictional list of chapter page counts 2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30] 3 |  4 | # define a new `count_pages()` function that takes one ingredient/argument: 5 | # a list of numbers 6 | def count_pages(page_count_list): 7 |  8 |     # create variables to keep track of: 9 |     # the total pages in the book 10 |     total_pages = 0 11 |  12 |     # the number of chapters with 30 or fewer pages, 13 |     under_30 = 0 14 |  15 |     # the number of chapters with more than 30 pages 16 |     over_30 = 0 17 |  18 |     # for every item in the page_count_list: 19 |     for a_number in page_count_list: 20 |  21 |         # add the current number of pages to our total_pages count 22 |         total_pages = total_pages + a_number 23 |  24 |         # check if the current number of pages is more than 30 25 |         if a_number > 30: 26 |  27 |             # if so, add 1 to our over_30 counter 28 |             over_30 = over_30 + 1 29 |  30 |         # otherwise...
31 | else: 32 | 33 | # add 1 to our under_30 counter 34 | under_30 = under_30 + 1 35 | 36 | # print our various results 37 | print(total_pages) 38 | print("Number of chapters over 30 pages:") 39 | print(over_30) 40 | print("Number of chapters under 30 pages:") 41 | print(under_30) 42 | 43 | # call/execute this "recipe", being sure to pass in our 44 | # actual list as an argument/ingredient 45 | count_pages(page_counts) 46 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/page_count_loop.py: -------------------------------------------------------------------------------- 1 | # fictional list of chapter page counts 2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30] 3 | 4 | # variable for tracking total page count; starting value is 0 5 | total_pages = 0 6 | 7 | # for every item in the list, perform some action 8 | for a_number in page_counts: 9 | 10 | # in this case, add the number to our "total_pages" variable 11 | total_pages = total_pages + a_number 12 | 13 | print(total_pages) 14 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/page_count_printout.py: -------------------------------------------------------------------------------- 1 | # fictional list of chapter page counts 2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30] 3 | 4 | # variable for tracking total page count; starting value is 0 5 | total_pages = 0 6 | 7 | # for every item in the list, perform some action 8 | 9 | for a_number in page_counts: 10 | print("Top of loop!") 11 | print("The current item is:") 12 | print(a_number) 13 | total_pages = total_pages + a_number 14 | print("The running total is:") 15 | print(total_pages) 16 | print("Bottom of loop!") 17 | 18 | print(total_pages) 19 | -------------------------------------------------------------------------------- /chapter_2_examples/standalone_files/parts_of_speech.py: -------------------------------------------------------------------------------- 1 | # A number is just digits 2 | 25 3 | 4 | # A string is anything surrounded by matching quotation marks 5 | "Hello World" 6 | 7 | # A list is surrounded by square brackets, with commas between items 8 | # Note that in Python, the first item in a list is considered to be 9 | # in position `0`, the next in position `1` and so on 10 | ["this","is",1,"list"] 11 | 12 | # A dict is a set of key:value pairs, separated by commas and surrounded 13 | # by curly braces 14 | {"title":"Practical Python for Data Wrangling and Data Quality", 15 | "format": "book", 16 | "author": "Susan E. McGregor" 17 | } 18 | 19 | # A Boolean is a data type that has only two values, true and false. 20 | True 21 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/csv_parsing.py: -------------------------------------------------------------------------------- 1 | # A simple example of reading data from a .csv file with Python 2 | # using the "csv" library. 
3 | # The source data was sampled from the Citi Bike system data: 4 | # https://drive.google.com/file/d/17b461NhSjf_akFWvjgNXQfqgh9iFxCu_/ 5 | # Which can be found here: 6 | # https://s3.amazonaws.com/tripdata/index.html 7 | 8 | # import the `csv` library 9 | import csv 10 | 11 | # open the `202009CitibikeTripdataExample.csv` file in read ("r") mode 12 | # this file should be in the same folder as our Python script or notebook 13 | source_file = open("202009CitibikeTripdataExample.csv","r") 14 | 15 | # pass our `source_file` as an ingredient to the the `csv` library's 16 | # DictReader "recipe". 17 | # Store the result in a variable called `citibike_reader` 18 | citibike_reader = csv.DictReader(source_file) 19 | 20 | # the DictReader method has added some useful information to our data, 21 | # like a `fieldnames` property that lets us access all the values 22 | # in the first or "header" row 23 | print(citibike_reader.fieldnames) 24 | 25 | # let's just print out the first 5 rows 26 | for i in range(0,5): 27 | print (next(citibike_reader)) 28 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/fixed_width_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from a fixed-width file with Python. 2 | # The source file for this example comes from NOAA and can be accessed here: 3 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt 4 | # The metadata for the file can be found here: 5 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt 6 | 7 | # import the `csv` library, to create our output file 8 | import csv 9 | 10 | filename = "ghcnd-stations" 11 | 12 | # reading from a basic text file doesn't require any special libraries 13 | # so we'll just open the file in read format ("r") as usual 14 | source_file = open(filename+".txt", "r") 15 | 16 | # the built-in "readlines()" method does just what you'd think: 17 | # it reads in a text file and converts it to a list of lines 18 | stations_list = source_file.readlines() 19 | 20 | # create an output file for our transformed data 21 | output_file = open(filename+".csv","w") 22 | 23 | # use the `csv` library's "writer" recipe to easily write rows of data 24 | # to `output_file`, instead of reading data *from* it 25 | output_writer = csv.writer(output_file) 26 | 27 | # create the header list 28 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME","GSN_FLAG", 29 | "HCNCRN_FLAG","WMO_ID"] 30 | 31 | # write our headers to the output file 32 | output_writer.writerow(headers) 33 | 34 | # loop through each line of our file (multiple "sheets" are not possible) 35 | for line in stations_list: 36 | # create an empty list, to which we'll append each set of characters that 37 | # makes up a given "column" of data 38 | new_row = [] 39 | # ID: positions 1-11 40 | new_row.append(line[0:11]) 41 | # LATITUDE: positions 13-20 42 | new_row.append(line[12:20]) 43 | # LONGITUDE: positions 22-30 44 | new_row.append(line[21:30]) 45 | # ELEVATION: positions 32-37 46 | new_row.append(line[31:37]) 47 | # STATE: positions 39-40 48 | new_row.append(line[38:40]) 49 | # NAME: positions 42-71 50 | new_row.append(line[41:71]) 51 | # GSN_FLAG: positions 73-75 52 | new_row.append(line[72:75]) 53 | # HCNCRN_FLAG: positions 77-79 54 | new_row.append(line[76:79]) 55 | # WMO_ID: positions 81-85 56 | new_row.append(line[80:85]) 57 | 58 | # now all that's left is to use the 59 | # `writerow` function to write 
new_row to our output file 60 | output_writer.writerow(new_row) 61 | 62 | # officially close the `.csv` file we just wrote all that data to 63 | output_file.close() 64 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/json_parsing.py: -------------------------------------------------------------------------------- 1 | # A simple example of reading data from a .json file with Python, 2 | # using the built-in "json" library. The data used here is an instance of 3 | # https://api.stlouisfed.org/fred/series/observations?series_id=U6RATE& \ 4 | # file_type=json&api_key=YOUR_API_KEY_HERE 5 | 6 | # import the `json` library, since that's our source file format 7 | import json 8 | 9 | # import the `csv` library, to create our output file 10 | import csv 11 | 12 | # choose a filename 13 | filename = "U6_FRED_data" 14 | 15 | # open the file in read format ("r") as usual 16 | json_source_file = open(filename+".json","r") 17 | 18 | # pass the `json_source_file` as an ingredient to the json library's `load()` 19 | # method and store the result in a variable called `json_data` 20 | json_data = json.load(json_source_file) 21 | 22 | # create our output file, naming it "json_"+filename 23 | output_file = open("json_"+filename+".csv","w") 24 | 25 | # use the `csv` library's "writer" recipe to easily write rows of data 26 | # to `output_file`, instead of reading data *from* it 27 | output_writer = csv.writer(output_file) 28 | 29 | # grab the first element (at position "0"), and use its keys as the column headers 30 | output_writer.writerow(list(json_data["observations"][0].keys())) 31 | 32 | for obj in json_data["observations"]: 33 | 34 | # we'll create an empty list where we'll put the actual values of each object 35 | obj_values = [] 36 | 37 | # for every `key` (which will become a column), in each object 38 | for key, value in obj.items(): 39 | 40 | # let's print what's in here, just to see how the code sees it 41 | print(key,value) 42 | 43 | # add the values to our list 44 | obj_values.append(value) 45 | 46 | # now we've got the whole row, write the data to our output file 47 | output_writer.writerow(obj_values) 48 | 49 | # officially close the `.csv` file we just wrote all that data to 50 | output_file.close() 51 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/ods_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from an .ods file with Python, using the 2 | # "pyexcel_ods" library. 
First, you'll need to pip install the library: 3 | # https://pypi.org/project/pyexcel-ods/ 4 | 5 | # specify the "chapter" of the "pyexcel_ods" library you want to import, 6 | # in this case, `get_data` 7 | from pyexcel_ods import get_data 8 | 9 | # import the `csv` library, to create our output file 10 | import csv 11 | 12 | # pass our filename as an ingredient to the `pyexcel_ods` library's 13 | # `get_data()` "recipe" 14 | # store the result in a variable called `source_workbook` 15 | source_workbook = get_data("fredgraph.ods") 16 | 17 | # an `.ods` workbook can have multiple sheets 18 | for sheet_name, sheet_data in source_workbook.items(): 19 | 20 | # print `sheet_name`, just to see what it is 21 | print(sheet_name) 22 | 23 | # create "ods_"+sheet_name+".csv" as an output file for the current sheet 24 | output_file = open("ods_"+sheet_name+".csv","w") 25 | 26 | # use this csv library's "writer" recipe to easily write rows of data 27 | # to `output_file`, instead of reading data *from* it 28 | output_writer = csv.writer(output_file) 29 | 30 | # now, we need to loop through every row in our sheet 31 | for row in sheet_data: 32 | 33 | # use the `writerow` recipe to write each `row` 34 | # directly to our output file 35 | output_writer.writerow(row) 36 | 37 | # officially close the `.csv` file we just wrote all that data to 38 | output_file.close() 39 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/pdf_parsing.py: -------------------------------------------------------------------------------- 1 | # A basic example of reading data from a .pdf file with Python, 2 | # using `pdf2image` to convert it to images, and then using the 3 | # openCV and `tesseract` libraries to extract the text 4 | # The source data was downloaded from: 5 | # https://files.stlouisfed.org/files/htdocs/publications/page1-econ/2020/12/01/ \ 6 | # unemployment-insurance-a-tried-and-true-safety-net_SE.pdf 7 | 8 | # the built-in `operating system` or `os` Python library will let us create 9 | # a new folder in which to store our converted images and output text 10 | import os 11 | 12 | # we'll import the `convert_from_path` "chapter" of the `pdf2image` library 13 | from pdf2image import convert_from_path 14 | 15 | # the built-in `glob`library offers a handy way to loop through all the files 16 | # in a folder that have a certain file extension, for example 17 | import glob 18 | 19 | # `cv2` is the actual library name for `openCV` 20 | import cv2 21 | 22 | # and of course, we need our Python library for interfacing 23 | # with the tesseract OCR process 24 | import pytesseract 25 | 26 | # we'll use the pdf name to name both our generated images and text files 27 | pdf_name = "SafetyNet" 28 | 29 | # our source pdf is in the same folder as our Python script 30 | pdf_source_file = pdf_name+".pdf" 31 | 32 | # as long as a folder with the same name as the pdf does not already exist 33 | if os.path.isdir(pdf_name) == False: 34 | # create a new folder with that name 35 | target_folder = os.mkdir(pdf_name) 36 | 37 | # store all the pages of the PDF in a variable 38 | pages = convert_from_path(pdf_source_file, 300) 39 | 40 | # loop through all the converted pages, enumerating them so that the page 41 | # number can be used to label the resulting images 42 | for page_num, page in enumerate(pages): 43 | # create unique filenames for each page image, combining the 44 | # folder name and the page number 45 | filename = 
os.path.join(pdf_name,"p"+str(page_num)+".png") 46 | # save the image of the page in system 47 | page.save(filename, 'PNG') 48 | 49 | # next, go through all the files in the folder that end in `.png` 50 | for img_file in glob.glob(os.path.join(pdf_name, '*.png')): 51 | # replace the slash in the image's filename with a dot 52 | temp_name = img_file.replace("/",".") 53 | # pull the unique page name (e.g. `p2`) from the `temp_name` 54 | text_filename = temp_name.split(".")[1] 55 | # now! create a new, writable file, also in our target folder, that 56 | # has the same name as the image, but is a `.txt` file 57 | output_file = open(os.path.join(pdf_name,text_filename+".txt"), "w") 58 | # use the `cv2` library to interpret our image 59 | img = cv2.imread(img_file) 60 | # create a new variable to hold the results of using pytesseract's 61 | # `image_to_string()` function, which will do just that 62 | converted_text = pytesseract.image_to_string(img) 63 | # write our extracted text to our output file 64 | output_file.write(converted_text) 65 | # close the output file 66 | output_file.close() 67 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/rss_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from an .xml file with Python, using the "lxml" 2 | # library. 3 | # First, you'll need to pip install the lxml library: 4 | # https://pypi.org/project/lxml/ 5 | # The data used here is an instance of 6 | # http://feeds.bbci.co.uk/news/science_and_environment/rss.xml 7 | 8 | # specify the "chapter" of the `lxml` library you want to import, 9 | # in this case, `etree`, which stands for "ElementTree" 10 | from lxml import etree 11 | 12 | # import the `csv` library, to create our output file 13 | import csv 14 | 15 | # choose a filename, for simplicity 16 | filename = "BBC News - Science Environment XML Feed" 17 | 18 | # open our data file in read format, using "rb" as the "mode" 19 | xml_source_file = open(filename+".xml","rb") 20 | 21 | # pass our xml_source_file as an ingredient to the the `lxml` library's 22 | # `etree.parse()` method and store the result in a variable called `xml_doc` 23 | xml_doc = etree.parse(xml_source_file) 24 | 25 | # start by getting the current xml document's "root" element 26 | document_root = xml_doc.getroot() 27 | 28 | # if the document_root is a well-formed XML element 29 | if etree.iselement(document_root): 30 | 31 | # create our output file, naming it "rss_"+filename+".csv" 32 | output_file = open("rss_"+filename+".csv","w") 33 | 34 | # use the `csv` library's "writer" recipe to easily write rows of data 35 | # to `output_file`, instead of reading data *from* it 36 | output_writer = csv.writer(output_file) 37 | 38 | # document_root[0] is the "channel" element 39 | main_channel = document_root[0] 40 | 41 | # the `find()` method returns *only* the first instance of the element name 42 | article_example = main_channel.find('item') 43 | 44 | # create an empty list in which to store our future column headers 45 | tag_list = [] 46 | for child in article_example.iterdescendants(): 47 | 48 | # add each tag to our would-be header list 49 | tag_list.append(child.tag) 50 | 51 | # if the current tag has any attributes 52 | if child.attrib: 53 | 54 | # loop through the attribute keys in the tag 55 | for attribute_name in child.attrib.keys(): 56 | 57 | # append the attribute name to our `tag_list` column headers 58 | 
tag_list.append(attribute_name) 59 | 60 | # write the contents of `tag_list` to our output file as column headers 61 | output_writer.writerow(tag_list) 62 | 63 | # now we want to grab *every* elment in our file 64 | # so we use the `findall()` method instead of `find()` 65 | for item in main_channel.findall('item'): 66 | 67 | # empty list for holding our new row's content 68 | new_row = [] 69 | 70 | # now we'll use our list of tags to get the contents of each element 71 | for tag in tag_list: 72 | 73 | # if there is anything in the element with a given tag name 74 | if item.findtext(tag): 75 | 76 | # append it to our new row 77 | new_row.append(item.findtext(tag)) 78 | 79 | # otherwise, make sure it's the "isPermaLink" attribute 80 | elif tag == "isPermaLink": 81 | 82 | # grab its value from the element 83 | # and append it to our row 84 | new_row.append(item.find('guid').get("isPermaLink")) 85 | 86 | # write the new row to our output file! 87 | output_writer.writerow(new_row) 88 | 89 | # officially close the `.csv` file we just wrote all that data to 90 | output_file.close() 91 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/tsv_parsing.py: -------------------------------------------------------------------------------- 1 | # A simple example of reading data from a .tsv file with Python, using 2 | # the `csv` library. The source data was downloaded as a .tsv file 3 | # from Jed Shugerman's Google Sheet on prosecutor politicians: 4 | # https://docs.google.com/spreadsheets/d/1E6Z-jZWbrKmit_4lG36oyQ658Ta6Mh25HCOBaz7YVrA/ 5 | 6 | # import the `csv` library 7 | import csv 8 | 9 | # open the `ShugermanProsecutorPoliticians-SupremeCourtJustices.tsv` file 10 | # in read ("r") mode. 11 | # This file should be in the same folder as our Python script or notebook 12 | tsv_source_file = open("ShugermanProsecutorPoliticians-SupremeCourtJustices.tsv","r") 13 | 14 | # pass our `tsv_source_file` as an ingredient to the the csv library's 15 | # DictReader "recipe." 16 | # Store the result in a variable called `politicians_reader` 17 | politicians_reader = csv.DictReader(tsv_source_file, delimiter='\t') 18 | 19 | # the DictReader method has added some useful information to our data, 20 | # like a `fieldnames` property that lets us access all the values 21 | # in the first or "header" row 22 | print(politicians_reader.fieldnames) 23 | 24 | # we'll use the `next()` function to print just the first row of data 25 | print (next(politicians_reader)) 26 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/txt_parsing.py: -------------------------------------------------------------------------------- 1 | # A simple example of reading data from a .tsv file with Python, using 2 | # the `csv` library. The source data was downloaded as a .tsv file 3 | # from Jed Shugerman's Google Sheet on prosecutor politicians: 4 | # https://docs.google.com/spreadsheets/d/1E6Z-jZWbrKmit_4lG36oyQ658Ta6Mh25HCOBaz7YVrA/ 5 | # The original .tsv file was renamed with a file extension of .txt 6 | 7 | # import the `csv` library 8 | import csv 9 | 10 | # open the `ShugermanProsecutorPoliticians-SupremeCourtJustices.txt` file 11 | # in read ("r") mode. 
12 | # This file should be in the same folder as our Python script or notebook 13 | txt_source_file = open("ShugermanProsecutorPoliticians-SupremeCourtJustices.txt","r") 14 |  15 | # pass our txt_source_file as an ingredient to the csv library's DictReader 16 | # "recipe" and store the result in a variable called `politicians_reader` 17 | # add the "delimiter" parameter and specify the tab character, "\t" 18 | politicians_reader = csv.DictReader(txt_source_file, delimiter='\t') 19 |  20 | # the DictReader function has added useful information to our data, 21 | # like a label that shows us all the values in the first or "header" row 22 | print(politicians_reader.fieldnames) 23 |  24 | # we'll use the `next()` function to print just the first row of data 25 | print (next(politicians_reader)) 26 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/xls_parsing.py: -------------------------------------------------------------------------------- 1 | # A simple example of reading data from a .xls file with Python 2 | # using the "xlrd" library. First, pip install the xlrd library: 3 | # https://pypi.org/project/xlrd/2.0.1/ 4 |  5 | # import the "xlrd" library 6 | import xlrd 7 |  8 | # import the `csv` library, to create our output file 9 | import csv 10 |  11 | # pass our filename as an ingredient to the `xlrd` library's 12 | # `open_workbook()` "recipe" 13 | # store the result in a variable called `source_workbook` 14 | source_workbook = xlrd.open_workbook("fredgraph.xls") 15 |  16 | # an `.xls` workbook can have multiple sheets 17 | for sheet_name in source_workbook.sheet_names(): 18 |  19 |     # create a variable that points to the current worksheet by 20 |     # passing the current value of `sheet_name` to the `sheet_by_name` recipe 21 |     current_sheet = source_workbook.sheet_by_name(sheet_name) 22 |  23 |     # print `sheet_name`, just to see what it is 24 |     print(sheet_name) 25 |  26 |     # create "xls_"+sheet_name+".csv" as an output file for the current sheet 27 |     output_file = open("xls_"+sheet_name+".csv","w") 28 |  29 |     # use the `csv` library's "writer" recipe to easily write rows of data 30 |     # to `output_file`, instead of reading data *from* it 31 |     output_writer = csv.writer(output_file) 32 |  33 |     # now, we need to loop through every row in our sheet 34 |     for row_num, row in enumerate(current_sheet.get_rows()): 35 |  36 |         # each row is already a list, but we need to use the `row_values()` 37 |         # method to access them 38 |         # then we can use the `writerow` recipe to write them 39 |         # directly to our output file 40 |         output_writer.writerow(current_sheet.row_values(row_num)) 41 |  42 |     # officially close the `.csv` file we just wrote all that data to 43 |     output_file.close() 44 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/xlsx_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from an .xlsx file with Python, using the "openpyxl" 2 | # library.
First, you'll need to pip install the openpyxl library: 3 | # https://pypi.org/project/openpyxl/ 4 | # The source data can be composed and downloaded from: 5 | # https://fred.stlouisfed.org/series/U6RATE 6 | 7 | # specify the "chapter" you want to import from the "openpyxl" library 8 | # in this case, "load_workbook" 9 | from openpyxl import load_workbook 10 | 11 | # import the `csv` library, to create our output file 12 | import csv 13 | 14 | # Pass our filename as an ingredient to the `openpyxl` library's 15 | # `load_workbook()` "recipe" 16 | # store the result in a variable called `source_workbook` 17 | source_workbook = load_workbook(filename = 'fredgraph.xlsx') 18 | 19 | # an .xlsx workbook can have multiple sheets 20 | # print their names here for reference 21 | print(source_workbook.sheetnames) 22 | 23 | # loop through the worksheets in `source_workbook` 24 | for sheet_num, sheet_name in enumerate(source_workbook.sheetnames): 25 | 26 | # create a variable that points to the current worksheet by 27 | # passing the current value of `sheet_name` to `source_workbook` 28 | current_sheet = source_workbook[sheet_name] 29 | 30 | # print `sheet_name`, just to see what it is 31 | print(sheet_name) 32 | 33 | # create an output file called "xlsx_"+sheet_name 34 | output_file = open("xlsx_"+sheet_name+".csv","w") 35 | 36 | # use this csv library's "writer" recipe to easily write rows of data 37 | # to `output_file`, instead of reading data *from* it 38 | output_writer = csv.writer(output_file) 39 | 40 | # loop through every row in our sheet 41 | for row in current_sheet.iter_rows(): 42 | 43 | # we'll create an empty list where we'll put the actual 44 | # values of the cells in each row 45 | row_cells = [] 46 | 47 | # for every cell (or column) in each row.... 48 | for cell in row: 49 | 50 | # let's print what's in here, just to see how the code sees it 51 | print(cell, cell.value) 52 | 53 | # add the values to the end of our list with the `append()` method 54 | row_cells.append(cell.value) 55 | 56 | # write our newly (re)constructed data row to the output file 57 | output_writer.writerow(row_cells) 58 | 59 | # officially close the `.csv` file we just wrote all that data to 60 | output_file.close() 61 | -------------------------------------------------------------------------------- /chapter_4_examples/standalone_files/xml_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from a .xml file with Python, using the "lxml" 2 | # library. 
3 | # First, you'll need to pip install the lxml library: 4 | # https://pypi.org/project/lxml/ 5 | # A helpful tutorial can be found here: https://lxml.de/tutorial.html 6 | # The data used here is an instance of 7 | # https://api.stlouisfed.org/fred/series/observations?series_id=U6RATE& \ 8 | # api_key=YOUR_API_KEY_HERE 9 | 10 | # specify the "chapter" of the `lxml` library you want to import, 11 | # in this case, `etree`, which stands for "ElementTree" 12 | from lxml import etree 13 | 14 | # import the `csv` library, to create our output file 15 | import csv 16 | 17 | # choose a filename 18 | filename = "U6_FRED_data" 19 | 20 | # open our data file in read format, using "rb" as the "mode" 21 | xml_source_file = open(filename+".xml","rb") 22 | 23 | # pass our xml_source_file as an ingredient to the the `lxml` library's 24 | # `etree.parse()` method and store the result in a variable called `xml_doc` 25 | xml_doc = etree.parse(xml_source_file) 26 | 27 | # start by getting the current xml document's "root" element 28 | document_root = xml_doc.getroot() 29 | 30 | # let's print it out to see what it looks like 31 | print(etree.tostring(document_root)) 32 | 33 | # confirm that `document_root` is a well-formed XML element 34 | if etree.iselement(document_root): 35 | 36 | # create our output file, naming it "xml_"+filename+".csv 37 | output_file = open("xml_"+filename+".csv","w") 38 | 39 | # use the `csv` library's "writer" recipe to easily write rows of data 40 | # to `output_file`, instead of reading data *from* it 41 | output_writer = csv.writer(output_file) 42 | 43 | # grab the first element of our xml document (using `document_root[0]`) 44 | # and write its attribute keys as column headers to our output file 45 | output_writer.writerow(document_root[0].attrib.keys()) 46 | 47 | # now, we need to loop through every element in our XML file 48 | for child in document_root: 49 | 50 | # now we'll use the `.values()` method to get each element's values 51 | # as a list, and then use that directly with the `writerow` recipe 52 | output_writer.writerow(child.attrib.values()) 53 | 54 | # officially close the `.csv` file we just wrote all that data to 55 | output_file.close() 56 | -------------------------------------------------------------------------------- /chapter_5_examples/.gitignore: -------------------------------------------------------------------------------- 1 | # ignoring all credentials files 2 | **credentials* 3 | -------------------------------------------------------------------------------- /chapter_5_examples/jupyter_notebooks/FRED_API_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# import the requests library, which let's us write Python that acts like\n", 10 | "# a web browser through code\n", 11 | "import requests" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 21 | "# # Import PyDrive and associated libraries.\n", 22 | "# # This only needs to be done once per notebook.\n", 23 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 24 | "# from pydrive.auth import GoogleAuth\n", 25 | "# from pydrive.drive import GoogleDrive\n", 26 | "# from google.colab import auth\n", 27 | "# from 
oauth2client.client import GoogleCredentials\n", 28 | "\n", 29 | "# # Authenticate and create the PyDrive client.\n", 30 | "# # This only needs to be done once per notebook.\n", 31 | "# auth.authenticate_user()\n", 32 | "# gauth = GoogleAuth()\n", 33 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 34 | "# drive = GoogleDrive(gauth)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 44 | "# # Link to data file stored in Drive: LINK TO YOUR CREDENTIALS FILE ON DRIVE\n", 45 | "# file_id = 'FILE_ID_OF_YOUR_CREDENTIALS_FILE_ON_DRIVE' # notice where this string comes from in link above\n", 46 | "\n", 47 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 48 | "# print(imported_file['title']) # it should print the title of desired file\n", 49 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# we can import our API key by first giving Python the name of our credentials\n", 59 | "# file, and then telling it the variable to import\n", 60 | "from FRED_credentials import my_api_key" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# specify the FRED endpoint we want to use\n", 70 | "FRED_endpoint = \"https://api.stlouisfed.org/fred/series/observations?\"\n", 71 | "\n", 72 | "# also specify the query parameters and their values\n", 73 | "FRED_parameters = \"series_id=U6RATE&file_type=json\"\n", 74 | "\n", 75 | "# construct the complete URL for our API request, adding our API key to the end\n", 76 | "complete_data_URL = FRED_endpoint + FRED_parameters +\"&api_key=\"+my_api_key\n", 77 | "\n", 78 | "# open a new, writable file with our chosen filename\n", 79 | "FRED_output_file = open(\"FRED_API_data.json\",\"w\")\n", 80 | "\n", 81 | "# use the requests library's \"get\" recipe to access the contents of our\n", 82 | "# target URL and store it in a our `FRED_data` variable\n", 83 | "FRED_data = requests.get(complete_data_URL)\n", 84 | "\n", 85 | "# the requests library's \"get\" function has put the contents of the webpage\n", 86 | "# in a property \"text\", which we'll write directly to our FRED_output_file\n", 87 | "# using the built-in \"write\" method\n", 88 | "FRED_output_file.write(FRED_data.text)\n", 89 | "\n", 90 | "# close our FRED_output_file\n", 91 | "FRED_output_file.close()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 101 | "# from google.colab import files\n", 102 | "\n", 103 | "# files.download(\"FRED_API_data.json\")" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.9.5" 124 | } 125 | }, 126 | "nbformat": 4, 
127 | "nbformat_minor": 4 128 | } 129 | -------------------------------------------------------------------------------- /chapter_5_examples/jupyter_notebooks/MTA_turnstiles_index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# include the requests library in order to get data from the web\n", 10 | "import requests" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# specify the URL of the web page we're downloading\n", 20 | "# this one contains a linked list of all the NYC MTA turnstile data files\n", 21 | "# going back to 2010\n", 22 | "mta_turnstiles_index_url = \"http://web.mta.info/developers/turnstile.html\"" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# create some header information for our web page request\n", 32 | "headers = {\n", 33 | " 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \\\n", 34 | " 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \\\n", 35 | " 'Chrome/88.0.4324.109 Safari/537.36',\n", 36 | " 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som'\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# send a `get()` request for the URL, along with our informational headers\n", 47 | "mta_web_page = requests.get(mta_turnstiles_index_url, headers=headers)\n", 48 | "\n", 49 | "# open up a writable local file where we can save the contents of the web page\n", 50 | "mta_turnstiles_output_file = open(\"MTA_turnstiles_index.html\",\"w\")\n", 51 | "\n", 52 | "# write the `text` web page to our output file\n", 53 | "mta_turnstiles_output_file.write(mta_web_page.text)\n", 54 | "\n", 55 | "# close our output file!\n", 56 | "mta_turnstiles_output_file.close()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 66 | "# from google.colab import files\n", 67 | "\n", 68 | "# files.download(\"MTA_turnstiles_index.html\")" 69 | ] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3 (ipykernel)", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.9.5" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 4 93 | } 94 | -------------------------------------------------------------------------------- /chapter_5_examples/jupyter_notebooks/data_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# A basic example of downloading data from the web with Python,\n", 10 | "# using the requests library\n", 11 | "#\n", 12 | "# The source data we are downloading will come from the following URLs:\n", 13 | "# http://feeds.bbci.co.uk/news/science_and_environment/rss.xml\n", 14 | "# 
https://gbfs.citibikenyc.com/gbfs/en/station_status.json\n", 15 | "\n", 16 | "# the `requests` library lets us write Python code that acts like\n", 17 | "# a web browser\n", 18 | "import requests" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 28 | "# from google.colab import files" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# our chosen XML filename\n", 38 | "XMLfilename = \"BBC_RSS.xml\"\n", 39 | "\n", 40 | "# open a new, writable file for our XML output\n", 41 | "xml_output_file = open(XMLfilename,\"w\")\n", 42 | "\n", 43 | "# use the requests library's \"get\" recipe to access the contents of our\n", 44 | "# target URL and store it in our `xml_data` variable\n", 45 | "xml_data = requests.get('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')\n", 46 | "\n", 47 | "# the requests library's `get()` function puts contents of the web page\n", 48 | "# in a property `text`\n", 49 | "# we'll `write` that directly to our `xml_output_file`\n", 50 | "xml_output_file.write(xml_data.text)\n", 51 | "\n", 52 | "# close our xml_output_file\n", 53 | "xml_output_file.close()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 63 | "# files.download(XMLfilename)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# our chosen JSON filename\n", 73 | "JSONfilename = \"citibikenyc_station_status.json\"\n", 74 | "\n", 75 | "# open a new, writable file for our JSON output\n", 76 | "json_output_file = open(JSONfilename,\"w\")\n", 77 | "\n", 78 | "# use the `requests` library's `get()` recipe to access the contents of our\n", 79 | "# target URL and store it in our `json_data` variable\n", 80 | "json_data = requests.get('https://gbfs.citibikenyc.com/gbfs/en/station_status.json')\n", 81 | "\n", 82 | "# `get()` the contents of the web page and write its `text`\n", 83 | "# directly to `json_output_file`\n", 84 | "json_output_file.write(json_data.text)\n", 85 | "\n", 86 | "# close our json_output_file\n", 87 | "json_output_file.close()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 97 | "# files.download(JSONfilename)" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3 (ipykernel)", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.9.5" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 4 122 | } 123 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/FRED_API_example.py: -------------------------------------------------------------------------------- 1 | # import the requests library, which let's us write Python that acts like 2 | # a web browser through code 3 | import requests 
4 | 5 | # we can import our API key by first giving Python the name of our credentials 6 | # file, and then telling it the variable to import 7 | from FRED_credentials import my_api_key 8 | 9 | # specify the FRED endpoint we want to use 10 | FRED_endpoint = "https://api.stlouisfed.org/fred/series/observations?" 11 | 12 | # also specify the query parameters and their values 13 | FRED_parameters = "series_id=U6RATE&file_type=json" 14 | 15 | # construct the complete URL for our API request, adding our API key to the end 16 | complete_data_URL = FRED_endpoint + FRED_parameters +"&api_key="+my_api_key 17 | 18 | # open a new, writable file with our chosen filename 19 | FRED_output_file = open("FRED_API_data.json","w") 20 | 21 | # use the requests library's "get" recipe to access the contents of our 22 | # target URL and store it in a our `FRED_data` variable 23 | FRED_data = requests.get(complete_data_URL) 24 | 25 | # the requests library's "get" function has put the contents of the webpage 26 | # in a property "text", which we'll write directly to our FRED_output_file 27 | # using the built-in "write" method 28 | FRED_output_file.write(FRED_data.text) 29 | 30 | # close our FRED_output_file 31 | FRED_output_file.close() 32 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/MTA_turnstile_index.py: -------------------------------------------------------------------------------- 1 | # include the requests library in order to get data from the web 2 | import requests 3 | 4 | # specify the URL of the web page we're downloading 5 | # this one contains a linked list of all the NYC MTA turnstile data files 6 | # going back to 2010 7 | mta_turnstiles_index_url = "http://web.mta.info/developers/turnstile.html" 8 | 9 | # create some header information for our web page request 10 | headers = { 11 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \ 12 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \ 13 | 'Chrome/88.0.4324.109 Safari/537.36', 14 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som' 15 | } 16 | 17 | # send a `get()` request for the URL, along with our informational headers 18 | mta_web_page = requests.get(mta_turnstiles_index_url, headers=headers) 19 | 20 | # open up a writable local file where we can save the contents of the web page 21 | mta_turnstiles_output_file = open("MTA_turnstiles_index.html","w") 22 | 23 | # write the `text` web page to our output file 24 | mta_turnstiles_output_file.write(mta_web_page.text) 25 | 26 | # close our output file! 
27 | mta_turnstiles_output_file.close() 28 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/MTA_turnstiles_data_download.py: -------------------------------------------------------------------------------- 1 | # include the requests library in order to get data from the web 2 | import requests 3 | 4 | # import the `os` Python library so we can create a new folder 5 | # in which to store our downloaded data files 6 | import os 7 | 8 | # import the `time` library 9 | import time 10 | 11 | # open the file where we stored our list of links 12 | mta_data_links = open("MTA_data_index.csv","r") 13 | 14 | # create a folder name so that we can keep the data organized 15 | folder_name = "turnstile_data" 16 | 17 | # add our header information 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \ 20 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \ 21 | 'Chrome/88.0.4324.109 Safari/537.36', 22 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som' 23 | } 24 | # the built-in `readlines()` function converts our data file to a 25 | # list, where each line is an item 26 | mta_links_list = mta_data_links.readlines() 27 | # confirm there isn't already a folder with our chosen name 28 | if os.path.isdir(folder_name) == False: 29 | # create a new folder with that name 30 | target_folder = os.mkdir(folder_name) 31 | 32 | # only download the precise number of files we need 33 | for i in range(0,4): 34 | 35 | # use the built-in `strip()` method to remove the newline (`\n`) 36 | # character at the end of each row/link 37 | data_url = (mta_links_list[i]).strip() 38 | 39 | # create a unique output filename based on the url 40 | data_filename = data_url.split("/")[-1] 41 | 42 | # make our request for the data 43 | turnstile_data_file = requests.get(data_url, headers=headers) 44 | 45 | # open a new, writable file inside our target folder 46 | # using the appropriate filename 47 | local_data_file = open(os.path.join(folder_name,data_filename), "w") 48 | 49 | # save the contents of the downloaded file to that new file 50 | local_data_file.write(turnstile_data_file.text) 51 | 52 | # close the local file 53 | local_data_file.close() 54 | 55 | # `sleep()` for two seconds before moving on to the next item in the loop 56 | time.sleep(2) 57 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/MTA_turnstiles_parsing.py: -------------------------------------------------------------------------------- 1 | # import the Beautiful Soup recipe from the bs4 library 2 | from bs4 import BeautifulSoup 3 | 4 | # open the saved copy of our MTA turnstiles web page 5 | # (original here: http://web.mta.info/developers/turnstile.html) 6 | mta_web_page = open("MTA_turnstiles_index.html", "r") 7 | 8 | # define the base URL for the data files 9 | base_url = "http://web.mta.info/developers/" 10 | 11 | # the `BeautifulSoup` recipe takes the contents of our web page and another 12 | # "ingredient", which tells it what kind of code it is working with 13 | # In this case, it's HTML 14 | soup = BeautifulSoup(mta_web_page, "html.parser") 15 | 16 | # using the "find" recipe, we can pass a tag type and class name as 17 | # "ingredients" to zero in on the content we want. 
18 | data_files_section = soup.find("div", class_="span-84 last") 19 | 20 | # within that div, we can now just look for all the "anchor" (`a`) tags 21 | all_data_links = data_files_section.find_all("a") 22 | 23 | # need to open a file to write our extracted links to 24 | mta_data_list = open("MTA_data_index.csv","w") 25 | 26 | # the `find_all()` recipe returns a list of everything it matches 27 | for a_link in all_data_links: 28 | 29 | # combine our base URL with the contents of each "href" (link) property, 30 | # and store it in `complete_link` 31 | complete_link = base_url+a_link["href"] 32 | 33 | # write this completed link to our output file, manually adding a 34 | # newline `\n` character to the end, so each link will be on its own row 35 | mta_data_list.write(complete_link+"\n") 36 | 37 | # once we've written all the links to our file, close it! 38 | mta_data_list.close() 39 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/Twitter_data_download.py: -------------------------------------------------------------------------------- 1 | # import the encoded key from our credentials file 2 | from Twitter_credentials import auth_ready_key 3 | 4 | # include the requests library in order to get data from the web 5 | import requests 6 | 7 | # specify the Twitter endpoint that we'll use to retrieve 8 | # our access token or "bearer" token 9 | auth_url = 'https://api.twitter.com/oauth2/token' 10 | 11 | # add our `auth_ready_key` to a template `dict` object provided 12 | # in the Twitter API documentation 13 | auth_headers = { 14 | 'Authorization': 'Basic '+auth_ready_key, 15 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8' 16 | } 17 | 18 | # another `dict` describes what we're asking for 19 | auth_data = { 20 | 'grant_type': 'client_credentials' 21 | } 22 | 23 | # make our complete request to the authorization endpoint, and store 24 | # the results in the `auth_resp` variable 25 | auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data) 26 | 27 | # pull the access token out of the json-formatted data 28 | # that the authorization endpoint sent back to us 29 | access_token = auth_resp.json()['access_token'] 30 | 31 | # now that we have an access/bearer token, we're ready to request some data! 
32 | # we'll create a new dict that includes this token 33 | search_headers = { 34 | 'Authorization': 'Bearer ' + access_token 35 | } 36 | 37 | # this is the Twitter search API endpoint for version 1.1 of the API 38 | search_url = 'https://api.twitter.com/1.1/search/tweets.json' 39 | 40 | # create a new dict that includes our search query parameters 41 | search_params = { 42 | 'q': 'Python', 43 | 'result_type': 'recent', 44 | 'count': 4 45 | } 46 | 47 | # send our data request and store the results in `search_resp` 48 | search_resp = requests.get(search_url, headers=search_headers, params=search_params) 49 | 50 | # parse the response into a JSON object 51 | Twitter_data = search_resp.json() 52 | 53 | # open an output file where we can save the results 54 | Twitter_output_file = open("Twitter_search_results.json", "w") 55 | 56 | # write the returned Twitter data to our output file 57 | Twitter_output_file.write(str(Twitter_data)) 58 | 59 | # close the output file 60 | Twitter_output_file.close() 61 | 62 | # loop through our results and print the text of the Twitter status 63 | for a_Tweet in Twitter_data['statuses']: 64 | print(a_Tweet['text'] + '\n') 65 | -------------------------------------------------------------------------------- /chapter_5_examples/standalone_files/data_download.py: -------------------------------------------------------------------------------- 1 | # A basic example of downloading data from the web with Python, 2 | # using the requests library 3 | # 4 | # The source data we are downloading will come from the following URLs: 5 | # http://feeds.bbci.co.uk/news/science_and_environment/rss.xml 6 | # https://gbfs.citibikenyc.com/gbfs/en/station_status.json 7 | 8 | # the `requests` library lets us write Python code that acts like 9 | # a web browser 10 | import requests 11 | 12 | # our chosen XML filename 13 | XMLfilename = "BBC_RSS.xml" 14 | 15 | # open a new, writable file for our XML output 16 | xml_output_file = open(XMLfilename,"w") 17 | 18 | # use the requests library's "get" recipe to access the contents of our 19 | # target URL and store it in our `xml_data` variable 20 | xml_data = requests.get('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml') 21 | 22 | # the requests library's `get()` function puts contents of the web page 23 | # in a property `text` 24 | # we'll `write` that directly to our `xml_output_file` 25 | xml_output_file.write(xml_data.text) 26 | 27 | # close our xml_output_file 28 | xml_output_file.close() 29 | 30 | # our chosen JSON filename 31 | JSONfilename = "citibikenyc_station_status.json" 32 | 33 | # open a new, writable file for our JSON output 34 | json_output_file = open(JSONfilename,"w") 35 | 36 | # use the `requests` library's `get()` recipe to access the contents of our 37 | # target URL and store it in our `json_data` variable 38 | json_data = requests.get('https://gbfs.citibikenyc.com/gbfs/en/station_status.json') 39 | 40 | # `get()` the contents of the web page and write its `text` 41 | # directly to `json_output_file` 42 | json_output_file.write(json_data.text) 43 | 44 | # close our json_output_file 45 | json_output_file.close() 46 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_columns_review.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for reviewing 
all the column names in the PPP data\n", 10 |     "# to see what we can infer about them from the data itself\n", 11 |     "\n", 12 |     "# importing the `pandas` library\n", 13 |     "import pandas as pd" 14 |    ] 15 |   }, 16 |   { 17 |    "cell_type": "code", 18 |    "execution_count": null, 19 |    "metadata": {}, 20 |    "outputs": [], 21 |    "source": [ 22 |     "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 23 |     "# # Import PyDrive and associated libraries.\n", 24 |     "# # This only needs to be done once per notebook.\n", 25 |     "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 26 |     "# from pydrive.auth import GoogleAuth\n", 27 |     "# from pydrive.drive import GoogleDrive\n", 28 |     "# from google.colab import auth\n", 29 |     "# from oauth2client.client import GoogleCredentials\n", 30 |     "\n", 31 |     "# # Authenticate and create the PyDrive client.\n", 32 |     "# # This only needs to be done once per notebook.\n", 33 |     "# auth.authenticate_user()\n", 34 |     "# gauth = GoogleAuth()\n", 35 |     "# gauth.credentials = GoogleCredentials.get_application_default()\n", 36 |     "# drive = GoogleDrive(gauth)" 37 |    ] 38 |   }, 39 |   { 40 |    "cell_type": "code", 41 |    "execution_count": null, 42 |    "metadata": {}, 43 |    "outputs": [], 44 |    "source": [ 45 |     "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 46 |     "# # This loads the recent data sample we created with the `ppp_data_sample` script\n", 47 |     "# # Link to data file stored in Drive: https://drive.google.com/file/d/1vwVf5caOURuRWzsTahC7W_Eb_a57mvAA/view?usp=sharing\n", 48 |     "# file_id = '1vwVf5caOURuRWzsTahC7W_Eb_a57mvAA' # notice where this string comes from in link above\n", 49 |     "\n", 50 |     "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 51 |     "# print(imported_file['title']) # it should print the title of desired file\n", 52 |     "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 53 |    ] 54 |   }, 55 |   { 56 |    "cell_type": "code", 57 |    "execution_count": null, 58 |    "metadata": {}, 59 |    "outputs": [], 60 |    "source": [ 61 |     "# read the recent data into a pandas DataFrame using its `read_csv()` method\n", 62 |     "ppp_data_sample = pd.read_csv('recent_sample.csv')" 63 |    ] 64 |   }, 65 |   { 66 |    "cell_type": "code", 67 |    "execution_count": null, 68 |    "metadata": {}, 69 |    "outputs": [], 70 |    "source": [ 71 |     "# convert all missing data entries to `<NA>` using the `convert_dtypes()` method\n", 72 |     "converted_data_sample = ppp_data_sample.convert_dtypes()" 73 |    ] 74 |   }, 75 |   { 76 |    "cell_type": "code", 77 |    "execution_count": null, 78 |    "metadata": {}, 79 |    "outputs": [], 80 |    "source": [ 81 |     "# transpose the whole sample\n", 82 |     "transposed_ppp_data_sample = converted_data_sample.transpose()" 83 |    ] 84 |   }, 85 |   { 86 |    "cell_type": "code", 87 |    "execution_count": null, 88 |    "metadata": {}, 89 |    "outputs": [], 90 |    "source": [ 91 |     "# print out the results!\n", 92 |     "print(transposed_ppp_data_sample)" 93 |    ] 94 |   } 95 |  ], 96 |  "metadata": { 97 |   "kernelspec": { 98 |    "display_name": "Python 3 (ipykernel)", 99 |    "language": "python", 100 |    "name": "python3" 101 |   }, 102 |   "language_info": { 103 |    "codemirror_mode": { 104 |     "name": "ipython", 105 |     "version": 3 106 |    }, 107 |    "file_extension": ".py", 108 |    "mimetype": "text/x-python", 109 |    "name": "python", 110 |    "nbconvert_exporter": "python", 111 |    "pygments_lexer": "ipython3", 112 |    "version": "3.9.5" 113 |   } 114 |  }, 115 |  "nbformat": 4, 116 |  "nbformat_minor": 4 117 | }
118 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_columns_summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for reviewing all the column names in the PPP data\n", 10 | "# to see what we can infer about them from the data itself\n", 11 | "\n", 12 | "# importing the `pandas` library\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 23 | "# # Import PyDrive and associated libraries.\n", 24 | "# # This only needs to be done once per notebook.\n", 25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 26 | "# from pydrive.auth import GoogleAuth\n", 27 | "# from pydrive.drive import GoogleDrive\n", 28 | "# from google.colab import auth\n", 29 | "# from oauth2client.client import GoogleCredentials\n", 30 | "\n", 31 | "# # Authenticate and create the PyDrive client.\n", 32 | "# # This only needs to be done once per notebook.\n", 33 | "# auth.authenticate_user()\n", 34 | "# gauth = GoogleAuth()\n", 35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 36 | "# drive = GoogleDrive(gauth)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 46 | "# # This loads the \"recent\" (February 2021) data\n", 47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 49 | "\n", 50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 51 | "# print(imported_file['title']) # it should print the title of desired file\n", 52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# read the recent data sample into a pandas DataFrame\n", 62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# print the summary of values that appear in the `LoanStats` column\n", 72 | "print(ppp_data.value_counts('LoanStatus'))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# print the total number of entries in the `LoanStatus` column\n", 82 | "print(sum(ppp_data.value_counts('LoanStatus')))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# print the summary of values that appear in the `Gender` column\n", 92 | "print(ppp_data.value_counts('Gender'))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | 
"# print the total number of entries in the `Gender` column\n", 102 | "print(sum(ppp_data.value_counts('Gender')))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# print how many rows do not list a value for `BorrowerAddress`\n", 112 | "print(ppp_data['BorrowerAddress'].isna().sum())" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.9.5" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } 138 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_date_range.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for finding the earliest and latest loan dates in the PPP loan\n", 10 | "# data\n", 11 | "\n", 12 | "# importing the `pandas` library\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 23 | "# # Import PyDrive and associated libraries.\n", 24 | "# # This only needs to be done once per notebook.\n", 25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 26 | "# from pydrive.auth import GoogleAuth\n", 27 | "# from pydrive.drive import GoogleDrive\n", 28 | "# from google.colab import auth\n", 29 | "# from oauth2client.client import GoogleCredentials\n", 30 | "\n", 31 | "# # Authenticate and create the PyDrive client.\n", 32 | "# # This only needs to be done once per notebook.\n", 33 | "# auth.authenticate_user()\n", 34 | "# gauth = GoogleAuth()\n", 35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 36 | "# drive = GoogleDrive(gauth)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 46 | "# # This loads the \"recent\" (February 2021) data\n", 47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 49 | "\n", 50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 51 | "# print(imported_file['title']) # it should print the title of desired file\n", 52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# read the recent data into a pandas DataFrame using its `read_csv()` method\n", 62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 63 | ] 64 | }, 65 | 
{ 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# convert the values in the `DateApproved` column to *actual* dates\n", 72 | "ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved'],\n", 73 | " format='%m/%d/%Y')" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# print out the `min()` and `max()` values in the `DateApproved` column\n", 83 | "print(ppp_data['DateApproved'].min())\n", 84 | "print(ppp_data['DateApproved'].max())" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3 (ipykernel)", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.9.5" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_find_waterford.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for finding a business within our data set by (partial) name\n", 10 | "\n", 11 | "# importing the `pandas` library\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 22 | "# # Import PyDrive and associated libraries.\n", 23 | "# # This only needs to be done once per notebook.\n", 24 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 25 | "# from pydrive.auth import GoogleAuth\n", 26 | "# from pydrive.drive import GoogleDrive\n", 27 | "# from google.colab import auth\n", 28 | "# from oauth2client.client import GoogleCredentials\n", 29 | "\n", 30 | "# # Authenticate and create the PyDrive client.\n", 31 | "# # This only needs to be done once per notebook.\n", 32 | "# auth.authenticate_user()\n", 33 | "# gauth = GoogleAuth()\n", 34 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 35 | "# drive = GoogleDrive(gauth)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 45 | "# # This loads the \"recent\" (February 2021) data\n", 46 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 47 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 48 | "\n", 49 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 50 | "# print(imported_file['title']) # it should print the title of desired file\n", 51 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": 
[], 59 | "source": [ 60 | "# read the recent data sample into a pandas DataFrame\n", 61 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# create a DataFrame without any missing `BorrowerName` values\n", 71 | "ppp_data_named_borrowers = ppp_data[ppp_data['BorrowerName'].notna()]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# because precise matching can be tricky,\n", 81 | "# we'll use the pandas `str.contains()` method\n", 82 | "bankruptcy_example = ppp_data_named_borrowers[ppp_data_named_borrowers['BorrowerName']\n", 83 | " .str.contains('WATERFORD RECEPTIONS')]" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# transposing the result so it's easier to read\n", 93 | "print(bankruptcy_example.transpose())" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3 (ipykernel)", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.9.5" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 4 118 | } 119 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_lender_names.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for determining whether there are typos &c. in any of the PPP\n", 10 | "# loan data's bank names\n", 11 | "\n", 12 | "# importing the `pandas` library. 
The `as` keyword let's us essentially create\n", 13 | "# a nickname for the library so that we can refer to it in fewer characters\n", 14 | "import pandas as pd\n", 15 | "\n", 16 | "# importing the `fingerprints` library, which will help us generate normalized\n", 17 | "# labels for each of the bank names in our data set\n", 18 | "import fingerprints" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 28 | "# # Import PyDrive and associated libraries.\n", 29 | "# # This only needs to be done once per notebook.\n", 30 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 31 | "# from pydrive.auth import GoogleAuth\n", 32 | "# from pydrive.drive import GoogleDrive\n", 33 | "# from google.colab import auth\n", 34 | "# from oauth2client.client import GoogleCredentials\n", 35 | "\n", 36 | "# # Authenticate and create the PyDrive client.\n", 37 | "# # This only needs to be done once per notebook.\n", 38 | "# auth.authenticate_user()\n", 39 | "# gauth = GoogleAuth()\n", 40 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 41 | "# drive = GoogleDrive(gauth)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 51 | "# # This loads the \"recent\" (February 2021) data\n", 52 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 53 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 54 | "\n", 55 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 56 | "# print(imported_file['title']) # it should print the title of desired file\n", 57 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# read the recent data sample into a pandas DataFrame using the library's\n", 67 | "# `read_csv()` method\n", 68 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# use the pandas DataFrame `unique()` function to create a list of unique\n", 78 | "# bank names in our data's `OriginatingLender` column\n", 79 | "unique_names = ppp_data['OriginatingLender'].unique()\n", 80 | "\n", 81 | "# confirm how many unique names there are\n", 82 | "print(len(unique_names))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# create an empty list to hold the fingerprint of each of the unique names\n", 92 | "fingerprint_list = []\n", 93 | "\n", 94 | "# iterate through each name in the list of unique names\n", 95 | "for name in unique_names:\n", 96 | "\n", 97 | " # for each name, generate its fingerprint\n", 98 | " # and append it to the end of the list\n", 99 | " fingerprint_list.append(fingerprints.generate(name))\n", 100 | "\n", 101 | "\n", 102 | "# use the built-in `set()` method on our fingerprint_list, which will\n", 103 | "# remove 
duplicates (and sort it)\n", 104 | "fingerprint_set = set(fingerprint_list)\n", 105 | "\n", 106 | "# check the length of the fingerprint_set\n", 107 | "print(len(fingerprint_set))" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3 (ipykernel)", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.9.5" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 4 132 | } 133 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_loan_status.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for determining how many loans have been disbursed\n", 10 | "\n", 11 | "# importing the `pandas` library\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 22 | "# # Import PyDrive and associated libraries.\n", 23 | "# # This only needs to be done once per notebook.\n", 24 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 25 | "# from pydrive.auth import GoogleAuth\n", 26 | "# from pydrive.drive import GoogleDrive\n", 27 | "# from google.colab import auth\n", 28 | "# from oauth2client.client import GoogleCredentials\n", 29 | "\n", 30 | "# # Authenticate and create the PyDrive client.\n", 31 | "# # This only needs to be done once per notebook.\n", 32 | "# auth.authenticate_user()\n", 33 | "# gauth = GoogleAuth()\n", 34 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 35 | "# drive = GoogleDrive(gauth)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 45 | "# # This loads the \"recent\" (February 2021) data\n", 46 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 47 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 48 | "\n", 49 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 50 | "# print(imported_file['title']) # it should print the title of desired file\n", 51 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# read the recent data sample into a pandas DataFrame\n", 61 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# print a summary of values in the `LoanStatus` column\n", 71 | "print(ppp_data['LoanStatus'].value_counts())\n", 72 | 
"print(sum(ppp_data['LoanStatus'].value_counts()))" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3 (ipykernel)", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.9.5" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_min_max_loan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script for finding the minimum and maximum loans currently approved\n", 10 | "# in our PPP loan data set\n", 11 | "\n", 12 | "# importing the `pandas` library\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 23 | "# # Import PyDrive and associated libraries.\n", 24 | "# # This only needs to be done once per notebook.\n", 25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 26 | "# from pydrive.auth import GoogleAuth\n", 27 | "# from pydrive.drive import GoogleDrive\n", 28 | "# from google.colab import auth\n", 29 | "# from oauth2client.client import GoogleCredentials\n", 30 | "\n", 31 | "# # Authenticate and create the PyDrive client.\n", 32 | "# # This only needs to be done once per notebook.\n", 33 | "# auth.authenticate_user()\n", 34 | "# gauth = GoogleAuth()\n", 35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 36 | "# drive = GoogleDrive(gauth)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 46 | "# # This loads the \"recent\" (February 2021) data\n", 47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 49 | "\n", 50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 51 | "# print(imported_file['title']) # it should print the title of desired file\n", 52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# read the recent data into a pandas DataFrame\n", 62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# use the pandas `min()` and `max()` methods to retrieve the\n", 72 | "# largest and smallest values, respectively\n", 73 | "print(ppp_data['CurrentApprovalAmount'].min())\n", 74 | "print(ppp_data['CurrentApprovalAmount'].max())" 75 | ] 76 | } 77 | ], 
78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3 (ipykernel)", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.9.5" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 4 99 | } 100 | -------------------------------------------------------------------------------- /chapter_6_examples/jupyter_notebooks/ppp_numrows.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Quick script to print out the number of rows in each of our PPP loan data files\n", 10 | "# This is a pretty basic task, so no need to import extra libraries!" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 20 | "# # Import PyDrive and associated libraries.\n", 21 | "# # This only needs to be done once per notebook.\n", 22 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 23 | "# from pydrive.auth import GoogleAuth\n", 24 | "# from pydrive.drive import GoogleDrive\n", 25 | "# from google.colab import auth\n", 26 | "# from oauth2client.client import GoogleCredentials\n", 27 | "\n", 28 | "# # Authenticate and create the PyDrive client.\n", 29 | "# # This only needs to be done once per notebook.\n", 30 | "# auth.authenticate_user()\n", 31 | "# gauth = GoogleAuth()\n", 32 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 33 | "# drive = GoogleDrive(gauth)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 43 | "# # This loads the August 2020 data\n", 44 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/11wTOapbAzcfeCQVVB-YJFIpsQVaZxJAm/view?usp=sharing\n", 45 | "# file_id = '11wTOapbAzcfeCQVVB-YJFIpsQVaZxJAm' # notice where this string comes from in link above\n", 46 | "\n", 47 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 48 | "# print(imported_file['title']) # it should print the title of desired file\n", 49 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 59 | "# # This loads the \"recent\" (February 2021) data\n", 60 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n", 61 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n", 62 | "\n", 63 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 64 | "# print(imported_file['title']) # it should print the title of desired file\n", 65 | "# imported_file.GetContentFile(imported_file['title']) # refer to 
it in this notebook by the same name as it has in Drive" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# open the August PPP data in \"read\" mode\n", 75 | "august_data = open(\"public_150k_plus_080820.csv\",\"r\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# use `readlines()` to convert the lines in the data file into a list\n", 85 | "print(\"August file has \"+str(len(august_data.readlines()))+\" rows.\")" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# ditto for the recent PPP data\n", 95 | "recent_data = open(\"public_150k_plus_recent.csv\",\"r\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# once again, print the number of lines\n", 105 | "print(\"Recent file has \"+str(len(recent_data.readlines()))+\" rows.\")" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.9.5" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_columns_review.py: -------------------------------------------------------------------------------- 1 | # Quick script for reviewing all the column names in the PPP data 2 | # to see what we can infer about them from the data itself 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the recent data into a pandas DataFrame using its `read_csv()` method 8 | ppp_data_sample = pd.read_csv('recent_sample.csv') 9 | 10 | # convert all missing data entries to `<NA>` using the `convert_dtypes()` method 11 | converted_data_sample = ppp_data_sample.convert_dtypes() 12 | 13 | # transpose the whole sample 14 | transposed_ppp_data_sample = converted_data_sample.transpose() 15 | 16 | # print out the results!
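# (aside, not part of the original script: after the transposed sample is
# printed below, pandas' built-in `info()` method is another quick way to see
# each column's inferred dtype and non-null count -- uncomment to try it)
# converted_data_sample.info()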
17 | print(transposed_ppp_data_sample) 18 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_columns_summary.py: -------------------------------------------------------------------------------- 1 | # Quick script for reviewing all the column names in the PPP data 2 | # to see what we can infer about them from the data itself 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the recent data sample into a pandas DataFrame 8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 9 | 10 | # print the summary of values that appear in the `LoanStatus` column 11 | print(ppp_data.value_counts('LoanStatus')) 12 | 13 | # print the total number of entries in the `LoanStatus` column 14 | print(sum(ppp_data.value_counts('LoanStatus'))) 15 | 16 | # print the summary of values that appear in the `Gender` column 17 | print(ppp_data.value_counts('Gender')) 18 | 19 | # print the total number of entries in the `Gender` column 20 | print(sum(ppp_data.value_counts('Gender'))) 21 | 22 | # print how many rows do not list a value for `BorrowerAddress` 23 | print(ppp_data['BorrowerAddress'].isna().sum()) 24 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_data_join.py: -------------------------------------------------------------------------------- 1 | # Quick script for joining our August 2020 and more recent PPP loan data files 2 | # to see how many loans appear in both 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the August data into a pandas DataFrame using its `read_csv()` method 8 | august_ppp_data = pd.read_csv('public_150k_plus_080820.csv') 9 | 10 | # read the recent data into a pandas DataFrame using its `read_csv()` method 11 | recent_ppp_data = pd.read_csv('public_150k_plus_recent.csv') 12 | 13 | # now that we have both files in memory, let's merge them!
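# (explanatory aside, not part of the original script: the `pd.merge()` call
# below is a database-style join. `how='outer'` keeps rows from *both* files,
# `left_on`/`right_on` name the columns to match on in the August and recent
# data respectively, and `indicator=True` adds a `_merge` column whose values
# ('left_only', 'right_only', or 'both') record which file(s) each row came
# from -- that column is what the `value_counts('_merge')` checks below rely on.)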
14 | merged_data = pd.merge(august_ppp_data,recent_ppp_data,how='outer', 15 | left_on=['BusinessName','Lender','DateApproved'],right_on=['BorrowerName', 16 | 'ServicingLenderName','DateApproved'],indicator=True) 17 | 18 | # `print()` the values in the "indicator" column, 19 | # which has a default label of `_merge` 20 | print(merged_data.value_counts('_merge')) 21 | 22 | # merge the data again, removing the match on `DateApproved` 23 | merged_data_no_date = pd.merge(august_ppp_data,recent_ppp_data,how='outer', 24 | left_on=['BusinessName','Lender'],right_on=['BorrowerName', 25 | 'ServicingLenderName'],indicator=True) 26 | 27 | # `print()` the values in the "indicator" column, 28 | # which has a default label of `_merge` 29 | print(merged_data_no_date.value_counts('_merge')) 30 | 31 | # merge the data again, matching only on `BusinessName`/`BorrowerName` 32 | merged_data_biz_only = pd.merge(august_ppp_data,recent_ppp_data,how='outer', 33 | left_on=['BusinessName'],right_on=['BorrowerName'],indicator=True) 34 | 35 | # `print()` the values in the "indicator" column, 36 | # which has a default label of `_merge` 37 | print(merged_data_biz_only.value_counts('_merge')) 38 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_data_samples.py: -------------------------------------------------------------------------------- 1 | # Quick script for creating new CSVs that each contain the first few rows of 2 | # our larger data files 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the august data into a pandas DataFrame using its `read_csv()` method 8 | august_ppp_data = pd.read_csv('public_150k_plus_080820.csv') 9 | 10 | # the `head()` method returns the DataFrame's column headers 11 | # along with the first 5 rows of data 12 | august_sample = august_ppp_data.head() 13 | 14 | # write those first few rows to a CSV called `august_sample.csv` 15 | # using the pandas `to_csv()` method 16 | august_sample.to_csv('august_sample.csv', index=False) 17 | 18 | # read the recent data into a pandas DataFrame using its `read_csv()` method 19 | recent_ppp_data = pd.read_csv('public_150k_plus_recent.csv') 20 | 21 | # the `head()` method returns the DataFrame's column headers 22 | # along with the first 5 rows of data 23 | recent_sample = recent_ppp_data.head() 24 | 25 | # write those first few rows to a CSV called `recent_sample.csv` 26 | recent_sample.to_csv('recent_sample.csv', index=False) 27 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_date_range.py: -------------------------------------------------------------------------------- 1 | # Quick script for finding the earliest and latest loan dates in the PPP loan 2 | # data 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the recent data into a pandas DataFrame using its `read_csv()` method 8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 9 | 10 | # convert the values in the `DateApproved` column to *actual* dates 11 | ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved'], 12 | format='%m/%d/%Y') 13 | 14 | # print out the `min()` and `max()` values in the `DateApproved` column 15 | print(ppp_data['DateApproved'].min()) 16 | print(ppp_data['DateApproved'].max()) 17 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_find_waterford.py: 
-------------------------------------------------------------------------------- 1 | # Quick script for finding a business within our data set by (partial) name 2 | 3 | # importing the `pandas` library 4 | import pandas as pd 5 | 6 | # read the recent data sample into a pandas DataFrame 7 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 8 | 9 | # create a DataFrame without any missing `BorrowerName` values 10 | ppp_data_named_borrowers = ppp_data[ppp_data['BorrowerName'].notna()] 11 | 12 | # because precise matching can be tricky, 13 | # we'll use the pandas `str.contains()` method 14 | bankruptcy_example = ppp_data_named_borrowers[ppp_data_named_borrowers['BorrowerName'] 15 | .str.contains('WATERFORD RECEPTIONS')] 16 | 17 | # transposing the result so it's easier to read 18 | print(bankruptcy_example.transpose()) 19 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_lender_names.py: -------------------------------------------------------------------------------- 1 | # Quick script for determining whether there are typos &c. in any of the PPP 2 | # loan data's bank names 3 | 4 | # importing the `pandas` library. The `as` keyword lets us essentially create 5 | # a nickname for the library so that we can refer to it in fewer characters 6 | import pandas as pd 7 | 8 | # importing the `fingerprints` library, which will help us generate normalized 9 | # labels for each of the bank names in our data set 10 | import fingerprints 11 | 12 | # read the recent data sample into a pandas DataFrame using the library's 13 | # `read_csv()` method 14 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 15 | 16 | # use the pandas DataFrame `unique()` function to create a list of unique 17 | # bank names in our data's `OriginatingLender` column 18 | unique_names = ppp_data['OriginatingLender'].unique() 19 | 20 | # confirm how many unique names there are 21 | print(len(unique_names)) 22 | 23 | # create an empty list to hold the fingerprint of each of the unique names 24 | fingerprint_list = [] 25 | 26 | # iterate through each name in the list of unique names 27 | for name in unique_names: 28 | 29 | # for each name, generate its fingerprint 30 | # and append it to the end of the list 31 | fingerprint_list.append(fingerprints.generate(name)) 32 | 33 | 34 | # use the built-in `set()` function on our fingerprint_list, which will 35 | # remove any duplicate values 36 | fingerprint_set = set(fingerprint_list) 37 | 38 | # check the length of the fingerprint_set 39 | print(len(fingerprint_set)) 40 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_loan_status.py: -------------------------------------------------------------------------------- 1 | # Quick script for determining how many loans have been disbursed 2 | 3 | # importing the `pandas` library 4 | import pandas as pd 5 | 6 | # read the recent data sample into a pandas DataFrame 7 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 8 | 9 | # print a summary of values in the `LoanStatus` column 10 | print(ppp_data['LoanStatus'].value_counts()) 11 | print(sum(ppp_data['LoanStatus'].value_counts())) 12 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_loan_uses.py: -------------------------------------------------------------------------------- 1 | # Quick script for determining what borrowers did (or really, did not) state 2 | #
they would use PPP loan funds for 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the recent data sample into a pandas DataFrame 8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 9 | 10 | # print how many rows do not list a value for `UTILITIES_PROCEED` 11 | print(ppp_data['UTILITIES_PROCEED'].isna().sum()) 12 | 13 | # print how many rows do not list a value for `PAYROLL_PROCEED` 14 | print(ppp_data['PAYROLL_PROCEED'].isna().sum()) 15 | 16 | # print how many rows do not list a value for `MORTGAGE_INTEREST_PROCEED` 17 | print(ppp_data['MORTGAGE_INTEREST_PROCEED'].isna().sum()) 18 | 19 | # print how many rows do not list a value for `RENT_PROCEED` 20 | print(ppp_data['RENT_PROCEED'].isna().sum()) 21 | 22 | # print how many rows do not list a value for `REFINANCE_EIDL_PROCEED` 23 | print(ppp_data['REFINANCE_EIDL_PROCEED'].isna().sum()) 24 | 25 | # print how many rows do not list a value for `HEALTH_CARE_PROCEED` 26 | print(ppp_data['HEALTH_CARE_PROCEED'].isna().sum()) 27 | 28 | # print how many rows do not list a value for `DEBT_INTEREST_PROCEED` 29 | print(ppp_data['DEBT_INTEREST_PROCEED'].isna().sum()) 30 | 31 | # create a new DataFrame that contains all rows reporting *only* payroll costs 32 | # that is, where all _other_ costs are listed as "NA" 33 | payroll_only = ppp_data[(ppp_data['UTILITIES_PROCEED'].isna()) & (ppp_data 34 | ['MORTGAGE_INTEREST_PROCEED'].isna()) & (ppp_data 35 | ['MORTGAGE_INTEREST_PROCEED'].isna()) & (ppp_data['RENT_PROCEED'].isna()) & 36 | (ppp_data['REFINANCE_EIDL_PROCEED'].isna()) & (ppp_data 37 | ['HEALTH_CARE_PROCEED'].isna()) & (ppp_data['DEBT_INTEREST_PROCEED'].isna()) 38 | ] 39 | 40 | # print the length of our "payroll costs only" DataFrame 41 | print(len(payroll_only.index)) 42 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_min_max_loan.py: -------------------------------------------------------------------------------- 1 | # Quick script for finding the minimum and maximum loans currently approved 2 | # in our PPP loan data set 3 | 4 | # importing the `pandas` library 5 | import pandas as pd 6 | 7 | # read the recent data into a pandas DataFrame 8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv') 9 | 10 | # use the pandas `min()` and `max()` methods to retrieve the 11 | # largest and smallest values, respectively 12 | print(ppp_data['CurrentApprovalAmount'].min()) 13 | print(ppp_data['CurrentApprovalAmount'].max()) 14 | -------------------------------------------------------------------------------- /chapter_6_examples/standalone_files/ppp_numrows.py: -------------------------------------------------------------------------------- 1 | # Quick script to print out the number of rows in each of our PPP loan data files 2 | # This is a pretty basic task, so no need to import extra libraries! 
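# (aside, not part of the original script: `readlines()` below pulls each file
# into memory all at once, which is fine here but can get slow for very large
# files. A lazier count that reads one line at a time looks like this --
# uncomment to try it:)
# with open("public_150k_plus_080820.csv", "r") as big_file:
#     print(sum(1 for line in big_file))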
3 | 4 | # open the August PPP data in "read" mode 5 | august_data = open("public_150k_plus_080820.csv","r") 6 | 7 | # use `readlines()` to convert the lines in the data file into a list 8 | print("August file has "+str(len(august_data.readlines()))+" rows.") 9 | 10 | # ditto for the recent PPP data 11 | recent_data = open("public_150k_plus_recent.csv","r") 12 | 13 | # once again, print the number of lines 14 | print("Recent file has "+str(len(recent_data.readlines()))+" rows.") 15 | -------------------------------------------------------------------------------- /chapter_7_examples/jupyter_notebooks/regex_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# The goal of this script is to try out how a couple of regular expressions\n", 10 | "# fare with some sample test data.\n", 11 | "\n", 12 | "# import the regular expression library\n", 13 | "import re" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# using the `re.compile()` method is a helpful way of keeping a reference to\n", 23 | "# our various regular expressions\n", 24 | "bookend_regex = re.compile(\"\\s0[7-9]:\")\n", 25 | "\n", 26 | "# always try to be descriptive with the variable names\n", 27 | "one_sided_regex = re.compile(\"0[7-9]:\")" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# this example should *fail*\n", 37 | "sample1 = \"2020-09-01 00:00:01.0430\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# this example should *match*\n", 47 | "sample2 = \"2020-09-01 09:04:23.7930\"" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# this example should *fail*\n", 57 | "sample3 = \"2020-09-01 10:07:02.0510\"" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# let's see what happens!\n", 67 | "print(\"bookend_regex:\")\n", 68 | "print(bookend_regex.search(sample1))\n", 69 | "print(bookend_regex.search(sample2))\n", 70 | "print(bookend_regex.search(sample3))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "print(\"one_sided_regex:\")\n", 80 | "print(one_sided_regex.search(sample1))\n", 81 | "print(one_sided_regex.search(sample2))\n", 82 | "print(one_sided_regex.search(sample3))" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.9.5" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 4 107 | } 108 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/citibike_september1_rides.py: 
-------------------------------------------------------------------------------- 1 | # Objectives: Filter all September, 2020 Citi Bike rides, and output a new 2 | # file containing only the rides from 2020-09-01 3 | 4 | # Program Outline: 5 | # 1. Read in the data file: 202009-citibike-tripdata.csv 6 | # 2. Create a new output file, and write the header row to it. 7 | # 3. For each row in the file, split the `starttime` value on space: 8 | # a. If the first item in the resulting list is '2020-09-01', write 9 | # the row to our output file 10 | # 4. Close the output file 11 | 12 | # import the "csv" library 13 | import csv 14 | 15 | # open our data file in "read" mode 16 | source_file = open("202009-citibike-tripdata.csv","r") 17 | 18 | # open our output file in "write" mode 19 | output_file = open("2020-09-01-citibike-tripdata.csv","w") 20 | 21 | # pass our source_file to the DictReader "recipe" 22 | # and store the result in a variable called `citibike_reader` 23 | citibike_reader = csv.DictReader(source_file) 24 | 25 | # create a corresponding DictWriter and specify that the 26 | # header should be the same as the `citibike_reader` fieldnames 27 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames) 28 | 29 | # write the header row to the output file 30 | output_writer.writeheader() 31 | 32 | # use a `for...in` loop to go through our `citibike_reader` list of rows 33 | for a_row in citibike_reader: 34 | 35 | # get the value in the 'starttime' column 36 | start_timestamp = a_row["starttime"] 37 | 38 | # split the value in 'starttime' on the space character 39 | timelist = start_timestamp.split(" ") 40 | 41 | # the "date" part of the string will be the first item, position 0 42 | the_date = timelist[0] 43 | 44 | # if `the_date` matches our desired date 45 | if the_date == "2020-09-01": 46 | 47 | # write that row of data to our output file 48 | output_writer.writerow(a_row) 49 | 50 | # close the output file 51 | output_file.close() 52 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/fixed_width_strip_parsing.py: -------------------------------------------------------------------------------- 1 | # An example of reading data from a fixed-width file with Python. 
2 | # The source file for this example comes from the NOAA, and can be accessed here: 3 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt 4 | # The metadata for the file can be found here: 5 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt 6 | 7 | # import the `csv` library, to create our output file 8 | import csv 9 | 10 | filename = "ghcnd-stations" 11 | 12 | # reading from a basic text file doesn't require any special libraries 13 | # so we'll just open the file in read format ("r") as usual 14 | source_file = open(filename+".txt", "r") 15 | 16 | # the built-in "readlines()" method does just what you'd think: 17 | # it reads in a text file and converts it to a list of lines 18 | stations_list = source_file.readlines() 19 | 20 | # create an output file for our transformed data 21 | output_file = open(filename+".csv","w") 22 | 23 | # use the `csv` library's "writer" recipe to easily write rows of data 24 | # to `output_file`, instead of reading data *from* it 25 | output_writer = csv.writer(output_file) 26 | 27 | # create the header list 28 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME","GSN_FLAG", 29 | "HCNCRN_FLAG","WMO_ID"] 30 | 31 | # write our headers to the output file 32 | output_writer.writerow(headers) 33 | 34 | # loop through each line of our file (multiple "sheets" are not possible) 35 | for line in stations_list: 36 | 37 | # create an empty list, to which we'll append each set of characters that 38 | # makes up a given "column" of data 39 | new_row = [] 40 | 41 | # ID: positions 1-11 42 | new_row.append((line[0:11]).strip()) 43 | 44 | # LATITUDE: positions 13-20 45 | new_row.append((line[12:20]).strip()) 46 | 47 | # LONGITUDE: positions 22-30 48 | new_row.append((line[21:30]).strip()) 49 | 50 | # ELEVATION: positions 32-37 51 | new_row.append((line[31:37]).strip()) 52 | 53 | # STATE: positions 39-40 54 | new_row.append((line[38:40]).strip()) 55 | 56 | # NAME: positions 42-71 57 | new_row.append((line[41:71]).strip()) 58 | 59 | # GSN_FLAG: positions 73-75 60 | new_row.append((line[72:75]).strip()) 61 | 62 | # HCNCRN_FLAG: positions 77-79 63 | new_row.append((line[76:79]).strip()) 64 | 65 | # WMO_ID: positions 81-85 66 | new_row.append((line[80:85]).strip()) 67 | 68 | # now all that's left is to use the 69 | # `writerow` function to write new_row to our output file 70 | output_writer.writerow(new_row) 71 | 72 | # just for good measure, let's close the `.csv` file we just created 73 | output_file.close() 74 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/ppp_add_fingerprints.py: -------------------------------------------------------------------------------- 1 | # Quick script for adding a "fingerprint" column to our loan data, which will 2 | # help us confirm/correct for any typos or inconsistencies in, e.g., bank names 3 | 4 | # import the csv library 5 | import csv 6 | 7 | # importing the `fingerprints` library 8 | import fingerprints 9 | 10 | # read the recent data sample into a variable 11 | ppp_data = open('public_150k_plus_recent.csv','r') 12 | 13 | # the DictReader function makes our source data more usable 14 | ppp_data_reader = csv.DictReader(ppp_data) 15 | 16 | # create an output file to write our modified dataset to 17 | augmented_ppp_data = open('public_150k_plus_fingerprints.csv','w') 18 | 19 | # create a "writer" so that we can output whole rows at once 20 | augmented_data_writer = csv.writer(augmented_ppp_data) 21 | 22 | # because we're 
adding a column, we need to create a new header row as well 23 | header_row = [] 24 | 25 | # for every column header 26 | for item in ppp_data_reader.fieldnames: 27 | 28 | # append the existing column header 29 | header_row.append(item) 30 | 31 | # if we're at 'OriginatingLender' 32 | if item == 'OriginatingLender': 33 | 34 | # it's time to add a new column 35 | header_row.append('OriginatingLenderFingerprint') 36 | 37 | # now we can write our expanded header row to the output file 38 | augmented_data_writer.writerow(header_row) 39 | 40 | # iterate through every row in our data 41 | for row in ppp_data_reader: 42 | 43 | # create an empty list to hold our new data row 44 | new_row = [] 45 | 46 | # for each column of data in the *original* dataset 47 | for column_name in ppp_data_reader.fieldnames: 48 | 49 | # first, append this row's value for that column 50 | new_row.append(row[column_name]) 51 | 52 | # when we get to the 'OriginatingLender' column, it's time 53 | # to add our new "fingerprint" value 54 | if column_name == 'OriginatingLender': 55 | 56 | # our fingerprint will consist of the generated fingerprint PLUS 57 | # the OriginatingLenderLocationID 58 | the_fingerprint = fingerprints.generate(row[column_name]) + \ 59 | " " + row['OriginatingLenderLocationID'] 60 | 61 | # append the compound fingerprint value to our row 62 | new_row.append(the_fingerprint) 63 | 64 | # once the whole row is complete, write it to our output file 65 | augmented_data_writer.writerow(new_row) 66 | 67 | # close both files 68 | augmented_ppp_data.close() 69 | ppp_data.close() 70 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/ppp_adding_naics.py: -------------------------------------------------------------------------------- 1 | # script to merge our PPP loan data with information from the SBA's NAICS 2 | # size requirements, found here: 3 | # https://www.sba.gov/document/support--table-size-standards 4 | 5 | # import pandas to facilitate the merging and sorting 6 | import pandas as pd 7 | 8 | # read our PPP loan data into a new DataFrame 9 | ppp_data = pd.read_csv('public_150k_plus_fingerprints.csv', dtype='string') 10 | 11 | # read the NAICS data into a separate DataFrame 12 | sba_naics_data = pd.read_csv('SBA-NAICS-data.csv', dtype='string') 13 | 14 | # if there's no value in the 'NAICSCode' column, replace it with "None" 15 | ppp_data['NAICSCode'] = ppp_data['NAICSCode'].fillna("None") 16 | 17 | # merge the two datasets using a "left" merge 18 | merged_data = pd.merge(ppp_data, sba_naics_data, how='left', 19 | left_on=['NAICSCode'], right_on=['NAICS Codes'], 20 | indicator=True) 21 | 22 | # open a file to save our merged data to 23 | merged_data_file = open('ppp-fingerprints-and-naics.csv', 'w') 24 | 25 | # write the merged data to an output file as a CSV 26 | merged_data_file.write(merged_data.to_csv()) 27 | 28 | # print out the values in the '_merge' column to see how many 29 | # entries in our loan data don't get matched to a NAICS code 30 | print(merged_data.value_counts('_merge')) 31 | 32 | # create a new DataFrame that is *just* the unmatched rows 33 | unmatched_values = merged_data[merged_data['_merge']=='left_only'] 34 | 35 | # open a file to write the unmatched values to 36 | unmatched_values_file = open('ppp-unmatched-naics-codes.csv', 'w') 37 | 38 | # write a new CSV file that contains all the unmatched NAICS codes in our 39 | # PPP loan data, along with how many times it appears 40 | 
unmatched_values_file.write(unmatched_values.value_counts('NAICSCode').to_csv()) 41 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/regex_tests.py: -------------------------------------------------------------------------------- 1 | # The goal of this script is to try out how a couple of regular expressions 2 | # fare with some sample test data. 3 | 4 | # import the regular expression library 5 | import re 6 | 7 | # using the `re.compile()` method is a helpful way of keeping a reference to 8 | # our various regular expressions 9 | bookend_regex = re.compile("\s0[7-9]:") 10 | 11 | # always try to be descriptive with the variable names 12 | one_sided_regex = re.compile("0[7-9]:") 13 | 14 | # this example should *fail* 15 | sample1 = "2020-09-01 00:00:01.0430" 16 | 17 | # this example should *match* 18 | sample2 = "2020-09-01 09:04:23.7930" 19 | 20 | # this example should *fail* 21 | sample3 = "2020-09-01 10:07:02.0510" 22 | 23 | # let's see what happens! 24 | print("bookend_regex:") 25 | print(bookend_regex.search(sample1)) 26 | print(bookend_regex.search(sample2)) 27 | print(bookend_regex.search(sample3)) 28 | print("one_sided_regex:") 29 | print(one_sided_regex.search(sample1)) 30 | print(one_sided_regex.search(sample2)) 31 | print(one_sided_regex.search(sample3)) 32 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/weekday_rides.py: -------------------------------------------------------------------------------- 1 | # Objectives: Filter all September, 2020 Citi Bike rides, and output a new 2 | # file containing only weekday rides 3 | # Program Outline: 4 | # 1. Read in the data file: 202009-citibike-tripdata.csv 5 | # 2. Create a new output file, and write the header row to it. 6 | # 3. For each row in the file, make a date from the `starttime`: 7 | # a. if it's a weekday, write the row to our output file 8 | # 4. 
Close the output file 9 | 10 | # import the "csv" library 11 | import csv 12 | 13 | # import the "datetime" library 14 | from datetime import datetime 15 | 16 | # open our data file in "read" mode 17 | source_file = open("202009-citibike-tripdata.csv","r") 18 | 19 | # open our output file in "write" mode 20 | output_file = open("202009-citibike-weekday-tripdata.csv","w") 21 | 22 | # convert source data to a DictReader; store the result in `citibike_reader` 23 | citibike_reader = csv.DictReader(source_file) 24 | 25 | # create a corresponding DictWriter and specify its fieldnames 26 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames) 27 | 28 | # actually write the header row to the output file 29 | output_writer.writeheader() 30 | 31 | # use a `for...in` loop to go through our `citibike_reader` list of rows 32 | for a_row in citibike_reader: 33 | 34 | # convert the value in the 'starttime' column to a date object 35 | the_date = datetime.strptime(a_row['starttime'], '%Y-%m-%d %H:%M:%S.%f') 36 | 37 | # if `the_date` is a weekday 38 | if the_date.weekday() <= 4: 39 | 40 | # write that row of data to our output file 41 | output_writer.writerow(a_row) 42 | 43 | # close the output file 44 | output_file.close() 45 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/xls_meta_and_date_parsing.py: -------------------------------------------------------------------------------- 1 | # Converting data in an .xls file with Python to csv + metadata file, with 2 | # functional date values using the "xlrd" library. 3 | 4 | # First, pip install the xlrd library: 5 | # https://pypi.org/project/xlrd/2.0.1/ 6 | # then, import the `xlrd` library 7 | import xlrd 8 | 9 | # import the csv library 10 | import csv 11 | 12 | # needed to test if a given value is *some* type of number 13 | from numbers import Number 14 | 15 | # for parsing/formatting our newly interpreted Excel dates 16 | from datetime import datetime 17 | 18 | # pass our filename as an ingredient to the `xlrd` library's 19 | # `open_workbook()` "recipe" 20 | # store the result in a variable called `source_workbook` 21 | source_workbook = xlrd.open_workbook("fredgraph.xls") 22 | 23 | # open and name a simple metadata text file 24 | source_workbook_metadata = open("fredgraph_metadata.txt","w") 25 | 26 | # an `.xls` workbook can have multiple sheets 27 | for sheet_name in source_workbook.sheet_names(): 28 | 29 | # create a variable that points to the current worksheet by 30 | # passing the current value of `sheet_name` to the `sheet_by_name` recipe 31 | current_sheet = source_workbook.sheet_by_name(sheet_name) 32 | 33 | # create "xls_"+sheet_name+".csv" as an output file for the current sheet 34 | output_file = open("xls_"+sheet_name+"_dates.csv","w") 35 | 36 | # use the `csv` library's "writer" recipe to easily write rows of data 37 | # to `output_file`, instead of reading data *from* it 38 | output_writer = csv.writer(output_file) 39 | 40 | # create a Boolean variable to detect if we've hit our table-type data yet 41 | is_table_data = False 42 | 43 | # now, we need to loop through every row in our sheet 44 | for row_num, row in enumerate(current_sheet.get_rows()): 45 | 46 | # pulling out the value in the first column of the current row 47 | first_entry = current_sheet.row_values(row_num)[0] 48 | 49 | # if we've hit the header row of our data table 50 | if first_entry == 'observation_date': 51 | 52 | # it's time to switch our "flag" value to "True" 53 |
is_table_data = True 54 | 55 | # if `is_table_data` is True 56 | if is_table_data: 57 | 58 | # extract the table-type data values into separate variables 59 | the_date_num = current_sheet.row_values(row_num)[0] 60 | U6_value = current_sheet.row_values(row_num)[1] 61 | 62 | # create a new row object with each of the values 63 | new_row = [the_date_num, U6_value] 64 | 65 | # if the `the_date_num` is a number, then the current row is *not* 66 | # the header row. We need to transform the date. 67 | if isinstance(the_date_num, Number): 68 | 69 | # use the xlrd library's `xldate_as_datetime()` to generate 70 | # a Python datetime object 71 | the_date_num = xlrd.xldate.xldate_as_datetime( 72 | the_date_num, source_workbook.datemode) 73 | 74 | # overwrite the first value in the new row with 75 | # the reformatted date 76 | new_row[0] = the_date_num.strftime('%m/%d/%Y') 77 | 78 | # write this new row to the data output file 79 | output_writer.writerow(new_row) 80 | 81 | # otherwise, this row must be metadata 82 | else: 83 | 84 | # since we'd like our metadata file to be nicely formatted, we 85 | # need to loop through the individual cells of each metadata row 86 | for item in current_sheet.row(row_num): 87 | 88 | # write the value of the cell 89 | source_workbook_metadata.write(item.value) 90 | 91 | # separate it from the next cell with a tab 92 | source_workbook_metadata.write('\t') 93 | 94 | # at the end of each line of metadata, add a newline 95 | source_workbook_metadata.write('\n') 96 | 97 | # just for good measure, let's close our output files 98 | output_file.close() 99 | source_workbook_metadata.close() 100 | -------------------------------------------------------------------------------- /chapter_7_examples/standalone_files/xls_meta_parsing.py: -------------------------------------------------------------------------------- 1 | # Converting data in an .xls file with Python to csv + metadata file 2 | # using the "xrld" library. 
First, pip install the xlrd library: 3 | # https://pypi.org/project/xlrd/2.0.1/ 4 | 5 | # import the "xlrd" library 6 | import xlrd 7 | 8 | # import the `csv` library, to create our output file 9 | import csv 10 | 11 | # pass our filename as an ingredient to the `xlrd` library's 12 | # `open_workbook()` "recipe" 13 | # store the result in a variable called `source_workbook` 14 | source_workbook = xlrd.open_workbook("fredgraph.xls") 15 | 16 | # open and name a simple metadata text file 17 | source_workbook_metadata = open("fredgraph_metadata.txt","w") 18 | 19 | # an `.xls` workbook can have multiple sheets 20 | for sheet_name in source_workbook.sheet_names(): 21 | 22 | # create a variable that points to the current worksheet by 23 | # passing the current value of `sheet_name` to the `sheet_by_name` recipe 24 | current_sheet = source_workbook.sheet_by_name(sheet_name) 25 | 26 | # create "xls_"+sheet_name+".csv" as an output file for the current sheet 27 | output_file = open("xls_"+sheet_name+".csv","w") 28 | 29 | # use the `csv` library's "writer" recipe to easily write rows of data 30 | # to `output_file`, instead of reading data *from* it 31 | output_writer = csv.writer(output_file) 32 | 33 | # create a Boolean variable to detect if we've hit our table-type data yet 34 | is_table_data = False 35 | 36 | # now, we need to loop through every row in our sheet 37 | for row_num, row in enumerate(current_sheet.get_rows()): 38 | 39 | # pulling out the value in the first column of the current row 40 | first_entry = current_sheet.row_values(row_num)[0] 41 | 42 | # if we've hit the header row of our data table 43 | if first_entry == 'observation_date': 44 | 45 | # it's time to switch our "flag" value to "True" 46 | is_table_data = True 47 | 48 | # if `is_table_data` is True 49 | if is_table_data: 50 | 51 | # write this row to the data output file 52 | output_writer.writerow(current_sheet.row_values(row_num)) 53 | 54 | # otherwise, this row must be metadata 55 | else: 56 | 57 | # since we'd like our metadata file to be nicely formatted, we 58 | # need to loop through the individual cells of each metadata row 59 | for item in current_sheet.row(row_num): 60 | 61 | # write the value of the cell 62 | source_workbook_metadata.write(item.value) 63 | 64 | # separate it from the next cell with a tab 65 | source_workbook_metadata.write('\t') 66 | 67 | # at the end of each line of metadata, add a newline 68 | source_workbook_metadata.write('\n') 69 | 70 | # just for good measure, let's close our output files 71 | output_file.close() 72 | source_workbook_metadata.close() 73 | -------------------------------------------------------------------------------- /chapter_8_examples/jupyter_notebooks/greet_me_options.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create a function that prints out a greeting to any name\n", 10 | "def greet_me(a_name, greeting=\"Hello\"):\n", 11 | " print(greeting+\" \"+a_name)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# create a variable named author\n", 21 | "author = \"Susan E. 
McGregor\"\n", 22 | "\n", 23 | "# create another variable named editor\n", 24 | "editor = \"Jeff Bleiel\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# use `greet_me()` to output greeting messages to each person\n", 34 | "# say \"Hello\" by default\n", 35 | "greet_me(author)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# let the programmer specify \"Hi\" as the greeting\n", 45 | "greet_me(editor, greeting=\"Hi\")" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3 (ipykernel)", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.9.5" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 4 70 | } 71 | -------------------------------------------------------------------------------- /chapter_8_examples/jupyter_notebooks/greet_me_revisited.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create a function that prints out a greeting\n", 10 | "# to any name passed to the function\n", 11 | "def greet_me(a_name):\n", 12 | " print(\"Variable `a_name` in `greet_me`: \"+a_name)\n", 13 | " print(\"Hello \"+a_name)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# create a variable named `author`\n", 23 | "author = \"Susan E. 
McGregor\"\n", 24 | "\n", 25 | "# create another variable named `editor`\n", 26 | "editor = \"Jeff Bleiel\"" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "a_name = \"Python\"\n", 36 | "print(\"Variable `a_name` in main script: \"+a_name)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# use my custom function, `greet_me` to output \"Hello\" messages to each person\n", 46 | "greet_me(author)\n", 47 | "greet_me(editor)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "print(\"Variable `a_name` in main script again: \"+a_name)" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "Python 3 (ipykernel)", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.9.5" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 4 81 | } 82 | -------------------------------------------------------------------------------- /chapter_8_examples/jupyter_notebooks/make_greeting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create a function that **returns** a greeting to any name passed in\n", 10 | "def make_greeting(a_name):\n", 11 | " return(\"Hello \"+a_name)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# create a variable named author\n", 21 | "author = \"Susan E. 
McGregor\"\n", 22 | "\n", 23 | "# create another variable named editor\n", 24 | "editor = \"Jeff Bleiel\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# use my custom function, `greet_me()` to build and store\n", 34 | "# the \"Hello\" messages to each person\n", 35 | "author_greeting = make_greeting(author)\n", 36 | "editor_greeting = make_greeting(editor)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# now `print()` the greetings built and returned by each function call\n", 46 | "print(author_greeting)\n", 47 | "print(editor_greeting)" 48 | ] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.9.5" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /chapter_8_examples/jupyter_notebooks/make_greeting_no_vars.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# function that returns a greeting to any name passed in\n", 10 | "def make_greeting(a_name):\n", 11 | " return(\"Hello \"+a_name)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# function that adds a question to any greeting\n", 21 | "def add_question(a_greeting):\n", 22 | " return(a_greeting+\", how are you?\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# create a variable named author\n", 32 | "author = \"Susan E. 
McGregor\"\n", 33 | "\n", 34 | "# create another variable named editor\n", 35 | "editor = \"Jeff Bleiel\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# print the greeting message\n", 45 | "print(make_greeting(author))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# pass the greeting message to the question function and print the result!\n", 55 | "print(add_question(make_greeting(editor)))" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3 (ipykernel)", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.9.5" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/fixed_width_strip_parsing_refactored.py: -------------------------------------------------------------------------------- 1 | """ NOAA data formatter 2 | Reads data from an NOAA fixed-width data file with Python and outputs 3 | a well-formatted CSV file. 4 | The source file for this example comes from the NOAA, and can be accessed here: 5 | https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt 6 | The metadata for the file can be found here: 7 | https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt 8 | Available functions 9 | ------------------- 10 | * convert_to_columns: Converts a line of text to a list 11 | Requirements 12 | ------------ 13 | * csv module 14 | """ 15 | # we'll start by importing the "csv" library 16 | import csv 17 | 18 | def main(): 19 | 20 | # variable to match our output filename to the input filename 21 | filename = "ghcnd-stations" 22 | 23 | # we'll just open the file in read format ("r") as usual 24 | source_file = open(filename+".txt", "r") 25 | 26 | # the "readlines()" method converts a text file to a list of lines 27 | stations_list = source_file.readlines() 28 | 29 | # as usual, we'll create an output file to write to 30 | output_file = open(filename+".csv","w") 31 | 32 | # and we'll use the `csv` library to create a "writer" that gives us handy 33 | # "recipe" functions for creating our new file in csv format 34 | output_writer = csv.writer(output_file) 35 | 36 | # we have to "hard code" these headers using the contents of `readme.txt` 37 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME", 38 | "GSN_FLAG","HCNCRN_FLAG","WMO_ID"] 39 | 40 | # create a list of `tuple`s with each column's start and end index 41 | column_ranges = [(1,11),(13,20),(22,30),(32,37),(39,40),(42,71),(73,75), 42 | (77,79),(81,85)] 43 | 44 | # write our headers to the output file 45 | output_writer.writerow(headers) 46 | 47 | # loop through each line of our file 48 | for line in stations_list: 49 | 50 | # send our data to be formatted 51 | new_row = convert_to_columns(line, column_ranges) 52 | 53 | # use the `writerow` function to write new_row to our output file 54 | output_writer.writerow(new_row) 55 | 56 | # for good measure, close our output file 57 | output_file.close() 58 | 59 | 60 | def convert_to_columns(data_line, column_info, 
zero_index=False): 61 | 62 | """Converts a line of text to a list based on the index pairs provided 63 | Parameters 64 | ---------- 65 | data_line : str 66 | The line of text to be parsed 67 | column_info : list of tuples 68 | Each tuple provides the start and end index of a data column 69 | zero_index: boolean, optional 70 | If False (default), reduces starting index position by one 71 | Returns 72 | ------- 73 | list 74 | a list of data values, stripped of surrounding whitespace 75 | """ 76 | new_row = [] 77 | 78 | # function assumes that provided indices are *NOT* zero-indexed, 79 | # so reduce starting index values by 1 80 | index_offset = 1 81 | 82 | # if column_info IS zero-indexed, don't offset starting index values 83 | if zero_index: 84 | index_offset = 0 85 | 86 | # go through list of column indices 87 | for index_pair in column_info: 88 | 89 | # pull start value, modifying by `index_offset` 90 | start_index = index_pair[0]-index_offset 91 | 92 | # pull end value 93 | end_index = index_pair[1] 94 | 95 | # strip whitespace from around the data 96 | new_row.append((data_line[start_index:end_index]).strip()) 97 | 98 | # return stripped data 99 | return new_row 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/greet_me_options.py: -------------------------------------------------------------------------------- 1 | # create a function that prints out a greeting to any name 2 | def greet_me(a_name, greeting="Hello"): 3 | print(greeting+" "+a_name) 4 | 5 | # create a variable named author 6 | author = "Susan E. McGregor" 7 | 8 | # create another variable named editor 9 | editor = "Jeff Bleiel" 10 | 11 | # use `greet_me()` to output greeting messages to each person 12 | # say "Hello" by default 13 | greet_me(author) 14 | 15 | # let the programmer specify "Hi" as the greeting 16 | greet_me(editor, greeting="Hi") 17 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/greet_me_revisited.py: -------------------------------------------------------------------------------- 1 | # create a function that prints out a greeting 2 | # to any name passed to the function 3 | def greet_me(a_name): 4 | print("Variable `a_name` in `greet_me`: "+a_name) 5 | print("Hello "+a_name) 6 | 7 | # create a variable named `author` 8 | author = "Susan E. McGregor" 9 | 10 | # create another variable named `editor` 11 | editor = "Jeff Bleiel" 12 | 13 | a_name = "Python" 14 | print("Variable `a_name` in main script: "+a_name) 15 | 16 | # use my custom function, `greet_me` to output "Hello" messages to each person 17 | greet_me(author) 18 | greet_me(editor) 19 | 20 | print("Variable `a_name` in main script again: "+a_name) 21 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/make_greeting.py: -------------------------------------------------------------------------------- 1 | # create a function that **returns** a greeting to any name passed in 2 | def make_greeting(a_name): 3 | return("Hello "+a_name) 4 | 5 | # create a variable named author 6 | author = "Susan E. 
McGregor" 7 | 8 | # create another variable named editor 9 | editor = "Jeff Bleiel" 10 | 11 | # use my custom function, `greet_me()` to build and store 12 | # the "Hello" messages to each person 13 | author_greeting = make_greeting(author) 14 | editor_greeting = make_greeting(editor) 15 | 16 | # now `print()` the greetings built and returned by each function call 17 | print(author_greeting) 18 | print(editor_greeting) 19 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/make_greeting_no_vars.py: -------------------------------------------------------------------------------- 1 | # function that returns a greeting to any name passed in 2 | def make_greeting(a_name): 3 | return("Hello "+a_name) 4 | 5 | # function that adds a question to any greeting 6 | def add_question(a_greeting): 7 | return(a_greeting+", how are you?") 8 | 9 | # create a variable named author 10 | author = "Susan E. McGregor" 11 | 12 | # create another variable named editor 13 | editor = "Jeff Bleiel" 14 | 15 | # print the greeting message 16 | print(make_greeting(author)) 17 | 18 | # pass the greeting message to the question function and print the result! 19 | print(add_question(make_greeting(editor))) 20 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/webpage_saver.py: -------------------------------------------------------------------------------- 1 | """ Web page Saver! 2 | Downloads the contents of a web page and saves it locally 3 | 4 | Usage 5 | ----- 6 | python webpage_saver.py target_url filename 7 | 8 | Parameters 9 | ---------- 10 | target_url : str 11 | The full URL of the web page to be downloaded 12 | filename : str 13 | The desired filename of the local copy 14 | 15 | Requirements 16 | ------------ 17 | * argparse module 18 | * requests module 19 | """ 20 | 21 | # include the requests library in order to get data from the web 22 | import requests 23 | 24 | # include argparse library to pull arguments from the command line 25 | import argparse 26 | 27 | # create a new `ArgumentParser()` 28 | parser = argparse.ArgumentParser() 29 | 30 | # arguments will be assigned based on the order in which they were provided 31 | parser.add_argument("target_url", help="Full URL of web page to be downloaded") 32 | parser.add_argument("filename", help="The desired filename of the local copy") 33 | args = parser.parse_args() 34 | 35 | # pull the url of the web page we're downloading from the provided arguments 36 | target_url = args.target_url 37 | 38 | # pull the intended output filename from the provided arguments 39 | output_filename = args.filename 40 | 41 | # create appropriate header information for our web page request 42 | headers = { 43 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \ 44 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \ 45 | 'Chrome/88.0.4324.109 Safari/537.36', 46 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som' 47 | } 48 | 49 | # because we're just loading a regular web page, we send a `get` request to the 50 | # URL, along with our informational headers 51 | webpage = requests.get(target_url, headers=headers) 52 | 53 | # opening up a local file to save the contents of the web page to 54 | output_file = open(output_filename,"w") 55 | 56 | # the web page's code is in the `text` property of the website's response 57 | # so write that to our file 58 | output_file.write(webpage.text) 59 | 60 | # close our output file! 
61 | output_file.close() 62 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/weekday_rides_refactored.py: -------------------------------------------------------------------------------- 1 | # Objective: Filter all September, 2020 Citi Bike rides, and output a new 2 | # file containing only weekday rides 3 | # Program Outline: 4 | # 1. Read in the data file: 202009-citibike-tripdata.csv 5 | # 2. Create a new output file, and write the header row to it. 6 | # 3. For each row in the file, make a date from the `starttime`: 7 | # a. if it's a weekday, write the row to our output file 8 | # 4. Close the output file 9 | # import the "csv" library 10 | import csv 11 | # import the "datetime" library 12 | from datetime import datetime 13 | def main(): 14 | # open our data file in "read" mode 15 | source_file = open("202009-citibike-tripdata.csv","r") 16 | # open our output file in "write" mode 17 | output_file = open("202009-citibike-weekday-tripdata.csv","w") 18 | # pass our source_file to the DictReader "recipe" 19 | # and store the result in a variable called `citibike_reader` 20 | citibike_reader = csv.DictReader(source_file) 21 | # create a corresponding DictWriter; specify its fieldnames should 22 | # be drawn from `citibike_reader` 23 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames) 24 | # actually write the header row to the output file 25 | output_writer.writeheader() 26 | # loop through our `citibike_reader` rows 27 | for a_row in citibike_reader: 28 | # if the current 'starttime' value is a weekday 29 | if is_weekday(a_row['starttime']): 30 | # write that row of data to our output file 31 | output_writer.writerow(a_row) 32 | # close the output file 33 | output_file.close() 34 | 35 | 36 | def is_weekday(date_string, date_format='%Y-%m-%d %H:%M:%S.%f'): 37 | 38 | # convert the value in the 'date_string' to datetime format 39 | the_date = datetime.strptime(date_string, date_format) 40 | 41 | # if `the_date` is a weekday (i.e., its integer value is 0-4) 42 | if the_date.weekday() <= 4: 43 | return(True) 44 | else: 45 | return(False) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/xls_meta_and_date_parsing_refactored.py: -------------------------------------------------------------------------------- 1 | # Converting data in an .xls file with Python to csv + metadata file, with 2 | # functional date values using the "xlrd" library.
3 | 4 | # First, pip install the xlrd library: 5 | # https://pypi.org/project/xlrd/2.0.1/ 6 | # then, import the `xlrd` library 7 | import xlrd 8 | 9 | # import the csv library 10 | import csv 11 | 12 | # needed to test if a given value is *some* type of number 13 | from numbers import Number 14 | 15 | # for parsing/formatting our newly interpreted Excel dates 16 | from datetime import datetime 17 | 18 | def main(): 19 | # use `open_workbook()` to load our data in the `source_workbook` variable 20 | source_workbook = xlrd.open_workbook("fredgraph.xls") 21 | 22 | global the_datemode 23 | the_datemode = source_workbook.datemode 24 | 25 | # open and name a simple metadata text file 26 | source_workbook_metadata = open("fredgraph_metadata.txt","w") 27 | 28 | # an `.xls` workbook can have multiple sheets 29 | for sheet_name in source_workbook.sheet_names(): 30 | 31 | # create a variable that points to the current worksheet 32 | current_sheet = source_workbook.sheet_by_name(sheet_name) 33 | 34 | # create "xls_"+sheet_name+".csv" as current sheet's output file 35 | output_file = open("xls_"+sheet_name+"_dates.csv","w") 36 | 37 | # use the `writer()` recipe to write `.csv`-formatted rows 38 | output_writer = csv.writer(output_file) 39 | 40 | # Boolean variable to detect if we've hit our table-type data yet 41 | is_table_data = False 42 | 43 | # now, we need to loop through every row in our sheet 44 | for row_num, row in enumerate(current_sheet.get_rows()): 45 | 46 | # pulling out the value in the first column of the current row 47 | first_entry = current_sheet.row_values(row_num)[0] 48 | 49 | # if we've hit the header row of our data table 50 | if first_entry == 'observation_date': 51 | 52 | # it's time to switch our "flag" value to "True" 53 | is_table_data = True 54 | 55 | # if `is_table_data` is True 56 | if is_table_data: 57 | 58 | # pass the requisite data to out `create_table_row()` function 59 | new_row = create_table_row(current_sheet, row_num) 60 | 61 | # write this new row to the data output file 62 | output_writer.writerow(new_row) 63 | 64 | # otherwise, this row must be metadata 65 | else: 66 | 67 | # pass the requisite data to our `create_meta_text()` function 68 | metadata_line = create_meta_text(current_sheet, row_num) 69 | 70 | # write this new row to the metadata output file 71 | source_workbook_metadata.write(metadata_line) 72 | 73 | # just for good measure, let's close our output files 74 | output_file.close() 75 | source_workbook_metadata.close() 76 | 77 | def create_table_row(the_sheet, the_row_num): 78 | 79 | # extract the table-type data values into separate variables 80 | the_date_num = the_sheet.row_values(the_row_num)[0] 81 | U6_value = the_sheet.row_values(the_row_num)[1] 82 | 83 | # create a new row object with each of the values 84 | new_row = [the_date_num, U6_value] 85 | 86 | # if the `the_date_num` is a number, then the current row is *not* 87 | # the header row. We need to transform the date. 
88 | if isinstance(the_date_num, Number): 89 | 90 | # use the xlrd library's `xldate_as_datetime()` to generate 91 | # a Python datetime object 92 | the_date_num = xlrd.xldate.xldate_as_datetime(the_date_num, the_datemode) 93 | 94 | # create a new list containing `the_date_num` (formatted to MM/DD/YYYY 95 | # using the `strftime()` recipe) and the value in the second column 96 | new_row = [the_date_num.strftime('%m/%d/%Y'),U6_value] 97 | 98 | # return the fully formatted row 99 | return(new_row) 100 | 101 | 102 | def create_meta_text(the_sheet, the_row_num): 103 | 104 | meta_line = "" 105 | 106 | # since we'd like our metadata file to be nicely formatted, we 107 | # need to loop through the individual cells of each metadata row 108 | for item in the_sheet.row(the_row_num): 109 | # write the value of the cell, followed by a tab character 110 | meta_line = meta_line + item.value + '\t' 111 | 112 | # at the end of each line of metadata, add a newline 113 | meta_line = meta_line+'\n' 114 | 115 | # return the fully formatted line 116 | return(meta_line) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /chapter_8_examples/standalone_files/xls_meta_and_date_parsing_refactored_again.py: -------------------------------------------------------------------------------- 1 | # Converting data in an .xls file with Python to csv + metadata file, with 2 | # functional date values using the "xrld" library. 3 | 4 | # First, pip install the xlrd library: 5 | # https://pypi.org/project/xlrd/2.0.1/ 6 | # then, import the `xlrd` library 7 | import xlrd 8 | 9 | # import the csv library 10 | import csv 11 | 12 | # needed to test if a given value is *some* type of number 13 | from numbers import Number 14 | 15 | # for parsing/formatting our newly interpreted Excel dates 16 | from datetime import datetime 17 | 18 | def main(): 19 | 20 | # use `open_workbook()` to load our data in the `source_workbook` variable 21 | source_workbook = xlrd.open_workbook("fredgraph.xls") 22 | 23 | # open and name a simple metadata text file 24 | source_workbook_metadata = open("fredgraph_metadata.txt","w") 25 | 26 | # an `.xls` workbook can have multiple sheets 27 | for sheet_name in source_workbook.sheet_names(): 28 | 29 | # create a variable that points to the current worksheet 30 | current_sheet = source_workbook.sheet_by_name(sheet_name) 31 | 32 | # create "xls_"+sheet_name+".csv" as the current sheet's output file 33 | output_file = open("xls_"+sheet_name+"_dates.csv","w") 34 | 35 | # use the `writer()` recipe to write `.csv`-formatted rows 36 | output_writer = csv.writer(output_file) 37 | 38 | # Boolean variable to detect if we've hit our table-type data yet 39 | is_table_data = False 40 | 41 | # now, we need to loop through every row in our sheet 42 | for row_num, row in enumerate(current_sheet.get_rows()): 43 | 44 | # pulling out the value in the first column of the current row 45 | first_entry = current_sheet.row_values(row_num)[0] 46 | 47 | # if we've hit the header row of our data table 48 | if first_entry == 'observation_date': 49 | # it's time to switch our "flag" value to "True" 50 | is_table_data = True 51 | 52 | # if `is_table_data` is True 53 | if is_table_data: 54 | 55 | # extract the table-type data values into separate variables 56 | the_date_num = current_sheet.row_values(row_num)[0] 57 | U6_value = current_sheet.row_values(row_num)[1] 58 | 59 | # if the value is a number, then the current row is *not* 60 | # the header 
row, so transform the date 61 | if isinstance(the_date_num, Number): 62 | the_date_num = format_excel_date(the_date_num, 63 | source_workbook.datemode) 64 | 65 | # write this new row to the data output file 66 | output_writer.writerow([the_date_num, U6_value]) 67 | 68 | # otherwise, this row must be metadata 69 | else: 70 | 71 | # pass the requisite data to our `create_meta_text()` function 72 | metadata_line = create_meta_text(current_sheet, row_num) 73 | 74 | # write this new row to the metadata output file 75 | source_workbook_metadata.write(metadata_line) 76 | 77 | # just for good measure, let's close our output files 78 | output_file.close() 79 | source_workbook_metadata.close() 80 | 81 | 82 | def format_excel_date(a_date_num, the_datemode): 83 | 84 | # use the xlrd library's `xldate_as_datetime()` to generate 85 | # a Python datetime object 86 | a_date_num = xlrd.xldate.xldate_as_datetime(a_date_num, the_datemode) 87 | 88 | # create a new list containing the_date_num (formatted to MM/DD/YYYY 89 | # using the `strftime()` recipe) and the value in the second column 90 | formatted_date = a_date_num.strftime('%m/%d/%Y') 91 | 92 | return(formatted_date) 93 | 94 | def create_meta_text(the_sheet, the_row_num): 95 | 96 | meta_line = "" 97 | 98 | # since we'd like our metadata file to be nicely formatted, we 99 | # need to loop through the individual cells of each metadata row 100 | for item in the_sheet.row(the_row_num): 101 | 102 | # write the value of the cell, followed by a tab character 103 | meta_line = meta_line + item.value + '\t' 104 | 105 | # at the end of each line of metadata, add a newline 106 | meta_line = meta_line+'\n' 107 | 108 | return(meta_line) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /chapter_9_examples/jupyter_notebooks/ppp_loan_central_measures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# `pandas` for reading and assessing our data\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "# `seaborn` for its built-in themes and chart types\n", 13 | "import seaborn as sns\n", 14 | "\n", 15 | "# `matplotlib` for customizing visual details\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "%matplotlib notebook" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 27 | "# # Import PyDrive and associated libraries.\n", 28 | "# # This only needs to be done once per notebook.\n", 29 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 30 | "# from pydrive.auth import GoogleAuth\n", 31 | "# from pydrive.drive import GoogleDrive\n", 32 | "# from google.colab import auth\n", 33 | "# from oauth2client.client import GoogleCredentials\n", 34 | "\n", 35 | "# # Authenticate and create the PyDrive client.\n", 36 | "# # This only needs to be done once per notebook.\n", 37 | "# auth.authenticate_user()\n", 38 | "# gauth = GoogleAuth()\n", 39 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 40 | "# drive = GoogleDrive(gauth)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# # UNCOMMENT BELOW TO USE 
WITH GOOGLE COLAB\n", 50 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1z6XEYE8Qg2gxkwotc1htbx2Maxuveg7-/view?usp=sharing\n", 51 | "# file_id = '1z6XEYE8Qg2gxkwotc1htbx2Maxuveg7-' # notice where this string comes from in link above\n", 52 | "\n", 53 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 54 | "# print(imported_file['title']) # it should print the title of desired file\n", 55 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# read in our data\n", 65 | "ppp_data = pd.read_csv('public_150k_plus_221.csv')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# set a basic color theme for our visualization\n", 75 | "sns.set_theme(style=\"whitegrid\")\n", 76 | "\n", 77 | "# use the built-in `mean()` and `median()` methods in `pandas\n", 78 | "mean = ppp_data['CurrentApprovalAmount'].mean()\n", 79 | "median = ppp_data['CurrentApprovalAmount'].median()\n", 80 | "\n", 81 | "# create a histogram of the values in the `CurrentApprovalAmount` column\n", 82 | "approved_loan_plot = sns.histplot(data=ppp_data, x=\"CurrentApprovalAmount\")\n", 83 | "\n", 84 | "# get the min and max y-values on our histogram\n", 85 | "y_axis_range = approved_loan_plot.get_ylim()\n", 86 | "\n", 87 | "# add the vertical lines at the correct locations\n", 88 | "approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='crimson', ls=':')\n", 89 | "approved_loan_plot.vlines(median, 0, y_axis_range[1], color='green', ls='-')\n", 90 | "\n", 91 | "# the matplotlib `show()` method actually renders the visualization\n", 92 | "plt.show()" 93 | ] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.9.5" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /chapter_9_examples/jupyter_notebooks/wing_length_with_sd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# `pandas` to read in our data\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "# `seaborn` for built-in themes and chart types\n", 13 | "import seaborn as sns\n", 14 | "\n", 15 | "# `matplotlib` for customizing visual details\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "%matplotlib notebook\n", 18 | "\n", 19 | "# `statistics` easily calculating statistical measures\n", 20 | "import statistics\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 30 | "# # Import PyDrive and associated libraries.\n", 31 | "# # This only needs to be done once per notebook.\n", 32 
| "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n", 33 | "# from pydrive.auth import GoogleAuth\n", 34 | "# from pydrive.drive import GoogleDrive\n", 35 | "# from google.colab import auth\n", 36 | "# from oauth2client.client import GoogleCredentials\n", 37 | "\n", 38 | "# # Authenticate and create the PyDrive client.\n", 39 | "# # This only needs to be done once per notebook.\n", 40 | "# auth.authenticate_user()\n", 41 | "# gauth = GoogleAuth()\n", 42 | "# gauth.credentials = GoogleCredentials.get_application_default()\n", 43 | "# drive = GoogleDrive(gauth)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n", 53 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1gyWJhIbMNnDlI1SCyLl_pZkxkIcdAhd4/view?usp=sharing\n", 54 | "# file_id = '1gyWJhIbMNnDlI1SCyLl_pZkxkIcdAhd4' # notice where this string comes from in link above\n", 55 | "\n", 56 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n", 57 | "# print(imported_file['title']) # it should print the title of desired file\n", 58 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# read in our data\n", 68 | "wing_data = pd.read_csv('wing_length - s057.csv')\n", 69 | "\n", 70 | "# set a basic color theme for our visualization\n", 71 | "sns.set_theme(style=\"white\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# create the histogram, allowing `seaborn` to choose default \"bin\" values\n", 81 | "wing_plot = sns.histplot(data=wing_data, x=\"wing_length (0.1mm)\", kde=\"True\")\n", 82 | "\n", 83 | "# calculate the standard deviation via the `statistics` `stdev()` method\n", 84 | "sd = statistics.stdev(wing_data['wing_length (0.1mm)'])\n", 85 | "\n", 86 | "# get the min and max y-values on our histogram\n", 87 | "y_axis_range = wing_plot.get_ylim()\n", 88 | "\n", 89 | "# plot the mean as a solid line\n", 90 | "mean = wing_data['wing_length (0.1mm)'].mean()\n", 91 | "wing_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-')\n", 92 | "\n", 93 | "# plot the three standard deviation boundary lines on either side of the mean\n", 94 | "for i in range(-3,4):\n", 95 | "\n", 96 | " # find the current boundary value\n", 97 | " z_value = mean + (i*sd)\n", 98 | "\n", 99 | " # don't draw a second line over the mean line\n", 100 | " if z_value != mean:\n", 101 | "\n", 102 | " # plot a dotted gray line at each boundary value\n", 103 | " wing_plot.vlines(z_value, 0, y_axis_range[1], color='gray', ls=':')\n", 104 | "\n", 105 | "# show the plot!\n", 106 | "plt.show()" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3 (ipykernel)", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.9.5" 127 | } 128 | }, 129 | "nbformat": 4, 130 | 
"nbformat_minor": 4 131 | } 132 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/dollars_per_job_2M_rnd2.py: -------------------------------------------------------------------------------- 1 | # `pandas` for data loading/transformations 2 | import pandas as pd 3 | 4 | # `seaborn` for visualization 5 | import seaborn as sns 6 | 7 | # `matplotlib` for customizing visuals 8 | import matplotlib.pyplot as plt 9 | 10 | # `numpy` for manipulating arrays/lists 11 | import numpy as np 12 | 13 | # load our data 14 | ppp_data = pd.read_csv('public_150k_plus_borrower_fingerprint_a.csv') 15 | 16 | # first, sanity check our data 17 | print(ppp_data[ppp_data['JobsReported'] <= 0]) 18 | 19 | # drop the records with no value in `JobsReported` 20 | ppp_data.drop(labels=[437083,765398], axis=0) 21 | 22 | # calculate the dollars per job 23 | dollars_per_job = ppp_data['CurrentApprovalAmount']/ppp_data['JobsReported'] 24 | 25 | # insert the new column into our original dataset 26 | ppp_data.insert(3, 'Dollars per Job', dollars_per_job) 27 | 28 | # use `ProcessingMethod` value to identify second-round loans 29 | pps_loans = ppp_data[ppp_data['ProcessingMethod'] == 'PPS'] 30 | 31 | # select all second-round loans that have a value of $2M 32 | pps_got_2M = pps_loans[pps_loans['CurrentApprovalAmount'] == 2000000.00] 33 | print("Actual $2M second-round loans:") 34 | print(pps_got_2M.shape) 35 | 36 | # pull fingerprints of businesses approved for $2M second-round loans 37 | biz_names = pd.unique(pps_got_2M['BorrowerNameFingerprint']) 38 | 39 | # convert that list to a DataFrame 40 | biz_names_df = pd.DataFrame(biz_names, columns=['BorrowerNameFingerprint']) 41 | 42 | # create an array of the same length as `biz_names_df`; fill with flag value 43 | fill_column = np.full((len(biz_names),1), '2Mil2ndRnd') 44 | biz_names_df['GotSecond'] = fill_column 45 | 46 | # now merge this new, two-column DataFrame back onto our full_data list 47 | second_round_max = pd.merge(ppp_data, biz_names_df, on='BorrowerNameFingerprint') 48 | 49 | # all loans whose fingerprints match those of businesses that got $2M 50 | # in the second round should have `2Mil2ndRnd` in the `GotSecond` column 51 | second_max_all_loans = second_round_max[ 52 | second_round_max['GotSecond'] == '2Mil2ndRnd'] 53 | 54 | # sbould be 2x the number of businesses approved for $2M second-round 55 | print('Total # of loans approved for most orgs that got $2M for second round:') 56 | print(second_max_all_loans.shape) 57 | 58 | # how much money were these businesses approved to get from the PPP, total? 59 | total_funds = second_max_all_loans['CurrentApprovalAmount'].sum() 60 | print("Total funds approved for identified orgs that could have " + \ 61 | "second-round max:") 62 | print(total_funds) 63 | 64 | # now, let's plot that new column on our selected dataset 65 | # set the seaborn theme 66 | sns.set_theme(style="whitegrid") 67 | 68 | # the `matplotlib` `subplots()` to plot charts side by side 69 | fig, ((row1col1)) = plt.subplots(nrows=1, ncols=1) 70 | 71 | # plot the histogram of our date-based analysis 72 | date_based = sns.histplot(data=second_max_all_loans, x='Dollars per Job', 73 | hue='ProcessingMethod', ax=row1col1) 74 | 75 | # show the plots! 
76 | plt.show() 77 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/ppp_fingerprint_borrowers.py: -------------------------------------------------------------------------------- 1 | # Quick script for adding a "fingerprint" column to our loan data, which will 2 | # help us confirm/correct for any typos or inconsistencies in, e.g. borrower 3 | # name and address information 4 | 5 | # import the csv library 6 | import csv 7 | 8 | # importing the `fingerprints` library 9 | import fingerprints 10 | 11 | # read the recent data sample into a variable 12 | ppp_data = open('public_150k_plus_221.csv','r') 13 | 14 | # the DictReader function has added useful information to our data, 15 | # like a label that shows us all the values in the first or "header" row 16 | ppp_data_reader = csv.DictReader(ppp_data) 17 | 18 | # create an output file to write our modified data set to 19 | augmented_ppp_data = open('public_150k_plus_borrower_fingerprint_a.csv','w') 20 | 21 | # create a "writer" so that we can output whole rows at once 22 | augmented_data_writer = csv.writer(augmented_ppp_data) 23 | 24 | # because we're adding a column, we need to create a new header row as well 25 | header_row = [] 26 | 27 | # for every column header 28 | for item in ppp_data_reader.fieldnames: 29 | 30 | # append the existing column header 31 | header_row.append(item) 32 | 33 | # if we're at 'BorrowerName' 34 | if item == 'BorrowerName': 35 | 36 | # it's time to add a new one! 37 | header_row.append('BorrowerNameFingerprint') 38 | 39 | # write the completed header row to the output file 40 | augmented_data_writer.writerow(header_row) 41 | 42 | # iterate through each row in the data 43 | for row in ppp_data_reader: 44 | 45 | # adding a column means we need to build the new row of data 46 | # item by item, just as we did with the header row 47 | new_row = [] 48 | 49 | # for each column of data in the *original* data set 50 | for column_name in ppp_data_reader.fieldnames: 51 | 52 | # first, append this row's value for that column 53 | new_row.append(row[column_name]) 54 | 55 | # when we get to the 'BorrowerName' column, it's time 56 | # to add our new "fingerprint" value 57 | if column_name == 'BorrowerName': 58 | 59 | # our fingerprint will consist of the generated fingerprint PLUS 60 | # the BorrowerCity fingerprint and the BorrowerState 61 | try: 62 | the_fingerprint = fingerprints.generate(row[column_name]) +" "+ fingerprints.generate(row['BorrowerCity'])+" "+row['BorrowerState'] 63 | except(TypeError): 64 | the_fingerprint = fingerprints.generate("MISSING") +" "+ fingerprints.generate(row['BorrowerCity'])+" "+row['BorrowerState'] 65 | 66 | new_row.append(the_fingerprint) 67 | 68 | # once the whole row is complete, write it to our output file 69 | augmented_data_writer.writerow(new_row) 70 | 71 | # close both files 72 | augmented_ppp_data.close() 73 | ppp_data.close() 74 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/ppp_loan_central_and_dist.py: -------------------------------------------------------------------------------- 1 | # `pandas` for reading and assessing our data 2 | import pandas as pd 3 | 4 | # `seaborn` for its built-in themes and chart types 5 | import seaborn as sns 6 | 7 | # `matplotlib` for customizing visual details 8 | import matplotlib.pyplot as plt 9 | 10 | # read in our data 11 | ppp_data = pd.read_csv('public_150k_plus_221.csv') 12 | 13 | # set a basic color theme for our visualization 14
| sns.set_theme(style="whitegrid") 15 | 16 | # use the built-in `mean()` and `median()` methods in `pandas 17 | mean = ppp_data['CurrentApprovalAmount'].mean() 18 | median = ppp_data['CurrentApprovalAmount'].median() 19 | 20 | # Q1 is the value at the position in our dataset 21 | # that has 25% of data readings to its left 22 | Q1 = ppp_data['CurrentApprovalAmount'].quantile(0.25) 23 | 24 | # Q3 is the value at the position in our dataset 25 | # that has 75% of data readings to its left 26 | Q3 = ppp_data['CurrentApprovalAmount'].quantile(0.75) 27 | 28 | # IQR is the difference between the Q3 and Q1 values 29 | IQR = Q3-Q1 30 | 31 | # and now we calculate our lower and upper bounds 32 | lower_bound = Q1 - (1.5*IQR) 33 | upper_bound = Q3 + (1.5*IQR) 34 | 35 | # use `seaborn` to plot the histogram 36 | approved_loan_plot = sns.histplot(data=ppp_data, x="CurrentApprovalAmount") 37 | 38 | # get the min and max y-values on our histogram 39 | y_axis_range = approved_loan_plot.get_ylim() 40 | 41 | # add mean line in gray 42 | approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-') 43 | 44 | # other lines in black (median solid, others dotted) 45 | approved_loan_plot.vlines(median, 0, y_axis_range[1], color='black', ls='-') 46 | approved_loan_plot.vlines(lower_bound, 0, y_axis_range[1], color='black', ls=':') 47 | approved_loan_plot.vlines(Q1, 0, y_axis_range[1], color='black', ls=':') 48 | approved_loan_plot.vlines(Q3, 0, y_axis_range[1], color='black', ls=':') 49 | approved_loan_plot.vlines(upper_bound, 0, y_axis_range[1], color='black', ls=':') 50 | 51 | # show the plot! 52 | plt.show() 53 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/ppp_loan_central_measures.py: -------------------------------------------------------------------------------- 1 | # `pandas` for reading and assessing our data 2 | import pandas as pd 3 | 4 | # `seaborn` for its built-in themes and chart types 5 | import seaborn as sns 6 | 7 | # `matplotlib` for customizing visual details 8 | import matplotlib.pyplot as plt 9 | 10 | # read in our data 11 | ppp_data = pd.read_csv('public_150k_plus_221.csv') 12 | 13 | # set a basic color theme for our visualization 14 | sns.set_theme(style="whitegrid") 15 | 16 | # use the built-in `mean()` and `median()` methods in `pandas 17 | mean = ppp_data['CurrentApprovalAmount'].mean() 18 | median = ppp_data['CurrentApprovalAmount'].median() 19 | 20 | # create a histogram of the values in the `CurrentApprovalAmount` column 21 | approved_loan_plot = sns.histplot(data=ppp_data, x="CurrentApprovalAmount") 22 | 23 | # get the min and max y-values on our histogram 24 | y_axis_range = approved_loan_plot.get_ylim() 25 | 26 | # add the vertical lines at the correct locations 27 | approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='crimson', ls=':') 28 | approved_loan_plot.vlines(median, 0, y_axis_range[1], color='green', ls='-') 29 | 30 | # the matplotlib `show()` method actually renders the visualization 31 | plt.show() 32 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/who_got_2_loans_by_date.py: -------------------------------------------------------------------------------- 1 | # `pandas` for data loading/transformations 2 | import pandas as pd 3 | 4 | # `seaborn` for visualization 5 | import seaborn as sns 6 | 7 | # `matplotlib` for detailed visualization support 8 | import matplotlib.pyplot as plt 9 | 10 | # `numpy` for 
manipulating arrays/lists 11 | import numpy as np 12 | 13 | # load our data 14 | ppp_data = pd.read_csv('public_150k_plus_borrower_fingerprint_a.csv') 15 | 16 | # convert the `DateApproved` column to an actual datetime data type 17 | ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved']) 18 | 19 | # create a variable to hold the second-round start date 20 | second_round_start = pd.to_datetime('2021-01-13') 21 | 22 | # treat today's date to use as the "upper" limit on possible second-round loans 23 | todays_date = pd.to_datetime('today') 24 | 25 | # use 1/1/2020 as a "lower" limit, since it's before the PPP launched 26 | program_start = pd.to_datetime('2020-01-01') 27 | 28 | # pass our boundaries and category labels to the pandas `cut()` function 29 | loan_round = pd.cut(ppp_data.DateApproved, 30 | bins=[program_start,second_round_start, todays_date], 31 | labels=['first_round', 'maybe_second']) 32 | 33 | # insert the new column at the position we specify 34 | ppp_data.insert(2,'Loan Round',loan_round) 35 | 36 | # this "pivot table" will return a Series showing the number 37 | # of times a particular 'BorrowerNameFingerprint' appears in the dataset 38 | loan_count = ppp_data.pivot_table(index=['BorrowerNameFingerprint'], aggfunc='size') 39 | 40 | # convert our Series to a DataFrame and give it a name 41 | loan_count_df = loan_count.to_frame('Loan Count') 42 | 43 | # use the `describe()` method to print out summary statistics 44 | print("Description of duplicate borrower table:") 45 | print(loan_count_df.describe()) 46 | 47 | # start by sorting our DataFrame of loan counts from greatest to least 48 | sorted_loan_counts = loan_count_df.sort_values(by=['Loan Count'], ascending=False) 49 | 50 | # create a new DataFrame with *only* those that have more than two loans 51 | more_than_two = sorted_loan_counts[sorted_loan_counts['Loan Count'] > 2] 52 | 53 | # print one instance of each business name that appears in `more_than_two` 54 | print("Businesses that seem to have gotten more than 2 loans:") 55 | print(more_than_two.shape) 56 | 57 | print("Number of businesses that appear to have gotten precisely 2 loans:") 58 | 59 | precisely_two = sorted_loan_counts[sorted_loan_counts['Loan Count'] == 2] 60 | 61 | print(precisely_two.shape) 62 | 63 | # use `ProcessingMethod` value to identify second-round loans 64 | pps_loans = ppp_data[ppp_data['ProcessingMethod'] == 'PPS'] 65 | 66 | # print out the `shape` of this DataFrame to see how many businesses we have 67 | print("Number of loans labeled as second round:") 68 | print(pps_loans.shape) 69 | 70 | # how many loans in our derived data frame were approved for precisely $2M 71 | # during the (possibly) second-round timeframe? 
72 | # merge our `loan_count_df` back to keep track of businesses 73 | # we labeled as having precisely two loans 74 | ppp_data_w_lc = pd.merge(ppp_data, loan_count_df, 75 | on=['BorrowerNameFingerprint'], how='left') 76 | 77 | # now get *all* the records of business names we associated with two loans 78 | matched_two_loans = ppp_data_w_lc[(ppp_data_w_lc['Loan Count'] == 2)] 79 | 80 | # select those loans our `maybe_second` loans that have a value of $2M 81 | maybe_round2_2M = matched_two_loans[(matched_two_loans[ 82 | 'CurrentApprovalAmount'] == 2000000.00) & 83 | (matched_two_loans[ 84 | 'Loan Round'] == 'maybe_second')] 85 | print("Derived $2M second-round loans:") 86 | print(maybe_round2_2M.shape) 87 | 88 | # select those loans that we *know* are second round and have a value of $2M 89 | pps_got_2M = pps_loans[pps_loans['CurrentApprovalAmount'] == 2000000.00] 90 | print("Actual $2M second-round loans:") 91 | print(pps_got_2M.shape) 92 | -------------------------------------------------------------------------------- /chapter_9_examples/standalone_files/wing_length_with_sd.py: -------------------------------------------------------------------------------- 1 | # `pandas` to read in our data 2 | import pandas as pd 3 | 4 | # `seaborn` for built-in themes and chart types 5 | import seaborn as sns 6 | 7 | # `matplotlib` for customizing visual details 8 | import matplotlib.pyplot as plt 9 | 10 | # `statistics` easily calculating statistical measures 11 | import statistics 12 | 13 | # read in our data 14 | wing_data = pd.read_csv('wing_length - s057.csv') 15 | 16 | # set a basic color theme for our visualization 17 | sns.set_theme(style="white") 18 | 19 | # create the histogram, allowing `seaborn` to choose default "bin" values 20 | wing_plot = sns.histplot(data=wing_data, x="wing_length (0.1mm)", kde="True") 21 | 22 | # calculate the standard deviation via the `statistics` `stdev()` method 23 | sd = statistics.stdev(wing_data['wing_length (0.1mm)']) 24 | 25 | # get the min and max y-values on our histogram 26 | y_axis_range = wing_plot.get_ylim() 27 | 28 | # plot the mean as a solid line 29 | mean = wing_data['wing_length (0.1mm)'].mean() 30 | wing_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-') 31 | 32 | # plot the three standard deviation boundary lines on either side of the mean 33 | for i in range(-3,4): 34 | 35 | # find the current boundary value 36 | z_value = mean + (i*sd) 37 | 38 | # don't draw a second line over the mean line 39 | if z_value != mean: 40 | 41 | # plot a dotted gray line at each boundary value 42 | wing_plot.vlines(z_value, 0, y_axis_range[1], color='gray', ls=':') 43 | 44 | # show the plot! 45 | plt.show() 46 | --------------------------------------------------------------------------------
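The outlier bounds drawn in ppp_loan_central_and_dist.py above follow the standard interquartile-range rule: lower bound = Q1 - 1.5*IQR and upper bound = Q3 + 1.5*IQR. Below is a minimal, self-contained sketch of that calculation; the loan amounts are invented for illustration and are not drawn from the PPP data.

# minimal sketch of the IQR-based outlier bounds used in
# chapter_9_examples/standalone_files/ppp_loan_central_and_dist.py
# (the amounts below are invented for illustration, not taken from the PPP data)
import pandas as pd

amounts = pd.Series([150000, 210000, 350000, 500000, 900000, 2000000])

# Q1/Q3 are the values with 25%/75% of the readings to their left
Q1 = amounts.quantile(0.25)
Q3 = amounts.quantile(0.75)

# the interquartile range is the distance between them
IQR = Q3 - Q1

# readings outside these bounds are conventionally treated as outliers
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

print(lower_bound, upper_bound)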