├── .gitignore
├── LICENSE
├── README.md
├── chapter_10_examples
│   ├── jupyter_notebooks
│   │   ├── a_humble_pie.ipynb
│   │   ├── covid_FDI_impact.ipynb
│   │   ├── refined_covid_barchart.ipynb
│   │   ├── retirement_age.ipynb
│   │   └── schools_that_work.ipynb
│   └── standalone_files
│       ├── a_humble_pie.py
│       ├── covid_FDI_impact.py
│       ├── refined_covid_barchart-bak.py
│       ├── refined_covid_barchart.py
│       ├── retirement_age.py
│       └── schools_that_work.py
├── chapter_1_examples
│   ├── jupyter_notebooks
│   │   └── hello_world.ipynb
│   └── standalone_files
│       └── hello_world.py
├── chapter_2_examples
│   ├── jupyter_notebooks
│   │   ├── basic_greeting.ipynb
│   │   ├── greet_me.ipynb
│   │   ├── hitting_the_road_with_citibike.ipynb
│   │   ├── method_madness.ipynb
│   │   ├── noun_examples.ipynb
│   │   ├── page_count_conditional.ipynb
│   │   ├── page_count_custom_function.ipynb
│   │   ├── page_count_loop.ipynb
│   │   ├── page_count_printout.ipynb
│   │   └── parts_of_speech.ipynb
│   └── standalone_files
│       ├── basic_greeting.py
│       ├── greet_me.py
│       ├── hitting_the_road_with_citibike.py
│       ├── method_madness.py
│       ├── noun_examples.py
│       ├── page_count_conditional.py
│       ├── page_count_custom_function.py
│       ├── page_count_loop.py
│       ├── page_count_printout.py
│       └── parts_of_speech.py
├── chapter_4_examples
│   ├── jupyter_notebooks
│   │   ├── csv_parsing.ipynb
│   │   ├── fixed_width_parsing.ipynb
│   │   ├── json_parsing.ipynb
│   │   ├── ods_parsing.ipynb
│   │   ├── pdf_parsing.ipynb
│   │   ├── rss_parsing.ipynb
│   │   ├── tsv_parsing.ipynb
│   │   ├── txt_parsing.ipynb
│   │   ├── xls_parsing.ipynb
│   │   ├── xlsx_parsing.ipynb
│   │   └── xml_parsing.ipynb
│   └── standalone_files
│       ├── csv_parsing.py
│       ├── fixed_width_parsing.py
│       ├── json_parsing.py
│       ├── ods_parsing.py
│       ├── pdf_parsing.py
│       ├── rss_parsing.py
│       ├── tsv_parsing.py
│       ├── txt_parsing.py
│       ├── xls_parsing.py
│       ├── xlsx_parsing.py
│       └── xml_parsing.py
├── chapter_5_examples
│   ├── .gitignore
│   ├── jupyter_notebooks
│   │   ├── FRED_API_example.ipynb
│   │   ├── MTA_turnstiles_data_download.ipynb
│   │   ├── MTA_turnstiles_index.ipynb
│   │   ├── MTA_turnstiles_parsing.ipynb
│   │   ├── Twitter_data_download.ipynb
│   │   └── data_download.ipynb
│   └── standalone_files
│       ├── FRED_API_example.py
│       ├── MTA_turnstile_index.py
│       ├── MTA_turnstiles_data_download.py
│       ├── MTA_turnstiles_parsing.py
│       ├── Twitter_data_download.py
│       └── data_download.py
├── chapter_6_examples
│   ├── jupyter_notebooks
│   │   ├── ppp_columns_review.ipynb
│   │   ├── ppp_columns_summary.ipynb
│   │   ├── ppp_data_join.ipynb
│   │   ├── ppp_data_samples.ipynb
│   │   ├── ppp_date_range.ipynb
│   │   ├── ppp_find_waterford.ipynb
│   │   ├── ppp_lender_names.ipynb
│   │   ├── ppp_loan_status.ipynb
│   │   ├── ppp_loan_uses.ipynb
│   │   ├── ppp_min_max_loan.ipynb
│   │   └── ppp_numrows.ipynb
│   └── standalone_files
│       ├── ppp_columns_review.py
│       ├── ppp_columns_summary.py
│       ├── ppp_data_join.py
│       ├── ppp_data_samples.py
│       ├── ppp_date_range.py
│       ├── ppp_find_waterford.py
│       ├── ppp_lender_names.py
│       ├── ppp_loan_status.py
│       ├── ppp_loan_uses.py
│       ├── ppp_min_max_loan.py
│       └── ppp_numrows.py
├── chapter_7_examples
│   ├── jupyter_notebooks
│   │   ├── citibike_september1_rides.ipynb
│   │   ├── fixed_width_strip_parsing.ipynb
│   │   ├── ppp_add_fingerprints.ipynb
│   │   ├── ppp_adding_naics.ipynb
│   │   ├── regex_tests.ipynb
│   │   ├── weekday_rides.ipynb
│   │   ├── xls_meta_and_date_parsing.ipynb
│   │   └── xls_meta_parsing.ipynb
│   └── standalone_files
│       ├── citibike_september1_rides.py
│       ├── fixed_width_strip_parsing.py
│       ├── ppp_add_fingerprints.py
│       ├── ppp_adding_naics.py
│       ├── regex_tests.py
│       ├── weekday_rides.py
│       ├── xls_meta_and_date_parsing.py
│       └── xls_meta_parsing.py
├── chapter_8_examples
│   ├── jupyter_notebooks
│   │   ├── fixed_width_string_parsing_refactored.ipynb
│   │   ├── greet_me_options.ipynb
│   │   ├── greet_me_revisited.ipynb
│   │   ├── make_greeting.ipynb
│   │   ├── make_greeting_no_vars.ipynb
│   │   ├── webpage_saver.ipynb
│   │   ├── weekday_rides_refactored.ipynb
│   │   ├── xls_meta_and_date_parsing_refactored.ipynb
│   │   └── xls_meta_and_date_parsing_refactored_again.ipynb
│   └── standalone_files
│       ├── fixed_width_strip_parsing_refactored.py
│       ├── greet_me_options.py
│       ├── greet_me_revisited.py
│       ├── make_greeting.py
│       ├── make_greeting_no_vars.py
│       ├── webpage_saver.py
│       ├── weekday_rides_refactored.py
│       ├── xls_meta_and_date_parsing_refactored.py
│       └── xls_meta_and_date_parsing_refactored_again.py
└── chapter_9_examples
    ├── jupyter_notebooks
    │   ├── dollars_per_job_2M_rnd2.ipynb
    │   ├── ppp_loan_central_and_dist.ipynb
    │   ├── ppp_loan_central_measures.ipynb
    │   ├── who_got_2M_with_viz.ipynb
    │   ├── who_got_2_loans_by_date.ipynb
    │   └── wing_length_with_sd.ipynb
    └── standalone_files
        ├── dollars_per_job_2M_rnd2.py
        ├── ppp_fingerprint_borrowers.py
        ├── ppp_loan_central_and_dist.py
        ├── ppp_loan_central_measures.py
        ├── who_got_2M_with_viz.py
        ├── who_got_2_loans_by_date.py
        └── wing_length_with_sd.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | *checkpoint.ipynb
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Practical Python: Data Wrangling and Data Quality
2 |
3 | This repo contains draft coding exercises for the early-release version of the book _Practical Python: Data Wrangling and Data Quality_ to be published by O'Reilly Media in 2021.
4 |
5 | ## Before You Begin
6 |
7 | Below you will find an overview of this repo's contents, as well as important tips and information on how to use these files. In general, all exercises are accessible as standalone `.py` files, and as Jupyter Notebooks. The notebooks can either be downloaded to your device and run locally, or opened and run in Google Colab (https://colab.research.google.com/). The draft text of Chapter 1 includes basic instructions on how to get started with some of these tools; this text will be updated/completed before final publication.
8 |
9 | ### Working with data files
10 |
11 | Because data sets can often be quite large, the data sets for these exercises are available for download [here](https://drive.google.com/drive/folders/1q_dkJxfsCjeZjWH3Hs2WKWYFSTa7MsBn?usp=sharing).
12 |
13 | #### If you are working locally
14 | Data sets should be downloaded/copied into the same folder as the Python file or notebook that uses them, unless otherwise indicated.
15 |
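16 | For example, chapter 10's `refined_covid_barchart.py` loads `owid-covid-data.csv` with a relative path, so that file needs to sit in the same folder as the script. A minimal sketch of the pattern (assuming the file has already been downloaded from the link above):
17 | 
18 | ```python
19 | import pandas as pd
20 | 
21 | # the data file lives alongside this script/notebook, so a bare filename works
22 | vaccine_data = pd.read_csv("owid-covid-data.csv")
23 | print(vaccine_data.head())
24 | ```
25 | 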
--------------------------------------------------------------------------------
/chapter_10_examples/jupyter_notebooks/a_humble_pie.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import matplotlib.pyplot as plt"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# matplotlib works counterclockwise, so we need to essentially reverse\n",
19 | "# the order of our pie-value \"slices\"\n",
20 | "candidate_names = ['Adams', 'Wiley', 'Garcia', 'Yang', 'Others']\n",
21 | "\n",
22 | "candidate_names.reverse()\n",
23 | "\n",
24 | "vote_pct = [30.8, 21.3, 19.6, 12.2, 16.1]\n",
25 | "\n",
26 | "vote_pct.reverse()\n",
27 | "\n",
28 | "colors = ['#006d2c','#006d2c', '#006d2c', '#31a354','#74c476']\n",
29 | "\n",
30 | "colors.reverse()\n",
31 | "\n",
32 | "fig1, ax1 = plt.subplots()\n",
33 | "\n",
34 | "# by default, the starting axis is the x-axis; making this value 90 ensures\n",
35 | "# that it is a vertical line instead\n",
36 | "ax1.pie(vote_pct, labels=candidate_names, autopct='%.1f%%', startangle=90,\n",
37 | " colors=colors)\n",
38 | "\n",
39 | "ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
40 | "\n",
41 | "# show the plot!\n",
42 | "plt.show()"
43 | ]
44 | }
45 | ],
46 | "metadata": {
47 | "kernelspec": {
48 | "display_name": "Python 3 (ipykernel)",
49 | "language": "python",
50 | "name": "python3"
51 | },
52 | "language_info": {
53 | "codemirror_mode": {
54 | "name": "ipython",
55 | "version": 3
56 | },
57 | "file_extension": ".py",
58 | "mimetype": "text/x-python",
59 | "name": "python",
60 | "nbconvert_exporter": "python",
61 | "pygments_lexer": "ipython3",
62 | "version": "3.9.5"
63 | }
64 | },
65 | "nbformat": 4,
66 | "nbformat_minor": 4
67 | }
68 |
--------------------------------------------------------------------------------
/chapter_10_examples/jupyter_notebooks/covid_FDI_impact.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import matplotlib.pyplot as plt\n",
10 | "import pandas as pd\n",
11 | "import seaborn as sns\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# each individual array is a row of data\n",
22 | "FDI = np.array([[0.8, 0.7], [0.3, 0.6]])\n",
23 | "\n",
24 | "fdi_data = pd.DataFrame(data=FDI,\n",
25 | " columns=['Developed', 'Developing'])"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "ax = sns.lineplot(data=fdi_data)\n",
35 | "\n",
36 | "# show the plot!\n",
37 | "plt.show()"
38 | ]
39 | }
40 | ],
41 | "metadata": {
42 | "kernelspec": {
43 | "display_name": "Python 3 (ipykernel)",
44 | "language": "python",
45 | "name": "python3"
46 | },
47 | "language_info": {
48 | "codemirror_mode": {
49 | "name": "ipython",
50 | "version": 3
51 | },
52 | "file_extension": ".py",
53 | "mimetype": "text/x-python",
54 | "name": "python",
55 | "nbconvert_exporter": "python",
56 | "pygments_lexer": "ipython3",
57 | "version": "3.9.5"
58 | }
59 | },
60 | "nbformat": 4,
61 | "nbformat_minor": 4
62 | }
63 |
--------------------------------------------------------------------------------
/chapter_10_examples/jupyter_notebooks/retirement_age.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import matplotlib.pyplot as plt\n",
10 | "import pandas as pd\n",
11 | "import seaborn as sns\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# (abbreviated) list of countries\n",
22 | "countries = ['Japan', 'Iceland', 'Switzerland', 'France', 'Ireland', 'Germany',\n",
23 | " 'Italy', 'Belgium']\n",
24 | "\n",
25 | "# difference in years between official and actual retirement age\n",
26 | "retirement_gap = [9, 2, 2, -1, -2, -2, -7, -8]\n",
27 | "\n",
28 | "# zip the two lists together, and specify the column names as we make the DataFrame\n",
29 | "retirement_data = pd.DataFrame(list(zip(countries, retirement_gap)),\n",
30 | " columns =['country', 'retirement_gap'])"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "# in practice, we might prefer to write a function that generates this list,\n",
40 | "# based on our data values\n",
41 | "bar_colors = ['#d01c8b', '#d01c8b', '#d01c8b', '#4dac26','#4dac26','#4dac26',\n",
42 | " '#4dac26','#4dac26']\n",
43 | "\n",
44 | "# pass our data and palette to the `seaborn` `barplot()` function\n",
45 | "ax = sns.barplot(x=\"retirement_gap\", y=\"country\", data=retirement_data, palette=bar_colors)\n",
46 | "\n",
47 | "# show the plot!\n",
48 | "plt.show()"
49 | ]
50 | }
51 | ],
52 | "metadata": {
53 | "kernelspec": {
54 | "display_name": "Python 3 (ipykernel)",
55 | "language": "python",
56 | "name": "python3"
57 | },
58 | "language_info": {
59 | "codemirror_mode": {
60 | "name": "ipython",
61 | "version": 3
62 | },
63 | "file_extension": ".py",
64 | "mimetype": "text/x-python",
65 | "name": "python",
66 | "nbconvert_exporter": "python",
67 | "pygments_lexer": "ipython3",
68 | "version": "3.9.5"
69 | }
70 | },
71 | "nbformat": 4,
72 | "nbformat_minor": 4
73 | }
74 |
--------------------------------------------------------------------------------
/chapter_10_examples/jupyter_notebooks/schools_that_work.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "5c1e35d0",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import matplotlib.pyplot as plt\n",
11 | "import seaborn as sns\n",
12 | "import pandas as pd"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "id": "bd7e1932",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
23 | "# # Import PyDrive and associated libraries.\n",
24 | "# # This only needs to be done once per notebook.\n",
25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
26 | "# from pydrive.auth import GoogleAuth\n",
27 | "# from pydrive.drive import GoogleDrive\n",
28 | "# from google.colab import auth\n",
29 | "# from oauth2client.client import GoogleCredentials\n",
30 | "\n",
31 | "# # Authenticate and create the PyDrive client.\n",
32 | "# # This only needs to be done once per notebook.\n",
33 | "# auth.authenticate_user()\n",
34 | "# gauth = GoogleAuth()\n",
35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
36 | "# drive = GoogleDrive(gauth)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "655aebfe",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1_Kd6AUWyLirPpneW0kkeA_5WmEKRXnfl/view?usp=sharing\n",
48 | "# file_id = '1_Kd6AUWyLirPpneW0kkeA_5WmEKRXnfl' # notice where this string comes from in link above\n",
49 | "\n",
50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
51 | "# print(imported_file['title']) # it should print the title of desired file\n",
52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "74239aff",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# import the school test data\n",
63 | "school_data = pd.read_csv(\"apib12tx.csv\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "c7b98a9c",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "# plot test scores against the percentage of students receiving meal support\n",
74 | "sns.scatterplot(data=school_data, x=\"MEALS\", y=\"API12B\", alpha=0.6, linewidth=0)\n",
75 | "\n",
76 | "# highlight a high-performing school\n",
77 | "highlight_school = school_data[school_data['SNAME'] == \"Chin (John Yehall) Elementary\"]\n",
78 | "plt.scatter(highlight_school['MEALS'], highlight_school['API12B'],\n",
79 | " color='orange', alpha=1.0)\n",
80 | "\n",
81 | "# show the plot!\n",
82 | "plt.show()"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3 (ipykernel)",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.9.5"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 5
107 | }
108 |
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/a_humble_pie.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 | # matplotlib works counterclockwise, so we need to essentially reverse
4 | # the order of our pie-value "slices"
5 | candidate_names = ['Adams', 'Wiley', 'Garcia', 'Yang', 'Others']
6 |
7 | candidate_names.reverse()
8 |
9 | vote_pct = [30.8, 21.3, 19.6, 12.2, 16.1]
10 |
11 | vote_pct.reverse()
12 |
13 | colors = ['#006d2c','#006d2c', '#006d2c', '#31a354','#74c476']
14 |
15 | colors.reverse()
16 |
17 | fig1, ax1 = plt.subplots()
18 |
19 | # by default, the starting axis is the x-axis; making this value 90 ensures
20 | # that it is a vertical line instead
21 | ax1.pie(vote_pct, labels=candidate_names, autopct='%.1f%%', startangle=90,
22 | colors=colors)
23 |
24 | ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
25 |
26 | # show the plot!
27 | plt.show()
28 |
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/covid_FDI_impact.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import seaborn as sns
4 | import numpy as np
5 |
6 | # each individual array is a row of data
7 | FDI = np.array([[0.8, 0.7], [0.3, 0.6]])
8 |
9 | fdi_data = pd.DataFrame(data=FDI,
10 | columns=['Developed', 'Developing'])
11 |
12 | ax = sns.lineplot(data=fdi_data)
13 |
14 | # show the plot!
15 | plt.show()
16 |
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/refined_covid_barchart-bak.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | from matplotlib.ticker import FuncFormatter
5 | from matplotlib.dates import DateFormatter
6 | from datetime import datetime
7 | import numpy as np
8 |
9 | vaccine_data = pd.read_csv('owid-covid-data.csv')
10 | vaccine_data['date']= pd.to_datetime(vaccine_data['date'])
11 | country_and_month = vaccine_data.groupby('iso_code').resample('M', on='date').sum()
12 | country_and_month_update = country_and_month.reset_index()
13 | just_USA = country_and_month_update[country_and_month_update['iso_code']=='USA']
14 |
15 | ax = sns.barplot(x="date", y="new_cases", palette=['grey'], data=just_USA)
16 | plt.show()
17 |
18 | def millions(val, pos):
19 | # the two arguments are the value and tick position
20 | modified_val = val*1e-6
21 | formatted_val = str(modified_val)
22 | if val == ax.get_ylim()[1]:
23 | formatted_val = formatted_val+'M'
24 | if val == 0:
25 | formatted_val = "0"
26 | return formatted_val
27 | #return '$%1.1fM' % (val*1e-6)
28 |
29 | def custom_dates(val,pos):
30 | dates_list = just_USA.date.tolist()
31 | current_value = dates_list[pos]
32 | current_month = datetime.strftime(current_value, '%b')
33 | date_label = current_month
34 | if date_label == 'Jan':
35 | date_label = date_label + " '"+ datetime.strftime(current_value, '%y')
36 | return date_label
37 |
38 | y_formatter = FuncFormatter(millions)
39 | x_formatter = FuncFormatter(custom_dates)
40 |
41 | # using a seaborn theme will make customization harder, so skip it
42 | #sns.set_theme(style="whitegrid")
43 | # make a barplot
44 | ax = sns.barplot(x="date", y="new_cases", palette=['grey'], data=just_USA)
45 |
46 | for i,bar in enumerate(ax.patches):
47 | if i == 6:
48 | bar.set_color('red')
49 |
50 | ax.set_ylim(0,7000000)
51 |
52 | # setting axis labels
53 | plt.xlabel('Month')
54 | plt.ylabel('New cases (M)')
55 |
56 | # if you want to use rcParams, you need to use them *before* tick_params
57 | # rcParams is the interactive version of a matplotlib stylesheet
58 | # https://matplotlib.org/stable/tutorials/introductory/customizing.html
59 |
60 | plt.rcParams['xtick.bottom'] = False
61 |
62 | # manipulate the axis attributes
63 | # https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.tick_params.html
64 |
65 | ax.tick_params(direction='out', length=10, width=1, color='black', colors='black',pad=4, grid_color='black', grid_alpha=1, rotation=45)
66 |
67 | # apply custom number formatter to y axis
68 | ax.yaxis.set_major_formatter(y_formatter)
69 | ax.xaxis.set_major_formatter(x_formatter)
70 |
71 |
72 | # by default, this is in "data coordinates"; e.g. a value of 1 will left-align the start
73 | # of the text with the center point of the first (in this case) column.
74 | # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.text.html
75 | # also, the "y" value is the bottom of the text, including multi-line text
76 | ax.text(4,3000000, "Confirmed cases\noften lag infection\nby several weeks.");
77 |
78 | bar_value = just_USA.new_cases.tolist()
79 | ax.vlines( x = 6, color='black', linewidth=1, alpha=.7,
80 | ymin = bar_value[6]+100000, ymax = 3000000-100000);
81 |
 82 | # ha! It uses LaTeX for text layout and manipulation
83 | # https://matplotlib.org/2.0.2/users/usetex.html
84 | # plt.rc('text', usetex=True)
85 | # plt.title(r"\textbf{Something}, but then also\\ something else")
86 | # the following titles overwrite each other - seaborn uses matplotlib under the hood
87 | plt.title("COVID-19 cases spike following relaxed restrictions\nin the spring of 2020", fontweight="bold")
88 | # ax.set_title('COVID-19 cases spike following relaxed restrictions in the spring of 2020');
89 |
90 | plt.show()
91 |
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/refined_covid_barchart.py:
--------------------------------------------------------------------------------
1 | # `pandas` for data loading; `seaborn` and `matplotlib` for visuals
2 | import pandas as pd
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
5 |
6 | # `FuncFormatter` to format axis labels
7 | from matplotlib.ticker import FuncFormatter
8 |
9 | # `datetime` to interpret and customize dates
10 | from datetime import datetime
11 |
12 | # load the data
13 | vaccine_data = pd.read_csv('owid-covid-data.csv')
14 |
15 | # convert the `date` column to a "real" date
16 | vaccine_data['date']= pd.to_datetime(vaccine_data['date'])
17 |
18 | # group the data by country and month
19 | country_and_month = vaccine_data.groupby('iso_code').resample('M',
20 | on='date').sum()
21 |
22 | # use `reset_index()` to "flatten" the DataFrame headers
23 | country_and_month_update = country_and_month.reset_index()
24 |
25 | # select just the United States' data
26 | just_USA = country_and_month_update[country_and_month_update['iso_code']=='USA']
27 |
28 | # make the foundational barplot with `seaborn`
29 | ax = sns.barplot(x="date", y="new_cases", palette=['#bababa'], data=just_USA)
30 |
 31 | # loop through the bar rectangles and set the color for the July 2020
32 | # bar to red
33 | for i, bar in enumerate(ax.patches):
34 | if i == 6:
35 | bar.set_color('#ca0020')
36 |
37 | # set the maximum y-axis value to 7M
38 | ax.set_ylim(0,7000000)
39 |
40 | # setting the axis labels
41 | plt.xlabel('Month')
42 | plt.ylabel('New cases (M)')
43 |
44 | # modify the color, placement and orientation of the "tick labels"
45 | ax.tick_params(direction='out', length=5, width=1, color='#404040',
46 | colors='#404040',pad=4, grid_color='#404040', grid_alpha=1,
47 | rotation=45)
48 |
49 | # functions for formatting the axis "tick labels"
50 | # `millions()` will convert the scientific notation to millions of cases
51 | def millions(val, pos):
52 | modified_val = val*1e-6
53 | formatted_val = str(modified_val)
54 | if val == ax.get_ylim()[1]:
55 | formatted_val = formatted_val+'M'
56 | if val == 0:
57 | formatted_val = "0"
58 | return formatted_val
59 |
60 |
61 | # `custom_dates()` will abbreviate the dates to be more readable
62 | def custom_dates(val, pos):
63 | dates_list = just_USA.date.tolist()
64 | date_label = ""
65 | if pos is not None:
66 | current_value = dates_list[pos]
67 | current_month = datetime.strftime(current_value, '%b')
68 | date_label = current_month
69 | if date_label == 'Jan':
70 | date_label = date_label + " '"+ datetime.strftime(current_value,
71 | '%y')
72 | return date_label
73 |
74 |
75 | # assign formatter functions
76 | y_formatter = FuncFormatter(millions)
77 | x_formatter = FuncFormatter(custom_dates)
78 |
79 | # apply the formatter functions to the appropriate axis
80 | ax.yaxis.set_major_formatter(y_formatter)
81 | ax.xaxis.set_major_formatter(x_formatter)
82 |
83 | # create and position the annotation text
84 | ax.text(4, 3000000, "Confirmed cases\noften lag infection\nby several weeks.")
85 |
86 | # get the value of all bars as a list
87 | bar_value = just_USA.new_cases.tolist()
88 |
89 | # create the leader line
90 | ax.vlines( x = 6, color='#404040', linewidth=1, alpha=.7,
91 | ymin = bar_value[6]+100000, ymax = 3000000-100000)
92 |
93 | # set the title of the chart
94 | plt.title("COVID-19 cases spike following relaxed restrictions\n" + \
95 | "in the spring of 2020", fontweight="bold")
96 |
97 | # show the chart!
98 | plt.show()
99 |
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/retirement_age.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import seaborn as sns
4 | import numpy as np
5 |
6 | # (abbreviated) list of countries
7 | countries = ['Japan', 'Iceland', 'Switzerland', 'France', 'Ireland', 'Germany',
8 | 'Italy', 'Belgium']
9 |
10 | # difference in years between official and actual retirement age
11 | retirement_gap = [9, 2, 2, -1, -2, -2, -7, -8]
12 |
13 | # zip the two lists together, and specify the column names as we make the DataFrame
14 | retirement_data = pd.DataFrame(list(zip(countries, retirement_gap)),
15 | columns =['country', 'retirement_gap'])
16 |
17 | # in practice, we might prefer to write a function that generates this list,
 18 | # based on our data values -- one possible sketch appears at the end of this file
19 | bar_colors = ['#d01c8b', '#d01c8b', '#d01c8b', '#4dac26','#4dac26','#4dac26',
20 | '#4dac26','#4dac26']
21 |
22 | # pass our data and palette to the `seaborn` `barplot()` function
23 | ax = sns.barplot(x="retirement_gap", y="country", data=retirement_data, palette=bar_colors)
24 |
25 | # show the plot!
26 | plt.show()
27 |
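 28 | # one possible sketch of such a color-generating function; the `make_bar_colors`
 29 | # name and the sign-based rule here are just illustrative choices
 30 | def make_bar_colors(gap_values, positive_color='#d01c8b', negative_color='#4dac26'):
 31 |     # one color per value: positive gaps get `positive_color`, the rest `negative_color`
 32 |     return [positive_color if value > 0 else negative_color for value in gap_values]
 33 | 
 34 | # e.g. `bar_colors = make_bar_colors(retirement_gap)` rebuilds the hand-written list above
 35 | 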
--------------------------------------------------------------------------------
/chapter_10_examples/standalone_files/schools_that_work.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 | import pandas as pd
4 |
5 | # import the school test data
6 | school_data = pd.read_csv("apib12tx.csv")
7 |
8 | # plot test scores against the percentage of students receiving meal support
9 | sns.scatterplot(data=school_data, x="MEALS", y="API12B", alpha=0.6, linewidth=0)
10 |
11 | # highlight a high-performing school
12 | highlight_school = school_data[school_data['SNAME'] == "Chin (John Yehall) Elementary"]
13 | plt.scatter(highlight_school['MEALS'], highlight_school['API12B'],
14 | color='orange', alpha=1.0)
15 |
16 | # show the plot!
17 | plt.show()
18 |
--------------------------------------------------------------------------------
/chapter_1_examples/jupyter_notebooks/hello_world.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "view-in-github"
8 | },
9 | "source": [
 10 |         ""
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "id": "vm-eoxO_wKZi"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "# The code below should print \"Hello World!\"\n",
22 | "print(\"Hello World!\")"
23 | ]
24 | }
25 | ],
26 | "metadata": {
27 | "colab": {
28 | "authorship_tag": "ABX9TyPewIxW4Coe6EfnEBAylqJX",
29 | "collapsed_sections": [],
30 | "include_colab_link": true,
31 | "name": "HelloWorld.ipynb",
32 | "provenance": []
33 | },
34 | "kernelspec": {
35 | "display_name": "Python 3 (ipykernel)",
36 | "language": "python",
37 | "name": "python3"
38 | },
39 | "language_info": {
40 | "codemirror_mode": {
41 | "name": "ipython",
42 | "version": 3
43 | },
44 | "file_extension": ".py",
45 | "mimetype": "text/x-python",
46 | "name": "python",
47 | "nbconvert_exporter": "python",
48 | "pygments_lexer": "ipython3",
49 | "version": "3.9.5"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 1
54 | }
55 |
--------------------------------------------------------------------------------
/chapter_1_examples/standalone_files/hello_world.py:
--------------------------------------------------------------------------------
1 | # The code below should print "Hello World!"
2 | print("Hello World!")
3 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/basic_greeting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Creating custom functions\n",
8 | "\n",
9 | "One of the potentially tricky things about Jupyter Notebooks is that it's possible to run the code \"out of order\", which can sometimes cause problems. For example, select `Kernel -> Restart & Clear Output`, and then try running the last cell below _before_ running the second to last cell - you'll get an error.\n",
10 | "\n",
11 | "Recall that computers read code top-to-bottom, and left-to-right. Jupyter Notebooks let us break that rule, which can make it easier to test and troubleshoot small bits of code. At the same time, accidentally running cells out of order can generate errors even if the code--when \"read\" correctly--works properly. For example, if you run the last cell _again_ after running the third cell, it will now work: the `greet_me` function is defined in the third cell, and the computer still \"remembers\" it when it is referenced in the fourth cell--even though they are in separate cells.\n",
12 | "\n",
13 | "If you're having problems with code in a Jupyter Notebook that you're fairly sure should work (such as examples from this book), try choosing `Kernel -> Restart & Run All` to see if that solves it. You can also use the `Kernel -> Restart & Clear Output` command above to clear the numbering to the left of the cells if it has gotten out of order and you want to start again."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stdout",
23 | "output_type": "stream",
24 | "text": [
25 | "Susan E. McGregor\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "# create a variable named author, and set its contents to \"Susan E. McGregor\"\n",
31 | "# using the assignment operator, '='\n",
32 | "author = \"Susan E. McGregor\" \n",
33 | "\n",
34 | "# confirm that the computer \"remembers\" what's in the `author` variable\n",
35 | "# by using the built-in 'print' function\n",
36 | "print(author)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "Hello Susan E. McGregor\n",
49 | "Hello Jeff Bleiel\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "# create a variable named author\n",
55 | "author = \"Susan E. McGregor\" \n",
56 | "\n",
57 | "# create another variable named editor\n",
58 | "editor = \"Jeff Bleiel\"\n",
59 | "\n",
60 | "# use the built-in print function to output \"Hello\" messages to each person\n",
61 | "print(\"Hello \"+author)\n",
62 | "print(\"Hello \"+editor)"
63 | ]
64 | }
65 | ],
66 | "metadata": {
67 | "kernelspec": {
68 | "display_name": "Python 3 (ipykernel)",
69 | "language": "python",
70 | "name": "python3"
71 | },
72 | "language_info": {
73 | "codemirror_mode": {
74 | "name": "ipython",
75 | "version": 3
76 | },
77 | "file_extension": ".py",
78 | "mimetype": "text/x-python",
79 | "name": "python",
80 | "nbconvert_exporter": "python",
81 | "pygments_lexer": "ipython3",
82 | "version": "3.9.5"
83 | }
84 | },
85 | "nbformat": 4,
86 | "nbformat_minor": 4
87 | }
88 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/greet_me.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "852ff27e",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# create a function that prints out a greeting\n",
11 | "# to any name passed to the function\n",
12 | "def greet_me(a_name):\n",
13 | " print(\"Hello \"+a_name)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "id": "c74beb6b",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# create a variable named author\n",
24 | "author = \"Susan E. McGregor\"\n",
25 | "\n",
26 | "# create another variable named editor\n",
27 | "editor = \"Jeff Bleiel\""
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "id": "ee3022f1",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# use my custom function, `greet_me` to output \"Hello\" messages to each person\n",
38 | "greet_me(author)\n",
39 | "greet_me(editor)"
40 | ]
41 | }
42 | ],
43 | "metadata": {
44 | "kernelspec": {
45 | "display_name": "Python 3 (ipykernel)",
46 | "language": "python",
47 | "name": "python3"
48 | },
49 | "language_info": {
50 | "codemirror_mode": {
51 | "name": "ipython",
52 | "version": 3
53 | },
54 | "file_extension": ".py",
55 | "mimetype": "text/x-python",
56 | "name": "python",
57 | "nbconvert_exporter": "python",
58 | "pygments_lexer": "ipython3",
59 | "version": "3.9.5"
60 | }
61 | },
62 | "nbformat": 4,
63 | "nbformat_minor": 5
64 | }
65 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/hitting_the_road_with_citibike.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Question: How many Citi Bike rides each day are taken by\n",
10 | "# \"subscribers\" versus \"customers\"?\n",
11 | "\n",
12 | "# Answer: Choose a single day of rides to examine.\n",
13 | "\n",
14 | "# The dataset used for this exercise was generated from the original\n",
15 | "# Citi Bike system data found here: https://s3.amazonaws.com/tripdata/index.html\n",
16 | "# Filename: 202009-citibike-tripdata.csv.zip\n",
17 | "# Program Outline:\n",
 18 |     "# 1. Read in the data file: 202009CitibikeTripdataExample.csv\n",
19 | "# 2. Create variables to count: subscribers, customers, and other\n",
20 | "# 3. For each row in the file:\n",
21 | "# a. If the \"User Type\" is \"Subscriber,\" add 1 to \"subscriber_count\"\n",
22 | "# b. If the \"User Type\" is \"Customer,\" add 1 to \"customer_count\"\n",
23 | "# c. Otherwise, add 1 to the \"other\" variable\n",
24 | "# 4. Print out my results"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# import the `csv` library\n",
34 | "import csv\n",
35 | "\n",
36 | "# open the `202009CitibikeTripdataExample.csv` file in read (\"r\") mode\n",
37 | "# this file should be in the same folder as our Python script or notebook\n",
38 | "source_file = open(\"202009CitibikeTripdataExample.csv\",\"r\")\n",
39 | "\n",
 40 |     "# pass our `source_file` as an ingredient to the `csv` library's\n",
41 | "# DictReader \"recipe\".\n",
42 | "# Store the result in a variable called `citibike_reader`\n",
43 | "citibike_reader = csv.DictReader(source_file)\n",
44 | "\n",
45 | "# the DictReader method has added some useful information to our data,\n",
46 | "# like a `fieldnames` property that lets us access all the values\n",
47 | "# in the first or \"header\" row\n",
48 | "print(citibike_reader.fieldnames)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# create a variable to hold the count of each type of Citi Bike user\n",
58 | "# assign or \"initialize\" each with a value of zero (0)\n",
59 | "subscriber_count = 0\n",
60 | "customer_count = 0\n",
61 | "other_user_count = 0"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Step 3: Loop through every row of our data\n",
71 | "for a_row in citibike_reader:\n",
72 | "\n",
73 | " # Step 3a: if the value in the `usertype` column\n",
74 | " # of the current row is \"Subscriber\"\n",
75 | " if a_row[\"usertype\"] == \"Subscriber\":\n",
76 | "\n",
77 | " # add 1 to `subscriber_count`\n",
78 | " subscriber_count = subscriber_count +1\n",
79 | "\n",
80 | " # Step 3b: otherwise (else), if the value in the `usertype` column\n",
81 | " # of the current row is \"Customer\"\n",
82 | " elif a_row[\"usertype\"] == \"Customer\":\n",
83 | "\n",
 84 |     "        # add 1 to `customer_count`\n",
85 | " customer_count = customer_count + 1\n",
86 | "\n",
 87 |     "    # Step 3c: the `usertype` value is _neither_ \"Subscriber\" nor \"Customer\",\n",
88 | " # so we'll add 1 to our catch-all `other_user_count` variable\n",
89 | " else:\n",
90 | " other_user_count = other_user_count + 1"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "# Step 4: Print out our results, being sure to include \"labels\" in the process:\n",
100 | "print(\"Number of subscribers:\")\n",
101 | "print(subscriber_count)\n",
102 | "print(\"Number of customers:\")\n",
103 | "print(customer_count)\n",
104 | "print(\"Number of 'other' users:\")\n",
105 | "print(other_user_count)"
106 | ]
107 | }
108 | ],
109 | "metadata": {
110 | "colab": {
111 | "name": "hitting_the_road_with_citibike.ipynb",
112 | "provenance": []
113 | },
114 | "kernelspec": {
115 | "display_name": "Python 3 (ipykernel)",
116 | "language": "python",
117 | "name": "python3"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 3
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython3",
129 | "version": "3.9.5"
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 1
134 | }
135 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/method_madness.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "a21dc1b6",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# splitting a string \"literal\" and then printing the result\n",
11 | "split_world = \"Hello World!\".split()\n",
12 | "print(split_world)"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "id": "f0ba92b0",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# assigning a string to a variable\n",
23 | "# then printing the result of calling the `split()` method on it\n",
24 | "world_msg = \"Hello World!\"\n",
25 | "print(world_msg.split())"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "id": "f75166e6",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# the following will produce an error because\n",
36 | "# the `split()` method must be called on a string in order to work!\n",
37 | "split(\"Hello World!\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "id": "6273c7bc",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# the following will produce an error because\n",
48 | "# there is no `split()` method for numbers!\n",
49 | "print(5.split())"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "bdf09244",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": []
59 | }
60 | ],
61 | "metadata": {
62 | "kernelspec": {
63 | "display_name": "Python 3 (ipykernel)",
64 | "language": "python",
65 | "name": "python3"
66 | },
67 | "language_info": {
68 | "codemirror_mode": {
69 | "name": "ipython",
70 | "version": 3
71 | },
72 | "file_extension": ".py",
73 | "mimetype": "text/x-python",
74 | "name": "python",
75 | "nbconvert_exporter": "python",
76 | "pygments_lexer": "ipython3",
77 | "version": "3.9.5"
78 | }
79 | },
80 | "nbformat": 4,
81 | "nbformat_minor": 5
82 | }
83 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/noun_examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Working with variables\n",
8 | "\n",
9 | "Note that, unlike our last example, when we assign a value to a variable, Jupyter Notebook _doesn't_ print out its value automatically when we run that cell.\n",
10 | "\n",
 11 |     "The subsequent cells, however, work just as they would in a standalone `.py` file."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# create a variable named author, set its contents to \"Susan E. McGregor\"\n",
21 | "author = \"Susan E. McGregor\"\n",
22 | "\n",
23 | "# confirm that the computer \"remembers\" what's in the `author` variable\n",
24 | "print(author)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# create a variable named nyc_resident, set its contents to \"Susan E. McGregor\"\n",
34 | "nyc_resident = \"Susan E. McGregor\"\n",
35 | "\n",
36 | "# confirm that the computer \"remembers\" what's in the `nyc_resident` variable\n",
37 | "print(nyc_resident)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# create a variable named fuzzyPinkBunny, set its contents to \"Susan E. McGregor\"\n",
47 | "fuzzyPinkBunny = \"Susan E. McGregor\"\n",
48 | "\n",
49 | "# confirm that the computer \"remembers\" what's in the `fuzzyPinkBunny` variable\n",
50 | "print(fuzzyPinkBunny)\n",
51 | "\n",
52 | "# but correct capitalization matters!\n",
53 | "# the following line will produce an error\n",
54 | "print(fuzzypinkbunny)"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3 (ipykernel)",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.9.5"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/page_count_conditional.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# fictional list of chapter page counts\n",
10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n",
11 | "\n",
12 | "# create variables to keep track of:\n",
13 | "# the total pages in the book\n",
14 | "total_pages = 0\n",
15 | "\n",
 16 |     "# the number of chapters with 30 or fewer pages\n",
17 | "under_30 = 0\n",
18 | "\n",
 19 |     "# the number of chapters with more than 30 pages\n",
20 | "over_30 = 0\n",
21 | "\n",
22 | "# for every item in the page_counts list:\n",
23 | "for a_number in page_counts:\n",
24 | " # add the current number of pages to our total_pages count\n",
25 | " total_pages = total_pages + a_number\n",
26 | " # check if the current number of pages is more than 30\n",
27 | " if a_number > 30:\n",
28 | " # if so, add 1 to our over_30 counter\n",
29 | " over_30 = over_30 + 1\n",
30 | " # otherwise...\n",
31 | " else:\n",
32 | " # add 1 to our under_30 counter\n",
33 | " under_30 = under_30 + 1\n",
34 | "\n",
35 | "# print our various results\n",
36 | "print(total_pages)\n",
37 | "print(\"Number of chapters over 30 pages:\")\n",
38 | "print(over_30)\n",
39 | "print(\"Number of chapters under 30 pages:\")\n",
40 | "print(under_30)"
41 | ]
42 | }
43 | ],
44 | "metadata": {
45 | "kernelspec": {
46 | "display_name": "Python 3 (ipykernel)",
47 | "language": "python",
48 | "name": "python3"
49 | },
50 | "language_info": {
51 | "codemirror_mode": {
52 | "name": "ipython",
53 | "version": 3
54 | },
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.9.5"
61 | }
62 | },
63 | "nbformat": 4,
64 | "nbformat_minor": 4
65 | }
66 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/page_count_custom_function.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "9cbb94aa",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# fictional list of chapter page counts\n",
11 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "93f3c196",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# define a new `count_pages()` function that takes one ingredient/argument:\n",
22 | "# a list of numbers\n",
23 | "def count_pages(page_count_list):\n",
24 | "\n",
25 | " # create variables to keep track of:\n",
26 | " # the total pages in the book\n",
27 | " total_pages = 0\n",
28 | "\n",
 29 |     "    # the number of chapters with 30 or fewer pages\n",
30 | " under_30 = 0\n",
31 | "\n",
 32 |     "    # the number of chapters with more than 30 pages\n",
33 | " over_30 = 0\n",
34 | "\n",
35 | " # for every item in the page_count_list:\n",
36 | " for a_number in page_count_list:\n",
37 | "\n",
38 | " # add the current number of pages to our total_pages count\n",
39 | " total_pages = total_pages + a_number\n",
40 | "\n",
41 | " # check if the current number of pages is more than 30\n",
42 | " if a_number > 30:\n",
43 | "\n",
44 | " # if so, add 1 to our over_30 counter\n",
45 | " over_30 = over_30 + 1\n",
46 | "\n",
47 | " # otherwise...\n",
48 | " else:\n",
49 | "\n",
50 | " # add 1 to our under_30 counter\n",
51 | " under_30 = under_30 + 1\n",
52 | "\n",
53 | " # print our various results\n",
54 | " print(total_pages)\n",
55 | " print(\"Number of chapters over 30 pages:\")\n",
56 | " print(over_30)\n",
57 | " print(\"Number of chapters under 30 pages:\")\n",
58 | " print(under_30)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "id": "1baf12a0",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# call/execute this \"recipe\", being sure to pass in our\n",
69 | "# actual list as an argument/ingredient\n",
70 | "count_pages(page_counts)"
71 | ]
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python 3 (ipykernel)",
77 | "language": "python",
78 | "name": "python3"
79 | },
80 | "language_info": {
81 | "codemirror_mode": {
82 | "name": "ipython",
83 | "version": 3
84 | },
85 | "file_extension": ".py",
86 | "mimetype": "text/x-python",
87 | "name": "python",
88 | "nbconvert_exporter": "python",
89 | "pygments_lexer": "ipython3",
90 | "version": "3.9.5"
91 | }
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 5
95 | }
96 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/page_count_loop.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# fictional list of chapter page counts\n",
10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n",
11 | "\n",
12 | "# variable for tracking total page count; starting value is 0\n",
13 | "total_pages = 0\n",
14 | "\n",
15 | "# for every item in the list, perform some action\n",
16 | "for a_number in page_counts:\n",
17 | "\n",
18 | " # in this case, add the number to our \"total_pages\" variable\n",
19 | " total_pages = total_pages + a_number\n",
20 | "\n",
21 | "print(total_pages)"
22 | ]
23 | }
24 | ],
25 | "metadata": {
26 | "kernelspec": {
27 | "display_name": "Python 3 (ipykernel)",
28 | "language": "python",
29 | "name": "python3"
30 | },
31 | "language_info": {
32 | "codemirror_mode": {
33 | "name": "ipython",
34 | "version": 3
35 | },
36 | "file_extension": ".py",
37 | "mimetype": "text/x-python",
38 | "name": "python",
39 | "nbconvert_exporter": "python",
40 | "pygments_lexer": "ipython3",
41 | "version": "3.9.5"
42 | }
43 | },
44 | "nbformat": 4,
45 | "nbformat_minor": 4
46 | }
47 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/page_count_printout.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# fictional list of chapter page counts\n",
10 | "page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]\n",
11 | "\n",
12 | "# variable for tracking total page count; starting value is 0\n",
13 | "total_pages = 0\n",
14 | "\n",
15 | "# for every item in the list, perform some action\n",
16 | "\n",
17 | "for a_number in page_counts:\n",
18 | " print(\"Top of loop!\")\n",
19 | " print(\"The current item is:\")\n",
20 | " print(a_number)\n",
21 | " total_pages = total_pages + a_number\n",
22 | " print(\"The running total is:\")\n",
23 | " print(total_pages)\n",
24 | " print(\"Bottom of loop!\")\n",
25 | "\n",
26 | "print(total_pages)"
27 | ]
28 | }
29 | ],
30 | "metadata": {
31 | "kernelspec": {
32 | "display_name": "Python 3 (ipykernel)",
33 | "language": "python",
34 | "name": "python3"
35 | },
36 | "language_info": {
37 | "codemirror_mode": {
38 | "name": "ipython",
39 | "version": 3
40 | },
41 | "file_extension": ".py",
42 | "mimetype": "text/x-python",
43 | "name": "python",
44 | "nbconvert_exporter": "python",
45 | "pygments_lexer": "ipython3",
46 | "version": "3.9.5"
47 | }
48 | },
49 | "nbformat": 4,
50 | "nbformat_minor": 4
51 | }
52 |
--------------------------------------------------------------------------------
/chapter_2_examples/jupyter_notebooks/parts_of_speech.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## About Jupyter Notebooks\n",
8 | "\n",
9 | "Jupyter Notebooks have two relevant types of \"cells\": \n",
10 | "\n",
11 | "This is a \"markdown\" cell, which is useful for adding lightly-formatted context and documentation to your notebook. You can learn more about markdown here: https://www.markdownguide.org/cheat-sheet/\n",
12 | "\n",
13 | "The cells below are code cells. When you hit the \"play\" button next to a code cell, it essentially does a combination of two things:\n",
14 | "\n",
15 | "1. Runs the code\n",
16 | "2. Prints out the \"results\"\n",
17 | "\n",
 18 |     "This means that Jupyter Notebooks typically contain few `print` statements, if any, compared to standalone `.py` files. In this book, I have kept the `print` statements from the example code for consistency and clarity.\n",
19 | "\n",
 20 |     "Note that if you were to run any of the code snippets below in a standalone `.py` file, you would not see any output. Because each of these statements is just a literal value, Jupyter Notebook displays the value exactly as it was entered in the cell; a standalone file would need an explicit `print` statement to show it.\n",
21 | "\n",
22 | "Also notice that the comments are _not_ printed, as we would expect!"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# A number is just digits\n",
32 | "25"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# A string is anything surrounded by matching quotation marks\n",
42 | "\"Hello World\""
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# A list is surrounded by square brackets, with commas between items\n",
52 | "# Note that in Python, the first item in a list is considered to be\n",
53 | "# in position `0`, the next in position `1` and so on\n",
54 | "[\"this\",\"is\",1,\"list\"]"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# A dict is a set of key:value pairs, separated by commas and surrounded\n",
64 | "# by curly braces\n",
65 | "{\"title\":\"Practical Python for Data Wrangling and Data Quality\",\n",
66 | " \"format\": \"book\",\n",
67 | " \"author\": \"Susan E. McGregor\"\n",
68 | "}"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
 77 |     "# A Boolean is a data type that has only two values: `True` and `False`.\n",
78 | "True"
79 | ]
80 | }
81 | ],
82 | "metadata": {
83 | "kernelspec": {
84 | "display_name": "Python 3 (ipykernel)",
85 | "language": "python",
86 | "name": "python3"
87 | },
88 | "language_info": {
89 | "codemirror_mode": {
90 | "name": "ipython",
91 | "version": 3
92 | },
93 | "file_extension": ".py",
94 | "mimetype": "text/x-python",
95 | "name": "python",
96 | "nbconvert_exporter": "python",
97 | "pygments_lexer": "ipython3",
98 | "version": "3.9.5"
99 | }
100 | },
101 | "nbformat": 4,
102 | "nbformat_minor": 4
103 | }
104 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/basic_greeting.py:
--------------------------------------------------------------------------------
1 | # create a variable named author
2 | author = "Susan E. McGregor"
3 |
4 | # create another variable named editor
5 | editor = "Jeff Bleiel"
6 |
7 | # use the built-in print function to output "Hello" messages to each person
8 | print("Hello "+author)
9 | print("Hello "+editor)
10 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/greet_me.py:
--------------------------------------------------------------------------------
1 | # create a function that prints out a greeting
2 | # to any name passed to the function
3 | def greet_me(a_name):
4 | print("Hello "+a_name)
5 |
6 | # create a variable named author
7 | author = "Susan E. McGregor"
8 |
9 | # create another variable named editor
10 | editor = "Jeff Bleiel"
11 |
12 | # use my custom function, `greet_me` to output "Hello" messages to each person
13 | greet_me(author)
14 | greet_me(editor)
15 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/hitting_the_road_with_citibike.py:
--------------------------------------------------------------------------------
1 | # Question: How many Citi Bike rides each day are taken by
2 | # "subscribers" versus "customers"?
3 |
4 | # Answer: Choose a single day of rides to examine.
5 |
6 | # The dataset used for this exercise was generated from the original
7 | # Citi Bike system data found here: https://s3.amazonaws.com/tripdata/index.html
8 | # Filename: 202009-citibike-tripdata.csv.zip
9 | # Program Outline:
 10 | # 1. Read in the data file: 202009CitibikeTripdataExample.csv
11 | # 2. Create variables to count: subscribers, customers, and other
12 | # 3. For each row in the file:
13 | # a. If the "User Type" is "Subscriber," add 1 to "subscriber_count"
14 | # b. If the "User Type" is "Customer," add 1 to "customer_count"
15 | # c. Otherwise, add 1 to the "other" variable
16 | # 4. Print out my results
17 |
18 | # import the `csv` library
19 | import csv
20 |
21 | # open the `202009CitibikeTripdataExample.csv` file in read ("r") mode
22 | # this file should be in the same folder as our Python script or notebook
23 | source_file = open("202009CitibikeTripdataExample.csv","r")
24 |
25 | # pass our `source_file` as an ingredient to the `csv` library's
26 | # DictReader "recipe".
27 | # Store the result in a variable called `citibike_reader`
28 | citibike_reader = csv.DictReader(source_file)
29 |
30 | # the DictReader method has added some useful information to our data,
31 | # like a `fieldnames` property that lets us access all the values
32 | # in the first or "header" row
33 | print(citibike_reader.fieldnames)
34 |
35 | # create a variable to hold the count of each type of Citi Bike user
36 | # assign or "initialize" each with a value of zero (0)
37 | subscriber_count = 0
38 | customer_count = 0
39 | other_user_count = 0
40 |
41 | # Step 3: Loop through every row of our data
42 | for a_row in citibike_reader:
43 |
44 | # Step 3a: if the value in the `usertype` column
45 | # of the current row is "Subscriber"
46 | if a_row["usertype"] == "Subscriber":
47 |
48 | # add 1 to `subscriber_count`
49 | subscriber_count = subscriber_count +1
50 |
51 | # Step 3b: otherwise (else), if the value in the `usertype` column
52 | # of the current row is "Customer"
53 | elif a_row["usertype"] == "Customer":
54 |
55 | # add 1 to `customer_count`
56 | customer_count = customer_count + 1
57 |
58 | # Step 3c: the `usertype` value is _neither_ "Subscriber" nor "Customer",
59 | # so we'll add 1 to our catch-all `other_user_count` variable
60 | else:
61 | other_user_count = other_user_count + 1
62 |
63 | # Step 4: Print out our results, being sure to include "labels" in the process:
64 | print("Number of subscribers:")
65 | print(subscriber_count)
66 | print("Number of customers:")
67 | print(customer_count)
68 | print("Number of 'other' users:")
69 | print(other_user_count)
70 |
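For reference, Python also has an augmented-assignment shorthand for the "add 1 to this variable" pattern used by the counters above; a minimal sketch:

# `x += 1` is equivalent to `x = x + 1`
subscriber_count = 0
subscriber_count += 1
print(subscriber_count)   # prints 1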
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/method_madness.py:
--------------------------------------------------------------------------------
1 | # splitting a string "literal" and then printing the result
2 | split_world = "Hello World!".split()
3 | print(split_world)
4 |
5 | # assigning a string to a variable
6 | # then printing the result of calling the `split()` method on it
7 | world_msg = "Hello World!"
8 | print(world_msg.split())
9 |
10 | # the following will produce an error because
11 | # the `split()` method must be called on a string in order to work!
12 | split("Hello World!")
13 |
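For reference, a minimal sketch of what `split()` actually returns when called on a string, which is why the first two examples above print the same thing:

# `split()` breaks a string on whitespace and returns a list of the pieces
print("Hello World!".split())   # prints ['Hello', 'World!']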
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/noun_examples.py:
--------------------------------------------------------------------------------
1 | # create a variable named author, set its contents to "Susan E. McGregor"
2 | author = "Susan E. McGregor"
3 |
4 | # confirm that the computer "remembers" what's in the `author` variable
5 | print(author)
6 |
7 | # create a variable named nyc_resident, set its contents to "Susan E. McGregor"
8 | nyc_resident = "Susan E. McGregor"
9 |
10 | # confirm that the computer "remembers" what's in the `nyc_resident` variable
11 | print(nyc_resident)
12 |
13 | # create a variable named fuzzyPinkBunny, set its contents to "Susan E. McGregor"
14 | fuzzyPinkBunny = "Susan E. McGregor"
15 |
16 | # confirm that the computer "remembers" what's in the `fuzzyPinkBunny` variable
17 | print(fuzzyPinkBunny)
18 |
19 | # but correct capitalization matters!
20 | # the following line will produce an error
21 | print(fuzzypinkbunny)
22 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/page_count_conditional.py:
--------------------------------------------------------------------------------
1 | # fictional list of chapter page counts
2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]
3 |
4 | # create variables to keep track of:
5 | # the total pages in the book
6 | total_pages = 0
7 |
8 | # the number of chapters with 30 or fewer pages,
9 | under_30 = 0
10 |
11 | # the number of chapters with more than 30 pages
12 | over_30 = 0
13 |
14 | # for every item in the page_counts list:
15 | for a_number in page_counts:
16 | # add the current number of pages to our total_pages count
17 | total_pages = total_pages + a_number
18 | # check if the current number of pages is more than 30
19 | if a_number > 30:
20 | # if so, add 1 to our over_30 counter
21 | over_30 = over_30 + 1
22 | # otherwise...
23 | else:
24 | # add 1 to our under_30 counter
25 | under_30 = under_30 + 1
26 |
27 | # print our various results
28 | print(total_pages)
29 | print("Number of chapters over 30 pages:")
30 | print(over_30)
31 | print("Number of chapters under 30 pages:")
32 | print(under_30)
33 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/page_count_custom_function.py:
--------------------------------------------------------------------------------
1 | # fictional list of chapter page counts
2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]
3 |
4 | # define a new `count_pages()` function that takes one ingredient/argument:
5 | # a list of numbers
6 | def count_pages(page_count_list):
7 |
8 | # create variables to keep track of:
9 | # the total pages in the book
10 | total_pages = 0
11 |
12 | # the number of chapters with 30 or fewer pages,
13 | under_30 = 0
14 |
15 | # the number of chapters with more than 30 pages
16 | over_30 = 0
17 |
18 | # for every item in the page_count_list:
19 | for a_number in page_count_list:
20 |
21 | # add the current number of pages to our total_pages count
22 | total_pages = total_pages + a_number
23 |
24 | # check if the current number of pages is more than 30
25 | if a_number > 30:
26 |
27 | # if so, add 1 to our over_30 counter
28 | over_30 = over_30 + 1
29 |
30 | # otherwise...
31 | else:
32 |
33 | # add 1 to our under_30 counter
34 | under_30 = under_30 + 1
35 |
36 | # print our various results
37 | print(total_pages)
38 | print("Number of chapters over 30 pages:")
39 | print(over_30)
40 | print("Number of chapters under 30 pages:")
41 | print(under_30)
42 |
43 | # call/execute this "recipe", being sure to pass in our
44 | # actual list as an argument/ingredient
45 | count_pages(page_counts)
46 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/page_count_loop.py:
--------------------------------------------------------------------------------
1 | # fictional list of chapter page counts
2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]
3 |
4 | # variable for tracking total page count; starting value is 0
5 | total_pages = 0
6 |
7 | # for every item in the list, perform some action
8 | for a_number in page_counts:
9 |
10 | # in this case, add the number to our "total_pages" variable
11 | total_pages = total_pages + a_number
12 |
13 | print(total_pages)
14 |
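As a quick sanity check on the loop above, Python's built-in `sum()` function should produce the same total; a minimal sketch:

# double-check the loop's result with the built-in `sum()` function
page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]
print(sum(page_counts))   # should match the `total_pages` value printed above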
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/page_count_printout.py:
--------------------------------------------------------------------------------
1 | # fictional list of chapter page counts
2 | page_counts = [28, 32, 44, 23, 56, 32, 12, 34, 30]
3 |
4 | # variable for tracking total page count; starting value is 0
5 | total_pages = 0
6 |
7 | # for every item in the list, perform some action
8 |
9 | for a_number in page_counts:
10 | print("Top of loop!")
11 | print("The current item is:")
12 | print(a_number)
13 | total_pages = total_pages + a_number
14 | print("The running total is:")
15 | print(total_pages)
16 | print("Bottom of loop!")
17 |
18 | print(total_pages)
19 |
--------------------------------------------------------------------------------
/chapter_2_examples/standalone_files/parts_of_speech.py:
--------------------------------------------------------------------------------
1 | # A number is just digits
2 | 25
3 |
4 | # A string is anything surrounded by matching quotation marks
5 | "Hello World"
6 |
7 | # A list is surrounded by square brackets, with commas between items
8 | # Note that in Python, the first item in a list is considered to be
9 | # in position `0`, the next in position `1` and so on
10 | ["this","is",1,"list"]
11 |
12 | # A dict is a set of key:value pairs, separated by commas and surrounded
13 | # by curly braces
14 | {"title":"Practical Python for Data Wrangling and Data Quality",
15 | "format": "book",
16 | "author": "Susan E. McGregor"
17 | }
18 |
19 | # A Boolean is a data type that has only two values, True and False.
20 | True
21 |
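For reference, a minimal sketch of how the list and dict "parts of speech" above are accessed, by position and by key respectively:

# list items are looked up by position, starting at 0
example_list = ["this", "is", 1, "list"]
print(example_list[0])   # prints "this"

# dict values are looked up by key
example_dict = {"title": "Practical Python for Data Wrangling and Data Quality",
                "format": "book",
                "author": "Susan E. McGregor"}
print(example_dict["author"])   # prints "Susan E. McGregor"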
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/csv_parsing.py:
--------------------------------------------------------------------------------
1 | # A simple example of reading data from a .csv file with Python
2 | # using the "csv" library.
3 | # The source data was sampled from the Citi Bike system data:
4 | # https://drive.google.com/file/d/17b461NhSjf_akFWvjgNXQfqgh9iFxCu_/
5 | # Which can be found here:
6 | # https://s3.amazonaws.com/tripdata/index.html
7 |
8 | # import the `csv` library
9 | import csv
10 |
11 | # open the `202009CitibikeTripdataExample.csv` file in read ("r") mode
12 | # this file should be in the same folder as our Python script or notebook
13 | source_file = open("202009CitibikeTripdataExample.csv","r")
14 |
15 | # pass our `source_file` as an ingredient to the `csv` library's
16 | # DictReader "recipe".
17 | # Store the result in a variable called `citibike_reader`
18 | citibike_reader = csv.DictReader(source_file)
19 |
20 | # the DictReader method has added some useful information to our data,
21 | # like a `fieldnames` property that lets us access all the values
22 | # in the first or "header" row
23 | print(citibike_reader.fieldnames)
24 |
25 | # let's just print out the first 5 rows
26 | for i in range(0,5):
27 | print (next(citibike_reader))
28 |
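Since `DictReader` yields each row as a dictionary keyed by the header values, individual columns can also be read by name; a minimal sketch using the same sample file (the `usertype` column is the one used in the Citi Bike example above):

import csv

# open the same sample file and pull one column from the first data row
with open("202009CitibikeTripdataExample.csv", "r") as source_file:
    citibike_reader = csv.DictReader(source_file)
    first_row = next(citibike_reader)
    # each row behaves like a dict keyed by the header row's values
    print(first_row["usertype"])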
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/fixed_width_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from a fixed-width file with Python.
2 | # The source file for this example comes from NOAA and can be accessed here:
3 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
4 | # The metadata for the file can be found here:
5 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
6 |
7 | # import the `csv` library, to create our output file
8 | import csv
9 |
10 | filename = "ghcnd-stations"
11 |
12 | # reading from a basic text file doesn't require any special libraries
13 | # so we'll just open the file in read format ("r") as usual
14 | source_file = open(filename+".txt", "r")
15 |
16 | # the built-in "readlines()" method does just what you'd think:
17 | # it reads in a text file and converts it to a list of lines
18 | stations_list = source_file.readlines()
19 |
20 | # create an output file for our transformed data
21 | output_file = open(filename+".csv","w")
22 |
23 | # use the `csv` library's "writer" recipe to easily write rows of data
24 | # to `output_file`, instead of reading data *from* it
25 | output_writer = csv.writer(output_file)
26 |
27 | # create the header list
28 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME","GSN_FLAG",
29 | "HCNCRN_FLAG","WMO_ID"]
30 |
31 | # write our headers to the output file
32 | output_writer.writerow(headers)
33 |
34 | # loop through each line of our file (multiple "sheets" are not possible)
35 | for line in stations_list:
36 | # create an empty list, to which we'll append each set of characters that
37 | # makes up a given "column" of data
38 | new_row = []
39 | # ID: positions 1-11
40 | new_row.append(line[0:11])
41 | # LATITUDE: positions 13-20
42 | new_row.append(line[12:20])
43 | # LONGITUDE: positions 22-30
44 | new_row.append(line[21:30])
45 | # ELEVATION: positions 32-37
46 | new_row.append(line[31:37])
47 | # STATE: positions 39-40
48 | new_row.append(line[38:40])
49 | # NAME: positions 42-71
50 | new_row.append(line[41:71])
51 | # GSN_FLAG: positions 73-75
52 | new_row.append(line[72:75])
53 | # HCNCRN_FLAG: positions 77-79
54 | new_row.append(line[76:79])
55 | # WMO_ID: positions 81-85
56 | new_row.append(line[80:85])
57 |
58 | # now all that's left is to use the
59 | # `writerow` function to write new_row to our output file
60 | output_writer.writerow(new_row)
61 |
62 | # officially close the `.csv` file we just wrote all that data to
63 | output_file.close()
64 |
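The position-to-slice conversion above is easy to get off by one: the NOAA metadata describes 1-indexed, inclusive character positions, while Python slices are 0-indexed and end-exclusive, which is why positions 1-11 become `line[0:11]`. A minimal sketch using a made-up line in roughly the same layout:

# a made-up station line, roughly following the ghcnd-stations.txt layout
sample_line = "ACW00011604  17.1167  -61.7833"
print(sample_line[0:11])    # ID, positions 1-11 -> "ACW00011604"
print(sample_line[12:20])   # LATITUDE, positions 13-20 -> " 17.1167"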
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/json_parsing.py:
--------------------------------------------------------------------------------
1 | # A simple example of reading data from a .json file with Python,
2 | # using the built-in "json" library. The data used here is an instance of
3 | # https://api.stlouisfed.org/fred/series/observations?series_id=U6RATE& \
4 | # file_type=json&api_key=YOUR_API_KEY_HERE
5 |
6 | # import the `json` library, since that's our source file format
7 | import json
8 |
9 | # import the `csv` library, to create our output file
10 | import csv
11 |
12 | # choose a filename
13 | filename = "U6_FRED_data"
14 |
15 | # open the file in read format ("r") as usual
16 | json_source_file = open(filename+".json","r")
17 |
18 | # pass the `json_source_file` as an ingredient to the json library's `load()`
19 | # method and store the result in a variable called `json_data`
20 | json_data = json.load(json_source_file)
21 |
22 | # create our output file, naming it "json_"+filename
23 | output_file = open("json_"+filename+".csv","w")
24 |
25 | # use the `csv` library's "writer" recipe to easily write rows of data
26 | # to `output_file`, instead of reading data *from* it
27 | output_writer = csv.writer(output_file)
28 |
29 | # grab the first element (at position "0"), and use its keys as the column headers
30 | output_writer.writerow(list(json_data["observations"][0].keys()))
31 |
32 | for obj in json_data["observations"]:
33 |
34 | # we'll create an empty list where we'll put the actual values of each object
35 | obj_values = []
36 |
37 | # for every `key` (which will become a column), in each object
38 | for key, value in obj.items():
39 |
40 | # let's print what's in here, just to see how the code sees it
41 | print(key,value)
42 |
43 | # add the values to our list
44 | obj_values.append(value)
45 |
46 | # now we've got the whole row, write the data to our output file
47 | output_writer.writerow(obj_values)
48 |
49 | # officially close the `.csv` file we just wrote all that data to
50 | output_file.close()
51 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/ods_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from an .ods file with Python, using the
2 | # "pyexcel_ods" library. First, you'll need to pip install the library:
3 | # https://pypi.org/project/pyexcel-ods/
4 |
5 | # specify the "chapter" of the "pyexcel_ods" library you want to import,
6 | # in this case, `get_data`
7 | from pyexcel_ods import get_data
8 |
9 | # import the `csv` library, to create our output file
10 | import csv
11 |
12 | # pass our filename as an ingredient to the `pyexcel_ods` library's
13 | # `get_data()` "recipe"
14 | # store the result in a variable called `source_workbook`
15 | source_workbook = get_data("fredgraph.ods")
16 |
17 | # an `.ods` workbook can have multiple sheets
18 | for sheet_name, sheet_data in source_workbook.items():
19 |
20 | # print `sheet_name`, just to see what it is
21 | print(sheet_name)
22 |
23 | # create "ods_"+sheet_name+".csv" as an output file for the current sheet
24 | output_file = open("ods_"+sheet_name+".csv","w")
25 |
26 | # use this csv library's "writer" recipe to easily write rows of data
27 | # to `output_file`, instead of reading data *from* it
28 | output_writer = csv.writer(output_file)
29 |
30 | # now, we need to loop through every row in our sheet
31 | for row in sheet_data:
32 |
33 | # use the `writerow` recipe to write each `row`
34 | # directly to our output file
35 | output_writer.writerow(row)
36 |
37 | # officially close the `.csv` file we just wrote all that data to
38 | output_file.close()
39 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/pdf_parsing.py:
--------------------------------------------------------------------------------
1 | # A basic example of reading data from a .pdf file with Python,
2 | # using `pdf2image` to convert it to images, and then using the
3 | # openCV and `tesseract` libraries to extract the text
4 | # The source data was downloaded from:
5 | # https://files.stlouisfed.org/files/htdocs/publications/page1-econ/2020/12/01/ \
6 | # unemployment-insurance-a-tried-and-true-safety-net_SE.pdf
7 |
8 | # the built-in `operating system` or `os` Python library will let us create
9 | # a new folder in which to store our converted images and output text
10 | import os
11 |
12 | # we'll import the `convert_from_path` "chapter" of the `pdf2image` library
13 | from pdf2image import convert_from_path
14 |
15 | # the built-in `glob` library offers a handy way to loop through all the files
16 | # in a folder that have a certain file extension, for example
17 | import glob
18 |
19 | # `cv2` is the actual library name for `openCV`
20 | import cv2
21 |
22 | # and of course, we need our Python library for interfacing
23 | # with the tesseract OCR process
24 | import pytesseract
25 |
26 | # we'll use the pdf name to name both our generated images and text files
27 | pdf_name = "SafetyNet"
28 |
29 | # our source pdf is in the same folder as our Python script
30 | pdf_source_file = pdf_name+".pdf"
31 |
32 | # as long as a folder with the same name as the pdf does not already exist
33 | if os.path.isdir(pdf_name) == False:
34 | # create a new folder with that name
35 | target_folder = os.mkdir(pdf_name)
36 |
37 | # store all the pages of the PDF in a variable
38 | pages = convert_from_path(pdf_source_file, 300)
39 |
40 | # loop through all the converted pages, enumerating them so that the page
41 | # number can be used to label the resulting images
42 | for page_num, page in enumerate(pages):
43 | # create unique filenames for each page image, combining the
44 | # folder name and the page number
45 | filename = os.path.join(pdf_name,"p"+str(page_num)+".png")
46 | # save the image of the page in system
47 | page.save(filename, 'PNG')
48 |
49 | # next, go through all the files in the folder that end in `.png`
50 | for img_file in glob.glob(os.path.join(pdf_name, '*.png')):
51 | # replace the slash in the image's filename with a dot
52 | temp_name = img_file.replace("/",".")
53 | # pull the unique page name (e.g. `p2`) from the `temp_name`
54 | text_filename = temp_name.split(".")[1]
55 | # now! create a new, writable file, also in our target folder, that
56 | # has the same name as the image, but is a `.txt` file
57 | output_file = open(os.path.join(pdf_name,text_filename+".txt"), "w")
58 | # use the `cv2` library to interpret our image
59 | img = cv2.imread(img_file)
60 | # create a new variable to hold the results of using pytesseract's
61 | # `image_to_string()` function, which will do just that
62 | converted_text = pytesseract.image_to_string(img)
63 | # write our extracted text to our output file
64 | output_file.write(converted_text)
65 | # close the output file
66 | output_file.close()
67 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/rss_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from an .xml file with Python, using the "lxml"
2 | # library.
3 | # First, you'll need to pip install the lxml library:
4 | # https://pypi.org/project/lxml/
5 | # The data used here is an instance of
6 | # http://feeds.bbci.co.uk/news/science_and_environment/rss.xml
7 |
8 | # specify the "chapter" of the `lxml` library you want to import,
9 | # in this case, `etree`, which stands for "ElementTree"
10 | from lxml import etree
11 |
12 | # import the `csv` library, to create our output file
13 | import csv
14 |
15 | # choose a filename, for simplicity
16 | filename = "BBC News - Science Environment XML Feed"
17 |
18 | # open our data file in read format, using "rb" as the "mode"
19 | xml_source_file = open(filename+".xml","rb")
20 |
21 | # pass our xml_source_file as an ingredient to the `lxml` library's
22 | # `etree.parse()` method and store the result in a variable called `xml_doc`
23 | xml_doc = etree.parse(xml_source_file)
24 |
25 | # start by getting the current xml document's "root" element
26 | document_root = xml_doc.getroot()
27 |
28 | # if the document_root is a well-formed XML element
29 | if etree.iselement(document_root):
30 |
31 | # create our output file, naming it "rss_"+filename+".csv"
32 | output_file = open("rss_"+filename+".csv","w")
33 |
34 | # use the `csv` library's "writer" recipe to easily write rows of data
35 | # to `output_file`, instead of reading data *from* it
36 | output_writer = csv.writer(output_file)
37 |
38 | # document_root[0] is the "channel" element
39 | main_channel = document_root[0]
40 |
41 | # the `find()` method returns *only* the first instance of the element name
42 | article_example = main_channel.find('item')
43 |
44 | # create an empty list in which to store our future column headers
45 | tag_list = []
46 | for child in article_example.iterdescendants():
47 |
48 | # add each tag to our would-be header list
49 | tag_list.append(child.tag)
50 |
51 | # if the current tag has any attributes
52 | if child.attrib:
53 |
54 | # loop through the attribute keys in the tag
55 | for attribute_name in child.attrib.keys():
56 |
57 | # append the attribute name to our `tag_list` column headers
58 | tag_list.append(attribute_name)
59 |
60 | # write the contents of `tag_list` to our output file as column headers
61 | output_writer.writerow(tag_list)
62 |
63 | # now we want to grab *every* `item` element in our file
64 | # so we use the `findall()` method instead of `find()`
65 | for item in main_channel.findall('item'):
66 |
67 | # empty list for holding our new row's content
68 | new_row = []
69 |
70 | # now we'll use our list of tags to get the contents of each element
71 | for tag in tag_list:
72 |
73 | # if there is anything in the element with a given tag name
74 | if item.findtext(tag):
75 |
76 | # append it to our new row
77 | new_row.append(item.findtext(tag))
78 |
79 | # otherwise, if the current tag is the "isPermaLink" attribute
80 | elif tag == "isPermaLink":
81 |
82 | # grab its value from the element
83 | # and append it to our row
84 | new_row.append(item.find('guid').get("isPermaLink"))
85 |
86 | # write the new row to our output file!
87 | output_writer.writerow(new_row)
88 |
89 | # officially close the `.csv` file we just wrote all that data to
90 | output_file.close()
91 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/tsv_parsing.py:
--------------------------------------------------------------------------------
1 | # A simple example of reading data from a .tsv file with Python, using
2 | # the `csv` library. The source data was downloaded as a .tsv file
3 | # from Jed Shugerman's Google Sheet on prosecutor politicians:
4 | # https://docs.google.com/spreadsheets/d/1E6Z-jZWbrKmit_4lG36oyQ658Ta6Mh25HCOBaz7YVrA/
5 |
6 | # import the `csv` library
7 | import csv
8 |
9 | # open the `ShugermanProsecutorPoliticians-SupremeCourtJustices.tsv` file
10 | # in read ("r") mode.
11 | # This file should be in the same folder as our Python script or notebook
12 | tsv_source_file = open("ShugermanProsecutorPoliticians-SupremeCourtJustices.tsv","r")
13 |
14 | # pass our `tsv_source_file` as an ingredient to the csv library's
15 | # DictReader "recipe."
16 | # Store the result in a variable called `politicians_reader`
17 | politicians_reader = csv.DictReader(tsv_source_file, delimiter='\t')
18 |
19 | # the DictReader method has added some useful information to our data,
20 | # like a `fieldnames` property that lets us access all the values
21 | # in the first or "header" row
22 | print(politicians_reader.fieldnames)
23 |
24 | # we'll use the `next()` function to print just the first row of data
25 | print (next(politicians_reader))
26 |
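A minimal sketch of what the `delimiter` ingredient does: the same `DictReader` recipe, pointed at a tiny in-memory tab-separated sample (the `StringIO` wrapper just makes the string behave like an open file):

import csv
from io import StringIO

# two tab-separated lines: a header row and one data row
sample_tsv = "name\trole\nSusan E. McGregor\tauthor\n"

# telling DictReader to split on tabs instead of commas
sample_reader = csv.DictReader(StringIO(sample_tsv), delimiter='\t')
print(next(sample_reader)["role"])   # prints "author"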
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/txt_parsing.py:
--------------------------------------------------------------------------------
1 | # A simple example of reading tab-separated data from a .txt file with
2 | # Python, using the `csv` library. The source data was downloaded as a .tsv file
3 | # from Jed Shugerman's Google Sheet on prosecutor politicians:
4 | # https://docs.google.com/spreadsheets/d/1E6Z-jZWbrKmit_4lG36oyQ658Ta6Mh25HCOBaz7YVrA/
5 | # The original .tsv file was renamed with a file extension of .txt
6 |
7 | # import the `csv` library
8 | import csv
9 |
10 | # open the `ShugermanProsecutorPoliticians-SupremeCourtJustices.txt` file
11 | # in read ("r") mode.
12 | # This file should be in the same folder as our Python script or notebook
13 | txt_source_file = open("ShugermanProsecutorPoliticians-SupremeCourtJustices.txt","r")
14 |
15 | # pass our `txt_source_file` as an ingredient to the csv library's DictReader
16 | # "recipe" and store the result in a variable called `politicians_reader`
17 | # add the "delimiter" parameter and specify the tab character, "\t"
18 | politicians_reader = csv.DictReader(txt_source_file, delimiter='\t')
19 |
20 | # the DictReader function has added useful information to our data,
21 | # like a `fieldnames` property that shows us all the values in the first or "header" row
22 | print(politicians_reader.fieldnames)
23 |
24 | # we'll use the `next()` function to print just the first row of data
25 | print (next(politicians_reader))
26 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/xls_parsing.py:
--------------------------------------------------------------------------------
1 | # A simple example of reading data from a .xls file with Python
2 | # using the "xlrd" library. First, pip install the xlrd library:
3 | # https://pypi.org/project/xlrd/2.0.1/
4 |
5 | # import the "xlrd" library
6 | import xlrd
7 |
8 | # import the `csv` library, to create our output file
9 | import csv
10 |
11 | # pass our filename as an ingredient to the `xlrd` library's
12 | # `open_workbook()` "recipe"
13 | # store the result in a variable called `source_workbook`
14 | source_workbook = xlrd.open_workbook("fredgraph.xls")
15 |
16 | # an `.xls` workbook can have multiple sheets
17 | for sheet_name in source_workbook.sheet_names():
18 |
19 | # create a variable that points to the current worksheet by
20 | # passing the current value of `sheet_name` to the `sheet_by_name` recipe
21 | current_sheet = source_workbook.sheet_by_name(sheet_name)
22 |
23 | # print `sheet_name`, just to see what it is
24 | print(sheet_name)
25 |
26 | # create "xls_"+sheet_name+".csv" as an output file for the current sheet
27 | output_file = open("xls_"+sheet_name+".csv","w")
28 |
29 | # use the `csv` library's "writer" recipe to easily write rows of data
30 | # to `output_file`, instead of reading data *from* it
31 | output_writer = csv.writer(output_file)
32 |
33 | # now, we need to loop through every row in our sheet
34 | for row_num, row in enumerate(current_sheet.get_rows()):
35 |
36 | # each row is a list of `Cell` objects; the `row_values()` method
37 | # gives us just their values
38 | # then we can use the `writerow` recipe to write them
39 | # directly to our output file
40 | output_writer.writerow(current_sheet.row_values(row_num))
41 |
42 | # officially close the `.csv` file we just wrote all that data to
43 | output_file.close()
44 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/xlsx_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from an .xlsx file with Python, using the "openpyxl"
2 | # library. First, you'll need to pip install the openpyxl library:
3 | # https://pypi.org/project/openpyxl/
4 | # The source data can be composed and downloaded from:
5 | # https://fred.stlouisfed.org/series/U6RATE
6 |
7 | # specify the "chapter" you want to import from the "openpyxl" library
8 | # in this case, "load_workbook"
9 | from openpyxl import load_workbook
10 |
11 | # import the `csv` library, to create our output file
12 | import csv
13 |
14 | # Pass our filename as an ingredient to the `openpyxl` library's
15 | # `load_workbook()` "recipe"
16 | # store the result in a variable called `source_workbook`
17 | source_workbook = load_workbook(filename = 'fredgraph.xlsx')
18 |
19 | # an .xlsx workbook can have multiple sheets
20 | # print their names here for reference
21 | print(source_workbook.sheetnames)
22 |
23 | # loop through the worksheets in `source_workbook`
24 | for sheet_num, sheet_name in enumerate(source_workbook.sheetnames):
25 |
26 | # create a variable that points to the current worksheet by
27 | # passing the current value of `sheet_name` to `source_workbook`
28 | current_sheet = source_workbook[sheet_name]
29 |
30 | # print `sheet_name`, just to see what it is
31 | print(sheet_name)
32 |
33 | # create an output file called "xlsx_"+sheet_name
34 | output_file = open("xlsx_"+sheet_name+".csv","w")
35 |
36 | # use this csv library's "writer" recipe to easily write rows of data
37 | # to `output_file`, instead of reading data *from* it
38 | output_writer = csv.writer(output_file)
39 |
40 | # loop through every row in our sheet
41 | for row in current_sheet.iter_rows():
42 |
43 | # we'll create an empty list where we'll put the actual
44 | # values of the cells in each row
45 | row_cells = []
46 |
47 | # for every cell (or column) in each row....
48 | for cell in row:
49 |
50 | # let's print what's in here, just to see how the code sees it
51 | print(cell, cell.value)
52 |
53 | # add the values to the end of our list with the `append()` method
54 | row_cells.append(cell.value)
55 |
56 | # write our newly (re)constructed data row to the output file
57 | output_writer.writerow(row_cells)
58 |
59 | # officially close the `.csv` file we just wrote all that data to
60 | output_file.close()
61 |
--------------------------------------------------------------------------------
/chapter_4_examples/standalone_files/xml_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from an .xml file with Python, using the "lxml"
2 | # library.
3 | # First, you'll need to pip install the lxml library:
4 | # https://pypi.org/project/lxml/
5 | # A helpful tutorial can be found here: https://lxml.de/tutorial.html
6 | # The data used here is an instance of
7 | # https://api.stlouisfed.org/fred/series/observations?series_id=U6RATE& \
8 | # api_key=YOUR_API_KEY_HERE
9 |
10 | # specify the "chapter" of the `lxml` library you want to import,
11 | # in this case, `etree`, which stands for "ElementTree"
12 | from lxml import etree
13 |
14 | # import the `csv` library, to create our output file
15 | import csv
16 |
17 | # choose a filename
18 | filename = "U6_FRED_data"
19 |
20 | # open our data file in read format, using "rb" as the "mode"
21 | xml_source_file = open(filename+".xml","rb")
22 |
23 | # pass our xml_source_file as an ingredient to the `lxml` library's
24 | # `etree.parse()` method and store the result in a variable called `xml_doc`
25 | xml_doc = etree.parse(xml_source_file)
26 |
27 | # start by getting the current xml document's "root" element
28 | document_root = xml_doc.getroot()
29 |
30 | # let's print it out to see what it looks like
31 | print(etree.tostring(document_root))
32 |
33 | # confirm that `document_root` is a well-formed XML element
34 | if etree.iselement(document_root):
35 |
36 | # create our output file, naming it "xml_"+filename+".csv
37 | output_file = open("xml_"+filename+".csv","w")
38 |
39 | # use the `csv` library's "writer" recipe to easily write rows of data
40 | # to `output_file`, instead of reading data *from* it
41 | output_writer = csv.writer(output_file)
42 |
43 | # grab the first element of our xml document (using `document_root[0]`)
44 | # and write its attribute keys as column headers to our output file
45 | output_writer.writerow(document_root[0].attrib.keys())
46 |
47 | # now, we need to loop through every element in our XML file
48 | for child in document_root:
49 |
50 | # now we'll use the `.values()` method to get each element's values
51 | # as a list, and then use that directly with the `writerow` recipe
52 | output_writer.writerow(child.attrib.values())
53 |
54 | # officially close the `.csv` file we just wrote all that data to
55 | output_file.close()
56 |
--------------------------------------------------------------------------------
/chapter_5_examples/.gitignore:
--------------------------------------------------------------------------------
1 | # ignoring all credentials files
2 | **credentials*
3 |
--------------------------------------------------------------------------------
/chapter_5_examples/jupyter_notebooks/FRED_API_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# import the requests library, which let's us write Python that acts like\n",
10 | "# a web browser through code\n",
11 | "import requests"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
21 | "# # Import PyDrive and associated libraries.\n",
22 | "# # This only needs to be done once per notebook.\n",
23 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
24 | "# from pydrive.auth import GoogleAuth\n",
25 | "# from pydrive.drive import GoogleDrive\n",
26 | "# from google.colab import auth\n",
27 | "# from oauth2client.client import GoogleCredentials\n",
28 | "\n",
29 | "# # Authenticate and create the PyDrive client.\n",
30 | "# # This only needs to be done once per notebook.\n",
31 | "# auth.authenticate_user()\n",
32 | "# gauth = GoogleAuth()\n",
33 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
34 | "# drive = GoogleDrive(gauth)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
44 | "# # Link to data file stored in Drive: LINK TO YOUR CREDENTIALS FILE ON DRIVE\n",
45 | "# file_id = 'FILE_ID_OF_YOUR_CREDENTIALS_FILE_ON_DRIVE' # notice where this string comes from in link above\n",
46 | "\n",
47 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
48 | "# print(imported_file['title']) # it should print the title of desired file\n",
49 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# we can import our API key by first giving Python the name of our credentials\n",
59 | "# file, and then telling it the variable to import\n",
60 | "from FRED_credentials import my_api_key"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "# specify the FRED endpoint we want to use\n",
70 | "FRED_endpoint = \"https://api.stlouisfed.org/fred/series/observations?\"\n",
71 | "\n",
72 | "# also specify the query parameters and their values\n",
73 | "FRED_parameters = \"series_id=U6RATE&file_type=json\"\n",
74 | "\n",
75 | "# construct the complete URL for our API request, adding our API key to the end\n",
76 | "complete_data_URL = FRED_endpoint + FRED_parameters +\"&api_key=\"+my_api_key\n",
77 | "\n",
78 | "# open a new, writable file with our chosen filename\n",
79 | "FRED_output_file = open(\"FRED_API_data.json\",\"w\")\n",
80 | "\n",
81 | "# use the requests library's \"get\" recipe to access the contents of our\n",
82 | "# target URL and store it in a our `FRED_data` variable\n",
83 | "FRED_data = requests.get(complete_data_URL)\n",
84 | "\n",
85 | "# the requests library's \"get\" function has put the contents of the webpage\n",
86 | "# in a property \"text\", which we'll write directly to our FRED_output_file\n",
87 | "# using the built-in \"write\" method\n",
88 | "FRED_output_file.write(FRED_data.text)\n",
89 | "\n",
90 | "# close our FRED_output_file\n",
91 | "FRED_output_file.close()"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
101 | "# from google.colab import files\n",
102 | "\n",
103 | "# files.download(\"FRED_API_data.json\")"
104 | ]
105 | }
106 | ],
107 | "metadata": {
108 | "kernelspec": {
109 | "display_name": "Python 3 (ipykernel)",
110 | "language": "python",
111 | "name": "python3"
112 | },
113 | "language_info": {
114 | "codemirror_mode": {
115 | "name": "ipython",
116 | "version": 3
117 | },
118 | "file_extension": ".py",
119 | "mimetype": "text/x-python",
120 | "name": "python",
121 | "nbconvert_exporter": "python",
122 | "pygments_lexer": "ipython3",
123 | "version": "3.9.5"
124 | }
125 | },
126 | "nbformat": 4,
127 | "nbformat_minor": 4
128 | }
129 |
--------------------------------------------------------------------------------
/chapter_5_examples/jupyter_notebooks/MTA_turnstiles_index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# include the requests library in order to get data from the web\n",
10 | "import requests"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# specify the URL of the web page we're downloading\n",
20 | "# this one contains a linked list of all the NYC MTA turnstile data files\n",
21 | "# going back to 2010\n",
22 | "mta_turnstiles_index_url = \"http://web.mta.info/developers/turnstile.html\""
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# create some header information for our web page request\n",
32 | "headers = {\n",
33 | " 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \\\n",
34 | " 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \\\n",
35 | " 'Chrome/88.0.4324.109 Safari/537.36',\n",
36 | " 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som'\n",
37 | "}"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# send a `get()` request for the URL, along with our informational headers\n",
47 | "mta_web_page = requests.get(mta_turnstiles_index_url, headers=headers)\n",
48 | "\n",
49 | "# open up a writable local file where we can save the contents of the web page\n",
50 | "mta_turnstiles_output_file = open(\"MTA_turnstiles_index.html\",\"w\")\n",
51 | "\n",
52 | "# write the `text` web page to our output file\n",
53 | "mta_turnstiles_output_file.write(mta_web_page.text)\n",
54 | "\n",
55 | "# close our output file!\n",
56 | "mta_turnstiles_output_file.close()"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
66 | "# from google.colab import files\n",
67 | "\n",
68 | "# files.download(\"MTA_turnstiles_index.html\")"
69 | ]
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3 (ipykernel)",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.9.5"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/chapter_5_examples/jupyter_notebooks/data_download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# A basic example of downloading data from the web with Python,\n",
10 | "# using the requests library\n",
11 | "#\n",
12 | "# The source data we are downloading will come from the following URLs:\n",
13 | "# http://feeds.bbci.co.uk/news/science_and_environment/rss.xml\n",
14 | "# https://gbfs.citibikenyc.com/gbfs/en/station_status.json\n",
15 | "\n",
16 | "# the `requests` library lets us write Python code that acts like\n",
17 | "# a web browser\n",
18 | "import requests"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
28 | "# from google.colab import files"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# our chosen XML filename\n",
38 | "XMLfilename = \"BBC_RSS.xml\"\n",
39 | "\n",
40 | "# open a new, writable file for our XML output\n",
41 | "xml_output_file = open(XMLfilename,\"w\")\n",
42 | "\n",
43 | "# use the requests library's \"get\" recipe to access the contents of our\n",
44 | "# target URL and store it in our `xml_data` variable\n",
45 | "xml_data = requests.get('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')\n",
46 | "\n",
47 | "# the requests library's `get()` function puts contents of the web page\n",
48 | "# in a property `text`\n",
49 | "# we'll `write` that directly to our `xml_output_file`\n",
50 | "xml_output_file.write(xml_data.text)\n",
51 | "\n",
52 | "# close our xml_output_file\n",
53 | "xml_output_file.close()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
63 | "# files.download(XMLfilename)"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# our chosen JSON filename\n",
73 | "JSONfilename = \"citibikenyc_station_status.json\"\n",
74 | "\n",
75 | "# open a new, writable file for our JSON output\n",
76 | "json_output_file = open(JSONfilename,\"w\")\n",
77 | "\n",
78 | "# use the `requests` library's `get()` recipe to access the contents of our\n",
79 | "# target URL and store it in our `json_data` variable\n",
80 | "json_data = requests.get('https://gbfs.citibikenyc.com/gbfs/en/station_status.json')\n",
81 | "\n",
82 | "# `get()` the contents of the web page and write its `text`\n",
83 | "# directly to `json_output_file`\n",
84 | "json_output_file.write(json_data.text)\n",
85 | "\n",
86 | "# close our json_output_file\n",
87 | "json_output_file.close()"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
97 | "# files.download(JSONfilename)"
98 | ]
99 | }
100 | ],
101 | "metadata": {
102 | "kernelspec": {
103 | "display_name": "Python 3 (ipykernel)",
104 | "language": "python",
105 | "name": "python3"
106 | },
107 | "language_info": {
108 | "codemirror_mode": {
109 | "name": "ipython",
110 | "version": 3
111 | },
112 | "file_extension": ".py",
113 | "mimetype": "text/x-python",
114 | "name": "python",
115 | "nbconvert_exporter": "python",
116 | "pygments_lexer": "ipython3",
117 | "version": "3.9.5"
118 | }
119 | },
120 | "nbformat": 4,
121 | "nbformat_minor": 4
122 | }
123 |
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/FRED_API_example.py:
--------------------------------------------------------------------------------
1 | # import the requests library, which lets us write Python that acts like
2 | # a web browser through code
3 | import requests
4 |
5 | # we can import our API key by first giving Python the name of our credentials
6 | # file, and then telling it the variable to import
7 | from FRED_credentials import my_api_key
8 |
9 | # specify the FRED endpoint we want to use
10 | FRED_endpoint = "https://api.stlouisfed.org/fred/series/observations?"
11 |
12 | # also specify the query parameters and their values
13 | FRED_parameters = "series_id=U6RATE&file_type=json"
14 |
15 | # construct the complete URL for our API request, adding our API key to the end
16 | complete_data_URL = FRED_endpoint + FRED_parameters +"&api_key="+my_api_key
17 |
18 | # open a new, writable file with our chosen filename
19 | FRED_output_file = open("FRED_API_data.json","w")
20 |
21 | # use the requests library's "get" recipe to access the contents of our
22 | # target URL and store it in our `FRED_data` variable
23 | FRED_data = requests.get(complete_data_URL)
24 |
25 | # the requests library's "get" function has put the contents of the webpage
26 | # in a property "text", which we'll write directly to our FRED_output_file
27 | # using the built-in "write" method
28 | FRED_output_file.write(FRED_data.text)
29 |
30 | # close our FRED_output_file
31 | FRED_output_file.close()
32 |
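A sketch of an equivalent request that lets the `requests` library assemble (and URL-encode) the query string via its `params` argument rather than concatenating it by hand; it assumes the same `FRED_credentials` file used above:

import requests
from FRED_credentials import my_api_key

# the same endpoint, with the query parameters passed as a dict
FRED_data = requests.get(
    "https://api.stlouisfed.org/fred/series/observations",
    params={"series_id": "U6RATE", "file_type": "json", "api_key": my_api_key}
)

# a status code of 200 means the request succeeded
print(FRED_data.status_code)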
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/MTA_turnstile_index.py:
--------------------------------------------------------------------------------
1 | # include the requests library in order to get data from the web
2 | import requests
3 |
4 | # specify the URL of the web page we're downloading
5 | # this one contains a linked list of all the NYC MTA turnstile data files
6 | # going back to 2010
7 | mta_turnstiles_index_url = "http://web.mta.info/developers/turnstile.html"
8 |
9 | # create some header information for our web page request
10 | headers = {
11 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \
12 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \
13 | 'Chrome/88.0.4324.109 Safari/537.36',
14 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som'
15 | }
16 |
17 | # send a `get()` request for the URL, along with our informational headers
18 | mta_web_page = requests.get(mta_turnstiles_index_url, headers=headers)
19 |
20 | # open up a writable local file where we can save the contents of the web page
21 | mta_turnstiles_output_file = open("MTA_turnstiles_index.html","w")
22 |
23 | # write the `text` web page to our output file
24 | mta_turnstiles_output_file.write(mta_web_page.text)
25 |
26 | # close our output file!
27 | mta_turnstiles_output_file.close()
28 |
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/MTA_turnstiles_data_download.py:
--------------------------------------------------------------------------------
1 | # include the requests library in order to get data from the web
2 | import requests
3 |
4 | # import the `os` Python library so we can create a new folder
5 | # in which to store our downloaded data files
6 | import os
7 |
8 | # import the `time` library
9 | import time
10 |
11 | # open the file where we stored our list of links
12 | mta_data_links = open("MTA_data_index.csv","r")
13 |
14 | # create a folder name so that we can keep the data organized
15 | folder_name = "turnstile_data"
16 |
17 | # add our header information
18 | headers = {
19 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \
20 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \
21 | 'Chrome/88.0.4324.109 Safari/537.36',
22 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.som'
23 | }
24 | # the built-in `readlines()` function converts our data file to a
25 | # list, where each line is an item
26 | mta_links_list = mta_data_links.readlines()
27 | # confirm there isn't already a folder with our chosen name
28 | if os.path.isdir(folder_name) == False:
29 | # create a new folder with that name
30 | target_folder = os.mkdir(folder_name)
31 |
32 | # only download the precise number of files we need
33 | for i in range(0,4):
34 |
35 | # use the built-in `strip()` method to remove the newline (`\n`)
36 | # character at the end of each row/link
37 | data_url = (mta_links_list[i]).strip()
38 |
39 | # create a unique output filename based on the url
40 | data_filename = data_url.split("/")[-1]
41 |
42 | # make our request for the data
43 | turnstile_data_file = requests.get(data_url, headers=headers)
44 |
45 | # open a new, writable file inside our target folder
46 | # using the appropriate filename
47 | local_data_file = open(os.path.join(folder_name,data_filename), "w")
48 |
49 | # save the contents of the downloaded file to that new file
50 | local_data_file.write(turnstile_data_file.text)
51 |
52 | # close the local file
53 | local_data_file.close()
54 |
55 | # `sleep()` for two seconds before moving on to the next item in the loop
56 | time.sleep(2)
57 |
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/MTA_turnstiles_parsing.py:
--------------------------------------------------------------------------------
1 | # import the Beautiful Soup recipe from the bs4 library
2 | from bs4 import BeautifulSoup
3 |
4 | # open the saved copy of our MTA turnstiles web page
5 | # (original here: http://web.mta.info/developers/turnstile.html)
6 | mta_web_page = open("MTA_turnstiles_index.html", "r")
7 |
8 | # define the base URL for the data files
9 | base_url = "http://web.mta.info/developers/"
10 |
11 | # the `BeautifulSoup` recipe takes the contents of our web page and another
12 | # "ingredient", which tells it what kind of code it is working with
13 | # In this case, it's HTML
14 | soup = BeautifulSoup(mta_web_page, "html.parser")
15 |
16 | # using the "find" recipe, we can pass a tag type and class name as
17 | # "ingredients" to zero in on the content we want.
18 | data_files_section = soup.find("div", class_="span-84 last")
19 |
20 | # within that div, we can now just look for all the "anchor" (`a`) tags
21 | all_data_links = data_files_section.find_all("a")
22 |
23 | # need to open a file to write our extracted links to
24 | mta_data_list = open("MTA_data_index.csv","w")
25 |
26 | # the `find_all()` recipe returns a list of everything it matches
27 | for a_link in all_data_links:
28 |
29 | # combine our base URL with the contents of each "href" (link) property,
30 | # and store it in `complete_link`
31 | complete_link = base_url+a_link["href"]
32 |
33 | # write this completed link to our output file, manually adding a
34 | # newline `\n` character to the end, so each link will be on its own row
35 | mta_data_list.write(complete_link+"\n")
36 |
37 | # once we've written all the links to our file, close it!
38 | mta_data_list.close()
39 |
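A minimal sketch of the `find()`/`find_all()` pattern used above, run against a tiny made-up HTML snippet instead of the saved MTA page:

from bs4 import BeautifulSoup

tiny_page = '<div class="data"><a href="a.txt">A</a> <a href="b.txt">B</a></div>'
soup = BeautifulSoup(tiny_page, "html.parser")

# `find()` returns the first matching tag; `find_all()` returns a list of matches
for a_link in soup.find("div", class_="data").find_all("a"):
    print(a_link["href"])   # prints "a.txt", then "b.txt"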
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/Twitter_data_download.py:
--------------------------------------------------------------------------------
1 | # import the encoded key from our credentials file
2 | from Twitter_credentials import auth_ready_key
3 |
4 | # include the requests library in order to get data from the web
5 | import requests
6 |
7 | # specify the Twitter endpoint that we'll use to retrieve
8 | # our access token or "bearer" token
9 | auth_url = 'https://api.twitter.com/oauth2/token'
10 |
11 | # add our `auth_ready_key` to a template `dict` object provided
12 | # in the Twitter API documentation
13 | auth_headers = {
14 | 'Authorization': 'Basic '+auth_ready_key,
15 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
16 | }
17 |
18 | # another `dict` describes what we're asking for
19 | auth_data = {
20 | 'grant_type': 'client_credentials'
21 | }
22 |
23 | # make our complete request to the authorization endpoint, and store
24 | # the results in the `auth_resp` variable
25 | auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
26 |
27 | # pull the access token out of the json-formatted data
28 | # that the authorization endpoint sent back to us
29 | access_token = auth_resp.json()['access_token']
30 |
31 | # now that we have an access/bearer token, we're ready to request some data!
32 | # we'll create a new dict that includes this token
33 | search_headers = {
34 | 'Authorization': 'Bearer ' + access_token
35 | }
36 |
37 | # this is the Twitter search API endpoint for version 1.1 of the API
38 | search_url = 'https://api.twitter.com/1.1/search/tweets.json'
39 |
40 | # create a new dict that includes our search query parameters
41 | search_params = {
42 | 'q': 'Python',
43 | 'result_type': 'recent',
44 | 'count': 4
45 | }
46 |
47 | # send our data request and store the results in `search_resp`
48 | search_resp = requests.get(search_url, headers=search_headers, params=search_params)
49 |
50 | # parse the response into a JSON object
51 | Twitter_data = search_resp.json()
52 |
53 | # open an output file where we can save the results
54 | Twitter_output_file = open("Twitter_search_results.json", "w")
55 |
56 | # write the returned Twitter data to our output file
57 | Twitter_output_file.write(str(Twitter_data))
58 |
59 | # close the output file
60 | Twitter_output_file.close()
61 |
62 | # loop through our results and print the text of the Twitter status
63 | for a_Tweet in Twitter_data['statuses']:
64 | print(a_Tweet['text'] + '\n')
65 |
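One caveat about the output step above: `str(Twitter_data)` writes Python's own dict notation (single quotes, `True`/`None`, and so on), so the saved file isn't strictly valid JSON. A sketch of writing it with the built-in `json` library instead, assuming the `Twitter_data` variable from the script above:

import json

# `json.dump()` serializes the parsed response as real JSON
with open("Twitter_search_results.json", "w") as Twitter_output_file:
    json.dump(Twitter_data, Twitter_output_file)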
--------------------------------------------------------------------------------
/chapter_5_examples/standalone_files/data_download.py:
--------------------------------------------------------------------------------
1 | # A basic example of downloading data from the web with Python,
2 | # using the requests library
3 | #
4 | # The source data we are downloading will come from the following URLs:
5 | # http://feeds.bbci.co.uk/news/science_and_environment/rss.xml
6 | # https://gbfs.citibikenyc.com/gbfs/en/station_status.json
7 |
8 | # the `requests` library lets us write Python code that acts like
9 | # a web browser
10 | import requests
11 |
12 | # our chosen XML filename
13 | XMLfilename = "BBC_RSS.xml"
14 |
15 | # open a new, writable file for our XML output
16 | xml_output_file = open(XMLfilename,"w")
17 |
18 | # use the requests library's "get" recipe to access the contents of our
19 | # target URL and store it in our `xml_data` variable
20 | xml_data = requests.get('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
21 |
22 | # the requests library's `get()` function puts contents of the web page
23 | # in a property `text`
24 | # we'll `write` that directly to our `xml_output_file`
25 | xml_output_file.write(xml_data.text)
26 |
27 | # close our xml_output_file
28 | xml_output_file.close()
29 |
30 | # our chosen JSON filename
31 | JSONfilename = "citibikenyc_station_status.json"
32 |
33 | # open a new, writable file for our JSON output
34 | json_output_file = open(JSONfilename,"w")
35 |
36 | # use the `requests` library's `get()` recipe to access the contents of our
37 | # target URL and store it in our `json_data` variable
38 | json_data = requests.get('https://gbfs.citibikenyc.com/gbfs/en/station_status.json')
39 |
40 | # `get()` the contents of the web page and write its `text`
41 | # directly to `json_output_file`
42 | json_output_file.write(json_data.text)
43 |
44 | # close our json_output_file
45 | json_output_file.close()
46 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_columns_review.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for reviewing the all the column names in the PPP data\n",
10 | "# to see what we can infer about them from the data itself\n",
11 | "\n",
12 | "# importing the `pandas` library\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
23 | "# # Import PyDrive and associated libraries.\n",
24 | "# # This only needs to be done once per notebook.\n",
25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
26 | "# from pydrive.auth import GoogleAuth\n",
27 | "# from pydrive.drive import GoogleDrive\n",
28 | "# from google.colab import auth\n",
29 | "# from oauth2client.client import GoogleCredentials\n",
30 | "\n",
31 | "# # Authenticate and create the PyDrive client.\n",
32 | "# # This only needs to be done once per notebook.\n",
33 | "# auth.authenticate_user()\n",
34 | "# gauth = GoogleAuth()\n",
35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
36 | "# drive = GoogleDrive(gauth)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
46 | "# # This loads the recent data sample we created with the `ppp_data_sample` script\n",
47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1vwVf5caOURuRWzsTahC7W_Eb_a57mvAA/view?usp=sharing\n",
48 | "# file_id = '1vwVf5caOURuRWzsTahC7W_Eb_a57mvAA' # notice where this string comes from in link above\n",
49 | "\n",
50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
51 | "# print(imported_file['title']) # it should print the title of desired file\n",
52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# read the recent data into a pandas DataFrame using its `read_csv()` method\n",
62 | "ppp_data_sample = pd.read_csv('recent_sample.csv')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# convert all missing data entries to '' using the `convertdtypes()` method\n",
72 | "converted_data_sample = ppp_data_sample.convert_dtypes()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# transpose the whole sample\n",
82 | "transposed_ppp_data_sample = converted_data_sample.transpose()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# print out the results!\n",
92 | "print(transposed_ppp_data_sample)"
93 | ]
94 | }
95 | ],
96 | "metadata": {
97 | "kernelspec": {
98 | "display_name": "Python 3 (ipykernel)",
99 | "language": "python",
100 | "name": "python3"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.9.5"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 4
117 | }
118 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_columns_summary.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for reviewing all the column names in the PPP data\n",
10 | "# to see what we can infer about them from the data itself\n",
11 | "\n",
12 | "# importing the `pandas` library\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
23 | "# # Import PyDrive and associated libraries.\n",
24 | "# # This only needs to be done once per notebook.\n",
25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
26 | "# from pydrive.auth import GoogleAuth\n",
27 | "# from pydrive.drive import GoogleDrive\n",
28 | "# from google.colab import auth\n",
29 | "# from oauth2client.client import GoogleCredentials\n",
30 | "\n",
31 | "# # Authenticate and create the PyDrive client.\n",
32 | "# # This only needs to be done once per notebook.\n",
33 | "# auth.authenticate_user()\n",
34 | "# gauth = GoogleAuth()\n",
35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
36 | "# drive = GoogleDrive(gauth)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
46 | "# # This loads the \"recent\" (February 2021) data\n",
47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
49 | "\n",
50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
51 | "# print(imported_file['title']) # it should print the title of desired file\n",
52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# read the recent data sample into a pandas DataFrame\n",
62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# print the summary of values that appear in the `LoanStats` column\n",
72 | "print(ppp_data.value_counts('LoanStatus'))"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# print the total number of entries in the `LoanStatus` column\n",
82 | "print(sum(ppp_data.value_counts('LoanStatus')))"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# print the summary of values that appear in the `Gender` column\n",
92 | "print(ppp_data.value_counts('Gender'))"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# print the total number of entries in the `Gender` column\n",
102 | "print(sum(ppp_data.value_counts('Gender')))"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# print how many rows do not list a value for `BorrowerAddress`\n",
112 | "print(ppp_data['BorrowerAddress'].isna().sum())"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3 (ipykernel)",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.9.5"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
138 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_date_range.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for finding the earliest and latest loan dates in the PPP loan\n",
10 | "# data\n",
11 | "\n",
12 | "# importing the `pandas` library\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
23 | "# # Import PyDrive and associated libraries.\n",
24 | "# # This only needs to be done once per notebook.\n",
25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
26 | "# from pydrive.auth import GoogleAuth\n",
27 | "# from pydrive.drive import GoogleDrive\n",
28 | "# from google.colab import auth\n",
29 | "# from oauth2client.client import GoogleCredentials\n",
30 | "\n",
31 | "# # Authenticate and create the PyDrive client.\n",
32 | "# # This only needs to be done once per notebook.\n",
33 | "# auth.authenticate_user()\n",
34 | "# gauth = GoogleAuth()\n",
35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
36 | "# drive = GoogleDrive(gauth)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
46 | "# # This loads the \"recent\" (February 2021) data\n",
47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
49 | "\n",
50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
51 | "# print(imported_file['title']) # it should print the title of desired file\n",
52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# read the recent data into a pandas DataFrame using its `read_csv()` method\n",
62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# convert the values in the `DateApproved` column to *actual* dates\n",
72 | "ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved'],\n",
73 | " format='%m/%d/%Y')"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# print out the `min()` and `max()` values in the `DateApproved` column\n",
83 | "print(ppp_data['DateApproved'].min())\n",
84 | "print(ppp_data['DateApproved'].max())"
85 | ]
86 | }
87 | ],
88 | "metadata": {
89 | "kernelspec": {
90 | "display_name": "Python 3 (ipykernel)",
91 | "language": "python",
92 | "name": "python3"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.9.5"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 4
109 | }
110 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_find_waterford.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for finding a business within our data set by (partial) name\n",
10 | "\n",
11 | "# importing the `pandas` library\n",
12 | "import pandas as pd"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
22 | "# # Import PyDrive and associated libraries.\n",
23 | "# # This only needs to be done once per notebook.\n",
24 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
25 | "# from pydrive.auth import GoogleAuth\n",
26 | "# from pydrive.drive import GoogleDrive\n",
27 | "# from google.colab import auth\n",
28 | "# from oauth2client.client import GoogleCredentials\n",
29 | "\n",
30 | "# # Authenticate and create the PyDrive client.\n",
31 | "# # This only needs to be done once per notebook.\n",
32 | "# auth.authenticate_user()\n",
33 | "# gauth = GoogleAuth()\n",
34 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
35 | "# drive = GoogleDrive(gauth)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
45 | "# # This loads the \"recent\" (February 2021) data\n",
46 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
47 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
48 | "\n",
49 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
50 | "# print(imported_file['title']) # it should print the title of desired file\n",
51 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# read the recent data sample into a pandas DataFrame\n",
61 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# create a DataFrame without any missing `BorrowerName` values\n",
71 | "ppp_data_named_borrowers = ppp_data[ppp_data['BorrowerName'].notna()]"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# because precise matching can be tricky,\n",
81 | "# we'll use the pandas `str.contains()` method\n",
82 | "bankruptcy_example = ppp_data_named_borrowers[ppp_data_named_borrowers['BorrowerName']\n",
83 | " .str.contains('WATERFORD RECEPTIONS')]"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# transposing the result so it's easier to read\n",
93 | "print(bankruptcy_example.transpose())"
94 | ]
95 | }
96 | ],
97 | "metadata": {
98 | "kernelspec": {
99 | "display_name": "Python 3 (ipykernel)",
100 | "language": "python",
101 | "name": "python3"
102 | },
103 | "language_info": {
104 | "codemirror_mode": {
105 | "name": "ipython",
106 | "version": 3
107 | },
108 | "file_extension": ".py",
109 | "mimetype": "text/x-python",
110 | "name": "python",
111 | "nbconvert_exporter": "python",
112 | "pygments_lexer": "ipython3",
113 | "version": "3.9.5"
114 | }
115 | },
116 | "nbformat": 4,
117 | "nbformat_minor": 4
118 | }
119 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_lender_names.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for determining whether there are typos &c. in any of the PPP\n",
10 | "# loan data's bank names\n",
11 | "\n",
12 | "# importing the `pandas` library. The `as` keyword let's us essentially create\n",
13 | "# a nickname for the library so that we can refer to it in fewer characters\n",
14 | "import pandas as pd\n",
15 | "\n",
16 | "# importing the `fingerprints` library, which will help us generate normalized\n",
17 | "# labels for each of the bank names in our data set\n",
18 | "import fingerprints"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
28 | "# # Import PyDrive and associated libraries.\n",
29 | "# # This only needs to be done once per notebook.\n",
30 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
31 | "# from pydrive.auth import GoogleAuth\n",
32 | "# from pydrive.drive import GoogleDrive\n",
33 | "# from google.colab import auth\n",
34 | "# from oauth2client.client import GoogleCredentials\n",
35 | "\n",
36 | "# # Authenticate and create the PyDrive client.\n",
37 | "# # This only needs to be done once per notebook.\n",
38 | "# auth.authenticate_user()\n",
39 | "# gauth = GoogleAuth()\n",
40 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
41 | "# drive = GoogleDrive(gauth)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
51 | "# # This loads the \"recent\" (February 2021) data\n",
52 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
53 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
54 | "\n",
55 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
56 | "# print(imported_file['title']) # it should print the title of desired file\n",
57 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "# read the recent data sample into a pandas DataFrame using the library's\n",
67 | "# `read_csv()` method\n",
68 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "# use the pandas DataFrame `unique()` function to create a list of unique\n",
78 | "# bank names in our data's `OriginatingLender` column\n",
79 | "unique_names = ppp_data['OriginatingLender'].unique()\n",
80 | "\n",
81 | "# confirm how many unique names there are\n",
82 | "print(len(unique_names))"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# create an empty list to hold the fingerprint of each of the unique names\n",
92 | "fingerprint_list = []\n",
93 | "\n",
94 | "# iterate through each name in the list of unique names\n",
95 | "for name in unique_names:\n",
96 | "\n",
97 | " # for each name, generate its fingerprint\n",
98 | " # and append it to the end of the list\n",
99 | " fingerprint_list.append(fingerprints.generate(name))\n",
100 | "\n",
101 | "\n",
102 | "# use the built-in `set()` method on our fingerprint_list, which will\n",
103 | "# remove duplicates (and sort it)\n",
104 | "fingerprint_set = set(fingerprint_list)\n",
105 | "\n",
106 | "# check the length of the fingerprint_set\n",
107 | "print(len(fingerprint_set))"
108 | ]
109 | }
110 | ],
111 | "metadata": {
112 | "kernelspec": {
113 | "display_name": "Python 3 (ipykernel)",
114 | "language": "python",
115 | "name": "python3"
116 | },
117 | "language_info": {
118 | "codemirror_mode": {
119 | "name": "ipython",
120 | "version": 3
121 | },
122 | "file_extension": ".py",
123 | "mimetype": "text/x-python",
124 | "name": "python",
125 | "nbconvert_exporter": "python",
126 | "pygments_lexer": "ipython3",
127 | "version": "3.9.5"
128 | }
129 | },
130 | "nbformat": 4,
131 | "nbformat_minor": 4
132 | }
133 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_loan_status.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for determining how many loans have been disbursed\n",
10 | "\n",
11 | "# importing the `pandas` library\n",
12 | "import pandas as pd"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
22 | "# # Import PyDrive and associated libraries.\n",
23 | "# # This only needs to be done once per notebook.\n",
24 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
25 | "# from pydrive.auth import GoogleAuth\n",
26 | "# from pydrive.drive import GoogleDrive\n",
27 | "# from google.colab import auth\n",
28 | "# from oauth2client.client import GoogleCredentials\n",
29 | "\n",
30 | "# # Authenticate and create the PyDrive client.\n",
31 | "# # This only needs to be done once per notebook.\n",
32 | "# auth.authenticate_user()\n",
33 | "# gauth = GoogleAuth()\n",
34 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
35 | "# drive = GoogleDrive(gauth)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
45 | "# # This loads the \"recent\" (February 2021) data\n",
46 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
47 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
48 | "\n",
49 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
50 | "# print(imported_file['title']) # it should print the title of desired file\n",
51 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# read the recent data sample into a pandas DataFrame\n",
61 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# print a summary of values in the `LoanStatus` column\n",
71 | "print(ppp_data['LoanStatus'].value_counts())\n",
72 | "print(sum(ppp_data['LoanStatus'].value_counts()))"
73 | ]
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "Python 3 (ipykernel)",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.9.5"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 4
97 | }
98 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_min_max_loan.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script for finding the minimum and maximum loans currently approved\n",
10 | "# in our PPP loan data set\n",
11 | "\n",
12 | "# importing the `pandas` library\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
23 | "# # Import PyDrive and associated libraries.\n",
24 | "# # This only needs to be done once per notebook.\n",
25 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
26 | "# from pydrive.auth import GoogleAuth\n",
27 | "# from pydrive.drive import GoogleDrive\n",
28 | "# from google.colab import auth\n",
29 | "# from oauth2client.client import GoogleCredentials\n",
30 | "\n",
31 | "# # Authenticate and create the PyDrive client.\n",
32 | "# # This only needs to be done once per notebook.\n",
33 | "# auth.authenticate_user()\n",
34 | "# gauth = GoogleAuth()\n",
35 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
36 | "# drive = GoogleDrive(gauth)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
46 | "# # This loads the \"recent\" (February 2021) data\n",
47 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
48 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
49 | "\n",
50 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
51 | "# print(imported_file['title']) # it should print the title of desired file\n",
52 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# read the recent data into a pandas DataFrame\n",
62 | "ppp_data = pd.read_csv('public_150k_plus_recent.csv')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# use the pandas `min()` and `max()` methods to retrieve the\n",
72 | "# largest and smallest values, respectively\n",
73 | "print(ppp_data['CurrentApprovalAmount'].min())\n",
74 | "print(ppp_data['CurrentApprovalAmount'].max())"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3 (ipykernel)",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.9.5"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 4
99 | }
100 |
--------------------------------------------------------------------------------
/chapter_6_examples/jupyter_notebooks/ppp_numrows.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Quick script to print out the number of rows in each of our PPP loan data files\n",
10 | "# This is a pretty basic task, so no need to import extra libraries!"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
20 | "# # Import PyDrive and associated libraries.\n",
21 | "# # This only needs to be done once per notebook.\n",
22 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
23 | "# from pydrive.auth import GoogleAuth\n",
24 | "# from pydrive.drive import GoogleDrive\n",
25 | "# from google.colab import auth\n",
26 | "# from oauth2client.client import GoogleCredentials\n",
27 | "\n",
28 | "# # Authenticate and create the PyDrive client.\n",
29 | "# # This only needs to be done once per notebook.\n",
30 | "# auth.authenticate_user()\n",
31 | "# gauth = GoogleAuth()\n",
32 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
33 | "# drive = GoogleDrive(gauth)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
43 | "# # This loads the August 2020 data\n",
44 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/11wTOapbAzcfeCQVVB-YJFIpsQVaZxJAm/view?usp=sharing\n",
45 | "# file_id = '11wTOapbAzcfeCQVVB-YJFIpsQVaZxJAm' # notice where this string comes from in link above\n",
46 | "\n",
47 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
48 | "# print(imported_file['title']) # it should print the title of desired file\n",
49 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
59 | "# # This loads the \"recent\" (February 2021) data\n",
60 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo/view?usp=sharing\n",
61 | "# file_id = '1M1AbFf8cUl0PwgKiXAbKBP3fPXIBKcMo' # notice where this string comes from in link above\n",
62 | "\n",
63 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
64 | "# print(imported_file['title']) # it should print the title of desired file\n",
65 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# open the August PPP data in \"read\" mode\n",
75 | "august_data = open(\"public_150k_plus_080820.csv\",\"r\")"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "# use `readlines()` to convert the lines in the data file into a list\n",
85 | "print(\"August file has \"+str(len(august_data.readlines()))+\" rows.\")"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# ditto for the recent PPP data\n",
95 | "recent_data = open(\"public_150k_plus_recent.csv\",\"r\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "# once again, print the number of lines\n",
105 | "print(\"Recent file has \"+str(len(recent_data.readlines()))+\" rows.\")"
106 | ]
107 | }
108 | ],
109 | "metadata": {
110 | "kernelspec": {
111 | "display_name": "Python 3 (ipykernel)",
112 | "language": "python",
113 | "name": "python3"
114 | },
115 | "language_info": {
116 | "codemirror_mode": {
117 | "name": "ipython",
118 | "version": 3
119 | },
120 | "file_extension": ".py",
121 | "mimetype": "text/x-python",
122 | "name": "python",
123 | "nbconvert_exporter": "python",
124 | "pygments_lexer": "ipython3",
125 | "version": "3.9.5"
126 | }
127 | },
128 | "nbformat": 4,
129 | "nbformat_minor": 4
130 | }
131 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_columns_review.py:
--------------------------------------------------------------------------------
1 | # Quick script for reviewing all the column names in the PPP data
2 | # to see what we can infer about them from the data itself
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the recent data into a pandas DataFrame using its `read_csv()` method
8 | ppp_data_sample = pd.read_csv('recent_sample.csv')
9 |
10 | # use the `convert_dtypes()` method so missing entries show up consistently as `<NA>`
11 | converted_data_sample = ppp_data_sample.convert_dtypes()
12 |
13 | # transpose the whole sample
14 | transposed_ppp_data_sample = converted_data_sample.transpose()
15 |
16 | # print out the results!
17 | print(transposed_ppp_data_sample)
18 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_columns_summary.py:
--------------------------------------------------------------------------------
1 | # Quick script for summarizing the contents of several key columns in the PPP
2 | # data to see how complete and consistent their values are
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the recent data sample into a pandas DataFrame
8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
9 |
10 | # print the summary of values that appear in the `LoanStatus` column
11 | print(ppp_data.value_counts('LoanStatus'))
12 |
13 | # print the total number of entries in the `LoanStatus` column
14 | print(sum(ppp_data.value_counts('LoanStatus')))
15 |
16 | # print the summary of values that appear in the `Gender` column
17 | print(ppp_data.value_counts('Gender'))
18 |
19 | # print the total number of entries in the `Gender` column
20 | print(sum(ppp_data.value_counts('Gender')))
21 |
22 | # print how many rows do not list a value for `BorrowerAddress`
23 | print(ppp_data['BorrowerAddress'].isna().sum())
24 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_data_join.py:
--------------------------------------------------------------------------------
1 | # Quick script for merging our August 2020 and more recent PPP loan data files
2 | # to see how many of the loans they describe match up
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the august data into a pandas DataFrame using its `read_csv()` method
8 | august_ppp_data = pd.read_csv('public_150k_plus_080820.csv')
9 |
10 | # read the recent data into a pandas DataFrame using its `read_csv()` method
11 | recent_ppp_data = pd.read_csv('public_150k_plus_recent.csv')
12 |
13 | # now that we have both files in memory, let's merge them!
14 | merged_data = pd.merge(august_ppp_data,recent_ppp_data,how='outer',
15 | left_on=['BusinessName','Lender','DateApproved'],right_on=['BorrowerName',
16 | 'ServicingLenderName','DateApproved'],indicator=True)
17 |
18 | # `print()` the values in the "indicator" column,
19 | # which has a default label of `_merge`
20 | print(merged_data.value_counts('_merge'))
21 |
22 | # merge the data again, removing the match on `DateApproved`
23 | merged_data_no_date = pd.merge(august_ppp_data,recent_ppp_data,how='outer',
24 | left_on=['BusinessName','Lender'],right_on=['BorrowerName',
25 | 'ServicingLenderName'],indicator=True)
26 |
27 | # `print()` the values in the "indicator" column,
28 | # which has a default label of `_merge`
29 | print(merged_data_no_date.value_counts('_merge'))
30 |
31 | # merge the data again, matching only on `BusinessName`/`BorrowerName`
32 | merged_data_biz_only = pd.merge(august_ppp_data,recent_ppp_data,how='outer',
33 | left_on=['BusinessName'],right_on=['BorrowerName'],indicator=True)
34 |
35 | # `print()` the values in the "indicator" column,
36 | # which has a default label of `_merge`
37 | print(merged_data_biz_only.value_counts('_merge'))
38 |
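For reference, with `indicator=True` the `_merge` column reports where each row was found; a short sketch (not part of the original script) of reading it:

# 'both'       -> the row matched in both the August and the recent data
# 'left_only'  -> the row appears only in the August data
# 'right_only' -> the row appears only in the recent data
print(merged_data['_merge'].value_counts())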
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_data_samples.py:
--------------------------------------------------------------------------------
1 | # Quick script for creating new CSVs that each contain the first few rows of
2 | # our larger data files
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the august data into a pandas DataFrame using its `read_csv()` method
8 | august_ppp_data = pd.read_csv('public_150k_plus_080820.csv')
9 |
10 | # the `head()` method returns the DataFrame's column headers
11 | # along with the first 5 rows of data
12 | august_sample = august_ppp_data.head()
13 |
14 | # write those first few rows to a CSV called `august_sample.csv`
15 | # using the pandas `to_csv()` method
16 | august_sample.to_csv('august_sample.csv', index=False)
17 |
18 | # read the recent data into a pandas DataFrame using its `read_csv()` method
19 | recent_ppp_data = pd.read_csv('public_150k_plus_recent.csv')
20 |
21 | # the `head()` method returns the DataFrame's column headers
22 | # along with the first 5 rows of data
23 | recent_sample = recent_ppp_data.head()
24 |
25 | # write those first few rows to a CSV called `recent_sample.csv`
26 | recent_sample.to_csv('recent_sample.csv', index=False)
27 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_date_range.py:
--------------------------------------------------------------------------------
1 | # Quick script for finding the earliest and latest loan dates in the PPP loan
2 | # data
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the recent data into a pandas DataFrame using its `read_csv()` method
8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
9 |
10 | # convert the values in the `DateApproved` column to *actual* dates
11 | ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved'],
12 | format='%m/%d/%Y')
13 |
14 | # print out the `min()` and `max()` values in the `DateApproved` column
15 | print(ppp_data['DateApproved'].min())
16 | print(ppp_data['DateApproved'].max())
17 |
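As a quick illustration of the `%m/%d/%Y` format string, using a made-up value rather than one from the data set:

import pandas as pd

# `%m` is the two-digit month, `%d` the two-digit day, `%Y` the four-digit year
print(pd.to_datetime('04/03/2020', format='%m/%d/%Y'))
# prints: 2020-04-03 00:00:00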
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_find_waterford.py:
--------------------------------------------------------------------------------
1 | # Quick script for finding a business within our data set by (partial) name
2 |
3 | # importing the `pandas` library
4 | import pandas as pd
5 |
6 | # read the recent data sample into a pandas DataFrame
7 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
8 |
9 | # create a DataFrame without any missing `BorrowerName` values
10 | ppp_data_named_borrowers = ppp_data[ppp_data['BorrowerName'].notna()]
11 |
12 | # because precise matching can be tricky,
13 | # we'll use the pandas `str.contains()` method
14 | bankruptcy_example = ppp_data_named_borrowers[ppp_data_named_borrowers['BorrowerName']
15 | .str.contains('WATERFORD RECEPTIONS')]
16 |
17 | # transposing the result so it's easier to read
18 | print(bankruptcy_example.transpose())
19 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_lender_names.py:
--------------------------------------------------------------------------------
1 | # Quick script for determining whether there are typos &c. in any of the PPP
2 | # loan data's bank names
3 |
4 | # importing the `pandas` library. The `as` keyword lets us essentially create
5 | # a nickname for the library so that we can refer to it in fewer characters
6 | import pandas as pd
7 |
8 | # importing the `fingerprints` library, which will help us generate normalized
9 | # labels for each of the bank names in our data set
10 | import fingerprints
11 |
12 | # read the recent data sample into a pandas DataFrame using the library's
13 | # `read_csv()` method
14 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
15 |
16 | # use the pandas DataFrame `unique()` function to create a list of unique
17 | # bank names in our data's `OriginatingLender` column
18 | unique_names = ppp_data['OriginatingLender'].unique()
19 |
20 | # confirm how many unique names there are
21 | print(len(unique_names))
22 |
23 | # create an empty list to hold the fingerprint of each of the unique names
24 | fingerprint_list = []
25 |
26 | # iterate through each name in the list of unique names
27 | for name in unique_names:
28 |
29 | # for each name, generate its fingerprint
30 | # and append it to the end of the list
31 | fingerprint_list.append(fingerprints.generate(name))
32 |
33 |
34 | # use the built-in `set()` function on our fingerprint_list, which will
35 | # remove any duplicate values
36 | fingerprint_set = set(fingerprint_list)
37 |
38 | # check the length of the fingerprint_set
39 | print(len(fingerprint_set))
40 |
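If the two counts differ, a follow-up sketch like the one below (not part of the original script) shows which fingerprints more than one raw lender name collapses onto:

from collections import Counter

# count how many of the unique raw names map to each fingerprint
fingerprint_counts = Counter(fingerprint_list)

# print any fingerprint shared by more than one raw name
for fp, name_count in fingerprint_counts.items():
    if name_count > 1:
        print(fp, name_count)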
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_loan_status.py:
--------------------------------------------------------------------------------
1 | # Quick script for determining how many loans have been disbursed
2 |
3 | # importing the `pandas` library
4 | import pandas as pd
5 |
6 | # read the recent data sample into a pandas DataFrame
7 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
8 |
9 | # print a summary of values in the `LoanStatus` column
10 | print(ppp_data['LoanStatus'].value_counts())
11 | print(sum(ppp_data['LoanStatus'].value_counts()))
12 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_loan_uses.py:
--------------------------------------------------------------------------------
1 | # Quick script for determining what borrowers did (or really, did not) state
2 | # they would use PPP loan funds for
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the recent data sample into a pandas DataFrame
8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
9 |
10 | # print how many rows do not list a value for `UTILITIES_PROCEED`
11 | print(ppp_data['UTILITIES_PROCEED'].isna().sum())
12 |
13 | # print how many rows do not list a value for `PAYROLL_PROCEED`
14 | print(ppp_data['PAYROLL_PROCEED'].isna().sum())
15 |
16 | # print how many rows do not list a value for `MORTGAGE_INTEREST_PROCEED`
17 | print(ppp_data['MORTGAGE_INTEREST_PROCEED'].isna().sum())
18 |
19 | # print how many rows do not list a value for `RENT_PROCEED`
20 | print(ppp_data['RENT_PROCEED'].isna().sum())
21 |
22 | # print how many rows do not list a value for `REFINANCE_EIDL_PROCEED`
23 | print(ppp_data['REFINANCE_EIDL_PROCEED'].isna().sum())
24 |
25 | # print how many rows do not list a value for `HEALTH_CARE_PROCEED`
26 | print(ppp_data['HEALTH_CARE_PROCEED'].isna().sum())
27 |
28 | # print how many rows do not list a value for `DEBT_INTEREST_PROCEED`
29 | print(ppp_data['DEBT_INTEREST_PROCEED'].isna().sum())
30 |
31 | # create a new DataFrame that contains all rows reporting *only* payroll costs
32 | # that is, where all _other_ costs are listed as "NA"
33 | payroll_only = ppp_data[(ppp_data['UTILITIES_PROCEED'].isna()) &
34 |     (ppp_data['MORTGAGE_INTEREST_PROCEED'].isna()) &
35 |     (ppp_data['RENT_PROCEED'].isna()) &
36 |     (ppp_data['REFINANCE_EIDL_PROCEED'].isna()) &
37 |     (ppp_data['HEALTH_CARE_PROCEED'].isna()) &
38 |     (ppp_data['DEBT_INTEREST_PROCEED'].isna())]
39 |
40 | # print the length of our "payroll costs only" DataFrame
41 | print(len(payroll_only.index))
42 |
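An equivalent, more compact way to build the same filter, shown here only as a sketch and not the original approach, is to test all of the "other proceeds" columns at once:

# the columns that should all be missing for a "payroll only" loan
other_proceeds = ['UTILITIES_PROCEED', 'MORTGAGE_INTEREST_PROCEED',
                  'RENT_PROCEED', 'REFINANCE_EIDL_PROCEED',
                  'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED']

# keep only the rows where every one of those columns is NA
payroll_only_alt = ppp_data[ppp_data[other_proceeds].isna().all(axis=1)]
print(len(payroll_only_alt.index))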
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_min_max_loan.py:
--------------------------------------------------------------------------------
1 | # Quick script for finding the minimum and maximum loans currently approved
2 | # in our PPP loan data set
3 |
4 | # importing the `pandas` library
5 | import pandas as pd
6 |
7 | # read the recent data into a pandas DataFrame
8 | ppp_data = pd.read_csv('public_150k_plus_recent.csv')
9 |
10 | # use the pandas `min()` and `max()` methods to retrieve the
11 | # smallest and largest values, respectively
12 | print(ppp_data['CurrentApprovalAmount'].min())
13 | print(ppp_data['CurrentApprovalAmount'].max())
14 |
--------------------------------------------------------------------------------
/chapter_6_examples/standalone_files/ppp_numrows.py:
--------------------------------------------------------------------------------
1 | # Quick script to print out the number of rows in each of our PPP loan data files
2 | # This is a pretty basic task, so no need to import extra libraries!
3 |
4 | # open the August PPP data in "read" mode
5 | august_data = open("public_150k_plus_080820.csv","r")
6 |
7 | # use `readlines()` to convert the lines in the data file into a list
8 | print("August file has "+str(len(august_data.readlines()))+" rows.")
9 |
10 | # ditto for the recent PPP data
11 | recent_data = open("public_150k_plus_recent.csv","r")
12 |
13 | # once again, print the number of lines
14 | print("Recent file has "+str(len(recent_data.readlines()))+" rows.")
15 |
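Note that `readlines()` counts every line, so the totals printed above include the header row; a minimal sketch (not in the original script) of excluding it:

# subtract 1 from the line count so the header row isn't counted as data
with open("public_150k_plus_recent.csv", "r") as recent_file:
    print("Recent file has " + str(len(recent_file.readlines()) - 1) + " data rows.")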
--------------------------------------------------------------------------------
/chapter_7_examples/jupyter_notebooks/regex_tests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# The goal of this script is to try out how a couple of regular expressions\n",
10 | "# fare with some sample test data.\n",
11 | "\n",
12 | "# import the regular expression library\n",
13 | "import re"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# using the `re.compile()` method is a helpful way of keeping a reference to\n",
23 | "# our various regular expressions\n",
24 | "bookend_regex = re.compile(\"\\s0[7-9]:\")\n",
25 | "\n",
26 | "# always try to be descriptive with the variable names\n",
27 | "one_sided_regex = re.compile(\"0[7-9]:\")"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# this example should *fail*\n",
37 | "sample1 = \"2020-09-01 00:00:01.0430\""
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# this example should *match*\n",
47 | "sample2 = \"2020-09-01 09:04:23.7930\""
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# this example should *fail*\n",
57 | "sample3 = \"2020-09-01 10:07:02.0510\""
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "# let's see what happens!\n",
67 | "print(\"bookend_regex:\")\n",
68 | "print(bookend_regex.search(sample1))\n",
69 | "print(bookend_regex.search(sample2))\n",
70 | "print(bookend_regex.search(sample3))"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "print(\"one_sided_regex:\")\n",
80 | "print(one_sided_regex.search(sample1))\n",
81 | "print(one_sided_regex.search(sample2))\n",
82 | "print(one_sided_regex.search(sample3))"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3 (ipykernel)",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.9.5"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 4
107 | }
108 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/citibike_september1_rides.py:
--------------------------------------------------------------------------------
1 | # Objectives: Filter all September, 2020 Citi Bike rides, and output a new
2 | # file containing only the rides from 2020-09-01
3 |
4 | # Program Outline:
5 | # 1. Read in the data file: 202009-citibike-tripdata.csv
6 | # 2. Create a new output file, and write the header row to it.
7 | # 3. For each row in the file, split the `starttime` value on space:
8 | # a. If the first item in the resulting list is '2020-09-01', write
9 | # the row to our output file
10 | # 4. Close the output file
11 |
12 | # import the "csv" library
13 | import csv
14 |
15 | # open our data file in "read" mode
16 | source_file = open("202009-citibike-tripdata.csv","r")
17 |
18 | # open our output file in "write" mode
19 | output_file = open("2020-09-01-citibike-tripdata.csv","w")
20 |
21 | # pass our source_file to the DictReader "recipe"
22 | # and store the result in a variable called `citibike_reader`
23 | citibike_reader = csv.DictReader(source_file)
24 |
25 | # create a corresponding DictWriter and specify that the
26 | # header should be the same as the `citibike_reader` fieldnames
27 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames)
28 |
29 | # write the header row to the output file
30 | output_writer.writeheader()
31 |
32 | # use a `for...in` loop to go through our `citibike_reader` list of rows
33 | for a_row in citibike_reader:
34 |
35 | # get the value in the 'starttime' column
36 | start_timestamp = a_row["starttime"]
37 |
38 | # split the value in 'starttime' on the space character
39 | timelist = start_timestamp.split(" ")
40 |
41 | # the "date" part of the string will be the first item, position 0
42 | the_date = timelist[0]
43 |
44 | # if `the_date` matches our desired date
45 | if the_date == "2020-09-01":
46 |
47 | # write that row of data to our output file
48 | output_writer.writerow(a_row)
49 |
50 | # close the output file
51 | output_file.close()
52 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/fixed_width_strip_parsing.py:
--------------------------------------------------------------------------------
1 | # An example of reading data from a fixed-width file with Python.
2 | # The source file for this example comes from the NOAA, and can be accessed here:
3 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
4 | # The metadata for the file can be found here:
5 | # https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
6 |
7 | # import the `csv` library, to create our output file
8 | import csv
9 |
10 | filename = "ghcnd-stations"
11 |
12 | # reading from a basic text file doesn't require any special libraries
13 | # so we'll just open the file in read format ("r") as usual
14 | source_file = open(filename+".txt", "r")
15 |
16 | # the built-in "readlines()" method does just what you'd think:
17 | # it reads in a text file and converts it to a list of lines
18 | stations_list = source_file.readlines()
19 |
20 | # create an output file for our transformed data
21 | output_file = open(filename+".csv","w")
22 |
23 | # use the `csv` library's "writer" recipe to easily write rows of data
24 | # to `output_file`, instead of reading data *from* it
25 | output_writer = csv.writer(output_file)
26 |
27 | # create the header list
28 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME","GSN_FLAG",
29 | "HCNCRN_FLAG","WMO_ID"]
30 |
31 | # write our headers to the output file
32 | output_writer.writerow(headers)
33 |
34 | # loop through each line of our file (unlike a workbook, a text file has no "sheets")
35 | for line in stations_list:
36 |
37 | # create an empty list, to which we'll append each set of characters that
38 | # makes up a given "column" of data
39 | new_row = []
40 |
41 | # ID: positions 1-11
42 | new_row.append((line[0:11]).strip())
43 |
44 | # LATITUDE: positions 13-20
45 | new_row.append((line[12:20]).strip())
46 |
47 | # LONGITUDE: positions 22-30
48 | new_row.append((line[21:30]).strip())
49 |
50 | # ELEVATION: positions 32-37
51 | new_row.append((line[31:37]).strip())
52 |
53 | # STATE: positions 39-40
54 | new_row.append((line[38:40]).strip())
55 |
56 | # NAME: positions 42-71
57 | new_row.append((line[41:71]).strip())
58 |
59 | # GSN_FLAG: positions 73-75
60 | new_row.append((line[72:75]).strip())
61 |
62 | # HCNCRN_FLAG: positions 77-79
63 | new_row.append((line[76:79]).strip())
64 |
65 | # WMO_ID: positions 81-85
66 | new_row.append((line[80:85]).strip())
67 |
68 | # now all that's left is to use the
69 | # `writerow` function to write new_row to our output file
70 | output_writer.writerow(new_row)
71 |
72 | # just for good measure, let's close the `.csv` file we just created
73 | output_file.close()
74 |
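Because the NOAA metadata describes fields with 1-indexed, inclusive positions while Python slices are 0-indexed and end-exclusive, here is a tiny illustration (a made-up string, not a line from the data file) of how "positions 1-11" becomes `[0:11]`:

# an 11-character field starting at position 1 covers indices 0 through 10
text = "0123456789ABCDEF"
print(text[0:11])   # prints '0123456789A' (11 characters)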
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/ppp_add_fingerprints.py:
--------------------------------------------------------------------------------
1 | # Quick script for adding a "fingerprint" column to our loan data, which will
2 | # help us confirm/correct for any typos or inconsistencies in, e.g., bank names
3 |
4 | # import the csv library
5 | import csv
6 |
7 | # importing the `fingerprints` library
8 | import fingerprints
9 |
10 | # read the recent data sample into a variable
11 | ppp_data = open('public_150k_plus_recent.csv','r')
12 |
13 | # the DictReader function makes our source data more usable
14 | ppp_data_reader = csv.DictReader(ppp_data)
15 |
16 | # create an output file to write our modified dataset to
17 | augmented_ppp_data = open('public_150k_plus_fingerprints.csv','w')
18 |
19 | # create a "writer" so that we can output whole rows at once
20 | augmented_data_writer = csv.writer(augmented_ppp_data)
21 |
22 | # because we're adding a column, we need to create a new header row as well
23 | header_row = []
24 |
25 | # for every column header
26 | for item in ppp_data_reader.fieldnames:
27 |
28 | # append the existing column header
29 | header_row.append(item)
30 |
31 | # if we're at 'OriginatingLender'
32 | if item == 'OriginatingLender':
33 |
34 | # it's time to add a new column
35 | header_row.append('OriginatingLenderFingerprint')
36 |
37 | # now we can write our expanded header row to the output file
38 | augmented_data_writer.writerow(header_row)
39 |
40 | # iterate through every row in our data
41 | for row in ppp_data_reader:
42 |
43 | # create an empty list to hold our new data row
44 | new_row = []
45 |
46 | # for each column of data in the *original* dataset
47 | for column_name in ppp_data_reader.fieldnames:
48 |
49 | # first, append this row's value for that column
50 | new_row.append(row[column_name])
51 |
52 | # when we get to the 'OriginatingLender' column, it's time
53 | # to add our new "fingerprint" value
54 | if column_name == 'OriginatingLender':
55 |
56 | # our fingerprint will consist of the generated fingerprint PLUS
57 | # the OriginatingLenderLocationID
58 | the_fingerprint = fingerprints.generate(row[column_name]) + \
59 | " " + row['OriginatingLenderLocationID']
60 |
61 | # append the compound fingerprint value to our row
62 | new_row.append(the_fingerprint)
63 |
64 | # once the whole row is complete, write it to our output file
65 | augmented_data_writer.writerow(new_row)
66 |
67 | # close both files
68 | augmented_ppp_data.close()
69 | ppp_data.close()
70 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/ppp_adding_naics.py:
--------------------------------------------------------------------------------
1 | # script to merge our PPP loan data with information from the SBA's NAICS
2 | # size requirements, found here:
3 | # https://www.sba.gov/document/support--table-size-standards
4 |
5 | # import pandas to facilitate the merging and sorting
6 | import pandas as pd
7 |
8 | # read our PPP loan data into a new DataFrame
9 | ppp_data = pd.read_csv('public_150k_plus_fingerprints.csv', dtype='string')
10 |
11 | # read the NAICS data into a separate DataFrame
12 | sba_naics_data = pd.read_csv('SBA-NAICS-data.csv', dtype='string')
13 |
14 | # if there's no value in the 'NAICSCode' column, replace it with "None"
15 | ppp_data['NAICSCode'] = ppp_data['NAICSCode'].fillna("None")
16 |
17 | # merge the two datasets using a "left" merge
18 | merged_data = pd.merge(ppp_data, sba_naics_data, how='left',
19 | left_on=['NAICSCode'], right_on=['NAICS Codes'],
20 | indicator=True)
21 |
22 | # open a file to save our merged data to
23 | merged_data_file = open('ppp-fingerprints-and-naics.csv', 'w')
24 |
25 | # write the merged data to an output file as a CSV
26 | merged_data_file.write(merged_data.to_csv())
27 |
28 | # print out the values in the '_merge' column to see how many
29 | # entries in our loan data don't get matched to a NAICS code
30 | print(merged_data.value_counts('_merge'))
31 |
32 | # create a new DataFrame that is *just* the unmatched rows
33 | unmatched_values = merged_data[merged_data['_merge']=='left_only']
34 |
35 | # open a file to write the unmatched values to
36 | unmatched_values_file = open('ppp-unmatched-naics-codes.csv', 'w')
37 |
38 | # write a new CSV file that contains all the unmatched NAICS codes in our
39 | # PPP loan data, along with how many times each appears
40 | unmatched_values_file.write(unmatched_values.value_counts('NAICSCode').to_csv())
41 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/regex_tests.py:
--------------------------------------------------------------------------------
1 | # The goal of this script is to try out how a couple of regular expressions
2 | # fare with some sample test data.
3 |
4 | # import the regular expression library
5 | import re
6 |
7 | # using the `re.compile()` method is a helpful way of keeping a reference to
8 | # our various regular expressions
9 | bookend_regex = re.compile(r"\s0[7-9]:")
10 |
11 | # always try to be descriptive with the variable names
12 | one_sided_regex = re.compile("0[7-9]:")
13 |
14 | # this example should *fail*
15 | sample1 = "2020-09-01 00:00:01.0430"
16 |
17 | # this example should *match*
18 | sample2 = "2020-09-01 09:04:23.7930"
19 |
20 | # this example should *fail*
21 | sample3 = "2020-09-01 10:07:02.0510"
22 |
23 | # let's see what happens!
24 | print("bookend_regex:")
25 | print(bookend_regex.search(sample1))
26 | print(bookend_regex.search(sample2))
27 | print(bookend_regex.search(sample3))
28 | print("one_sided_regex:")
29 | print(one_sided_regex.search(sample1))
30 | print(one_sided_regex.search(sample2))
31 | print(one_sided_regex.search(sample3))
32 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/weekday_rides.py:
--------------------------------------------------------------------------------
1 | # Objective: Filter all September 2020 Citi Bike rides, and output a new
2 | # file containing only weekday rides
3 | # Program Outline:
4 | # 1. Read in the data file: 202009-citibike-tripdata.csv
5 | # 2. Create a new output file, and write the header row to it.
6 | # 3. For each row in the file, make a date from the `starttime`:
7 | # a. if it's a weekday, write the row to our output file
8 | # 4. Close the output file
9 |
10 | # import the "csv" library
11 | import csv
12 |
13 | # import the "datetime" library
14 | from datetime import datetime
15 |
16 | # open our data file in "read" mode
17 | source_file = open("202009-citibike-tripdata.csv","r")
18 |
19 | # open our output file in "write" mode
20 | output_file = open("202009-citibike-weekday-tripdata.csv","w")
21 |
22 | # convert source data to a DictReader; store the result in `citibike_reader`
23 | citibike_reader = csv.DictReader(source_file)
24 |
25 | # create a corresponding DictWriter and specify its fieldnames
26 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames)
27 |
28 | # actually write the header row to the output file
29 | output_writer.writeheader()
30 |
31 | # use a `for...in` loop to go through our `citibike_reader` list of rows
32 | for a_row in citibike_reader:
33 |
34 | # convert the value in the 'starttime' column to a date object
35 | the_date = datetime.strptime(a_row['starttime'], '%Y-%m-%d %H:%M:%S.%f')
36 |
37 | # if `the_date` is a weekday
38 | if the_date.weekday() <= 4:
39 |
40 | # write that row of data to our output file
41 | output_writer.writerow(a_row)
42 |
43 | # close the output file
44 | output_file.close()
45 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/xls_meta_and_date_parsing.py:
--------------------------------------------------------------------------------
1 | # Converting data in an .xls file with Python to csv + metadata file, with
2 | # functional date values using the "xlrd" library.
3 |
4 | # First, pip install the xlrd library:
5 | # https://pypi.org/project/xlrd/2.0.1/
6 | # then, import the `xlrd` library
7 | import xlrd
8 |
9 | # import the csv library
10 | import csv
11 |
12 | # needed to test if a given value is *some* type of number
13 | from numbers import Number
14 |
15 | # for parsing/formatting our newly interpreted Excel dates
16 | from datetime import datetime
17 |
18 | # pass our filename as an ingredient to the `xlrd` library's
19 | # `open_workbook()` "recipe"
20 | # store the result in a variable called `source_workbook`
21 | source_workbook = xlrd.open_workbook("fredgraph.xls")
22 |
23 | # open and name a simple metadata text file
24 | source_workbook_metadata = open("fredgraph_metadata.txt","w")
25 |
26 | # an `.xls` workbook can have multiple sheets
27 | for sheet_name in source_workbook.sheet_names():
28 |
29 | # create a variable that points to the current worksheet by
30 | # passing the current value of `sheet_name` to the `sheet_by_name` recipe
31 | current_sheet = source_workbook.sheet_by_name(sheet_name)
32 |
33 | # create "xls_"+sheet_name+".csv" as an output file for the current sheet
34 | output_file = open("xls_"+sheet_name+"_dates.csv","w")
35 |
36 | # use the `csv` library's "writer" recipe to easily write rows of data
37 | # to `output_file`, instead of reading data *from* it
38 | output_writer = csv.writer(output_file)
39 |
40 | # create a Boolean variable to detect if we've hit our table-type data yet
41 | is_table_data = False
42 |
43 | # now, we need to loop through every row in our sheet
44 | for row_num, row in enumerate(current_sheet.get_rows()):
45 |
46 | # pulling out the value in the first column of the current row
47 | first_entry = current_sheet.row_values(row_num)[0]
48 |
49 | # if we've hit the header row of our data table
50 | if first_entry == 'observation_date':
51 |
52 | # it's time to switch our "flag" value to "True"
53 | is_table_data = True
54 |
55 | # if `is_table_data` is True
56 | if is_table_data:
57 |
58 | # extract the table-type data values into separate variables
59 | the_date_num = current_sheet.row_values(row_num)[0]
60 | U6_value = current_sheet.row_values(row_num)[1]
61 |
62 | # create a new row object with each of the values
63 | new_row = [the_date_num, U6_value]
64 |
65 | # if `the_date_num` is a number, then the current row is *not*
66 | # the header row. We need to transform the date.
67 | if isinstance(the_date_num, Number):
68 |
69 | # use the xlrd library's `xldate_as_datetime()` to generate
70 | # a Python datetime object
71 | the_date_num = xlrd.xldate.xldate_as_datetime(
72 | the_date_num, source_workbook.datemode)
73 |
74 | # overwrite the first value in the new row with
75 | # the reformatted date
76 | new_row[0] = the_date_num.strftime('%m/%d/%Y')
77 |
78 | # write this new row to the data output file
79 | output_writer.writerow(new_row)
80 |
81 | # otherwise, this row must be metadata
82 | else:
83 |
84 | # since we'd like our metadata file to be nicely formatted, we
85 | # need to loop through the individual cells of each metadata row
86 | for item in current_sheet.row(row_num):
87 |
88 | # write the value of the cell
89 | source_workbook_metadata.write(item.value)
90 |
91 | # separate it from the next cell with a tab
92 | source_workbook_metadata.write('\t')
93 |
94 | # at the end of each line of metadata, add a newline
95 | source_workbook_metadata.write('\n')
96 |
97 | # just for good measure, let's close our output files
98 | output_file.close()
99 | source_workbook_metadata.close()
100 |
--------------------------------------------------------------------------------
/chapter_7_examples/standalone_files/xls_meta_parsing.py:
--------------------------------------------------------------------------------
1 | # Converting data in an .xls file with Python to csv + metadata file
2 | # using the "xrld" library. First, pip install the xlrd library:
3 | # https://pypi.org/project/xlrd/2.0.1/
4 |
5 | # import the "xlrd" library
6 | import xlrd
7 |
8 | # import the `csv` library, to create our output file
9 | import csv
10 |
11 | # pass our filename as an ingredient to the `xlrd` library's
12 | # `open_workbook()` "recipe"
13 | # store the result in a variable called `source_workbook`
14 | source_workbook = xlrd.open_workbook("fredgraph.xls")
15 |
16 | # open and name a simple metadata text file
17 | source_workbook_metadata = open("fredgraph_metadata.txt","w")
18 |
19 | # an `.xls` workbook can have multiple sheets
20 | for sheet_name in source_workbook.sheet_names():
21 |
22 | # create a variable that points to the current worksheet by
23 | # passing the current value of `sheet_name` to the `sheet_by_name` recipe
24 | current_sheet = source_workbook.sheet_by_name(sheet_name)
25 |
26 | # create "xls_"+sheet_name+".csv" as an output file for the current sheet
27 | output_file = open("xls_"+sheet_name+".csv","w")
28 |
29 | # use the `csv` library's "writer" recipe to easily write rows of data
30 | # to `output_file`, instead of reading data *from* it
31 | output_writer = csv.writer(output_file)
32 |
33 | # create a Boolean variable to detect if we've hit our table-type data yet
34 | is_table_data = False
35 |
36 | # now, we need to loop through every row in our sheet
37 | for row_num, row in enumerate(current_sheet.get_rows()):
38 |
39 | # pulling out the value in the first column of the current row
40 | first_entry = current_sheet.row_values(row_num)[0]
41 |
42 | # if we've hit the header row of our data table
43 | if first_entry == 'observation_date':
44 |
45 | # it's time to switch our "flag" value to "True"
46 | is_table_data = True
47 |
48 | # if `is_table_data` is True
49 | if is_table_data:
50 |
51 | # write this row to the data output file
52 | output_writer.writerow(current_sheet.row_values(row_num))
53 |
54 | # otherwise, this row must be metadata
55 | else:
56 |
57 | # since we'd like our metadata file to be nicely formatted, we
58 | # need to loop through the individual cells of each metadata row
59 | for item in current_sheet.row(row_num):
60 |
61 | # write the value of the cell
62 | source_workbook_metadata.write(item.value)
63 |
64 | # separate it from the next cell with a tab
65 | source_workbook_metadata.write('\t')
66 |
67 | # at the end of each line of metadata, add a newline
68 | source_workbook_metadata.write('\n')
69 |
70 | # just for good measure, let's close our output files
71 | output_file.close()
72 | source_workbook_metadata.close()
73 |
--------------------------------------------------------------------------------
/chapter_8_examples/jupyter_notebooks/greet_me_options.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create a function that prints out a greeting to any name\n",
10 | "def greet_me(a_name, greeting=\"Hello\"):\n",
11 | " print(greeting+\" \"+a_name)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# create a variable named author\n",
21 | "author = \"Susan E. McGregor\"\n",
22 | "\n",
23 | "# create another variable named editor\n",
24 | "editor = \"Jeff Bleiel\""
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# use `greet_me()` to output greeting messages to each person\n",
34 | "# say \"Hello\" by default\n",
35 | "greet_me(author)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# let the programmer specify \"Hi\" as the greeting\n",
45 | "greet_me(editor, greeting=\"Hi\")"
46 | ]
47 | }
48 | ],
49 | "metadata": {
50 | "kernelspec": {
51 | "display_name": "Python 3 (ipykernel)",
52 | "language": "python",
53 | "name": "python3"
54 | },
55 | "language_info": {
56 | "codemirror_mode": {
57 | "name": "ipython",
58 | "version": 3
59 | },
60 | "file_extension": ".py",
61 | "mimetype": "text/x-python",
62 | "name": "python",
63 | "nbconvert_exporter": "python",
64 | "pygments_lexer": "ipython3",
65 | "version": "3.9.5"
66 | }
67 | },
68 | "nbformat": 4,
69 | "nbformat_minor": 4
70 | }
71 |
--------------------------------------------------------------------------------
/chapter_8_examples/jupyter_notebooks/greet_me_revisited.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create a function that prints out a greeting\n",
10 | "# to any name passed to the function\n",
11 | "def greet_me(a_name):\n",
12 | " print(\"Variable `a_name` in `greet_me`: \"+a_name)\n",
13 | " print(\"Hello \"+a_name)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# create a variable named `author`\n",
23 | "author = \"Susan E. McGregor\"\n",
24 | "\n",
25 | "# create another variable named `editor`\n",
26 | "editor = \"Jeff Bleiel\""
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "a_name = \"Python\"\n",
36 | "print(\"Variable `a_name` in main script: \"+a_name)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# use my custom function, `greet_me` to output \"Hello\" messages to each person\n",
46 | "greet_me(author)\n",
47 | "greet_me(editor)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "print(\"Variable `a_name` in main script again: \"+a_name)"
57 | ]
58 | }
59 | ],
60 | "metadata": {
61 | "kernelspec": {
62 | "display_name": "Python 3 (ipykernel)",
63 | "language": "python",
64 | "name": "python3"
65 | },
66 | "language_info": {
67 | "codemirror_mode": {
68 | "name": "ipython",
69 | "version": 3
70 | },
71 | "file_extension": ".py",
72 | "mimetype": "text/x-python",
73 | "name": "python",
74 | "nbconvert_exporter": "python",
75 | "pygments_lexer": "ipython3",
76 | "version": "3.9.5"
77 | }
78 | },
79 | "nbformat": 4,
80 | "nbformat_minor": 4
81 | }
82 |
--------------------------------------------------------------------------------
/chapter_8_examples/jupyter_notebooks/make_greeting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create a function that **returns** a greeting to any name passed in\n",
10 | "def make_greeting(a_name):\n",
11 | " return(\"Hello \"+a_name)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# create a variable named author\n",
21 | "author = \"Susan E. McGregor\"\n",
22 | "\n",
23 | "# create another variable named editor\n",
24 | "editor = \"Jeff Bleiel\""
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# use my custom function, `greet_me()` to build and store\n",
34 | "# the \"Hello\" messages to each person\n",
35 | "author_greeting = make_greeting(author)\n",
36 | "editor_greeting = make_greeting(editor)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# now `print()` the greetings built and returned by each function call\n",
46 | "print(author_greeting)\n",
47 | "print(editor_greeting)"
48 | ]
49 | }
50 | ],
51 | "metadata": {
52 | "kernelspec": {
53 | "display_name": "Python 3 (ipykernel)",
54 | "language": "python",
55 | "name": "python3"
56 | },
57 | "language_info": {
58 | "codemirror_mode": {
59 | "name": "ipython",
60 | "version": 3
61 | },
62 | "file_extension": ".py",
63 | "mimetype": "text/x-python",
64 | "name": "python",
65 | "nbconvert_exporter": "python",
66 | "pygments_lexer": "ipython3",
67 | "version": "3.9.5"
68 | }
69 | },
70 | "nbformat": 4,
71 | "nbformat_minor": 4
72 | }
73 |
--------------------------------------------------------------------------------
/chapter_8_examples/jupyter_notebooks/make_greeting_no_vars.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# function that returns a greeting to any name passed in\n",
10 | "def make_greeting(a_name):\n",
11 | " return(\"Hello \"+a_name)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# function that adds a question to any greeting\n",
21 | "def add_question(a_greeting):\n",
22 | " return(a_greeting+\", how are you?\")"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# create a variable named author\n",
32 | "author = \"Susan E. McGregor\"\n",
33 | "\n",
34 | "# create another variable named editor\n",
35 | "editor = \"Jeff Bleiel\""
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# print the greeting message\n",
45 | "print(make_greeting(author))"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# pass the greeting message to the question function and print the result!\n",
55 | "print(add_question(make_greeting(editor)))"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3 (ipykernel)",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.9.5"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/fixed_width_strip_parsing_refactored.py:
--------------------------------------------------------------------------------
1 | """ NOAA data formatter
2 | Reads data from an NOAA fixed-width data file with Python and outputs
3 | a well-formatted CSV file.
4 | The source file for this example comes from the NOAA, and can be accessed here:
5 | https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
6 | The metadata for the file can be found here:
7 | https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
8 | Available functions
9 | -------------------
10 | * convert_to_columns: Converts a line of text to a list
11 | Requirements
12 | ------------
13 | * csv module
14 | """
15 | # we'll start by importing the "csv" library
16 | import csv
17 |
18 | def main():
19 |
20 | # variable to match our output filename to the input filename
21 | filename = "ghcnd-stations"
22 |
23 | # we'll just open the file in read format ("r") as usual
24 | source_file = open(filename+".txt", "r")
25 |
26 | # the "readlines()" method converts a text file to a list of lines
27 | stations_list = source_file.readlines()
28 |
29 | # as usual, we'll create an output file to write to
30 | output_file = open(filename+".csv","w")
31 |
32 | # and we'll use the `csv` library to create a "writer" that gives us handy
33 | # "recipe" functions for creating our new file in csv format
34 | output_writer = csv.writer(output_file)
35 |
36 | # we have to "hard code" these headers using the contents of `readme.txt`
37 | headers = ["ID","LATITUDE","LONGITUDE","ELEVATION","STATE","NAME",
38 | "GSN_FLAG","HCNCRN_FLAG","WMO_ID"]
39 |
40 | # create a list of `tuple`s with each column's start and end index
41 | column_ranges = [(1,11),(13,20),(22,30),(32,37),(39,40),(42,71),(73,75),
42 | (77,79),(81,85)]
43 |
44 | # write our headers to the output file
45 | output_writer.writerow(headers)
46 |
47 | # loop through each line of our file
48 | for line in stations_list:
49 |
50 | # send our data to be formatted
51 | new_row = convert_to_columns(line, column_ranges)
52 |
53 | # use the `writerow` function to write new_row to our output file
54 | output_writer.writerow(new_row)
55 |
56 | # for good measure, close our output file
57 | output_file.close()
58 |
59 |
60 | def convert_to_columns(data_line, column_info, zero_index=False):
61 |
62 | """Converts a line of text to a list based on the index pairs provided
63 | Parameters
64 | ----------
65 | data_line : str
66 | The line of text to be parsed
67 | column_info : list of tuples
68 | Each tuple provides the start and end index of a data column
69 | zero_index: boolean, optional
70 | If False (default), reduces starting index position by one
71 | Returns
72 | -------
73 | list
74 | a list of data values, stripped of surrounding whitespace
75 | """
76 | new_row = []
77 |
78 | # function assumes that provided indices are *NOT* zero-indexed,
79 | # so reduce starting index values by 1
80 | index_offset = 1
81 |
82 | # if column_info IS zero-indexed, don't offset starting index values
83 | if zero_index:
84 | index_offset = 0
85 |
86 | # go through list of column indices
87 | for index_pair in column_info:
88 |
89 | # pull start value, modifying by `index_offset`
90 | start_index = index_pair[0]-index_offset
91 |
92 | # pull end value
93 | end_index = index_pair[1]
94 |
95 | # strip whitespace from around the data
96 | new_row.append((data_line[start_index:end_index]).strip())
97 |
98 | # return stripped data
99 | return new_row
100 |
101 |
102 | if __name__ == "__main__":
103 | main()
104 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/greet_me_options.py:
--------------------------------------------------------------------------------
1 | # create a function that prints out a greeting to any name
2 | def greet_me(a_name, greeting="Hello"):
3 | print(greeting+" "+a_name)
4 |
5 | # create a variable named author
6 | author = "Susan E. McGregor"
7 |
8 | # create another variable named editor
9 | editor = "Jeff Bleiel"
10 |
11 | # use `greet_me()` to output greeting messages to each person
12 | # say "Hello" by default
13 | greet_me(author)
14 |
15 | # let the programmer specify "Hi" as the greeting
16 | greet_me(editor, greeting="Hi")
17 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/greet_me_revisited.py:
--------------------------------------------------------------------------------
1 | # create a function that prints out a greeting
2 | # to any name passed to the function
3 | def greet_me(a_name):
4 | print("Variable `a_name` in `greet_me`: "+a_name)
5 | print("Hello "+a_name)
6 |
7 | # create a variable named `author`
8 | author = "Susan E. McGregor"
9 |
10 | # create another variable named `editor`
11 | editor = "Jeff Bleiel"
12 |
13 | a_name = "Python"
14 | print("Variable `a_name` in main script: "+a_name)
15 |
16 | # use my custom function, `greet_me` to output "Hello" messages to each person
17 | greet_me(author)
18 | greet_me(editor)
19 |
20 | print("Variable `a_name` in main script again: "+a_name)
21 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/make_greeting.py:
--------------------------------------------------------------------------------
1 | # create a function that **returns** a greeting to any name passed in
2 | def make_greeting(a_name):
3 | return("Hello "+a_name)
4 |
5 | # create a variable named author
6 | author = "Susan E. McGregor"
7 |
8 | # create another variable named editor
9 | editor = "Jeff Bleiel"
10 |
11 | # use my custom function, `make_greeting()` to build and store
12 | # the "Hello" messages to each person
13 | author_greeting = make_greeting(author)
14 | editor_greeting = make_greeting(editor)
15 |
16 | # now `print()` the greetings built and returned by each function call
17 | print(author_greeting)
18 | print(editor_greeting)
19 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/make_greeting_no_vars.py:
--------------------------------------------------------------------------------
1 | # function that returns a greeting to any name passed in
2 | def make_greeting(a_name):
3 | return("Hello "+a_name)
4 |
5 | # function that adds a question to any greeting
6 | def add_question(a_greeting):
7 | return(a_greeting+", how are you?")
8 |
9 | # create a variable named author
10 | author = "Susan E. McGregor"
11 |
12 | # create another variable named editor
13 | editor = "Jeff Bleiel"
14 |
15 | # print the greeting message
16 | print(make_greeting(author))
17 |
18 | # pass the greeting message to the question function and print the result!
19 | print(add_question(make_greeting(editor)))
20 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/webpage_saver.py:
--------------------------------------------------------------------------------
1 | """ Web page Saver!
2 | Downloads the contents of a web page and saves it locally
3 |
4 | Usage
5 | -----
6 | python webpage_saver.py target_url filename
7 |
8 | Parameters
9 | ----------
10 | target_url : str
11 | The full URL of the web page to be downloaded
12 | filename : str
13 | The desired filename of the local copy
14 |
15 | Requirements
16 | ------------
17 | * argparse module
18 | * requests module
19 | """
20 |
21 | # include the requests library in order to get data from the web
22 | import requests
23 |
24 | # include argparse library to pull arguments from the command line
25 | import argparse
26 |
27 | # create a new `ArgumentParser()`
28 | parser = argparse.ArgumentParser()
29 |
30 | # arguments will be assigned based on the order in which they were provided
31 | parser.add_argument("target_url", help="Full URL of web page to be downloaded")
32 | parser.add_argument("filename", help="The desired filename of the local copy")
33 | args = parser.parse_args()
34 |
35 | # pull the url of the web page we're downloading from the provided arguments
36 | target_url = args.target_url
37 |
38 | # pull the intended output filename from the provided arguments
39 | output_filename = args.filename
40 |
41 | # create appropriate header information for our web page request
42 | headers = {
43 | 'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 13597.66.0) ' + \
44 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + \
45 | 'Chrome/88.0.4324.109 Safari/537.36',
46 | 'From': 'YOUR NAME HERE - youremailaddress@emailprovider.com'
47 | }
48 |
49 | # because we're just loading a regular web page, we send a `get` request to the
50 | # URL, along with our informational headers
51 | webpage = requests.get(target_url, headers=headers)
52 |
53 | # opening up a local file to save the contents of the web page to
54 | output_file = open(output_filename,"w")
55 |
56 | # the web page's code is in the `text` property of the website's response
57 | # so write that to our file
58 | output_file.write(webpage.text)
59 |
60 | # close our output file!
61 | output_file.close()
62 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/weekday_rides_refactored.py:
--------------------------------------------------------------------------------
1 | # Objective: Filter all September 2020 Citi Bike rides, and output a new
2 | # file containing only weekday rides
3 | # Program Outline:
4 | # 1. Read in the data file: 202009-citibike-tripdata.csv
5 | # 2. Create a new output file, and write the header row to it.
6 | # 3. For each row in the file, make a date from the `starttime`:
7 | # a. if it's a weekday, write the row to our output file
8 | # 4. Close the output file
9 | # import the "csv" library
10 | import csv
11 | # import the "datetime" library
12 | from datetime import datetime
13 | def main():
14 | # open our data file in "read" mode
15 | source_file = open("202009-citibike-tripdata.csv","r")
16 | # open our output file in "write" mode
17 | output_file = open("202009-citibike-weekday-tripdata.csv","w")
18 | # pass our source_file to the DictReader "recipe"
19 | # and store the result in a variable called `citibike_reader`
20 | citibike_reader = csv.DictReader(source_file)
21 | # create a corresponding DictWriter; specify its fieldnames should
22 | # be drawn from `citibike_reader`
23 | output_writer = csv.DictWriter(output_file, fieldnames=citibike_reader.fieldnames)
24 | # actually write the header row to the output file
25 | output_writer.writeheader()
26 | # loop through our `citibike_reader` rows
27 | for a_row in citibike_reader:
28 | # if the current 'starttime' value is a weekday
29 | if is_weekday(a_row['starttime']):
30 | # write that row of data to our output file
31 | output_writer.writerow(a_row)
32 | # close the output file
33 | output_file.close()
34 |
35 |
36 | def is_weekday(date_string, date_format='%Y-%m-%d %H:%M:%S.%f'):
37 |
38 | # convert the value in the 'date_string' to datetime format
39 | the_date = datetime.strptime(date_string, date_format)
40 |
41 | # if `the_date` is a weekday (i.e., its integer value is 0-4)
42 | if the_date.weekday() <= 4:
43 | return(True)
44 | else:
45 | return(False)
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/xls_meta_and_date_parsing_refactored.py:
--------------------------------------------------------------------------------
1 | # Converting data in an .xls file with Python to csv + metadata file, with
2 | # functional date values using the "xlrd" library.
3 |
4 | # First, pip install the xlrd library:
5 | # https://pypi.org/project/xlrd/2.0.1/
6 | # then, import the `xlrd` library
7 | import xlrd
8 |
9 | # import the csv library
10 | import csv
11 |
12 | # needed to test if a given value is *some* type of number
13 | from numbers import Number
14 |
15 | # for parsing/formatting our newly interpreted Excel dates
16 | from datetime import datetime
17 |
18 | def main():
19 | # use `open_workbook()` to load our data in the `source_workbook` variable
20 | source_workbook = xlrd.open_workbook("fredgraph.xls")
21 |
22 | global the_datemode
23 | the_datemode = source_workbook.datemode
24 |
25 | # open and name a simple metadata text file
26 | source_workbook_metadata = open("fredgraph_metadata.txt","w")
27 |
28 | # an `.xls` workbook can have multiple sheets
29 | for sheet_name in source_workbook.sheet_names():
30 |
31 | # create a variable that points to the current worksheet
32 | current_sheet = source_workbook.sheet_by_name(sheet_name)
33 |
34 | # create "xls_"+sheet_name+".csv" as current sheet's output file
35 | output_file = open("xls_"+sheet_name+"_dates.csv","w")
36 |
37 | # use the `writer()` recipe to write `.csv`-formatted rows
38 | output_writer = csv.writer(output_file)
39 |
40 | # Boolean variable to detect if we've hit our table-type data yet
41 | is_table_data = False
42 |
43 | # now, we need to loop through every row in our sheet
44 | for row_num, row in enumerate(current_sheet.get_rows()):
45 |
46 | # pulling out the value in the first column of the current row
47 | first_entry = current_sheet.row_values(row_num)[0]
48 |
49 | # if we've hit the header row of our data table
50 | if first_entry == 'observation_date':
51 |
52 | # it's time to switch our "flag" value to "True"
53 | is_table_data = True
54 |
55 | # if `is_table_data` is True
56 | if is_table_data:
57 |
58 | # pass the requisite data to our `create_table_row()` function
59 | new_row = create_table_row(current_sheet, row_num)
60 |
61 | # write this new row to the data output file
62 | output_writer.writerow(new_row)
63 |
64 | # otherwise, this row must be metadata
65 | else:
66 |
67 | # pass the requisite data to our `create_meta_text()` function
68 | metadata_line = create_meta_text(current_sheet, row_num)
69 |
70 | # write this new row to the metadata output file
71 | source_workbook_metadata.write(metadata_line)
72 |
73 | # just for good measure, let's close our output files
74 | output_file.close()
75 | source_workbook_metadata.close()
76 |
77 | def create_table_row(the_sheet, the_row_num):
78 |
79 | # extract the table-type data values into separate variables
80 | the_date_num = the_sheet.row_values(the_row_num)[0]
81 | U6_value = the_sheet.row_values(the_row_num)[1]
82 |
83 | # create a new row object with each of the values
84 | new_row = [the_date_num, U6_value]
85 |
86 | # if `the_date_num` is a number, then the current row is *not*
87 | # the header row. We need to transform the date.
88 | if isinstance(the_date_num, Number):
89 |
90 | # use the xlrd library's `xldate_as_datetime()` to generate
91 | # a Python datetime object
92 | the_date_num = xlrd.xldate.xldate_as_datetime(the_date_num, the_datemode)
93 |
94 | # create a new list containing `the_date_num` (formatted to MM/DD/YYYY
95 | # using the `strftime()` recipe) and the value in the second column
96 | new_row = [the_date_num.strftime('%m/%d/%Y'),U6_value]
97 |
98 | # return the fully formatted row
99 | return(new_row)
100 |
101 |
102 | def create_meta_text(the_sheet, the_row_num):
103 |
104 | meta_line = ""
105 |
106 | # since we'd like our metadata file to be nicely formatted, we
107 | # need to loop through the individual cells of each metadata row
108 | for item in the_sheet.row(the_row_num):
109 | # write the value of the cell, followed by a tab character
110 | meta_line = meta_line + item.value + '\t'
111 |
112 | # at the end of each line of metadata, add a newline
113 | meta_line = meta_line+'\n'
114 |
115 | # return the fully formatted line
116 | return(meta_line)
117 |
118 |
119 | if __name__ == "__main__":
120 | main()
121 |
--------------------------------------------------------------------------------
/chapter_8_examples/standalone_files/xls_meta_and_date_parsing_refactored_again.py:
--------------------------------------------------------------------------------
1 | # Converting data in an .xls file with Python to csv + metadata file, with
2 | # functional date values using the "xlrd" library.
3 |
4 | # First, pip install the xlrd library:
5 | # https://pypi.org/project/xlrd/2.0.1/
6 | # then, import the `xlrd` library
7 | import xlrd
8 |
9 | # import the csv library
10 | import csv
11 |
12 | # needed to test if a given value is *some* type of number
13 | from numbers import Number
14 |
15 | # for parsing/formatting our newly interpreted Excel dates
16 | from datetime import datetime
17 |
18 | def main():
19 |
20 | # use `open_workbook()` to load our data in the `source_workbook` variable
21 | source_workbook = xlrd.open_workbook("fredgraph.xls")
22 |
23 | # open and name a simple metadata text file
24 | source_workbook_metadata = open("fredgraph_metadata.txt","w")
25 |
26 | # an `.xls` workbook can have multiple sheets
27 | for sheet_name in source_workbook.sheet_names():
28 |
29 | # create a variable that points to the current worksheet
30 | current_sheet = source_workbook.sheet_by_name(sheet_name)
31 |
32 | # create "xls_"+sheet_name+".csv" as the current sheet's output file
33 | output_file = open("xls_"+sheet_name+"_dates.csv","w")
34 |
35 | # use the `writer()` recipe to write `.csv`-formatted rows
36 | output_writer = csv.writer(output_file)
37 |
38 | # Boolean variable to detect if we've hit our table-type data yet
39 | is_table_data = False
40 |
41 | # now, we need to loop through every row in our sheet
42 | for row_num, row in enumerate(current_sheet.get_rows()):
43 |
44 | # pulling out the value in the first column of the current row
45 | first_entry = current_sheet.row_values(row_num)[0]
46 |
47 | # if we've hit the header row of our data table
48 | if first_entry == 'observation_date':
49 | # it's time to switch our "flag" value to "True"
50 | is_table_data = True
51 |
52 | # if `is_table_data` is True
53 | if is_table_data:
54 |
55 | # extract the table-type data values into separate variables
56 | the_date_num = current_sheet.row_values(row_num)[0]
57 | U6_value = current_sheet.row_values(row_num)[1]
58 |
59 | # if the value is a number, then the current row is *not*
60 | # the header row, so transform the date
61 | if isinstance(the_date_num, Number):
62 | the_date_num = format_excel_date(the_date_num,
63 | source_workbook.datemode)
64 |
65 | # write this new row to the data output file
66 | output_writer.writerow([the_date_num, U6_value])
67 |
68 | # otherwise, this row must be metadata
69 | else:
70 |
71 | # pass the requisite data to our `create_meta_text()` function
72 | metadata_line = create_meta_text(current_sheet, row_num)
73 |
74 | # write this new row to the metadata output file
75 | source_workbook_metadata.write(metadata_line)
76 |
77 | # just for good measure, let's close our output files
78 | output_file.close()
79 | source_workbook_metadata.close()
80 |
81 |
82 | def format_excel_date(a_date_num, the_datemode):
83 |
84 | # use the xlrd library's `xldate_as_datetime()` to generate
85 | # a Python datetime object
86 | a_date_num = xlrd.xldate.xldate_as_datetime(a_date_num, the_datemode)
87 |
88 | # format `a_date_num` as an MM/DD/YYYY string
89 | # using the `strftime()` recipe
90 | formatted_date = a_date_num.strftime('%m/%d/%Y')
91 |
92 | return(formatted_date)
93 |
94 | def create_meta_text(the_sheet, the_row_num):
95 |
96 | meta_line = ""
97 |
98 | # since we'd like our metadata file to be nicely formatted, we
99 | # need to loop through the individual cells of each metadata row
100 | for item in the_sheet.row(the_row_num):
101 |
102 | # write the value of the cell, followed by a tab character
103 | meta_line = meta_line + item.value + '\t'
104 |
105 | # at the end of each line of metadata, add a newline
106 | meta_line = meta_line+'\n'
107 |
108 | return(meta_line)
109 |
110 |
111 | if __name__ == "__main__":
112 | main()
113 |
--------------------------------------------------------------------------------
/chapter_9_examples/jupyter_notebooks/ppp_loan_central_measures.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# `pandas` for reading and assessing our data\n",
10 | "import pandas as pd\n",
11 | "\n",
12 | "# `seaborn` for its built-in themes and chart types\n",
13 | "import seaborn as sns\n",
14 | "\n",
15 | "# `matplotlib` for customizing visual details\n",
16 | "import matplotlib.pyplot as plt\n",
17 | "%matplotlib notebook"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
27 | "# # Import PyDrive and associated libraries.\n",
28 | "# # This only needs to be done once per notebook.\n",
29 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
30 | "# from pydrive.auth import GoogleAuth\n",
31 | "# from pydrive.drive import GoogleDrive\n",
32 | "# from google.colab import auth\n",
33 | "# from oauth2client.client import GoogleCredentials\n",
34 | "\n",
35 | "# # Authenticate and create the PyDrive client.\n",
36 | "# # This only needs to be done once per notebook.\n",
37 | "# auth.authenticate_user()\n",
38 | "# gauth = GoogleAuth()\n",
39 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
40 | "# drive = GoogleDrive(gauth)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
50 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1z6XEYE8Qg2gxkwotc1htbx2Maxuveg7-/view?usp=sharing\n",
51 | "# file_id = '1z6XEYE8Qg2gxkwotc1htbx2Maxuveg7-' # notice where this string comes from in link above\n",
52 | "\n",
53 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
54 | "# print(imported_file['title']) # it should print the title of desired file\n",
55 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "# read in our data\n",
65 | "ppp_data = pd.read_csv('public_150k_plus_221.csv')"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# set a basic color theme for our visualization\n",
75 | "sns.set_theme(style=\"whitegrid\")\n",
76 | "\n",
77 | "# use the built-in `mean()` and `median()` methods in `pandas\n",
78 | "mean = ppp_data['CurrentApprovalAmount'].mean()\n",
79 | "median = ppp_data['CurrentApprovalAmount'].median()\n",
80 | "\n",
81 | "# create a histogram of the values in the `CurrentApprovalAmount` column\n",
82 | "approved_loan_plot = sns.histplot(data=ppp_data, x=\"CurrentApprovalAmount\")\n",
83 | "\n",
84 | "# get the min and max y-values on our histogram\n",
85 | "y_axis_range = approved_loan_plot.get_ylim()\n",
86 | "\n",
87 | "# add the vertical lines at the correct locations\n",
88 | "approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='crimson', ls=':')\n",
89 | "approved_loan_plot.vlines(median, 0, y_axis_range[1], color='green', ls='-')\n",
90 | "\n",
91 | "# the matplotlib `show()` method actually renders the visualization\n",
92 | "plt.show()"
93 | ]
94 | }
95 | ],
96 | "metadata": {
97 | "kernelspec": {
98 | "display_name": "Python 3 (ipykernel)",
99 | "language": "python",
100 | "name": "python3"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.9.5"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 4
117 | }
118 |
--------------------------------------------------------------------------------
/chapter_9_examples/jupyter_notebooks/wing_length_with_sd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# `pandas` to read in our data\n",
10 | "import pandas as pd\n",
11 | "\n",
12 | "# `seaborn` for built-in themes and chart types\n",
13 | "import seaborn as sns\n",
14 | "\n",
15 | "# `matplotlib` for customizing visual details\n",
16 | "import matplotlib.pyplot as plt\n",
17 | "%matplotlib notebook\n",
18 | "\n",
19 | "# `statistics` easily calculating statistical measures\n",
20 | "import statistics\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
30 | "# # Import PyDrive and associated libraries.\n",
31 | "# # This only needs to be done once per notebook.\n",
32 | "# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2\n",
33 | "# from pydrive.auth import GoogleAuth\n",
34 | "# from pydrive.drive import GoogleDrive\n",
35 | "# from google.colab import auth\n",
36 | "# from oauth2client.client import GoogleCredentials\n",
37 | "\n",
38 | "# # Authenticate and create the PyDrive client.\n",
39 | "# # This only needs to be done once per notebook.\n",
40 | "# auth.authenticate_user()\n",
41 | "# gauth = GoogleAuth()\n",
42 | "# gauth.credentials = GoogleCredentials.get_application_default()\n",
43 | "# drive = GoogleDrive(gauth)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB\n",
53 | "# # Link to data file stored in Drive: https://drive.google.com/file/d/1gyWJhIbMNnDlI1SCyLl_pZkxkIcdAhd4/view?usp=sharing\n",
54 | "# file_id = '1gyWJhIbMNnDlI1SCyLl_pZkxkIcdAhd4' # notice where this string comes from in link above\n",
55 | "\n",
56 | "# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file\n",
57 | "# print(imported_file['title']) # it should print the title of desired file\n",
58 | "# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# read in our data\n",
68 | "wing_data = pd.read_csv('wing_length - s057.csv')\n",
69 | "\n",
70 | "# set a basic color theme for our visualization\n",
71 | "sns.set_theme(style=\"white\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# create the histogram, allowing `seaborn` to choose default \"bin\" values\n",
81 | "wing_plot = sns.histplot(data=wing_data, x=\"wing_length (0.1mm)\", kde=\"True\")\n",
82 | "\n",
83 | "# calculate the standard deviation via the `statistics` `stdev()` method\n",
84 | "sd = statistics.stdev(wing_data['wing_length (0.1mm)'])\n",
85 | "\n",
86 | "# get the min and max y-values on our histogram\n",
87 | "y_axis_range = wing_plot.get_ylim()\n",
88 | "\n",
89 | "# plot the mean as a solid line\n",
90 | "mean = wing_data['wing_length (0.1mm)'].mean()\n",
91 | "wing_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-')\n",
92 | "\n",
93 | "# plot the three standard deviation boundary lines on either side of the mean\n",
94 | "for i in range(-3,4):\n",
95 | "\n",
96 | " # find the current boundary value\n",
97 | " z_value = mean + (i*sd)\n",
98 | "\n",
99 | " # don't draw a second line over the mean line\n",
100 | " if z_value != mean:\n",
101 | "\n",
102 | " # plot a dotted gray line at each boundary value\n",
103 | " wing_plot.vlines(z_value, 0, y_axis_range[1], color='gray', ls=':')\n",
104 | "\n",
105 | "# show the plot!\n",
106 | "plt.show()"
107 | ]
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3 (ipykernel)",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.9.5"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 4
131 | }
132 |
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/dollars_per_job_2M_rnd2.py:
--------------------------------------------------------------------------------
1 | # `pandas` for data loading/transformations
2 | import pandas as pd
3 |
4 | # `seaborn` for visualization
5 | import seaborn as sns
6 |
7 | # `matplotlib` for customizing visuals
8 | import matplotlib.pyplot as plt
9 |
10 | # `numpy` for manipulating arrays/lists
11 | import numpy as np
12 |
13 | # load our data
14 | ppp_data = pd.read_csv('public_150k_plus_borrower_fingerprint_a.csv')
15 |
16 | # first, sanity check our data
17 | print(ppp_data[ppp_data['JobsReported'] <= 0])
18 |
19 | # drop the records with no value in `JobsReported`
20 | ppp_data = ppp_data.drop(labels=[437083,765398], axis=0)
21 |
22 | # calculate the dollars per job
23 | dollars_per_job = ppp_data['CurrentApprovalAmount']/ppp_data['JobsReported']
24 |
25 | # insert the new column into our original dataset
26 | ppp_data.insert(3, 'Dollars per Job', dollars_per_job)
27 |
28 | # use `ProcessingMethod` value to identify second-round loans
29 | pps_loans = ppp_data[ppp_data['ProcessingMethod'] == 'PPS']
30 |
31 | # select all second-round loans that have a value of $2M
32 | pps_got_2M = pps_loans[pps_loans['CurrentApprovalAmount'] == 2000000.00]
33 | print("Actual $2M second-round loans:")
34 | print(pps_got_2M.shape)
35 |
36 | # pull fingerprints of businesses approved for $2M second-round loans
37 | biz_names = pd.unique(pps_got_2M['BorrowerNameFingerprint'])
38 |
39 | # convert that list to a DataFrame
40 | biz_names_df = pd.DataFrame(biz_names, columns=['BorrowerNameFingerprint'])
41 |
42 | # create an array of the same length as `biz_names_df`; fill with flag value
43 | fill_column = np.full((len(biz_names),1), '2Mil2ndRnd')
44 | biz_names_df['GotSecond'] = fill_column
45 |
46 | # now merge this new, two-column DataFrame back onto our full_data list
47 | second_round_max = pd.merge(ppp_data, biz_names_df, on='BorrowerNameFingerprint')
48 |
49 | # all loans whose fingerprints match those of businesses that got $2M
50 | # in the second round should have `2Mil2ndRnd` in the `GotSecond` column
51 | second_max_all_loans = second_round_max[
52 | second_round_max['GotSecond'] == '2Mil2ndRnd']
53 |
54 | # should be 2x the number of businesses approved for $2M second-round
55 | print('Total # of loans approved for most orgs that got $2M for second round:')
56 | print(second_max_all_loans.shape)
57 |
58 | # how much money were these businesses approved to get from the PPP, total?
59 | total_funds = second_max_all_loans['CurrentApprovalAmount'].sum()
60 | print("Total funds approved for identified orgs that could have " + \
61 | "second-round max:")
62 | print(total_funds)
63 |
64 | # now, let's plot that new column on our selected dataset
65 | # set the seaborn theme
66 | sns.set_theme(style="whitegrid")
67 |
68 | # use the `matplotlib` `subplots()` function to set up our plotting area
69 | fig, ((row1col1)) = plt.subplots(nrows=1, ncols=1)
70 |
71 | # plot the histogram of `Dollars per Job` for these loans
72 | date_based = sns.histplot(data=second_max_all_loans, x='Dollars per Job',
73 | hue='ProcessingMethod', ax=row1col1)
74 |
75 | # show the plots!
76 | plt.show()
77 |
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/ppp_fingerprint_borrowers.py:
--------------------------------------------------------------------------------
1 | # Quick script for adding a "fingerprint" column to our loan data, which will
2 | # help us confirm/correct for any typos or inconsistencies in, e.g., borrower
3 | # name and address information
4 |
5 | # import the csv library
6 | import csv
7 |
8 | # importing the `fingerprints` library
9 | import fingerprints
10 |
11 | # read the recent data sample into a variable
12 | ppp_data = open('public_150k_plus_221.csv','r')
13 |
14 | # the DictReader function makes our source data more usable, labeling each
15 | # value with its column name from the first or "header" row
16 | ppp_data_reader = csv.DictReader(ppp_data)
17 |
18 | # create an output file to write our modified data set to
19 | augmented_ppp_data = open('public_150k_plus_borrower_fingerprint_a.csv','w')
20 |
21 | # create a "writer" so that we can output whole rows at once
22 | augmented_data_writer = csv.writer(augmented_ppp_data)
23 |
24 | # because we're adding a column, we need to create a new header row as well
25 | header_row = []
26 |
27 | # for every column header
28 | for item in ppp_data_reader.fieldnames:
29 |
30 | # append the existing column header
31 | header_row.append(item)
32 |
33 | # if we're at 'BorrowerName'
34 | if item == 'BorrowerName':
35 |
36 | # it's time to add a new one!
37 | header_row.append('BorrowerNameFingerprint')
38 |
39 | # write the completed header row to the output file
40 | augmented_data_writer.writerow(header_row)
41 |
42 | # iterate through every row in the data
43 | for row in ppp_data_reader:
44 |
45 | # adding a column means we need to build the new row of data
46 | # item by item, just as we did with the header row
47 | new_row = []
48 |
49 | # for each column of data in the *original* data set
50 | for column_name in ppp_data_reader.fieldnames:
51 |
52 | # first, append this row's value for that column
53 | new_row.append(row[column_name])
54 |
55 | # when we get to the 'BorrowerName' column, it's time
56 | # to add our new "fingerprint" value
57 | if column_name == 'BorrowerName':
58 |
59 | # our fingerprint will consist of the generated name fingerprint PLUS
60 | # the fingerprinted BorrowerCity and the BorrowerState
61 | try:
62 | the_fingerprint = fingerprints.generate(row[column_name]) +" "+ fingerprints.generate(row['BorrowerCity'])+" "+row['BorrowerState']
63 | except(TypeError):
64 | the_fingerprint = fingerprints.generate("MISSING") +" "+ fingerprints.generate(row['BorrowerCity'])+" "+row['BorrowerState']
65 |
66 | new_row.append(the_fingerprint)
67 |
68 | # once the whole row is complete, write it to our output file
69 | augmented_data_writer.writerow(new_row)
70 |
71 | # close both files
72 | augmented_ppp_data.close()
73 | ppp_data.close()
74 |
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/ppp_loan_central_and_dist.py:
--------------------------------------------------------------------------------
1 | # `pandas` for reading and assessing our data
2 | import pandas as pd
3 |
4 | # `seaborn` for its built-in themes and chart types
5 | import seaborn as sns
6 |
7 | # `matplotlib` for customizing visual details
8 | import matplotlib.pyplot as plt
9 |
10 | # read in our data
11 | ppp_data = pd.read_csv('public_150k_plus_221.csv')
12 |
13 | # set a basic color theme for our visualization
14 | sns.set_theme(style="whitegrid")
15 |
16 | # use the built-in `mean()` and `median()` methods in `pandas`
17 | mean = ppp_data['CurrentApprovalAmount'].mean()
18 | median = ppp_data['CurrentApprovalAmount'].median()
19 |
20 | # Q1 is the value at the position in our dataset
21 | # that has 25% of data readings to its left
22 | Q1 = ppp_data['CurrentApprovalAmount'].quantile(0.25)
23 |
24 | # Q3 is the value at the position in our dataset
25 | # that has 75% of data readings to its left
26 | Q3 = ppp_data['CurrentApprovalAmount'].quantile(0.75)
27 |
28 | # IQR is the difference between the Q3 and Q1 values
29 | IQR = Q3-Q1
30 |
31 | # and now we calculate our lower and upper bounds
32 | lower_bound = Q1 - (1.5*IQR)
33 | upper_bound = Q3 + (1.5*IQR)
34 |
35 | # use `seaborn` to plot the histogram
36 | approved_loan_plot = sns.histplot(data=ppp_data, x="CurrentApprovalAmount")
37 |
38 | # get the min and max y-values on our histogram
39 | y_axis_range = approved_loan_plot.get_ylim()
40 |
41 | # add mean line in gray
42 | approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-')
43 |
44 | # other lines in black (median solid, others dotted)
45 | approved_loan_plot.vlines(median, 0, y_axis_range[1], color='black', ls='-')
46 | approved_loan_plot.vlines(lower_bound, 0, y_axis_range[1], color='black', ls=':')
47 | approved_loan_plot.vlines(Q1, 0, y_axis_range[1], color='black', ls=':')
48 | approved_loan_plot.vlines(Q3, 0, y_axis_range[1], color='black', ls=':')
49 | approved_loan_plot.vlines(upper_bound, 0, y_axis_range[1], color='black', ls=':')
50 |
51 | # show the plot!
52 | plt.show()
53 |
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/ppp_loan_central_measures.py:
--------------------------------------------------------------------------------
1 | # `pandas` for reading and assessing our data
2 | import pandas as pd
3 |
4 | # `seaborn` for its built-in themes and chart types
5 | import seaborn as sns
6 |
7 | # `matplotlib` for customizing visual details
8 | import matplotlib.pyplot as plt
9 |
10 | # read in our data
11 | ppp_data = pd.read_csv('public_150k_plus_221.csv')
12 |
13 | # set a basic color theme for our visualization
14 | sns.set_theme(style="whitegrid")
15 |
16 | # use the built-in `mean()` and `median()` methods in `pandas`
17 | mean = ppp_data['CurrentApprovalAmount'].mean()
18 | median = ppp_data['CurrentApprovalAmount'].median()
19 |
20 | # create a histogram of the values in the `CurrentApprovalAmount` column
21 | approved_loan_plot = sns.histplot(data=ppp_data, x="CurrentApprovalAmount")
22 |
23 | # get the min and max y-values on our histogram
24 | y_axis_range = approved_loan_plot.get_ylim()
25 |
26 | # add the vertical lines at the correct locations
27 | approved_loan_plot.vlines(mean, 0, y_axis_range[1], color='crimson', ls=':')
28 | approved_loan_plot.vlines(median, 0, y_axis_range[1], color='green', ls='-')
29 |
30 | # the matplotlib `show()` method actually renders the visualization
31 | plt.show()
32 |
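Loan amounts like these tend to be strongly right-skewed, which pulls the mean well above the median; a tiny invented series shows the same effect:

    import pandas as pd

    # nine modest (made-up) amounts plus one very large one
    amounts = pd.Series([1_000, 1_200, 1_500, 2_000, 2_200,
                         2_500, 3_000, 3_500, 4_000, 1_000_000])

    print(amounts.mean())    # 102090.0 -- pulled far upward by the single large value
    print(amounts.median())  # 2350.0 -- the midpoint of the two middle values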
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/who_got_2_loans_by_date.py:
--------------------------------------------------------------------------------
1 | # `pandas` for data loading/transformations
2 | import pandas as pd
3 |
4 | # `seaborn` for visualization
5 | import seaborn as sns
6 |
7 | # `matplotlib` for detailed visualization support
8 | import matplotlib.pyplot as plt
9 |
10 | # `numpy` for manipulating arrays/lists
11 | import numpy as np
12 |
13 | # load our data
14 | ppp_data = pd.read_csv('public_150k_plus_borrower_fingerprint_a.csv')
15 |
16 | # convert the `DateApproved` column to an actual datetime data type
17 | ppp_data['DateApproved'] = pd.to_datetime(ppp_data['DateApproved'])
18 |
19 | # create a variable to hold the second-round start date
20 | second_round_start = pd.to_datetime('2021-01-13')
21 |
22 | # use today's date as the "upper" limit on possible second-round loans
23 | todays_date = pd.to_datetime('today')
24 |
25 | # use 1/1/2020 as a "lower" limit, since it's before the PPP launched
26 | program_start = pd.to_datetime('2020-01-01')
27 |
28 | # pass our boundaries and category labels to the pandas `cut()` function
29 | loan_round = pd.cut(ppp_data.DateApproved,
30 | bins=[program_start,second_round_start, todays_date],
31 | labels=['first_round', 'maybe_second'])
32 |
33 | # insert the new column at the position we specify
34 | ppp_data.insert(2,'Loan Round',loan_round)
35 |
36 | # this "pivot table" will return a Series showing the number
37 | # of times a particular 'BorrowerNameFingerprint' appears in the dataset
38 | loan_count = ppp_data.pivot_table(index=['BorrowerNameFingerprint'], aggfunc='size')
39 |
40 | # convert our Series to a DataFrame and give it a name
41 | loan_count_df = loan_count.to_frame('Loan Count')
42 |
43 | # use the `describe()` method to print out summary statistics
44 | print("Description of duplicate borrower table:")
45 | print(loan_count_df.describe())
46 |
47 | # start by sorting our DataFrame of loan counts from greatest to least
48 | sorted_loan_counts = loan_count_df.sort_values(by=['Loan Count'], ascending=False)
49 |
50 | # create a new DataFrame with *only* those that have more than two loans
51 | more_than_two = sorted_loan_counts[sorted_loan_counts['Loan Count'] > 2]
52 |
53 | # print the `shape` of `more_than_two` to see how many businesses it contains
54 | print("Businesses that seem to have gotten more than 2 loans:")
55 | print(more_than_two.shape)
56 |
57 | print("Number of businesses that appear to have gotten precisely 2 loans:")
58 |
59 | precisely_two = sorted_loan_counts[sorted_loan_counts['Loan Count'] == 2]
60 |
61 | print(precisely_two.shape)
62 |
63 | # use `ProcessingMethod` value to identify second-round loans
64 | pps_loans = ppp_data[ppp_data['ProcessingMethod'] == 'PPS']
65 |
66 | # print out the `shape` of this DataFrame to see how many businesses we have
67 | print("Number of loans labeled as second round:")
68 | print(pps_loans.shape)
69 |
70 | # how many loans in our derived DataFrame were approved for precisely $2M
71 | # during the (possibly) second-round timeframe?
72 | # merge our `loan_count_df` back to keep track of businesses
73 | # we labeled as having precisely two loans
74 | ppp_data_w_lc = pd.merge(ppp_data, loan_count_df,
75 | on=['BorrowerNameFingerprint'], how='left')
76 |
77 | # now get *all* the records of business names we associated with two loans
78 | matched_two_loans = ppp_data_w_lc[(ppp_data_w_lc['Loan Count'] == 2)]
79 |
80 | # select those of our `maybe_second` loans that have a value of $2M
81 | maybe_round2_2M = matched_two_loans[(matched_two_loans[
82 | 'CurrentApprovalAmount'] == 2000000.00) &
83 | (matched_two_loans[
84 | 'Loan Round'] == 'maybe_second')]
85 | print("Derived $2M second-round loans:")
86 | print(maybe_round2_2M.shape)
87 |
88 | # select those loans that we *know* are second round and have a value of $2M
89 | pps_got_2M = pps_loans[pps_loans['CurrentApprovalAmount'] == 2000000.00]
90 | print("Actual $2M second-round loans:")
91 | print(pps_got_2M.shape)
92 |
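The `pd.cut()` step above simply assigns each `DateApproved` value to one of two labeled date ranges; with a handful of made-up dates and the same bin edges it behaves like this:

    import pandas as pd

    dates = pd.to_datetime(pd.Series(['2020-04-15', '2020-08-01',
                                      '2021-01-20', '2021-03-02']))

    buckets = pd.cut(dates,
                     bins=[pd.to_datetime('2020-01-01'),   # before the program launched
                           pd.to_datetime('2021-01-13'),   # second-round start date
                           pd.to_datetime('today')],
                     labels=['first_round', 'maybe_second'])

    # the first two dates land in 'first_round', the last two in 'maybe_second'
    print(buckets)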
--------------------------------------------------------------------------------
/chapter_9_examples/standalone_files/wing_length_with_sd.py:
--------------------------------------------------------------------------------
1 | # `pandas` to read in our data
2 | import pandas as pd
3 |
4 | # `seaborn` for built-in themes and chart types
5 | import seaborn as sns
6 |
7 | # `matplotlib` for customizing visual details
8 | import matplotlib.pyplot as plt
9 |
10 | # `statistics` for easily calculating statistical measures
11 | import statistics
12 |
13 | # read in our data
14 | wing_data = pd.read_csv('wing_length - s057.csv')
15 |
16 | # set a basic color theme for our visualization
17 | sns.set_theme(style="white")
18 |
19 | # create the histogram, allowing `seaborn` to choose default "bin" values
20 | wing_plot = sns.histplot(data=wing_data, x="wing_length (0.1mm)", kde=True)
21 |
22 | # calculate the standard deviation via the `statistics` `stdev()` method
23 | sd = statistics.stdev(wing_data['wing_length (0.1mm)'])
24 |
25 | # get the min and max y-values on our histogram
26 | y_axis_range = wing_plot.get_ylim()
27 |
28 | # plot the mean as a solid line
29 | mean = wing_data['wing_length (0.1mm)'].mean()
30 | wing_plot.vlines(mean, 0, y_axis_range[1], color='gray', ls='-')
31 |
32 | # plot the three standard deviation boundary lines on either side of the mean
33 | for i in range(-3,4):
34 |
35 | # find the current boundary value
36 | z_value = mean + (i*sd)
37 |
38 | # don't draw a second line over the mean line
39 | if z_value != mean:
40 |
41 | # plot a dotted gray line at each boundary value
42 | wing_plot.vlines(z_value, 0, y_axis_range[1], color='gray', ls=':')
43 |
44 | # show the plot!
45 | plt.show()
46 |
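The loop above steps through z-scores from -3 to +3 and draws a boundary line at each one; the same boundary values can be checked by hand with the `statistics` module and a short, made-up list:

    import statistics

    # five made-up wing lengths, in 0.1mm units
    values = [40, 42, 44, 46, 48]

    mean = statistics.mean(values)   # 44
    sd = statistics.stdev(values)    # sample standard deviation, about 3.16

    # the boundary values the plotting loop draws, from -3 to +3 standard
    # deviations (the script skips i == 0 since the mean line is already drawn)
    for i in range(-3, 4):
        print(i, mean + (i * sd))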
--------------------------------------------------------------------------------