├── 010-housing-in-mexico-015-assignment ├── Add new column, drop a column.py ├── Boxplot.py ├── Concat function.py ├── Create new columns.py ├── Drop NaNs.py ├── Drop columns.py ├── Histogram.py ├── Import CSV.py ├── Inspect data.py ├── README.md ├── bar chart.py ├── correlation coefficient.py ├── describe () method.py ├── groupby() method.py ├── scatter plot.py ├── scatter_mapbox.py └── value_counts() method.py ├── 020-housing-in-buenos-aires ├── README.md ├── baseline.py ├── glob ().py ├── important DS libraries.py ├── model: build & fit.py ├── predict.py ├── retrieve data.py ├── split.py └── wrangle () function.py ├── 030-air-quality-in-nairobi ├── ACFplot.py ├── ARmodel.py ├── Baseline.py ├── MongoDB.py ├── PACFplot.py ├── PrettyPrinter.py ├── README.md ├── aggregate().py ├── communicate.py ├── distinct().py ├── finalModel.py ├── libraries.py ├── rollingAvg.py ├── split.py ├── timeSeriesplot.py ├── wfv.py └── wrangle().py ├── 040-earthquake-damage-in-nepal ├── 1) libraries.py ├── 2) connect.py ├── 3) get_tables.py ├── 4) explore_tables.py ├── 5) JOIN.py ├── 6) wrangle().py ├── 7) barChart.py ├── 8) boxplot.py ├── 9) pivot_table.py ├── 91) vertical_split.py ├── 92) horizontal_split.py ├── 93) baseline.py ├── 94) log_reg.py ├── 95) accuracy_score.py ├── 96) decision_tree.py ├── 97) validation_curve.py ├── 98) tests.py ├── 99) communicate.py ├── 991) others.py └── README.md ├── 050-bankruptcy-in-poland ├── GridSearchCV.py ├── README.md ├── acc_score.py ├── bar.py ├── barh.py ├── best_params.py ├── classif_reports.py ├── clf_cv.py ├── conf_matrix.py ├── import.py ├── interactive_dash.py ├── libraries.py ├── naNs.py ├── resampling.py ├── save_and_load.py ├── splits.py └── wrangle.py ├── 060-consumer-finances in-usa ├── 1_import.py ├── 2_explore.py ├── 3_explore.py ├── 4_split.py ├── 6_communicate.py ├── README.md ├── libraries.py └── model.py ├── 070-ds-admissions-in-wqu ├── README.md ├── aggregate.py ├── choropleth_map.py ├── connect.py ├── contingency_bar.py ├── contingency_table.py ├── country_converter.py ├── crosstab.py ├── imports.py ├── load.py ├── mongo_instance.py ├── our_mongo_class.py ├── probability.py ├── run_exp.py ├── statistic_power.py └── statistical_summary.py ├── 080-volatility-forecasting-in-india └── README.md └── README.md /010-housing-in-mexico-015-assignment/Add new column, drop a column.py: -------------------------------------------------------------------------------- 1 | df2.head() 2 | 3 | property_type state region lat lon area_m2 price_brl 4 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 5 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 6 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 7 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 8 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36 9 | 10 | # price_brl ---> price in Brazilian reals 11 | # create new column price_usd 12 | # use 1 USD = 3.19 Brazilian reals 13 | 14 | df2["price_usd"] = df2["price_brl"] / 3.19 15 | df2.head() 16 | 17 | property_type state region lat lon area_m2 price_brl price_usd 18 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 129850.463950 19 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 265958.786834 20 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 93867.799373 21 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 265958.786834 22 | 4 apartment Pernambuco Northeast -8.142666 
-34.906906 80.0 464129.36 145495.097179 23 | 24 | 25 | # DROP COLUMNS 26 | # drop price_brl 27 | 28 | df2 = df2.drop("price_brl", axis="columns") 29 | df2.head() 30 | 31 | property_type state region lat lon area_m2 price_usd 32 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 129850.463950 33 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 265958.786834 34 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 93867.799373 35 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 265958.786834 36 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 145495.097179 37 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Boxplot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.boxplot(df["area_m2"]) 4 | plt.xlabel("Area [sq meters]") 5 | plt.title("Distribution of Home Sizes") 6 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Concat function.py: -------------------------------------------------------------------------------- 1 | # concatenate 2 data frames using concat 2 | 3 | df = pd.concat([df1, df2]) 4 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Create new columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | # Output 4 | property_type place_with_parent_names region lat-lon area_m2 price_usd 5 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 6 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 7 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 8 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 9 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 10 | 11 | df1.info() 12 | 13 | # Output 14 | 15 | RangeIndex: 12834 entries, 0 to 12833 16 | Data columns (total 6 columns): 17 | # Column Non-Null Count Dtype 18 | --- ------ -------------- ----- 19 | 0 property_type 12834 non-null object 20 | 1 place_with_parent_names 12834 non-null object 21 | 2 region 12834 non-null object 22 | 3 lat-lon 11551 non-null object 23 | 4 area_m2 12834 non-null float64 24 | 5 price_usd 12834 non-null object 25 | dtypes: float64(1), object(5) 26 | memory usage: 601.7+ KB 27 | 28 | 29 | df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True) 30 | 31 | # expand ---> increase size of data frame 32 | # without replacing 33 | 34 | df1["lat"] = df1.lat.astype(float) # change lat and lon from type object(string) to type float 35 | df1["lon"] = df1.lon.astype(float) 36 | df1.shape 37 | 38 | # Output 39 | (11551, 8) 40 | 41 | # Example 2 42 | 43 | df1["state"] = df1["place_with_parent_names"].str.split("|", expand=True)[2] 44 | df1.head() 45 | 46 | # Output 47 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 48 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 49 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 50 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 51 | 3 apartment |Brasil|Alagoas|Maceió| 
Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 52 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 53 | 54 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Drop NaNs.py: -------------------------------------------------------------------------------- 1 | # df1.shape before dropping NaNs ---> (12834, 6) 2 | 3 | df1.dropna(inplace=True) # drop rows with null values 4 | df1.shape 5 | 6 | # Output 7 | (11551, 6) 8 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Drop columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 4 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 6 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 9 | 10 | df1 = df1.drop(["lat-lon", "place_with_parent_names"], axis="columns") 11 | df1.head() 12 | 13 | property_type region area_m2 price_usd lat lon state 14 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 15 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 16 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 17 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 18 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 19 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Histogram.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.hist(df["price_usd"]) 4 | plt.xlabel("Price [USD]") 5 | plt.ylabel("Frequency") 6 | plt.title("Distribution of Home Prices") 7 | 8 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Import CSV.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import plotly.express as px 4 | 5 | df1 = pd.read_csv("data/brasil-real-estate-1.csv") 6 | df1.shape 7 | 8 | #Output 9 | (12834, 6) 10 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Inspect data.py: -------------------------------------------------------------------------------- 1 | df1.info() 2 | 3 | # Output 4 | 5 | RangeIndex: 12834 entries, 0 to 12833 6 | Data columns (total 6 columns): 7 | # Column Non-Null Count Dtype 8 | --- ------ -------------- ----- 9 | 0 property_type 12834 non-null object 10 | 1 place_with_parent_names 12834 non-null object 11 | 2 region 12834 non-null object 12 | 3 lat-lon 11551 non-null object 13 | 4 area_m2 12834 non-null float64 14 | 5 price_usd 12834 non-null object 15 | dtypes: float64(1), object(5) 16 | memory usage: 601.7+ KB 17 | 18 | df1.shape 19 
| 20 | # Output 21 | (12834, 6) 22 | 23 | 24 | df1.head() # Displays the first 5 rows starting from 0 25 | 26 | #Output 27 | property_type place_with_parent_names region lat-lon area_m2 price_usd 28 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 29 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 30 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 31 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 32 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 33 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/README.md: -------------------------------------------------------------------------------- 1 | # work-ds-curriculum-010-housing-in-mexico 2 | 3 | WQU DATA SCIENCE LAB PROJECT 1 4 | HOUSING IN MEXICO 5 | 6 | Key concepts learnt and their application 7 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/bar chart.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | mean_price_by_region.plot( 4 | kind="bar", 5 | xlabel="Region", 6 | ylabel="Mean Price [USD]", 7 | title="Mean Home Price by Region" 8 | ); 9 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/correlation coefficient.py: -------------------------------------------------------------------------------- 1 | corr1= homes_by_state["area_m2"].corr(homes_by_state["price_usd"]) 2 | 3 | 0.5773267433717683 # more than half 4 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/describe () method.py: -------------------------------------------------------------------------------- 1 | df.head() 2 | 3 | property_type region area_m2 price_usd lat lon state 4 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 6 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 9 | 10 | dfa = df[["area_m2", "price_usd"]] # subset for a data frame 11 | summary_stats = dfa.describe() 12 | summary_stats 13 | 14 | area_m2 price_usd 15 | count 22844.000000 22844.000000 16 | mean 115.020224 194987.315480 17 | std 47.742932 103617.682978 18 | min 53.000000 74892.340000 19 | 25% 76.000000 113898.770000 20 | 50% 103.000000 165697.555000 21 | 75% 142.000000 246900.880878 22 | max 252.000000 525659.717868 23 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/groupby() method.py: -------------------------------------------------------------------------------- 1 | mean_price_by_region = df.groupby("region")["price_usd"].mean().sort_values(ascending=True) 2 | mean_price_by_region.head() 3 | 4 | region 5 | Central-West 178596.283663 6 | North 181308.958207 7 | Northeast 185422.985441 8 | South 189012.345265 9 | Southeast 208996.762778 10 | Name: price_usd, dtype: float64 11 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/scatter plot.py: 
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | plt.scatter(x=homes_by_state["area_m2"], y=homes_by_state["price_usd"])
4 | plt.xlabel("Area [sq meters]")
5 | plt.ylabel("Price [USD]")
6 | plt.title("Rio Grande do Sul: Price vs. Area");
7 | 
--------------------------------------------------------------------------------
/010-housing-in-mexico-015-assignment/scatter_mapbox.py:
--------------------------------------------------------------------------------
1 | import plotly.express as px
2 | 
3 | fig = px.scatter_mapbox(
4 |     df,
5 |     lat="lat",
6 |     lon="lon",
7 |     center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil
8 |     width=600,
9 |     height=600,
10 |     hover_data=["price_usd"], # Display price when hovering mouse over house
11 | )
12 | 
13 | fig.update_layout(mapbox_style="open-street-map")
14 | 
15 | fig.show()
16 | 
--------------------------------------------------------------------------------
/010-housing-in-mexico-015-assignment/value_counts() method.py:
--------------------------------------------------------------------------------
1 | # counts the occurrences of each unique value in a column
2 | 
3 | homes_by_state = df_south["state"].value_counts()
4 | homes_by_state
5 | 
6 | 
7 | Rio Grande do Sul 2643
8 | Santa Catarina 2634
9 | Paraná 2544
10 | Name: state, dtype: int64
11 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/README.md:
--------------------------------------------------------------------------------
1 | # 020-housing-in-buenos-aires/025-assignment.ipynb
2 | 
3 | New concepts learnt in this project
4 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/baseline.py:
--------------------------------------------------------------------------------
1 | # Done after splitting data
2 | # into a feature matrix and target vector
3 | 
4 | from sklearn.metrics import mean_absolute_error
5 | 
6 | y_mean = y_train.mean()
7 | y_pred_baseline = [y_mean] * len(y_train)
8 | baseline_mae = mean_absolute_error(y_train, y_pred_baseline) # what our model needs to beat
9 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/glob ().py:
--------------------------------------------------------------------------------
1 | # create a list of file paths that share a similar format
2 | from glob import glob
3 | glob("<prefix>-*.<extension>")
4 | 5 | # example 6 | glob("data/programfiles/excel-*.csv") 7 | 8 | # output 9 | ['data/programfiles/excel-1.csv', 10 | 'data/programfiles/excel-4.csv', 11 | 'data/programfiles/excel-3.csv', 12 | 'data/programfiles/excel-5.csv', 13 | 'data/programfiles/excel-2.csv'] 14 | 15 | sorted(glob("data/programfiles/excel-*.csv")) 16 | 17 | ['data/programfiles/excel-1.csv', 18 | 'data/programfiles/excel-2.csv', 19 | 'data/programfiles/excel-3.csv', 20 | 'data/programfiles/excel-4.csv', 21 | 'data/programfiles/excel-5.csv'] 22 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/important DS libraries.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import warnings 5 | warnings.simplefilter(action="ignore", category=FutureWarning) 6 | import plotly.express as px 7 | import pandas as pd 8 | import seaborn as sns 9 | from category_encoders import OneHotEncoder 10 | from IPython.display import VimeoVideo 11 | from ipywidgets import Dropdown, FloatSlider, IntSlider, interact 12 | from sklearn.impute import SimpleImputer 13 | from sklearn.linear_model import LinearRegression, Ridge # noqa F401 14 | from sklearn.metrics import mean_absolute_error 15 | from sklearn.pipeline import make_pipeline 16 | from sklearn.utils.validation import check_is_fitted 17 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/model: build & fit.py: -------------------------------------------------------------------------------- 1 | from category_encoders import OneHotEncoder 2 | from sklearn.impute import SimpleImputer 3 | from sklearn.linear_model import LinearRegression, Ridge 4 | from sklearn.pipeline import make_pipeline 5 | 6 | # build 7 | model = make_pipeline( 8 | OneHotEncoder(use_cat_names=True), 9 | SimpleImputer(), 10 | Ridge() 11 | ) 12 | 13 | # fit... 
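# (fitting the pipeline runs OneHotEncoder and SimpleImputer on X_train first, then trains
#  Ridge on the transformed features; make_pipeline names the steps after their lowercased
#  class names — "onehotencoder", "simpleimputer", "ridge" — which is how named_steps is
#  indexed later in retrieve data.py)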
14 | model.fit(X_train, y_train) 15 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/predict.py: -------------------------------------------------------------------------------- 1 | y_test_pred = pd.Series(model.predict(X_test)) 2 | y_test_pred.head() 3 | 4 | # sample output 5 | 0 53538.366480 6 | 1 53171.988369 7 | 2 34263.884179 8 | 3 53488.425607 9 | 4 68738.924884 10 | dtype: float64 11 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/retrieve data.py: -------------------------------------------------------------------------------- 1 | # retrieve intercept 2 | intercept = model.named_steps["ridge"].intercept_ 3 | 4 | # retrieve coefficients 5 | coefficients = model.named_steps["ridge"].coef_ 6 | 7 | # retrieve names 8 | features = model.named_steps["onehotencoder"].get_feature_names() 9 | 10 | # create a series of names and values 11 | feat_imp = pd.Series(coefficients, index=features) 12 | feat_imp 13 | 14 | # sample output 15 | surface_covered_in_m2 291.654156 16 | lat 478.901375 17 | lon -2492.221814 18 | borough_Benito Juárez 13778.188880 19 | borough_Iztacalco 405.403127 20 | borough_Azcapotzalco 2459.288646 21 | borough_Coyoacán 3737.561001 22 | borough_Álvaro Obregón 3275.121061 23 | borough_Iztapalapa -13349.017448 24 | borough_Cuauhtémoc -350.531990 25 | borough_Tláhuac -14166.869486 26 | borough_Miguel Hidalgo 1977.314718 27 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/split.py: -------------------------------------------------------------------------------- 1 | # splitting data into feature matrix and target vector 2 | 3 | target = "price_aprox_usd" # <--- vector 4 | features = ["surface_covered_in_m2", "lat", "lon", "borough"] # <--- matrix 5 | X_train = df[features] # training data 6 | y_train = df[target] # " " " " 7 | 8 | # The vector is what we are trying to predict using the matrix 9 | # In this case we are trying to predict the price of a property 10 | # using the features in the matrix 11 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/wrangle () function.py: -------------------------------------------------------------------------------- 1 | # Wrangle function: 2 | # read in a csv file 3 | # apartments in < $100000 4 | # remove outliers 5 | # separate columns 6 | # create new columns from existing 7 | # take care of highly null columns 8 | # low and high cardinality 9 | # Leakage 10 | # multicolinearity 11 | 12 | def wrangle(filepath): 13 | # Read CSV file 14 | df = pd.read_csv(filepath) 15 | 16 | # Subset data: Apartments in , less than 100,000 17 | mask_ba = df["place_with_parent_names"].str.contains() 18 | mask_apt = df["property_type"] == "apartment" 19 | mask_price = df["price_aprox_usd"] < 100_000 20 | df = df[mask_ba & mask_apt & mask_price] 21 | 22 | # Subset data: Remove outliers for "surface_covered_in_m2" 23 | low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9]) 24 | mask_area = df["surface_covered_in_m2"].between(low, high) 25 | df = df[mask_area] 26 | 27 | # split lat-lon column 28 | df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float) 29 | df.drop(columns="lat-lon", inplace=True) 30 | 31 | # Extract newColumnName 32 | df[] = df["place_with_parent_names"].str.split("|", expand=True)[1] 33 | df.drop(columns="place_with_parent_names", inplace=True) 34 | 35 | # Drop 
feature with high null count 36 | df.drop(columns=["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"], inplace=True) 37 | 38 | # Drop low- and high- categorical variables 39 | df.drop(columns=["operation", "property_type", "currency", "properati_url"], inplace=True) 40 | 41 | # Drop leaky columns 42 | df.drop(columns=["price", "price_aprox_local_currency", "price_per_m2"], inplace=True) 43 | 44 | # Drop columns with multi-colinerlity 45 | #df.drop(columns=["surface_total_in_m2", "rooms"], inplace=True) 46 | 47 | return df 48 | 49 | 50 | test1.isnull().sum() / len(test1) # check for highly null columns 51 | test1.select_dtypes("object").nunique() # check for low- and high- categorical variables 52 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/ACFplot.py: -------------------------------------------------------------------------------- 1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 2 | import matplotlib.pyplot as plt 3 | 4 | fig, ax = plt.subplots(figsize=(15, 6)) 5 | plot_acf(y, ax=ax) 6 | plt.xlabel(<"xLabelvalue">) 7 | plt.ylabel(<"yLabelvalue">) 8 | plt.title(<"yourTitle">); 9 | 10 | # Don't delete the code below 👇 11 | plt.savefig("images/3-5-7.png", dpi=150) 12 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/ARmodel.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.ar_model import AutoReg 2 | from sklearn.metrics import mean_absolute_error 3 | 4 | # Use AR model to predict PM2.5 readings 5 | # Hyperparameter --> p 6 | p_params = range(1, 31) 7 | maes = [] 8 | for p in p_params: 9 | #Train model 10 | model = AutoReg(y_train, lags=p).fit() 11 | 12 | #Generate in-sample pred 13 | y_pred = model.predict().dropna() 14 | 15 | #Calculate mae 16 | mae = mean_absolute_error(y_train.iloc[p:], y_pred) 17 | maes.append(mae) 18 | 19 | mae_series = pd.Series(maes, name="mae", index=p_params) 20 | mae_series.head() 21 | 22 | # sample output 23 | 1 0.947888 24 | 2 0.933894 25 | 3 0.920850 26 | 4 0.920153 27 | 5 0.919519 28 | Name: mae, dtype: float64 29 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/Baseline.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import mean_absolute_error 2 | 3 | y_train_mean = y_train.mean() 4 | y_pred_baseline = [y_train_mean] * len(y_train) 5 | mae_baseline = mean_absolute_error(y_train, y_pred_baseline) 6 | 7 | print("Mean P2 Reading:", y_train_mean) 8 | print("Baseline MAE:", mae_baseline) 9 | 10 | # sample output 11 | Mean P2 Reading: 8.617582545265433 12 | Baseline MAE: 4.07658759405218 13 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/MongoDB.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # Connect to server 4 | client = MongoClient(host=<"hostName">, port=) 5 | 6 | # Connect to database 7 | db = client[<"databaseName">] 8 | 9 | # Get collection 10 | dar = db[<"collectionName">] 11 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/PACFplot.py: -------------------------------------------------------------------------------- 1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 2 | import matplotlib.pyplot 
as plt 3 | 4 | fig, ax = plt.subplots(figsize=(15, 6)) 5 | plot_pacf(y, ax=ax) # -----> line showing difference from acf plot <----- 6 | plt.xlabel(<"xLabelvalue">) 7 | plt.ylabel(<"yLabelvalue">) 8 | plt.title(<"yourTitle">); 9 | 10 | # Don't delete the code below 👇 11 | plt.savefig("images/3-5-7.png", dpi=150) 12 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/PrettyPrinter.py: -------------------------------------------------------------------------------- 1 | from pprint import PrettyPrinter 2 | 3 | # Instantiate prettyprinter ----> for nicely formatted output 4 | pp = PrettyPrinter(indent=2) 5 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/README.md: -------------------------------------------------------------------------------- 1 | # 030-Air-Quality-In-Nairobi 2 | 3 | - Data wrangling with MongoDB 4 | - LinearRegression with time Series data 5 | - Autoregressive models 6 | - ARMA models and Hyperparameter tuning 7 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/aggregate().py: -------------------------------------------------------------------------------- 1 | # Determine which collection 2 | # has the most sensor readings 3 | # $ --> introduces sth new 4 | result = dar.aggregate( 5 | [ 6 | {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}} 7 | ] 8 | ) 9 | readings_per_site = list(result) 10 | readings_per_site 11 | 12 | # sample output 13 | [{'_id': 23, 'count': 60020}, {'_id': 11, 'count': 138412}] 14 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/communicate.py: -------------------------------------------------------------------------------- 1 | import plotly.express as px 2 | import pandas as pd 3 | 4 | # Put test and walk-forward validation values 5 | # in a dataframe and plot df 6 | df_pred_test = pd.DataFrame( 7 | {"y_test": y_test, "y_pred_wfv": y_pred_wfv} 8 | ) 9 | fig = px.line(df_pred_test, labels={"value": "PM2.5"}) 10 | fig.update_layout( 11 | title="Dar es Salaam, WFV Predictions", 12 | xaxis_title="Date", 13 | yaxis_title="PM2.5 Level", 14 | ) 15 | 16 | # Don't delete the code below 👇 17 | fig.write_image("images/3-5-18.png", scale=1, height=500, width=700) 18 | 19 | fig.show() 20 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/distinct().py: -------------------------------------------------------------------------------- 1 | # Determine no. of sites in collection 2 | sites = dar.distinct("metadata.site") # dar ---> variable holding collection 3 | sites 4 | 5 | # Sample output 6 | [11, 23] 7 | 8 | # count no. 
of docs at a prticular site 9 | # using count_documents() 10 | dar.count_documents({"metadata.site": 23}) 11 | 12 | # Sample output 13 | 60020 14 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/finalModel.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.ar_model import AutoReg 2 | from statsmodels.tsa.arima.model import ARIMA 3 | 4 | mae_series # locate best_p 5 | best_p = 28 6 | 7 | # build and train model 8 | best_model = AutoReg(y_train, lags=best_p).fit() 9 | 10 | # calculate training residuals for best_model 11 | y_train_resid = best_model.resid 12 | y_train_resid.name = "residuals" 13 | y_train_resid.head() 14 | 15 | # sample output 16 | timestamp 17 | 2018-01-02 07:00:00+03:00 1.732488 18 | 2018-01-02 08:00:00+03:00 -0.381568 19 | 2018-01-02 09:00:00+03:00 -0.560971 20 | 2018-01-02 10:00:00+03:00 -2.215760 21 | 2018-01-02 11:00:00+03:00 0.006468 22 | Freq: H, Name: residuals, dtype: float64 23 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/libraries.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import time 3 | from pprint import PrettyPrinter 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | import plotly.express as px 7 | import seaborn as sns 8 | from pymongo import MongoClient 9 | import pytz 10 | from statsmodels.tsa.ar_model import AutoReg 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.metrics import mean_absolute_error 13 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 14 | from statsmodels.tsa.arima.model import ARIMA 15 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/rollingAvg.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | fig, ax = plt.subplots(figsize=(15, 6)) 4 | y.rolling(168).mean().plot(ax=ax, xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average"); 5 | # --> 168 == num of hours in a week 6 | 7 | # Don't delete the code below 👇 8 | plt.savefig("images/3-5-6.png", dpi=150) 9 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/split.py: -------------------------------------------------------------------------------- 1 | # percentage ---> 90% (0.9), 80% (0.8) ... 
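# e.g. for the 90/10 split implied by the sample output below (1,533 train vs. 171 test):
# cutoff_test = int(len(y) * 0.90)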
2 | cutoff_test = int(len(y) * ) 3 | y_train = y.iloc[:cutoff_test] 4 | y_test = y.iloc[cutoff_test:] 5 | print("y_train shape:", y_train.shape) 6 | print("y_test shape:", y_test.shape) 7 | 8 | # sample output 9 | y_train shape: (1533,) 10 | y_test shape: (171,) 11 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/timeSeriesplot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | fig, ax = plt.subplots(figsize=(15, 6)) 4 | y.plot(xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels", ax=ax); 5 | 6 | # Don't delete the code below 👇 7 | plt.savefig("images/3-5-5.png", dpi=150) 8 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/wfv.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.arima.model import ARIMA 2 | 3 | # walk-forward validation for model for test data --> y_test 4 | # predictions stored in series: y_pred_wfv 5 | y_pred_wfv = pd.Series() 6 | history = y_train.copy() 7 | for i in range(len(y_test)): 8 | model = AutoReg(history, lags=best_p).fit() 9 | next_pred = model.forecast() # next value after end of history 10 | y_pred_wfv = y_pred_wfv.append(next_pred) 11 | history = history.append(y_test[next_pred.index]) 12 | 13 | y_pred_wfv.name = "prediction" 14 | y_pred_wfv.index.name = "timestamp" 15 | y_pred_wfv.head() 16 | 17 | # sample output 18 | timestamp 19 | 2018-03-06 00:00:00+03:00 8.056391 20 | 2018-03-06 01:00:00+03:00 8.681779 21 | 2018-03-06 02:00:00+03:00 6.268951 22 | 2018-03-06 03:00:00+03:00 6.303760 23 | 2018-03-06 04:00:00+03:00 7.171444 24 | Freq: H, Name: prediction, dtype: float64 25 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/wrangle().py: -------------------------------------------------------------------------------- 1 | # Wrangle function 2 | # Extract PM2.5 readings 3 | # from collection site with 4 | # most readings 5 | # Localize time 6 | # Remove outliers 7 | # Resample data to provide PM2.5 readings 8 | # for each hour 9 | # impute missing values 10 | # return series 11 | def wrangle(collection): 12 | results = collection.find( 13 | {"metadata.site": 11, "metadata.measurement": "P2"}, 14 | projection={"P2": 1, "timestamp": 1, "_id": 0}, # ---> focus/ limit to only "P2" and timestamp 15 | ) 16 | 17 | df = pd.DataFrame(results).set_index("timestamp") 18 | 19 | # Localize time 20 | df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam") 21 | 22 | # Remove outliers 23 | df = df[df["P2"] < 100] 24 | 25 | # Resample to 1hour period, fill in missing values 26 | y = df["P2"].resample("1H").mean().fillna(method='ffill') 27 | 28 | return y 29 | 30 | # Using wrangle() 31 | y = wrangle(dar) 32 | y.head() 33 | 34 | # sample output 35 | timestamp 36 | 2018-01-01 03:00:00+03:00 9.456327 37 | 2018-01-01 04:00:00+03:00 9.400833 38 | 2018-01-01 05:00:00+03:00 9.331458 39 | 2018-01-01 06:00:00+03:00 9.528776 40 | 2018-01-01 07:00:00+03:00 8.861250 41 | Freq: H, Name: P2, dtype: float64 42 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/1) libraries.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as 
sns
6 | from category_encoders import OneHotEncoder
7 | from category_encoders import OrdinalEncoder
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.metrics import accuracy_score
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.pipeline import Pipeline, make_pipeline
12 | from sklearn.tree import DecisionTreeClassifier, plot_tree
13 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/2) connect.py:
--------------------------------------------------------------------------------
1 | %load_ext sql
2 | %sql sqlite:////home/jovyan/nepal.sqlite
3 | 
4 | # sample output
5 | 'Connected: @/home/jovyan/nepal.sqlite'
6 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/3) get_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT name
3 | FROM sqlite_schema
4 | WHERE type = 'table'
5 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/4) explore_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT distinct(district_id) -- gives unique values of column district_id
3 | FROM id_map -- name of table
4 | 
5 | 
6 | # num of observations in table id_map
7 | # where value of column district_id is 1
8 | %%sql
9 | SELECT count(*)
10 | FROM id_map
11 | WHERE district_id = 1
12 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/5) JOIN.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | -- joining tables on column building_id
3 | SELECT distinct(i.building_id) AS b_id, -- building_id column of table i aliased as b_id
4 |        s.*, -- selects all columns of table s
5 |        d.damage_grade -- select damage_grade column of table d
6 | FROM id_map AS i
7 | JOIN building_structure AS s ON i.building_id = s.building_id
8 | JOIN building_damage AS d ON i.building_id = d.building_id
9 | WHERE district_id = 3
10 | LIMIT 5
11 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/6) wrangle().py:
--------------------------------------------------------------------------------
1 | def wrangle(db_path):
2 |     # Connect to database using connect method
3 |     conn = sqlite3.connect(db_path)
4 | 
5 |     # Construct query
6 |     query = """
7 |         SELECT distinct(i.building_id) AS b_id,
8 |                s.*,
9 |                d.damage_grade
10 |         FROM id_map AS i
11 |         JOIN building_structure AS s ON i.building_id = s.building_id
12 |         JOIN building_damage AS d ON i.building_id = d.building_id
13 |         WHERE district_id = 3
14 |     """
15 | 
16 |     # Read query results into DataFrame
17 |     df = pd.read_sql(query, conn, index_col="b_id")
18 | 
19 |     # Identify leaky columns
20 |     drop_cols = [col for col in df.columns if "post_eq" in col]
21 | 
22 |     # Create binary target
23 |     df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
24 |     df["severe_damage"] = (df["damage_grade"] > 3).astype(int) # encode as 0's and 1's
25 | 
26 |     # Drop old target
27 |     drop_cols.append("damage_grade")
28 | 
29 |     # Drop multicollinearity column
30 |     drop_cols.append("count_floors_pre_eq")
31 | 
32 |     # Drop high-cardinality categorical feature
33 |     drop_cols.append("building_id")
34 | 
35 |     # Drop columns
36 |     df.drop(columns=drop_cols, inplace=True)
37 | 
38 | 
39 |     return df
40 | 
41 | 
42 | # Using wrangle func
43 | df
= wrangle("/home/jovyan/nepal.sqlite") 44 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/7) barChart.py: -------------------------------------------------------------------------------- 1 | # create bar chart using 2 | # severe damage column which 3 | # contains two classes 4 | df["severe_damage"].value_counts(normalize=True).plot( 5 | kind="bar", xlabel="Severe Damage", ylabel="Relative Frequency", title="Class Balance" 6 | ); 7 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/8) boxplot.py: -------------------------------------------------------------------------------- 1 | # severe_damage: column with 2 groups 2 | # plinth_area_sq_ft: column: footprint size of building 3 | 4 | sns.boxplot(x="severe_damage", y="plinth_area_sq_ft", data=df) 5 | plt.xlabel("Severe Damage") 6 | plt.ylabel("Plinth Area [sq. ft.]") 7 | plt.title("Kavrepalanchok, Plinth Area vs Building Damage"); 8 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/9) pivot_table.py: -------------------------------------------------------------------------------- 1 | roof_pivot = pd.pivot_table( 2 | df, index="roof_type", values="severe_damage", aggfunc=np.mean # roof_type: column in table 3 | ).sort_values(by="severe_damage") 4 | roof_pivot 5 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/91) vertical_split.py: -------------------------------------------------------------------------------- 1 | X = df.drop(columns="severe_damage") # feature matrix: all columns apart from severe_damage 2 | y = df["severe_damage"] # target vector 3 | print("X shape:", X.shape) 4 | print("y shape:", y.shape) 5 | 6 | # sample output 7 | X shape: (76533, 11) 8 | y shape: (76533,) 9 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/92) horizontal_split.py: -------------------------------------------------------------------------------- 1 | X_train, X_val, y_train, y_val = train_test_split( 2 | X, y, test_size=0.2, random_state=42 3 | ) 4 | print("X_train shape:", X_train.shape) 5 | print("y_train shape:", y_train.shape) 6 | print("X_val shape:", X_val.shape) 7 | print("y_val shape:", y_val.shape) 8 | 9 | # sample output 10 | X_train shape: (61226, 11) 11 | y_train shape: (61226,) 12 | X_val shape: (15307, 11) 13 | y_val shape: (15307,) 14 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/93) baseline.py: -------------------------------------------------------------------------------- 1 | acc_baseline = y_train.value_counts(normalize=True).max() # normalize gives you the relative freq 2 | print("Baseline Accuracy:", round(acc_baseline, 2)) 3 | 4 | # sample output 5 | Baseline Accuracy: 0.55 6 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/94) log_reg.py: -------------------------------------------------------------------------------- 1 | model_lr = make_pipeline( 2 | OneHotEncoder(use_cat_names=True), 3 | LogisticRegression(max_iter=<1000-3000>) #max_iter: varies: suppresses the 'ConvergenceWarning' 4 | ) 5 | # Fit model to training data 6 | model_lr.fit(X_train, y_train) 7 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/95) 
accuracy_score.py: -------------------------------------------------------------------------------- 1 | lr_train_acc = accuracy_score(y_train, model_lr.predict(X_train)) 2 | lr_val_acc = model_lr.score(X_val, y_val) 3 | 4 | print("Logistic Regression, Training Accuracy Score:", lr_train_acc) 5 | print("Logistic Regression, Validation Accuracy Score:", lr_val_acc) 6 | 7 | # sample output 8 | Logistic Regression, Training Accuracy Score: 0.6515042628948486 9 | Logistic Regression, Validation Accuracy Score: 0.6536878552296335 10 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/96) decision_tree.py: -------------------------------------------------------------------------------- 1 | depth_hyperparams = range(1, 16) # for max_depth 2 | training_acc = [] 3 | validation_acc = [] 4 | for d in depth_hyperparams: 5 | model_dt = make_pipeline( 6 | OrdinalEncoder(), 7 | DecisionTreeClassifier(max_depth= d, random_state=42) 8 | ) 9 | # Fit model to training data 10 | model_dt.fit(X_train, y_train) 11 | # Calculate training accuracy score and append to `training_acc` 12 | training_acc.append(model_dt.score(X_train, y_train)) 13 | # Calculate validation accuracy score and append to `training_acc` 14 | validation_acc.append(model_dt.score(X_val, y_val)) 15 | 16 | print("Training Accuracy Scores:", training_acc[:6]) 17 | print("Validation Accuracy Scores:", validation_acc[:6]) 18 | 19 | 20 | # sample output 21 | Training Accuracy Scores: [0.6303041191650606, 0.6303041191650606, 0.642292490118577, 0.653529546271192, 0.6543951915852743, 0.6576617776761506] 22 | Validation Accuracy Scores: [0.6350035931273273, 0.6350035931273273, 0.6453909975828053, 0.6527732410008493, 0.6529039001763899, 0.6584569151368654] 23 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/97) validation_curve.py: -------------------------------------------------------------------------------- 1 | # Validation curve 2 | plt.plot(depth_hyperparams, training_acc, label="Training") 3 | plt.plot(depth_hyperparams, validation_acc, label="validation") 4 | plt.xlabel("Max Depth") 5 | plt.ylabel("Accuracy Score") 6 | plt.title("Validation Curve, Decision Tree Model") 7 | plt.legend(); 8 | 9 | 10 | # build & fit again 11 | final_model_dt = make_pipeline( 12 | OrdinalEncoder(), 13 | DecisionTreeClassifier(max_depth=10, random_state=42) 14 | ) 15 | # Fit model to training data 16 | final_model_dt.fit(X, y) #final_model_dt.fit(X_train, y_train) 17 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/98) tests.py: -------------------------------------------------------------------------------- 1 | # test type 1 2 | X_test = pd.read_csv("filePath.csv", index_col="b_id") 3 | y_test_pred = pd.Series(final_model_dt.predict(X_test)) 4 | y_test_pred[:5] 5 | 6 | # sample output 7 | 0 1 8 | 1 1 9 | 2 1 10 | 3 1 11 | 4 0 12 | dtype: int64 13 | 14 | 15 | # test type 2 16 | test_acc = model.score(X_test, y_test) 17 | print("Test Accuracy:", round(test_acc, 2)) 18 | 19 | # sample output 20 | Test Accuracy: 0.72 21 | 22 | 23 | # test type 3 24 | acc_train = accuracy_score(y_train, model_lr.predict(X_train)) 25 | acc_test = model_lr.score(X_test, y_test) 26 | 27 | print("LR Training Accuracy:", acc_train) 28 | print("LR Validation Accuracy:", acc_test) 29 | 30 | # sample output 31 | LR Training Accuracy: 0.717985042664646 32 | LR Validation Accuracy: 0.7218817948211109 33 | 
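# A quick way to inspect the fitted tree itself: plot_tree is imported in "1) libraries.py"
# but not used in the snippets above. A minimal sketch, assuming final_model_dt from
# "97) validation_curve.py" has already been fit:
fig, ax = plt.subplots(figsize=(25, 12))
plot_tree(
    decision_tree=final_model_dt.named_steps["decisiontreeclassifier"],
    feature_names=X_train.columns,
    filled=True,      # color nodes by majority class
    max_depth=2,      # draw only the first two levels to keep the plot readable
    ax=ax,
);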
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/99) communicate.py:
--------------------------------------------------------------------------------
1 | # DECISION TREE
2 | features = X_train.columns
3 | importances = final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
4 | feat_imp = pd.Series(importances, index=features).sort_values()
5 | feat_imp.head()
6 | 
7 | # sample output
8 | plan_configuration 0.004189
9 | land_surface_condition 0.008599
10 | foundation_type 0.009967
11 | position 0.011795
12 | ground_floor_type 0.013521
13 | dtype: float64
14 | 
15 | 
16 | # LOGISTIC REG
17 | features = model_lr.named_steps["onehotencoder"].get_feature_names()
18 | importances = model_lr.named_steps["logisticregression"].coef_[0]
19 | feat_imp = pd.Series(np.exp(importances), index=features).sort_values()
20 | feat_imp.head()
21 | 
22 | # sample output
23 | superstructure_Brick, cement mortar 0.345719
24 | foundation_type_RC 0.364478
25 | roof_type_RCC/RB/RBC 0.415979
26 | ground_floor_type_RC 0.527756
27 | caste_household_Kumal 0.543642
28 | dtype: float64
29 | 
30 | 
31 | 
32 | # horizontal bar chart
33 | feat_imp.plot(kind="barh")
34 | plt.xlabel("importance")
35 | plt.ylabel("Label")
36 | plt.title("Feature Importance");
37 | 
38 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/991) others.py:
--------------------------------------------------------------------------------
1 | # Create DF called 'damage_by_vdcmun'
2 | # group DF by "vdcmun_id"
3 | # calculating mean of the "severe_damage" column.
4 | # Be sure to sort from highest to lowest proportion
5 | damage_by_vdcmun = (
6 |     df.groupby("vdcmun_id")["severe_damage"].mean().sort_values(ascending=False)
7 | ).to_frame()
8 | damage_by_vdcmun
9 | 
10 | 
11 | # Line plot
12 | plt.plot(damage_by_vdcmun.values, color="blue")
13 | plt.xticks(range(len(damage_by_vdcmun)), labels=damage_by_vdcmun.index)
14 | plt.yticks(np.arange(0.0, 1.1, 0.2))
15 | plt.xlabel("Mun ID")
16 | plt.ylabel("% Households")
17 | plt.title("Damage by Municipality");
18 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/README.md:
--------------------------------------------------------------------------------
1 | # 040-Earthquake-Damage-in-Nepal
2 | 
3 | - sqlite
4 | - logistic-regression
5 | - decision-tree
6 | - demographics
7 | - Ethical Data Science
8 | 
--------------------------------------------------------------------------------
/050-bankruptcy-in-poland/GridSearchCV.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | # Range of hyperparameters
4 | params = {
5 |     "simpleimputer__strategy": ["mean", "median"],
6 |     "randomforestclassifier__n_estimators": range(25, 100, 25),
7 |     "randomforestclassifier__max_depth": range(10, 50, 10)
8 | }
9 | 
10 | # Using `GridSearchCV`
11 | model = GridSearchCV(
12 |     clf,
13 |     param_grid=params,
14 |     cv=5,
15 |     n_jobs=-1,
16 |     verbose=1
17 | )
18 | 
19 | # Fit your model
20 | model.fit(X_train_over, y_train_over)
21 | 
22 | # cross-validation results
23 | cv_results = pd.DataFrame(model.cv_results_)
24 | cv_results.head(5)
25 | 
--------------------------------------------------------------------------------
/050-bankruptcy-in-poland/README.md:
--------------------------------------------------------------------------------
1 | # 050-bankruptcy-in-poland
2 | ## Concepts learnt
3 | - Working with JSON 4 | - Imbalanced data 5 | - Random forest 6 | - Gradient boosting 7 | - Linux command line 8 | - Creating python modules 9 | - Importing functions from modules 10 | - Saving and loading a model 11 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/acc_score.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | acc_train = model.score(X_train, y_train) 4 | acc_test = model.score(X_test, y_test) 5 | 6 | print("Model Training Accuracy:", round(acc_train, 4)) 7 | print("Model Test Accuracy:", round(acc_test, 4)) 8 | 9 | # Sample output 10 | Model Training Accuracy: 1.0 11 | Model Test Accuracy: 0.9764 12 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/bar.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | df["bankrupt"].value_counts(normalize=True).plot( 4 | kind = "bar", 5 | xlabel = "Bankrupt", 6 | ylabel = "Frequency", 7 | title = "Class Balance" 8 | ); 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/barh.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Get feature names from training data 4 | features = X_train_over.columns 5 | 6 | # Extract importances from model 7 | importances = model.best_estimator_.named_steps["randomforestclassifier"].feature_importances_ 8 | 9 | # Create a series with feature names and importances 10 | feat_imp = pd.Series(importances, index=features).sort_values() 11 | 12 | # Plot 10 most important features 13 | feat_imp.tail(10).plot(kind="barh") 14 | plt.xlabel("...") 15 | plt.ylabel("...") 16 | plt.title("..."); 17 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/best_params.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Method 1 4 | best_params = model.best_params_ 5 | print(best_params) 6 | 7 | # Method 2 8 | model.predict(X_train_over) 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/classif_reports.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | class_report = classification_report(y_test, model.predict(X_test)) 4 | print(class_report) 5 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/clf_cv.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # # classifier 4 | clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42)) 5 | 6 | # cross validation 7 | cv_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1) 8 | print(cv_scores) 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/conf_matrix.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | ConfusionMatrixDisplay.from_estimator(model, X_test, y_test); 4 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/import.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # 
Compressed file --> dict 4 | with gzip.open("", "r") as f: 5 | taiwan_data = json.load(f) 6 | 7 | # Extracting keys from a dict 8 | taiwan_data_keys = taiwan_data.keys() 9 | print(taiwan_data_keys) 10 | 11 | # Sample output 12 | dict_keys(['schema', 'metadata', 'observations']) 13 | 14 | # Counting number of observations 15 | len(taiwan_data["observations"]) 16 | 17 | # Length / no. of each observation 18 | len(taiwan_data["observations"][0]) 19 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/interactive_dash.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | def make_cnf_matrix(threshold): 4 | y_pred_proba = model.predict_proba(X_test)[:, -1] 5 | y_pred = y_pred_proba > threshold 6 | conf_matrix = confusion_matrix(y_test, y_pred) 7 | tn, fp, fn, tp = conf_matrix.ravel() 8 | tn, fp, fn, tp 9 | print(f"Profit: €{tp * 100_000_000}") 10 | print(f"Loses: €{tp * 250_000_000}") 11 | ConfusionMatrixDisplay.from_predictions(y_test, y_pred, colorbar=False) 12 | thresh_widget = widgets.FloatSlider(min=0, max=1, value=0.5, step=0.05) 13 | 14 | interact(make_cnf_matrix, threshold=thresh_widget); 15 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/libraries.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import ClassifierMixin 2 | from sklearn.pipeline import Pipeline 3 | import gzip 4 | import json 5 | import pickle 6 | 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | import pandas as pd 10 | import seaborn as sns 11 | import wqet_grader 12 | from imblearn.over_sampling import RandomOverSampler 13 | from imblearn.under_sampling import RandomUnderSampler 14 | from sklearn.impute import SimpleImputer 15 | from sklearn.metrics import ( 16 | ConfusionMatrixDisplay, 17 | classification_report, 18 | confusion_matrix, 19 | ) 20 | from sklearn.pipeline import make_pipeline 21 | from sklearn.tree import DecisionTreeClassifier 22 | from sklearn.ensemble import RandomForestClassifier 23 | from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split 24 | import ipywidgets as widgets 25 | from ipywidgets import interact 26 | from sklearn.ensemble import GradientBoostingClassifier 27 | from teaching_tools.widgets import ConfusionMatrixWidget 28 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/naNs.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | nans_by_col = df.isna().sum() 4 | print("nans_by_col shape:", nans_by_col.shape) 5 | nans_by_col.head() 6 | 7 | # Sample output 8 | nans_by_col shape: (96,) 9 | bankrupt 0 10 | feat_1 0 11 | feat_2 0 12 | feat_3 0 13 | feat_4 0 14 | dtype: int64 15 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/resampling.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | over_sampler = RandomOverSampler(random_state=42) 4 | X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train) 5 | print("X_train_over shape:", X_train_over.shape) 6 | X_train_over.head() 7 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/save_and_load.py: 
-------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # save model to `Destination` 4 | with open("", "wb") as f: 5 | pickle.dump(model, f) 6 | 7 | # 8 | # Load model from `Destination`` 9 | with open("", "rb") as f: 10 | loaded_model = pickle.load(f) 11 | print(loaded_model) 12 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/splits.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Feature matrix and Target vector 4 | target = "bankrupt" 5 | X = df.drop(columns="bankrupt") 6 | y = df[target] 7 | 8 | 9 | # Training and test split 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, test_size=0.2, random_state=42 12 | ) 13 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/wrangle.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Wrangle function 4 | def wrangle(filePath): 5 | # Open compressed file, load to dict 6 | with gzip.open(filePath, "r") as f: 7 | data = json.load(f) 8 | 9 | # Dictionary --> DataFrame, set index 10 | df = pd.DataFrame().from_dict(data["observations"]).set_index("id") 11 | 12 | return df 13 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/1_import.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("") 2 | print("df shape:", df.shape) 3 | df.head() 4 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/2_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | 4 | # 1 5 | # Percentage of respondents in df that are business owners, 6 | # assign result to the variable pct_biz_owners. 7 | # Review documentation regarding "HBUS" column 8 | # https://sda.berkeley.edu/sdaweb/docs/scfcomb2019/DOC/hcbkfx0.htm 9 | 10 | pct_biz_owners = sum(df["HBUS"]) / (sum(df["HBUS"] == 0) + sum(df["HBUS"])) 11 | print("% of business owners in df:", pct_biz_owners) 12 | 13 | # 2 14 | # DataFrame df_inccat showing normalized frequency 15 | # for income categories for business owners and non-business owners 16 | 17 | inccat_dict = { 18 | 1: "0-20", 19 | 2: "21-39.9", 20 | 3: "40-59.9", 21 | 4: "60-79.9", 22 | 5: "80-89.9", 23 | 6: "90-100", 24 | } 25 | 26 | df_inccat = ( 27 | df["INCCAT"] 28 | .replace(inccat_dict) 29 | .groupby(df["HBUS"]) 30 | .value_counts(normalize=True) 31 | .rename("frequency") 32 | .to_frame() 33 | .reset_index() 34 | ) 35 | 36 | df_inccat 37 | 38 | # 3 39 | # Seaborn, create a side-by-side bar chart of df_inccat 40 | 41 | sns.barplot( 42 | x="INCCAT", 43 | y="frequency", 44 | hue="HBUS", 45 | data=df_inccat, 46 | order=inccat_dict.values() 47 | ) 48 | plt.xlabel("") 49 | plt.ylabel("") 50 | plt.title(""); 51 | 52 | # 4 53 | # create a scatter plot that shows "HOUSES" vs. "DEBT" 54 | 55 | sns.scatterplot(x=df["DEBT"] / 1e6, y=df["HOUSES"] / 1e6, palette="deep") 56 | plt.xlabel("Household Debt") 57 | plt.ylabel("Home Value") 58 | plt.title("Home Value vs. 
Household Debt"); 59 | 60 | # 5 61 | # New DataFrame df_small_biz containing 62 | # only business owners whose income is below $500,000 63 | 64 | mask = (df["HBUS"]) & (df["INCOME"] < 500_000) 65 | df_small_biz = df[mask] 66 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/3_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | 5 | # 6 6 | # Histogram from the "AGE" column 7 | # in df_small_biz with 10 bins 8 | df_small_biz["AGE"].hist(bins=10) 9 | plt.xlabel("Your x_Label") 10 | plt.ylabel("Your y_Label") 11 | plt.title("Your Title"); 12 | 13 | # 7 14 | # Variance for all the features in df_small_biz, 15 | # create Series top_ten_var with 10 features with largest variance 16 | top_ten_var = df_small_biz.var().sort_values().tail(10) 17 | top_ten_var 18 | 19 | # 8 20 | # trimmed variance for the features in df_small_biz 21 | # not include the top and bottom 10% of observations 22 | top_ten_trim_var = df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10) 23 | top_ten_trim_var 24 | 25 | # 9 26 | # create a horizontal bar chart of top_ten_trim_var 27 | fig = px.bar( 28 | x=top_ten_trim_var, 29 | y=top_ten_trim_var.index, 30 | title="High Var Feat" 31 | ) 32 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 33 | 34 | # 10 35 | # Create list: high_var_cols, 36 | # with the column names of the five features 37 | # with the highest trimmed variance 38 | high_var_cols = top_ten_trim_var.tail(5).index.to_list() 39 | high_var_cols 40 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/4_split.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | 6 | # Feature matrix X containing five columns in high_var_cols 7 | X = df_small_biz[high_var_cols] 8 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/6_communicate.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | import model 7 | 8 | # 16 9 | # DataFrame xgb containing mean values 10 | # of the features in X for the 3 clusters 11 | # in your final_model 12 | labels = final_model.named_steps["kmeans"].labels_ 13 | xgb = X.groupby(labels).mean() 14 | xgb 15 | 16 | # 17 17 | # create side-by-side bar chart from xgb 18 | # showing mean of the features in X 19 | # for each of the clusters in your final_model 20 | fig = px.bar( 21 | xgb, 22 | barmode="group", 23 | title="Your Title" 24 | ) 25 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 26 | 27 | # 18 28 | # Create a PCA transformer, 29 | # reduce the dimensionality of X to 2, 30 | # and then put the transformed data into a DataFrame 31 | pca = PCA(n_components=2, random_state=42) 32 | 33 | X_t = pca.fit_transform(X) 34 | 35 | X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"]) 36 | 37 | # 19 38 | # create a scatter plot of X_pca using seaborn 39 | fig = px.scatter( 40 | data_frame=X_pca, 41 | x="PC1", 42 | y="PC2", 43 | color=labels.astype(str), 44 | title="PCA Representation of Clusters" 45 | ) 46 | fig.update_layout(xaxis_title="PC1", yaxis_title="PC2") 47 | 
-------------------------------------------------------------------------------- /060-consumer-finances in-usa/README.md: -------------------------------------------------------------------------------- 1 | # 060-consumer-finance-in-usa 2 | ## Unsupervised learning, specifically clustering 3 | 4 | - Side-by-side bar chart 5 | - K-means clustering model 6 | - Clustering-2-features vs -multiple-features 7 | - Feature selection based on variance 8 | - Principal component analysis (PCA) 9 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/libraries.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import plotly.express as px 3 | import pandas as pd 4 | import seaborn as sns 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import silhouette_score 7 | from teaching_tools.widgets import ClusterWidget, SCFClusterWidget 8 | from scipy.stats.mstats import trimmed_var 9 | from sklearn.decomposition import PCA 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/model.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | 7 | # 12 8 | # Iteratively build and train a K-Means 9 | # model where n_clusters ranges [2, 12] 10 | 11 | n_clusters = range(2, 13) 12 | inertia_errors = [] 13 | silhouette_scores = [] 14 | 15 | # Use for loop 16 | for k in n_clusters: 17 | # Build 18 | model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42)) 19 | # Train 20 | model.fit(X) 21 | # Calculate inertia 22 | inertia_errors.append(model.named_steps["kmeans"].inertia_) 23 | # Calculate silhouette score 24 | silhouette_scores.append( 25 | silhouette_score(X, model.named_steps["kmeans"].labels_) 26 | ) 27 | 28 | print("Inertia:", inertia_errors[:10]) 29 | print() 30 | print("Silhouette Scores:", silhouette_scores[:3]) 31 | 32 | # 13 33 | # Line plot showing values of 34 | # inertia_errors as a function of n_clusters 35 | 36 | fig = px.line( 37 | x=n_clusters, y=inertia_errors, title="Your Title" 38 | ) 39 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 40 | 41 | # 14 42 | # Line plot showing values of 43 | # silhouette_scores as a function of n_clusters 44 | 45 | fig = px.line( 46 | x=n_clusters, y=silhouette_scores, title="Your Title" 47 | ) 48 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 49 | 50 | # 15 51 | # Build and train a new k-means model 52 | # n_clusters: 3 53 | # random state: 42 54 | 55 | final_model = make_pipeline( 56 | StandardScaler(), 57 | KMeans(n_clusters=3, random_state=42) 58 | ) 59 | final_model.fit(X) 60 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/README.md: -------------------------------------------------------------------------------- 1 | ## 070-ds-admissions-in-wqu 2 | 3 | ### Contents... 4 | > EDA. 5 | 6 | > ETL. 7 | 8 | > Chi-Square test. 9 | 10 | > Interactive dashboard. 
11 | 12 | 13 | ![image](https://user-images.githubusercontent.com/99328720/189812167-668064f1-7ee3-4a5c-9ae7-638101e5e9f9.png) 14 | 15 | 16 | 17 | ![image](https://user-images.githubusercontent.com/99328720/189812222-a33a9bee-42cf-481e-a3d1-047cb69859e8.png) 18 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/aggregate.py: -------------------------------------------------------------------------------- 1 | """ Using the aggregate method() """ 2 | 3 | import imports 4 | 5 | 6 | # aggregate by nationality 7 | result = .aggregate( 8 | [ 9 | { 10 | "$group": {"_id": "$countryISO2", "count": {"$count": {}}} 11 | } 12 | ] 13 | ) 14 | 15 | 16 | # aggregate by sign-up 17 | result = .aggregate( 18 | [ 19 | { 20 | "$match": {"admissionsQuiz": "incomplete"} 21 | }, 22 | { 23 | "$group": { 24 | "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}}, 25 | "count": {"$sum": 1} 26 | } 27 | } 28 | ] 29 | ) 30 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/choropleth_map.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import load 5 | 6 | 7 | # `build_nat_choropleth` function 8 | ["count_pct"] = (["count"] / ["count"].sum()) * 100 9 | 10 | 11 | def build_nat_choropleth(): 12 | fig = px.choropleth( 13 | data_frame= , 14 | locations="country_iso3", 15 | color="count_pct", 16 | projection="natural earth", 17 | color_continuous_scale=px.colors.sequential.Oranges, 18 | title="Title" 19 | ) 20 | return fig 21 | 22 | # Display image 23 | nat_fig = build_nat_choropleth() 24 | nat_fig.write_image("images/7-5-4.png", scale=1, height=500, width=700) 25 | 26 | nat_fig.show() 27 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/connect.py: -------------------------------------------------------------------------------- 1 | """ Connecting to the Database """ 2 | 3 | import imports 4 | 5 | 6 | # Connect to database 7 | # Access a certain collection 8 | 9 | # Create a Mongo-`client` 10 | client = MongoClient(host="localhost", port=) 11 | 12 | # Create a database: `db` 13 | db = client["wqu-abtest"] 14 | 15 | # Find your collection: `""` 16 | mscfe_app = db[""] 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/contingency_bar.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import crosstab 5 | 6 | 7 | # `build_contingency_bar` function 8 | def build_contingency_bar(): 9 | # side-by-side bar chart 10 | fig = px.bar( 11 | data_frame=data, 12 | barmode="group", 13 | title="TITLE" 14 | ) 15 | # Set axis labels 16 | fig.update_layout(xaxis_title="XTITLE", yaxis_title="YTITLE") 17 | return fig 18 | 19 | # Display 20 | cb_fig = build_contingency_bar() 21 | cb_fig.write_image("images/7-5-16.png", scale=1, height=500, width=700) 22 | 23 | cb_fig.show() 24 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/contingency_table.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | # contingency table 7 | contingency_table = Table2x2(data.values) 8 | 9 | # chi-square test 10 | chi_square_test = contingency_table.test_nominal_association() 11 | 12 | # odds ratio 13 | odds_ratio = 
contingency_table.oddsratio.round(1) 14 | 15 | # summary... 16 | summary = contingency_table.summary() 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/country_converter.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import load 4 | 5 | # Instantiate `CountryConverter` 6 | cc = CountryConverter() 7 | 8 | # Create new columns ... full country names 9 | ["country_name"] = cc.convert( 10 | ["country_iso2"], to="name_short" 11 | ) 12 | 13 | # ... three letter abbv country names 14 | ["country_iso3"] = cc.convert( 15 | ["country_iso2"], to="ISO3" 16 | ) 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/crosstab.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | data = pd.crosstab( 7 | index=["group"], 8 | columns=["admissionsQuiz"], 9 | normalize=False 10 | ) 11 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/imports.py: -------------------------------------------------------------------------------- 1 | """ Module containing all the needed libraries """ 2 | 3 | 4 | from statsmodels.stats.contingency_tables import Table2x2 5 | from statsmodels.stats.power import GofChisquarePower 6 | from teaching_tools.ab_test.experiment import Experiment 7 | from country_converter import CountryConverter 8 | from pymongo.collection import Collection 9 | from pymongo import MongoClient 10 | from pprint import PrettyPrinter 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import numpy as np 14 | import random 15 | import math 16 | import scipy 17 | import plotly.express as px 18 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/load.py: -------------------------------------------------------------------------------- 1 | """ Loading into a data frame """ 2 | 3 | import aggregate 4 | 5 | # aggregated by nationality 6 | = pd.DataFrame(result).rename( 7 | {"_id": "country_iso2"}, axis="columns").sort_values("count") 8 | 9 | 10 | 11 | # aggregated by sign up 12 | = ( 13 | pd.DataFrame(result) 14 | .rename({"_id": "date", "count": "new_users"}, axis=1) 15 | .set_index("date") 16 | .sort_index() 17 | .squeeze() 18 | ) 19 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/mongo_instance.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | from our_mongo_class import MongoRepository 5 | 6 | 7 | # An instance of class MongoRepository 8 | repo = MongoRepository() 9 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/our_mongo_class.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | class MongoRepository: 7 | """Repository for interacting with MongoDB database. 8 | 9 | Params 10 | ---------- 11 | client : `pymongo.MongoClient` 12 | Default, `MongoClient(host='localhost', port=)`. 13 | db : str 14 | Default, `''`. 15 | collection : str 16 | Default, `'`. 17 | 18 | Attributes 19 | ---------- 20 | collection : pymongo.collection.Collection 21 | All data will be extracted from and loaded to this collection. 
22 | """ 23 | 24 | # `__init__` method 25 | def __init__( 26 | self, 27 | client=MongoClient(host="localhost", port=), 28 | db="''", 29 | collection="`'" 30 | ): 31 | self.collection = client[db][collection] 32 | 33 | # `find_by_date` method 34 | def find_by_date(self, date_string): 35 | 36 | # Convert `date_string` to datetime object 37 | start = pd.to_datetime(date_string, format="%Y-%m-%d") 38 | 39 | # Offset `start` by 1 day 40 | end = start + pd.DateOffset(days=1) 41 | 42 | # Create PyMongo query for no-quiz applicants b/t `start` and `end` 43 | query = {"createdAt": {"$gte": start, "$lt": end}, "admissionsQuiz": "incomplete"} 44 | 45 | # Query collection, get result 46 | result = self.collection.find(query) 47 | 48 | # Convert `result` to list 49 | observations = list(result) 50 | 51 | # REMOVE} 52 | return observations 53 | 54 | 55 | # `update_applicants` method 56 | def update_applicants(self, observations_assigned): 57 | n = 0 58 | n_modified = 0 59 | 60 | for doc in observations_assigned: 61 | result = self.collection.update_one( 62 | filter={"_id": doc["_id"]}, 63 | update={"$set": doc} 64 | ) 65 | n += result.matched_count 66 | n_modified += result.modified_count 67 | transaction_result = {"n": n, "nModified": n_modified} 68 | return transaction_result 69 | 70 | 71 | # `assign_to_groups` method 72 | def assign_to_groups(self, date_string): 73 | 74 | # get observations 75 | observations = self.find_by_date(date_string) 76 | 77 | # Shuffle `observations` 78 | random.seed(42) 79 | random.shuffle(observations) 80 | 81 | # Get index position of item at observations halfway point 82 | idx = len(observations) // 2 83 | 84 | # Assign first half of observations to control group 85 | for doc in observations[:idx]: 86 | doc["inExperiment"] = True 87 | doc["group"] = "no email (control)" 88 | 89 | # Assign second half of observations to treatment group 90 | for doc in observations[idx:]: 91 | doc["inExperiment"] = True 92 | doc["group"] = "email (treatment)" 93 | 94 | # Update collections 95 | result = self.update_applicants(observations) 96 | return result 97 | 98 | # `find_exp_observations` method 99 | def find_exp_observations(self): 100 | result = self.collection.find({"inExperiment": True}) 101 | return list(result) 102 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/probability.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import statistical_summary 4 | 5 | 6 | prob_65_or_fewer = scipy.stats.norm.cdf( 7 | group_size * 2, 8 | loc=sum_mean, 9 | scale=sum_std 10 | ) 11 | prob_65_or_greater = 1 - prob_65_or_fewer 12 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/run_exp.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import connect 5 | import mongo_instance 6 | 7 | 8 | exp = Experiment(repo=client, db="yourDatabase", collection="yourCollection") 9 | exp.reset_experiment() 10 | result = exp.run_experiment(days=exp_days, assignment=True) 11 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/statistic_power.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | chi_square_power = GofChisquarePower() 7 | group_size = math.ceil(chi_square_power.solve_power( 8 | effect_size=0.5, # medium --> 0.5; small --> 
0.2; large --> 0.8
9 | alpha=0.05,
10 | power=0.8
11 | ))
12 | 
-------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/statistical_summary.py: --------------------------------------------------------------------------------
1 | """ """
2 | 
3 | import imports
4 | import load
5 | import aggregate
6 | 
7 | 
8 | mean = .describe()["mean"]
9 | std = .describe()["std"]
10 | 
11 | 
12 | # sum...
13 | exp_days = 
14 | sum_mean = mean * exp_days
15 | sum_std = std * math.sqrt(exp_days)
16 | 
-------------------------------------------------------------------------------- /080-volatility-forecasting-in-india/README.md: --------------------------------------------------------------------------------
1 | ## Market / Volatility Forecasting in India
2 | 
3 | - API Design
4 | - HTTP Requests
5 | - SQL
6 | - SQLite3
7 | - GARCH (a short sketch follows the root README at the end of this document)
8 | - Model deployment
9 | - Test-Driven Development
10 | - Python custom classes
11 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Data-Science-Lab
2 | 
3 | ## WQU DATA SCIENCE LAB PROJECTS
4 | 
5 | - 010-housing-in-mexico
6 | - 020-housing-in-buenos-aires
7 | - 030-air-quality-in-nairobi
8 | - 040-earthquake-damage-in-nepal
9 | - 050-bankruptcy-in-poland
10 | - 060-consumer-finances in-usa
11 | - 070-ds-admissions-in-wqu
12 | - 080-volatility-forecasting-in-india
13 | 
--------------------------------------------------------------------------------
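The 080 project folder above ships only a README, so here, as a closing illustration, is a minimal GARCH(1,1) sketch of the kind of model that project lists. It assumes the `arch` package and a pandas Series of daily percentage returns named `returns`; neither the package call sites nor the variable names come from this repository's files.

from arch import arch_model

# Fit a GARCH(1,1) model to the returns series
model = arch_model(returns, p=1, q=1, rescale=False)
results = model.fit(disp="off")
print(results.summary())

# One-step-ahead volatility forecast (standard deviation, same units as returns)
forecast = results.forecast(horizon=1)
next_day_volatility = forecast.variance.iloc[-1, 0] ** 0.5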