├── 010-housing-in-mexico-015-assignment ├── Add new column, drop a column.py ├── Boxplot.py ├── Concat function.py ├── Create new columns.py ├── Drop NaNs.py ├── Drop columns.py ├── Histogram.py ├── Import CSV.py ├── Inspect data.py ├── README.md ├── bar chart.py ├── correlation coefficient.py ├── describe () method.py ├── groupby() method.py ├── scatter plot.py ├── scatter_mapbox.py └── value_counts() method.py ├── 020-housing-in-buenos-aires ├── README.md ├── baseline.py ├── glob ().py ├── important DS libraries.py ├── model: build & fit.py ├── predict.py ├── retrieve data.py ├── split.py └── wrangle () function.py ├── 030-air-quality-in-nairobi ├── ACFplot.py ├── ARmodel.py ├── Baseline.py ├── MongoDB.py ├── PACFplot.py ├── PrettyPrinter.py ├── README.md ├── aggregate().py ├── communicate.py ├── distinct().py ├── finalModel.py ├── libraries.py ├── rollingAvg.py ├── split.py ├── timeSeriesplot.py ├── wfv.py └── wrangle().py ├── 040-earthquake-damage-in-nepal ├── 1) libraries.py ├── 2) connect.py ├── 3) get_tables.py ├── 4) explore_tables.py ├── 5) JOIN.py ├── 6) wrangle().py ├── 7) barChart.py ├── 8) boxplot.py ├── 9) pivot_table.py ├── 91) vertical_split.py ├── 92) horizontal_split.py ├── 93) baseline.py ├── 94) log_reg.py ├── 95) accuracy_score.py ├── 96) decision_tree.py ├── 97) validation_curve.py ├── 98) tests.py ├── 99) communicate.py ├── 991) others.py └── README.md ├── 050-bankruptcy-in-poland ├── GridSearchCV.py ├── README.md ├── acc_score.py ├── bar.py ├── barh.py ├── best_params.py ├── classif_reports.py ├── clf_cv.py ├── conf_matrix.py ├── import.py ├── interactive_dash.py ├── libraries.py ├── naNs.py ├── resampling.py ├── save_and_load.py ├── splits.py └── wrangle.py ├── 060-consumer-finances in-usa ├── 1_import.py ├── 2_explore.py ├── 3_explore.py ├── 4_split.py ├── 6_communicate.py ├── README.md ├── libraries.py └── model.py ├── 070-ds-admissions-in-wqu ├── README.md ├── aggregate.py ├── choropleth_map.py ├── connect.py ├── contingency_bar.py ├── contingency_table.py ├── country_converter.py ├── crosstab.py ├── imports.py ├── load.py ├── mongo_instance.py ├── our_mongo_class.py ├── probability.py ├── run_exp.py ├── statistic_power.py └── statistical_summary.py ├── 080-volatility-forecasting-in-india └── README.md └── README.md /010-housing-in-mexico-015-assignment/Add new column, drop a column.py: -------------------------------------------------------------------------------- 1 | df2.head() 2 | 3 | property_type state region lat lon area_m2 price_brl 4 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 5 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 6 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 7 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 8 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36 9 | 10 | # price_brl ---> price in Brazilian reals 11 | # create new column price_usd 12 | # use 1 USD = 3.19 Brazilian reals 13 | 14 | df2["price_usd"] = df2["price_brl"] / 3.19 15 | df2.head() 16 | 17 | property_type state region lat lon area_m2 price_brl price_usd 18 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 129850.463950 19 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 265958.786834 20 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 93867.799373 21 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 265958.786834 22 | 4 apartment Pernambuco Northeast -8.142666 
-34.906906 80.0 464129.36 145495.097179 23 | 24 | 25 | # DROP COLUMNS 26 | # drop price_brl 27 | 28 | df2 = df2.drop("price_brl", axis="columns") 29 | df2.head() 30 | 31 | property_type state region lat lon area_m2 price_usd 32 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 129850.463950 33 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 265958.786834 34 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 93867.799373 35 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 265958.786834 36 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 145495.097179 37 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Boxplot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.boxplot(df["area_m2"]) 4 | plt.xlabel("Area [sq meters]") 5 | plt.title("Distribution of Home Sizes") 6 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Concat function.py: -------------------------------------------------------------------------------- 1 | # concatenate 2 data frames using concat 2 | 3 | df = pd.concat([df1, df2]) 4 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Create new columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | # Output 4 | property_type place_with_parent_names region lat-lon area_m2 price_usd 5 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 6 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 7 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 8 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 9 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 10 | 11 | df1.info() 12 | 13 | # Output 14 | 15 | RangeIndex: 12834 entries, 0 to 12833 16 | Data columns (total 6 columns): 17 | # Column Non-Null Count Dtype 18 | --- ------ -------------- ----- 19 | 0 property_type 12834 non-null object 20 | 1 place_with_parent_names 12834 non-null object 21 | 2 region 12834 non-null object 22 | 3 lat-lon 11551 non-null object 23 | 4 area_m2 12834 non-null float64 24 | 5 price_usd 12834 non-null object 25 | dtypes: float64(1), object(5) 26 | memory usage: 601.7+ KB 27 | 28 | 29 | df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True) 30 | 31 | # expand ---> increase size of data frame 32 | # without replacing 33 | 34 | df1["lat"] = df1.lat.astype(float) # change lat and lon from type object(string) to type float 35 | df1["lon"] = df1.lon.astype(float) 36 | df1.shape 37 | 38 | # Output 39 | (11551, 8) 40 | 41 | # Example 2 42 | 43 | df1["state"] = df1["place_with_parent_names"].str.split("|", expand=True)[2] 44 | df1.head() 45 | 46 | # Output 47 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 48 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 49 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 50 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 51 | 3 apartment |Brasil|Alagoas|Maceió| 
Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 52 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 53 | 54 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Drop NaNs.py: -------------------------------------------------------------------------------- 1 | # df1.shape before dropping NaNs ---> (12834, 6) 2 | 3 | df1.dropna(inplace=True) # drop rows with null values 4 | df1.shape 5 | 6 | # Output 7 | (11551, 6) 8 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Drop columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 4 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 6 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 9 | 10 | df1 = df1.drop(["lat-lon", "place_with_parent_names"], axis="columns") 11 | df1.head() 12 | 13 | property_type region area_m2 price_usd lat lon state 14 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 15 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 16 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 17 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 18 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 19 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Histogram.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.hist(df["price_usd"]) 4 | plt.xlabel("Price [USD]") 5 | plt.ylabel("Frequency") 6 | plt.title("Distribution of Home Prices") 7 | 8 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Import CSV.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import plotly.express as px 4 | 5 | df1 = pd.read_csv("data/brasil-real-estate-1.csv") 6 | df1.shape 7 | 8 | #Output 9 | (12834, 6) 10 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/Inspect data.py: -------------------------------------------------------------------------------- 1 | df1.info() 2 | 3 | # Output 4 | 5 | RangeIndex: 12834 entries, 0 to 12833 6 | Data columns (total 6 columns): 7 | # Column Non-Null Count Dtype 8 | --- ------ -------------- ----- 9 | 0 property_type 12834 non-null object 10 | 1 place_with_parent_names 12834 non-null object 11 | 2 region 12834 non-null object 12 | 3 lat-lon 11551 non-null object 13 | 4 area_m2 12834 non-null float64 14 | 5 price_usd 12834 non-null object 15 | dtypes: float64(1), object(5) 16 | memory usage: 601.7+ KB 17 | 18 | df1.shape 19 
| 20 | # Output 21 | (12834, 6) 22 | 23 | 24 | df1.head() # Displays the first 5 rows starting from 0 25 | 26 | #Output 27 | property_type place_with_parent_names region lat-lon area_m2 price_usd 28 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 29 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 30 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 31 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 32 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 33 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/README.md: -------------------------------------------------------------------------------- 1 | # work-ds-curriculum-010-housing-in-mexico 2 | 3 | WQU DATA SCIENCE LAB PROJECT 1 4 | HOUSING IN MEXICO 5 | 6 | Key concepts learnt and their application 7 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/bar chart.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | mean_price_by_region.plot( 4 | kind="bar", 5 | xlabel="Region", 6 | ylabel="Mean Price [USD]", 7 | title="Mean Home Price by Region" 8 | ); 9 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/correlation coefficient.py: -------------------------------------------------------------------------------- 1 | corr1= homes_by_state["area_m2"].corr(homes_by_state["price_usd"]) 2 | 3 | 0.5773267433717683 # more than half 4 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/describe () method.py: -------------------------------------------------------------------------------- 1 | df.head() 2 | 3 | property_type region area_m2 price_usd lat lon state 4 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 6 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 9 | 10 | dfa = df[["area_m2", "price_usd"]] # subset for a data frame 11 | summary_stats = dfa.describe() 12 | summary_stats 13 | 14 | area_m2 price_usd 15 | count 22844.000000 22844.000000 16 | mean 115.020224 194987.315480 17 | std 47.742932 103617.682978 18 | min 53.000000 74892.340000 19 | 25% 76.000000 113898.770000 20 | 50% 103.000000 165697.555000 21 | 75% 142.000000 246900.880878 22 | max 252.000000 525659.717868 23 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/groupby() method.py: -------------------------------------------------------------------------------- 1 | mean_price_by_region = df.groupby("region")["price_usd"].mean().sort_values(ascending=True) 2 | mean_price_by_region.head() 3 | 4 | region 5 | Central-West 178596.283663 6 | North 181308.958207 7 | Northeast 185422.985441 8 | South 189012.345265 9 | Southeast 208996.762778 10 | Name: price_usd, dtype: float64 11 | -------------------------------------------------------------------------------- /010-housing-in-mexico-015-assignment/scatter plot.py: 
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | plt.scatter(x=homes_by_state["area_m2"], y=homes_by_state["price_usd"])
4 | plt.xlabel("Area [sq meters]")
5 | plt.ylabel("Price [USD]")
6 | plt.title("Rio Grande do Sul: Price vs. Area");
7 | 
--------------------------------------------------------------------------------
/010-housing-in-mexico-015-assignment/scatter_mapbox.py:
--------------------------------------------------------------------------------
1 | import plotly.express as px
2 | 
3 | fig = px.scatter_mapbox(
4 |     df,
5 |     lat="lat",
6 |     lon="lon",
7 |     center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil
8 |     width=600,
9 |     height=600,
10 |     hover_data=["price_usd"], # Display price when hovering mouse over house
11 | )
12 | 
13 | fig.update_layout(mapbox_style="open-street-map")
14 | 
15 | fig.show()
16 | 
--------------------------------------------------------------------------------
/010-housing-in-mexico-015-assignment/value_counts() method.py:
--------------------------------------------------------------------------------
1 | # counts the occurrences of each unique value in a column
2 | 
3 | homes_by_state = df_south["state"].value_counts()
4 | homes_by_state
5 | 
6 | 
7 | Rio Grande do Sul 2643
8 | Santa Catarina 2634
9 | Paraná 2544
10 | Name: state, dtype: int64
11 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/README.md:
--------------------------------------------------------------------------------
1 | # 020-housing-in-buenos-aires/025-assignment.ipynb
2 | 
3 | New concepts learnt in this project
4 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/baseline.py:
--------------------------------------------------------------------------------
1 | # Done after splitting data
2 | # into a feature matrix and target vector
3 | 
4 | from sklearn.metrics import mean_absolute_error
5 | 
6 | y_mean = y_train.mean()
7 | y_pred_baseline = [y_mean] * len(y_train)
8 | baseline_mae = mean_absolute_error(y_train, y_pred_baseline) # what our model needs to beat
9 | 
--------------------------------------------------------------------------------
/020-housing-in-buenos-aires/glob ().py:
--------------------------------------------------------------------------------
1 | # create a list of file paths that share a similar format
2 | from glob import glob
3 | glob("<prefix>-*.<extension>")
4 | 5 | # example 6 | glob("data/programfiles/excel-*.csv") 7 | 8 | # output 9 | ['data/programfiles/excel-1.csv', 10 | 'data/programfiles/excel-4.csv', 11 | 'data/programfiles/excel-3.csv', 12 | 'data/programfiles/excel-5.csv', 13 | 'data/programfiles/excel-2.csv'] 14 | 15 | sorted(glob("data/programfiles/excel-*.csv")) 16 | 17 | ['data/programfiles/excel-1.csv', 18 | 'data/programfiles/excel-2.csv', 19 | 'data/programfiles/excel-3.csv', 20 | 'data/programfiles/excel-4.csv', 21 | 'data/programfiles/excel-5.csv'] 22 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/important DS libraries.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import warnings 5 | warnings.simplefilter(action="ignore", category=FutureWarning) 6 | import plotly.express as px 7 | import pandas as pd 8 | import seaborn as sns 9 | from category_encoders import OneHotEncoder 10 | from IPython.display import VimeoVideo 11 | from ipywidgets import Dropdown, FloatSlider, IntSlider, interact 12 | from sklearn.impute import SimpleImputer 13 | from sklearn.linear_model import LinearRegression, Ridge # noqa F401 14 | from sklearn.metrics import mean_absolute_error 15 | from sklearn.pipeline import make_pipeline 16 | from sklearn.utils.validation import check_is_fitted 17 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/model: build & fit.py: -------------------------------------------------------------------------------- 1 | from category_encoders import OneHotEncoder 2 | from sklearn.impute import SimpleImputer 3 | from sklearn.linear_model import LinearRegression, Ridge 4 | from sklearn.pipeline import make_pipeline 5 | 6 | # build 7 | model = make_pipeline( 8 | OneHotEncoder(use_cat_names=True), 9 | SimpleImputer(), 10 | Ridge() 11 | ) 12 | 13 | # fit... 
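# (fitting the pipeline runs OneHotEncoder and SimpleImputer on X_train first, then trains
#  Ridge on the transformed features; make_pipeline names the steps after their lowercased
#  class names — "onehotencoder", "simpleimputer", "ridge" — which is how named_steps is
#  indexed later in retrieve data.py)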
14 | model.fit(X_train, y_train) 15 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/predict.py: -------------------------------------------------------------------------------- 1 | y_test_pred = pd.Series(model.predict(X_test)) 2 | y_test_pred.head() 3 | 4 | # sample output 5 | 0 53538.366480 6 | 1 53171.988369 7 | 2 34263.884179 8 | 3 53488.425607 9 | 4 68738.924884 10 | dtype: float64 11 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/retrieve data.py: -------------------------------------------------------------------------------- 1 | # retrieve intercept 2 | intercept = model.named_steps["ridge"].intercept_ 3 | 4 | # retrieve coefficients 5 | coefficients = model.named_steps["ridge"].coef_ 6 | 7 | # retrieve names 8 | features = model.named_steps["onehotencoder"].get_feature_names() 9 | 10 | # create a series of names and values 11 | feat_imp = pd.Series(coefficients, index=features) 12 | feat_imp 13 | 14 | # sample output 15 | surface_covered_in_m2 291.654156 16 | lat 478.901375 17 | lon -2492.221814 18 | borough_Benito Juárez 13778.188880 19 | borough_Iztacalco 405.403127 20 | borough_Azcapotzalco 2459.288646 21 | borough_Coyoacán 3737.561001 22 | borough_Álvaro Obregón 3275.121061 23 | borough_Iztapalapa -13349.017448 24 | borough_Cuauhtémoc -350.531990 25 | borough_Tláhuac -14166.869486 26 | borough_Miguel Hidalgo 1977.314718 27 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/split.py: -------------------------------------------------------------------------------- 1 | # splitting data into feature matrix and target vector 2 | 3 | target = "price_aprox_usd" # <--- vector 4 | features = ["surface_covered_in_m2", "lat", "lon", "borough"] # <--- matrix 5 | X_train = df[features] # training data 6 | y_train = df[target] # " " " " 7 | 8 | # The vector is what we are trying to predict using the matrix 9 | # In this case we are trying to predict the price of a property 10 | # using the features in the matrix 11 | -------------------------------------------------------------------------------- /020-housing-in-buenos-aires/wrangle () function.py: -------------------------------------------------------------------------------- 1 | # Wrangle function: 2 | # read in a csv file 3 | # apartments in < $100000 4 | # remove outliers 5 | # separate columns 6 | # create new columns from existing 7 | # take care of highly null columns 8 | # low and high cardinality 9 | # Leakage 10 | # multicolinearity 11 | 12 | def wrangle(filepath): 13 | # Read CSV file 14 | df = pd.read_csv(filepath) 15 | 16 | # Subset data: Apartments in , less than 100,000 17 | mask_ba = df["place_with_parent_names"].str.contains() 18 | mask_apt = df["property_type"] == "apartment" 19 | mask_price = df["price_aprox_usd"] < 100_000 20 | df = df[mask_ba & mask_apt & mask_price] 21 | 22 | # Subset data: Remove outliers for "surface_covered_in_m2" 23 | low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9]) 24 | mask_area = df["surface_covered_in_m2"].between(low, high) 25 | df = df[mask_area] 26 | 27 | # split lat-lon column 28 | df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float) 29 | df.drop(columns="lat-lon", inplace=True) 30 | 31 | # Extract newColumnName 32 | df[] = df["place_with_parent_names"].str.split("|", expand=True)[1] 33 | df.drop(columns="place_with_parent_names", inplace=True) 34 | 35 | # Drop 
feature with high null count 36 | df.drop(columns=["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"], inplace=True) 37 | 38 | # Drop low- and high- categorical variables 39 | df.drop(columns=["operation", "property_type", "currency", "properati_url"], inplace=True) 40 | 41 | # Drop leaky columns 42 | df.drop(columns=["price", "price_aprox_local_currency", "price_per_m2"], inplace=True) 43 | 44 | # Drop columns with multi-colinerlity 45 | #df.drop(columns=["surface_total_in_m2", "rooms"], inplace=True) 46 | 47 | return df 48 | 49 | 50 | test1.isnull().sum() / len(test1) # check for highly null columns 51 | test1.select_dtypes("object").nunique() # check for low- and high- categorical variables 52 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/ACFplot.py: -------------------------------------------------------------------------------- 1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 2 | import matplotlib.pyplot as plt 3 | 4 | fig, ax = plt.subplots(figsize=(15, 6)) 5 | plot_acf(y, ax=ax) 6 | plt.xlabel(<"xLabelvalue">) 7 | plt.ylabel(<"yLabelvalue">) 8 | plt.title(<"yourTitle">); 9 | 10 | # Don't delete the code below 👇 11 | plt.savefig("images/3-5-7.png", dpi=150) 12 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/ARmodel.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.ar_model import AutoReg 2 | from sklearn.metrics import mean_absolute_error 3 | 4 | # Use AR model to predict PM2.5 readings 5 | # Hyperparameter --> p 6 | p_params = range(1, 31) 7 | maes = [] 8 | for p in p_params: 9 | #Train model 10 | model = AutoReg(y_train, lags=p).fit() 11 | 12 | #Generate in-sample pred 13 | y_pred = model.predict().dropna() 14 | 15 | #Calculate mae 16 | mae = mean_absolute_error(y_train.iloc[p:], y_pred) 17 | maes.append(mae) 18 | 19 | mae_series = pd.Series(maes, name="mae", index=p_params) 20 | mae_series.head() 21 | 22 | # sample output 23 | 1 0.947888 24 | 2 0.933894 25 | 3 0.920850 26 | 4 0.920153 27 | 5 0.919519 28 | Name: mae, dtype: float64 29 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/Baseline.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import mean_absolute_error 2 | 3 | y_train_mean = y_train.mean() 4 | y_pred_baseline = [y_train_mean] * len(y_train) 5 | mae_baseline = mean_absolute_error(y_train, y_pred_baseline) 6 | 7 | print("Mean P2 Reading:", y_train_mean) 8 | print("Baseline MAE:", mae_baseline) 9 | 10 | # sample output 11 | Mean P2 Reading: 8.617582545265433 12 | Baseline MAE: 4.07658759405218 13 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/MongoDB.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # Connect to server 4 | client = MongoClient(host=<"hostName">, port=) 5 | 6 | # Connect to database 7 | db = client[<"databaseName">] 8 | 9 | # Get collection 10 | dar = db[<"collectionName">] 11 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/PACFplot.py: -------------------------------------------------------------------------------- 1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 2 | import matplotlib.pyplot 
as plt 3 | 4 | fig, ax = plt.subplots(figsize=(15, 6)) 5 | plot_pacf(y, ax=ax) # -----> line showing difference from acf plot <----- 6 | plt.xlabel(<"xLabelvalue">) 7 | plt.ylabel(<"yLabelvalue">) 8 | plt.title(<"yourTitle">); 9 | 10 | # Don't delete the code below 👇 11 | plt.savefig("images/3-5-7.png", dpi=150) 12 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/PrettyPrinter.py: -------------------------------------------------------------------------------- 1 | from pprint import PrettyPrinter 2 | 3 | # Instantiate prettyprinter ----> for nicely formatted output 4 | pp = PrettyPrinter(indent=2) 5 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/README.md: -------------------------------------------------------------------------------- 1 | # 030-Air-Quality-In-Nairobi 2 | 3 | - Data wrangling with MongoDB 4 | - LinearRegression with time Series data 5 | - Autoregressive models 6 | - ARMA models and Hyperparameter tuning 7 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/aggregate().py: -------------------------------------------------------------------------------- 1 | # Determine which collection 2 | # has the most sensor readings 3 | # $ --> introduces sth new 4 | result = dar.aggregate( 5 | [ 6 | {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}} 7 | ] 8 | ) 9 | readings_per_site = list(result) 10 | readings_per_site 11 | 12 | # sample output 13 | [{'_id': 23, 'count': 60020}, {'_id': 11, 'count': 138412}] 14 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/communicate.py: -------------------------------------------------------------------------------- 1 | import plotly.express as px 2 | import pandas as pd 3 | 4 | # Put test and walk-forward validation values 5 | # in a dataframe and plot df 6 | df_pred_test = pd.DataFrame( 7 | {"y_test": y_test, "y_pred_wfv": y_pred_wfv} 8 | ) 9 | fig = px.line(df_pred_test, labels={"value": "PM2.5"}) 10 | fig.update_layout( 11 | title="Dar es Salaam, WFV Predictions", 12 | xaxis_title="Date", 13 | yaxis_title="PM2.5 Level", 14 | ) 15 | 16 | # Don't delete the code below 👇 17 | fig.write_image("images/3-5-18.png", scale=1, height=500, width=700) 18 | 19 | fig.show() 20 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/distinct().py: -------------------------------------------------------------------------------- 1 | # Determine no. of sites in collection 2 | sites = dar.distinct("metadata.site") # dar ---> variable holding collection 3 | sites 4 | 5 | # Sample output 6 | [11, 23] 7 | 8 | # count no. 
of docs at a prticular site 9 | # using count_documents() 10 | dar.count_documents({"metadata.site": 23}) 11 | 12 | # Sample output 13 | 60020 14 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/finalModel.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.ar_model import AutoReg 2 | from statsmodels.tsa.arima.model import ARIMA 3 | 4 | mae_series # locate best_p 5 | best_p = 28 6 | 7 | # build and train model 8 | best_model = AutoReg(y_train, lags=best_p).fit() 9 | 10 | # calculate training residuals for best_model 11 | y_train_resid = best_model.resid 12 | y_train_resid.name = "residuals" 13 | y_train_resid.head() 14 | 15 | # sample output 16 | timestamp 17 | 2018-01-02 07:00:00+03:00 1.732488 18 | 2018-01-02 08:00:00+03:00 -0.381568 19 | 2018-01-02 09:00:00+03:00 -0.560971 20 | 2018-01-02 10:00:00+03:00 -2.215760 21 | 2018-01-02 11:00:00+03:00 0.006468 22 | Freq: H, Name: residuals, dtype: float64 23 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/libraries.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import time 3 | from pprint import PrettyPrinter 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | import plotly.express as px 7 | import seaborn as sns 8 | from pymongo import MongoClient 9 | import pytz 10 | from statsmodels.tsa.ar_model import AutoReg 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.metrics import mean_absolute_error 13 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 14 | from statsmodels.tsa.arima.model import ARIMA 15 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/rollingAvg.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | fig, ax = plt.subplots(figsize=(15, 6)) 4 | y.rolling(168).mean().plot(ax=ax, xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average"); 5 | # --> 168 == num of hours in a week 6 | 7 | # Don't delete the code below 👇 8 | plt.savefig("images/3-5-6.png", dpi=150) 9 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/split.py: -------------------------------------------------------------------------------- 1 | # percentage ---> 90% (0.9), 80% (0.8) ... 
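# e.g. for the 90/10 split implied by the sample output below (1,533 train vs. 171 test):
# cutoff_test = int(len(y) * 0.90)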
2 | cutoff_test = int(len(y) * ) 3 | y_train = y.iloc[:cutoff_test] 4 | y_test = y.iloc[cutoff_test:] 5 | print("y_train shape:", y_train.shape) 6 | print("y_test shape:", y_test.shape) 7 | 8 | # sample output 9 | y_train shape: (1533,) 10 | y_test shape: (171,) 11 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/timeSeriesplot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | fig, ax = plt.subplots(figsize=(15, 6)) 4 | y.plot(xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels", ax=ax); 5 | 6 | # Don't delete the code below 👇 7 | plt.savefig("images/3-5-5.png", dpi=150) 8 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/wfv.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.arima.model import ARIMA 2 | 3 | # walk-forward validation for model for test data --> y_test 4 | # predictions stored in series: y_pred_wfv 5 | y_pred_wfv = pd.Series() 6 | history = y_train.copy() 7 | for i in range(len(y_test)): 8 | model = AutoReg(history, lags=best_p).fit() 9 | next_pred = model.forecast() # next value after end of history 10 | y_pred_wfv = y_pred_wfv.append(next_pred) 11 | history = history.append(y_test[next_pred.index]) 12 | 13 | y_pred_wfv.name = "prediction" 14 | y_pred_wfv.index.name = "timestamp" 15 | y_pred_wfv.head() 16 | 17 | # sample output 18 | timestamp 19 | 2018-03-06 00:00:00+03:00 8.056391 20 | 2018-03-06 01:00:00+03:00 8.681779 21 | 2018-03-06 02:00:00+03:00 6.268951 22 | 2018-03-06 03:00:00+03:00 6.303760 23 | 2018-03-06 04:00:00+03:00 7.171444 24 | Freq: H, Name: prediction, dtype: float64 25 | -------------------------------------------------------------------------------- /030-air-quality-in-nairobi/wrangle().py: -------------------------------------------------------------------------------- 1 | # Wrangle function 2 | # Extract PM2.5 readings 3 | # from collection site with 4 | # most readings 5 | # Localize time 6 | # Remove outliers 7 | # Resample data to provide PM2.5 readings 8 | # for each hour 9 | # impute missing values 10 | # return series 11 | def wrangle(collection): 12 | results = collection.find( 13 | {"metadata.site": 11, "metadata.measurement": "P2"}, 14 | projection={"P2": 1, "timestamp": 1, "_id": 0}, # ---> focus/ limit to only "P2" and timestamp 15 | ) 16 | 17 | df = pd.DataFrame(results).set_index("timestamp") 18 | 19 | # Localize time 20 | df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam") 21 | 22 | # Remove outliers 23 | df = df[df["P2"] < 100] 24 | 25 | # Resample to 1hour period, fill in missing values 26 | y = df["P2"].resample("1H").mean().fillna(method='ffill') 27 | 28 | return y 29 | 30 | # Using wrangle() 31 | y = wrangle(dar) 32 | y.head() 33 | 34 | # sample output 35 | timestamp 36 | 2018-01-01 03:00:00+03:00 9.456327 37 | 2018-01-01 04:00:00+03:00 9.400833 38 | 2018-01-01 05:00:00+03:00 9.331458 39 | 2018-01-01 06:00:00+03:00 9.528776 40 | 2018-01-01 07:00:00+03:00 8.861250 41 | Freq: H, Name: P2, dtype: float64 42 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/1) libraries.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as 
sns
6 | from category_encoders import OneHotEncoder
7 | from category_encoders import OrdinalEncoder
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.metrics import accuracy_score
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.pipeline import Pipeline, make_pipeline
12 | from sklearn.tree import DecisionTreeClassifier, plot_tree
13 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/2) connect.py:
--------------------------------------------------------------------------------
1 | %load_ext sql
2 | %sql sqlite:////home/jovyan/nepal.sqlite
3 | 
4 | # sample output
5 | 'Connected: @/home/jovyan/nepal.sqlite'
6 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/3) get_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT name
3 | FROM sqlite_schema
4 | WHERE type = 'table'
5 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/4) explore_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT distinct(district_id) -- gives unique values of column district_id
3 | FROM id_map -- name of table
4 | 
5 | 
6 | # num of observations in table id_map
7 | # where value of column district_id is 1
8 | %%sql
9 | SELECT count(*)
10 | FROM id_map
11 | WHERE district_id = 1
12 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/5) JOIN.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | -- joining tables on column building_id
3 | SELECT distinct(i.building_id) AS b_id, -- building_id column of table i aliased as b_id
4 |        s.*, -- selects all columns of table s
5 |        d.damage_grade -- select damage_grade column of table d
6 | FROM id_map AS i
7 | JOIN building_structure AS s ON i.building_id = s.building_id
8 | JOIN building_damage AS d ON i.building_id = d.building_id
9 | WHERE district_id = 3
10 | LIMIT 5
11 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/6) wrangle().py:
--------------------------------------------------------------------------------
1 | def wrangle(db_path):
2 |     # Connect to database using connect method
3 |     conn = sqlite3.connect(db_path)
4 | 
5 |     # Construct query
6 |     query = """
7 |         SELECT distinct(i.building_id) AS b_id,
8 |                s.*,
9 |                d.damage_grade
10 |         FROM id_map AS i
11 |         JOIN building_structure AS s ON i.building_id = s.building_id
12 |         JOIN building_damage AS d ON i.building_id = d.building_id
13 |         WHERE district_id = 3
14 |     """
15 | 
16 |     # Read query results into DataFrame
17 |     df = pd.read_sql(query, conn, index_col="b_id")
18 | 
19 |     # Identify leaky columns
20 |     drop_cols = [col for col in df.columns if "post_eq" in col]
21 | 
22 |     # Create binary target
23 |     df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
24 |     df["severe_damage"] = (df["damage_grade"] > 3).astype(int) # encode as 0's and 1's
25 | 
26 |     # Drop old target
27 |     drop_cols.append("damage_grade")
28 | 
29 |     # Drop multicollinearity column
30 |     drop_cols.append("count_floors_pre_eq")
31 | 
32 |     # Drop high-cardinality categorical feature
33 |     drop_cols.append("building_id")
34 | 
35 |     # Drop columns
36 |     df.drop(columns=drop_cols, inplace=True)
37 | 
38 | 
39 |     return df
40 | 
41 | 
42 | # Using wrangle func
43 | df
= wrangle("/home/jovyan/nepal.sqlite") 44 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/7) barChart.py: -------------------------------------------------------------------------------- 1 | # create bar chart using 2 | # severe damage column which 3 | # contains two classes 4 | df["severe_damage"].value_counts(normalize=True).plot( 5 | kind="bar", xlabel="Severe Damage", ylabel="Relative Frequency", title="Class Balance" 6 | ); 7 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/8) boxplot.py: -------------------------------------------------------------------------------- 1 | # severe_damage: column with 2 groups 2 | # plinth_area_sq_ft: column: footprint size of building 3 | 4 | sns.boxplot(x="severe_damage", y="plinth_area_sq_ft", data=df) 5 | plt.xlabel("Severe Damage") 6 | plt.ylabel("Plinth Area [sq. ft.]") 7 | plt.title("Kavrepalanchok, Plinth Area vs Building Damage"); 8 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/9) pivot_table.py: -------------------------------------------------------------------------------- 1 | roof_pivot = pd.pivot_table( 2 | df, index="roof_type", values="severe_damage", aggfunc=np.mean # roof_type: column in table 3 | ).sort_values(by="severe_damage") 4 | roof_pivot 5 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/91) vertical_split.py: -------------------------------------------------------------------------------- 1 | X = df.drop(columns="severe_damage") # feature matrix: all columns apart from severe_damage 2 | y = df["severe_damage"] # target vector 3 | print("X shape:", X.shape) 4 | print("y shape:", y.shape) 5 | 6 | # sample output 7 | X shape: (76533, 11) 8 | y shape: (76533,) 9 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/92) horizontal_split.py: -------------------------------------------------------------------------------- 1 | X_train, X_val, y_train, y_val = train_test_split( 2 | X, y, test_size=0.2, random_state=42 3 | ) 4 | print("X_train shape:", X_train.shape) 5 | print("y_train shape:", y_train.shape) 6 | print("X_val shape:", X_val.shape) 7 | print("y_val shape:", y_val.shape) 8 | 9 | # sample output 10 | X_train shape: (61226, 11) 11 | y_train shape: (61226,) 12 | X_val shape: (15307, 11) 13 | y_val shape: (15307,) 14 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/93) baseline.py: -------------------------------------------------------------------------------- 1 | acc_baseline = y_train.value_counts(normalize=True).max() # normalize gives you the relative freq 2 | print("Baseline Accuracy:", round(acc_baseline, 2)) 3 | 4 | # sample output 5 | Baseline Accuracy: 0.55 6 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/94) log_reg.py: -------------------------------------------------------------------------------- 1 | model_lr = make_pipeline( 2 | OneHotEncoder(use_cat_names=True), 3 | LogisticRegression(max_iter=<1000-3000>) #max_iter: varies: suppresses the 'ConvergenceWarning' 4 | ) 5 | # Fit model to training data 6 | model_lr.fit(X_train, y_train) 7 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/95) 
accuracy_score.py: -------------------------------------------------------------------------------- 1 | lr_train_acc = accuracy_score(y_train, model_lr.predict(X_train)) 2 | lr_val_acc = model_lr.score(X_val, y_val) 3 | 4 | print("Logistic Regression, Training Accuracy Score:", lr_train_acc) 5 | print("Logistic Regression, Validation Accuracy Score:", lr_val_acc) 6 | 7 | # sample output 8 | Logistic Regression, Training Accuracy Score: 0.6515042628948486 9 | Logistic Regression, Validation Accuracy Score: 0.6536878552296335 10 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/96) decision_tree.py: -------------------------------------------------------------------------------- 1 | depth_hyperparams = range(1, 16) # for max_depth 2 | training_acc = [] 3 | validation_acc = [] 4 | for d in depth_hyperparams: 5 | model_dt = make_pipeline( 6 | OrdinalEncoder(), 7 | DecisionTreeClassifier(max_depth= d, random_state=42) 8 | ) 9 | # Fit model to training data 10 | model_dt.fit(X_train, y_train) 11 | # Calculate training accuracy score and append to `training_acc` 12 | training_acc.append(model_dt.score(X_train, y_train)) 13 | # Calculate validation accuracy score and append to `training_acc` 14 | validation_acc.append(model_dt.score(X_val, y_val)) 15 | 16 | print("Training Accuracy Scores:", training_acc[:6]) 17 | print("Validation Accuracy Scores:", validation_acc[:6]) 18 | 19 | 20 | # sample output 21 | Training Accuracy Scores: [0.6303041191650606, 0.6303041191650606, 0.642292490118577, 0.653529546271192, 0.6543951915852743, 0.6576617776761506] 22 | Validation Accuracy Scores: [0.6350035931273273, 0.6350035931273273, 0.6453909975828053, 0.6527732410008493, 0.6529039001763899, 0.6584569151368654] 23 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/97) validation_curve.py: -------------------------------------------------------------------------------- 1 | # Validation curve 2 | plt.plot(depth_hyperparams, training_acc, label="Training") 3 | plt.plot(depth_hyperparams, validation_acc, label="validation") 4 | plt.xlabel("Max Depth") 5 | plt.ylabel("Accuracy Score") 6 | plt.title("Validation Curve, Decision Tree Model") 7 | plt.legend(); 8 | 9 | 10 | # build & fit again 11 | final_model_dt = make_pipeline( 12 | OrdinalEncoder(), 13 | DecisionTreeClassifier(max_depth=10, random_state=42) 14 | ) 15 | # Fit model to training data 16 | final_model_dt.fit(X, y) #final_model_dt.fit(X_train, y_train) 17 | -------------------------------------------------------------------------------- /040-earthquake-damage-in-nepal/98) tests.py: -------------------------------------------------------------------------------- 1 | # test type 1 2 | X_test = pd.read_csv("filePath.csv", index_col="b_id") 3 | y_test_pred = pd.Series(final_model_dt.predict(X_test)) 4 | y_test_pred[:5] 5 | 6 | # sample output 7 | 0 1 8 | 1 1 9 | 2 1 10 | 3 1 11 | 4 0 12 | dtype: int64 13 | 14 | 15 | # test type 2 16 | test_acc = model.score(X_test, y_test) 17 | print("Test Accuracy:", round(test_acc, 2)) 18 | 19 | # sample output 20 | Test Accuracy: 0.72 21 | 22 | 23 | # test type 3 24 | acc_train = accuracy_score(y_train, model_lr.predict(X_train)) 25 | acc_test = model_lr.score(X_test, y_test) 26 | 27 | print("LR Training Accuracy:", acc_train) 28 | print("LR Validation Accuracy:", acc_test) 29 | 30 | # sample output 31 | LR Training Accuracy: 0.717985042664646 32 | LR Validation Accuracy: 0.7218817948211109 33 | 
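# A quick way to inspect the fitted tree itself: plot_tree is imported in "1) libraries.py"
# but not used in the snippets above. A minimal sketch, assuming final_model_dt from
# "97) validation_curve.py" has already been fit:
fig, ax = plt.subplots(figsize=(25, 12))
plot_tree(
    decision_tree=final_model_dt.named_steps["decisiontreeclassifier"],
    feature_names=X_train.columns,
    filled=True,      # color nodes by majority class
    max_depth=2,      # draw only the first two levels to keep the plot readable
    ax=ax,
);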
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/99) communicate.py:
--------------------------------------------------------------------------------
1 | # DECISION TREE
2 | features = X_train.columns
3 | importances = final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
4 | feat_imp = pd.Series(importances, index=features).sort_values()
5 | feat_imp.head()
6 | 
7 | # sample output
8 | plan_configuration 0.004189
9 | land_surface_condition 0.008599
10 | foundation_type 0.009967
11 | position 0.011795
12 | ground_floor_type 0.013521
13 | dtype: float64
14 | 
15 | 
16 | # LOGISTIC REG
17 | features = model_lr.named_steps["onehotencoder"].get_feature_names()
18 | importances = model_lr.named_steps["logisticregression"].coef_[0]
19 | feat_imp = pd.Series(np.exp(importances), index=features).sort_values()
20 | feat_imp.head()
21 | 
22 | # sample output
23 | superstructure_Brick, cement mortar 0.345719
24 | foundation_type_RC 0.364478
25 | roof_type_RCC/RB/RBC 0.415979
26 | ground_floor_type_RC 0.527756
27 | caste_household_Kumal 0.543642
28 | dtype: float64
29 | 
30 | 
31 | 
32 | # horizontal bar chart
33 | feat_imp.plot(kind="barh")
34 | plt.xlabel("importance")
35 | plt.ylabel("Label")
36 | plt.title("Feature Importance");
37 | 
38 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/991) others.py:
--------------------------------------------------------------------------------
1 | # Create DF called 'damage_by_vdcmun'
2 | # group DF by "vdcmun_id"
3 | # calculating mean of the "severe_damage" column.
4 | # Be sure to sort from highest to lowest proportion
5 | damage_by_vdcmun = (
6 |     df.groupby("vdcmun_id")["severe_damage"].mean().sort_values(ascending=False)
7 | ).to_frame()
8 | damage_by_vdcmun
9 | 
10 | 
11 | # Line plot
12 | plt.plot(damage_by_vdcmun.values, color="blue")
13 | plt.xticks(range(len(damage_by_vdcmun)), labels=damage_by_vdcmun.index)
14 | plt.yticks(np.arange(0.0, 1.1, 0.2))
15 | plt.xlabel("Mun ID")
16 | plt.ylabel("% Households")
17 | plt.title("Damage by Municipality");
18 | 
--------------------------------------------------------------------------------
/040-earthquake-damage-in-nepal/README.md:
--------------------------------------------------------------------------------
1 | # 040-Earthquake-Damage-in-Nepal
2 | 
3 | - sqlite
4 | - logistic-regression
5 | - decision-tree
6 | - demographics
7 | - Ethical Data Science
8 | 
--------------------------------------------------------------------------------
/050-bankruptcy-in-poland/GridSearchCV.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | # Range of hyperparameters
4 | params = {
5 |     "simpleimputer__strategy": ["mean", "median"],
6 |     "randomforestclassifier__n_estimators": range(25, 100, 25),
7 |     "randomforestclassifier__max_depth": range(10, 50, 10)
8 | }
9 | 
10 | # Using `GridSearchCV`
11 | model = GridSearchCV(
12 |     clf,
13 |     param_grid=params,
14 |     cv=5,
15 |     n_jobs=-1,
16 |     verbose=1
17 | )
18 | 
19 | # Fit your model
20 | model.fit(X_train_over, y_train_over)
21 | 
22 | # cross-validation results
23 | cv_results = pd.DataFrame(model.cv_results_)
24 | cv_results.head(5)
25 | 
--------------------------------------------------------------------------------
/050-bankruptcy-in-poland/README.md:
--------------------------------------------------------------------------------
1 | # 050-bankruptcy-in-poland
2 | ## Concepts learnt
3 | - Working with JSON 4 | - Imbalanced data 5 | - Random forest 6 | - Gradient boosting 7 | - Linux command line 8 | - Creating python modules 9 | - Importing functions from modules 10 | - Saving and loading a model 11 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/acc_score.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | acc_train = model.score(X_train, y_train) 4 | acc_test = model.score(X_test, y_test) 5 | 6 | print("Model Training Accuracy:", round(acc_train, 4)) 7 | print("Model Test Accuracy:", round(acc_test, 4)) 8 | 9 | # Sample output 10 | Model Training Accuracy: 1.0 11 | Model Test Accuracy: 0.9764 12 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/bar.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | df["bankrupt"].value_counts(normalize=True).plot( 4 | kind = "bar", 5 | xlabel = "Bankrupt", 6 | ylabel = "Frequency", 7 | title = "Class Balance" 8 | ); 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/barh.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Get feature names from training data 4 | features = X_train_over.columns 5 | 6 | # Extract importances from model 7 | importances = model.best_estimator_.named_steps["randomforestclassifier"].feature_importances_ 8 | 9 | # Create a series with feature names and importances 10 | feat_imp = pd.Series(importances, index=features).sort_values() 11 | 12 | # Plot 10 most important features 13 | feat_imp.tail(10).plot(kind="barh") 14 | plt.xlabel("...") 15 | plt.ylabel("...") 16 | plt.title("..."); 17 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/best_params.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Method 1 4 | best_params = model.best_params_ 5 | print(best_params) 6 | 7 | # Method 2 8 | model.predict(X_train_over) 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/classif_reports.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | class_report = classification_report(y_test, model.predict(X_test)) 4 | print(class_report) 5 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/clf_cv.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # # classifier 4 | clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42)) 5 | 6 | # cross validation 7 | cv_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1) 8 | print(cv_scores) 9 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/conf_matrix.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | ConfusionMatrixDisplay.from_estimator(model, X_test, y_test); 4 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/import.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # 
Compressed file --> dict 4 | with gzip.open("", "r") as f: 5 | taiwan_data = json.load(f) 6 | 7 | # Extracting keys from a dict 8 | taiwan_data_keys = taiwan_data.keys() 9 | print(taiwan_data_keys) 10 | 11 | # Sample output 12 | dict_keys(['schema', 'metadata', 'observations']) 13 | 14 | # Counting number of observations 15 | len(taiwan_data["observations"]) 16 | 17 | # Length / no. of each observation 18 | len(taiwan_data["observations"][0]) 19 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/interactive_dash.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | def make_cnf_matrix(threshold): 4 | y_pred_proba = model.predict_proba(X_test)[:, -1] 5 | y_pred = y_pred_proba > threshold 6 | conf_matrix = confusion_matrix(y_test, y_pred) 7 | tn, fp, fn, tp = conf_matrix.ravel() 8 | tn, fp, fn, tp 9 | print(f"Profit: €{tp * 100_000_000}") 10 | print(f"Loses: €{tp * 250_000_000}") 11 | ConfusionMatrixDisplay.from_predictions(y_test, y_pred, colorbar=False) 12 | thresh_widget = widgets.FloatSlider(min=0, max=1, value=0.5, step=0.05) 13 | 14 | interact(make_cnf_matrix, threshold=thresh_widget); 15 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/libraries.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import ClassifierMixin 2 | from sklearn.pipeline import Pipeline 3 | import gzip 4 | import json 5 | import pickle 6 | 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | import pandas as pd 10 | import seaborn as sns 11 | import wqet_grader 12 | from imblearn.over_sampling import RandomOverSampler 13 | from imblearn.under_sampling import RandomUnderSampler 14 | from sklearn.impute import SimpleImputer 15 | from sklearn.metrics import ( 16 | ConfusionMatrixDisplay, 17 | classification_report, 18 | confusion_matrix, 19 | ) 20 | from sklearn.pipeline import make_pipeline 21 | from sklearn.tree import DecisionTreeClassifier 22 | from sklearn.ensemble import RandomForestClassifier 23 | from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split 24 | import ipywidgets as widgets 25 | from ipywidgets import interact 26 | from sklearn.ensemble import GradientBoostingClassifier 27 | from teaching_tools.widgets import ConfusionMatrixWidget 28 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/naNs.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | nans_by_col = df.isna().sum() 4 | print("nans_by_col shape:", nans_by_col.shape) 5 | nans_by_col.head() 6 | 7 | # Sample output 8 | nans_by_col shape: (96,) 9 | bankrupt 0 10 | feat_1 0 11 | feat_2 0 12 | feat_3 0 13 | feat_4 0 14 | dtype: int64 15 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/resampling.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | over_sampler = RandomOverSampler(random_state=42) 4 | X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train) 5 | print("X_train_over shape:", X_train_over.shape) 6 | X_train_over.head() 7 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/save_and_load.py: 
-------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # save model to `Destination` 4 | with open("", "wb") as f: 5 | pickle.dump(model, f) 6 | 7 | # 8 | # Load model from `Destination`` 9 | with open("", "rb") as f: 10 | loaded_model = pickle.load(f) 11 | print(loaded_model) 12 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/splits.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Feature matrix and Target vector 4 | target = "bankrupt" 5 | X = df.drop(columns="bankrupt") 6 | y = df[target] 7 | 8 | 9 | # Training and test split 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, test_size=0.2, random_state=42 12 | ) 13 | -------------------------------------------------------------------------------- /050-bankruptcy-in-poland/wrangle.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Wrangle function 4 | def wrangle(filePath): 5 | # Open compressed file, load to dict 6 | with gzip.open(filePath, "r") as f: 7 | data = json.load(f) 8 | 9 | # Dictionary --> DataFrame, set index 10 | df = pd.DataFrame().from_dict(data["observations"]).set_index("id") 11 | 12 | return df 13 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/1_import.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("") 2 | print("df shape:", df.shape) 3 | df.head() 4 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/2_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | 4 | # 1 5 | # Percentage of respondents in df that are business owners, 6 | # assign result to the variable pct_biz_owners. 7 | # Review documentation regarding "HBUS" column 8 | # https://sda.berkeley.edu/sdaweb/docs/scfcomb2019/DOC/hcbkfx0.htm 9 | 10 | pct_biz_owners = sum(df["HBUS"]) / (sum(df["HBUS"] == 0) + sum(df["HBUS"])) 11 | print("% of business owners in df:", pct_biz_owners) 12 | 13 | # 2 14 | # DataFrame df_inccat showing normalized frequency 15 | # for income categories for business owners and non-business owners 16 | 17 | inccat_dict = { 18 | 1: "0-20", 19 | 2: "21-39.9", 20 | 3: "40-59.9", 21 | 4: "60-79.9", 22 | 5: "80-89.9", 23 | 6: "90-100", 24 | } 25 | 26 | df_inccat = ( 27 | df["INCCAT"] 28 | .replace(inccat_dict) 29 | .groupby(df["HBUS"]) 30 | .value_counts(normalize=True) 31 | .rename("frequency") 32 | .to_frame() 33 | .reset_index() 34 | ) 35 | 36 | df_inccat 37 | 38 | # 3 39 | # Seaborn, create a side-by-side bar chart of df_inccat 40 | 41 | sns.barplot( 42 | x="INCCAT", 43 | y="frequency", 44 | hue="HBUS", 45 | data=df_inccat, 46 | order=inccat_dict.values() 47 | ) 48 | plt.xlabel("") 49 | plt.ylabel("") 50 | plt.title(""); 51 | 52 | # 4 53 | # create a scatter plot that shows "HOUSES" vs. "DEBT" 54 | 55 | sns.scatterplot(x=df["DEBT"] / 1e6, y=df["HOUSES"] / 1e6, palette="deep") 56 | plt.xlabel("Household Debt") 57 | plt.ylabel("Home Value") 58 | plt.title("Home Value vs. 
Household Debt"); 59 | 60 | # 5 61 | # New DataFrame df_small_biz containing 62 | # only business owners whose income is below $500,000 63 | 64 | mask = (df["HBUS"]) & (df["INCOME"] < 500_000) 65 | df_small_biz = df[mask] 66 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/3_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | 5 | # 6 6 | # Histogram from the "AGE" column 7 | # in df_small_biz with 10 bins 8 | df_small_biz["AGE"].hist(bins=10) 9 | plt.xlabel("Your x_Label") 10 | plt.ylabel("Your y_Label") 11 | plt.title("Your Title"); 12 | 13 | # 7 14 | # Variance for all the features in df_small_biz, 15 | # create Series top_ten_var with 10 features with largest variance 16 | top_ten_var = df_small_biz.var().sort_values().tail(10) 17 | top_ten_var 18 | 19 | # 8 20 | # trimmed variance for the features in df_small_biz 21 | # not include the top and bottom 10% of observations 22 | top_ten_trim_var = df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10) 23 | top_ten_trim_var 24 | 25 | # 9 26 | # create a horizontal bar chart of top_ten_trim_var 27 | fig = px.bar( 28 | x=top_ten_trim_var, 29 | y=top_ten_trim_var.index, 30 | title="High Var Feat" 31 | ) 32 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 33 | 34 | # 10 35 | # Create list: high_var_cols, 36 | # with the column names of the five features 37 | # with the highest trimmed variance 38 | high_var_cols = top_ten_trim_var.tail(5).index.to_list() 39 | high_var_cols 40 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/4_split.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | 6 | # Feature matrix X containing five columns in high_var_cols 7 | X = df_small_biz[high_var_cols] 8 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/6_communicate.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | import model 7 | 8 | # 16 9 | # DataFrame xgb containing mean values 10 | # of the features in X for the 3 clusters 11 | # in your final_model 12 | labels = final_model.named_steps["kmeans"].labels_ 13 | xgb = X.groupby(labels).mean() 14 | xgb 15 | 16 | # 17 17 | # create side-by-side bar chart from xgb 18 | # showing mean of the features in X 19 | # for each of the clusters in your final_model 20 | fig = px.bar( 21 | xgb, 22 | barmode="group", 23 | title="Your Title" 24 | ) 25 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 26 | 27 | # 18 28 | # Create a PCA transformer, 29 | # reduce the dimensionality of X to 2, 30 | # and then put the transformed data into a DataFrame 31 | pca = PCA(n_components=2, random_state=42) 32 | 33 | X_t = pca.fit_transform(X) 34 | 35 | X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"]) 36 | 37 | # 19 38 | # create a scatter plot of X_pca using seaborn 39 | fig = px.scatter( 40 | data_frame=X_pca, 41 | x="PC1", 42 | y="PC2", 43 | color=labels.astype(str), 44 | title="PCA Representation of Clusters" 45 | ) 46 | fig.update_layout(xaxis_title="PC1", yaxis_title="PC2") 47 | 
-------------------------------------------------------------------------------- /060-consumer-finances in-usa/README.md: -------------------------------------------------------------------------------- 1 | # 060-consumer-finance-in-usa 2 | ## Unsupervised learning, specifically clustering 3 | 4 | - Side-by-side bar chart 5 | - K-means clustering model 6 | - Clustering-2-features vs -multiple-features 7 | - Feature selection based on variance 8 | - Principal component analysis (PCA) 9 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/libraries.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import plotly.express as px 3 | import pandas as pd 4 | import seaborn as sns 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import silhouette_score 7 | from teaching_tools.widgets import ClusterWidget, SCFClusterWidget 8 | from scipy.stats.mstats import trimmed_var 9 | from sklearn.decomposition import PCA 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | -------------------------------------------------------------------------------- /060-consumer-finances in-usa/model.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | 7 | # 12 8 | # Iteratively build and train a K-Means 9 | # model where n_clusters ranges [2, 12] 10 | 11 | n_clusters = range(2, 13) 12 | inertia_errors = [] 13 | silhouette_scores = [] 14 | 15 | # Use for loop 16 | for k in n_clusters: 17 | # Build 18 | model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42)) 19 | # Train 20 | model.fit(X) 21 | # Calculate inertia 22 | inertia_errors.append(model.named_steps["kmeans"].inertia_) 23 | # Calculate silhouette score 24 | silhouette_scores.append( 25 | silhouette_score(X, model.named_steps["kmeans"].labels_) 26 | ) 27 | 28 | print("Inertia:", inertia_errors[:10]) 29 | print() 30 | print("Silhouette Scores:", silhouette_scores[:3]) 31 | 32 | # 13 33 | # Line plot showing values of 34 | # inertia_errors as a function of n_clusters 35 | 36 | fig = px.line( 37 | x=n_clusters, y=inertia_errors, title="Your Title" 38 | ) 39 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 40 | 41 | # 14 42 | # Line plot showing values of 43 | # silhouette_scores as a function of n_clusters 44 | 45 | fig = px.line( 46 | x=n_clusters, y=silhouette_scores, title="Your Title" 47 | ) 48 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 49 | 50 | # 15 51 | # Build and train a new k-means model 52 | # n_clusters: 3 53 | # random state: 42 54 | 55 | final_model = make_pipeline( 56 | StandardScaler(), 57 | KMeans(n_clusters=3, random_state=42) 58 | ) 59 | final_model.fit(X) 60 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/README.md: -------------------------------------------------------------------------------- 1 | ## 070-ds-admissions-in-wqu 2 | 3 | ### Contents... 4 | > EDA. 5 | 6 | > ETL. 7 | 8 | > Chi-Square test. 9 | 10 | > Interactive dashboard. 
11 | 12 | 13 | ![image](https://user-images.githubusercontent.com/99328720/189812167-668064f1-7ee3-4a5c-9ae7-638101e5e9f9.png) 14 | 15 | 16 | 17 | ![image](https://user-images.githubusercontent.com/99328720/189812222-a33a9bee-42cf-481e-a3d1-047cb69859e8.png) 18 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/aggregate.py: -------------------------------------------------------------------------------- 1 | """ Using the aggregate method() """ 2 | 3 | import imports 4 | 5 | 6 | # aggregate by nationality 7 | result = .aggregate( 8 | [ 9 | { 10 | "$group": {"_id": "$countryISO2", "count": {"$count": {}}} 11 | } 12 | ] 13 | ) 14 | 15 | 16 | # aggregate by sign-up 17 | result = .aggregate( 18 | [ 19 | { 20 | "$match": {"admissionsQuiz": "incomplete"} 21 | }, 22 | { 23 | "$group": { 24 | "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}}, 25 | "count": {"$sum": 1} 26 | } 27 | } 28 | ] 29 | ) 30 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/choropleth_map.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import load 5 | 6 | 7 | # `build_nat_choropleth` function 8 | ["count_pct"] = (["count"] / ["count"].sum()) * 100 9 | 10 | 11 | def build_nat_choropleth(): 12 | fig = px.choropleth( 13 | data_frame= , 14 | locations="country_iso3", 15 | color="count_pct", 16 | projection="natural earth", 17 | color_continuous_scale=px.colors.sequential.Oranges, 18 | title="Title" 19 | ) 20 | return fig 21 | 22 | # Display image 23 | nat_fig = build_nat_choropleth() 24 | nat_fig.write_image("images/7-5-4.png", scale=1, height=500, width=700) 25 | 26 | nat_fig.show() 27 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/connect.py: -------------------------------------------------------------------------------- 1 | """ Connecting to the Database """ 2 | 3 | import imports 4 | 5 | 6 | # Connect to database 7 | # Access a certain collection 8 | 9 | # Create a Mongo-`client` 10 | client = MongoClient(host="localhost", port=) 11 | 12 | # Create a database: `db` 13 | db = client["wqu-abtest"] 14 | 15 | # Find your collection: `""` 16 | mscfe_app = db[""] 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/contingency_bar.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import crosstab 5 | 6 | 7 | # `build_contingency_bar` function 8 | def build_contingency_bar(): 9 | # side-by-side bar chart 10 | fig = px.bar( 11 | data_frame=data, 12 | barmode="group", 13 | title="TITLE" 14 | ) 15 | # Set axis labels 16 | fig.update_layout(xaxis_title="XTITLE", yaxis_title="YTITLE") 17 | return fig 18 | 19 | # Display 20 | cb_fig = build_contingency_bar() 21 | cb_fig.write_image("images/7-5-16.png", scale=1, height=500, width=700) 22 | 23 | cb_fig.show() 24 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/contingency_table.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | # contingency table 7 | contingency_table = Table2x2(data.values) 8 | 9 | # chi-square test 10 | chi_square_test = contingency_table.test_nominal_association() 11 | 12 | # odds ratio 13 | odds_ratio = 
contingency_table.oddsratio.round(1) 14 | 15 | # summary... 16 | summary = contingency_table.summary() 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/country_converter.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import load 4 | 5 | # Instantiate `CountryConverter` 6 | cc = CountryConverter() 7 | 8 | # Create new columns ... full country names 9 | ["country_name"] = cc.convert( 10 | ["country_iso2"], to="name_short" 11 | ) 12 | 13 | # ... three letter abbv country names 14 | ["country_iso3"] = cc.convert( 15 | ["country_iso2"], to="ISO3" 16 | ) 17 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/crosstab.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | data = pd.crosstab( 7 | index=["group"], 8 | columns=["admissionsQuiz"], 9 | normalize=False 10 | ) 11 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/imports.py: -------------------------------------------------------------------------------- 1 | """ Module containing all the needed libraries """ 2 | 3 | 4 | from statsmodels.stats.contingency_tables import Table2x2 5 | from statsmodels.stats.power import GofChisquarePower 6 | from teaching_tools.ab_test.experiment import Experiment 7 | from country_converter import CountryConverter 8 | from pymongo.collection import Collection 9 | from pymongo import MongoClient 10 | from pprint import PrettyPrinter 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import numpy as np 14 | import random 15 | import math 16 | import scipy 17 | import plotly.express as px 18 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/load.py: -------------------------------------------------------------------------------- 1 | """ Loading into a data frame """ 2 | 3 | import aggregate 4 | 5 | # aggregated by nationality 6 | = pd.DataFrame(result).rename( 7 | {"_id": "country_iso2"}, axis="columns").sort_values("count") 8 | 9 | 10 | 11 | # aggregated by sign up 12 | = ( 13 | pd.DataFrame(result) 14 | .rename({"_id": "date", "count": "new_users"}, axis=1) 15 | .set_index("date") 16 | .sort_index() 17 | .squeeze() 18 | ) 19 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/mongo_instance.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | from our_mongo_class import MongoRepository 5 | 6 | 7 | # An instance of class MongoRepository 8 | repo = MongoRepository() 9 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/our_mongo_class.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | class MongoRepository: 7 | """Repository for interacting with MongoDB database. 8 | 9 | Params 10 | ---------- 11 | client : `pymongo.MongoClient` 12 | Default, `MongoClient(host='localhost', port=)`. 13 | db : str 14 | Default, `''`. 15 | collection : str 16 | Default, `'`. 17 | 18 | Attributes 19 | ---------- 20 | collection : pymongo.collection.Collection 21 | All data will be extracted from and loaded to this collection. 
22 | """ 23 | 24 | # `__init__` method 25 | def __init__( 26 | self, 27 | client=MongoClient(host="localhost", port=), 28 | db="''", 29 | collection="`'" 30 | ): 31 | self.collection = client[db][collection] 32 | 33 | # `find_by_date` method 34 | def find_by_date(self, date_string): 35 | 36 | # Convert `date_string` to datetime object 37 | start = pd.to_datetime(date_string, format="%Y-%m-%d") 38 | 39 | # Offset `start` by 1 day 40 | end = start + pd.DateOffset(days=1) 41 | 42 | # Create PyMongo query for no-quiz applicants b/t `start` and `end` 43 | query = {"createdAt": {"$gte": start, "$lt": end}, "admissionsQuiz": "incomplete"} 44 | 45 | # Query collection, get result 46 | result = self.collection.find(query) 47 | 48 | # Convert `result` to list 49 | observations = list(result) 50 | 51 | # REMOVE} 52 | return observations 53 | 54 | 55 | # `update_applicants` method 56 | def update_applicants(self, observations_assigned): 57 | n = 0 58 | n_modified = 0 59 | 60 | for doc in observations_assigned: 61 | result = self.collection.update_one( 62 | filter={"_id": doc["_id"]}, 63 | update={"$set": doc} 64 | ) 65 | n += result.matched_count 66 | n_modified += result.modified_count 67 | transaction_result = {"n": n, "nModified": n_modified} 68 | return transaction_result 69 | 70 | 71 | # `assign_to_groups` method 72 | def assign_to_groups(self, date_string): 73 | 74 | # get observations 75 | observations = self.find_by_date(date_string) 76 | 77 | # Shuffle `observations` 78 | random.seed(42) 79 | random.shuffle(observations) 80 | 81 | # Get index position of item at observations halfway point 82 | idx = len(observations) // 2 83 | 84 | # Assign first half of observations to control group 85 | for doc in observations[:idx]: 86 | doc["inExperiment"] = True 87 | doc["group"] = "no email (control)" 88 | 89 | # Assign second half of observations to treatment group 90 | for doc in observations[idx:]: 91 | doc["inExperiment"] = True 92 | doc["group"] = "email (treatment)" 93 | 94 | # Update collections 95 | result = self.update_applicants(observations) 96 | return result 97 | 98 | # `find_exp_observations` method 99 | def find_exp_observations(self): 100 | result = self.collection.find({"inExperiment": True}) 101 | return list(result) 102 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/probability.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import statistical_summary 4 | 5 | 6 | prob_65_or_fewer = scipy.stats.norm.cdf( 7 | group_size * 2, 8 | loc=sum_mean, 9 | scale=sum_std 10 | ) 11 | prob_65_or_greater = 1 - prob_65_or_fewer 12 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/run_exp.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import connect 5 | import mongo_instance 6 | 7 | 8 | exp = Experiment(repo=client, db="yourDatabase", collection="yourCollection") 9 | exp.reset_experiment() 10 | result = exp.run_experiment(days=exp_days, assignment=True) 11 | -------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/statistic_power.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | chi_square_power = GofChisquarePower() 7 | group_size = math.ceil(chi_square_power.solve_power( 8 | effect_size=0.5, # medium --> 0.5; small --> 
0.2; large --> 0.8
9 | alpha=0.05,
10 | power=0.8
11 | ))
12 | 
-------------------------------------------------------------------------------- /070-ds-admissions-in-wqu/statistical_summary.py: --------------------------------------------------------------------------------
1 | """ """
2 | 
3 | import imports
4 | import load
5 | import aggregate
6 | 
7 | 
8 | mean = .describe()["mean"]
9 | std = .describe()["std"]
10 | 
11 | 
12 | # sum...
13 | exp_days = 
14 | sum_mean = mean * exp_days
15 | sum_std = std * math.sqrt(exp_days)
16 | 
-------------------------------------------------------------------------------- /080-volatility-forecasting-in-india/README.md: --------------------------------------------------------------------------------
1 | ## Market / Volatility Forecasting in India
2 | 
3 | - API Design
4 | - HTTP Requests
5 | - SQL
6 | - SQLite3
7 | - GARCH (a short sketch follows the root README at the end of this document)
8 | - Model deployment
9 | - Test-Driven Development
10 | - Python custom classes
11 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Data-Science-Lab
2 | 
3 | ## WQU DATA SCIENCE LAB PROJECTS
4 | 
5 | - 010-housing-in-mexico
6 | - 020-housing-in-buenos-aires
7 | - 030-air-quality-in-nairobi
8 | - 040-earthquake-damage-in-nepal
9 | - 050-bankruptcy-in-poland
10 | - 060-consumer-finances in-usa
11 | - 070-ds-admissions-in-wqu
12 | - 080-volatility-forecasting-in-india
13 | 
--------------------------------------------------------------------------------
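The 080 project folder above ships only a README, so here, as a closing illustration, is a minimal GARCH(1,1) sketch of the kind of model that project lists. It assumes the `arch` package and a pandas Series of daily percentage returns named `returns`; neither the package call sites nor the variable names come from this repository's files.

from arch import arch_model

# Fit a GARCH(1,1) model to the returns series
model = arch_model(returns, p=1, q=1, rescale=False)
results = model.fit(disp="off")
print(results.summary())

# One-step-ahead volatility forecast (standard deviation, same units as returns)
forecast = results.forecast(horizon=1)
next_day_volatility = forecast.variance.iloc[-1, 0] ** 0.5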