├── 1. Housing in Mexico ├── 1. Housing in Mexico.pdf └── Practice notebooks │ ├── Add new column, drop a column.py │ ├── Boxplot.py │ ├── Concat function.py │ ├── Create new columns.py │ ├── Drop NaNs.py │ ├── Drop columns.py │ ├── Histogram.py │ ├── Import CSV.py │ ├── Inspect data.py │ ├── README.md │ ├── bar chart.py │ ├── correlation coefficient.py │ ├── describe () method.py │ ├── groupby() method.py │ ├── scatter plot.py │ ├── scatter_mapbox.py │ └── value_counts() method.py ├── 2. Housing in Buenos Aires └── Practice Notebooks │ ├── README.md │ ├── baseline.py │ ├── glob ().py │ ├── important DS libraries.py │ ├── model_ build & fit.py │ ├── predict.py │ ├── retrieve data.py │ ├── split.py │ └── wrangle () function.py ├── 3. Air Quality In Nairobi └── Practice Notebooks │ ├── ACFplot.py │ ├── ARmodel.py │ ├── Baseline.py │ ├── MongoDB.py │ ├── PACFplot.py │ ├── PrettyPrinter.py │ ├── README.md │ ├── aggregate().py │ ├── communicate.py │ ├── distinct().py │ ├── finalModel.py │ ├── libraries.py │ ├── rollingAvg.py │ ├── split.py │ ├── timeSeriesplot.py │ ├── wfv.py │ └── wrangle().py ├── 4. Earthquake Damage In Nepal └── Practice Notebooks │ ├── 1) libraries.py │ ├── 2) connect.py │ ├── 3) get_tables.py │ ├── 4) explore_tables.py │ ├── 5) JOIN.py │ ├── 6) wrangle().py │ ├── 7) barChart.py │ ├── 8) boxplot.py │ ├── 9) pivot_table.py │ ├── 91) vertical_split.py │ ├── 92) horizontal_split.py │ ├── 93) baseline.py │ ├── 94) log_reg.py │ ├── 95) accuracy_score.py │ ├── 96) decision_tree.py │ ├── 97) validation_curve.py │ ├── 98) tests.py │ ├── 99) communicate.py │ ├── 991) others.py │ └── README.md ├── 5. Bankruptcy In Poland └── Practice Notebooks │ ├── GridSearchCV.py │ ├── README.md │ ├── acc_score.py │ ├── bar.py │ ├── barh.py │ ├── best_params.py │ ├── classif_reports.py │ ├── clf_cv.py │ ├── conf_matrix.py │ ├── import.py │ ├── interactive_dash.py │ ├── libraries.py │ ├── naNs.py │ ├── resampling.py │ ├── save_and_load.py │ ├── splits.py │ └── wrangle.py ├── 6. Consumer Finances In USA └── Practice Notebooks │ ├── 1_import.py │ ├── 2_explore.py │ ├── 3_explore.py │ ├── 4_split.py │ ├── 6_communicate.py │ ├── README.md │ ├── libraries.py │ └── model.py ├── 7. AB Testing at WorldQuant University └── Practice Notebooks │ ├── README.md │ ├── aggregate.py │ ├── choropleth_map.py │ ├── connect.py │ ├── contingency_bar.py │ ├── contingency_table.py │ ├── country_converter.py │ ├── crosstab.py │ ├── imports.py │ ├── load.py │ ├── mongo_instance.py │ ├── our_mongo_class.py │ ├── probability.py │ ├── run_exp.py │ ├── statistic_power.py │ └── statistical_summary.py ├── 8. Volatility Forecasting In India └── Practice Notebooks │ ├── data.py │ ├── main.py │ └── model.py └── README.md /1. Housing in Mexico/1. Housing in Mexico.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wessamsw/WorldQuant-Data-Science-Program/417ae6e0627c2e5709f509fb41d376b4e4583251/1. Housing in Mexico/1. Housing in Mexico.pdf -------------------------------------------------------------------------------- /1. 
Housing in Mexico/Practice notebooks/Add new column, drop a column.py: -------------------------------------------------------------------------------- 1 | df2.head() 2 | 3 | property_type state region lat lon area_m2 price_brl 4 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 5 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 6 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 7 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 8 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36 9 | 10 | # price_brl ---> price in Brazilian reals 11 | # create new column price_usd 12 | # use 1 USD = 3.19 Brazilian reals 13 | 14 | df2["price_usd"] = df2["price_brl"] / 3.19 15 | df2.head() 16 | 17 | property_type state region lat lon area_m2 price_brl price_usd 18 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 129850.463950 19 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 265958.786834 20 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 93867.799373 21 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 265958.786834 22 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36 145495.097179 23 | 24 | 25 | # DROP COLUMNS 26 | # drop price_brl 27 | 28 | df2 = df2.drop("price_brl", axis="columns") 29 | df2.head() 30 | 31 | property_type state region lat lon area_m2 price_usd 32 | 0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 129850.463950 33 | 1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 265958.786834 34 | 2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 93867.799373 35 | 3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 265958.786834 36 | 4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 145495.097179 37 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/Boxplot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.boxplot(df["area_m2"]) 4 | plt.xlabel("Area [sq meters]") 5 | plt.title("Distribution of Home Sizes") 6 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/Concat function.py: -------------------------------------------------------------------------------- 1 | # concatenate 2 data frames using concat 2 | 3 | df = pd.concat([df1, df2]) 4 | -------------------------------------------------------------------------------- /1. 
Housing in Mexico/Practice notebooks/Create new columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | # Output 4 | property_type place_with_parent_names region lat-lon area_m2 price_usd 5 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 6 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 7 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 8 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 9 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 10 | 11 | df1.info() 12 | 13 | # Output 14 | 15 | RangeIndex: 12834 entries, 0 to 12833 16 | Data columns (total 6 columns): 17 | # Column Non-Null Count Dtype 18 | --- ------ -------------- ----- 19 | 0 property_type 12834 non-null object 20 | 1 place_with_parent_names 12834 non-null object 21 | 2 region 12834 non-null object 22 | 3 lat-lon 11551 non-null object 23 | 4 area_m2 12834 non-null float64 24 | 5 price_usd 12834 non-null object 25 | dtypes: float64(1), object(5) 26 | memory usage: 601.7+ KB 27 | 28 | 29 | df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True) 30 | 31 | # expand ---> increase size of data frame 32 | # without replacing 33 | 34 | df1["lat"] = df1.lat.astype(float) # change lat and lon from type object(string) to type float 35 | df1["lon"] = df1.lon.astype(float) 36 | df1.shape 37 | 38 | # Output 39 | (11551, 8) 40 | 41 | # Example 2 42 | 43 | df1["state"] = df1["place_with_parent_names"].str.split("|", expand=True)[2] 44 | df1.head() 45 | 46 | # Output 47 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 48 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 49 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 50 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 51 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 52 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 53 | 54 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/Drop NaNs.py: -------------------------------------------------------------------------------- 1 | # df1.shape before dropping NaNs ---> (12834, 6) 2 | 3 | df1.dropna(inplace=True) # drop rows with null values 4 | df1.shape 5 | 6 | # Output 7 | (11551, 6) 8 | -------------------------------------------------------------------------------- /1. 
Housing in Mexico/Practice notebooks/Drop columns.py: -------------------------------------------------------------------------------- 1 | df1.head() 2 | 3 | property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state 4 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840 Alagoas 6 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227 Alagoas 9 | 10 | df1 = df1.drop(["lat-lon", "place_with_parent_names"], axis="columns") 11 | df1.head() 12 | 13 | property_type region area_m2 price_usd lat lon state 14 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 15 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 16 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 17 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 18 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 19 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/Histogram.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.hist(df["price_usd"]) 4 | plt.xlabel("Price [USD]") 5 | plt.ylabel("Frequency") 6 | plt.title("Distribution of Home Prices") 7 | 8 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/Import CSV.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import plotly.express as px 4 | 5 | df1 = pd.read_csv("data/brasil-real-estate-1.csv") 6 | df1.shape 7 | 8 | #Output 9 | (12834, 6) 10 | -------------------------------------------------------------------------------- /1. 
Housing in Mexico/Practice notebooks/Inspect data.py: -------------------------------------------------------------------------------- 1 | df1.info() 2 | 3 | # Output 4 | 5 | RangeIndex: 12834 entries, 0 to 12833 6 | Data columns (total 6 columns): 7 | # Column Non-Null Count Dtype 8 | --- ------ -------------- ----- 9 | 0 property_type 12834 non-null object 10 | 1 place_with_parent_names 12834 non-null object 11 | 2 region 12834 non-null object 12 | 3 lat-lon 11551 non-null object 13 | 4 area_m2 12834 non-null float64 14 | 5 price_usd 12834 non-null object 15 | dtypes: float64(1), object(5) 16 | memory usage: 601.7+ KB 17 | 18 | df1.shape 19 | 20 | # Output 21 | (12834, 6) 22 | 23 | 24 | df1.head() # Displays the first 5 rows starting from 0 25 | 26 | #Output 27 | property_type place_with_parent_names region lat-lon area_m2 price_usd 28 | 0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 29 | 1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 30 | 2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 31 | 3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 32 | 4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 33 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/README.md: -------------------------------------------------------------------------------- 1 | # work-ds-curriculum-010-housing-in-mexico 2 | 3 | WQU DATA SCIENCE LAB PROJECT 1 4 | HOUSING IN MEXICO 5 | 6 | Key concepts learnt and their application 7 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/bar chart.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | mean_price_by_region.plot( 4 | kind="bar", 5 | xlabel="Region", 6 | ylabel="Mean Price [USD]", 7 | title="Mean Home Price by Region" 8 | ); 9 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/correlation coefficient.py: -------------------------------------------------------------------------------- 1 | corr1= homes_by_state["area_m2"].corr(homes_by_state["price_usd"]) 2 | 3 | 0.5773267433717683 # more than half 4 | -------------------------------------------------------------------------------- /1. 
Housing in Mexico/Practice notebooks/describe () method.py: -------------------------------------------------------------------------------- 1 | df.head() 2 | 3 | property_type region area_m2 price_usd lat lon state 4 | 0 apartment Northeast 110.0 187230.85 -9.644305 -35.708814 Alagoas 5 | 1 apartment Northeast 65.0 81133.37 -9.643093 -35.704840 Alagoas 6 | 2 house Northeast 211.0 154465.45 -9.622703 -35.729795 Alagoas 7 | 3 apartment Northeast 99.0 146013.20 -9.622837 -35.719556 Alagoas 8 | 4 apartment Northeast 55.0 101416.71 -9.654955 -35.700227 Alagoas 9 | 10 | dfa = df[["area_m2", "price_usd"]] # subset for a data frame 11 | summary_stats = dfa.describe() 12 | summary_stats 13 | 14 | area_m2 price_usd 15 | count 22844.000000 22844.000000 16 | mean 115.020224 194987.315480 17 | std 47.742932 103617.682978 18 | min 53.000000 74892.340000 19 | 25% 76.000000 113898.770000 20 | 50% 103.000000 165697.555000 21 | 75% 142.000000 246900.880878 22 | max 252.000000 525659.717868 23 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/groupby() method.py: -------------------------------------------------------------------------------- 1 | mean_price_by_region = df.groupby("region")["price_usd"].mean().sort_values(ascending=True) 2 | mean_price_by_region.head() 3 | 4 | region 5 | Central-West 178596.283663 6 | North 181308.958207 7 | Northeast 185422.985441 8 | South 189012.345265 9 | Southeast 208996.762778 10 | Name: price_usd, dtype: float64 11 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/scatter plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | plt.scatter(x=homes_by_state["area_m2"], y=homes_by_state["price_usd"]) 4 | plt.xlabel("Area [sq meters]") 5 | plt.ylabel("Price [USD]") 6 | plt.title("Rio Grande do Sul: Price vs. Area"); 7 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/scatter_mapbox.py: -------------------------------------------------------------------------------- 1 | import plotly.express as px 2 | 3 | fig = px.scatter_mapbox( 4 | df, 5 | lat="lat", 6 | lon="lon", 7 | center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil 8 | width=600, 9 | height=600, 10 | hover_data=["price_usd"], # Display price when hovering mouse over house 11 | ) 12 | 13 | fig.update_layout(mapbox_style="open-street-map") 14 | 15 | fig.show() 16 | -------------------------------------------------------------------------------- /1. Housing in Mexico/Practice notebooks/value_counts() method.py: -------------------------------------------------------------------------------- 1 | # counts the number of items/ things 2 | 3 | homes_by_state = df_south["state"].value_counts() 4 | homes_by_state 5 | 6 | 7 | Rio Grande do Sul 2643 8 | Santa Catarina 2634 9 | Paraná 2544 10 | Name: state, dtype: int64 11 | -------------------------------------------------------------------------------- /2. Housing in Buenos Aires/Practice Notebooks/README.md: -------------------------------------------------------------------------------- 1 | # 020-housing-in-buenos-aires/025-assignment.ipynb 2 | 3 | New concepts learnt in this project 4 | -------------------------------------------------------------------------------- /2. 
Housing in Buenos Aires/Practice Notebooks/baseline.py:
--------------------------------------------------------------------------------
1 | # Done after splitting data
2 | # into features and target vector
3 | 
4 | from sklearn.metrics import mean_absolute_error
5 | 
6 | y_mean = y_train.mean()
7 | y_pred_baseline = [y_mean] * len(y_train)
8 | baseline_mae = mean_absolute_error(y_train, y_pred_baseline)  # what our model needs to beat
9 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/glob ().py:
--------------------------------------------------------------------------------
1 | # create a list of files whose names share a common pattern
2 | 
3 | glob(<"pathName-*.fileExtension">)  # fill in your own path pattern
4 | 
5 | # example
6 | glob("data/programfiles/excel-*.csv")
7 | 
8 | # output (order is not guaranteed)
9 | ['data/programfiles/excel-1.csv',
10 |  'data/programfiles/excel-4.csv',
11 |  'data/programfiles/excel-3.csv',
12 |  'data/programfiles/excel-5.csv',
13 |  'data/programfiles/excel-2.csv']
14 | 
15 | sorted(glob("data/programfiles/excel-*.csv"))
16 | 
17 | ['data/programfiles/excel-1.csv',
18 |  'data/programfiles/excel-2.csv',
19 |  'data/programfiles/excel-3.csv',
20 |  'data/programfiles/excel-4.csv',
21 |  'data/programfiles/excel-5.csv']
22 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/important DS libraries.py:
--------------------------------------------------------------------------------
1 | from glob import glob
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import warnings
5 | warnings.simplefilter(action="ignore", category=FutureWarning)
6 | import plotly.express as px
7 | import pandas as pd
8 | import seaborn as sns
9 | from category_encoders import OneHotEncoder
10 | from IPython.display import VimeoVideo
11 | from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
12 | from sklearn.impute import SimpleImputer
13 | from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
14 | from sklearn.metrics import mean_absolute_error
15 | from sklearn.pipeline import make_pipeline
16 | from sklearn.utils.validation import check_is_fitted
17 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/model_ build & fit.py:
--------------------------------------------------------------------------------
1 | from category_encoders import OneHotEncoder
2 | from sklearn.impute import SimpleImputer
3 | from sklearn.linear_model import LinearRegression, Ridge
4 | from sklearn.pipeline import make_pipeline
5 | 
6 | # build
7 | model = make_pipeline(
8 |     OneHotEncoder(use_cat_names=True),
9 |     SimpleImputer(),
10 |     Ridge()
11 | )
12 | 
13 | # fit...
14 | model.fit(X_train, y_train)
15 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/predict.py:
--------------------------------------------------------------------------------
1 | y_test_pred = pd.Series(model.predict(X_test))
2 | y_test_pred.head()
3 | 
4 | # sample output
5 | 0    53538.366480
6 | 1    53171.988369
7 | 2    34263.884179
8 | 3    53488.425607
9 | 4    68738.924884
10 | dtype: float64
11 | 
--------------------------------------------------------------------------------
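The snippets in this folder are normally chained together: glob the raw CSV files, clean each one with the wrangle() function shown further below, and concatenate the results before splitting and modeling. A minimal sketch of that pattern, assuming wrangle() is already defined and using an illustrative file pattern:

from glob import glob
import pandas as pd

files = sorted(glob("data/buenos-aires-real-estate-*.csv"))  # illustrative file pattern
frames = [wrangle(file) for file in files]                   # clean each CSV with wrangle()
df = pd.concat(frames, ignore_index=True)                    # stack into a single DataFrame
print("df shape:", df.shape)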
/2. Housing in Buenos Aires/Practice Notebooks/retrieve data.py:
--------------------------------------------------------------------------------
1 | # retrieve intercept
2 | intercept = model.named_steps["ridge"].intercept_
3 | 
4 | # retrieve coefficients
5 | coefficients = model.named_steps["ridge"].coef_
6 | 
7 | # retrieve feature names
8 | features = model.named_steps["onehotencoder"].get_feature_names()
9 | 
10 | # create a series of names and values
11 | feat_imp = pd.Series(coefficients, index=features)
12 | feat_imp
13 | 
14 | # sample output
15 | surface_covered_in_m2       291.654156
16 | lat                         478.901375
17 | lon                       -2492.221814
18 | borough_Benito Juárez     13778.188880
19 | borough_Iztacalco           405.403127
20 | borough_Azcapotzalco       2459.288646
21 | borough_Coyoacán           3737.561001
22 | borough_Álvaro Obregón     3275.121061
23 | borough_Iztapalapa       -13349.017448
24 | borough_Cuauhtémoc         -350.531990
25 | borough_Tláhuac          -14166.869486
26 | borough_Miguel Hidalgo     1977.314718
27 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/split.py:
--------------------------------------------------------------------------------
1 | # splitting data into feature matrix and target vector
2 | 
3 | target = "price_aprox_usd"  # <--- target vector
4 | features = ["surface_covered_in_m2", "lat", "lon", "borough"]  # <--- feature matrix
5 | X_train = df[features]  # training data
6 | y_train = df[target]    # training data
7 | 
8 | # The target vector is what we are trying to predict using the feature matrix.
9 | # In this case we are trying to predict the price of a property
10 | # using the features in the matrix.
11 | 
--------------------------------------------------------------------------------
/2. Housing in Buenos Aires/Practice Notebooks/wrangle () function.py:
--------------------------------------------------------------------------------
1 | # Wrangle function:
2 | # read in a csv file
3 | # keep only apartments under $100,000
4 | # remove outliers
5 | # separate columns
6 | # create new columns from existing
7 | # take care of highly null columns
8 | # low and high cardinality
9 | # Leakage
10 | # multicollinearity
11 | 
12 | def wrangle(filepath):
13 |     # Read CSV file
14 |     df = pd.read_csv(filepath)
15 | 
16 |     # Subset data: Apartments in <placeName>, less than $100,000
17 |     mask_ba = df["place_with_parent_names"].str.contains(<"placeName">)
18 |     mask_apt = df["property_type"] == "apartment"
19 |     mask_price = df["price_aprox_usd"] < 100_000
20 |     df = df[mask_ba & mask_apt & mask_price]
21 | 
22 |     # Subset data: Remove outliers for "surface_covered_in_m2"
23 |     low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
24 |     mask_area = df["surface_covered_in_m2"].between(low, high)
25 |     df = df[mask_area]
26 | 
27 |     # Split "lat-lon" column
28 |     df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
29 |     df.drop(columns="lat-lon", inplace=True)
30 | 
31 |     # Extract <newColumnName>
32 |     df[<"newColumnName">] = df["place_with_parent_names"].str.split("|", expand=True)[1]
33 |     df.drop(columns="place_with_parent_names", inplace=True)
34 | 
35 |     # Drop features with high null count
36 |     df.drop(columns=["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"], inplace=True)
37 | 
38 |     # Drop low- and high-cardinality categorical variables
39 |     df.drop(columns=["operation", "property_type", "currency", "properati_url"], inplace=True)
40 | 
41 |     # Drop leaky columns
42 |     df.drop(columns=["price", "price_aprox_local_currency", "price_per_m2"], inplace=True)
43 | 
44 |     # Drop columns with multicollinearity
45 |     # df.drop(columns=["surface_total_in_m2", "rooms"], inplace=True)
46 | 
47 |     return df
48 | 
49 | 
50 | test1.isnull().sum() / len(test1)  # check for highly null columns
51 | test1.select_dtypes("object").nunique()  # check for low- and high-cardinality categorical variables
52 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/ACFplot.py:
--------------------------------------------------------------------------------
1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
2 | import matplotlib.pyplot as plt
3 | 
4 | fig, ax = plt.subplots(figsize=(15, 6))
5 | plot_acf(y, ax=ax)
6 | plt.xlabel(<"xLabelvalue">)
7 | plt.ylabel(<"yLabelvalue">)
8 | plt.title(<"yourTitle">);
9 | 
10 | # Don't delete the code below 👇
11 | plt.savefig("images/3-5-7.png", dpi=150)
12 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/ARmodel.py:
--------------------------------------------------------------------------------
1 | from statsmodels.tsa.ar_model import AutoReg
2 | from sklearn.metrics import mean_absolute_error
3 | 
4 | # Use AR model to predict PM2.5 readings
5 | # Hyperparameter --> p
6 | p_params = range(1, 31)
7 | maes = []
8 | for p in p_params:
9 |     # Train model
10 |     model = AutoReg(y_train, lags=p).fit()
11 | 
12 |     # Generate in-sample predictions
13 |     y_pred = model.predict().dropna()
14 | 
15 |     # Calculate MAE
16 |     mae = mean_absolute_error(y_train.iloc[p:], y_pred)
17 |     maes.append(mae)
18 | 
19 | mae_series = pd.Series(maes, name="mae", index=p_params)
20 | mae_series.head()
21 | 
22 | # sample output
23 | 1    0.947888
24 | 2    0.933894
25 | 3    0.920850
26 | 4    0.920153
27 | 5    0.919519
28 | Name: mae, dtype: float64
29 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/Baseline.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import mean_absolute_error
2 | 
3 | y_train_mean = y_train.mean()
4 | y_pred_baseline = [y_train_mean] * len(y_train)
5 | mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
6 | 
7 | print("Mean P2 Reading:", y_train_mean)
8 | print("Baseline MAE:", mae_baseline)
9 | 
10 | # sample output
11 | Mean P2 Reading: 8.617582545265433
12 | Baseline MAE: 4.07658759405218
13 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/MongoDB.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | 
3 | # Connect to server
4 | client = MongoClient(host=<"hostName">, port=<portNumber>)
5 | 
6 | # Connect to database
7 | db = client[<"databaseName">]
8 | 
9 | # Get collection
10 | dar = db[<"collectionName">]
11 | 
--------------------------------------------------------------------------------
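Before aggregating or wrangling, it helps to look at one raw document from the collection. A short sketch, assuming the dar collection from MongoDB.py above and the pp PrettyPrinter instance shown further below:

result = dar.find_one({})  # fetch a single document from the collection
pp.pprint(result)          # nicely formatted view of its keys and values

/3. 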
Air Quality In Nairobi/Practice Notebooks/PACFplot.py: -------------------------------------------------------------------------------- 1 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 2 | import matplotlib.pyplot as plt 3 | 4 | fig, ax = plt.subplots(figsize=(15, 6)) 5 | plot_pacf(y, ax=ax) # -----> line showing difference from acf plot <----- 6 | plt.xlabel(<"xLabelvalue">) 7 | plt.ylabel(<"yLabelvalue">) 8 | plt.title(<"yourTitle">); 9 | 10 | # Don't delete the code below 👇 11 | plt.savefig("images/3-5-7.png", dpi=150) 12 | -------------------------------------------------------------------------------- /3. Air Quality In Nairobi/Practice Notebooks/PrettyPrinter.py: -------------------------------------------------------------------------------- 1 | from pprint import PrettyPrinter 2 | 3 | # Instantiate prettyprinter ----> for nicely formatted output 4 | pp = PrettyPrinter(indent=2) 5 | -------------------------------------------------------------------------------- /3. Air Quality In Nairobi/Practice Notebooks/README.md: -------------------------------------------------------------------------------- 1 | # 030-Air-Quality-In-Nairobi 2 | 3 | - Data wrangling with MongoDB 4 | - LinearRegression with time Series data 5 | - Autoregressive models 6 | - ARMA models and Hyperparameter tuning 7 | -------------------------------------------------------------------------------- /3. Air Quality In Nairobi/Practice Notebooks/aggregate().py: -------------------------------------------------------------------------------- 1 | # Determine which collection 2 | # has the most sensor readings 3 | # $ --> introduces sth new 4 | result = dar.aggregate( 5 | [ 6 | {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}} 7 | ] 8 | ) 9 | readings_per_site = list(result) 10 | readings_per_site 11 | 12 | # sample output 13 | [{'_id': 23, 'count': 60020}, {'_id': 11, 'count': 138412}] 14 | -------------------------------------------------------------------------------- /3. Air Quality In Nairobi/Practice Notebooks/communicate.py: -------------------------------------------------------------------------------- 1 | import plotly.express as px 2 | import pandas as pd 3 | 4 | # Put test and walk-forward validation values 5 | # in a dataframe and plot df 6 | df_pred_test = pd.DataFrame( 7 | {"y_test": y_test, "y_pred_wfv": y_pred_wfv} 8 | ) 9 | fig = px.line(df_pred_test, labels={"value": "PM2.5"}) 10 | fig.update_layout( 11 | title="Dar es Salaam, WFV Predictions", 12 | xaxis_title="Date", 13 | yaxis_title="PM2.5 Level", 14 | ) 15 | 16 | # Don't delete the code below 👇 17 | fig.write_image("images/3-5-18.png", scale=1, height=500, width=700) 18 | 19 | fig.show() 20 | -------------------------------------------------------------------------------- /3. Air Quality In Nairobi/Practice Notebooks/distinct().py: -------------------------------------------------------------------------------- 1 | # Determine no. of sites in collection 2 | sites = dar.distinct("metadata.site") # dar ---> variable holding collection 3 | sites 4 | 5 | # Sample output 6 | [11, 23] 7 | 8 | # count no. of docs at a prticular site 9 | # using count_documents() 10 | dar.count_documents({"metadata.site": 23}) 11 | 12 | # Sample output 13 | 60020 14 | -------------------------------------------------------------------------------- /3. 
Air Quality In Nairobi/Practice Notebooks/finalModel.py:
--------------------------------------------------------------------------------
1 | from statsmodels.tsa.ar_model import AutoReg
2 | from statsmodels.tsa.arima.model import ARIMA
3 | 
4 | mae_series  # inspect to locate best_p
5 | best_p = 28
6 | 
7 | # build and train model
8 | best_model = AutoReg(y_train, lags=best_p).fit()
9 | 
10 | # calculate training residuals for best_model
11 | y_train_resid = best_model.resid
12 | y_train_resid.name = "residuals"
13 | y_train_resid.head()
14 | 
15 | # sample output
16 | timestamp
17 | 2018-01-02 07:00:00+03:00    1.732488
18 | 2018-01-02 08:00:00+03:00   -0.381568
19 | 2018-01-02 09:00:00+03:00   -0.560971
20 | 2018-01-02 10:00:00+03:00   -2.215760
21 | 2018-01-02 11:00:00+03:00    0.006468
22 | Freq: H, Name: residuals, dtype: float64
23 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/libraries.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import time
3 | from pprint import PrettyPrinter
4 | import matplotlib.pyplot as plt
5 | import pandas as pd
6 | import plotly.express as px
7 | import seaborn as sns
8 | from pymongo import MongoClient
9 | import pytz
10 | from statsmodels.tsa.ar_model import AutoReg
11 | from sklearn.linear_model import LinearRegression
12 | from sklearn.metrics import mean_absolute_error
13 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
14 | from statsmodels.tsa.arima.model import ARIMA
15 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/rollingAvg.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | fig, ax = plt.subplots(figsize=(15, 6))
4 | y.rolling(168).mean().plot(ax=ax, xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average");
5 | # --> 168 == number of hours in a week
6 | 
7 | # Don't delete the code below 👇
8 | plt.savefig("images/3-5-6.png", dpi=150)
9 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/split.py:
--------------------------------------------------------------------------------
1 | # percentage ---> 90% (0.9), 80% (0.8) ...
2 | cutoff_test = int(len(y) * <percentage>)
3 | y_train = y.iloc[:cutoff_test]
4 | y_test = y.iloc[cutoff_test:]
5 | print("y_train shape:", y_train.shape)
6 | print("y_test shape:", y_test.shape)
7 | 
8 | # sample output
9 | y_train shape: (1533,)
10 | y_test shape: (171,)
11 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/timeSeriesplot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | fig, ax = plt.subplots(figsize=(15, 6))
4 | y.plot(xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels", ax=ax);
5 | 
6 | # Don't delete the code below 👇
7 | plt.savefig("images/3-5-5.png", dpi=150)
8 | 
--------------------------------------------------------------------------------
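Rather than hard-coding best_p after eyeballing mae_series, the best lag can be pulled out programmatically. A small sketch, assuming the mae_series built in ARmodel.py above:

best_p = mae_series.idxmin()   # lag with the lowest training MAE
print("best_p:", best_p, "| MAE:", mae_series.min())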
/3. Air Quality In Nairobi/Practice Notebooks/wfv.py:
--------------------------------------------------------------------------------
1 | from statsmodels.tsa.ar_model import AutoReg
2 | 
3 | # walk-forward validation of the model on the test data --> y_test
4 | # predictions stored in series: y_pred_wfv
5 | y_pred_wfv = pd.Series()
6 | history = y_train.copy()
7 | for i in range(len(y_test)):
8 |     model = AutoReg(history, lags=best_p).fit()
9 |     next_pred = model.forecast()  # next value after end of history
10 |     y_pred_wfv = y_pred_wfv.append(next_pred)  # Series.append (use pd.concat in newer pandas)
11 |     history = history.append(y_test[next_pred.index])
12 | 
13 | y_pred_wfv.name = "prediction"
14 | y_pred_wfv.index.name = "timestamp"
15 | y_pred_wfv.head()
16 | 
17 | # sample output
18 | timestamp
19 | 2018-03-06 00:00:00+03:00    8.056391
20 | 2018-03-06 01:00:00+03:00    8.681779
21 | 2018-03-06 02:00:00+03:00    6.268951
22 | 2018-03-06 03:00:00+03:00    6.303760
23 | 2018-03-06 04:00:00+03:00    7.171444
24 | Freq: H, Name: prediction, dtype: float64
25 | 
--------------------------------------------------------------------------------
/3. Air Quality In Nairobi/Practice Notebooks/wrangle().py:
--------------------------------------------------------------------------------
1 | # Wrangle function
2 | # Extract PM2.5 readings
3 | # from the collection site with
4 | # the most readings
5 | # Localize time
6 | # Remove outliers
7 | # Resample data to provide PM2.5 readings
8 | # for each hour
9 | # impute missing values
10 | # return series
11 | def wrangle(collection):
12 |     results = collection.find(
13 |         {"metadata.site": 11, "metadata.measurement": "P2"},
14 |         projection={"P2": 1, "timestamp": 1, "_id": 0},  # ---> limit results to only "P2" and timestamp
15 |     )
16 | 
17 |     df = pd.DataFrame(results).set_index("timestamp")
18 | 
19 |     # Localize time
20 |     df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")
21 | 
22 |     # Remove outliers
23 |     df = df[df["P2"] < 100]
24 | 
25 |     # Resample to 1-hour periods, forward-fill missing values
26 |     y = df["P2"].resample("1H").mean().fillna(method='ffill')
27 | 
28 |     return y
29 | 
30 | # Using wrangle()
31 | y = wrangle(dar)
32 | y.head()
33 | 
34 | # sample output
35 | timestamp
36 | 2018-01-01 03:00:00+03:00    9.456327
37 | 2018-01-01 04:00:00+03:00    9.400833
38 | 2018-01-01 05:00:00+03:00    9.331458
39 | 2018-01-01 06:00:00+03:00    9.528776
40 | 2018-01-01 07:00:00+03:00    8.861250
41 | Freq: H, Name: P2, dtype: float64
42 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/1) libraries.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | import seaborn as sns
6 | from category_encoders import OneHotEncoder
7 | from category_encoders import OrdinalEncoder
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.metrics import accuracy_score
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.pipeline import Pipeline, make_pipeline
12 | from sklearn.tree import DecisionTreeClassifier, plot_tree
13 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/2) connect.py:
--------------------------------------------------------------------------------
1 | %load_ext sql
2 | %sql sqlite:////home/jovyan/nepal.sqlite
3 | 
4 | # sample output
5 | 'Connected: @/home/jovyan/nepal.sqlite'
6 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/3) get_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT name
3 | FROM sqlite_schema
4 | WHERE type = 'table'
5 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/4) explore_tables.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | SELECT distinct(district_id)  -- gives unique values of column district_id
3 | FROM id_map                   -- name of table
4 | 
5 | 
6 | -- num of observations in table id_map
7 | -- where value of column district_id is 1
8 | %%sql
9 | SELECT count(*)
10 | FROM id_map
11 | WHERE district_id = 1
12 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/5) JOIN.py:
--------------------------------------------------------------------------------
1 | %%sql
2 | -- joining the tables on their building_id columns
3 | SELECT distinct(i.building_id) AS b_id,  -- building_id column of table i aliased as b_id
4 |        s.*,                              -- select all columns of table s
5 |        d.damage_grade                    -- select the damage_grade column of table d
6 | FROM id_map AS i
7 | JOIN building_structure AS s ON i.building_id = s.building_id
8 | JOIN building_damage AS d ON i.building_id = d.building_id
9 | WHERE district_id = 3
10 | LIMIT 5
11 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/6) wrangle().py:
--------------------------------------------------------------------------------
1 | def wrangle(db_path):
2 |     # Connect to database using connect method
3 |     conn = sqlite3.connect(db_path)
4 | 
5 |     # Construct query
6 |     query = """
7 |         SELECT distinct(i.building_id) AS b_id,
8 |                s.*,
9 |                d.damage_grade
10 |         FROM id_map AS i
11 |         JOIN building_structure AS s ON i.building_id = s.building_id
12 |         JOIN building_damage AS d ON i.building_id = d.building_id
13 |         WHERE district_id = 3
14 |     """
15 | 
16 |     # Read query results into DataFrame
17 |     df = pd.read_sql(query, conn, index_col="b_id")
18 | 
19 |     # Identify leaky columns
20 |     drop_cols = [col for col in df.columns if "post_eq" in col]
21 | 
22 |     # Create binary target
23 |     df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
24 |     df["severe_damage"] = (df["damage_grade"] > 3).astype(int)  # encode as 0's and 1's
25 | 
26 |     # Drop old target
27 |     drop_cols.append("damage_grade")
28 | 
29 |     # Drop multicollinearity column
30 |     drop_cols.append("count_floors_pre_eq")
31 | 
32 |     # Drop high-cardinality categorical features
33 |     drop_cols.append("building_id")
34 | 
35 |     # Drop columns
36 |     df.drop(columns=drop_cols, inplace=True)
37 | 
38 | 
39 |     return df
40 | 
41 | 
42 | # Using wrangle func
43 | df = wrangle("/home/jovyan/nepal.sqlite")
44 | 
--------------------------------------------------------------------------------
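The same SQL that the %%sql cells run can also be executed directly from pandas, which is what wrangle() does under the hood. A minimal sketch for previewing a table, reusing the database path and table names above:

import sqlite3
import pandas as pd

conn = sqlite3.connect("/home/jovyan/nepal.sqlite")
preview = pd.read_sql("SELECT * FROM id_map LIMIT 5", conn)  # quick look at one table
print(preview)

/4. 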
Earthquake Damage In Nepal/Practice Notebooks/7) barChart.py: -------------------------------------------------------------------------------- 1 | # create bar chart using 2 | # severe damage column which 3 | # contains two classes 4 | df["severe_damage"].value_counts(normalize=True).plot( 5 | kind="bar", xlabel="Severe Damage", ylabel="Relative Frequency", title="Class Balance" 6 | ); 7 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/8) boxplot.py: -------------------------------------------------------------------------------- 1 | # severe_damage: column with 2 groups 2 | # plinth_area_sq_ft: column: footprint size of building 3 | 4 | sns.boxplot(x="severe_damage", y="plinth_area_sq_ft", data=df) 5 | plt.xlabel("Severe Damage") 6 | plt.ylabel("Plinth Area [sq. ft.]") 7 | plt.title("Kavrepalanchok, Plinth Area vs Building Damage"); 8 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/9) pivot_table.py: -------------------------------------------------------------------------------- 1 | roof_pivot = pd.pivot_table( 2 | df, index="roof_type", values="severe_damage", aggfunc=np.mean # roof_type: column in table 3 | ).sort_values(by="severe_damage") 4 | roof_pivot 5 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/91) vertical_split.py: -------------------------------------------------------------------------------- 1 | X = df.drop(columns="severe_damage") # feature matrix: all columns apart from severe_damage 2 | y = df["severe_damage"] # target vector 3 | print("X shape:", X.shape) 4 | print("y shape:", y.shape) 5 | 6 | # sample output 7 | X shape: (76533, 11) 8 | y shape: (76533,) 9 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/92) horizontal_split.py: -------------------------------------------------------------------------------- 1 | X_train, X_val, y_train, y_val = train_test_split( 2 | X, y, test_size=0.2, random_state=42 3 | ) 4 | print("X_train shape:", X_train.shape) 5 | print("y_train shape:", y_train.shape) 6 | print("X_val shape:", X_val.shape) 7 | print("y_val shape:", y_val.shape) 8 | 9 | # sample output 10 | X_train shape: (61226, 11) 11 | y_train shape: (61226,) 12 | X_val shape: (15307, 11) 13 | y_val shape: (15307,) 14 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/93) baseline.py: -------------------------------------------------------------------------------- 1 | acc_baseline = y_train.value_counts(normalize=True).max() # normalize gives you the relative freq 2 | print("Baseline Accuracy:", round(acc_baseline, 2)) 3 | 4 | # sample output 5 | Baseline Accuracy: 0.55 6 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/94) log_reg.py: -------------------------------------------------------------------------------- 1 | model_lr = make_pipeline( 2 | OneHotEncoder(use_cat_names=True), 3 | LogisticRegression(max_iter=<1000-3000>) #max_iter: varies: suppresses the 'ConvergenceWarning' 4 | ) 5 | # Fit model to training data 6 | model_lr.fit(X_train, y_train) 7 | -------------------------------------------------------------------------------- /4. 
Earthquake Damage In Nepal/Practice Notebooks/95) accuracy_score.py: -------------------------------------------------------------------------------- 1 | lr_train_acc = accuracy_score(y_train, model_lr.predict(X_train)) 2 | lr_val_acc = model_lr.score(X_val, y_val) 3 | 4 | print("Logistic Regression, Training Accuracy Score:", lr_train_acc) 5 | print("Logistic Regression, Validation Accuracy Score:", lr_val_acc) 6 | 7 | # sample output 8 | Logistic Regression, Training Accuracy Score: 0.6515042628948486 9 | Logistic Regression, Validation Accuracy Score: 0.6536878552296335 10 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/96) decision_tree.py: -------------------------------------------------------------------------------- 1 | depth_hyperparams = range(1, 16) # for max_depth 2 | training_acc = [] 3 | validation_acc = [] 4 | for d in depth_hyperparams: 5 | model_dt = make_pipeline( 6 | OrdinalEncoder(), 7 | DecisionTreeClassifier(max_depth= d, random_state=42) 8 | ) 9 | # Fit model to training data 10 | model_dt.fit(X_train, y_train) 11 | # Calculate training accuracy score and append to `training_acc` 12 | training_acc.append(model_dt.score(X_train, y_train)) 13 | # Calculate validation accuracy score and append to `training_acc` 14 | validation_acc.append(model_dt.score(X_val, y_val)) 15 | 16 | print("Training Accuracy Scores:", training_acc[:6]) 17 | print("Validation Accuracy Scores:", validation_acc[:6]) 18 | 19 | 20 | # sample output 21 | Training Accuracy Scores: [0.6303041191650606, 0.6303041191650606, 0.642292490118577, 0.653529546271192, 0.6543951915852743, 0.6576617776761506] 22 | Validation Accuracy Scores: [0.6350035931273273, 0.6350035931273273, 0.6453909975828053, 0.6527732410008493, 0.6529039001763899, 0.6584569151368654] 23 | -------------------------------------------------------------------------------- /4. Earthquake Damage In Nepal/Practice Notebooks/97) validation_curve.py: -------------------------------------------------------------------------------- 1 | # Validation curve 2 | plt.plot(depth_hyperparams, training_acc, label="Training") 3 | plt.plot(depth_hyperparams, validation_acc, label="validation") 4 | plt.xlabel("Max Depth") 5 | plt.ylabel("Accuracy Score") 6 | plt.title("Validation Curve, Decision Tree Model") 7 | plt.legend(); 8 | 9 | 10 | # build & fit again 11 | final_model_dt = make_pipeline( 12 | OrdinalEncoder(), 13 | DecisionTreeClassifier(max_depth=10, random_state=42) 14 | ) 15 | # Fit model to training data 16 | final_model_dt.fit(X, y) #final_model_dt.fit(X_train, y_train) 17 | -------------------------------------------------------------------------------- /4. 
Earthquake Damage In Nepal/Practice Notebooks/98) tests.py:
--------------------------------------------------------------------------------
1 | # test type 1
2 | X_test = pd.read_csv("filePath.csv", index_col="b_id")
3 | y_test_pred = pd.Series(final_model_dt.predict(X_test))
4 | y_test_pred[:5]
5 | 
6 | # sample output
7 | 0    1
8 | 1    1
9 | 2    1
10 | 3    1
11 | 4    0
12 | dtype: int64
13 | 
14 | 
15 | # test type 2
16 | test_acc = model.score(X_test, y_test)
17 | print("Test Accuracy:", round(test_acc, 2))
18 | 
19 | # sample output
20 | Test Accuracy: 0.72
21 | 
22 | 
23 | # test type 3
24 | acc_train = accuracy_score(y_train, model_lr.predict(X_train))
25 | acc_test = model_lr.score(X_test, y_test)
26 | 
27 | print("LR Training Accuracy:", acc_train)
28 | print("LR Test Accuracy:", acc_test)
29 | 
30 | # sample output
31 | LR Training Accuracy: 0.717985042664646
32 | LR Test Accuracy: 0.7218817948211109
33 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/99) communicate.py:
--------------------------------------------------------------------------------
1 | # DECISION TREE
2 | features = X_train.columns
3 | importances = final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
4 | feat_imp = pd.Series(importances, index=features).sort_values()
5 | feat_imp.head()
6 | 
7 | # sample output
8 | plan_configuration       0.004189
9 | land_surface_condition   0.008599
10 | foundation_type          0.009967
11 | position                 0.011795
12 | ground_floor_type        0.013521
13 | dtype: float64
14 | 
15 | 
16 | # LOGISTIC REG
17 | features = model_lr.named_steps["onehotencoder"].get_feature_names()
18 | importances = model_lr.named_steps["logisticregression"].coef_[0]
19 | feat_imp = pd.Series(np.exp(importances), index=features).sort_values()
20 | feat_imp.head()
21 | 
22 | # sample output
23 | superstructure_Brick, cement mortar    0.345719
24 | foundation_type_RC                     0.364478
25 | roof_type_RCC/RB/RBC                   0.415979
26 | ground_floor_type_RC                   0.527756
27 | caste_household_Kumal                  0.543642
28 | dtype: float64
29 | 
30 | 
31 | 
32 | # horizontal bar chart
33 | feat_imp.plot(kind="barh")
34 | plt.xlabel("importance")
35 | plt.ylabel("Label")
36 | plt.title("Feature Importance");
37 | 
38 | 
39 | 
--------------------------------------------------------------------------------
/4. Earthquake Damage In Nepal/Practice Notebooks/991) others.py:
--------------------------------------------------------------------------------
1 | # Create DF called 'damage_by_vdcmun'
2 | # group DF by "vdcmun_id"
3 | # calculating mean of the "severe_damage" column.
4 | # Be sure to sort from highest to lowest proportion
5 | damage_by_vdcmun = (
6 |     df.groupby("vdcmun_id")["severe_damage"].mean().sort_values(ascending=False)
7 | ).to_frame()
8 | damage_by_vdcmun
9 | 
10 | 
11 | # Line plot
12 | plt.plot(damage_by_vdcmun.values, color="blue")
13 | plt.xticks(range(len(damage_by_vdcmun)), labels=damage_by_vdcmun.index)
14 | plt.yticks(np.arange(0.0, 1.1, 0.2))
15 | plt.xlabel("Mun ID")
16 | plt.ylabel("% Households")
17 | plt.title("Damage by Municipality");
18 | 
--------------------------------------------------------------------------------
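plot_tree is imported in 1) libraries.py but not used in the snippets above. A sketch of how the fitted decision tree could be visualized, assuming the final_model_dt pipeline from 97) validation_curve.py:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, ax = plt.subplots(figsize=(25, 12))
plot_tree(
    decision_tree=final_model_dt.named_steps["decisiontreeclassifier"],
    feature_names=list(X_train.columns),
    filled=True,      # color nodes by majority class
    max_depth=2,      # draw only the top of the tree so it stays readable
    ax=ax,
);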
/4. Earthquake Damage In Nepal/Practice Notebooks/README.md:
--------------------------------------------------------------------------------
1 | # 040-Earthquake-Damage-in-Nepal
2 | 
3 | - sqlite
4 | - logistic-regression
5 | - decision-tree
6 | - demographics
7 | - Ethical Data Science
8 | 
--------------------------------------------------------------------------------
/5. Bankruptcy In Poland/Practice Notebooks/GridSearchCV.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | # Range of hyperparameters
4 | params = {
5 |     "simpleimputer__strategy": ["mean", "median"],
6 |     "randomforestclassifier__n_estimators": range(25, 100, 25),
7 |     "randomforestclassifier__max_depth": range(10, 50, 10)
8 | }
9 | 
10 | # Using `GridSearchCV`
11 | model = GridSearchCV(
12 |     clf,
13 |     param_grid=params,
14 |     cv=5,
15 |     n_jobs=-1,
16 |     verbose=1
17 | )
18 | 
19 | # Fit your model
20 | model.fit(X_train_over, y_train_over)
21 | 
22 | # cross-validation results
23 | cv_results = pd.DataFrame(model.cv_results_)
24 | cv_results.head(5)
25 | 
--------------------------------------------------------------------------------
/5. Bankruptcy In Poland/Practice Notebooks/README.md:
--------------------------------------------------------------------------------
1 | # 050-bankruptcy-in-poland
2 | ## Concepts learnt
3 | - Working with JSON
4 | - Imbalanced data
5 | - Random forest
6 | - Gradient boosting
7 | - Linux command line
8 | - Creating python modules
9 | - Importing functions from modules
10 | - Saving and loading a model
11 | 
--------------------------------------------------------------------------------
/5. Bankruptcy In Poland/Practice Notebooks/acc_score.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | acc_train = model.score(X_train, y_train)
4 | acc_test = model.score(X_test, y_test)
5 | 
6 | print("Model Training Accuracy:", round(acc_train, 4))
7 | print("Model Test Accuracy:", round(acc_test, 4))
8 | 
9 | # Sample output
10 | Model Training Accuracy: 1.0
11 | Model Test Accuracy: 0.9764
12 | 
--------------------------------------------------------------------------------
/5. Bankruptcy In Poland/Practice Notebooks/bar.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | df["bankrupt"].value_counts(normalize=True).plot(
4 |     kind="bar",
5 |     xlabel="Bankrupt",
6 |     ylabel="Frequency",
7 |     title="Class Balance"
8 | );
9 | 
--------------------------------------------------------------------------------
/5. Bankruptcy In Poland/Practice Notebooks/barh.py:
--------------------------------------------------------------------------------
1 | import libraries
2 | 
3 | # Get feature names from training data
4 | features = X_train_over.columns
5 | 
6 | # Extract importances from model
7 | importances = model.best_estimator_.named_steps["randomforestclassifier"].feature_importances_
8 | 
9 | # Create a series with feature names and importances
10 | feat_imp = pd.Series(importances, index=features).sort_values()
11 | 
12 | # Plot 10 most important features
13 | feat_imp.tail(10).plot(kind="barh")
14 | plt.xlabel("...")
15 | plt.ylabel("...")
16 | plt.title("...");
17 | 
--------------------------------------------------------------------------------
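Alongside best_params_, the fitted grid search also exposes the best cross-validation score and the full results table. A short sketch, assuming the model fitted in GridSearchCV.py above:

print("Best CV accuracy:", model.best_score_)
print("Best hyperparameters:", model.best_params_)

# cv_results_ as a DataFrame, best parameter combination first
cv_results = pd.DataFrame(model.cv_results_).sort_values("rank_test_score")
cv_results[["params", "mean_test_score", "std_test_score"]].head()

/5. 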
Bankruptcy In Poland/Practice Notebooks/best_params.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Method 1 4 | best_params = model.best_params_ 5 | print(best_params) 6 | 7 | # Method 2 8 | model.predict(X_train_over) 9 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/classif_reports.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | class_report = classification_report(y_test, model.predict(X_test)) 4 | print(class_report) 5 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/clf_cv.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # # classifier 4 | clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42)) 5 | 6 | # cross validation 7 | cv_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1) 8 | print(cv_scores) 9 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/conf_matrix.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | ConfusionMatrixDisplay.from_estimator(model, X_test, y_test); 4 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/import.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Compressed file --> dict 4 | with gzip.open("", "r") as f: 5 | taiwan_data = json.load(f) 6 | 7 | # Extracting keys from a dict 8 | taiwan_data_keys = taiwan_data.keys() 9 | print(taiwan_data_keys) 10 | 11 | # Sample output 12 | dict_keys(['schema', 'metadata', 'observations']) 13 | 14 | # Counting number of observations 15 | len(taiwan_data["observations"]) 16 | 17 | # Length / no. of each observation 18 | len(taiwan_data["observations"][0]) 19 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/interactive_dash.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | def make_cnf_matrix(threshold): 4 | y_pred_proba = model.predict_proba(X_test)[:, -1] 5 | y_pred = y_pred_proba > threshold 6 | conf_matrix = confusion_matrix(y_test, y_pred) 7 | tn, fp, fn, tp = conf_matrix.ravel() 8 | tn, fp, fn, tp 9 | print(f"Profit: €{tp * 100_000_000}") 10 | print(f"Loses: €{tp * 250_000_000}") 11 | ConfusionMatrixDisplay.from_predictions(y_test, y_pred, colorbar=False) 12 | thresh_widget = widgets.FloatSlider(min=0, max=1, value=0.5, step=0.05) 13 | 14 | interact(make_cnf_matrix, threshold=thresh_widget); 15 | -------------------------------------------------------------------------------- /5. 
Bankruptcy In Poland/Practice Notebooks/libraries.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import ClassifierMixin 2 | from sklearn.pipeline import Pipeline 3 | import gzip 4 | import json 5 | import pickle 6 | 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | import pandas as pd 10 | import seaborn as sns 11 | import wqet_grader 12 | from imblearn.over_sampling import RandomOverSampler 13 | from imblearn.under_sampling import RandomUnderSampler 14 | from sklearn.impute import SimpleImputer 15 | from sklearn.metrics import ( 16 | ConfusionMatrixDisplay, 17 | classification_report, 18 | confusion_matrix, 19 | ) 20 | from sklearn.pipeline import make_pipeline 21 | from sklearn.tree import DecisionTreeClassifier 22 | from sklearn.ensemble import RandomForestClassifier 23 | from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split 24 | import ipywidgets as widgets 25 | from ipywidgets import interact 26 | from sklearn.ensemble import GradientBoostingClassifier 27 | from teaching_tools.widgets import ConfusionMatrixWidget 28 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/naNs.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | nans_by_col = df.isna().sum() 4 | print("nans_by_col shape:", nans_by_col.shape) 5 | nans_by_col.head() 6 | 7 | # Sample output 8 | nans_by_col shape: (96,) 9 | bankrupt 0 10 | feat_1 0 11 | feat_2 0 12 | feat_3 0 13 | feat_4 0 14 | dtype: int64 15 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/resampling.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | over_sampler = RandomOverSampler(random_state=42) 4 | X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train) 5 | print("X_train_over shape:", X_train_over.shape) 6 | X_train_over.head() 7 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/save_and_load.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # save model to `Destination` 4 | with open("", "wb") as f: 5 | pickle.dump(model, f) 6 | 7 | # 8 | # Load model from `Destination`` 9 | with open("", "rb") as f: 10 | loaded_model = pickle.load(f) 11 | print(loaded_model) 12 | -------------------------------------------------------------------------------- /5. Bankruptcy In Poland/Practice Notebooks/splits.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Feature matrix and Target vector 4 | target = "bankrupt" 5 | X = df.drop(columns="bankrupt") 6 | y = df[target] 7 | 8 | 9 | # Training and test split 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, test_size=0.2, random_state=42 12 | ) 13 | -------------------------------------------------------------------------------- /5. 
Bankruptcy In Poland/Practice Notebooks/wrangle.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | 3 | # Wrangle function 4 | def wrangle(filePath): 5 | # Open compressed file, load to dict 6 | with gzip.open(filePath, "r") as f: 7 | data = json.load(f) 8 | 9 | # Dictionary --> DataFrame, set index 10 | df = pd.DataFrame().from_dict(data["observations"]).set_index("id") 11 | 12 | return df 13 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/1_import.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("") 2 | print("df shape:", df.shape) 3 | df.head() 4 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/2_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | 4 | # 1 5 | # Percentage of respondents in df that are business owners, 6 | # assign result to the variable pct_biz_owners. 7 | # Review documentation regarding "HBUS" column 8 | # https://sda.berkeley.edu/sdaweb/docs/scfcomb2019/DOC/hcbkfx0.htm 9 | 10 | pct_biz_owners = sum(df["HBUS"]) / (sum(df["HBUS"] == 0) + sum(df["HBUS"])) 11 | print("% of business owners in df:", pct_biz_owners) 12 | 13 | # 2 14 | # DataFrame df_inccat showing normalized frequency 15 | # for income categories for business owners and non-business owners 16 | 17 | inccat_dict = { 18 | 1: "0-20", 19 | 2: "21-39.9", 20 | 3: "40-59.9", 21 | 4: "60-79.9", 22 | 5: "80-89.9", 23 | 6: "90-100", 24 | } 25 | 26 | df_inccat = ( 27 | df["INCCAT"] 28 | .replace(inccat_dict) 29 | .groupby(df["HBUS"]) 30 | .value_counts(normalize=True) 31 | .rename("frequency") 32 | .to_frame() 33 | .reset_index() 34 | ) 35 | 36 | df_inccat 37 | 38 | # 3 39 | # Seaborn, create a side-by-side bar chart of df_inccat 40 | 41 | sns.barplot( 42 | x="INCCAT", 43 | y="frequency", 44 | hue="HBUS", 45 | data=df_inccat, 46 | order=inccat_dict.values() 47 | ) 48 | plt.xlabel("") 49 | plt.ylabel("") 50 | plt.title(""); 51 | 52 | # 4 53 | # create a scatter plot that shows "HOUSES" vs. "DEBT" 54 | 55 | sns.scatterplot(x=df["DEBT"] / 1e6, y=df["HOUSES"] / 1e6, palette="deep") 56 | plt.xlabel("Household Debt") 57 | plt.ylabel("Home Value") 58 | plt.title("Home Value vs. Household Debt"); 59 | 60 | # 5 61 | # New DataFrame df_small_biz containing 62 | # only business owners whose income is below $500,000 63 | 64 | mask = (df["HBUS"]) & (df["INCOME"] < 500_000) 65 | df_small_biz = df[mask] 66 | -------------------------------------------------------------------------------- /6. 
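The `df_inccat` construction in `2_explore.py` above chains several pandas methods. A tiny, self-contained example of the same normalized-frequency-by-group pattern follows; the eight-row frame is invented and only reuses the `HBUS`/`INCCAT` column names for continuity with the snippet above.

import pandas as pd

# Invented responses: HBUS 1 = business owner, 0 = not
toy = pd.DataFrame(
    {
        "HBUS": [0, 0, 0, 1, 1, 1, 1, 0],
        "INCCAT": ["0-20", "21-39.9", "0-20", "90-100",
                   "90-100", "60-79.9", "90-100", "40-59.9"],
    }
)

# Relative frequency of each income category within each ownership group
freq = (
    toy["INCCAT"]
    .groupby(toy["HBUS"])
    .value_counts(normalize=True)
    .rename("frequency")
    .to_frame()
    .reset_index()
)
print(freq)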
Consumer Finances In USA/Practice Notebooks/3_explore.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | 5 | # 6 6 | # Histogram from the "AGE" column 7 | # in df_small_biz with 10 bins 8 | df_small_biz["AGE"].hist(bins=10) 9 | plt.xlabel("Your x_Label") 10 | plt.ylabel("Your y_Label") 11 | plt.title("Your Title"); 12 | 13 | # 7 14 | # Variance for all the features in df_small_biz, 15 | # create Series top_ten_var with 10 features with largest variance 16 | top_ten_var = df_small_biz.var().sort_values().tail(10) 17 | top_ten_var 18 | 19 | # 8 20 | # trimmed variance for the features in df_small_biz 21 | # not include the top and bottom 10% of observations 22 | top_ten_trim_var = df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10) 23 | top_ten_trim_var 24 | 25 | # 9 26 | # create a horizontal bar chart of top_ten_trim_var 27 | fig = px.bar( 28 | x=top_ten_trim_var, 29 | y=top_ten_trim_var.index, 30 | title="High Var Feat" 31 | ) 32 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 33 | 34 | # 10 35 | # Create list: high_var_cols, 36 | # with the column names of the five features 37 | # with the highest trimmed variance 38 | high_var_cols = top_ten_trim_var.tail(5).index.to_list() 39 | high_var_cols 40 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/4_split.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | 6 | # Feature matrix X containing five columns in high_var_cols 7 | X = df_small_biz[high_var_cols] 8 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/6_communicate.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | import model 7 | 8 | # 16 9 | # DataFrame xgb containing mean values 10 | # of the features in X for the 3 clusters 11 | # in your final_model 12 | labels = final_model.named_steps["kmeans"].labels_ 13 | xgb = X.groupby(labels).mean() 14 | xgb 15 | 16 | # 17 17 | # create side-by-side bar chart from xgb 18 | # showing mean of the features in X 19 | # for each of the clusters in your final_model 20 | fig = px.bar( 21 | xgb, 22 | barmode="group", 23 | title="Your Title" 24 | ) 25 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 26 | 27 | # 18 28 | # Create a PCA transformer, 29 | # reduce the dimensionality of X to 2, 30 | # and then put the transformed data into a DataFrame 31 | pca = PCA(n_components=2, random_state=42) 32 | 33 | X_t = pca.fit_transform(X) 34 | 35 | X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"]) 36 | 37 | # 19 38 | # create a scatter plot of X_pca using seaborn 39 | fig = px.scatter( 40 | data_frame=X_pca, 41 | x="PC1", 42 | y="PC2", 43 | color=labels.astype(str), 44 | title="PCA Representation of Clusters" 45 | ) 46 | fig.update_layout(xaxis_title="PC1", yaxis_title="PC2") 47 | -------------------------------------------------------------------------------- /6. 
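Step 18 of `6_communicate.py` above reduces `X` to two principal components. Here is a compact sketch of that transform on random data, with standardization in front; the random matrix and the `feat_*` names are placeholders for the five high-variance SCF features.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Random stand-in for the five-column feature matrix X
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"feat_{i}" for i in range(5)])

# Standardize, then project onto the first two principal components
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2, random_state=42)
X_t = pca.fit_transform(X_scaled)

X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"], index=X.index)
print(X_pca.head())
print("explained variance ratio:", pca.explained_variance_ratio_.round(3))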
Consumer Finances In USA/Practice Notebooks/README.md: -------------------------------------------------------------------------------- 1 | # 060-consumer-finance-in-usa 2 | ## Unsupervised learning, specifically clustering 3 | 4 | - Side-by-side bar chart 5 | - K-means clustering model 6 | - Clustering-2-features vs -multiple-features 7 | - Feature selection based on variance 8 | - Principal component analysis (PCA) 9 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/libraries.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import plotly.express as px 3 | import pandas as pd 4 | import seaborn as sns 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import silhouette_score 7 | from teaching_tools.widgets import ClusterWidget, SCFClusterWidget 8 | from scipy.stats.mstats import trimmed_var 9 | from sklearn.decomposition import PCA 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | -------------------------------------------------------------------------------- /6. Consumer Finances In USA/Practice Notebooks/model.py: -------------------------------------------------------------------------------- 1 | import libraries 2 | import 1_import 3 | import 2_explore 4 | import 3_explore 5 | import 4_split 6 | 7 | # 12 8 | # Iteratively build and train a K-Means 9 | # model where n_clusters ranges [2, 12] 10 | 11 | n_clusters = range(2, 13) 12 | inertia_errors = [] 13 | silhouette_scores = [] 14 | 15 | # Use for loop 16 | for k in n_clusters: 17 | # Build 18 | model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42)) 19 | # Train 20 | model.fit(X) 21 | # Calculate inertia 22 | inertia_errors.append(model.named_steps["kmeans"].inertia_) 23 | # Calculate silhouette score 24 | silhouette_scores.append( 25 | silhouette_score(X, model.named_steps["kmeans"].labels_) 26 | ) 27 | 28 | print("Inertia:", inertia_errors[:10]) 29 | print() 30 | print("Silhouette Scores:", silhouette_scores[:3]) 31 | 32 | # 13 33 | # Line plot showing values of 34 | # inertia_errors as a function of n_clusters 35 | 36 | fig = px.line( 37 | x=n_clusters, y=inertia_errors, title="Your Title" 38 | ) 39 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 40 | 41 | # 14 42 | # Line plot showing values of 43 | # silhouette_scores as a function of n_clusters 44 | 45 | fig = px.line( 46 | x=n_clusters, y=silhouette_scores, title="Your Title" 47 | ) 48 | fig.update_layout(xaxis_title="Your x_label", yaxis_title="Your y_label") 49 | 50 | # 15 51 | # Build and train a new k-means model 52 | # n_clusters: 3 53 | # random state: 42 54 | 55 | final_model = make_pipeline( 56 | StandardScaler(), 57 | KMeans(n_clusters=3, random_state=42) 58 | ) 59 | final_model.fit(X) 60 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/README.md: -------------------------------------------------------------------------------- 1 | ## 070-ds-admissions-in-wqu 2 | 3 | ### Contents... 4 | > EDA. 5 | 6 | > ETL. 7 | 8 | > Chi-Square test. 9 | 10 | > Interactive dashboard. 
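Looking back at the cluster-selection loop in the consumer-finances `model.py` above, here is a runnable recap: the same pipeline and metrics applied to synthetic blob data, where the best k is 3 by construction. The data and the k range are invented for illustration only.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic data with an obvious three-cluster structure
X, _ = make_blobs(n_samples=300, centers=3, random_state=42)

n_clusters = range(2, 8)
inertia_errors = []
silhouette_scores = []

for k in n_clusters:
    # Build and train a scaled K-Means pipeline for each candidate k
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, n_init=10, random_state=42))
    model.fit(X)
    kmeans = model.named_steps["kmeans"]
    inertia_errors.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))

for k, inertia, silh in zip(n_clusters, inertia_errors, silhouette_scores):
    print(f"k={k}  inertia={inertia:8.1f}  silhouette={silh:.3f}")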
11 | 12 | 13 | ![image](https://user-images.githubusercontent.com/99328720/189812167-668064f1-7ee3-4a5c-9ae7-638101e5e9f9.png) 14 | 15 | 16 | 17 | ![image](https://user-images.githubusercontent.com/99328720/189812222-a33a9bee-42cf-481e-a3d1-047cb69859e8.png) 18 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/aggregate.py: -------------------------------------------------------------------------------- 1 | """ Using the aggregate method() """ 2 | 3 | import imports 4 | 5 | 6 | # aggregate by nationality 7 | result = .aggregate( 8 | [ 9 | { 10 | "$group": {"_id": "$countryISO2", "count": {"$count": {}}} 11 | } 12 | ] 13 | ) 14 | 15 | 16 | # aggregate by sign-up 17 | result = .aggregate( 18 | [ 19 | { 20 | "$match": {"admissionsQuiz": "incomplete"} 21 | }, 22 | { 23 | "$group": { 24 | "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}}, 25 | "count": {"$sum": 1} 26 | } 27 | } 28 | ] 29 | ) 30 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/choropleth_map.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import load 5 | 6 | 7 | # `build_nat_choropleth` function 8 | ["count_pct"] = (["count"] / ["count"].sum()) * 100 9 | 10 | 11 | def build_nat_choropleth(): 12 | fig = px.choropleth( 13 | data_frame= , 14 | locations="country_iso3", 15 | color="count_pct", 16 | projection="natural earth", 17 | color_continuous_scale=px.colors.sequential.Oranges, 18 | title="Title" 19 | ) 20 | return fig 21 | 22 | # Display image 23 | nat_fig = build_nat_choropleth() 24 | nat_fig.write_image("images/7-5-4.png", scale=1, height=500, width=700) 25 | 26 | nat_fig.show() 27 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/connect.py: -------------------------------------------------------------------------------- 1 | """ Connecting to the Database """ 2 | 3 | import imports 4 | 5 | 6 | # Connect to database 7 | # Access a certain collection 8 | 9 | # Create a Mongo-`client` 10 | client = MongoClient(host="localhost", port=) 11 | 12 | # Create a database: `db` 13 | db = client["wqu-abtest"] 14 | 15 | # Find your collection: `""` 16 | mscfe_app = db[""] 17 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/contingency_bar.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import crosstab 5 | 6 | 7 | # `build_contingency_bar` function 8 | def build_contingency_bar(): 9 | # side-by-side bar chart 10 | fig = px.bar( 11 | data_frame=data, 12 | barmode="group", 13 | title="TITLE" 14 | ) 15 | # Set axis labels 16 | fig.update_layout(xaxis_title="XTITLE", yaxis_title="YTITLE") 17 | return fig 18 | 19 | # Display 20 | cb_fig = build_contingency_bar() 21 | cb_fig.write_image("images/7-5-16.png", scale=1, height=500, width=700) 22 | 23 | cb_fig.show() 24 | -------------------------------------------------------------------------------- /7. 
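`contingency_bar.py` above plots a `data` frame that is built elsewhere (in `crosstab.py`). The sketch below builds such a crosstab from an invented set of applicant records and draws the grouped bar chart; the group and quiz labels mirror the experiment's wording, but the counts are made up.

import pandas as pd
import plotly.express as px

# Invented experiment results: one row per applicant
df = pd.DataFrame(
    {
        "group": ["email (treatment)"] * 4 + ["no email (control)"] * 4,
        "admissionsQuiz": ["complete", "complete", "incomplete", "complete",
                           "incomplete", "complete", "incomplete", "incomplete"],
    }
)

# Counts of quiz outcome per experimental group
data = pd.crosstab(index=df["group"], columns=df["admissionsQuiz"], normalize=False)
print(data)

# Side-by-side bars: one bar per quiz outcome within each group
fig = px.bar(data_frame=data, barmode="group", title="Quiz Completion by Group")
fig.update_layout(xaxis_title="Group", yaxis_title="Count")
# fig.show()  # uncomment to render the figure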
AB Testing at WorldQuant University/Practice Notebooks/contingency_table.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | # contingency table 7 | contingency_table = Table2x2(data.values) 8 | 9 | # chi-square test 10 | chi_square_test = contingency_table.test_nominal_association() 11 | 12 | # odds ratio 13 | odds_ratio = contingency_table.oddsratio.round(1) 14 | 15 | # summary... 16 | summary = contingency_table.summary() 17 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/country_converter.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import load 4 | 5 | # Instantiate `CountryConverter` 6 | cc = CountryConverter() 7 | 8 | # Create new columns ... full country names 9 | ["country_name"] = cc.convert( 10 | ["country_iso2"], to="name_short" 11 | ) 12 | 13 | # ... three letter abbv country names 14 | ["country_iso3"] = cc.convert( 15 | ["country_iso2"], to="ISO3" 16 | ) 17 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/crosstab.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | data = pd.crosstab( 7 | index=["group"], 8 | columns=["admissionsQuiz"], 9 | normalize=False 10 | ) 11 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/imports.py: -------------------------------------------------------------------------------- 1 | """ Module containing all the needed libraries """ 2 | 3 | 4 | from statsmodels.stats.contingency_tables import Table2x2 5 | from statsmodels.stats.power import GofChisquarePower 6 | from teaching_tools.ab_test.experiment import Experiment 7 | from country_converter import CountryConverter 8 | from pymongo.collection import Collection 9 | from pymongo import MongoClient 10 | from pprint import PrettyPrinter 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import numpy as np 14 | import random 15 | import math 16 | import scipy 17 | import plotly.express as px 18 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/load.py: -------------------------------------------------------------------------------- 1 | """ Loading into a data frame """ 2 | 3 | import aggregate 4 | 5 | # aggregated by nationality 6 | = pd.DataFrame(result).rename( 7 | {"_id": "country_iso2"}, axis="columns").sort_values("count") 8 | 9 | 10 | 11 | # aggregated by sign up 12 | = ( 13 | pd.DataFrame(result) 14 | .rename({"_id": "date", "count": "new_users"}, axis=1) 15 | .set_index("date") 16 | .sort_index() 17 | .squeeze() 18 | ) 19 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/mongo_instance.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | from our_mongo_class import MongoRepository 5 | 6 | 7 | # An instance of class MongoRepository 8 | repo = MongoRepository() 9 | -------------------------------------------------------------------------------- /7. 
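`contingency_table.py` above runs its test on a `data` frame defined in the notebook. For a self-contained illustration, the same three calls are shown here on an invented 2x2 array of counts (rows: control/treatment, columns: quiz incomplete/complete); the numbers are not from the course data.

import numpy as np
from statsmodels.stats.contingency_tables import Table2x2

# Invented counts: rows = no email (control) / email (treatment),
# columns = quiz incomplete / quiz complete
counts = np.array([[541, 53],
                   [512, 82]])
contingency_table = Table2x2(counts)

# Chi-square test of independence between group and quiz outcome
chi_square_test = contingency_table.test_nominal_association()
print("p-value:", chi_square_test.pvalue)

# Odds ratio and the full summary table
print("odds ratio:", round(contingency_table.oddsratio, 1))
print(contingency_table.summary())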
AB Testing at WorldQuant University/Practice Notebooks/our_mongo_class.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | class MongoRepository: 7 | """Repository for interacting with MongoDB database. 8 | 9 | Params 10 | ---------- 11 | client : `pymongo.MongoClient` 12 | Default, `MongoClient(host='localhost', port=)`. 13 | db : str 14 | Default, `''`. 15 | collection : str 16 | Default, `'`. 17 | 18 | Attributes 19 | ---------- 20 | collection : pymongo.collection.Collection 21 | All data will be extracted from and loaded to this collection. 22 | """ 23 | 24 | # `__init__` method 25 | def __init__( 26 | self, 27 | client=MongoClient(host="localhost", port=), 28 | db="''", 29 | collection="`'" 30 | ): 31 | self.collection = client[db][collection] 32 | 33 | # `find_by_date` method 34 | def find_by_date(self, date_string): 35 | 36 | # Convert `date_string` to datetime object 37 | start = pd.to_datetime(date_string, format="%Y-%m-%d") 38 | 39 | # Offset `start` by 1 day 40 | end = start + pd.DateOffset(days=1) 41 | 42 | # Create PyMongo query for no-quiz applicants b/t `start` and `end` 43 | query = {"createdAt": {"$gte": start, "$lt": end}, "admissionsQuiz": "incomplete"} 44 | 45 | # Query collection, get result 46 | result = self.collection.find(query) 47 | 48 | # Convert `result` to list 49 | observations = list(result) 50 | 51 | # REMOVE} 52 | return observations 53 | 54 | 55 | # `update_applicants` method 56 | def update_applicants(self, observations_assigned): 57 | n = 0 58 | n_modified = 0 59 | 60 | for doc in observations_assigned: 61 | result = self.collection.update_one( 62 | filter={"_id": doc["_id"]}, 63 | update={"$set": doc} 64 | ) 65 | n += result.matched_count 66 | n_modified += result.modified_count 67 | transaction_result = {"n": n, "nModified": n_modified} 68 | return transaction_result 69 | 70 | 71 | # `assign_to_groups` method 72 | def assign_to_groups(self, date_string): 73 | 74 | # get observations 75 | observations = self.find_by_date(date_string) 76 | 77 | # Shuffle `observations` 78 | random.seed(42) 79 | random.shuffle(observations) 80 | 81 | # Get index position of item at observations halfway point 82 | idx = len(observations) // 2 83 | 84 | # Assign first half of observations to control group 85 | for doc in observations[:idx]: 86 | doc["inExperiment"] = True 87 | doc["group"] = "no email (control)" 88 | 89 | # Assign second half of observations to treatment group 90 | for doc in observations[idx:]: 91 | doc["inExperiment"] = True 92 | doc["group"] = "email (treatment)" 93 | 94 | # Update collections 95 | result = self.update_applicants(observations) 96 | return result 97 | 98 | # `find_exp_observations` method 99 | def find_exp_observations(self): 100 | result = self.collection.find({"inExperiment": True}) 101 | return list(result) 102 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/probability.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import statistical_summary 4 | 5 | 6 | prob_65_or_fewer = scipy.stats.norm.cdf( 7 | group_size * 2, 8 | loc=sum_mean, 9 | scale=sum_std 10 | ) 11 | prob_65_or_greater = 1 - prob_65_or_fewer 12 | -------------------------------------------------------------------------------- /7. 
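`assign_to_groups` above needs a live MongoDB collection, but its core randomization can be shown on plain dictionaries. The sketch below reproduces the shuffle-and-halve logic with invented applicant documents; only the group labels are carried over from the class.

import random

# Invented applicant documents standing in for the MongoDB query result
observations = [{"_id": i, "admissionsQuiz": "incomplete"} for i in range(6)]

# Shuffle reproducibly, then split at the halfway point
random.seed(42)
random.shuffle(observations)
idx = len(observations) // 2

# First half -> control, second half -> treatment
for doc in observations[:idx]:
    doc["inExperiment"] = True
    doc["group"] = "no email (control)"
for doc in observations[idx:]:
    doc["inExperiment"] = True
    doc["group"] = "email (treatment)"

for doc in observations:
    print(doc)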
AB Testing at WorldQuant University/Practice Notebooks/run_exp.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import connect 5 | import mongo_instance 6 | 7 | 8 | exp = Experiment(repo=client, db="yourDatabase", collection="yourCollection") 9 | exp.reset_experiment() 10 | result = exp.run_experiment(days=exp_days, assignment=True) 11 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/statistic_power.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | 5 | 6 | chi_square_power = GofChisquarePower() 7 | group_size = math.ceil(chi_square_power.solve_power( 8 | effect_size=0.5, # medium --> 0.5; small --> 0.2; large --> 0.8 9 | alpha=0.05, 10 | power=0.8 11 | )) 12 | -------------------------------------------------------------------------------- /7. AB Testing at WorldQuant University/Practice Notebooks/statistical_summary.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import imports 4 | import load 5 | import aggregate 6 | 7 | 8 | mean = .describe()["mean"] 9 | std = .describe()["std"] 10 | 11 | 12 | # sum... 13 | exp_days = 14 | sum_mean = mean * exp_days 15 | sum_std = std * math.sqrt(exp_days) 16 | -------------------------------------------------------------------------------- /8. Volatility Forecasting In India/Practice Notebooks/data.py: -------------------------------------------------------------------------------- 1 | """This is for all the code used to interact with the AlphaVantage API 2 | and the SQLite database. Remember that the API relies on a key that is 3 | stored in your `.env` file and imported via the `config` module. 4 | """ 5 | 6 | import sqlite3 7 | 8 | import pandas as pd 9 | import requests 10 | from config import settings 11 | 12 | 13 | class AlphaVantageAPI: 14 | def __init__(self,api_key=settings.alpha_api_key): 15 | self.__api_key=api_key 16 | 17 | def get_daily(self, ticker,output_size="full"): 18 | 19 | """Get daily time series of an equity from AlphaVantage API. 20 | 21 | Parameters 22 | ---------- 23 | ticker : str 24 | The ticker symbol of the equity. 25 | output_size : str, optional 26 | Number of observations to retrieve. "compact" returns the 27 | latest 100 observations. "full" returns all observations for 28 | equity. By default "full". 29 | 30 | Returns 31 | ------- 32 | pd.DataFrame 33 | Columns are 'open', 'high', 'low', 'close', and 'volume'. 34 | All columns are numeric. 35 | """ 36 | # Create URL (8.1.5) 37 | url = ( 38 | "https://learn-api.wqu.edu/1/data-services/alpha-vantage/query?" 39 | "function=TIME_SERIES_DAILY&" 40 | f"symbol={ticker}&" 41 | f"outputsize={output_size}&" 42 | f"datatype=json&" 43 | f"apikey={settings.alpha_api_key}" 44 | ) 45 | # Send request to API (8.1.6) 46 | response = requests.get(url=url) 47 | # Extract JSON data from response (8.1.10) 48 | response_data = response.json() 49 | 50 | if "Time Series (Daily)" not in response_data.keys(): 51 | raise Exception(f"Invalid API call. Check that ticker symbol '{ticker}' is correct.") 52 | # Read data into DataFrame (8.1.12 & 8.1.13) 53 | stock_data = response_data["Time Series (Daily)"] 54 | df = pd.DataFrame.from_dict(stock_data, orient="index", dtype=float) 55 | # Convert index to 'DatetimeIndex' named "date" (8.1.14) 56 | df.index = pd.to_datetime(df.index) 57 | df. 
index.name = "date" 58 | # Remove numbering from columns (8.1.15) 59 | df.columns = [c.split(". ")[1] for c in df.columns] 60 | # Return DataFrame 61 | return df 62 | 63 | 64 | class SQLRepository: 65 | def __init__(self,connection): 66 | self.connection = connection 67 | pass 68 | 69 | def insert_table(self, table_name, records, if_exists): 70 | 71 | """Insert DataFrame into SQLite database as table 72 | 73 | Parameters 74 | ---------- 75 | table_name : str 76 | records : pd.DataFrame 77 | if_exists : str, optional 78 | How to behave if the table already exists. 79 | 80 | - 'fail': Raise a ValueError. 81 | - 'replace': Drop the table before inserting new values. 82 | - 'append': Insert new values to the existing table. 83 | 84 | Dafault: 'fail' 85 | 86 | Returns 87 | ------- 88 | dict 89 | Dictionary has two keys: 90 | 91 | - 'transaction_successful', followed by bool 92 | - 'records_inserted', followed by int 93 | """ 94 | n_inserted = records.to_sql( 95 | name = table_name, con=self.connection, if_exists=if_exists) 96 | 97 | return{ 98 | "transations_successful":True, 99 | "records_inserted":n_inserted 100 | } 101 | 102 | def read_table(self, table_name,limit=None): 103 | 104 | """Read table from database. 105 | 106 | Parameters 107 | ---------- 108 | table_name : str 109 | Name of table in SQLite database. 110 | limit : int, None, optional 111 | Number of most recent records to retrieve. If `None`, all 112 | records are retrieved. By default, `None`. 113 | 114 | Returns 115 | ------- 116 | pd.DataFrame 117 | Index is DatetimeIndex "date". Columns are 'open', 'high', 118 | 'low', 'close', and 'volume'. All columns are numeric. 119 | """ 120 | # Create SQL query (with optional limit) 121 | if limit: 122 | sql = f"SELECT * FROM '{table_name}' LIMIT {limit}" 123 | else: 124 | sql = f"SELECT * FROM '{table_name}'" 125 | 126 | # Retrieve data, read into DataFrame 127 | df = pd.read_sql( 128 | sql=sql, con=self.connection,parse_dates=["date"],index_col="date") 129 | 130 | # Return DataFrame 131 | return df -------------------------------------------------------------------------------- /8. Volatility Forecasting In India/Practice Notebooks/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | 4 | import joblib 5 | import pandas as pd 6 | from scipy.stats import norm 7 | import numpy as np 8 | from arch import arch_model 9 | from config import settings 10 | from data import AlphaVantageAPI, SQLRepository 11 | 12 | 13 | class GarchModel: 14 | """Class for training GARCH model and generating predictions. 15 | 16 | Atttributes 17 | ----------- 18 | ticker : str 19 | Ticker symbol of the equity whose volatility will be predicted. 20 | repo : SQLRepository 21 | The repository where the training data will be stored. 22 | use_new_data : bool 23 | Whether to download new data from the AlphaVantage API to train 24 | the model or to use the existing data stored in the repository. 25 | model_directory : str 26 | Path for directory where trained models will be stored. 27 | 28 | Methods 29 | ------- 30 | wrangle_data 31 | Generate equity returns from data in database. 32 | fit 33 | Fit model to training data. 34 | predict 35 | Generate volatilty forecast from trained model. 36 | dump 37 | Save trained model to file. 38 | load 39 | Load trained model from file. 
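`SQLRepository` above is a thin wrapper around `DataFrame.to_sql` and `pd.read_sql`. The following sketch performs the same round trip against an in-memory SQLite database with a two-row invented price table; the `demo_ticker` table name and the prices are placeholders, not AlphaVantage data.

import sqlite3

import pandas as pd

# In-memory database and an invented daily-price table
connection = sqlite3.connect(":memory:")
records = pd.DataFrame(
    {
        "open": [100.0, 101.5], "high": [102.0, 103.0], "low": [99.0, 100.5],
        "close": [101.0, 102.5], "volume": [1_000.0, 1_200.0],
    },
    index=pd.DatetimeIndex(["2023-01-02", "2023-01-03"], name="date"),
)

# What `insert_table` wraps: write the DataFrame to a table
n_inserted = records.to_sql(name="demo_ticker", con=connection, if_exists="replace")
print("records_inserted:", n_inserted)

# What `read_table` wraps: query it back with a parsed DatetimeIndex
df = pd.read_sql(
    sql="SELECT * FROM 'demo_ticker' LIMIT 5",
    con=connection,
    parse_dates=["date"],
    index_col="date",
)
print(df)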
40 | """ 41 | 42 | def __init__(self, ticker,repo,use_new_data): 43 | 44 | self.ticker = ticker 45 | self.repo = repo 46 | self.use_new_data = use_new_data 47 | self.model_directory = settings.model_directory 48 | 49 | def wrangle_data(self,n_observations): 50 | 51 | """Extract data from database (or get from AlphaVantage), transform it 52 | for training model, and attach it to `self.data`. 53 | 54 | Parameters 55 | ---------- 56 | n_observations : int 57 | Number of observations to retrieve from database 58 | 59 | Returns 60 | ------- 61 | None 62 | """ 63 | if self.use_new_data: 64 | api=AlphaVantageAPI() 65 | new_data=api.get_daily(ticker=self.ticker) 66 | self.repo.insert_table( 67 | table_name=self.ticker,records=new_data,if_exists="replace") 68 | df=self.repo.read_table(table_name = self.ticker, limit=n_observations+1) 69 | df.sort_index(ascending=True,inplace=True) 70 | df['return']=df['close'].pct_change()*100 71 | self.data=df["return"].dropna() 72 | 73 | def calculate_aic(self): 74 | log_likelihood = self.model.loglikelihood 75 | num_params = self.model.num_params 76 | n = len(self.data) 77 | return -2 * log_likelihood / n + 2 * num_params / n 78 | 79 | def calculate_bic(self): 80 | log_likelihood = self.model.loglikelihood 81 | num_params = self.model.num_params 82 | n = len(self.data) 83 | return -2 * log_likelihood / n + num_params * np.log(n) / n 84 | 85 | def fit(self,p,q): 86 | 87 | """Create model, fit to `self.data`, and attach to `self.model` attribute. 88 | For assignment, also assigns adds metrics to `self.aic` and `self.bic`. 89 | 90 | Parameters 91 | ---------- 92 | p : int 93 | Lag order of the symmetric innovation 94 | 95 | q : ind 96 | Lag order of lagged volatility 97 | 98 | Returns 99 | ------- 100 | None 101 | """ 102 | # Train Model, attach to `self.model` 103 | self.model = arch_model(self.data, p=p, q=q, rescale=False).fit(disp=0) 104 | 105 | # Calculate AIC and BIC 106 | aic = self.calculate_aic() 107 | bic = self.calculate_bic() 108 | 109 | # Add AIC and BIC attributes to the object 110 | self.aic = aic 111 | self.bic = bic 112 | 113 | 114 | def __clean_prediction(self, prediction): 115 | 116 | """Reformat model prediction to JSON. 117 | 118 | Parameters 119 | ---------- 120 | prediction : pd.DataFrame 121 | Variance from a `ARCHModelForecast` 122 | 123 | Returns 124 | ------- 125 | dict 126 | Forecast of volatility. Each key is date in ISO 8601 format. 127 | Each value is predicted volatility. 128 | """ 129 | # Calculate forecast start date 130 | start= prediction.index[0] + pd.DateOffset(days=1) 131 | 132 | # Create date range 133 | prediction_dates= pd.bdate_range(start=start, periods=prediction.shape[1]) 134 | 135 | # Create prediction index labels, ISO 8601 format 136 | prediction_index= [d.isoformat() for d in prediction_dates] 137 | 138 | # Extract predictions from DataFrame, get square root 139 | data = prediction.values.flatten() ** 0.5 140 | 141 | # Combine `data` and `prediction_index` into Series 142 | prediction_formatted = pd.Series(data, index=prediction_index) 143 | 144 | # Return Series as dictionary 145 | return prediction_formatted.to_dict() 146 | 147 | 148 | def predict_volatility(self, horizon): 149 | 150 | """Predict volatility using `self.model` 151 | 152 | Parameters 153 | ---------- 154 | horizon : int 155 | Horizon of forecast, by default 5. 156 | 157 | Returns 158 | ------- 159 | dict 160 | Forecast of volatility. Each key is date in ISO 8601 format. 161 | Each value is predicted volatility. 
162 | """ 163 | # Generate variance forecast from `self.model` 164 | prediction = self.model.forecast(horizon=horizon, reindex=False).variance 165 | 166 | # Format prediction with `self.__clean_predction` 167 | prediction_formatted = self.__clean_prediction(prediction) 168 | 169 | # Return `prediction_formatted` 170 | return prediction_formatted 171 | 172 | 173 | def dump(self): 174 | 175 | """Save model to `self.model_directory` with timestamp. 176 | 177 | Returns 178 | ------- 179 | str 180 | filepath where model was saved. 181 | """ 182 | # Create timestamp in ISO format 183 | timestamp = pd.Timestamp.now().isoformat() 184 | # Create filepath, including `self.model_directory` 185 | filepath = os.path.join(self.model_directory, f"{timestamp}_{self.ticker}.pkl") 186 | # Save `self.model` 187 | joblib.dump(self.model,filepath) 188 | # Return filepath 189 | return filepath 190 | 191 | 192 | def load(self): 193 | 194 | """Load most recent model in `self.model_directory` for `self.ticker`, 195 | attach to `self.model` attribute. 196 | 197 | """ 198 | # Create pattern for glob search 199 | pattern = os.path.join(settings.model_directory,f"*{self.ticker}.pkl") 200 | # Use glob to get most recent model, handle errors 201 | try: 202 | model_path = sorted(glob(pattern))[-1] 203 | except IndexError: 204 | raise Exception(f"No model trained for '{ticker}'.") 205 | # Load model and attach to `self.model` 206 | self.model = joblib.load(model_path) 207 | -------------------------------------------------------------------------------- /8. Volatility Forecasting In India/Practice Notebooks/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | 4 | import joblib 5 | import pandas as pd 6 | from scipy.stats import norm 7 | import numpy as np 8 | from arch import arch_model 9 | from config import settings 10 | from data import AlphaVantageAPI, SQLRepository 11 | 12 | 13 | class GarchModel: 14 | """Class for training GARCH model and generating predictions. 15 | 16 | Atttributes 17 | ----------- 18 | ticker : str 19 | Ticker symbol of the equity whose volatility will be predicted. 20 | repo : SQLRepository 21 | The repository where the training data will be stored. 22 | use_new_data : bool 23 | Whether to download new data from the AlphaVantage API to train 24 | the model or to use the existing data stored in the repository. 25 | model_directory : str 26 | Path for directory where trained models will be stored. 27 | 28 | Methods 29 | ------- 30 | wrangle_data 31 | Generate equity returns from data in database. 32 | fit 33 | Fit model to training data. 34 | predict 35 | Generate volatilty forecast from trained model. 36 | dump 37 | Save trained model to file. 38 | load 39 | Load trained model from file. 40 | """ 41 | 42 | def __init__(self, ticker,repo,use_new_data): 43 | 44 | self.ticker = ticker 45 | self.repo = repo 46 | self.use_new_data = use_new_data 47 | self.model_directory = settings.model_directory 48 | 49 | def wrangle_data(self,n_observations): 50 | 51 | """Extract data from database (or get from AlphaVantage), transform it 52 | for training model, and attach it to `self.data`. 
53 | 54 | Parameters 55 | ---------- 56 | n_observations : int 57 | Number of observations to retrieve from database 58 | 59 | Returns 60 | ------- 61 | None 62 | """ 63 | if self.use_new_data: 64 | api=AlphaVantageAPI() 65 | new_data=api.get_daily(ticker=self.ticker) 66 | self.repo.insert_table( 67 | table_name=self.ticker,records=new_data,if_exists="replace") 68 | df=self.repo.read_table(table_name = self.ticker, limit=n_observations+1) 69 | df.sort_index(ascending=True,inplace=True) 70 | df['return']=df['close'].pct_change()*100 71 | self.data=df["return"].dropna() 72 | 73 | def calculate_aic(self): 74 | log_likelihood = self.model.loglikelihood 75 | num_params = self.model.num_params 76 | n = len(self.data) 77 | return -2 * log_likelihood / n + 2 * num_params / n 78 | 79 | def calculate_bic(self): 80 | log_likelihood = self.model.loglikelihood 81 | num_params = self.model.num_params 82 | n = len(self.data) 83 | return -2 * log_likelihood / n + num_params * np.log(n) / n 84 | 85 | def fit(self,p,q): 86 | 87 | """Create model, fit to `self.data`, and attach to `self.model` attribute. 88 | For assignment, also assigns adds metrics to `self.aic` and `self.bic`. 89 | 90 | Parameters 91 | ---------- 92 | p : int 93 | Lag order of the symmetric innovation 94 | 95 | q : ind 96 | Lag order of lagged volatility 97 | 98 | Returns 99 | ------- 100 | None 101 | """ 102 | # Train Model, attach to `self.model` 103 | self.model = arch_model(self.data, p=p, q=q, rescale=False).fit(disp=0) 104 | 105 | # Calculate AIC and BIC 106 | aic = self.calculate_aic() 107 | bic = self.calculate_bic() 108 | 109 | # Add AIC and BIC attributes to the object 110 | self.aic = aic 111 | self.bic = bic 112 | 113 | 114 | def __clean_prediction(self, prediction): 115 | 116 | """Reformat model prediction to JSON. 117 | 118 | Parameters 119 | ---------- 120 | prediction : pd.DataFrame 121 | Variance from a `ARCHModelForecast` 122 | 123 | Returns 124 | ------- 125 | dict 126 | Forecast of volatility. Each key is date in ISO 8601 format. 127 | Each value is predicted volatility. 128 | """ 129 | # Calculate forecast start date 130 | start= prediction.index[0] + pd.DateOffset(days=1) 131 | 132 | # Create date range 133 | prediction_dates= pd.bdate_range(start=start, periods=prediction.shape[1]) 134 | 135 | # Create prediction index labels, ISO 8601 format 136 | prediction_index= [d.isoformat() for d in prediction_dates] 137 | 138 | # Extract predictions from DataFrame, get square root 139 | data = prediction.values.flatten() ** 0.5 140 | 141 | # Combine `data` and `prediction_index` into Series 142 | prediction_formatted = pd.Series(data, index=prediction_index) 143 | 144 | # Return Series as dictionary 145 | return prediction_formatted.to_dict() 146 | 147 | 148 | def predict_volatility(self, horizon): 149 | 150 | """Predict volatility using `self.model` 151 | 152 | Parameters 153 | ---------- 154 | horizon : int 155 | Horizon of forecast, by default 5. 156 | 157 | Returns 158 | ------- 159 | dict 160 | Forecast of volatility. Each key is date in ISO 8601 format. 161 | Each value is predicted volatility. 
162 | """ 163 | # Generate variance forecast from `self.model` 164 | prediction = self.model.forecast(horizon=horizon, reindex=False).variance 165 | 166 | # Format prediction with `self.__clean_predction` 167 | prediction_formatted = self.__clean_prediction(prediction) 168 | 169 | # Return `prediction_formatted` 170 | return prediction_formatted 171 | 172 | 173 | def dump(self): 174 | 175 | """Save model to `self.model_directory` with timestamp. 176 | 177 | Returns 178 | ------- 179 | str 180 | filepath where model was saved. 181 | """ 182 | # Create timestamp in ISO format 183 | timestamp = pd.Timestamp.now().isoformat() 184 | # Create filepath, including `self.model_directory` 185 | filepath = os.path.join(self.model_directory, f"{timestamp}_{self.ticker}.pkl") 186 | # Save `self.model` 187 | joblib.dump(self.model,filepath) 188 | # Return filepath 189 | return filepath 190 | 191 | 192 | def load(self): 193 | 194 | """Load most recent model in `self.model_directory` for `self.ticker`, 195 | attach to `self.model` attribute. 196 | 197 | """ 198 | # Create pattern for glob search 199 | pattern = os.path.join(settings.model_directory,f"*{self.ticker}.pkl") 200 | # Use glob to get most recent model, handle errors 201 | try: 202 | model_path = sorted(glob(pattern))[-1] 203 | except IndexError: 204 | raise Exception(f"No model trained for '{ticker}'.") 205 | # Load model and attach to `self.model` 206 | self.model = joblib.load(model_path) 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WorldQuant Applied Data Science Lab 2 | Here you can find all the 8 projects of WorldQuant's Data Science Program along with my certification. 3 | 4 | I'm exhilarated to share that I have successfully completed WorldQuant's Data Science Program, a transformative journey that has broadened my skills and knowledge in the field of data science! 🎓 ✨ 5 | 6 | Check my badge: https://www.credly.com/badges/4480df1c-d561-4ef8-9852-79b5748e0c73/public_url 7 | 8 | 9 | Throughout the program, I had the opportunity to work on eight fascinating projects, each designed to enhance my understanding and practical application of key data science concepts. Let me provide a brief explanation of each project: 10 | 11 | 12 | 1- 𝗛𝗢𝗨𝗦𝗜𝗡𝗚 𝗜𝗡 𝗠𝗘𝗫𝗜𝗖𝗢: Learners use a dataset of 21,000 properties to determine if real estate prices are influenced more by property size or location. They import and clean data from a CSV file, build data visualizations, and examine the relationship between two variables using correlation. 13 | 14 | 15 | 2- 𝗔𝗣𝗔𝗥𝗧𝗠𝗘𝗡𝗧 𝗦𝗔𝗟𝗘𝗦 𝗜𝗡 𝗕𝗨𝗘𝗡𝗢𝗦 𝗔𝗜𝗥𝗘𝗦: Learners build a linear regression model to predict apartment prices in Argentina. They create a data pipeline to impute missing values and encode categorical features, and they improve model performance by reducing overfitting. 16 | 17 | 18 | 3- 𝗔𝗜𝗥 𝗤𝗨𝗔𝗟𝗜𝗧𝗬 𝗜𝗡 𝗡𝗔𝗜𝗥𝗢𝗕𝗜: Learners build an ARMA time-series model to predict particulate matter levels in Kenya. They extract data from a MongoDB database using pymongo, and improve model performance through hyperparameter tuning. 19 | 20 | 21 | 4- 𝗘𝗔𝗥𝗧𝗛𝗤𝗨𝗔𝗞𝗘 𝗗𝗔𝗠𝗔𝗚𝗘 𝗜𝗡 𝗡𝗘𝗣𝗔𝗟: Learners build logistic regression and decision tree models to predict earthquake damage to buildings. They extract data from a SQLite database, and reveal the biases in data that can lead to discrimination. 
22 | 23 | 24 | 5- 𝗕𝗔𝗡𝗞𝗥𝗨𝗣𝗧𝗖𝗬 𝗜𝗡 𝗣𝗢𝗟𝗔𝗡𝗗: Learners build random forest and gradient boosting models to predict whether a company will go bankrupt. They navigate the Linux command line, address imbalanced data through resampling, and consider the impact of the performance metrics precision and recall. 25 | 26 | 27 | 6- 𝗖𝗨𝗦𝗧𝗢𝗠𝗘𝗥 𝗦𝗘𝗚𝗠𝗘𝗡𝗧𝗔𝗧𝗜𝗢𝗡 𝗜𝗡 𝗧𝗛𝗘 𝗨𝗦: Learners build a k-means model to cluster US consumers into groups. They use principal component analysis (PCA) for data visualization, and they create an interactive dashboard with Plotly Dash. 28 | 29 | 30 | 7- 𝗔/𝗕 𝗧𝗘𝗦𝗧𝗜𝗡𝗚 𝗔𝗧 𝗪𝗢𝗥𝗟𝗗𝗤𝗨𝗔𝗡𝗧 𝗨𝗡𝗜𝗩𝗘𝗥𝗦𝗜𝗧𝗬: Learners conduct a chi-square test to determine if sending an email can increase program enrollment at WQU. They build custom Python classes to implement an ETL process, and they create an interactive data application following a three-tiered design pattern. 31 | 32 | 33 | 8- 𝗩𝗢𝗟𝗔𝗧𝗜𝗟𝗜𝗧𝗬 𝗙𝗢𝗥𝗘𝗖𝗔𝗦𝗧𝗜𝗡𝗚 𝗜𝗡 𝗜𝗡𝗗𝗜𝗔: Learners create a GARCH time series model to predict asset volatility. They acquire stock data through an API, clean and store it in a SQLite database, and build their own API to serve model predictions. 34 | 35 | 36 | 37 | I want to express my heartfelt gratitude to WorldQuant for providing an exceptional learning experience. The program's comprehensive curriculum and hands-on projects have equipped me with practical skills and a deep understanding of data science techniques. 38 | 39 | 40 | --------------------------------------------------------------------------------
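To close, a compact sketch of the core modelling steps from the volatility project's `main.py` above: turn closing prices into percentage returns, fit a GARCH(1,1), and reformat the variance forecast into the date-keyed dictionary the prediction API serves. The prices below are simulated with NumPy, so the output is illustrative only and does not come from AlphaVantage.

import numpy as np
import pandas as pd
from arch import arch_model

# Simulated daily closing prices standing in for data pulled via AlphaVantageAPI
rng = np.random.default_rng(42)
dates = pd.bdate_range("2022-01-03", periods=500, name="date")
close = pd.Series(100 * np.exp(np.cumsum(rng.normal(0, 0.01, size=len(dates)))), index=dates)

# Percentage returns, as in `wrangle_data`
returns = close.pct_change().dropna() * 100

# GARCH(1,1) fit, as in `fit`
res = arch_model(returns, p=1, q=1, rescale=False).fit(disp="off")
print(res.summary())

# Five-day variance forecast, converted to volatility with ISO-formatted date keys,
# mirroring `__clean_prediction`
prediction = res.forecast(horizon=5, reindex=False).variance
start = prediction.index[0] + pd.DateOffset(days=1)
prediction_dates = pd.bdate_range(start=start, periods=prediction.shape[1])
volatility_forecast = pd.Series(
    prediction.values.flatten() ** 0.5,
    index=[d.isoformat() for d in prediction_dates],
)
print(volatility_forecast.to_dict())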