├── excel_python ├── hello.py └── sales_data.csv ├── cpp_tutorials ├── bmi.cpp ├── retirement_contributions.cpp └── heapify_code.cpp ├── outlier_detectionpt.py ├── outlier_detectionpt2.py ├── data_cleaning.py ├── outlier_detection.py ├── cluster_analysis.py ├── quartiles_tutorial.py ├── list_tutorial.py ├── classification_performance.py ├── research_sentiment.py ├── classification_loss_functions.py ├── cc_eda.py ├── time_series_analysis.py ├── time_series_forecasting.py ├── model_selection.py ├── portfolio_opt.py ├── optimization_tutorial.py ├── regularization_tutorial.py ├── python_profiling_tutorial.py ├── empty_variables_and_datastructures.py ├── financial_data_analysis.py ├── dimensionality_reduction.py ├── model_explainability.py ├── profiling_debugging_mlworkflow.ipynb ├── python_inheritance.ipynb └── pareto_chart_er_readmission.ipynb /excel_python/hello.py: -------------------------------------------------------------------------------- 1 | print("Hello") 2 | -------------------------------------------------------------------------------- /excel_python/sales_data.csv: -------------------------------------------------------------------------------- 1 | datetime,item,price,units,revenue 2 | 9/1/23,smartphone,1200,15,18000 3 | 9/1/23,laptop,2000,5,10000 4 | 9/1/23,tablet,1500,10,15000 5 | 9/1/23,smart tv,2200,10,22000 6 | 9/2/23,laptop,2000,5,10000 7 | 9/3/23,laptop,2000,5,10000 8 | 9/3/23,wireless earbuds ,150,20,3000 9 | 9/4/23,smart tv,2200,10,22000 10 | 9/4/23,smart tv,2200,10,22000 -------------------------------------------------------------------------------- /cpp_tutorials/bmi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace std; 4 | 5 | float BMI_calculator(float weight, float height){ 6 | if (height == 0){ 7 | throw runtime_error("You attempted to calculate BMI with an invalid value of zero for height \n"); 8 | } 9 | return weight/(height*height); 10 | } 11 | 12 | 13 | 
14 | 15 | // Main() function: where the execution of program begins 16 | int main() 17 | { 18 | string name; 19 | float weight; 20 | float height; 21 | float bmi; 22 | 23 | cout << "Please enter your name \n"; 24 | cin >> name; 25 | cout << "Hello " << name << ", please enter your weight in Kg\n"; 26 | cin >> weight; 27 | cout << "Thank you " << name << ", now please enter your height in meters \n"; 28 | cin >> height; 29 | 30 | try{ 31 | bmi = BMI_calculator(weight, height); 32 | cout << "Your BMI is: " << bmi <<"\n"; 33 | 34 | } 35 | catch (runtime_error& e){ 36 | cout << "Warning: " << e.what(); 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cpp_tutorials/retirement_contributions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include // std::out_of_range 3 | #include 4 | using namespace std; 5 | 6 | int main(){ 7 | int months; 8 | int current_month = 1; 9 | cout << "Please enter the number of months \n"; 10 | cin >> months; 11 | try{ 12 | std::vector contributions(months); //float contributions[months]; 13 | float initial_contrbution = 100; 14 | float sum_contributions = 0; 15 | contributions[1] = initial_contrbution; 16 | while (current_month <= months){ 17 | contributions[current_month + 1] =1.02*contributions[current_month]; 18 | cout << "Month " << current_month << " contribution is: " << contributions[current_month]<< endl; 19 | sum_contributions += contributions[current_month]; 20 | current_month++; 21 | } 22 | cout<<"Sum of contributions for " << months << " months is: "< 2 | 3 | using namespace std; 4 | 5 | 6 | void heapify(int array_in[], int array_size, int subtree_root_index) 7 | { 8 | int largest_value = subtree_root_index; 9 | int left = 2*subtree_root_index + 1; 10 | int right = 2*subtree_root_index + 2; 11 | 12 | 13 | if (left < array_size && array_in[left] > array_in[largest_value]){ 14 | largest_value = left; 15 | } 16 | 17 | 
if (right < array_size && array_in[right] > array_in[largest_value]){ 18 | largest_value = right; 19 | } 20 | 21 | 22 | if (largest_value != subtree_root_index ) 23 | { 24 | swap(array_in[subtree_root_index], array_in[largest_value]); 25 | 26 | heapify(array_in, array_size, largest_value); 27 | } 28 | 29 | 30 | } 31 | 32 | 33 | void construct_heap(int array_in[], int array_size){ 34 | int last_non_leaf_node = (array_size/2) -1; 35 | 36 | for (int subtree_root_index = last_non_leaf_node; subtree_root_index >=0; subtree_root_index -=1) 37 | { 38 | heapify(array_in, array_size, subtree_root_index); 39 | } 40 | 41 | } 42 | 43 | void print_heap(int array_in[], int array_size){ 44 | cout << "Printing values at each node in heap" << endl; 45 | 46 | for (int index = 0; index < array_size; index+=1){ 47 | cout<< array_in[index] << endl; 48 | 49 | } 50 | 51 | } 52 | 53 | 54 | int main(){ 55 | int array_in[] = { 3, 5, 8, 10, 17, 11, 13, 19, 22, 24, 29}; 56 | 57 | int array_size = sizeof(array_in) / sizeof(array_in[0]); 58 | 59 | construct_heap(array_in, array_size); 60 | 61 | print_heap(array_in, array_size); 62 | 63 | } 64 | -------------------------------------------------------------------------------- /outlier_detectionpt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Apr 22 12:14:38 2022 5 | 6 | @author: sadrachpierre 7 | """ 8 | import pandas as pd 9 | 10 | df = pd.read_csv("creditcard_downsampled5000.csv") 11 | 12 | 13 | pd.set_option('display.max_columns', None) 14 | pd.set_option('display.max_rows', None) 15 | 16 | # df = df.sample(30000, random_state=42) 17 | 18 | # df.to_csv("creditcard_downsampled5000.csv", index=False) 19 | print(df.head()) 20 | 21 | 22 | import seaborn as sns 23 | import matplotlib.pyplot as plt 24 | 25 | sns.set() 26 | 27 | sns.boxplot(y = df['V14']) 28 | plt.show() 29 | 30 | 31 | 32 | Q1=df['V13'].quantile(0.25) 33 | 
print("Q1:", Q1)

# Third quartile and interquartile range of V13.
Q3 = df['V13'].quantile(0.75)
print("Q3:", Q3)

IQR = Q3 - Q1
print("IQR: ", IQR)

# Tukey fences: anything further than 1.5 * IQR outside the quartiles is
# treated as an outlier.
lower_bound = Q1 - 1.5 * IQR
print("Lower Bound:", lower_bound)

upper_bound = Q3 + 1.5 * IQR
print("Upper Bound:", upper_bound)

# Keep only the rows whose V13 lies strictly inside the fences.
# NOTE(review): the upper-bound half of this filter was garbled in the dump
# and is reconstructed from the bounds computed above -- confirm upstream.
df_clean = df[(df['V13'] > lower_bound) & (df['V13'] < upper_bound)]

# NOTE(review): the text between the statement above and the middle of
# data_cleaning.py line 34 (the rest of outlier_detectionpt.py, most of
# outlier_detectionpt2.py and the head of data_cleaning.py, including the
# definitions of `df` and `df_bad`) was lost in extraction.  The surviving
# tail of line 34 is kept verbatim below; restore the full statement from the
# original repository rather than guessing at the missing part.
#     ... df_bad['AGE'].mean() else x for x in list(df_bad['AGE']) ]

# ---------------------------------------------------------------------------
# /data_cleaning.py (continued)
# ---------------------------------------------------------------------------

# Non-numeric AGE entries become NaN so the mean can be computed.
df_bad['AGE'] = pd.to_numeric(df_bad['AGE'], errors='coerce')
print(df_bad['AGE'].mean())


from scipy import stats
import numpy as np

print("Length before removing RM outlier:", len(df_bad))
# Score the same frame that is filtered on the next line (the original scored
# `df`, so the z-scores did not describe the rows of `df_bad`).
df_bad['RM_zscore'] = np.abs(stats.zscore(df_bad['RM']))
df_clean1 = df_bad[df_bad['RM_zscore'] < 3]
print("Length after removing RM outlier:", len(df_clean1))


def remove_outliers(column_name, df_in):
    """Return the rows of *df_in* whose *column_name* z-score is below 3.

    Parameters
    ----------
    column_name : str
        Name of a numeric column of ``df_in``.
    df_in : pandas.DataFrame
        Frame to filter.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with an extra ``f"{column_name}_zscore"`` column
        holding the absolute z-score of each row.

    Prints the row count before and after filtering.
    """
    print(f"Length before removing {column_name} outlier:", len(df_in))
    # nan_policy='omit' keeps a single NaN from turning every score into NaN;
    # rows whose own value is NaN still score NaN and are dropped by `< 3`.
    zscores = np.abs(stats.zscore(df_in[column_name], nan_policy='omit'))
    # assign() builds a new frame, so the caller's frame gains no column.
    scored = df_in.assign(**{f'{column_name}_zscore': zscores})
    df_clean = scored[scored[f'{column_name}_zscore'] < 3]
    print(f"Length after removing {column_name} outlier:", len(df_clean))
    return df_clean


df1 = remove_outliers('DIS', df_bad)

# ---------------------------------------------------------------------------
# /outlier_detection.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 18 17:32:26 2021

@author: sadrachpierre
"""
import pandas as pd

df = pd.read_csv("banknotes.csv")


print(df.head())

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt


def boxplot(column):
    """Show a boxplot of the banknote measurement *column* of the global ``df``."""
    sns.boxplot(data=df, x=df[f"{column}"])
    plt.title(f"Boxplot of Swiss Banknote {column}")
    plt.show()


# Notes longer than 216: how many of them are counterfeit?
df_outlier1 = df[df['Length'] > 216].copy()
print(Counter(df_outlier1['conterfeit']))


# Same tally with a looser length cut-off.
df_outlier2 = df.loc[df['Length'] > 215.5].copy()
print(Counter(df_outlier2['conterfeit']))


# One boxplot per banknote measurement.
for measurement in ('Length', 'Right', 'Left', 'Bottom', 'Top', 'Diagonal'):
    boxplot(measurement)


# Notes that exceed the cut-off on four measurements at once, compared with
# the label distribution of the whole data set.
exceeds_all = (
    (df['Length'] > 215)
    & (df['Right'] > 130)
    & (df['Left'] > 130)
    & (df['Bottom'] > 10)
)
df_outlier3 = df[exceeds_all].copy()
print(Counter(df_outlier3['conterfeit']))
print(Counter(df['conterfeit']))

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.svm import OneClassSVM

feature_columns = ['Length', 'Left', 'Right', 'Bottom', 'Top', 'Diagonal']
X = df[feature_columns]
y = df['conterfeit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Isolation forest: the detector answers -1 for outliers and 1 for inliers.
clf = IsolationForest(random_state=0)
clf.fit(X_train)
y_pred = clf.predict(X_test)

import numpy as np

# Recode -1 (outlier) as 1 and everything else as 0 before scoring against
# the 'conterfeit' labels.
pred = pd.DataFrame({'pred': y_pred})
pred['y_pred'] = np.where(pred['pred'] == -1, 1, 0)

y_pred = pred['y_pred']
print("Precision:", precision_score(y_test, y_pred))


# One-class SVM, scored the same way.
clf_svm = OneClassSVM(gamma='auto')
clf_svm.fit(X_train)
y_pred_svm = clf_svm.predict(X_test)

pred['svm'] = y_pred_svm
pred['svm_pred'] = np.where(pred['svm'] == -1, 1, 0)

y_pred_svm = pred['svm_pred']
print("SVM Precision:", precision_score(y_test, y_pred_svm))

# ---------------------------------------------------------------------------
# /cluster_analysis.py
# ---------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

df = pd.read_csv("Mall_Customers.csv")

# Two features used for every clustering model in this script.
X = df[['Age', 'Spending Score (1-100)']]
# Within-cluster sum of squares, one entry per candidate cluster count.
wcss = []
# Elbow method: fit K-means for k = 1..10 and record the inertia of each fit.
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=20, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)


sns.set()

plt.plot(range(1, 11), wcss)
plt.title('Selecting the Numbeer of Clusters using the Elbow Method')
plt.xlabel('Clusters')
plt.ylabel('WCSS')
plt.show()


# K-means with four clusters.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=500, n_init=20, random_state=0)
y_pred = kmeans.fit_predict(X)
# Colour each point by its assigned cluster; the labels were computed but
# never used, so the plot could not show the clusters its title promises.
plt.scatter(X['Age'], X['Spending Score (1-100)'], c=y_pred)
plt.ylabel("Spending Score")
plt.xlabel("Age")
plt.title("Clusters found by KMeans")
# Centroids drawn as large black markers.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='black')
plt.show()


from sklearn.mixture import GaussianMixture

n_clusters = 5
gmm_model = GaussianMixture(n_components=n_clusters, random_state=5)
gmm_model.fit(X)


cluster_labels = gmm_model.predict(X)
# Work on an explicit copy so the label column is not written into a slice
# of the frame X was taken from.
X = pd.DataFrame(X).copy()
X['cluster'] = cluster_labels


color = ['blue', 'green', 'red', 'black', 'yellow']
for k in range(0, n_clusters):
    data = X[X["cluster"] == k].copy()
    plt.scatter(data["Age"], data["Spending Score (1-100)"], c=color[k])

plt.title("Clusters Identified by Guassian Mixture Model")
plt.ylabel("Spending Score (1-100)")
plt.xlabel("Age")
plt.show()


from sklearn.cluster import SpectralClustering

spectral_cluster_model = SpectralClustering(
    n_clusters=5,
    random_state=64,
    n_neighbors=20,
    affinity='nearest_neighbors',
)

# Fit on the features only.  X still carries the GMM labels in 'cluster';
# feeding that column to the model would leak the previous clustering into
# this one.
X['cluster'] = spectral_cluster_model.fit_predict(X.drop(columns='cluster'))


fig, ax = plt.subplots()
sns.scatterplot(x='Age', y='Spending Score (1-100)', data=X, hue='cluster', ax=ax)
ax.set(title='Spectral Clustering')
# Render the last figure as the earlier ones are rendered.
plt.show()
# ---------------------------------------------------------------------------
# /quartiles_tutorial.py
# ---------------------------------------------------------------------------
import pandas as pd

df = pd.read_csv('telco_churn.csv')

print(df.head())

# Quartiles of customer tenure over the whole data set.
print("First Quartile (Q1) for Tenure: ", df['tenure'].quantile(0.25))
print("Second Quartile (Q2) for Tenure: ", df['tenure'].quantile(0.50))
print("Third Quartile (Q3) for Tenure: ", df['tenure'].quantile(0.75))

print("Third Quartile (Q3) for Tenure: ", df['tenure'].quantile(0.75))


print("Ninth Decile for Tenure: ", df['tenure'].quantile(0.9))

# Same statistics per internet-service type.
df_dsl = df[df['InternetService'] == 'DSL']
df_fiberoptic = df[df['InternetService'] == 'Fiber optic']


print("Third Quartile (Q3) for Tenure - DSL: ", df_dsl['tenure'].quantile(0.75))
print("Third Quartile (Q3) for Tenure - Fiber Optic: ", df_fiberoptic['tenure'].quantile(0.75))

print("Ninth Decile for Tenure - DSL: ", df_dsl['tenure'].quantile(0.9))
print("Ninth Decile for Tenure - Fiber Optic: ", df_fiberoptic['tenure'].quantile(0.9))


# ... and per churn outcome.
df_churn_yes = df[df['Churn'] == 'Yes']
df_churn_no = df[df['Churn'] == 'No']


print("Third Quartile (Q3) for Tenure - Churn: ", df_churn_yes['tenure'].quantile(0.75))
print("Third Quartile (Q3) for Tenure - No Churn: ", df_churn_no['tenure'].quantile(0.75))


# These two report MonthlyCharges; the labels previously said "Tenure".
print("Third Quartile (Q3) for Monthly Charges - Churn: ", df_churn_yes['MonthlyCharges'].quantile(0.75))
print("Third Quartile (Q3) for Monthly Charges - No Churn: ", df_churn_no['MonthlyCharges'].quantile(0.75))


import numpy as np


# NumPy equivalent of the pandas quantile above (same column, same label fix).
print("Numpy Third Quartile (Q3) for Monthly Charges - Churn: ", np.percentile(df_churn_yes['MonthlyCharges'], 75))

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

# Pass the series by keyword: the meaning of the first positional argument of
# sns.boxplot differs between seaborn releases.
sns.boxplot(x=df['tenure'])
plt.show()


from collections import Counter


def get_boxplot_of_categories(data_frame, categorical_column, numerical_column, limit):
    """Show boxplots of a numeric column for the most frequent categories.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding both columns.
    categorical_column : str
        Column whose values define the groups (x axis).
    numerical_column : str
        Column whose distribution is plotted for each group (y axis).
    limit : int
        Number of most frequent categories to keep.
    """
    # Read from the frame that was passed in; the original ignored the
    # `data_frame` argument and always used the module-level `df`.
    counts = Counter(data_frame[categorical_column].values)
    keys = [category for category, _ in counts.most_common(limit)]

    df_new = data_frame[data_frame[categorical_column].isin(keys)]
    sns.boxplot(x=df_new[categorical_column], y=df_new[numerical_column])
    plt.show()


get_boxplot_of_categories(df, 'Churn', 'tenure', 5)

get_boxplot_of_categories(df, 'Churn', 'MonthlyCharges', 5)

# ---------------------------------------------------------------------------
# /list_tutorial.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 25 18:59:33 2022

@author: sadrachpierre
"""

import numpy as np


tech_company_names = ['Facebook', 'Apple', 'Amazon', 'Netflix', 'Google']

tech_company_employees = [58604, 147000, 950000, 11300, 135301]

tech_company_revenue = [117, 378, 470, 30, 257]


# True where a company has more than 60000 employees.
tech_company_employee_bool = [x > 60000 for x in tech_company_employees]


# sorted() returns new lists; the originals keep their order.
sort_company = sorted(tech_company_names)
sort_employee = sorted(tech_company_employees)

print(sort_company)
print(sort_employee)

# One record (name, employees, revenue, threshold flag) appended field by
# field to the four parallel lists.
new_company_info = ['Microsoft', 163000, 877, True]

tech_company_names.append(new_company_info[0])
tech_company_employees.append(new_company_info[1])
tech_company_revenue.append(new_company_info[2])
tech_company_employee_bool.append(new_company_info[3])


print('Company: ', tech_company_names)
print('Employees: ', tech_company_employees)
print("Revenue: ", tech_company_revenue)
print("Employee_threshold: ", tech_company_employee_bool)


# Synthetic net income, one value per company, drawn from a normal
# distribution with a fixed seed.
mu, sigma = 80, 40
n_values = len(tech_company_names)
np.random.seed(21)
net_income_normal = np.random.normal(mu, sigma, n_values)
print(net_income_normal)
# Same synthetic net income, drawn from a Gumbel distribution.
np.random.seed(64)
net_income_fat_tail = np.random.gumbel(mu, sigma, n_values)
print(net_income_fat_tail)


company_data_dict = {'company_name': tech_company_names,
                     'number_of_employees': tech_company_employees,
                     'company_revenue': tech_company_revenue,
                     'employee_threshold': tech_company_employee_bool,
                     'net_income_normal': list(net_income_normal),
                     'net_income_fat_tail': list(net_income_fat_tail)}

print(company_data_dict)

import json

# Round-trip the dictionary through a JSON file.
with open('company_data.json', 'w') as fp:
    json.dump(company_data_dict, fp)

# Read inside a `with` block so the handle is closed; the original left the
# file open.
with open('company_data.json') as f:
    company_json = json.loads(f.read())
print(company_json)

import pandas as pd

# Round-trip the same data through a CSV file.
company_df = pd.DataFrame(company_data_dict)
print(company_df)

company_df.to_csv("comapany_csv_file.csv", index=False)

read_company_df = pd.read_csv("comapany_csv_file.csv")
print(read_company_df)

# ---------------------------------------------------------------------------
# /classification_performance.py
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

df = pd.read_csv('telco_churn.csv')

print(df.head())

print(len(df.columns))
print(len(df))
# Encode the target as 1 (churned) / 0 (stayed).
df['Churn'] = np.where(df['Churn'] == 'Yes', 1, 0)


X = df[['tenure', 'MonthlyCharges']]
y = df['Churn']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


clf_model = LogisticRegression()
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))


conmat = confusion_matrix(y_test, y_pred)
print(conmat)
# np.mat was removed in NumPy 2.0; np.asmatrix is the surviving spelling.
val = np.asmatrix(conmat)
print(val)

# confusion_matrix orders its rows/columns by sorted label, so the names must
# be sorted too; iteration order of a bare set is not guaranteed.
classnames = sorted(set(y_train))
df_cm = pd.DataFrame(
    val, index=classnames, columns=classnames,
)

print(df_cm)


print(len(y_test))

from collections import Counter

print(Counter(y_test))


import matplotlib.pyplot as plt
import seaborn as sns

# Normalise each row (true class) to fractions.  DataFrame.div replaces the
# `Series[:, np.newaxis]` indexing, which current pandas rejects.
df_cm = df_cm.astype('float').div(df_cm.sum(axis=1), axis=0)
plt.figure()
heatmap = sns.heatmap(df_cm, annot=True, cmap="Blues", fmt='g')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Churn Logistic Regression Model Results')
plt.show()

# Probability of the positive class.  Pass the DataFrame itself: the model
# was fitted on named columns, so a bare array triggers a feature-name warning.
y_pred_proba = clf_model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)


sns.set()
plt.plot(fpr, tpr)
# Diagonal reference line (true positive rate equal to false positive rate).
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
AUROC = np.round(roc_auc_score(y_test, y_pred_proba), 2)
plt.title(f'Logistic Regression Model ROC curve; AUROC: {AUROC}')
plt.show()


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

average_precision = average_precision_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recall, precision, marker='.', label='Logistic')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title(f'Precision Recall Curve. AUPRC: {average_precision}')
plt.show()

# ---------------------------------------------------------------------------
# /research_sentiment.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 23 11:58:01 2021

@author: sadrachpierre
"""

import pandas as pd
import numpy as np
from collections import Counter


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


df_new = pd.read_csv("covid.csv")


# Drop every row that has a missing value in any column.
df_new.dropna(inplace=True)


print(Counter(df_new['journal']).most_common(100))


# Subsets by journal name ...
df_plos = df_new[df_new['journal'] == 'PLoS One'].copy()
print(df_plos.head())
df_infect = df_new[df_new['journal'].str.contains('Infect Dis', regex=False)].copy()
df_microbial = df_new[df_new['journal'].str.contains('Microbial', regex=False)].copy()

# ... and by abstract keyword.
df_abstract_microbiome = df_new[df_new['abstract'].str.contains('microbiome', regex=False)].copy()
print("Number of Microbiome Studies: ", len(df_abstract_microbiome))


print(df_abstract_microbiome.head())

df_abstract_microbiome['publish_time'] = pd.to_datetime(df_abstract_microbiome['publish_time'], format='%Y/%m/%d')
df_abstract_microbiome['year'] = df_abstract_microbiome['publish_time'].dt.year
print(df_abstract_microbiome.head())
print(set(df_abstract_microbiome['year']))
print(df_abstract_microbiome['abstract'].iloc[20])

from textblob import TextBlob

# TextBlob sentiment polarity of each abstract.
df_abstract_microbiome['abstract_sentiment'] = df_abstract_microbiome['abstract'].apply(lambda abstract: TextBlob(abstract).sentiment.polarity)
print(df_abstract_microbiome.head())


# Mean polarity per publication year.
df_micro_group = df_abstract_microbiome.groupby(['year'])['abstract_sentiment'].mean()


import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

# The three yearly-sentiment series below are drawn on the same axes (the
# intermediate plt.show() is commented out), so each line gets a label and a
# legend is drawn at the end to tell them apart.
sns.set()
plt.xlabel('Year')
plt.ylabel('Sentiment')
plt.title('Research Sentiment in Gut Microbiome Studies')
plt.plot(df_micro_group.index, df_micro_group.values, label='Microbiome abstracts')
#plt.show()


# PLoS One: parse dates, extract the year, score each abstract.
df_plos['publish_time'] = pd.to_datetime(df_plos['publish_time'], format='%Y/%m/%d')
df_plos['year'] = df_plos['publish_time'].dt.year
df_plos['abstract_sentiment'] = df_plos['abstract'].apply(lambda abstract: TextBlob(abstract).sentiment.polarity)


df_plos_group = df_plos.groupby(['year'])['abstract_sentiment'].mean()


sns.set()
plt.xlabel('Year')
plt.ylabel('Sentiment')
#plt.title('Research Sentiment in PLoS One Publications')
plt.plot(df_plos_group.index, df_plos_group.values, label='PLoS One')


# Nature: same pipeline.
df_nature = df_new[df_new['journal'] == 'Nature'].copy()

df_nature['publish_time'] = pd.to_datetime(df_nature['publish_time'], format='%Y/%m/%d')
df_nature['year'] = df_nature['publish_time'].dt.year
df_nature['abstract_sentiment'] = df_nature['abstract'].apply(lambda abstract: TextBlob(abstract).sentiment.polarity)


# From here on df_nature is the per-year mean series, not the row-level frame.
df_nature = df_nature.groupby(['year'])['abstract_sentiment'].mean()


sns.set()
plt.xlabel('Year')
plt.ylabel('Sentiment')
plt.title('Research Sentiment in Publications')
plt.plot(df_nature.index, df_nature.values, label='Nature')
plt.legend()
# The figure was never shown in the original script.
plt.show()

# ---------------------------------------------------------------------------
# /classification_loss_functions.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 4 17:40:26 2022

@author: sadrachpierre
"""
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt

df = pd.read_csv('telco_churn.csv')

print(df.head())


# Encode the target as 1 (churned) / 0 (stayed).
df['Churn'] = np.where(df['Churn'] == 'Yes', 1, 0)


def convert_categories(cat_list, frame=None):
    """Add an integer-coded ``<col>_cat`` column for every column in *cat_list*.

    Parameters
    ----------
    cat_list : list of str
        Names of the columns to encode.
    frame : pandas.DataFrame, optional
        Frame to modify in place.  Defaults to the module-level ``df``, which
        is what the original one-argument form always used.

    Each listed column is converted to pandas ``category`` dtype and its
    category codes are stored in a new column named ``f"{col}_cat"``.
    """
    target = df if frame is None else frame
    for col in cat_list:
        target[col] = target[col].astype('category')
        target[f'{col}_cat'] = target[col].cat.codes


category_list = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

convert_categories(category_list)
# Values that do not parse as numbers become NaN and are then set to 0.
# Assign the result back: fillna(inplace=True) on a selected column is a
# chained assignment that newer pandas warns about and may not apply.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

cols = ['gender_cat', 'Partner_cat', 'Dependents_cat', 'PhoneService_cat', 'MultipleLines_cat', 'InternetService_cat',
        'OnlineSecurity_cat', 'OnlineBackup_cat', 'DeviceProtection_cat', 'TechSupport_cat', 'StreamingTV_cat',
        'StreamingMovies_cat', 'Contract_cat', 'PaperlessBilling_cat', 'PaymentMethod_cat', 'MonthlyCharges',
        'TotalCharges', 'SeniorCitizen']

X = df[cols]

y = df['Churn']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Binary cross-entropy model.
model_bce = Sequential()
model_bce.add(Dense(len(cols), input_shape=(len(cols),), kernel_initializer='normal', activation='relu'))
model_bce.add(Dense(32, activation='relu'))
model_bce.add(Dense(32, activation='relu'))
model_bce.add(Dense(32, activation='relu'))
# A single output unit needs a sigmoid.  Softmax over one unit always yields
# 1.0, so the original network predicted "churn" for every row and could not
# learn.
model_bce.add(Dense(1, activation='sigmoid'))
model_bce.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_bce.fit(X_train, y_train, epochs=10)


(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = mnist.load_data()


# Show a few training images.
plt.imshow(X_train_mnist[0])
plt.show()


plt.imshow(X_train_mnist[1])
plt.show()

plt.imshow(X_train_mnist[4])
plt.show()

# Add the trailing channel axis expected by Conv2D.
X_train_mnist = X_train_mnist.reshape((X_train_mnist.shape[0], 28, 28, 1))
X_test_mnist = X_test_mnist.reshape((X_test_mnist.shape[0], 28, 28, 1))

# Collapse the ten digit labels to a two-class problem: 9 versus everything else.
y_train_mnist = np.where(y_train_mnist == 9, 1, 0)
y_test_mnist = np.where(y_test_mnist == 9, 1, 0)


# Sparse categorical cross-entropy model: two softmax outputs, integer labels.
model_cce = Sequential()
model_cce.add(Conv2D(16, (3, 3), activation='relu', kernel_initializer='normal', input_shape=(28, 28, 1)))
model_cce.add(MaxPooling2D((2, 2)))
model_cce.add(Flatten())
model_cce.add(Dense(16, activation='relu', kernel_initializer='normal'))
model_cce.add(Dense(2, activation='softmax'))
model_cce.compile(optimizer='SGD', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cce.fit(X_train_mnist, y_train_mnist, epochs=5)

# ---------------------------------------------------------------------------
# /cc_eda.py
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


df = pd.read_csv("synthetic_transaction_data_Dining.csv")


df['log_transaction_amnt'] = np.log(df['transaction_amount'])

print(df.head())


df['transaction_date'] = pd.to_datetime(df['transaction_date'])
# Restrict the analysis to a single merchant.
df = df[df['merchant_name'] == "Cheesecake Factory"]

# Absolute z-score of every transaction amount.
z_scores = np.abs((df['transaction_amount'] - df['transaction_amount'].mean()) / df['transaction_amount'].std())
# Define a threshold (e.g., Z-score > 3) to identify outliers
threshold = 3
# Remove outliers from the DataFrame
# Copy the filtered rows so the column assignments below act on an
# independent frame rather than on a slice of the previous one.
df = df[z_scores < threshold].copy()

# Rows without a parsable date cannot be bucketed by month.  The original
# `month <= 12` / `month >= 1` filters had the same effect (comparisons with
# NaN are False), but only after a year/month string round-trip that breaks
# when the year and month are floats.
df = df[df['transaction_date'].notna()].copy()

df['year'] = df['transaction_date'].dt.year.astype(str)
df['month'] = df['transaction_date'].dt.month.astype(str)
# First day of the transaction's month.
df['month_year'] = df['transaction_date'].dt.to_period('M').dt.to_timestamp()


# Total spend per month, in chronological order.
df_grouped = df.groupby('month_year')['transaction_amount'].sum().reset_index()
df_grouped = df_grouped.set_index('month_year').sort_index()
df_grouped.index = pd.to_datetime(df_grouped.index)
plt.plot(df_grouped.index, df_grouped['transaction_amount'])


data = df[['transaction_amount', 'log_transaction_amnt']]
sns.pairplot(data)
plt.show()

state_abbreviations = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}


# Map full state names to the two-letter codes required by
# locationmode='USA-states'.  Names missing from the table map to NaN and so
# fall out of the groupby below.
df['merchant_state_abbr'] = df['merchant_state'].map(state_abbreviations)
# Distinct cardholders per state.
state_counts = df.groupby('merchant_state_abbr')['cardholder_name'].nunique().reset_index()
state_counts.columns = ['State', 'Customer_Count']

fig = px.choropleth(state_counts, locations='State', locationmode='USA-states',
                    color='Customer_Count', scope='usa',
                    color_continuous_scale='Blues',
                    title='Number of Customers by State')

fig.show()

# ---------------------------------------------------------------------------
# /time_series_analysis.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 14 13:37:58 2021

@author: sadrachpierre
"""


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

df = pd.read_csv("AirPassengers.csv")

print(df.head())

print(df.tail())

df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')


# Use the month as the index and drop the now-redundant column.
df.index = df['Month']
del df['Month']
print(df.head())


# sns.lineplot(data=df)
# plt.ylabel("Number of Passengers")
# plt.show()

# Rolling statistics over a 7-observation window.
rolling_mean = df.rolling(7).mean()
rolling_std = df.rolling(7).std()


plt.plot(df, color="blue", label="Original Passenger Data")
plt.plot(rolling_mean, color="red", label="Rolling Mean #Passenger")
plt.plot(rolling_std, color="black", label="Rolling Standard Deviation in #Passenger")
plt.title("Passenger Time Series, Rolling Mean, Standard Deviation")
plt.legend(loc="best")
plt.show()

# Augmented Dickey-Fuller test on the passenger series.  The column is passed
# explicitly; the original passed the whole DataFrame.
adft = adfuller(df['#Passengers'], autolag="AIC")

output_df = pd.DataFrame({"Values": [adft[0], adft[1], adft[2], adft[3], adft[4]['1%'], adft[4]['5%'], adft[4]['10%']],
                          "Metric": ["Test Statistics", "p-value", "No. of lags used", "Number of observations used",
                                     "critical value (1%)", "critical value (5%)", "critical value (10%)"]})
print(output_df)


# Autocorrelation at 1, 3, 6 and 9 month lags.
autocorrelation_lag1 = df['#Passengers'].autocorr(lag=1)
print("One Month Lag: ", autocorrelation_lag1)

autocorrelation_lag3 = df['#Passengers'].autocorr(lag=3)
print("Three Month Lag: ", autocorrelation_lag3)

autocorrelation_lag6 = df['#Passengers'].autocorr(lag=6)
print("Six Month Lag: ", autocorrelation_lag6)

autocorrelation_lag9 = df['#Passengers'].autocorr(lag=9)
print("Nine Month Lag: ", autocorrelation_lag9)


decompose = seasonal_decompose(df['#Passengers'], model='additive', period=7)
decompose.plot()
plt.show()
df['Date'] = df.index

# Chronological split at August 1960.  Each part is copied so that renaming
# and deleting columns does not act on a slice of `df`.
split_date = pd.to_datetime("1960-08", format='%Y-%m')

train = df[df['Date'] < split_date].copy()
train['train'] = train['#Passengers']
del train['Date']
del train['#Passengers']
test = df[df['Date'] >= split_date].copy()
del test['Date']
test['test'] = test['#Passengers']
del test['#Passengers']
plt.plot(train, color="black")
plt.plot(test, color="red")
plt.title("Train/Test split for Passenger Data")
plt.ylabel("Passenger Number")
plt.xlabel('Year-Month')
sns.set()
plt.show()


from pmdarima.arima import auto_arima

model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
model.fit(train)
forecast = model.predict(n_periods=len(test))
# Place the predicted values on the test index by position.  When `forecast`
# is a Series with its own index/name, DataFrame(forecast, index=...,
# columns=[...]) re-aligns it and can yield an all-NaN column.
forecast = pd.DataFrame({'Prediction': list(forecast)}, index=test.index)


plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.plot(forecast, label='Prediction')
plt.title('#Passenger Prediction')
plt.xlabel('Date')
plt.ylabel('Actual #Passenger')
plt.legend(loc='upper left', fontsize=8)
plt.show()


from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the forecast over the test period.  The original
# printed `rms` without ever computing it (NameError).
rms = sqrt(mean_squared_error(test['test'], forecast['Prediction']))
print("RMSE: ", rms)

# ---------------------------------------------------------------------------
# /time_series_forecasting.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 28 11:11:06 2021

@author: sadrachpierre
"""
import pandas as pd
import pandas_datareader as web
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# btc = web.get_data_yahoo(['BTC-USD'], start=datetime.datetime(2018, 1, 1), end=datetime.datetime(2020, 12, 2))

# btc = btc['Close']

# btc.to_csv("btc.csv")

btc = pd.read_csv("btc.csv")


btc.index = pd.to_datetime(btc['Date'], format='%Y-%m-%d')
del btc['Date']

print(btc.head())
sns.set()
plt.ylabel('BTC Price')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.plot(btc.index, btc['BTC-USD'])
plt.show()

# Chronological split at 1 November 2020.
split_date = pd.to_datetime("2020-11-01", format='%Y-%m-%d')
train = btc[btc.index < split_date]
test = btc[btc.index >= split_date]
print(test)
plt.plot(train, color="black", label='Training')
plt.plot(test, color="red", label='Testing')
plt.ylabel('BTC Price')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.title("Train/Test split for BTC Data")

y = train['BTC-USD']

# --- ARMA(1, 1), fitted through SARIMAX with no differencing ---------------
ARMAmodel = SARIMAX(y, order=(1, 0, 1))
ARMAmodel = ARMAmodel.fit()

y_pred = ARMAmodel.get_forecast(len(test.index))
y_pred_df = y_pred.conf_int(alpha=0.05)
y_pred_df["Predictions"] = ARMAmodel.predict(start=y_pred_df.index[0], end=y_pred_df.index[-1])
# Re-label the forecast rows with the test dates so they plot on the same axis.
y_pred_df.index = test.index
y_pred_out = y_pred_df["Predictions"]
plt.plot(y_pred_out, color='green', label='ARMA Predictions')
plt.legend()


import numpy as np
from sklearn.metrics import mean_squared_error

arma_rmse = np.sqrt(mean_squared_error(test["BTC-USD"].values, y_pred_df["Predictions"]))
print("ARMA RMSE: ", arma_rmse)


# --- ARIMA(5, 4, 2) --------------------------------------------------------
ARIMAmodel = ARIMA(y, order=(5, 4, 2))
ARIMAmodel = ARIMAmodel.fit()

y_pred = ARIMAmodel.get_forecast(len(test.index))
y_pred_df = y_pred.conf_int(alpha=0.05)
y_pred_df["Predictions"] = ARIMAmodel.predict(start=y_pred_df.index[0], end=y_pred_df.index[-1])
y_pred_df.index = test.index
y_pred_out = y_pred_df["Predictions"]
plt.plot(y_pred_out, color='Yellow', label='ARIMA Predictions')
plt.legend()


# Each model gets its own variable; the original overwrote `arma_rmse`, so
# only the last score survived.
arima_rmse = np.sqrt(mean_squared_error(test["BTC-USD"].values, y_pred_df["Predictions"]))
print("ARIMA RMSE: ", arima_rmse)


# --- SARIMA(5, 4, 2)x(2, 2, 2, 12) -----------------------------------------
SARIMAXmodel = SARIMAX(y, order=(5, 4, 2), seasonal_order=(2, 2, 2, 12))
SARIMAXmodel = SARIMAXmodel.fit()

y_pred = SARIMAXmodel.get_forecast(len(test.index))
y_pred_df = y_pred.conf_int(alpha=0.05)
y_pred_df["Predictions"] = SARIMAXmodel.predict(start=y_pred_df.index[0], end=y_pred_df.index[-1])
y_pred_df.index = test.index
y_pred_out = y_pred_df["Predictions"]
plt.plot(y_pred_out, color='Blue', label='SARIMA Predictions')
plt.legend()


sarima_rmse = np.sqrt(mean_squared_error(test["BTC-USD"].values, y_pred_df["Predictions"]))
print("SARIMA RMSE: ", sarima_rmse)

# Show the combined train/test/forecast figure; the original never did.
plt.show()
# --------------------------------------------------------------------------------
# /model_selection.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 16 13:11:36 2021

@author: sadrachpierre
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

# Lift pandas' display limits on both columns and rows.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv("telco_churn.csv")
print(df.head())

# 1 where the 'Churn' column reads 'Yes', 0 everywhere else.
churned = df['Churn'] == 'Yes'
df['Churn_binary'] = np.where(churned, 1, 0)

# Two-feature model input; the target is the raw 'Churn' column.
feature_columns = ['tenure', 'MonthlyCharges']
y = df['Churn']
X = df[feature_columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# folds = KFold(n_splits=5)
# folds.get_n_splits(X)


# from sklearn.metrics import accuracy_score
# fold = 0
# for train_index, test_index in folds.split(X):
#     X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
#     model = RandomForestClassifier()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     fold+=1
#     print(f"Accuracy in fold {fold}:", accuracy_score(y_pred, y_test))
# loo = LeaveOneOut()
# for train_index, test_index in loo.split(X):
#     X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]


import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, chi2
import matplotlib.pyplot as plt

# Single-space entries in TotalCharges are turned into NaN.
total_charges = df['TotalCharges']
df['TotalCharges'] = total_charges.replace(' ', np.nan)
# Zero-fill the missing TotalCharges values, then cast to float. The result
# is assigned back instead of calling fillna(inplace=True) on the selected
# column, which is not guaranteed to write through to df.
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df['TotalCharges'] = df['TotalCharges'].astype(float)


X = df[['tenure', 'MonthlyCharges', 'TotalCharges']]

y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Univariate F-test of every numerical predictor against the target.
numerical_predictors = ["MonthlyCharges", "TotalCharges", "tenure"]
numerical_selector = SelectKBest(f_classif, k=3)
numerical_selector.fit(X_train[numerical_predictors], y_train)


# -log10(p): a taller bar means a smaller p-value.
num_scores = -np.log10(numerical_selector.pvalues_)


plt.bar(range(len(numerical_predictors)), num_scores)
plt.xticks(range(len(numerical_predictors)), numerical_predictors, rotation='vertical')
plt.xlabel("Feature")
plt.ylabel("Score")
plt.show()

from sklearn.model_selection import RandomizedSearchCV


n_estimators = [50, 100, 200]
# 'auto' is left out of the grid: RandomForestClassifier no longer accepts it
# (removed in scikit-learn 1.3) and it was an alias of 'sqrt', which stays.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 30, num=5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

model = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                               n_iter=3, cv=3, verbose=2, random_state=42)
rf_random.fit(X_train, y_train)

parameters = rf_random.best_params_
print(parameters)

# --------------------------------------------------------------------------------
# /portfolio_opt.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 15 15:45:53 2021

@author: sadrachpierre
"""


import pandas_datareader.data as web
import datetime
import pandas as pd
from functools import reduce


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# start = datetime.datetime(2019,9,15)
# end = datetime.datetime(2021,9,15)

# def get_stock(ticker):
#     data = web.DataReader(f"{ticker}","yahoo",start,end)
#     data[f'{ticker}'] = data["Close"]#(data["Close"] - data["Open"])/data["Open"]
#     data = data[[f'{ticker}']]
#     print(data.head())
#     return data

# pfizer = get_stock("PFE")
# jnj = get_stock("JNJ")


# def combine_stocks(tickers):
#     data_frames = []
#     for i in tickers:
#         data_frames.append(get_stock(i))

#     df_merged = reduce(lambda left,right: pd.merge(left,right,on=['Date'],
#                                                    how='outer'), data_frames)
#     print(df_merged.head())
#     return df_merged


# stocks = ["MRNA", "PFE", "JNJ", "GOOGL",
#           "FB", "AAPL", "COST", "WMT", "KR", "JPM",
#           "BAC", "HSBC"]


# portfolio = combine_stocks(stocks)


# portfolio.to_csv("portfolio.csv", index=False)


portfolio = pd.read_csv("portfolio.csv")
print(portfolio.head())

from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage


# --- Mean-variance optimisation (maximum Sharpe ratio) ---
mu = mean_historical_return(portfolio)
S = CovarianceShrinkage(portfolio).ledoit_wolf()


ef = EfficientFrontier(mu, S)
weights = ef.max_sharpe()

cleaned_weights = ef.clean_weights()
print(dict(cleaned_weights))

ef.portfolio_performance(verbose=True)


from pypfopt.discrete_allocation import DiscreteAllocation, get_latest_prices

latest_prices = get_latest_prices(portfolio)

# Turn the weights into whole-share counts for a 100000 portfolio.
da = DiscreteAllocation(cleaned_weights, latest_prices, total_portfolio_value=100000)

allocation, leftover = da.greedy_portfolio()
print("Discrete allocation:", allocation)
print("Funds remaining: ${:.2f}".format(leftover))


# --- Hierarchical risk parity ---
from pypfopt import HRPOpt
returns = portfolio.pct_change().dropna()
hrp = HRPOpt(returns)
hrp_weights = hrp.optimize()
hrp.portfolio_performance(verbose=True)
print(dict(hrp_weights))

da_hrp = DiscreteAllocation(hrp_weights, latest_prices, total_portfolio_value=100000)

allocation, leftover = da_hrp.greedy_portfolio()
print("Discrete allocation (HRP):", allocation)
print("Funds remaining (HRP): ${:.2f}".format(leftover))


# --- Minimum conditional value at risk ---
from pypfopt.efficient_frontier import EfficientCVaR
S = portfolio.cov()
# EfficientCVaR's second argument is the table of historical returns, not a
# covariance matrix, so it receives the `returns` frame computed for HRP
# above rather than S.
ef_cvar = EfficientCVaR(mu, returns)
cvar_weights = ef_cvar.min_cvar()

cleaned_weights = ef_cvar.clean_weights()
print(dict(cleaned_weights))

ef_cvar.portfolio_performance(verbose=True)

da_cvar = DiscreteAllocation(cvar_weights, latest_prices, total_portfolio_value=100000)

allocation, leftover = da_cvar.greedy_portfolio()
print("Discrete allocation (CVAR):", allocation)
print("Funds remaining (CVAR): ${:.2f}".format(leftover))

# --------------------------------------------------------------------------------
# /optimization_tutorial.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 22 00:34:19 2022

@author: sadrachpierre
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from scipy.optimize import differential_evolution


df = pd.read_csv("Concrete_Data_Yeh.csv")

print(df.head())


# Model inputs, in the order the optimiser's solution vector is reported.
feature_names = ['cement', 'slag', 'flyash', 'water', 'superplasticizer',
                 'coarseaggregate', 'fineaggregate', 'age']
X = df[feature_names]

y = df['csMPa']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, max_depth=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE: ", rmse)


import matplotlib.pyplot as plt

plt.scatter(y_test, y_pred)
plt.title("Actual vs. Predicted")
plt.xlabel("Actual")
plt.ylabel("Predicted")


# Refit on every row; this is the model the optimisers query.
model_full = RandomForestRegressor(n_estimators=100, max_depth=100, random_state=42)
model_full.fit(X, y)


def obj_fun(X):
    """Return the negated model_full prediction for one candidate X.

    X is a single row of feature values. The prediction is negated because
    the result is handed to differential_evolution, which minimises. Each
    call increments and prints obj_fun.counter.

    Returns a plain float; the original returned a one-element array, which
    is not the scalar an objective function is expected to produce.
    """
    results = model_full.predict([X])
    obj_fun.counter += 1
    print(obj_fun.counter)
    return -float(results[0])


# (min, max) search interval for every feature, in feature_names order.
boundaries = [(df[name].min(), df[name].max()) for name in feature_names]


obj_fun.counter = 0

if __name__ == '__main__':

    opt_results = differential_evolution(obj_fun, boundaries)

    # Report the best value found for every feature.
    for feature_name, best_value in zip(feature_names, opt_results.x):
        print(f'{feature_name}:', best_value)
if __name__ == '__main__':
    # Last line of the differential-evolution report. obj_fun returns the
    # negated prediction, so the sign is flipped back here.
    print("Max Strength: ", -opt_results.fun)


# NOTE(review): the flattened listing does not preserve indentation, so it
# cannot be told whether the MAXLIPO section below originally sat under the
# __main__ guard; it is placed at module level here - confirm.
import dlib

maxlipo_features = ['cement', 'slag', 'flyash', 'water', 'superplasticizer',
                    'coarseaggregate', 'fineaggregate', 'age']
lbounds = [df[name].min() for name in maxlipo_features]
ubounds = [df[name].max() for name in maxlipo_features]
max_fun_calls = 1000


def maxlip_obj_fun(X1, X2, X3, X4, X5, X6, X7, X8):
    """Return the model_full prediction for one set of eight feature values.

    The arguments follow the order of lbounds/ubounds. The value is returned
    as a plain float rather than the one-element array predict() produces.
    """
    results = model_full.predict([[X1, X2, X3, X4, X5, X6, X7, X8]])
    return float(results[0])


sol, obj_val = dlib.find_max_global(maxlip_obj_fun, lbounds, ubounds, max_fun_calls)

print("MAXLIPO Results: ")
print('cement:', sol[0])
print('slag:', sol[1])
print('flyash:', sol[2])
print('water:', sol[3])
print('superplasticizer:', sol[4])
print('coarseaggregate:', sol[5])
print('fineaggregate:', sol[6])
print('age:', sol[7])


print("Max Strength: ", obj_val)


# --------------------------------------------------------------------------------
# /regularization_tutorial.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 15 14:55:53 2021

@author: sadrachpierre
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("telco_churn.csv")
df.fillna(0, inplace=True)
print(df.head())

import numpy as np
df['Churn'] = np.where(df['Churn'] == 'Yes', 1, 0)


def convert_categories(cat_list):
    """Add a float '<col>_cat' code column to df for every column in cat_list.

    Each listed column of the module-level df is converted to the pandas
    'category' dtype in place; the new column holds its category codes.
    """
    for col in cat_list:
        df[col] = df[col].astype('category')
        df[f'{col}_cat'] = df[col].cat.codes.astype(float)


category_list = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
convert_categories(category_list)

print(df.head())
# Unparseable TotalCharges entries become NaN and are then zero-filled. The
# result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which is not guaranteed to write through to df.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
cols = ['gender_cat', 'Partner_cat', 'Dependents_cat', 'PhoneService_cat', 'MultipleLines_cat', 'InternetService_cat',
        'OnlineSecurity_cat', 'OnlineBackup_cat', 'DeviceProtection_cat', 'TechSupport_cat', 'StreamingTV_cat',
        'StreamingMovies_cat', 'Contract_cat', 'PaperlessBilling_cat', 'PaymentMethod_cat', 'MonthlyCharges',
        'TotalCharges', 'SeniorCitizen', 'tenure']

X = df[cols]
print(X.head())
df['Churn'] = df['Churn'].astype(int)
y = df['Churn']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score


# --- Baseline network without a weight penalty ---
model = Sequential()
model.add(Dense(len(cols), input_shape=(len(cols),), kernel_initializer='normal', activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20)

y_pred = model.predict(X_test)
# Threshold the sigmoid output at 0.5 to get class labels.
y_pred = np.where(y_pred > 0.5, 1, 0)
print("Accuracy: ", accuracy_score(y_pred, y_test))


from tensorflow.keras import regularizers

# --- Same network with an L1 penalty on the first layer's kernel ---
model_lasso = Sequential()
model_lasso.add(Dense(len(cols), input_shape=(len(cols),), kernel_initializer='normal', activation='relu', kernel_regularizer=regularizers.l1(1e-6)))
model_lasso.add(Dense(32, activation='relu'))
model_lasso.add(Dense(32, activation='relu'))
model_lasso.add(Dense(1, activation='sigmoid'))
model_lasso.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lasso.fit(X_train, y_train, epochs=20)

y_pred = model_lasso.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)
print("Accuracy With Lasso: ", accuracy_score(y_pred, y_test))


# --- Same network with an L2 penalty on the first layer's kernel ---
model_ridge = Sequential()
model_ridge.add(Dense(len(cols), input_shape=(len(cols),), kernel_initializer='normal', activation='relu', kernel_regularizer=regularizers.l2(1e-6)))
model_ridge.add(Dense(32, activation='relu'))
model_ridge.add(Dense(32, activation='relu'))
model_ridge.add(Dense(1, activation='sigmoid'))
model_ridge.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_ridge.fit(X_train, y_train, epochs=20)

y_pred = model_ridge.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)
print("Accuracy With Ridge: ", accuracy_score(y_pred, y_test))

# --------------------------------------------------------------------------------
# /python_profiling_tutorial.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 4 18:02:29 2022

@author: sadrachpierre
"""

from memory_profiler import profile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

from catboost import CatBoostClassifier

from timeit import default_timer as timer

import cProfile, pstats, io
from pstats import SortKey


'''
df = pd.read_csv("creditcard.csv")

print(df.head())

print("Number of rows: ", len(df))

print("Number of columns: ", len(df.columns))

df = df.sample(10000, random_state=42)

df.to_csv("creditcard_subsample10000.csv", index=False)
'''


@profile
def read_data(filename):
    """Read the CSV at filename and return it as a DataFrame."""
    df = pd.read_csv(filename)
    return df


@profile
def data_prep(dataframe, columns):
    """Return only the listed columns of dataframe."""
    df_select = dataframe[columns]
    return df_select


@profile
def feature_engineering(dataframe, inputs, output):
    """Split dataframe into the input columns X and the output column y."""
    X = dataframe[inputs]
    y = dataframe[output]
    return X, y


@profile
def split_data(X, y):
    """Return X_train, X_test, y_train, y_test with a fixed-seed 33% test share."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)
    return X_train, X_test, y_train, y_test


@profile
def model_training(X_train, y_train, model_type):
    """Fit and return the classifier named by model_type.

    model_type is 'Logistic Regression' or 'CatBoost'.

    Raises:
        ValueError: for any other model_type. Previously this fell through
            to `return model` with no model bound (UnboundLocalError).
    """
    if model_type == 'Logistic Regression':
        model = LogisticRegression()
    elif model_type == 'CatBoost':
        model = CatBoostClassifier()
    else:
        raise ValueError(f"Unsupported model_type: {model_type!r}")
    model.fit(X_train, y_train)
    return model


@profile
def predict(model, X_test):
    """Return model's predictions for X_test."""
    y_pred = model.predict(X_test)
    return y_pred


@profile
def evaluate(y_pred, y_test):
    """Print the average precision score of y_pred against y_test."""
    precision = average_precision_score(y_test, y_pred)
    print("Precision: ", precision)


def _timed(metrics, key, func, *args):
    """Call func(*args), store the elapsed timer() seconds in metrics[key], return the result."""
    start = timer()
    result = func(*args)
    metrics[key] = timer() - start
    return result


def main():
    """Run the pipeline on creditcard.csv and print the duration of every stage."""
    runtime_metrics = dict()

    # read in data
    data = _timed(runtime_metrics, 'read_time', read_data, 'creditcard.csv')

    # select relevant columns
    columns = ['V1', 'V2', 'V3', 'Amount', 'Class']
    df_select = _timed(runtime_metrics, 'select_time', data_prep, data, columns)

    # define input and output
    inputs = ['V1', 'V2', 'V3']
    output = 'Class'
    X, y = _timed(runtime_metrics, 'data_prep_time', feature_engineering, df_select, inputs, output)

    # split data for training and testing
    X_train, X_test, y_train, y_test = _timed(runtime_metrics, 'split_time', split_data, X, y)

    # fit model
    model_type = 'CatBoost'
    model = _timed(runtime_metrics, 'fit_time', model_training, X_train, y_train, model_type)

    # make predictions
    y_pred = _timed(runtime_metrics, 'pred_time', predict, model, X_test)

    # evaluate model predictions; stored under its own key - it used to be
    # written to 'pred_time', overwriting the prediction timing above.
    _timed(runtime_metrics, 'eval_time', evaluate, y_pred, y_test)

    print(runtime_metrics)


if __name__ == "__main__":
    main()


# --------------------------------------------------------------------------------
# /empty_variables_and_datastructures.py:
# --------------------------------------------------------------------------------
age1 = 35
name1 = "Fred Philips"
income1 = 55250.15
senior_citizen1 = False


age2 = 42
name2 = "Josh Rogers"
income2 = 65240.25
senior_citizen2 = False


age3 = 28
name3 = "Bill Hanson"
income3 = 79250.65
senior_citizen3 = False


#age4 = ""
#name4 = 100
#income4 = 45250.65
#senior_citizen4 = True

#age4 = None
#name4 = None
#income4 = 45250.65
#senior_citizen4 = True


import numpy as np

# The fourth record uses NaN for every value that is missing.
age4 = np.nan
name4 = np.nan
income4 = 45250.65
senior_citizen4 = np.nan

# One NaN operand makes the whole average NaN.
avg_age = (age1 + age2 + age3 + age4)/4

print(avg_age)


ages = []
names = []
incomes = []
senior_citizen = []

# Collect the four records' values field by field.
ages.extend([age1, age2, age3, age4])
print("List of ages: ", ages)

names.extend([name1, name2, name3, name4])

print("List of names: ", names)


incomes.extend([income1, income2, income3, income4])


print("List of incomes: ", incomes)


senior_citizen.extend([senior_citizen1, senior_citizen2, senior_citizen3, senior_citizen4])


print("List of senior citizen status: ", senior_citizen)

# The same data keyed by field name.
demo_dict = {}

demo_dict['age'] = ages
demo_dict['name'] = names
demo_dict['income'] = incomes
demo_dict['senior_citizen'] = senior_citizen

print("Demographics Dictionary")
print(demo_dict)


import pandas as pd

# ...and as a DataFrame with one column per field.
demo_df = pd.DataFrame()

demo_df['age'] = ages
demo_df['name'] = names
demo_df['income'] = incomes
demo_df['senior_citizen'] = senior_citizen

print("Demographics Dataframe")
print(demo_df)


def income_after_tax(income, after_tax=np.nan):
    """Return income reduced by 22% when income is a float.

    For any other input the after_tax placeholder (NaN by default) is
    returned unchanged.

    The check used to read `income is float`, which compares the value with
    the type object itself and is never true for a number, so every call
    returned the placeholder.
    """
    if isinstance(income, float):
        after_tax = income - 0.22*income
    return after_tax


after_tax1 = income_after_tax(income1)
print("Before: ", income1)

print("After: ", after_tax1)


# None of these inputs is a float, so each call yields the NaN placeholder.
after_tax_invalid1 = income_after_tax('')
after_tax_invalid2 = income_after_tax(None)
after_tax_invalid3 = income_after_tax("income")
after_tax_invalid4 = income_after_tax(True)
after_tax_invalid5 = income_after_tax({})

print("after_tax_invalid1: ", after_tax_invalid1)
print("after_tax_invalid2: ", after_tax_invalid2)
print("after_tax_invalid3: ", after_tax_invalid3)
print("after_tax_invalid4: ", after_tax_invalid4)
print("after_tax_invalid5: ", after_tax_invalid5)


# NOTE(review): the flattened listing does not preserve indentation; in the
# three functions below the print call is placed after the `if`, so it runs
# for valid and invalid input alike - confirm against the original file.
def get_after_tax_list(input_list, out_list=None):
    """Print and return the after-tax (22% off) value of every item of input_list.

    When input_list is not a list, out_list is printed and returned instead;
    it defaults to a new empty list on every call, so callers never share
    one default object.
    """
    if out_list is None:
        out_list = []
    if isinstance(input_list, list):
        out_list = [x - 0.22*x for x in input_list]
    print("After Tax Incomes: ", out_list)
    return out_list


out_list1 = get_after_tax_list(incomes)
out_list2 = get_after_tax_list(5)


def get_income_truth_values(input_dict, output_dict=None):
    """Print and return {'avg_income': mean of input_dict['income']}.

    When input_dict is not a dict or has no 'income' key, output_dict is
    printed and returned instead; it defaults to a new
    {'avg_income': nan} on every call.
    """
    if output_dict is None:
        output_dict = {'avg_income': np.nan}
    if isinstance(input_dict, dict) and 'income' in input_dict:
        output_dict = {'avg_income': np.mean(input_dict['income'])}
    print(output_dict)
    return output_dict


get_income_truth_values(demo_dict)
get_income_truth_values(10000)

demo_df['state'] = ['NY', 'MA', 'NY', 'CA']
# Mean-fill the gaps. The result is assigned back instead of calling
# fillna(inplace=True) on the selected column, which is not guaranteed to
# write through to demo_df.
demo_df['age'] = demo_df['age'].fillna(demo_df['age'].mean())
demo_df['income'] = demo_df['income'].fillna(demo_df['income'].mean())


def income_age_groupby(input_df, output_df=None):
    """Print and return the mean age and income per state of input_df.

    When input_df is not a DataFrame holding 'age', 'income' and 'state'
    columns, output_df is printed and returned instead; it defaults to a new
    one-row all-NaN frame on every call.
    """
    if output_df is None:
        output_df = pd.DataFrame({'state': [np.nan], 'age': [np.nan], 'income': [np.nan]})
    if isinstance(input_df, pd.DataFrame) and {'age', 'income', 'state'}.issubset(input_df.columns):
        # Columns are selected with a list; the bare tuple form
        # ['age', 'income'] is rejected by pandas 2.0 and later.
        output_df = input_df.groupby(['state'])[['age', 'income']].mean().reset_index()
    print(output_df)
    return output_df


income_age_groupby(demo_df)
income_age_groupby([1, 2, 3])

# --------------------------------------------------------------------------------
# /financial_data_analysis.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 23 12:19:13 2021

@author: sadrachpierre
"""
import pandas_datareader.data as web
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
start = datetime.datetime(2019,6,23)
end = datetime.datetime(2021,6,23)

# amzn = web.DataReader('AMZN','yahoo',start,end)
# amzn['Date'] = amzn.index
# print(amzn.head())
# amzn.to_csv(f"amzn_{start}_{end}.csv", index=False)


# googl = web.DataReader('GOOGL','yahoo',start,end)
# googl['Date'] = googl.index
# print(googl.head())
# googl.to_csv(f"googl_{start}_{end}.csv", index=False)

# aapl = web.DataReader('AAPL','yahoo',start,end)
# aapl['Date'] = aapl.index
# print(aapl.head())
# aapl.to_csv(f"aapl_{start}_{end}.csv", index=False)


amzn_df = pd.read_csv(f'amzn_{start}_{end}.csv')
googl_df = pd.read_csv(f'googl_{start}_{end}.csv')
aapl_df = pd.read_csv(f'aapl_{start}_{end}.csv')

# print(aapl_df.head())

# Return of a row = (Close - Open) / Open, so a rising price is a positive
# return. The subtraction used to be reversed, flipping the sign of every
# value; this form matches the formula noted in /portfolio_opt.py.
amzn_df['Returns'] = (amzn_df['Close'] - amzn_df['Open'])/amzn_df['Open']
amzn_df['Returns'].hist()
import numpy as np
mean_amnz_returns = np.round(amzn_df['Returns'].mean(), 5)
std_amnz_returns = np.round(amzn_df['Returns'].std(), 2)
plt.title(f'AMZN Stock Price Returns Distribution; Mean {mean_amnz_returns}, STD: {std_amnz_returns}')
plt.show()


googl_df['Returns'] = (googl_df['Close'] - googl_df['Open'])/googl_df['Open']
googl_df['Returns'].hist()
mean_googl_returns = np.round(googl_df['Returns'].mean(), 5)
std_googl_returns = np.round(googl_df['Returns'].std(), 2)
plt.title(f'GOOGL Stock Price Returns Distribution; Mean {mean_googl_returns}, STD: {std_googl_returns}')
plt.show()

aapl_df['Returns'] = (aapl_df['Close'] - aapl_df['Open'])/aapl_df['Open']
aapl_df['Returns'].hist()
mean_aapl_returns = np.round(aapl_df['Returns'].mean(), 5)
std_aapl_returns = np.round(aapl_df['Returns'].std(), 2)
plt.title(f'AAPL Stock Price Returns Distribution; Mean {mean_aapl_returns}, STD: {std_aapl_returns}')
plt.show()


# Stack the three frames with a ticker label for the box plot.
amzn_df['Ticker'] = 'AMZN'
googl_df['Ticker'] = 'GOOGL'
aapl_df['Ticker'] = 'AAPL'

df = pd.concat([amzn_df, googl_df, aapl_df])
df = df[['Ticker', 'Returns']]
print(df.head())

sns.boxplot(x=df['Ticker'], y=df['Returns'])
plt.title('Box Plot for AMZN, GOOGL and AAPL Returns')
plt.show()

# Correlation of the three return series, aligned on the row index.
df_corr = pd.DataFrame({'AMZN': amzn_df['Returns'], 'GOOGL': googl_df['Returns'], 'AAPL': aapl_df['Returns']})
print(df_corr.head())
corr = df_corr.corr()
sns.heatmap(corr, annot=True)
plt.show()


# Only rows dated after the cutoff are kept for the moving averages. Each
# filtered frame is copied so the new columns are not written to a slice.
cutoff = datetime.datetime(2021,1,23)
amzn_df['Date'] = pd.to_datetime(amzn_df['Date'], format='%Y/%m/%d')
amzn_df = amzn_df[amzn_df['Date'] > cutoff].copy()
amzn_df['SMA_10'] = amzn_df['Close'].rolling(window=10).mean()
print(amzn_df.head())
plt.plot(amzn_df['Date'], amzn_df['SMA_10'])
plt.plot(amzn_df['Date'], amzn_df['Adj Close'])
plt.title("Moving average and Adj Close price for AMZN")
plt.ylabel('Adj Close Price')
plt.xlabel('Date')
plt.show()


googl_df['Date'] = pd.to_datetime(googl_df['Date'], format='%Y/%m/%d')
googl_df = googl_df[googl_df['Date'] > cutoff].copy()
googl_df['SMA_10'] = googl_df['Close'].rolling(window=10).mean()
print(googl_df.head())
plt.plot(googl_df['Date'], googl_df['SMA_10'])
plt.plot(googl_df['Date'], googl_df['Adj Close'])
plt.title("Moving average and Adj Close price for GOOGL")
plt.ylabel('Adj Close Price')
plt.xlabel('Date')
plt.show()


aapl_df['Date'] = pd.to_datetime(aapl_df['Date'], format='%Y/%m/%d')
aapl_df = aapl_df[aapl_df['Date'] > cutoff].copy()
aapl_df['SMA_10'] = aapl_df['Close'].rolling(window=10).mean()
# Show the AAPL frame; this line used to print googl_df a second time.
print(aapl_df.head())
plt.plot(aapl_df['Date'], aapl_df['SMA_10'])
plt.plot(aapl_df['Date'], aapl_df['Adj Close'])
plt.title("Moving average and Adj Close price for AAPL")
plt.ylabel('Adj Close Price')
plt.xlabel('Date')
plt.show()

# Bands: SMA_10 plus/minus twice the 10-row rolling standard deviation of
# 'Adj Close'.
amzn_df['SMA_10_STD'] = amzn_df['Adj Close'].rolling(window=10).std()
amzn_df['Upper Band'] = amzn_df['SMA_10'] + (amzn_df['SMA_10_STD'] * 2)
amzn_df['Lower Band'] = amzn_df['SMA_10'] - (amzn_df['SMA_10_STD'] * 2)
amzn_df.index = amzn_df['Date']
amzn_df[['Adj Close', 'SMA_10', 'Upper Band', 'Lower Band']].plot(figsize=(12,6))
plt.title('10 Day Bollinger Band for Amazon')
plt.ylabel('Adjusted Close Price')
plt.show()


googl_df['SMA_10_STD'] = googl_df['Adj Close'].rolling(window=10).std()
googl_df['Upper Band'] = googl_df['SMA_10'] + (googl_df['SMA_10_STD'] * 2)
googl_df['Lower Band'] = googl_df['SMA_10'] - (googl_df['SMA_10_STD'] * 2)
googl_df.index = googl_df['Date']
googl_df[['Adj Close', 'SMA_10', 'Upper Band', 'Lower Band']].plot(figsize=(12,6))
plt.title('10 Day Bollinger Band for Google')
plt.ylabel('Adjusted Close Price')
plt.show()


aapl_df['SMA_10_STD'] = aapl_df['Adj Close'].rolling(window=10).std()
aapl_df['Upper Band'] = aapl_df['SMA_10'] + (aapl_df['SMA_10_STD'] * 2)
aapl_df['Lower Band'] = aapl_df['SMA_10'] - (aapl_df['SMA_10_STD'] * 2)
aapl_df.index = aapl_df['Date']
aapl_df[['Adj Close', 'SMA_10', 'Upper Band', 'Lower Band']].plot(figsize=(12,6))
plt.title('10 Day Bollinger Band for Apple')
plt.ylabel('Adjusted Close Price')
plt.show()

# --------------------------------------------------------------------------------
# /dimensionality_reduction.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 8 16:10:03 2021

@author: sadrachpierre
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


df = pd.read_csv("Loan_status_2007-2020Q3.gzip")


print("Number of Columns: ", len(list(df.columns)))
print("Number of rows: ", len(df))


print(df.head())


# Keep the credit-card loans only, reduced to the columns listed below, and
# store that subset in its own CSV.
df = df[df['purpose'] == 'credit_card']


columns = ['loan_amnt', 'loan_status', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate','mths_since_recent_revol_delinq','home_ownership', 'verification_status',
           'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'last_fico_range_low', 'last_fico_range_high']


df = df[columns]
df.to_csv("credit_card_loan.csv", index=False)


df_credit = pd.read_csv("credit_card_loan.csv")

print("Number of Columns: ", len(list(df_credit.columns)))
print("Number of rows: ", len(df_credit))


def fill_na(numerical_column):
    """Replace the missing values of one df_credit column with the column mean.

    The filled column is assigned back to the module-level df_credit.
    Calling fillna(inplace=True) on the selected column, as before, is not
    guaranteed to write through to the frame.
    """
    column_mean = df_credit[numerical_column].mean()
    df_credit[numerical_column] = df_credit[numerical_column].fillna(column_mean)


fill_na('mths_since_recent_revol_delinq')
fill_na('num_accts_ever_120_pd')
fill_na('num_actv_bc_tl')
fill_na('num_actv_rev_tl')
fill_na('avg_cur_bal')
fill_na('bc_open_to_buy')
fill_na('bc_util')


def convert_categories(categorical_columnn):
    """Add a '<column>_cat' column of category codes to df_credit.

    The named column of the module-level df_credit is converted to the
    pandas 'category' dtype in place.
    """
    df_credit[categorical_columnn] = df_credit[categorical_columnn].astype('category')
    df_credit[f'{categorical_columnn}_cat'] = df_credit[categorical_columnn].cat.codes


convert_categories('home_ownership')
convert_categories('verification_status')
convert_categories('term')


print(set(df_credit['loan_status']))

# Keep only the loans whose status is one of these three values.
df_credit = df_credit[df_credit['loan_status'].isin(['Fully Paid', 'Default', 'Charged Off'])]
print(df_credit.head()) 77 | 78 | 79 | 80 | df_credit['loan_status_label'] = np.where(df_credit['loan_status'] == 'Fully Paid', 0, 1) 81 | columns2 = ['loan_amnt', 'loan_status_label', 'funded_amnt', 'funded_amnt_inv', 'term_cat', 'int_rate','mths_since_recent_revol_delinq','home_ownership_cat', 'verification_status_cat', 82 | 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'last_fico_range_low', 'last_fico_range_high'] 83 | df_credit = df_credit[columns2] 84 | print(df_credit.head()) 85 | 86 | df_credit['int_rate'] = df_credit['int_rate'].str.rstrip('%') 87 | df_credit['int_rate'] = df_credit['int_rate'].astype(float) 88 | df_credit.fillna(0, inplace=True) 89 | 90 | 91 | X = df_credit[['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term_cat', 'int_rate','mths_since_recent_revol_delinq','home_ownership_cat', 'verification_status_cat', 92 | 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'last_fico_range_low', 'last_fico_range_high']] 93 | y = df_credit['loan_status_label'] 94 | 95 | X_train, X_test, y_train, y_test = train_test_split(X, y , random_state=42, test_size = 0.33) 96 | 97 | 98 | import seaborn as sns 99 | import matplotlib.pyplot as plt 100 | model = RandomForestClassifier() 101 | model.fit(X_train, y_train) 102 | 103 | features = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term_cat', 'int_rate','mths_since_recent_revol_delinq','home_ownership_cat', 'verification_status_cat', 104 | 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'last_fico_range_low', 'last_fico_range_high'] 105 | 106 | 107 | feature_df = pd.DataFrame({"Importance":model.feature_importances_, "Features": features }) 108 | sns.set() 109 | plt.bar(feature_df["Features"], 
feature_df["Importance"]) 110 | plt.xticks(rotation=90) 111 | plt.title("Random Forest Model Feature Importance") 112 | plt.show() 113 | 114 | from sklearn.decomposition import PCA 115 | from sklearn.preprocessing import StandardScaler 116 | 117 | features2 = ['loan_amnt', 'loan_status_label', 'funded_amnt', 'funded_amnt_inv', 'term_cat', 'int_rate','mths_since_recent_revol_delinq','home_ownership_cat', 'verification_status_cat', 118 | 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'last_fico_range_low', 'last_fico_range_high'] 119 | 120 | 121 | 122 | 123 | X = df_credit[features2] 124 | scaler = StandardScaler() 125 | 126 | scaler.fit(X) 127 | X_scaled=scaler.transform(X) 128 | 129 | pca=PCA(n_components=4) 130 | pca.fit(X_scaled) 131 | X_components=pca.transform(X_scaled) 132 | 133 | components_df = pd.DataFrame({'component_one': list(X_components[:,0]), 'component_two': list(X_components[:,1]), 134 | 'component_three': list(X_components[:,2]), 'component_four': list(X_components[:,3])}) 135 | 136 | print(components_df.head()) 137 | 138 | 139 | 140 | 141 | labels=X.loan_status_label 142 | color_dict={0:'Red',1:'Blue'} 143 | 144 | 145 | 146 | fig,ax=plt.subplots(figsize=(7,5)) 147 | 148 | sns.set() 149 | for i in np.unique(labels): 150 | index=np.where(labels==i) 151 | ax.scatter(components_df['component_one'].loc[index],components_df['component_two'].loc[index],c=color_dict[i],s=10, 152 | label=i) 153 | 154 | 155 | plt.xlabel("1st Component",fontsize=14) 156 | plt.ylabel("2nd Component",fontsize=14) 157 | plt.title('Scatter Plot of Principal Components') 158 | plt.legend() 159 | plt.show() 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /model_explainability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 
Created on Tue Jul 20 12:31:27 2021 5 | 6 | @author: sadrachpierre 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | 11 | 12 | pd.set_option('display.max_columns', None) 13 | pd.set_option('display.max_rows', None) 14 | 15 | df = pd.read_csv("telco_churn.csv") 16 | 17 | df['gender_cat'] = df['gender'].astype('category') 18 | df['gender_cat'] = df['gender_cat'].cat.codes 19 | 20 | df['PaperlessBilling_cat'] = df['PaperlessBilling'].astype('category') 21 | df['PaperlessBilling_cat'] = df['PaperlessBilling_cat'].cat.codes 22 | 23 | 24 | 25 | df['Contract_cat'] = df['Contract'].astype('category') 26 | df['Contract_cat'] = df['Contract_cat'].cat.codes 27 | 28 | 29 | df['PaymentMethod_cat'] = df['PaymentMethod'].astype('category') 30 | df['PaymentMethod_cat'] = df['PaymentMethod_cat'].cat.codes 31 | 32 | 33 | df['Partner_cat'] = df['Partner'].astype('category') 34 | df['Partner_cat'] = df['Partner_cat'].cat.codes 35 | 36 | 37 | 38 | df['Dependents_cat'] = df['Dependents'].astype('category') 39 | df['Dependents_cat'] = df['Dependents_cat'].cat.codes 40 | 41 | 42 | df['DeviceProtection_cat'] = df['DeviceProtection'].astype('category') 43 | df['DeviceProtection_cat'] = df['DeviceProtection_cat'].cat.codes 44 | 45 | 46 | print(df.head()) 47 | 48 | df['churn_score'] = np.where(df['Churn']=='Yes', 1, 0) 49 | 50 | X = df[[ 'tenure', 'MonthlyCharges', 'gender_cat', 'PaperlessBilling_cat', 51 | 'Contract_cat','PaymentMethod_cat', 'Partner_cat', 'Dependents_cat', 'DeviceProtection_cat' ]] 52 | y = df['churn_score'] 53 | 54 | from sklearn.model_selection import train_test_split 55 | 56 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 57 | 58 | 59 | from sklearn.linear_model import LogisticRegression 60 | 61 | lr_model = LogisticRegression() 62 | lr_model.fit(X_train, y_train) 63 | 64 | y_pred = lr_model.predict(X_test) 65 | 66 | 67 | from sklearn.metrics import confusion_matrix 68 | conmat = confusion_matrix(y_test, y_pred) 69 | 
70 | val = np.mat(conmat) 71 | classnames = list(set(y_train)) 72 | 73 | df_cm = pd.DataFrame( 74 | val, index=classnames, columns=classnames, 75 | ) 76 | df_cm = df_cm.astype('float') / df_cm.sum(axis=1)[:, np.newaxis] 77 | 78 | import matplotlib.pyplot as plt 79 | import seaborn as sns 80 | 81 | plt.figure() 82 | heatmap = sns.heatmap(df_cm, annot=True, cmap="Blues") 83 | heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right') 84 | heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right') 85 | plt.ylabel('True label') 86 | plt.xlabel('Predicted label') 87 | plt.title('Churn Logistic Regression Model Results') 88 | plt.show() 89 | 90 | from sklearn.inspection import plot_partial_dependence 91 | features = [0, 1, (1, 0)] 92 | plot_partial_dependence(lr_model, X_train, features, target=1) 93 | 94 | 95 | from sklearn.ensemble import RandomForestClassifier 96 | 97 | rf_model = RandomForestClassifier(n_estimators=100, max_depth=10) 98 | rf_model.fit(X_train, y_train) 99 | 100 | y_pred_rf = rf_model.predict(X_test) 101 | 102 | 103 | 104 | 105 | conmat = confusion_matrix(y_test, y_pred_rf) 106 | 107 | val = np.mat(conmat) 108 | classnames = list(set(y_train)) 109 | 110 | df_cm_rf = pd.DataFrame( 111 | val, index=classnames, columns=classnames, 112 | ) 113 | df_cm_rf = df_cm_rf.astype('float') / df_cm_rf.sum(axis=1)[:, np.newaxis] 114 | 115 | 116 | 117 | plt.figure() 118 | heatmap = sns.heatmap(df_cm_rf, annot=True, cmap="Blues") 119 | heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right') 120 | heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right') 121 | plt.ylabel('True label') 122 | plt.xlabel('Predicted label') 123 | plt.title('Churn Random Forest Model Results') 124 | plt.show() 125 | 126 | features = ['tenure', 'MonthlyCharges', 'gender_cat', 'PaperlessBilling_cat', 127 | 'Contract_cat','PaymentMethod_cat', 'Partner_cat', 'Dependents_cat', 
'DeviceProtection_cat' ] 128 | 129 | print(rf_model.feature_importances_) 130 | feature_df = pd.DataFrame({'Importance':rf_model.feature_importances_, 'Features': features }) 131 | 132 | sns.set() 133 | plt.bar(feature_df['Features'], feature_df['Importance']) 134 | plt.xticks(rotation=90) 135 | plt.title('Random Forest Model Feature Importance') 136 | plt.show() 137 | 138 | from tensorflow.keras.models import Sequential 139 | from tensorflow.keras.layers import Dense 140 | 141 | model = Sequential() 142 | model.add(Dense(8, input_shape = (len(features),))) 143 | model.add(Dense(8, activation='relu')) 144 | 145 | model.add(Dense(1, activation='sigmoid')) 146 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 147 | 148 | model.fit(X_train, y_train, epochs = 1) 149 | 150 | y_pred_nn = [round(float(x)) for x in model.predict(X_test)] 151 | 152 | conmat = confusion_matrix(y_test, y_pred_nn) 153 | 154 | val = np.mat(conmat) 155 | classnames = list(set(y_train)) 156 | 157 | df_cm_nn = pd.DataFrame( 158 | val, index=classnames, columns=classnames, 159 | ) 160 | df_cm_nn = df_cm_nn.astype('float') / df_cm_nn.sum(axis=1)[:, np.newaxis] 161 | 162 | 163 | plt.figure() 164 | heatmap = sns.heatmap(df_cm_nn, annot=True, cmap="Blues") 165 | heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right') 166 | heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right') 167 | plt.ylabel('True label') 168 | plt.xlabel('Predicted label') 169 | plt.title('Churn Neural Network Model Results') 170 | plt.show() 171 | 172 | import shap 173 | 174 | f = lambda x: model.predict(x) 175 | med = X_train.median().values.reshape((1,X_train.shape[1])) 176 | 177 | explainer = shap.Explainer(f, med) 178 | shap_values = explainer(X_test.iloc[0:1000,:]) 179 | 180 | shap.plots.beeswarm(shap_values) 181 | 182 | 183 | import lime 184 | from lime import lime_tabular 185 | 186 | explainer = 
lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),feature_names=X_train.columns,class_names=['Yes', 'No'], 187 | mode='classification') 188 | 189 | exp = explainer.explain_instance(data_row=X_test.iloc[1], predict_fn=model.predict, labels=(0,)) 190 | 191 | exp.show_in_notebook(show_table=True) 192 | -------------------------------------------------------------------------------- /profiling_debugging_mlworkflow.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import f1_score, accuracy_score, precision_score\nimport functools\nimport time","metadata":{"tags":[],"cell_id":"1a3b7777486a4415836cf8421c910742","allow_embed":"code_output","source_hash":"7dc94c80","execution_start":1666298424461,"execution_millis":759,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":1},{"cell_type":"code","source":"#runtime \ndef runtime_monitor(input_function):\n @functools.wraps(input_function)\n def runtime_wrapper(*args, **kwargs):\n start_value = time.perf_counter() \n return_value = input_function(*args, **kwargs)\n end_value = time.perf_counter() \n runtime_value = end_value - start_value \n print(f\"Finished executing {input_function.__name__} in {runtime_value} seconds\")\n return return_value\n return runtime_wrapper","metadata":{"tags":[],"cell_id":"062b50752fa44f09a0a6db5c1c45f2c8","source_hash":"3e47cae","execution_start":1666298425233,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":2},{"cell_type":"code","source":"#debugger\ndef debugging_method(input_function):\n @functools.wraps(input_function)\n def debugging_wrapper(*args, **kwargs):\n arguments = []\n keyword_arguments = []\n for a in args:\n arguments.append(repr(a)) 
\n for key, value in kwargs.items():\n keyword_arguments.append(f\"{key}={value}\")\n function_signature = arguments + keyword_arguments \n function_signature = \"; \".join(function_signature) \n print(f\"{input_function.__name__} has the following signature: {function_signature}\")\n return_value = input_function(*args, **kwargs)\n print(f\"{input_function.__name__} has the following return: {return_value}\") \n return return_value\n return debugging_wrapper","metadata":{"tags":[],"cell_id":"10dea5c6939b458885ce0df3a76c5c15","source_hash":"d517a62f","execution_start":1666298425245,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"code","source":"@debugging_method\n@runtime_monitor\ndef data_preparation(columns, test_size, datatype_dict):\n df = pd.read_csv(\"telco_churn.csv\")\n df_subset = df[columns].copy()\n \n for col in columns:\n df_subset[col] = df_subset[col].astype(datatype_dict[col])\n\n for col in columns:\n if datatype_dict[col] == \"category\":\n df_subset[col] = df_subset[col].cat.codes\n X = df_subset[[\"gender\", \"tenure\", \"PhoneService\", \"MultipleLines\",\"MonthlyCharges\",]]\n y = df_subset[\"Churn\"]\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)\n return X_train, X_test, y_train, y_test\n\ncolumns = [\"gender\", \"tenure\", \"PhoneService\", \"MultipleLines\",\"MonthlyCharges\", \"Churn\"]\ndatatype_dict = {\"gender\":\"category\", \"tenure\":\"float\", \"PhoneService\":\"category\", \"MultipleLines\":\"category\", \"MonthlyCharges\":\"float\", \"Churn\":\"category\"}\nX_train, X_test, y_train, y_test = data_preparation(columns, 0.33, 
datatype_dict)","metadata":{"tags":[],"cell_id":"e656f853d7a848dcbf536b1a7a73c461","allow_embed":"code_output","source_hash":"1eb8f45e","execution_start":1666298425257,"execution_millis":122,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"data_preparation has the following signature: ['gender', 'tenure', 'PhoneService', 'MultipleLines', 'MonthlyCharges', 'Churn']; 0.33; {'gender': 'category', 'tenure': 'float', 'PhoneService': 'category', 'MultipleLines': 'category', 'MonthlyCharges': 'float', 'Churn': 'category'}\nFinished executing data_preparation in 0.04060269000183325 seconds\ndata_preparation has the following return: ( gender tenure PhoneService MultipleLines MonthlyCharges\n298 1 40.0 1 2 74.55\n3318 1 10.0 0 1 29.50\n5586 0 27.0 1 0 19.15\n6654 0 7.0 1 2 86.50\n5362 1 65.0 1 2 24.75\n... ... ... ... ... ...\n3772 1 1.0 1 0 95.00\n5191 0 23.0 1 2 91.10\n5226 1 12.0 1 0 21.15\n5390 1 12.0 1 2 99.45\n860 1 26.0 1 0 19.80\n\n[4718 rows x 5 columns], gender tenure PhoneService MultipleLines MonthlyCharges\n185 0 1.0 0 1 24.80\n2715 1 41.0 1 2 25.25\n3825 0 52.0 1 0 19.35\n1807 0 1.0 1 0 76.35\n132 1 67.0 1 0 50.55\n... ... ... ... ... 
...\n4147 1 71.0 1 2 24.85\n3542 1 29.0 0 1 55.35\n3759 1 7.0 1 2 89.35\n1114 1 32.0 1 2 98.85\n4958 0 59.0 1 2 94.75\n\n[2325 rows x 5 columns], 298 0\n3318 1\n5586 0\n6654 1\n5362 0\n ..\n3772 1\n5191 0\n5226 0\n5390 1\n860 0\nName: Churn, Length: 4718, dtype: int8, 185 1\n2715 0\n3825 0\n1807 1\n132 0\n ..\n4147 0\n3542 0\n3759 1\n1114 0\n4958 0\nName: Churn, Length: 2325, dtype: int8)\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"@debugging_method\n@runtime_monitor\ndef fit_model(X_train,y_train):\n model = RandomForestClassifier(random_state=42)\n model.fit(X_train,y_train)\n return model\n\nmodel = fit_model(X_train,y_train)","metadata":{"tags":[],"cell_id":"635aabc0e8524b0b93a573a8e76c18a9","source_hash":"bec93e9b","execution_start":1666298425378,"execution_millis":482,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"fit_model has the following signature: gender tenure PhoneService MultipleLines MonthlyCharges\n298 1 40.0 1 2 74.55\n3318 1 10.0 0 1 29.50\n5586 0 27.0 1 0 19.15\n6654 0 7.0 1 2 86.50\n5362 1 65.0 1 2 24.75\n... ... ... ... ... 
...\n3772 1 1.0 1 0 95.00\n5191 0 23.0 1 2 91.10\n5226 1 12.0 1 0 21.15\n5390 1 12.0 1 2 99.45\n860 1 26.0 1 0 19.80\n\n[4718 rows x 5 columns]; 298 0\n3318 1\n5586 0\n6654 1\n5362 0\n ..\n3772 1\n5191 0\n5226 0\n5390 1\n860 0\nName: Churn, Length: 4718, dtype: int8\nFinished executing fit_model in 0.5037549450025836 seconds\nfit_model has the following return: RandomForestClassifier(random_state=42)\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"@debugging_method\n@runtime_monitor\ndef predict(X_test, model):\n y_pred = model.predict(X_test)\n return y_pred \n\ny_pred = predict(X_test, model)","metadata":{"tags":[],"cell_id":"24bff66e9b314d7c8b4b97f458b1ecfc","source_hash":"f081a43d","execution_start":1666298425870,"execution_millis":62,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"predict has the following signature: gender tenure PhoneService MultipleLines MonthlyCharges\n185 0 1.0 0 1 24.80\n2715 1 41.0 1 2 25.25\n3825 0 52.0 1 0 19.35\n1807 0 1.0 1 0 76.35\n132 1 67.0 1 0 50.55\n... ... ... ... ... ...\n4147 1 71.0 1 2 24.85\n3542 1 29.0 0 1 55.35\n3759 1 7.0 1 2 89.35\n1114 1 32.0 1 2 98.85\n4958 0 59.0 1 2 94.75\n\n[2325 rows x 5 columns]; RandomForestClassifier(random_state=42)\nFinished executing predict in 0.05748910900001647 seconds\npredict has the following return: [1 0 0 ... 
1 1 0]\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"@debugging_method\n@runtime_monitor\ndef model_performance(y_pred, y_test):\n print(\"f1_score\", f1_score(y_test, y_pred))\n print(\"accuracy_score\", accuracy_score(y_test, y_pred))\n print(\"precision_score\", precision_score(y_test, y_pred))\n \nmodel_performance(y_pred, y_test)","metadata":{"tags":[],"cell_id":"40c1589e5d8e4bd184bb50594c374530","source_hash":"7f601050","execution_start":1666298425942,"execution_millis":44,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"model_performance has the following signature: array([1, 0, 0, ..., 1, 1, 0], dtype=int8); 185 1\n2715 0\n3825 0\n1807 1\n132 0\n ..\n4147 0\n3542 0\n3759 1\n1114 0\n4958 0\nName: Churn, Length: 2325, dtype: int8\nf1_score 0.5083848190644307\naccuracy_score 0.7604301075268817\nprecision_score 0.5702970297029702\nFinished executing model_performance in 0.0064978350019373465 seconds\nmodel_performance has the following return: None\n","output_type":"stream"}],"execution_count":7},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"c1f86b571e3e46e9a7a160785e87a9df","deepnote_persisted_session":{"createdAt":"2022-10-20T19:16:40.943Z"},"deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /python_inheritance.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\ndf = 
pd.read_csv('telco_churn.csv')","metadata":{"tags":[],"cell_id":"56fdc3cca3fa4d5398566ec31bfc2529","source_hash":"d5ef50cb","execution_start":1677703521895,"execution_millis":71,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":5},{"cell_type":"code","source":"df['gender'] = df['gender'].astype('category')\ndf['gender_cat'] = df['gender'].cat.codes\n\ndf['InternetService'] = df['InternetService'].astype('category')\ndf['InternetService_cat'] = df['InternetService'].cat.codes\n\ndf['OnlineSecurity'] = df['OnlineSecurity'].astype('category')\ndf['OnlineSecurity_cat'] = df['OnlineSecurity'].cat.codes\n\ndf['Churn'] = np.where(df['Churn']=='Yes', 1, 0)\ncols = ['MonthlyCharges', 'tenure', 'gender_cat', 'InternetService_cat', 'OnlineSecurity_cat']","metadata":{"tags":[],"cell_id":"0cc961b4e48348e0a0e3c524afc8b65e","source_hash":"229ca32c","execution_start":1677703523251,"execution_millis":10,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":6},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nX = df[cols]\ny = df['Churn']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)","metadata":{"tags":[],"cell_id":"231be36cfe4f47d99b55f9eb9212f783","source_hash":"e60adda4","execution_start":1677703536430,"execution_millis":888,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":7},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\n\nclass CustomClassifier(RandomForestClassifier):\n def __init__(self, test_size=0.2, **kwargs):\n super().__init__(**kwargs)\n self.test_size = test_size\n \n def split_data(self):\n self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size, 
random_state=42)","metadata":{"tags":[],"cell_id":"aab6726c002f4360818991abdefc7290","source_hash":"e43011cf","execution_start":1677703686000,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":10},{"cell_type":"code","source":"rf_model = CustomClassifier(0.2)\nrf_model.split_data()\nrf_model.fit(rf_model.X_train, rf_model.y_train)","metadata":{"tags":[],"cell_id":"4bdd5752b11a460591fac71ce83399ac","source_hash":"187c58df","execution_start":1677703747150,"execution_millis":339,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":12,"data":{"text/plain":"CustomClassifier()","text/html":"
CustomClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"},"metadata":{}}],"execution_count":12},{"cell_type":"code","source":"importances = dict(zip(rf_model.feature_names_in_, rf_model.feature_importances_))\nprint(\"Feature Importances: \", importances)","metadata":{"tags":[],"cell_id":"8f4b23aa7ec74a50a3c6382a89c279cf","source_hash":"99041d00","execution_start":1677703749076,"execution_millis":11,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Feature Importances: {'MonthlyCharges': 0.5180558283262937, 'tenure': 0.3413313756123183, 'gender_cat': 0.016723313513091347, 'InternetService_cat': 0.041761327477205484, 'OnlineSecurity_cat': 0.08212815507109114}\n","output_type":"stream"}],"execution_count":13},{"cell_type":"code","source":"from sklearn.metrics import confusion_matrix\nimport seaborn as sns\n\nclass Model:\n def __init__(self):\n self.n_estimators = 10\n self.max_depth = 10\n self.y_test = y_test\n self.y_train = y_train\n self.X_train = X_train\n self.X_test = X_test\n def fit(self):\n self.model = RandomForestClassifier(n_estimators = self.n_estimators, max_depth = self.max_depth, random_state=42)\n self.model.fit(self.X_train, self.y_train)\n\n def predict(self):\n self.y_pred = self.model.predict(X_test) \n return self.y_pred \n ","metadata":{"tags":[],"cell_id":"60e0c062244a4a739afb32cecd10ffc9","source_hash":"76d76ad1","execution_start":1677704087118,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":20},{"cell_type":"code","source":"class ModelVisualization(Model):\n def __init__(self):\n super().__init__()\n\n def generate_confusion_matrix(self):\n cm = confusion_matrix(self.y_test, self.y_pred)\n cm = cm / cm.astype(np.float).sum(axis=1)\n sns.heatmap(cm, annot=True, 
cmap='Blues')","metadata":{"tags":[],"cell_id":"b2933702c0a94a79a2837c15174a2362","source_hash":"76b32020","execution_start":1677704097146,"execution_millis":6,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":21},{"cell_type":"code","source":"results = ModelVisualization()\nresults.fit()\nresults.predict()\nresults.generate_confusion_matrix()","metadata":{"tags":[],"cell_id":"123098651d7541ed9d181e38d101609a","source_hash":"89d5ec2e","execution_start":1677704099484,"execution_millis":351,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stderr","text":"/tmp/ipykernel_81/4231721142.py:7: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\nDeprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n cm = cm / cm.astype(np.float).sum(axis=1)\n","output_type":"stream"},{"data":{"text/plain":"
","image/png":"\n"},"metadata":{"image/png":{"width":515,"height":413}},"output_type":"display_data"}],"execution_count":22},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"13925050acc04cc2a57a44357cb20293","deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /pareto_chart_er_readmission.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 24, 6 | "id": "9ea9df5a", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "\"\\n 'patient_id': 'Unique identifier for each patient',\\n 'readmission_flag': 'Indicates whether the patient was readmitted (1 for readmission, 0 for no readmission)',\\n 'readmission_cause': 'Specific cause or reason for patient readmission',\\n 'condition': 'Patient's medical condition or reason for initial visit/admission',\\n 'visit_date': 'Date of the patient's visit or admission'\\n\\n\"" 13 | ] 14 | }, 15 | "execution_count": 24, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "'''\n", 22 | " 'patient_id': 'Unique identifier for each patient',\n", 23 | " 'readmission_flag': 'Indicates whether the patient was readmitted (1 for readmission, 0 for no readmission)',\n", 24 | " 'readmission_cause': 'Specific cause or reason for patient readmission',\n", 25 | " 'condition': 'Patient\\'s medical condition or reason for initial visit/admission',\n", 26 | " 'visit_date': 'Date of the patient\\'s visit or admission'\n", 27 | "\n", 28 | "'''" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 25, 34 | "id": "8d6c208b", 35 | "metadata": {}, 36 | "outputs": [], 37 | 
"source": [ 38 | "import pandas as pd \n", 39 | "\n", 40 | "df = pd.read_csv(\"emergency_room_readmission_data.csv\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 26, 46 | "id": "7b987dc1", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | "
patient_idreadmission_flagreadmission_causeconditionvisit_date
0C252061SepsisCoronary Artery Bypass Grafting2022-05-22
1B429421Adverse Drug ReactionMedication Side Effects2022-08-01
2C989471FallsTotal Hip Arthroplasty2022-08-13
3B634021FallsTotal Hip Arthroplasty2022-02-18
4B308221SepsisCoronary Artery Bypass Grafting2022-12-22
\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " patient_id readmission_flag readmission_cause \\\n", 124 | "0 C25206 1 Sepsis \n", 125 | "1 B42942 1 Adverse Drug Reaction \n", 126 | "2 C98947 1 Falls \n", 127 | "3 B63402 1 Falls \n", 128 | "4 B30822 1 Sepsis \n", 129 | "\n", 130 | " condition visit_date \n", 131 | "0 Coronary Artery Bypass Grafting 2022-05-22 \n", 132 | "1 Medication Side Effects 2022-08-01 \n", 133 | "2 Total Hip Arthroplasty 2022-08-13 \n", 134 | "3 Total Hip Arthroplasty 2022-02-18 \n", 135 | "4 Coronary Artery Bypass Grafting 2022-12-22 " 136 | ] 137 | }, 138 | "execution_count": 26, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "df.head()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 27, 150 | "id": "53ea957a", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "readmitted = df[df['readmission_flag'] == 1]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 28, 160 | "id": "72668263", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "cause_counts = readmitted['readmission_cause'].value_counts()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 29, 170 | "id": "52e90dcd", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "cumulative_percent = (cause_counts.cumsum() /cause_counts.sum()) * 100" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "773cdbfb", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 30, 188 | "id": "4dc8700d", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "image/png": "\n", 194 | "text/plain": [ 195 | "
" 196 | ] 197 | }, 198 | "metadata": {}, 199 | "output_type": "display_data" 200 | } 201 | ], 202 | "source": [ 203 | "_, ax1 = plt.subplots()\n", 204 | "\n", 205 | "ax1.bar(cause_counts.index, cause_counts.values, color='tab:cyan', label='Frequency')\n", 206 | "ax1.set_xlabel('Readmission Cause')\n", 207 | "ax1.set_ylabel('Frequency', color='tab:cyan')\n", 208 | "ax1.tick_params(axis='y', labelcolor='tab:cyan')\n", 209 | "ax1.legend(loc='upper left')\n", 210 | "\n", 211 | "\n", 212 | "ax2 = ax1.twinx()\n", 213 | "ax2.plot(cause_counts.index, cumulative_percent , zorder=2, color='tab:red', label='Cumulative Percentage', marker='o')\n", 214 | "ax2.set_ylabel('Cumulative Percentage', color='tab:red')\n", 215 | "ax2.tick_params(axis='y', labelcolor='tab:red')\n", 216 | "ax2.legend(loc='upper right')\n", 217 | "\n", 218 | "\n", 219 | "plt.title('ER Readmission Pareto Chart')\n", 220 | "\n", 221 | "\n", 222 | "plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')\n", 223 | "plt.tight_layout()\n", 224 | "plt.show()" 225 | ] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3 (ipykernel)", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.9.13" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 5 249 | } 250 | --------------------------------------------------------------------------------