├── Data ├── Salary.xlsx ├── Telco_data_chi2_test.csv ├── airbnb_contacts.xlsx ├── airbnb_host_searches.xlsx ├── cancer_data.csv ├── cancer_data_cleaned.xlsx ├── credit_risk_dataset.csv ├── dataforAnalysis.xlsx ├── facebook_web_log.xlsx ├── fb_friend_requests.xlsx ├── google_file_store.xlsx ├── loan_data_sample.xlsx ├── stratascratch_amazon_active_returning_customers.xlsx ├── stratascratch_amazon_pct_diff.xlsx ├── testing.txt ├── to_build_the_model.xlsx └── to_test_the_model.xlsx └── Scripts ├── Amazon_interview_question_Hard.py ├── Bag_Of_Words.py ├── Credit_Risk_PD_Model.ipynb ├── Decision_Tree.py ├── Exploratory Data Analysis - Loan Defaulter Segmentation.py ├── FastText_Setup_Prediction.py ├── Feature Selection with ANOVA Method.ipynb ├── Feature Selection with Chi2 Method.ipynb ├── Feature Selection with Correlation Method.ipynb ├── FraudDetectionModelDataPrep.py ├── GloVe.py ├── LinearRegression_Live.py ├── LogisticLive.py ├── NER.py ├── PD_credit_risk_model_dev.py ├── Payment_Fraud_Detection_Model_Logistic_Reg.py ├── Payment_Fraud_Detection_Model_Logistic_and_Decision_Tree.py ├── PoS.py ├── PreProcessing_PD_Model.ipynb ├── PreProcessing_PD_Model.py ├── RandomForesPDmodel.zip ├── Recursive Feature Elimination.ipynb ├── UK_PM_Sentiment_Analysis.py ├── Understand_differences_in_numbers.ipynb ├── UsingAutomatedModulesAndModels.ipynb ├── XGBpdModel.pkl ├── airbnb_Host_Popularity_Rental_Prices_Hard.py ├── airbnb_Ranking_Most_Active_Guests.py ├── amazon_interview_question.py ├── backward_elimintation_method.ipynb ├── balancing_live.ipynb ├── ensemble_model.zip ├── facebook_Acceptance_Rate_By_Date.py ├── facebook_interview_question.py ├── feature_selection_with_LASSO.ipynb ├── feature_selection_with_Ridge_Regression.ipynb ├── feature_selection_with_decision_tree_or_random_forest.ipynb ├── feature_selection_with_elasticnet.ipynb ├── forward_selection_method.ipynb ├── google_interview_question_Medium.py ├── gradient_boosint_complete.py ├── logisticPDmodel.pkl ├── smote_logistic.pkl ├── testing.txt └── time_series_air_passengers.py /Data/Salary.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/Salary.xlsx -------------------------------------------------------------------------------- /Data/airbnb_contacts.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/airbnb_contacts.xlsx -------------------------------------------------------------------------------- /Data/airbnb_host_searches.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/airbnb_host_searches.xlsx -------------------------------------------------------------------------------- /Data/cancer_data_cleaned.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/cancer_data_cleaned.xlsx -------------------------------------------------------------------------------- /Data/dataforAnalysis.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/dataforAnalysis.xlsx 
-------------------------------------------------------------------------------- /Data/facebook_web_log.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/facebook_web_log.xlsx -------------------------------------------------------------------------------- /Data/fb_friend_requests.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/fb_friend_requests.xlsx -------------------------------------------------------------------------------- /Data/google_file_store.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/google_file_store.xlsx -------------------------------------------------------------------------------- /Data/loan_data_sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/loan_data_sample.xlsx -------------------------------------------------------------------------------- /Data/stratascratch_amazon_active_returning_customers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/stratascratch_amazon_active_returning_customers.xlsx -------------------------------------------------------------------------------- /Data/stratascratch_amazon_pct_diff.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/stratascratch_amazon_pct_diff.xlsx -------------------------------------------------------------------------------- /Data/testing.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/to_build_the_model.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/to_build_the_model.xlsx -------------------------------------------------------------------------------- /Data/to_test_the_model.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/to_test_the_model.xlsx -------------------------------------------------------------------------------- /Scripts/Amazon_interview_question_Hard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | amazon = pd.read_excel(r"D:\Learnerea\Tables\amazon_pct_diff.xlsx") 9 | amazon.head() 10 | 11 | 12 | # In[14]: 13 | 14 | 15 | amazon['yearMonth'] = amazon['created_at'].dt.strftime('%Y-%m') 16 | amazon_agg = amazon.pivot_table(index='yearMonth',values='value',aggfunc='sum').reset_index() 17 | amazon_agg['prev_month'] = amazon_agg['value'].shift() 18 | amazon_agg['rev_diff'] = round(((amazon_agg['value'] - 
amazon_agg['prev_month'])/amazon_agg['prev_month'])*100,2) 19 | amazon_agg 20 | 21 | -------------------------------------------------------------------------------- /Scripts/Bag_Of_Words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | pd.set_option('display.max_colwidth',0) 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | df = pd.read_csv(r"D:\Learnerea\Tables\imdb_master.csv", encoding="ISO-8859-1").drop(['Unnamed: 0',"file"],axis=1) 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | display(df.shape) 23 | display(df.head()) 24 | display(df.type.value_counts()) 25 | display(df.label.value_counts()) 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | filtered = df.query("label!='unsup'") 32 | 33 | 34 | # In[5]: 35 | 36 | 37 | filtered['label'] = filtered['label'].apply(lambda x: 0 if x == "neg" else 1) 38 | 39 | 40 | # In[6]: 41 | 42 | 43 | filtered.label.value_counts() 44 | 45 | 46 | # # Cleaning starts 47 | 48 | # In[7]: 49 | 50 | 51 | filtered['review_lower'] = filtered['review'].str.lower() 52 | 53 | 54 | # In[8]: 55 | 56 | 57 | from nltk.corpus import stopwords 58 | 59 | 60 | # In[9]: 61 | 62 | 63 | engStopWords = stopwords.words("english") 64 | 65 | 66 | # In[10]: 67 | 68 | 69 | filtered['review_no_stopW'] = filtered['review_lower'].apply(lambda x:" ".join(word for word in x.split() if word not in engStopWords)) 70 | 71 | 72 | # In[11]: 73 | 74 | 75 | # filtered.head(2) 76 | 77 | 78 | # In[12]: 79 | 80 | 81 | from nltk.stem import WordNetLemmatizer 82 | 83 | 84 | # In[13]: 85 | 86 | 87 | lemm = WordNetLemmatizer() 88 | 89 | 90 | # In[14]: 91 | 92 | 93 | filtered['lemmatized_review'] = filtered['review_no_stopW'].apply(lambda x: " ".join(lemm.lemmatize(word) for word in x.split())) 94 | 95 | 96 | # In[15]: 97 | 98 | 99 | filtered.head(1) 100 | 101 | 102 | # In[16]: 103 | 104 | 105 | filtered = filtered.drop(['review','review_lower','review_no_stopW'],axis=1) 106 | 107 | 108 | # In[17]: 109 | 110 | 111 | filtered = filtered.rename({'lemmatized_review':'review'},axis=1) 112 | 113 | 114 | # In[18]: 115 | 116 | 117 | filtered.head() 118 | 119 | 120 | # # Cleaning Ends 121 | 122 | # In[19]: 123 | 124 | 125 | train = filtered.query("type=='train'").drop(['type'],axis=1) 126 | test = filtered.query("type=='test'").drop(['type'],axis=1) 127 | 128 | 129 | # In[20]: 130 | 131 | 132 | x_train = train['review'].values 133 | y_train = train['label'].values 134 | 135 | 136 | # In[21]: 137 | 138 | 139 | x_test = test['review'].values 140 | y_test = test['label'].values 141 | 142 | 143 | # In[22]: 144 | 145 | 146 | from sklearn.feature_extraction.text import CountVectorizer 147 | 148 | 149 | # In[23]: 150 | 151 | 152 | x_train[0] 153 | 154 | 155 | # In[24]: 156 | 157 | 158 | vector = CountVectorizer() 159 | 160 | 161 | # In[28]: 162 | 163 | 164 | trained_vector = vector.transform(x_train) 165 | 166 | 167 | # In[29]: 168 | 169 | 170 | trained_vector[50].toarray()[:50] 171 | 172 | 173 | # In[30]: 174 | 175 | 176 | vector.get_feature_names()[500:][:4] 177 | 178 | 179 | # In[31]: 180 | 181 | 182 | train[train.review.str.contains('20minutes')] 183 | 184 | 185 | # In[32]: 186 | 187 | 188 | train.shape 189 | 190 | 191 | # In[33]: 192 | 193 | 194 | from sklearn.naive_bayes import MultinomialNB 195 | 196 | 197 | # In[34]: 198 | 199 | 200 | model = MultinomialNB() 201 | 202 | 203 | # In[35]: 204 | 205 | 206 | model.fit(trained_vector, y_train) 207 | 208 | 209 | # In[36]: 210 | 211 | 
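# The CountVectorizer must learn its vocabulary from the training reviews before either split can
# be encoded; a minimal sketch assuming the same `vector` and `x_train` objects from the cells above:
trained_vector = vector.fit_transform(x_train)  # fit on the training corpus and encode it; the learned vocabulary is then reused on x_test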
212 | test_vector = vector.transform(x_test) 213 | 214 | 215 | # In[37]: 216 | 217 | 218 | predicted = model.predict(test_vector) 219 | 220 | 221 | # In[38]: 222 | 223 | 224 | from sklearn.metrics import classification_report 225 | 226 | 227 | # In[39]: 228 | 229 | 230 | print(classification_report(y_test,predicted)) 231 | 232 | 233 | # In[76]: 234 | 235 | 236 | final_sentiment = pd.DataFrame(zip(x_test,y_test,predicted),columns=['review','act_sent','pred_sent']) 237 | final_sentiment.head(2) 238 | 239 | 240 | # In[78]: 241 | 242 | 243 | final_sentiment.query("act_sent != pred_sent").head(2) 244 | 245 | 246 | # In[85]: 247 | 248 | 249 | review = np.array(["that movie was aweful"]) 250 | 251 | 252 | # In[86]: 253 | 254 | 255 | text_to_pred = vector.transform(review) 256 | 257 | 258 | # In[87]: 259 | 260 | 261 | model.predict(text_to_pred) 262 | 263 | 264 | # In[100]: 265 | 266 | 267 | def sentiment_check(input_text): 268 | test = np.array(input_text) 269 | text_to_pred = vector.transform(test) 270 | predicted_val = model.predict(text_to_pred) 271 | if predicted_val == 1: 272 | print("the review is POSITIVE") 273 | else: 274 | print("the review is NEGATIVE") 275 | 276 | 277 | # In[102]: 278 | 279 | 280 | sentiment_check(['the movie was aweful']) 281 | 282 | -------------------------------------------------------------------------------- /Scripts/Decision_Tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[37]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.metrics import classification_report, confusion_matrix 14 | 15 | 16 | # In[38]: 17 | 18 | 19 | perf_data = pd.read_excel(r"D:\Learnerea\Tables\data_decision_tree.xlsx") 20 | 21 | 22 | # In[39]: 23 | 24 | 25 | perf_data.head() 26 | 27 | 28 | # In[40]: 29 | 30 | 31 | encoder = LabelEncoder() 32 | 33 | 34 | # In[42]: 35 | 36 | 37 | perf_data['cert_encoded'] = encoder.fit_transform(perf_data['Certification']) 38 | perf_data['proj_encoded'] = encoder.fit_transform(perf_data['Projects']) 39 | perf_data['supt_encoded'] = encoder.fit_transform(perf_data['Supported_other_projects']) 40 | perf_data_new = perf_data.drop(['Certification','Projects','Supported_other_projects'],axis=1) 41 | perf_data_new.head() 42 | 43 | 44 | # In[43]: 45 | 46 | 47 | inputs = perf_data_new.drop(['Hike_ge_20pct'],axis=1) 48 | result = perf_data_new['Hike_ge_20pct'] 49 | 50 | 51 | # In[56]: 52 | 53 | 54 | inputs_train, inputs_test, result_train, result_test = train_test_split(inputs,result,test_size=0.2,random_state=False) 55 | 56 | 57 | # In[57]: 58 | 59 | 60 | myTree = DecisionTreeClassifier() 61 | 62 | 63 | # In[58]: 64 | 65 | 66 | myTree.fit(inputs_train,result_train) 67 | 68 | 69 | # In[63]: 70 | 71 | 72 | prediction=myTree.predict(inputs_test) 73 | 74 | 75 | # In[60]: 76 | 77 | 78 | result_test 79 | 80 | 81 | # In[61]: 82 | 83 | 84 | myTree.score(inputs_test,result_test) 85 | 86 | 87 | # In[67]: 88 | 89 | 90 | cm = confusion_matrix(result_test,prediction) 91 | 92 | 93 | # In[70]: 94 | 95 | 96 | print(classification_report(result_test,prediction)) 97 | 98 | -------------------------------------------------------------------------------- /Scripts/Exploratory Data Analysis - Loan Defaulter Segmentation.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Data import and basic exploration 5 | 6 | # In[1]: 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | 14 | pd.options.display.max_columns = None 15 | pd.options.display.max_rows = None 16 | 17 | 18 | # In[2]: 19 | 20 | 21 | app = pd.read_csv(r"D:\Learnerea\Tables\loan_default_data\application_data.csv") 22 | prev_app = pd.read_csv(r"D:\Learnerea\Tables\loan_default_data\previous_application.csv") 23 | 24 | 25 | # In[3]: 26 | 27 | 28 | app.head() 29 | 30 | 31 | # # Feature selection 32 | 33 | # In[4]: 34 | 35 | 36 | app.columns 37 | 38 | 39 | # In[5]: 40 | 41 | 42 | app.shape 43 | 44 | 45 | # In[6]: 46 | 47 | 48 | msng_info = pd.DataFrame(app.isnull().sum().sort_values()).reset_index() 49 | msng_info.rename(columns={'index':'col_name',0:'null_count'},inplace=True) 50 | msng_info.head() 51 | 52 | 53 | # In[7]: 54 | 55 | 56 | msng_info['msng_pct'] = msng_info['null_count']/app.shape[0]*100 57 | # msng_info.to_excel(r"D:\Learnerea\Tables\loan_default_data\missing_info.xlsx",index=False) 58 | msng_info.head() 59 | 60 | 61 | # In[8]: 62 | 63 | 64 | msng_col = msng_info[msng_info['msng_pct']>=40]['col_name'].to_list() 65 | app_msng_rmvd = app.drop(labels=msng_col,axis=1) 66 | app_msng_rmvd.shape 67 | 68 | 69 | # In[9]: 70 | 71 | 72 | app_msng_rmvd.head() 73 | 74 | 75 | # In[10]: 76 | 77 | 78 | flag_col = [] 79 | 80 | for col in app_msng_rmvd.columns: 81 | if col.startswith("FLAG_"): 82 | flag_col.append(col) 83 | 84 | len(flag_col) 85 | 86 | 87 | # In[11]: 88 | 89 | 90 | flag_col 91 | 92 | 93 | # In[12]: 94 | 95 | 96 | flag_tgt_col = app_msng_rmvd[flag_col+['TARGET']] 97 | flag_tgt_col.head() 98 | 99 | 100 | # In[13]: 101 | 102 | 103 | plt.figure(figsize=(20,25)) 104 | 105 | for i, col in enumerate(flag_col): 106 | plt.subplot(7,4,i+1) 107 | sns.countplot(data=flag_tgt_col,x=col,hue='TARGET') 108 | 109 | 110 | # In[14]: 111 | 112 | 113 | flg_corr = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 114 | 'FLAG_PHONE', 'FLAG_EMAIL','TARGET'] 115 | flag_corr_df = app_msng_rmvd[flg_corr] 116 | 117 | 118 | # In[15]: 119 | 120 | 121 | flag_corr_df.groupby(['FLAG_OWN_CAR']).size() 122 | 123 | 124 | # In[16]: 125 | 126 | 127 | flag_corr_df['FLAG_OWN_CAR'] = flag_corr_df['FLAG_OWN_CAR'].replace({'N':0,'Y':1}) 128 | flag_corr_df['FLAG_OWN_REALTY'] = flag_corr_df['FLAG_OWN_REALTY'].replace({'N':0,'Y':1}) 129 | 130 | flag_corr_df.groupby(['FLAG_OWN_CAR']).size() 131 | 132 | 133 | # In[17]: 134 | 135 | 136 | corr_df = round(flag_corr_df.corr(),2) 137 | 138 | plt.figure(figsize=(10,5)) 139 | sns.heatmap(corr_df,cmap='coolwarm',linewidths=.5,annot=True) 140 | 141 | 142 | # In[18]: 143 | 144 | 145 | app_flag_rmvd = app_msng_rmvd.drop(labels =flag_col,axis=1) 146 | app_flag_rmvd.shape 147 | 148 | 149 | # In[19]: 150 | 151 | 152 | sns.heatmap(data=round(app_flag_rmvd[['EXT_SOURCE_2','EXT_SOURCE_3','TARGET']].corr(),2),cmap='coolwarm',linewidths=.5,annot=True) 153 | 154 | 155 | # In[20]: 156 | 157 | 158 | app_score_col_rmvd = app_flag_rmvd.drop(['EXT_SOURCE_2','EXT_SOURCE_3'],axis=1) 159 | app_score_col_rmvd.shape 160 | 161 | 162 | # # Feature engineering 163 | 164 | # In[21]: 165 | 166 | 167 | app_score_col_rmvd.isnull().sum().sort_values()/app_score_col_rmvd.shape[0] 168 | 169 | 170 | # ### Missing imputation 171 | 172 | # In[22]: 173 | 174 | 175 | 
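# The cells below impute one column at a time: categorical gaps with the mode, numeric gaps with
# the mean or median. A compact sketch of the same idea as a helper (hypothetical name, applied
# to the same app_score_col_rmvd frame):
def fill_missing(df, col, strategy='mode'):
    """Fill NaNs in a single column with its mode, median or mean."""
    if strategy == 'mode':
        value = df[col].mode()[0]
    elif strategy == 'median':
        value = df[col].median()
    else:
        value = df[col].mean()
    df[col] = df[col].fillna(value)
    return df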
app_score_col_rmvd['CNT_FAM_MEMBERS'] = app_score_col_rmvd['CNT_FAM_MEMBERS'].fillna((app_score_col_rmvd['CNT_FAM_MEMBERS'].mode()[0])) 176 | 177 | 178 | # In[23]: 179 | 180 | 181 | app_score_col_rmvd['CNT_FAM_MEMBERS'].isnull().sum() 182 | 183 | 184 | # In[24]: 185 | 186 | 187 | app_score_col_rmvd['OCCUPATION_TYPE'] = app_score_col_rmvd['OCCUPATION_TYPE'].fillna((app_score_col_rmvd['OCCUPATION_TYPE'].mode()[0])) 188 | # app_score_col_rmvd['OCCUPATION_TYPE'].mode()[0] 189 | 190 | 191 | # In[25]: 192 | 193 | 194 | app_score_col_rmvd['OCCUPATION_TYPE'].isnull().sum() 195 | 196 | 197 | # In[26]: 198 | 199 | 200 | app_score_col_rmvd['NAME_TYPE_SUITE'] = app_score_col_rmvd['NAME_TYPE_SUITE'].fillna((app_score_col_rmvd['NAME_TYPE_SUITE'].mode()[0])) 201 | 202 | 203 | # In[27]: 204 | 205 | 206 | app_score_col_rmvd['NAME_TYPE_SUITE'].isnull().sum() 207 | 208 | 209 | # In[28]: 210 | 211 | 212 | app_score_col_rmvd['AMT_ANNUITY'] = app_score_col_rmvd['AMT_ANNUITY'].fillna((app_score_col_rmvd['AMT_ANNUITY'].mean())) 213 | 214 | 215 | # In[29]: 216 | 217 | 218 | app_score_col_rmvd['AMT_ANNUITY'].isnull().sum() 219 | 220 | 221 | # In[30]: 222 | 223 | 224 | # app_score_col_rmvd['AMT_REQ_CREDIT_BUREAU_HOUR'] 225 | 226 | 227 | # In[31]: 228 | 229 | 230 | amt_req_col = [] 231 | 232 | for col in app_score_col_rmvd.columns: 233 | if col.startswith("AMT_REQ_CREDIT_BUREAU"): 234 | amt_req_col.append(col) 235 | 236 | amt_req_col 237 | # app_score_col_rmvd['AMT_REQ_CREDIT_BUREAU_HOUR'].unique() 238 | 239 | 240 | # In[32]: 241 | 242 | 243 | for col in amt_req_col: 244 | app_score_col_rmvd[col] = app_score_col_rmvd[col].fillna((app_score_col_rmvd[col].median())) 245 | 246 | 247 | # In[33]: 248 | 249 | 250 | # app_score_col_rmvd.isnull().sum().sort_values() 251 | 252 | 253 | # In[34]: 254 | 255 | 256 | app_score_col_rmvd['AMT_GOODS_PRICE'] = app_score_col_rmvd['AMT_GOODS_PRICE'].fillna((app_score_col_rmvd['AMT_GOODS_PRICE'].median())) 257 | 258 | 259 | # In[35]: 260 | 261 | 262 | app_score_col_rmvd['AMT_GOODS_PRICE'].isnull().sum() 263 | 264 | 265 | # In[36]: 266 | 267 | 268 | app_score_col_rmvd.head() 269 | 270 | 271 | # ### Value modifiction 272 | 273 | # In[37]: 274 | 275 | 276 | days_col = [] 277 | 278 | for col in app_score_col_rmvd.columns: 279 | if col.startswith("DAYS"): 280 | days_col.append(col) 281 | 282 | days_col 283 | 284 | 285 | # In[38]: 286 | 287 | 288 | for col in days_col: 289 | app_score_col_rmvd[col] = abs(app_score_col_rmvd[col]) 290 | 291 | 292 | # In[39]: 293 | 294 | 295 | app_score_col_rmvd.head() 296 | 297 | 298 | # In[40]: 299 | 300 | 301 | app_score_col_rmvd.nunique().sort_values() 302 | 303 | 304 | # In[41]: 305 | 306 | 307 | app_score_col_rmvd['OBS_30_CNT_SOCIAL_CIRCLE'].unique() 308 | 309 | 310 | # ### Outlier detection & treatment 311 | 312 | # In[42]: 313 | 314 | 315 | app_score_col_rmvd['AMT_GOODS_PRICE'].agg(['min','max','median']) 316 | 317 | 318 | # In[43]: 319 | 320 | 321 | sns.kdeplot(data=app_score_col_rmvd,x='AMT_GOODS_PRICE') 322 | 323 | 324 | # In[44]: 325 | 326 | 327 | sns.boxenplot(data=app_score_col_rmvd,x='AMT_GOODS_PRICE') 328 | 329 | 330 | # In[45]: 331 | 332 | 333 | app_score_col_rmvd['AMT_GOODS_PRICE'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 334 | 335 | 336 | # In[46]: 337 | 338 | 339 | bins = [0,100000,200000,300000,400000,500000,600000,700000,800000,900000,4050000] 340 | ranges = ['0-100K','100k-200K','200K-300K','300K-400K','400K-500K','500K-600K','600K-700K' 341 | ,'700K-800K','800K-900K','Above 900K'] 342 | 343 | 
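# pd.cut below maps each price into one of the labelled bins; bins are right-inclusive by default,
# so with the edges above a value of exactly 100000 still lands in '0-100K'. A tiny illustration
# with hypothetical values:
demo_bins = pd.cut(pd.Series([50_000, 100_000, 150_000]), bins=[0, 100_000, 200_000], labels=['low', 'high'])
# demo_bins -> ['low', 'low', 'high']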
app_score_col_rmvd['AMT_GOODS_PRICE_RANGE'] = pd.cut(app_score_col_rmvd['AMT_GOODS_PRICE'],bins,labels=ranges) 344 | 345 | 346 | # In[47]: 347 | 348 | 349 | app_score_col_rmvd.groupby(['AMT_GOODS_PRICE_RANGE']).size() 350 | 351 | 352 | # In[48]: 353 | 354 | 355 | app_score_col_rmvd['AMT_INCOME_TOTAL'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 356 | 357 | 358 | # In[49]: 359 | 360 | 361 | app_score_col_rmvd['AMT_INCOME_TOTAL'].max() 362 | 363 | 364 | # In[50]: 365 | 366 | 367 | bins = [0,100000,150000,200000,250000,300000,350000,400000,117000000] 368 | ranges = ['0-100K','100K-150K','150K-200K','200K-250K','250K-300K','300K-350K','350K-400K' 369 | ,'Above 400K'] 370 | 371 | app_score_col_rmvd['AMT_INCOME_TOTAL_RANGE'] = pd.cut(app_score_col_rmvd['AMT_INCOME_TOTAL'],bins,labels=ranges) 372 | 373 | 374 | # In[51]: 375 | 376 | 377 | app_score_col_rmvd.groupby(['AMT_INCOME_TOTAL_RANGE']).size() 378 | 379 | 380 | # In[52]: 381 | 382 | 383 | app_score_col_rmvd['AMT_INCOME_TOTAL_RANGE'].isnull().sum() 384 | 385 | 386 | # In[53]: 387 | 388 | 389 | app_score_col_rmvd['AMT_CREDIT'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 390 | 391 | 392 | # In[54]: 393 | 394 | 395 | app_score_col_rmvd['AMT_CREDIT'].max() 396 | 397 | 398 | # In[55]: 399 | 400 | 401 | bins = [0,200000,400000,600000,800000,900000,1000000,2000000,3000000,4050000] 402 | ranges = ['0-200K','200K-400K','400K-600K','600K-800K','800K-900K','900K-1M','1M-2M','2M-3M','Above 3M'] 403 | 404 | app_score_col_rmvd['AMT_CREDIT_RANGE'] = pd.cut(app_score_col_rmvd['AMT_CREDIT'],bins,labels=ranges) 405 | 406 | 407 | # In[56]: 408 | 409 | 410 | app_score_col_rmvd.groupby(['AMT_CREDIT_RANGE']).size() 411 | 412 | 413 | # In[57]: 414 | 415 | 416 | app_score_col_rmvd['AMT_CREDIT'].isnull().sum() 417 | 418 | 419 | # In[58]: 420 | 421 | 422 | app_score_col_rmvd['AMT_ANNUITY'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 423 | 424 | 425 | # In[59]: 426 | 427 | 428 | app_score_col_rmvd['AMT_ANNUITY'].max() 429 | 430 | 431 | # In[60]: 432 | 433 | 434 | bins = [0,25000,50000,100000,150000,200000,258025.5] 435 | ranges = ['0-25K','25K-50K','50K-100K','100K-150K','150K-200K','Above 200K'] 436 | 437 | app_score_col_rmvd['AMT_ANNUITY_RANGE'] = pd.cut(app_score_col_rmvd['AMT_ANNUITY'],bins,labels=ranges) 438 | 439 | 440 | # In[61]: 441 | 442 | 443 | app_score_col_rmvd.groupby(['AMT_ANNUITY_RANGE']).size() 444 | 445 | 446 | # In[62]: 447 | 448 | 449 | app_score_col_rmvd['AMT_ANNUITY_RANGE'].isnull().sum() 450 | 451 | 452 | # In[63]: 453 | 454 | 455 | app_score_col_rmvd['DAYS_EMPLOYED'].agg(['min','max','median']) 456 | 457 | 458 | # In[64]: 459 | 460 | 461 | app_score_col_rmvd['DAYS_EMPLOYED'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.81,0.85,0.9,0.95,0.99]) 462 | 463 | 464 | # In[65]: 465 | 466 | 467 | app_score_col_rmvd[app_score_col_rmvd['DAYS_EMPLOYED']=40]['var']) 740 | var_msng_ge_40 741 | 742 | 743 | # In[102]: 744 | 745 | 746 | nva_cols = var_msng_ge_40+['WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','FLAG_LAST_APPL_PER_CONTRACT','NFLAG_LAST_APPL_IN_DAY'] 747 | len(nva_cols) 748 | 749 | 750 | # In[103]: 751 | 752 | 753 | len(prev_app.columns) 754 | 755 | 756 | # In[104]: 757 | 758 | 759 | prev_app_nva_col_rmvd = prev_app.drop(labels=nva_cols,axis=1) 760 | 761 | 762 | len(prev_app_nva_col_rmvd.columns) 763 | 764 | 765 | # In[105]: 766 | 767 | 768 | prev_app_nva_col_rmvd.columns 769 | 770 | 771 | # In[106]: 772 | 773 | 774 | prev_app_nva_col_rmvd.head() 775 | 776 | 777 | # In[107]: 778 | 779 | 780 | 
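# The next cell repeats the missing-percentage check run on `app` earlier; the same pattern as a
# small reusable sketch (hypothetical helper name, works on any DataFrame):
def missing_pct(df):
    """Return the percentage of missing values per column, highest first."""
    return (df.isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)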
prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False)/prev_app_nva_col_rmvd.shape[0]*100 781 | 782 | 783 | # In[108]: 784 | 785 | 786 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].agg(func=['mean','median']) 787 | 788 | 789 | # In[109]: 790 | 791 | 792 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MEDIAN'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].median()) 793 | 794 | 795 | # In[110]: 796 | 797 | 798 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MEAN'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].mean()) 799 | 800 | 801 | # In[111]: 802 | 803 | 804 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MODE'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].mode()[0]) 805 | 806 | 807 | # In[112]: 808 | 809 | 810 | gp_cols = ['AMT_GOODS_PRICE','AMT_GOODS_PRICE_MEDIAN','AMT_GOODS_PRICE_MEAN','AMT_GOODS_PRICE_MODE'] 811 | 812 | 813 | # In[113]: 814 | 815 | 816 | plt.figure(figsize=(10,5)) 817 | 818 | for i, col in enumerate(gp_cols): 819 | plt.subplot(2,2,i+1) 820 | sns.kdeplot(data=prev_app_nva_col_rmvd,x=col) 821 | plt.subplots_adjust(wspace=0.5,hspace=0.5) 822 | 823 | 824 | # In[114]: 825 | 826 | 827 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].median()) 828 | 829 | 830 | # In[115]: 831 | 832 | 833 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].isnull().sum() 834 | 835 | 836 | # In[116]: 837 | 838 | 839 | prev_app_nva_col_rmvd['AMT_ANNUITY'].agg(func=['mean','median','max']) 840 | 841 | 842 | # In[117]: 843 | 844 | 845 | prev_app_nva_col_rmvd['AMT_ANNUITY'] = prev_app_nva_col_rmvd['AMT_ANNUITY'].fillna(prev_app_nva_col_rmvd['AMT_ANNUITY'].median()) 846 | 847 | 848 | # In[118]: 849 | 850 | 851 | prev_app_nva_col_rmvd['PRODUCT_COMBINATION'] = prev_app_nva_col_rmvd['PRODUCT_COMBINATION'].fillna(prev_app_nva_col_rmvd['PRODUCT_COMBINATION'].mode()[0]) 852 | 853 | 854 | # In[119]: 855 | 856 | 857 | prev_app_nva_col_rmvd['CNT_PAYMENT'].agg(func=['mean','median','max']) 858 | 859 | 860 | # In[120]: 861 | 862 | 863 | prev_app_nva_col_rmvd[prev_app_nva_col_rmvd['CNT_PAYMENT'].isnull()].groupby(['NAME_CONTRACT_STATUS']).size().sort_values(ascending=False) 864 | 865 | 866 | # In[121]: 867 | 868 | 869 | prev_app_nva_col_rmvd['CNT_PAYMENT'] = prev_app_nva_col_rmvd['CNT_PAYMENT'].fillna(0) 870 | 871 | 872 | # In[122]: 873 | 874 | 875 | prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False) 876 | 877 | 878 | # In[123]: 879 | 880 | 881 | prev_app_nva_col_rmvd = prev_app_nva_col_rmvd.drop(labels=['AMT_GOODS_PRICE_MEDIAN','AMT_GOODS_PRICE_MEAN','AMT_GOODS_PRICE_MODE'],axis=1) 882 | 883 | 884 | # In[124]: 885 | 886 | 887 | prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False) 888 | 889 | 890 | # In[125]: 891 | 892 | 893 | len(prev_app_nva_col_rmvd.columns) 894 | 895 | 896 | # In[126]: 897 | 898 | 899 | prev_app_nva_col_rmvd.head() 900 | 901 | 902 | # In[137]: 903 | 904 | 905 | merged_df = pd.merge(app_score_col_rmvd,prev_app_nva_col_rmvd,how='inner',on='SK_ID_CURR') 906 | merged_df.head() 907 | 908 | 909 | # In[238]: 910 | 911 | 912 | plt.figure(figsize=(15,5)) 913 | 914 | sns.countplot(data=merged_df,x='NAME_CASH_LOAN_PURPOSE',hue='NAME_CONTRACT_STATUS') 915 | plt.xticks(rotation=90) 916 | plt.yscale('log') 917 | 918 | 919 | # In[239]: 920 | 921 | 922 | sns.countplot(data=merged_df,x='NAME_CONTRACT_STATUS',hue='TARGET') 923 | 924 | 925 | # In[248]: 926 | 927 | 928 | merged_agg = 
merged_df.groupby(['NAME_CONTRACT_STATUS','TARGET']).size().reset_index().rename(columns={0:'counts'}) 929 | sum_df = merged_agg.groupby(['NAME_CONTRACT_STATUS'])['counts'].sum().reset_index() 930 | 931 | merged_agg_2 = pd.merge(merged_agg,sum_df,how='left',on='NAME_CONTRACT_STATUS') 932 | merged_agg_2['pct'] = round(merged_agg_2['counts_x']/merged_agg_2['counts_y']*100,2) 933 | merged_agg_2 934 | 935 | 936 | # In[250]: 937 | 938 | 939 | sns.lineplot(data=merged_df,x='NAME_CONTRACT_STATUS',y='AMT_INCOME_TOTAL',ci=None,hue='TARGET') 940 | 941 | 942 | # In[251]: 943 | 944 | 945 | len(merged_df.columns) 946 | 947 | 948 | # # All the analysis 949 | 950 | # most of the customers have taken cash loans 951 | # customers who have taken cash loans are less likely to default 952 | # 953 | # CODE_GENDER - 954 | # 955 | # most of the loans have been taken by females 956 | # the default rate for females is just ~7%, which is safer and lower than for males 957 | # 958 | # NAME_TYPE_SUITE - 959 | # 960 | # unaccompanied people have taken most of the loans and the default rate is ~8.5%, which is still okay 961 | # 962 | # NAME_INCOME_TYPE - 963 | # 964 | # the safest segments are working, commercial associates and pensioners 965 | # 966 | # NAME_EDUCATION_TYPE - 967 | # 968 | # Higher education is the safest segment to give the loan to, with a default rate of less than 5% 969 | # 970 | # NAME_FAMILY_STATUS - 971 | # 972 | # Married people are safe to target; the default rate is 8% 973 | # 974 | # 975 | # NAME_HOUSING_TYPE - 976 | # 977 | # People having a house/apartment are safe to give the loan to, with a default rate of ~8% 978 | # 979 | # OCCUPATION_TYPE - 980 | # 981 | # Low-skill Laborers and drivers are the highest defaulters 982 | # Accountants are the least likely to default 983 | # Core staff, Managers and Laborers are safer to target, with a default rate of <= 7.5 to 10% 984 | # 985 | # ORGANIZATION_TYPE - 986 | # 987 | # Transport type 3 has the highest default rate 988 | # Others, Business Entity Type 3, Self Employed are good to go with a default rate around 10% 989 | # 990 | # =======univariate numeric variables analysis======== 991 | # 992 | # >> most of the loans were given for goods prices ranging between 0 and 1 million 993 | # >> most of the loans were given for a credit amount of 0 to 1 million 994 | # >> most of the customers are paying an annuity of 0 to 50K 995 | # >> most of the customers have income between 0 and 1 million 996 | # 997 | # =============bivariate analysis================== 998 | # 999 | # >> AMT_CREDIT and AMT_GOODS_PRICE are linearly correlated; as AMT_CREDIT increases, the default rate decreases 1000 | # >> people having income less than or equal to 1 million are more likely to take loans; among them, those taking a loan of less than 1.5 million could turn out to be defaulters. we can target income below 1 million and loan amount greater than 1.5 million 1001 | # >> people having 1 to fewer than 5 children are safer to give the loan to 1002 | # >> People who can pay an annuity of 100K are more likely to get the loan, and that holds up to less than 2 million (safer segment) 1003 | # 1004 | # ============analysis on merged data============== 1005 | # 1006 | # >> most previous applications were for the repairing purpose, and the same purpose has the most cancellations 1007 | # >> of the applications which were previously either canceled or refused, 80-90% are repayers in the current data 1008 | # >> offers which were unused previously now have the maximum number of defaulters despite having customers in the high income band 1009 | 1010 | # # Final Conclusion/Insights 1011 | 1012 | # The bank should target customers 1013 | # 1014 | # >> having low income, i.e. below 1 million 1015 | # >> working in Others, Business Entity Type 3, Self Employed org. types 1016 | # >> working as Accountants, Core staff, Managers and Laborers 1017 | # >> having a house/apartment, married, and having no more than 5 children 1018 | # >> Highly educated 1019 | # >> preferably female 1020 | # 1021 | # >> unaccompanied people can be safer - the default rate is ~8.5% 1022 | # 1023 | # Amount segment recommended - 1024 | # 1025 | # >> the credit amount should not be more than 1 million 1026 | # >> the annuity can be kept at 50K (depending on eligibility) 1027 | # >> the income bracket could be below 1 million 1028 | # 1029 | # >> 80-90% of the customers who were previously canceled/refused are repayers. The bank can do the analysis and consider giving loans to these segments 1030 | # 1031 | # 1032 | # ====================precautions=============== 1033 | # 1034 | # >> org. type Transport type 3 should be avoided 1035 | # >> Low-skill Laborers and drivers should be avoided 1036 | # >> offers previously unused with high-income customers should be avoided 1037 | # 1038 | -------------------------------------------------------------------------------- /Scripts/FastText_Setup_Prediction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # pip install fasttext 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import fasttext 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | fModel = fasttext.load_model(r"C:\Users\smrvr\Downloads\cc.en.300.bin") 20 | 21 | 22 | # In[6]: 23 | 24 | 25 | fModel.get_nearest_neighbors("love") 26 | 27 | 28 | # In[10]: 29 | 30 | 31 | fModel.get_subwords('leanerea') 32 | 33 | 34 | # In[12]: 35 | 36 | 37 | fModel.get_analogies('delhi','noida','tokyo',k=10) 38 | 39 | 40 | # # Training model on own data 41 | 42 | # In[13]: 43 | 44 | 45 | import pandas as pd 46 | pd.set_option('display.max_colwidth',0) 47 | 48 | 49 | # In[18]: 50 | 51 | 52 | df = pd.read_csv(r"D:\Learnerea\Tables\imdb_master.csv", encoding="ISO-8859-1")[['review','label']].query("label!='unsup'") 53 | df.groupby('label').count() 54 | 55 | 56 | # In[21]: 57 | 58 | 59 | myText = df.review[5] 60 | myText 61 | 62 | 63 | # In[22]: 64 | 65 | 66 | import re 67 | 68 | 69 | # In[28]: 70 | 71 | 72 | def preProcessing(text): 73 | text = re.sub(r'[^\w\s\']',"",text) 74 | text = re.sub(' +',' ',text).lower() 75 | return text 76 | 77 | 78 | # In[29]: 79 | 80 | 81 | preProcessing(myText) 82 | 83 | 84 | # In[31]: 85 | 86 | 87 | df['processed_review'] = df['review'].map(preProcessing) 88 | 89 | 90 | # In[32]: 91 | 92 | 93 | df.head(1) 94 | 95 | 96 | # In[33]: 97 | 98 | 99 | df['final_data'] = "__label__"+df['label']+" "+df['processed_review'] 100 | 101 | 102 | # In[39]: 103 | 104 | 105 | df.head(1) 106 | 107 | 108 | # In[36]: 109 | 110 | 111 | from sklearn.model_selection import train_test_split 112 | 113 | 114 | # In[37]: 115 | 116 | 117 | train, test = train_test_split(df, test_size=0.3) 118 | 119 | 120 | # In[38]: 121 | 122 | 123 | display(train.shape) 124 | display(test.shape) 125 | 126 | 127 | # In[40]: 128 | 129 | 130 | train.to_csv(r"D:\Learnerea\Tables\fastText\new\train_reviews.csv",columns=['final_data'],index=False,header=False) 131 |
test.to_csv(r"D:\Learnerea\Tables\fastText\new\test_reviews.csv",columns=['final_data'],index=False,header=False) 132 | 133 | 134 | # In[41]: 135 | 136 | 137 | myModel = fasttext.train_supervised(input=r"D:\Learnerea\Tables\fastText\new\train_reviews.csv") 138 | 139 | 140 | # In[42]: 141 | 142 | 143 | myModel.test(r"D:\Learnerea\Tables\fastText\new\test_reviews.csv") 144 | 145 | 146 | # In[53]: 147 | 148 | 149 | myModel.predict("Thanks for sharing this ...it will really prove to be very helpful") 150 | 151 | -------------------------------------------------------------------------------- /Scripts/Feature Selection with ANOVA Method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "f97af190", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "import seaborn as sns\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from sklearn.feature_selection import SelectKBest, f_classif, f_regression\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.preprocessing import LabelEncoder\n", 19 | "\n", 20 | "le = LabelEncoder()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "id": "e0215435", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
agegendereducationincomeloan_amount
050+MalePhD<20k30925.132461
130-40MalePostGraduate60k-80k37081.630673
220-30MaleHighSchool80k+25218.164446
3<20FemalePostGraduate20k-40k22508.062994
450+MalePostGraduate60k-80k53153.715666
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " age gender education income loan_amount\n", 104 | "0 50+ Male PhD <20k 30925.132461\n", 105 | "1 30-40 Male PostGraduate 60k-80k 37081.630673\n", 106 | "2 20-30 Male HighSchool 80k+ 25218.164446\n", 107 | "3 <20 Female PostGraduate 20k-40k 22508.062994\n", 108 | "4 50+ Male PostGraduate 60k-80k 53153.715666" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\loan_data_sample.xlsx\")\n", 118 | "df.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "id": "de952911", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "features = df.drop('loan_amount',axis=1)\n", 129 | "target = df.loan_amount" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "id": "f34657d1", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "encoded = pd.DataFrame()\n", 140 | "\n", 141 | "for i in features.columns:\n", 142 | " encoded[i]=le.fit_transform(features[i])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "id": "aa007bf9", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
agegendereducationincome
03114
11122
20103
34020
43122
\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " age gender education income\n", 220 | "0 3 1 1 4\n", 221 | "1 1 1 2 2\n", 222 | "2 0 1 0 3\n", 223 | "3 4 0 2 0\n", 224 | "4 3 1 2 2" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "encoded.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "id": "d6e3b2b2", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "x_train, x_test, y_train, y_test = train_test_split(encoded,target,test_size=0.2,random_state=123)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 12, 249 | "id": "c32949b3", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "best_features = SelectKBest(score_func=f_regression,k='all')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 14, 259 | "id": "7d663462", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "selection = best_features.fit(x_train,y_train)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 17, 269 | "id": "698ef2e8", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "array([14.04421628, 1.44327701, 0.01548106, 10.88645751])" 276 | ] 277 | }, 278 | "execution_count": 17, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "selection.scores_" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 18, 290 | "id": "4123f5fc", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "anova_score = pd.DataFrame(zip(features.columns,selection.scores_,selection.pvalues_),columns=['features','score','pValues'])" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 20, 300 | "id": "816cc320", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
featuresscorepValues
0age14.0442160.000191
3income10.8864580.001012
1gender1.4432770.229966
2education0.0154810.901012
\n", 356 | "
" 357 | ], 358 | "text/plain": [ 359 | " features score pValues\n", 360 | "0 age 14.044216 0.000191\n", 361 | "3 income 10.886458 0.001012\n", 362 | "1 gender 1.443277 0.229966\n", 363 | "2 education 0.015481 0.901012" 364 | ] 365 | }, 366 | "execution_count": 20, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "anova_score.sort_values(by='score',ascending=False)" 373 | ] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3 (ipykernel)", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.9.7" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 5 397 | } 398 | -------------------------------------------------------------------------------- /Scripts/FraudDetectionModelDataPrep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.model_selection import train_test_split, StratifiedKFold 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.metrics import confusion_matrix, classification_report 13 | import matplotlib.pyplot as plt 14 | import seaborn as sns 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | def dataPrepAutomation(inputDF): 21 | new_df = inputDF.drop(columns=['nameOrig','nameDest'],axis=1) 22 | dummies = pd.get_dummies(inputDF['type']) 23 | processedDf = pd.concat([new_df.drop('type',axis=1),dummies],axis=1) 24 | independent = processedDf.drop('isFraud',axis=1) 25 | dependent = processedDf['isFraud'] 26 | return independent, dependent 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | def smoteBalancing(features, targets): 33 | from imblearn.over_sampling import SMOTE 34 | smote = SMOTE() 35 | x_smote, y_smote = smote.fit_resample(features, targets) 36 | X_train, X_test, y_train, y_test = train_test_split(x_smote,y_smote, test_size=0.30, random_state=True) 37 | return X_train, X_test, y_train, y_test 38 | 39 | -------------------------------------------------------------------------------- /Scripts/GloVe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[118]: 5 | 6 | 7 | my_vocab = ['apple','orange','shimla','banana','maruti','mumbai','china','india','husband' 8 | ,'wife','brother','sister','red','yellow','computer','mobile','pear','guava'] 9 | 10 | 11 | # In[36]: 12 | 13 | 14 | import gensim 15 | from sklearn.manifold import TSNE 16 | 17 | 18 | # In[2]: 19 | 20 | 21 | import gensim.downloader as api 22 | 23 | 24 | # In[3]: 25 | 26 | 27 | glove_model = api.load('glove-wiki-gigaword-300') 28 | 29 | 30 | # In[130]: 31 | 32 | 33 | glove_model.most_similar('love',topn=5) 34 | 35 | 36 | # In[133]: 37 | 38 | 39 | # husband - man + woman = wife 40 | 41 | 42 | # In[132]: 43 | 44 | 45 | glove_model.most_similar(positive= ['woman', 'husband'], negative=['man'],topn=1) 46 | 47 | 48 | # In[135]: 49 | 50 | 51 | words = [] 52 | vectors = [] 53 | 54 | 55 | for word in my_vocab: 56 | words.append(word) 57 | vectors.append(glove_model[word]) 58 | 59 | 60 | # In[138]: 61 | 62 | 63 | dicts = 
zip(words,vectors) 64 | 65 | 66 | # In[139]: 67 | 68 | 69 | import pandas as pd 70 | 71 | 72 | # In[141]: 73 | 74 | 75 | dim_model = TSNE(n_components=2, perplexity=3, init='pca', random_state=45) 76 | 77 | 78 | # In[143]: 79 | 80 | 81 | fit_model = dim_model.fit_transform(vectors) 82 | 83 | 84 | # In[145]: 85 | 86 | 87 | import matplotlib.pyplot as plt 88 | 89 | 90 | # In[146]: 91 | 92 | 93 | fit_model 94 | 95 | 96 | # In[148]: 97 | 98 | 99 | x = [] 100 | y = [] 101 | 102 | for i in fit_model: 103 | x.append(i[0]) 104 | y.append(i[1]) 105 | 106 | 107 | # In[167]: 108 | 109 | 110 | plt.figure(figsize=(8,8)) 111 | 112 | for i in range(len(x)): 113 | plt.scatter(x[i],y[i]) 114 | plt.annotate(words[i], xy=(x[i],y[i]), 115 | xytext=(2, 2), 116 | textcoords='offset points', 117 | ha='right', 118 | va='bottom' 119 | ) 120 | 121 | 122 | # In[ ]: 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /Scripts/LinearRegression_Live.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LinearRegression 11 | 12 | 13 | # In[28]: 14 | 15 | 16 | salary = pd.read_excel(r"D:\Learnerea\Tables\Salary.xlsx",sheet_name="Salary") 17 | salary.head() 18 | 19 | 20 | # In[7]: 21 | 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(salary[['YearsExperience']],salary.Salary,test_size=0.2,random_state=False) 24 | 25 | 26 | # In[13]: 27 | 28 | 29 | y_test.shape 30 | 31 | 32 | # In[14]: 33 | 34 | 35 | model = LinearRegression() 36 | 37 | 38 | # In[15]: 39 | 40 | 41 | model.fit(X_train,y_train) 42 | 43 | 44 | # In[16]: 45 | 46 | 47 | model.predict(X_test) 48 | 49 | 50 | # In[17]: 51 | 52 | 53 | y_test 54 | 55 | 56 | # In[18]: 57 | 58 | 59 | model.score(X_test,y_test) 60 | 61 | 62 | # In[21]: 63 | 64 | 65 | def salary_predict(exp): 66 | return model.predict([[exp]]) 67 | 68 | 69 | # In[22]: 70 | 71 | 72 | exp = float(input("Enter your years of exp: ")) 73 | print("The salary you can expect is: ", salary_predict(exp)) 74 | 75 | 76 | # In[25]: 77 | 78 | 79 | to_pred = pd.read_excel(r"D:\Learnerea\Tables\Salary.xlsx",sheet_name="to_predict") 80 | 81 | 82 | # In[26]: 83 | 84 | 85 | model.predict(to_pred) 86 | 87 | 88 | # In[27]: 89 | 90 | 91 | to_pred['predicted_salary']=model.predict(to_pred) 92 | to_pred 93 | 94 | -------------------------------------------------------------------------------- /Scripts/LogisticLive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | app = pd.read_excel("D:\Learnerea\Tables\loan_app_fraud.xlsx") 16 | app.head() 17 | 18 | 19 | # In[5]: 20 | 21 | 22 | sns.scatterplot(data=app, x="Age",y="Salary_KPM",hue="Fraud") 23 | 24 | 25 | # In[6]: 26 | 27 | 28 | from sklearn.model_selection import train_test_split 29 | 30 | 31 | # In[7]: 32 | 33 | 34 | X_train, X_test, y_train, y_test = train_test_split(app[["Age","Salary_KPM"]],app.Fraud,test_size=0.2, random_state=False) 35 | 36 | 37 | # In[12]: 38 | 39 | 40 | from sklearn.linear_model import LogisticRegression 41 | 42 | 43 | # In[13]: 44 | 45 | 46 | model = LogisticRegression() 47 | 48 | 49 | # In[15]: 50 | 51 | 52 | 
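# Beyond hard 0/1 labels, the fitted model also exposes class probabilities via predict_proba;
# a small sketch assuming the X_test split above and that the positive class is Fraud == 1
# (only valid once the fit in the next cell has run):
#
# probs = model.predict_proba(X_test)[:, 1]            # estimated P(Fraud) per applicant
# labels = (probs > 0.5).astype(int)                   # matches model.predict at the default threshold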
model.fit(X_train,y_train) 53 | 54 | 55 | # In[16]: 56 | 57 | 58 | model.predict(X_test) 59 | 60 | 61 | # In[17]: 62 | 63 | 64 | model.score(X_test,y_test) 65 | 66 | 67 | # In[18]: 68 | 69 | 70 | def fraud_pred_mode(age,salary): 71 | if model.predict([[age,salary]])==1: 72 | return "Fraud" 73 | else: 74 | return "Genuine" 75 | fraud_pred_mode(20,90) 76 | 77 | 78 | # In[20]: 79 | 80 | 81 | age = int(input("Enter the age of the applicant: ")) 82 | salary = int(input("Enter the Salary of the applicant: ")) 83 | 84 | print("The applicant seems to be ", fraud_pred_mode(age,salary)) 85 | 86 | -------------------------------------------------------------------------------- /Scripts/NER.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import spacy 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | nlp = spacy.load("en_core_web_sm") 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | news = nlp("TATA acquired BigBasket Ent. for $45 billion") 20 | 21 | 22 | # In[6]: 23 | 24 | 25 | for x in news.ents: 26 | print(x.text, "==>", x.label_,"==>", spacy.explain(x.label_)) 27 | 28 | 29 | # In[7]: 30 | 31 | 32 | from spacy import displacy 33 | 34 | 35 | # In[8]: 36 | 37 | 38 | displacy.render(news, style='ent') 39 | 40 | -------------------------------------------------------------------------------- /Scripts/PD_credit_risk_model_dev.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | # In[144]: 14 | 15 | 16 | train = pd.read_csv(r'D:\Learnerea\Tables\GiveMeSomeCredit\cs-training.csv').drop(['Unnamed: 0'],axis=1) 17 | test = pd.read_csv(r'D:\Learnerea\Tables\GiveMeSomeCredit\cs-test.csv').drop(['Unnamed: 0'],axis=1) 18 | 19 | 20 | # In[153]: 21 | 22 | 23 | train.shape 24 | 25 | 26 | # In[154]: 27 | 28 | 29 | test.shape 30 | 31 | 32 | # In[150]: 33 | 34 | 35 | train_redup = train.drop_duplicates() 36 | 37 | 38 | # In[169]: 39 | 40 | 41 | def findMiss(df): 42 | return round(df.isnull().sum()/df.shape[0]*100,2) 43 | 44 | 45 | # In[202]: 46 | 47 | 48 | train_redup.shape 49 | 50 | 51 | # In[170]: 52 | 53 | 54 | findMiss(train_redup) 55 | 56 | 57 | # In[175]: 58 | 59 | 60 | train_redup[train_redup.MonthlyIncome.isnull()].describe() 61 | 62 | 63 | # In[178]: 64 | 65 | 66 | train_redup['NumberOfDependents'].agg(['mode']) 67 | 68 | 69 | # In[193]: 70 | 71 | 72 | fam_miss = train_redup[train_redup.NumberOfDependents.isnull()] 73 | fam_nmiss = train_redup[train_redup.NumberOfDependents.notnull()] 74 | 75 | 76 | # In[194]: 77 | 78 | 79 | fam_miss['NumberOfDependents'] = fam_miss['NumberOfDependents'].fillna(0) 80 | fam_miss['MonthlyIncome'] = fam_miss['MonthlyIncome'].fillna(0) 81 | 82 | 83 | # In[195]: 84 | 85 | 86 | findMiss(fam_miss) 87 | 88 | 89 | # In[196]: 90 | 91 | 92 | findMiss(fam_nmiss) 93 | 94 | 95 | # In[197]: 96 | 97 | 98 | fam_nmiss['MonthlyIncome'].agg(['mean','median','min']) 99 | 100 | 101 | # In[198]: 102 | 103 | 104 | fam_nmiss['MonthlyIncome'] = fam_nmiss['MonthlyIncome'].fillna(fam_nmiss['MonthlyIncome'].median()) 105 | 106 | 107 | # In[199]: 108 | 109 | 110 | findMiss(fam_nmiss) 111 | 112 | 113 | # In[200]: 114 | 115 | 116 | filled_train = fam_nmiss.append(fam_miss) 117 | 118 | 119 | # In[203]: 120 | 121 | 122 | findMiss(filled_train) 123 | 124 | 125 | # In[204]: 126 | 127 | 128 | 
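# DataFrame.append, used above to stitch the two subsets back together, was deprecated in
# pandas 1.4 and removed in 2.0; on current pandas the same concatenation would be written as:
#
# filled_train = pd.concat([fam_nmiss, fam_miss])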
filled_train.head() 129 | 130 | 131 | # In[207]: 132 | 133 | 134 | filled_train.groupby(['SeriousDlqin2yrs']).size()/filled_train.shape[0] 135 | 136 | 137 | # In[208]: 138 | 139 | 140 | filled_train.RevolvingUtilizationOfUnsecuredLines.describe() 141 | 142 | 143 | # In[218]: 144 | 145 | 146 | filled_train['RevolvingUtilizationOfUnsecuredLines'].quantile([.99]) 147 | 148 | 149 | # In[237]: 150 | 151 | 152 | (filled_train[filled_train['RevolvingUtilizationOfUnsecuredLines'] > 10]).describe() 153 | 154 | 155 | # In[243]: 156 | 157 | 158 | util_droped = filled_train.drop(filled_train[filled_train['RevolvingUtilizationOfUnsecuredLines'] > 10].index) 159 | 160 | 161 | # In[249]: 162 | 163 | 164 | sns.boxplot(util_droped['age']) 165 | 166 | 167 | # In[251]: 168 | 169 | 170 | util_droped.groupby(['NumberOfTime30-59DaysPastDueNotWorse']).size() 171 | 172 | 173 | # In[252]: 174 | 175 | 176 | util_droped.groupby(['NumberOfTime60-89DaysPastDueNotWorse']).size() 177 | 178 | 179 | # In[253]: 180 | 181 | 182 | util_droped.groupby(['NumberOfTimes90DaysLate']).size() 183 | 184 | 185 | # In[258]: 186 | 187 | 188 | util_droped[util_droped['NumberOfTimes90DaysLate']>=96].groupby(['SeriousDlqin2yrs']).size() 189 | 190 | 191 | # In[260]: 192 | 193 | 194 | util_droped['DebtRatio'].describe() 195 | 196 | 197 | # In[262]: 198 | 199 | 200 | sns.kdeplot(util_droped['DebtRatio']) 201 | 202 | 203 | # In[289]: 204 | 205 | 206 | util_droped['DebtRatio'].quantile([.975]) 207 | 208 | 209 | # In[293]: 210 | 211 | 212 | util_droped[util_droped['DebtRatio']>3492][['SeriousDlqin2yrs','MonthlyIncome']].describe() 213 | 214 | 215 | # In[297]: 216 | 217 | 218 | temp = util_droped[(util_droped['DebtRatio']>3492) & (util_droped['SeriousDlqin2yrs']==util_droped['MonthlyIncome'])] 219 | 220 | 221 | # In[300]: 222 | 223 | 224 | dRatio = util_droped.drop(util_droped[(util_droped['DebtRatio']>3492) & (util_droped['SeriousDlqin2yrs']==util_droped['MonthlyIncome'])].index) 225 | 226 | 227 | # In[311]: 228 | 229 | 230 | # pip install Xgboost 231 | 232 | 233 | # In[327]: 234 | 235 | 236 | from xgboost import XGBClassifier 237 | from sklearn.metrics import accuracy_score 238 | from sklearn.metrics import confusion_matrix, classification_report 239 | 240 | 241 | # In[320]: 242 | 243 | 244 | model = XGBClassifier(tree_method = 'exact') 245 | 246 | 247 | # In[318]: 248 | 249 | 250 | x = dRatio.drop(['SeriousDlqin2yrs'],axis=1) 251 | y = dRatio['SeriousDlqin2yrs'] 252 | 253 | 254 | # In[322]: 255 | 256 | 257 | model.fit(x,y.values.ravel()) 258 | y_pred = model.predict(x) 259 | 260 | 261 | # In[323]: 262 | 263 | 264 | accuracy_score(y,y_pred) 265 | 266 | 267 | # In[325]: 268 | 269 | 270 | cm = confusion_matrix(y,y_pred) 271 | 272 | 273 | # In[326]: 274 | 275 | 276 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 277 | plt.xticks(np.arange(2)+.5,['No def','def']) 278 | plt.yticks(np.arange(2)+.5,['No def','def']) 279 | plt.xlabel("predicted") 280 | plt.ylabel("actuals") 281 | 282 | 283 | # In[328]: 284 | 285 | 286 | print(classification_report(y,y_pred)) 287 | 288 | -------------------------------------------------------------------------------- /Scripts/Payment_Fraud_Detection_Model_Logistic_Reg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[8]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.model_selection import 
train_test_split 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import confusion_matrix 14 | 15 | 16 | # In[9]: 17 | 18 | 19 | df = pd.read_csv(r"D:\Learnerea\Tables\PS_20174392719_1491204439457_log.csv") 20 | df.head() 21 | 22 | 23 | # In[10]: 24 | 25 | 26 | df.shape 27 | 28 | 29 | # In[11]: 30 | 31 | 32 | df3 = df.drop(['nameOrig','nameDest','isFlaggedFraud'],axis=1) 33 | 34 | 35 | # In[12]: 36 | 37 | 38 | df3.head() 39 | 40 | 41 | # In[15]: 42 | 43 | 44 | df3['type'].unique() 45 | 46 | 47 | # In[18]: 48 | 49 | 50 | dummies = pd.get_dummies(df3['type']).drop(['CASH_IN'],axis=1) 51 | dummies 52 | 53 | 54 | # In[30]: 55 | 56 | 57 | df4 = pd.concat([df3,dummies],axis=1).drop(['type'],axis=1) 58 | df4 59 | 60 | 61 | # In[31]: 62 | 63 | 64 | X_train, X_test, y_train, y_test = train_test_split(df4.drop(['isFraud'],axis=1),df4.isFraud,test_size=0.2,random_state=False) 65 | 66 | 67 | # In[32]: 68 | 69 | 70 | X_test.shape 71 | 72 | 73 | # In[33]: 74 | 75 | 76 | model = LogisticRegression() 77 | 78 | 79 | # In[34]: 80 | 81 | 82 | model.fit(X_train,y_train) 83 | 84 | 85 | # In[35]: 86 | 87 | 88 | model.score(X_test,y_test) 89 | 90 | 91 | # In[39]: 92 | 93 | 94 | df4['isFraud'].unique() 95 | 96 | 97 | # In[42]: 98 | 99 | 100 | df4.groupby('isFraud').sum() 101 | 102 | 103 | # In[43]: 104 | 105 | 106 | predict = model.predict(X_test) 107 | predict 108 | 109 | 110 | # In[45]: 111 | 112 | 113 | cm = confusion_matrix(y_test,predict) 114 | 115 | 116 | # In[55]: 117 | 118 | 119 | sns.heatmap(cm,cmap='Oranges',annot=True,fmt='d',cbar=False,linecolor='Black',linewidths=5) 120 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 121 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 122 | plt.xlabel("predicted") 123 | plt.ylabel("actuals") 124 | 125 | -------------------------------------------------------------------------------- /Scripts/Payment_Fraud_Detection_Model_Logistic_and_Decision_Tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[166]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.metrics import confusion_matrix, classification_report 15 | from sklearn.tree import DecisionTreeClassifier 16 | 17 | 18 | # In[104]: 19 | 20 | 21 | df = pd.read_csv(r"D:\Learnerea\Tables\PS_20174392719_1491204439457_log.csv") 22 | df.head() 23 | 24 | 25 | # In[105]: 26 | 27 | 28 | df.groupby('isFlaggedFraud').count() 29 | 30 | 31 | # In[106]: 32 | 33 | 34 | df.groupby('type').count() 35 | 36 | 37 | # In[116]: 38 | 39 | 40 | df.pivot_table(values='amount',index='type',columns='isFraud',aggfunc='count') 41 | 42 | 43 | # In[136]: 44 | 45 | 46 | dfFltrd = df[df.type.isin(['CASH_OUT','TRANSFER'])] 47 | dfFltrd.pivot_table(values='amount',index='type',columns='isFraud',aggfunc='count') 48 | 49 | 50 | # In[137]: 51 | 52 | 53 | encoder = LabelEncoder() 54 | 55 | 56 | # In[138]: 57 | 58 | 59 | dfFltrd['typeEncoded'] = encoder.fit_transform(dfFltrd['type']) 60 | dfFltrd.pivot_table(values='amount',index='typeEncoded',columns='isFraud',aggfunc='count') 61 | 62 | 63 | # In[139]: 64 | 65 | 66 | df2 = dfFltrd.drop(['step','type','nameOrig','nameDest','isFlaggedFraud'],axis=1) 67 | 68 | 69 | # In[146]: 70 | 71 | 72 | df2.head() 73 | 74 | 75 | # In[150]: 
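# The fraud label is heavily imbalanced, so a stratified split keeps the fraud rate comparable
# between train and test; a sketch of the same split with stratification added (random_state is
# an arbitrary choice here):
#
# X_train, X_test, y_train, y_test = train_test_split(
#     df2.drop(['isFraud'], axis=1), df2.isFraud,
#     test_size=0.2, stratify=df2.isFraud, random_state=42)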
76 | 77 | 78 | X_train, X_test, y_train, y_test = train_test_split(df2.drop(['isFraud'],axis=1),df2.isFraud,test_size=0.2,random_state=False) 79 | 80 | 81 | # In[151]: 82 | 83 | 84 | LogReg = LogisticRegression() 85 | 86 | 87 | # In[152]: 88 | 89 | 90 | LogReg.fit(X_train,y_train) 91 | 92 | 93 | # In[154]: 94 | 95 | 96 | predicted = LogReg.predict(X_test) 97 | 98 | 99 | # In[155]: 100 | 101 | 102 | LogReg.score(X_test,y_test) 103 | 104 | 105 | # In[157]: 106 | 107 | 108 | cm = confusion_matrix(y_test,predicted) 109 | cm 110 | 111 | 112 | # In[165]: 113 | 114 | 115 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 116 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 117 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 118 | plt.xlabel("predicted") 119 | plt.ylabel("actuals") 120 | 121 | 122 | # In[164]: 123 | 124 | 125 | print(classification_report(y_test,predicted)) 126 | 127 | 128 | # In[167]: 129 | 130 | 131 | dModel = DecisionTreeClassifier() 132 | 133 | 134 | # In[168]: 135 | 136 | 137 | dModel.fit(X_train,y_train) 138 | 139 | 140 | # In[169]: 141 | 142 | 143 | dPredicted = dModel.predict(X_test) 144 | 145 | 146 | # In[170]: 147 | 148 | 149 | cm = confusion_matrix(y_test,dPredicted) 150 | 151 | 152 | # In[171]: 153 | 154 | 155 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 156 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 157 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 158 | plt.xlabel("predicted") 159 | plt.ylabel("actuals") 160 | 161 | 162 | # In[172]: 163 | 164 | 165 | print(classification_report(y_test,dPredicted)) 166 | 167 | -------------------------------------------------------------------------------- /Scripts/PoS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import nltk 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | sentence = "Tesla is not as great as we think" 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | token = nltk.word_tokenize(sentence) 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | posd = nltk.pos_tag(token) 26 | posd 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | nltk.help.upenn_tagset('VB') 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | # for i in range(len(posd)): 39 | # print(posd[i][0],"==>",posd[i][1],"==>",nltk.help.upenn_tagset(posd[i][1])) 40 | 41 | -------------------------------------------------------------------------------- /Scripts/PreProcessing_PD_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "id": "30d956d3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import seaborn as sns\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from sklearn.preprocessing import StandardScaler\n", 15 | "from imblearn.over_sampling import SMOTE\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.linear_model import LogisticRegression\n", 18 | "from sklearn.metrics import classification_report\n", 19 | "from sklearn.ensemble import RandomForestClassifier\n", 20 | "from xgboost import XGBClassifier\n", 21 | "from imblearn.over_sampling import SMOTE" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "id": "407126de", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "def ModePreProcessing(df):\n", 32 | "\n", 33 | "# Outliers 
Treatment\n", 34 | " cr_age_rmvd = df[df['person_age']<=70]\n", 35 | " cr_age_rmvd.reset_index(drop=True, inplace=True)\n", 36 | " person_emp_rmvd = cr_age_rmvd[cr_age_rmvd['person_emp_length']<=47]\n", 37 | " person_emp_rmvd.reset_index(drop=True, inplace=True)\n", 38 | " cr_data = person_emp_rmvd.copy()\n", 39 | " \n", 40 | "# Missing Values Treatment\n", 41 | " cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)\n", 42 | " cr_data_copy=cr_data.drop('loan_grade',axis=1)\n", 43 | " cr_data_cat_treated = cr_data_copy.copy()\n", 44 | " \n", 45 | "# Categorical Variables Treatment\n", 46 | " person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership'],drop_first=True).astype(int)\n", 47 | " loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent'],drop_first=True).astype(int)\n", 48 | " cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0)\n", 49 | " data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1)\n", 50 | "\n", 51 | "# Scaling the data\n", 52 | " scaler = StandardScaler()\n", 53 | " scaled_data = scaler.fit_transform(data_to_scale)\n", 54 | " scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt',\n", 55 | " 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'])\n", 56 | " scaled_data_combined = pd.concat([scaled_df,person_home_ownership,loan_intent],axis=1)\n", 57 | " scaled_data_combined['cb_person_default_on_file'] = cr_data_cat_treated['cb_person_default_on_file_binary']\n", 58 | " scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status']\n", 59 | " \n", 60 | "# Features and Target Creation \n", 61 | " target = scaled_data_combined['loan_status']\n", 62 | " features = scaled_data_combined.drop('loan_status',axis=1)\n", 63 | "\n", 64 | "# SMOTE Balancing\n", 65 | " smote = SMOTE()\n", 66 | " balanced_features, balanced_target = smote.fit_resample(features,target)\n", 67 | " \n", 68 | "# return the final datasets\n", 69 | " return data_to_scale, features, target, balanced_features, balanced_target" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 (ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.9.7" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 5 94 | } 95 | -------------------------------------------------------------------------------- /Scripts/PreProcessing_PD_Model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.preprocessing import StandardScaler 12 | from imblearn.over_sampling import SMOTE 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.metrics import classification_report 16 | from sklearn.ensemble import RandomForestClassifier 17 | from xgboost import XGBClassifier 18 | from 
imblearn.over_sampling import SMOTE 19 | 20 | 21 | # In[3]: 22 | 23 | 24 | def ModePreProcessing(df): 25 | 26 | # Outliers Treatment 27 | cr_age_rmvd = df[df['person_age']<=70] 28 | cr_age_rmvd.reset_index(drop=True, inplace=True) 29 | person_emp_rmvd = cr_age_rmvd[cr_age_rmvd['person_emp_length']<=47] 30 | person_emp_rmvd.reset_index(drop=True, inplace=True) 31 | cr_data = person_emp_rmvd.copy() 32 | 33 | # Missing Values Treatment 34 | cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True) 35 | cr_data_copy=cr_data.drop('loan_grade',axis=1) 36 | cr_data_cat_treated = cr_data_copy.copy() 37 | 38 | # Categorical Variables Treatment 39 | person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership'],drop_first=True).astype(int) 40 | loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent'],drop_first=True).astype(int) 41 | cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0) 42 | data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1) 43 | 44 | # Scaling the data 45 | scaler = StandardScaler() 46 | scaled_data = scaler.fit_transform(data_to_scale) 47 | scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 48 | 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']) 49 | scaled_data_combined = pd.concat([scaled_df,person_home_ownership,loan_intent],axis=1) 50 | scaled_data_combined['cb_person_default_on_file'] = cr_data_cat_treated['cb_person_default_on_file_binary'] 51 | scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status'] 52 | 53 | # Features and Target Creation 54 | target = scaled_data_combined['loan_status'] 55 | features = scaled_data_combined.drop('loan_status',axis=1) 56 | 57 | # SMOTE Balancing 58 | smote = SMOTE() 59 | balanced_features, balanced_target = smote.fit_resample(features,target) 60 | 61 | # return the final datasets 62 | return data_to_scale, features, target, balanced_features, balanced_target 63 | 64 | -------------------------------------------------------------------------------- /Scripts/RandomForesPDmodel.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/RandomForesPDmodel.zip -------------------------------------------------------------------------------- /Scripts/Recursive Feature Elimination.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4b55c914", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "from sklearn.tree import DecisionTreeClassifier\n", 16 | "from sklearn.feature_selection import RFE, RFECV\n", 17 | "\n", 18 | "import warnings\n", 19 | "warnings.filterwarnings('ignore')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "8bdb5f01", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 195 | "

5 rows × 31 columns

\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 200 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 201 | "1 1 20.57 17.77 132.90 1326.0 \n", 202 | "2 1 19.69 21.25 130.00 1203.0 \n", 203 | "3 1 11.42 20.38 77.58 386.1 \n", 204 | "4 1 20.29 14.34 135.10 1297.0 \n", 205 | "\n", 206 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 207 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 208 | "1 0.08474 0.07864 0.0869 0.07017 \n", 209 | "2 0.10960 0.15990 0.1974 0.12790 \n", 210 | "3 0.14250 0.28390 0.2414 0.10520 \n", 211 | "4 0.10030 0.13280 0.1980 0.10430 \n", 212 | "\n", 213 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 214 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 215 | "1 0.1812 ... 24.99 23.41 158.80 \n", 216 | "2 0.2069 ... 23.57 25.53 152.50 \n", 217 | "3 0.2597 ... 14.91 26.50 98.87 \n", 218 | "4 0.1809 ... 22.54 16.67 152.20 \n", 219 | "\n", 220 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 221 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 222 | "1 1956.0 0.1238 0.1866 0.2416 \n", 223 | "2 1709.0 0.1444 0.4245 0.4504 \n", 224 | "3 567.7 0.2098 0.8663 0.6869 \n", 225 | "4 1575.0 0.1374 0.2050 0.4000 \n", 226 | "\n", 227 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 228 | "0 0.2654 0.4601 0.11890 \n", 229 | "1 0.1860 0.2750 0.08902 \n", 230 | "2 0.2430 0.3613 0.08758 \n", 231 | "3 0.2575 0.6638 0.17300 \n", 232 | "4 0.1625 0.2364 0.07678 \n", 233 | "\n", 234 | "[5 rows x 31 columns]" 235 | ] 236 | }, 237 | "execution_count": 2, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 244 | "df.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "id": "5dee842c", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis',axis=1), df.diagnosis, test_size=0.33, random_state=42)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 14, 260 | "id": "005707bd", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "rfe_select = RFE(DecisionTreeClassifier(),n_features_to_select=0.25).fit(X_train,y_train)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "id": "8c02d803", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "array([False, False, False, False, False, False, False, True, False,\n", 277 | " False, True, False, False, False, False, False, False, False,\n", 278 | " False, True, True, True, True, False, False, False, True,\n", 279 | " False, False, False])" 280 | ] 281 | }, 282 | "execution_count": 16, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "rfe_select.support_" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 17, 294 | "id": "289c7fff", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "important_features = pd.DataFrame(list(zip(X_train.columns,rfe_select.support_)),columns=['features','important'])" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 18, 304 | "id": "39055606", 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/html": [ 310 | "
\n", 311 | "\n", 324 | "\n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | "
featuresimportant
26concavity_worstTrue
7concave points_meanTrue
22perimeter_worstTrue
10radius_seTrue
21texture_worstTrue
20radius_worstTrue
19fractal_dimension_seTrue
0radius_meanFalse
16concavity_seFalse
28symmetry_worstFalse
27concave points_worstFalse
25compactness_worstFalse
24smoothness_worstFalse
23area_worstFalse
18symmetry_seFalse
17concave points_seFalse
15compactness_seFalse
1texture_meanFalse
14smoothness_seFalse
13area_seFalse
12perimeter_seFalse
11texture_seFalse
9fractal_dimension_meanFalse
8symmetry_meanFalse
6concavity_meanFalse
5compactness_meanFalse
4smoothness_meanFalse
3area_meanFalse
2perimeter_meanFalse
29fractal_dimension_worstFalse
\n", 485 | "
" 486 | ], 487 | "text/plain": [ 488 | " features important\n", 489 | "26 concavity_worst True\n", 490 | "7 concave points_mean True\n", 491 | "22 perimeter_worst True\n", 492 | "10 radius_se True\n", 493 | "21 texture_worst True\n", 494 | "20 radius_worst True\n", 495 | "19 fractal_dimension_se True\n", 496 | "0 radius_mean False\n", 497 | "16 concavity_se False\n", 498 | "28 symmetry_worst False\n", 499 | "27 concave points_worst False\n", 500 | "25 compactness_worst False\n", 501 | "24 smoothness_worst False\n", 502 | "23 area_worst False\n", 503 | "18 symmetry_se False\n", 504 | "17 concave points_se False\n", 505 | "15 compactness_se False\n", 506 | "1 texture_mean False\n", 507 | "14 smoothness_se False\n", 508 | "13 area_se False\n", 509 | "12 perimeter_se False\n", 510 | "11 texture_se False\n", 511 | "9 fractal_dimension_mean False\n", 512 | "8 symmetry_mean False\n", 513 | "6 concavity_mean False\n", 514 | "5 compactness_mean False\n", 515 | "4 smoothness_mean False\n", 516 | "3 area_mean False\n", 517 | "2 perimeter_mean False\n", 518 | "29 fractal_dimension_worst False" 519 | ] 520 | }, 521 | "execution_count": 18, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "important_features.sort_values(by='important',ascending=False)" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3 (ipykernel)", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.9.7" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /Scripts/UK_PM_Sentiment_Analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Environment setup and Log in to Twitter 5 | 6 | # In[7]: 7 | 8 | 9 | import selenium 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.common.keys import Keys 13 | from time import sleep 14 | import getpass 15 | 16 | 17 | # In[12]: 18 | 19 | 20 | my_user = "LearnereaBot" 21 | # my_pass = getpass.getpass() 22 | my_pass = "Jamesbond#0016" 23 | 24 | # In[16]: 25 | 26 | 27 | search_item = "Liz Truss" 28 | 29 | 30 | # In[9]: 31 | 32 | 33 | PATH = "C:\Program Files\drivers\chromedriver_103.exe" 34 | driver = webdriver.Chrome(PATH) 35 | driver.get("https://twitter.com/i/flow/login") 36 | # driver.maximize_window() 37 | sleep(3) 38 | 39 | 40 | # In[10]: 41 | 42 | 43 | user_id = driver.find_element(By.XPATH,"//input[@type='text']") 44 | user_id.send_keys(my_user) 45 | user_id.send_keys(Keys.ENTER) 46 | 47 | 48 | # In[13]: 49 | 50 | 51 | password = driver.find_element(By.XPATH,"//input[@type='password']") 52 | password.send_keys(my_pass) 53 | password.send_keys(Keys.ENTER) 54 | 55 | 56 | # # Scrape Tweets mentioning about Liz Truss 57 | 58 | # In[18]: 59 | 60 | 61 | search_box = driver.find_element(By.XPATH,"//input[@data-testid='SearchBox_Search_Input']") 62 | search_box.send_keys(search_item) 63 | search_box.send_keys(Keys.ENTER) 64 | 65 | 66 | # In[24]: 67 | 68 | 69 | all_tweets = set() 70 | 71 | 72 | tweets = 
driver.find_elements(By.XPATH,"//div[@data-testid='tweetText']") 73 | while True: 74 | for tweet in tweets: 75 | all_tweets.add(tweet.text) 76 | driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') 77 | sleep(3) 78 | tweets = driver.find_elements(By.XPATH,"//div[@data-testid='tweetText']") 79 | if len(all_tweets)>50: 80 | break 81 | 82 | 83 | # In[25]: 84 | 85 | 86 | all_tweets = list(all_tweets) 87 | all_tweets[0] 88 | 89 | 90 | # # Cleaning the Tweets 91 | 92 | # In[64]: 93 | 94 | 95 | import pandas as pd 96 | pd.options.display.max_colwidth = 1000 97 | import re 98 | import nltk 99 | nltk.download('punkt') 100 | nltk.download('stopwords') 101 | 102 | from nltk.corpus import stopwords 103 | from nltk.tokenize import word_tokenize 104 | 105 | 106 | # In[66]: 107 | 108 | 109 | stp_words = stopwords.words('english') 110 | print(stp_words) 111 | 112 | 113 | # In[56]: 114 | 115 | 116 | df = pd.DataFrame(all_tweets,columns=['tweets']) 117 | df.head() 118 | 119 | 120 | # In[60]: 121 | 122 | 123 | one_tweet=df.iloc[4]['tweets'] 124 | one_tweet 125 | 126 | 127 | # In[86]: 128 | 129 | 130 | from textblob import TextBlob 131 | from wordcloud import WordCloud 132 | 133 | def TweetCleaning(tweet): 134 | cleanTweet = re.sub(r"@[a-zA-Z0-9]+","",tweet) 135 | cleanTweet = re.sub(r"#[a-zA-Z0-9\s]+","",cleanTweet) 136 | cleanTweet = ' '.join(word for word in cleanTweet.split() if word not in stp_words) 137 | return cleanTweet 138 | 139 | def calPolarity(tweet): 140 | return TextBlob(tweet).sentiment.polarity 141 | 142 | def calSubjectivity(tweet): 143 | return TextBlob(tweet).sentiment.subjectivity 144 | 145 | def segmentation(tweet): 146 | if tweet > 0: 147 | return "positive" 148 | if tweet == 0: 149 | return "neutral" 150 | else: 151 | return "negative" 152 | 153 | 154 | # In[88]: 155 | 156 | 157 | df['cleanedTweets'] = df['tweets'].apply(TweetCleaning) 158 | df['tPolarity'] = df['cleanedTweets'].apply(calPolarity) 159 | df['tSubjectivity'] = df['cleanedTweets'].apply(calSubjectivity) 160 | df['segmentation'] = df['tPolarity'].apply(segmentation) 161 | df.head() 162 | 163 | 164 | # # Analysis and Visualization 165 | 166 | # In[91]: 167 | 168 | 169 | df.pivot_table(index=['segmentation'],aggfunc={'segmentation':'count'}) 170 | 171 | 172 | # In[94]: 173 | 174 | 175 | # top 3 most positive 176 | df.sort_values(by=['tPolarity'],ascending=False).head(3) 177 | 178 | 179 | # In[95]: 180 | 181 | 182 | # top 3 most negative 183 | df.sort_values(by=['tPolarity'],ascending=True).head(3) 184 | 185 | 186 | # In[116]: 187 | 188 | 189 | # 3 neutral 190 | df[df.tPolarity==0] 191 | 192 | 193 | # In[103]: 194 | 195 | 196 | import matplotlib.pyplot as plt 197 | 198 | consolidated = ' '.join(word for word in df['cleanedTweets']) 199 | 200 | wordCloud = WordCloud(width=400, height=200, random_state=20, max_font_size=119).generate(consolidated) 201 | 202 | plt.imshow(wordCloud, interpolation='bilinear') 203 | plt.axis('off') 204 | plt.show() 205 | 206 | 207 | # In[104]: 208 | 209 | 210 | import seaborn as sns 211 | 212 | 213 | # In[113]: 214 | 215 | 216 | df.groupby('segmentation').count() 217 | 218 | 219 | # In[115]: 220 | 221 | 222 | plt.figure(figsize=(10,5)) 223 | sns.set_style("whitegrid") 224 | sns.scatterplot(data=df, x='tPolarity',y='tSubjectivity',s=100,hue='segmentation') 225 | 226 | 227 | # In[117]: 228 | 229 | 230 | sns.countplot(data=df,x='segmentation') 231 | 232 | 233 | # In[118]: 234 | 235 | 236 | positive = round(len(df[df.segmentation == 'positive'])/len(df)*100,1) 237 | negative = 
round(len(df[df.segmentation == 'negative'])/len(df)*100,1) 238 | neutral = round(len(df[df.segmentation == 'neutral'])/len(df)*100,1) 239 | 240 | responses = [positive, negative, neutral] 241 | responses 242 | 243 | response = {'resp': ['mayWin', 'mayLoose', 'notSure'], 'pct':[positive, negative, neutral]} 244 | pd.DataFrame(response) 245 | 246 | -------------------------------------------------------------------------------- /Scripts/UsingAutomatedModulesAndModels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "600be598", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import FraudDetectionModelDataPrep as dataPrep" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "b8af91ab", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "['LabelEncoder',\n", 23 | " 'LogisticRegression',\n", 24 | " 'StratifiedKFold',\n", 25 | " '__builtins__',\n", 26 | " '__cached__',\n", 27 | " '__doc__',\n", 28 | " '__file__',\n", 29 | " '__loader__',\n", 30 | " '__name__',\n", 31 | " '__package__',\n", 32 | " '__spec__',\n", 33 | " 'classification_report',\n", 34 | " 'confusion_matrix',\n", 35 | " 'dataPrepAutomation',\n", 36 | " 'np',\n", 37 | " 'pd',\n", 38 | " 'plt',\n", 39 | " 'smoteBalancing',\n", 40 | " 'sns',\n", 41 | " 'train_test_split']" 42 | ] 43 | }, 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "dir(dataPrep)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "id": "3a7b6687", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "inputDF = dataPrep.pd.read_excel(r\"D:\\Learnerea\\Tables\\to_test_the_model.xlsx\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "id": "89006010", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "(4500, 11)" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "inputDF.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "id": "1cd83ecc", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "features, target = dataPrep.dataPrepAutomation(inputDF)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 9, 97 | "id": "d71a7250", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFlaggedFraudCASH_INCASH_OUTDEBITPAYMENTTRANSFER
030313363860.780.000.0013656437.6227020298.400FalseFalseFalseFalseTrue
1355340375.649758.510.004764447.255104822.890FalseFalseFalseFalseTrue
2404276737.580.000.00962518.441239256.030FalseTrueFalseFalseFalse
3258314027.77650090.00336062.230.00314027.770FalseTrueFalseFalseFalse
43633468.450.000.00149332.38182800.830FalseTrueFalseFalseFalse
\n", 213 | "
" 214 | ], 215 | "text/plain": [ 216 | " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \n", 217 | "0 303 13363860.78 0.00 0.00 13656437.62 \\\n", 218 | "1 355 340375.64 9758.51 0.00 4764447.25 \n", 219 | "2 404 276737.58 0.00 0.00 962518.44 \n", 220 | "3 258 314027.77 650090.00 336062.23 0.00 \n", 221 | "4 36 33468.45 0.00 0.00 149332.38 \n", 222 | "\n", 223 | " newbalanceDest isFlaggedFraud CASH_IN CASH_OUT DEBIT PAYMENT TRANSFER \n", 224 | "0 27020298.40 0 False False False False True \n", 225 | "1 5104822.89 0 False False False False True \n", 226 | "2 1239256.03 0 False True False False False \n", 227 | "3 314027.77 0 False True False False False \n", 228 | "4 182800.83 0 False True False False False " 229 | ] 230 | }, 231 | "execution_count": 9, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "features.head()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "id": "239d3bc7", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "import pickle" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "id": "4ecb9787", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "with open(\"ensemble_model.pkl\",\"rb\") as file:\n", 258 | " ensemble_model = pickle.load(file)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 14, 264 | "id": "81c2d452", 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "array([0, 0, 0, ..., 1, 1, 1], dtype=int64)" 271 | ] 272 | }, 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "prediction = ensemble_model.predict(features)\n", 280 | "prediction" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 17, 286 | "id": "b6e2ed5e", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | " precision recall f1-score support\n", 294 | "\n", 295 | " 0 1.00 0.99 0.99 4000\n", 296 | " 1 0.91 1.00 0.95 500\n", 297 | "\n", 298 | " accuracy 0.99 4500\n", 299 | " macro avg 0.96 0.99 0.97 4500\n", 300 | "weighted avg 0.99 0.99 0.99 4500\n", 301 | "\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "print(dataPrep.classification_report(target,prediction))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 18, 312 | "id": "9ee6aca0", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "with open(\"smote_logistic.pkl\",\"rb\") as file:\n", 317 | " smote_logistic = pickle.load(file)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 19, 323 | "id": "888f1eec", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "array([0, 0, 0, ..., 1, 1, 1], dtype=int64)" 330 | ] 331 | }, 332 | "execution_count": 19, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "smoteLogPred = smote_logistic.predict(features)\n", 339 | "smoteLogPred" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 20, 345 | "id": "192cf23d", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | " precision recall f1-score support\n", 353 | "\n", 354 | " 0 0.99 0.92 0.95 4000\n", 355 | " 1 0.58 0.90 0.71 500\n", 356 | "\n", 357 | " accuracy 0.92 4500\n", 358 | " macro avg 
0.79 0.91 0.83 4500\n", 359 | "weighted avg 0.94 0.92 0.92 4500\n", 360 | "\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "print(dataPrep.classification_report(target,smoteLogPred))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 21, 371 | "id": "4973418f", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "smoteLogProb = smote_logistic.predict_proba(features)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 24, 381 | "id": "270284c5", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "smoteLogProbdata = smoteLogProb[:,1]" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 27, 391 | "id": "8e19858c", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "finalData = dataPrep.pd.DataFrame(zip(target,prediction,smoteLogPred,smoteLogProbdata),columns=['isFraud','ensem_pred','smote_pred','smote_prob'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 28, 401 | "id": "5eb1e71a", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFlaggedFraudCASH_INCASH_OUTDEBITPAYMENTTRANSFERisFraudensem_predsmote_predsmote_prob
030313363860.780.000.0013656437.6227020298.400FalseFalseFalseFalseTrue0004.550288e-79
1355340375.649758.510.004764447.255104822.890FalseFalseFalseFalseTrue0004.131617e-03
2404276737.580.000.00962518.441239256.030FalseTrueFalseFalseFalse0008.529435e-03
3258314027.77650090.00336062.230.00314027.770FalseTrueFalseFalseFalse0019.799595e-01
43633468.450.000.00149332.38182800.830FalseTrueFalseFalseFalse0003.666744e-01
\n", 541 | "
" 542 | ], 543 | "text/plain": [ 544 | " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \n", 545 | "0 303 13363860.78 0.00 0.00 13656437.62 \\\n", 546 | "1 355 340375.64 9758.51 0.00 4764447.25 \n", 547 | "2 404 276737.58 0.00 0.00 962518.44 \n", 548 | "3 258 314027.77 650090.00 336062.23 0.00 \n", 549 | "4 36 33468.45 0.00 0.00 149332.38 \n", 550 | "\n", 551 | " newbalanceDest isFlaggedFraud CASH_IN CASH_OUT DEBIT PAYMENT \n", 552 | "0 27020298.40 0 False False False False \\\n", 553 | "1 5104822.89 0 False False False False \n", 554 | "2 1239256.03 0 False True False False \n", 555 | "3 314027.77 0 False True False False \n", 556 | "4 182800.83 0 False True False False \n", 557 | "\n", 558 | " TRANSFER isFraud ensem_pred smote_pred smote_prob \n", 559 | "0 True 0 0 0 4.550288e-79 \n", 560 | "1 True 0 0 0 4.131617e-03 \n", 561 | "2 False 0 0 0 8.529435e-03 \n", 562 | "3 False 0 0 1 9.799595e-01 \n", 563 | "4 False 0 0 0 3.666744e-01 " 564 | ] 565 | }, 566 | "execution_count": 28, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "finalOutcome = dataPrep.pd.concat([features,finalData],axis=1)\n", 573 | "finalOutcome.head()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 29, 579 | "id": "e0cc301b", 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "finalOutcome.to_excel(r\"D:\\Learnerea\\Tables\\dataforAnalysis.xlsx\")" 584 | ] 585 | } 586 | ], 587 | "metadata": { 588 | "kernelspec": { 589 | "display_name": "Python 3 (ipykernel)", 590 | "language": "python", 591 | "name": "python3" 592 | }, 593 | "language_info": { 594 | "codemirror_mode": { 595 | "name": "ipython", 596 | "version": 3 597 | }, 598 | "file_extension": ".py", 599 | "mimetype": "text/x-python", 600 | "name": "python", 601 | "nbconvert_exporter": "python", 602 | "pygments_lexer": "ipython3", 603 | "version": "3.9.7" 604 | } 605 | }, 606 | "nbformat": 4, 607 | "nbformat_minor": 5 608 | } 609 | -------------------------------------------------------------------------------- /Scripts/XGBpdModel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/XGBpdModel.pkl -------------------------------------------------------------------------------- /Scripts/airbnb_Host_Popularity_Rental_Prices_Hard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | pd.options.display.max_colwidth=1000 10 | 11 | 12 | # In[4]: 13 | 14 | 15 | ab = pd.read_excel(r"D:\Learnerea\Tables\airbnb_host_searches.xlsx").drop('id',axis=1) 16 | ab.head() 17 | 18 | 19 | # In[6]: 20 | 21 | 22 | ab.duplicated().value_counts() 23 | 24 | 25 | # In[7]: 26 | 27 | 28 | ab2=ab.drop_duplicates() 29 | ab2.duplicated().value_counts() 30 | 31 | 32 | # In[9]: 33 | 34 | 35 | ab2.dtypes 36 | 37 | 38 | # In[11]: 39 | 40 | 41 | ab2['host_id'] = str(ab2['price'])+ab2['room_type']+str(ab2['host_since'])+str(ab2['zipcode'])+str(ab2['number_of_reviews']) 42 | ab2.head() 43 | 44 | 45 | # In[12]: 46 | 47 | 48 | ab2['host_popularity'] = ab2['number_of_reviews'].apply(lambda x: 'New' if x==0 49 | else 'Rising' if x>=1 and x<=5 50 | else 'Trending Up' if x>=6 and x<=15 51 | else 'Popular' if x>=16 and x<=40 52 | else 'Hot' 53 | ) 54 | 55 | 56 | # In[17]: 57 | 58 | 59 | summary = 
ab2.pivot_table(index='host_popularity',values='price',aggfunc={'price':['min','mean','max']}).reset_index() 60 | summary.rename(columns={'max':'max_price', 61 | 'min':'min_price', 62 | 'mean':'avg_price'}) 63 | 64 | -------------------------------------------------------------------------------- /Scripts/airbnb_Ranking_Most_Active_Guests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[5]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[6]: 11 | 12 | 13 | ab = pd.read_excel(r"D:\Learnerea\Tables\airbnb_contacts.xlsx") 14 | ab.head() 15 | 16 | 17 | # In[14]: 18 | 19 | 20 | ab_agg=ab.pivot_table(index='id_guest',values='n_messages',aggfunc='sum').reset_index().sort_values(by='n_messages',ascending=False) 21 | ab_agg 22 | 23 | 24 | # In[21]: 25 | 26 | 27 | ab_agg['rank'] = ab_agg['n_messages'].rank(ascending=False,method='dense') 28 | ab_agg.head(10) 29 | 30 | -------------------------------------------------------------------------------- /Scripts/amazon_interview_question.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | amazon = pd.read_excel(r"D:\Learnerea\Tables\amazon_trans.xlsx") 14 | amazon.head() 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | amazon.sort_values(by=['user_id','created_at'],inplace=True) 21 | amazon.head() 22 | 23 | 24 | # In[8]: 25 | 26 | 27 | amazon['day_diff']=amazon.groupby('user_id')['created_at'].diff() 28 | amazon.head(10) 29 | 30 | 31 | # In[9]: 32 | 33 | 34 | amazon.info() 35 | 36 | 37 | # In[11]: 38 | 39 | 40 | amazon[amazon['day_diff']<= pd.Timedelta('7 days')]['user_id'].unique() 41 | 42 | 43 | # In[ ]: 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Scripts/backward_elimintation_method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "id": "585ac163", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "\n", 17 | "from mlxtend.feature_selection import SequentialFeatureSelector\n", 18 | "\n", 19 | "warnings.filterwarnings('ignore')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 22, 25 | "id": "b3794176", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 23, 35 | "id": "7cca26cd", 36 | "metadata": { 37 | "scrolled": true 38 | }, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 207 | "

5 rows × 31 columns

\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 212 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 213 | "1 1 20.57 17.77 132.90 1326.0 \n", 214 | "2 1 19.69 21.25 130.00 1203.0 \n", 215 | "3 1 11.42 20.38 77.58 386.1 \n", 216 | "4 1 20.29 14.34 135.10 1297.0 \n", 217 | "\n", 218 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 219 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 220 | "1 0.08474 0.07864 0.0869 0.07017 \n", 221 | "2 0.10960 0.15990 0.1974 0.12790 \n", 222 | "3 0.14250 0.28390 0.2414 0.10520 \n", 223 | "4 0.10030 0.13280 0.1980 0.10430 \n", 224 | "\n", 225 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 226 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 227 | "1 0.1812 ... 24.99 23.41 158.80 \n", 228 | "2 0.2069 ... 23.57 25.53 152.50 \n", 229 | "3 0.2597 ... 14.91 26.50 98.87 \n", 230 | "4 0.1809 ... 22.54 16.67 152.20 \n", 231 | "\n", 232 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 233 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 234 | "1 1956.0 0.1238 0.1866 0.2416 \n", 235 | "2 1709.0 0.1444 0.4245 0.4504 \n", 236 | "3 567.7 0.2098 0.8663 0.6869 \n", 237 | "4 1575.0 0.1374 0.2050 0.4000 \n", 238 | "\n", 239 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 240 | "0 0.2654 0.4601 0.11890 \n", 241 | "1 0.1860 0.2750 0.08902 \n", 242 | "2 0.2430 0.3613 0.08758 \n", 243 | "3 0.2575 0.6638 0.17300 \n", 244 | "4 0.1625 0.2364 0.07678 \n", 245 | "\n", 246 | "[5 rows x 31 columns]" 247 | ] 248 | }, 249 | "execution_count": 23, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "df.head()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 24, 261 | "id": "e04fc892", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis',axis=1), df.diagnosis, test_size=0.33, random_state=42)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 25, 271 | "id": "1ce45e07", 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stderr", 276 | "output_type": "stream", 277 | "text": [ 278 | "[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.8s\n", 279 | "\n", 280 | "[2024-04-06 11:15:02] Features: 29/1 -- score: 0.9448735475051265[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 281 | "\n", 282 | "[2024-04-06 11:15:05] Features: 28/1 -- score: 0.9449077238550923[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 283 | "\n", 284 | "[2024-04-06 11:15:08] Features: 27/1 -- score: 0.9553315105946684[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.7s\n", 285 | "\n", 286 | "[2024-04-06 11:15:11] Features: 26/1 -- score: 0.9579630895420369[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.7s\n", 287 | "\n", 288 | "[2024-04-06 11:15:13] Features: 25/1 -- score: 0.9632604237867396[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 289 | "\n", 290 | "[2024-04-06 11:15:16] Features: 24/1 -- score: 0.9632946001367054[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 291 | "\n", 292 | "[2024-04-06 11:15:18] Features: 23/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.5s\n", 293 | "\n", 294 | "[2024-04-06 11:15:21] Features: 22/1 -- score: 0.9658578263841422[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.3s\n", 295 | "\n", 296 | "[2024-04-06 11:15:23] Features: 21/1 -- score: 0.9580314422419687[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 
1.6s\n", 297 | "\n", 298 | "[2024-04-06 11:15:25] Features: 20/1 -- score: 0.9606630211893371[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 299 | "\n", 300 | "[2024-04-06 11:15:27] Features: 19/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 301 | "\n", 302 | "[2024-04-06 11:15:29] Features: 18/1 -- score: 0.9684894053315105[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 303 | "\n", 304 | "[2024-04-06 11:15:30] Features: 17/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 305 | "[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 306 | "\n", 307 | "[2024-04-06 11:15:32] Features: 16/1 -- score: 0.9579972658920027\n", 308 | "[2024-04-06 11:15:34] Features: 15/1 -- score: 0.9606288448393713\n", 309 | "[2024-04-06 11:15:35] Features: 14/1 -- score: 0.9658578263841422\n", 310 | "[2024-04-06 11:15:37] Features: 13/1 -- score: 0.9632604237867396\n", 311 | "[2024-04-06 11:15:38] Features: 12/1 -- score: 0.9606288448393713\n", 312 | "[2024-04-06 11:15:39] Features: 11/1 -- score: 0.9606288448393713\n", 313 | "[2024-04-06 11:15:40] Features: 10/1 -- score: 0.9579972658920027\n", 314 | "[2024-04-06 11:15:41] Features: 9/1 -- score: 0.9606288448393713\n", 315 | "[2024-04-06 11:15:42] Features: 8/1 -- score: 0.960628844839371\n", 316 | "[2024-04-06 11:15:43] Features: 7/1 -- score: 0.9632262474367739\n", 317 | "[2024-04-06 11:15:44] Features: 6/1 -- score: 0.9606288448393713\n", 318 | "[2024-04-06 11:15:44] Features: 5/1 -- score: 0.9396445659603554\n", 319 | "[2024-04-06 11:15:44] Features: 4/1 -- score: 0.9396103896103896\n", 320 | "[2024-04-06 11:15:45] Features: 3/1 -- score: 0.902904989747095\n", 321 | "[2024-04-06 11:15:45] Features: 2/1 -- score: 0.8845181134654819\n", 322 | "[2024-04-06 11:15:45] Features: 1/1 -- score: 0.8556732740943268" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "feature_selector = SequentialFeatureSelector(LogisticRegression(),\n", 328 | "# k_features=5,\n", 329 | " forward=False,\n", 330 | " verbose=5,\n", 331 | " scoring='accuracy',\n", 332 | " cv=5\n", 333 | " ).fit(X_train,y_train)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 18, 339 | "id": "604c2cfe", 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "('area_se',)" 346 | ] 347 | }, 348 | "execution_count": 18, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "feature_selector.k_feature_names_" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3 (ipykernel)", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.9.7" 375 | } 376 | }, 377 | "nbformat": 4, 378 | "nbformat_minor": 5 379 | } 380 | -------------------------------------------------------------------------------- /Scripts/ensemble_model.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/ensemble_model.zip -------------------------------------------------------------------------------- /Scripts/facebook_Acceptance_Rate_By_Date.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | fb = pd.read_excel(r"D:\Learnerea\Tables\fb_friend_requests.xlsx").sort_values(['user_id_sender','date']) 14 | fb 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | sent = fb.query("action=='sent'") 21 | sent 22 | 23 | 24 | # In[4]: 25 | 26 | 27 | accepted = fb.query("action=='accepted'") 28 | accepted 29 | 30 | 31 | # In[6]: 32 | 33 | 34 | combined = sent.merge(accepted,on=['user_id_sender','user_id_receiver'],how='left') 35 | combined 36 | 37 | 38 | # In[14]: 39 | 40 | 41 | summary = combined.pivot_table(index='date_x',values=['action_x','action_y'],aggfunc='count').reset_index() 42 | summary['acceptance_rate'] = summary.action_y/summary.action_x 43 | summary[['date_x','acceptance_rate']] 44 | 45 | -------------------------------------------------------------------------------- /Scripts/facebook_interview_question.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | fb = pd.read_excel(r"D:\Learnerea\Tables\facebook_web_log.xlsx") 14 | fb.head() 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | fb = fb.query("action in ('page_load','page_exit')") 21 | fb['page_exit'] = fb.apply(lambda x: x['timestamp'] if x['action']=='page_exit' else None, axis=1) 22 | fb['page_load'] = fb.apply(lambda x: x['timestamp'] if x['action']=='page_load' else None, axis=1) 23 | fb['page_loadN']=fb['page_load'].fillna(method='ffill') 24 | fb = fb.dropna(subset=['page_exit']) 25 | 26 | 27 | # In[4]: 28 | 29 | 30 | fb = fb[['user_id','page_exit','page_loadN']] 31 | fb['sec_diff'] = (fb['page_exit']-fb['page_loadN']).dt.total_seconds() 32 | fb 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | fb.pivot_table(index='user_id',values='sec_diff',aggfunc='mean').reset_index() 39 | 40 | -------------------------------------------------------------------------------- /Scripts/feature_selection_with_LASSO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "f9eeffca", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import Lasso, Ridge \n", 15 | "from sklearn.preprocessing import StandardScaler\n", 16 | "\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "ce524d58", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 194 | "

5 rows × 31 columns

\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 199 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 200 | "1 1 20.57 17.77 132.90 1326.0 \n", 201 | "2 1 19.69 21.25 130.00 1203.0 \n", 202 | "3 1 11.42 20.38 77.58 386.1 \n", 203 | "4 1 20.29 14.34 135.10 1297.0 \n", 204 | "\n", 205 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 206 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 207 | "1 0.08474 0.07864 0.0869 0.07017 \n", 208 | "2 0.10960 0.15990 0.1974 0.12790 \n", 209 | "3 0.14250 0.28390 0.2414 0.10520 \n", 210 | "4 0.10030 0.13280 0.1980 0.10430 \n", 211 | "\n", 212 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 213 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 214 | "1 0.1812 ... 24.99 23.41 158.80 \n", 215 | "2 0.2069 ... 23.57 25.53 152.50 \n", 216 | "3 0.2597 ... 14.91 26.50 98.87 \n", 217 | "4 0.1809 ... 22.54 16.67 152.20 \n", 218 | "\n", 219 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 220 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 221 | "1 1956.0 0.1238 0.1866 0.2416 \n", 222 | "2 1709.0 0.1444 0.4245 0.4504 \n", 223 | "3 567.7 0.2098 0.8663 0.6869 \n", 224 | "4 1575.0 0.1374 0.2050 0.4000 \n", 225 | "\n", 226 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 227 | "0 0.2654 0.4601 0.11890 \n", 228 | "1 0.1860 0.2750 0.08902 \n", 229 | "2 0.2430 0.3613 0.08758 \n", 230 | "3 0.2575 0.6638 0.17300 \n", 231 | "4 0.1625 0.2364 0.07678 \n", 232 | "\n", 233 | "[5 rows x 31 columns]" 234 | ] 235 | }, 236 | "execution_count": 2, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 243 | "df.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 3, 249 | "id": "47d76b81", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X = df.drop('diagnosis',axis=1)\n", 254 | "y = df.diagnosis" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 4, 260 | "id": "a82c4233", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# Scaling the data\n", 265 | "scaler = StandardScaler()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 5, 271 | "id": "1d8cd46e", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "X_scaled = scaler.fit_transform(X)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "id": "842e7cd9", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 7, 291 | "id": "f9b65651", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "lasso_instance = Lasso(alpha=0.1)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 8, 301 | "id": "74cd0861", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/html": [ 307 | "
" 308 | ], 309 | "text/plain": [ 310 | "Lasso(alpha=0.1)" 311 | ] 312 | }, 313 | "execution_count": 8, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "lasso_instance.fit(X_train,y_train)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 9, 325 | "id": "753cc8f8", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "array([ 0. , 0. , 0. , 0. , 0. ,\n", 332 | " 0. , 0. , 0.03562742, 0. , -0. ,\n", 333 | " 0. , 0. , 0. , 0. , 0. ,\n", 334 | " 0. , 0. , 0. , 0. , -0. ,\n", 335 | " 0.11010092, 0.00635015, 0. , 0. , 0. ,\n", 336 | " 0. , 0. , 0.1696524 , 0. , 0. ])" 337 | ] 338 | }, 339 | "execution_count": 9, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "lasso_instance.coef_" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 33, 351 | "id": "0d85bd10", 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "['concave points_mean',\n", 358 | " 'radius_worst',\n", 359 | " 'texture_worst',\n", 360 | " 'concave points_worst']" 361 | ] 362 | }, 363 | "execution_count": 33, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "feature_importance = [feature for feature, coef in zip(X.columns,lasso_instance.coef_) if coef!=0]\n", 370 | "feature_importance" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 35, 376 | "id": "795d8619", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/html": [ 382 | "
\n", 383 | "\n", 396 | "\n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | "
featurecoef
20radius_worst0.295662
7concave points_mean0.235560
26concavity_worst0.174722
10radius_se0.173617
21texture_worst0.131943
0radius_mean0.105438
17concave points_se0.098285
25compactness_worst0.097376
2perimeter_mean0.072313
28symmetry_worst0.070803
14smoothness_se0.066666
6concavity_mean0.047451
29fractal_dimension_worst0.024626
4smoothness_mean0.023071
19fractal_dimension_se0.010603
12perimeter_se0.006057
9fractal_dimension_mean0.003472
18symmetry_se-0.012820
8symmetry_mean-0.015663
22perimeter_worst-0.020264
24smoothness_worst-0.020566
15compactness_se-0.029187
1texture_mean-0.039028
11texture_se-0.048607
27concave points_worst-0.093311
13area_se-0.113672
16concavity_se-0.129800
3area_mean-0.133755
23area_worst-0.208526
5compactness_mean-0.237037
\n", 557 | "
" 558 | ], 559 | "text/plain": [ 560 | " feature coef\n", 561 | "20 radius_worst 0.295662\n", 562 | "7 concave points_mean 0.235560\n", 563 | "26 concavity_worst 0.174722\n", 564 | "10 radius_se 0.173617\n", 565 | "21 texture_worst 0.131943\n", 566 | "0 radius_mean 0.105438\n", 567 | "17 concave points_se 0.098285\n", 568 | "25 compactness_worst 0.097376\n", 569 | "2 perimeter_mean 0.072313\n", 570 | "28 symmetry_worst 0.070803\n", 571 | "14 smoothness_se 0.066666\n", 572 | "6 concavity_mean 0.047451\n", 573 | "29 fractal_dimension_worst 0.024626\n", 574 | "4 smoothness_mean 0.023071\n", 575 | "19 fractal_dimension_se 0.010603\n", 576 | "12 perimeter_se 0.006057\n", 577 | "9 fractal_dimension_mean 0.003472\n", 578 | "18 symmetry_se -0.012820\n", 579 | "8 symmetry_mean -0.015663\n", 580 | "22 perimeter_worst -0.020264\n", 581 | "24 smoothness_worst -0.020566\n", 582 | "15 compactness_se -0.029187\n", 583 | "1 texture_mean -0.039028\n", 584 | "11 texture_se -0.048607\n", 585 | "27 concave points_worst -0.093311\n", 586 | "13 area_se -0.113672\n", 587 | "16 concavity_se -0.129800\n", 588 | "3 area_mean -0.133755\n", 589 | "23 area_worst -0.208526\n", 590 | "5 compactness_mean -0.237037" 591 | ] 592 | }, 593 | "execution_count": 35, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "features_importance_df.sort_values(by='coef',ascending=False)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "id": "30d2539f", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [] 609 | } 610 | ], 611 | "metadata": { 612 | "kernelspec": { 613 | "display_name": "Python 3 (ipykernel)", 614 | "language": "python", 615 | "name": "python3" 616 | }, 617 | "language_info": { 618 | "codemirror_mode": { 619 | "name": "ipython", 620 | "version": 3 621 | }, 622 | "file_extension": ".py", 623 | "mimetype": "text/x-python", 624 | "name": "python", 625 | "nbconvert_exporter": "python", 626 | "pygments_lexer": "ipython3", 627 | "version": "3.9.7" 628 | } 629 | }, 630 | "nbformat": 4, 631 | "nbformat_minor": 5 632 | } 633 | -------------------------------------------------------------------------------- /Scripts/feature_selection_with_elasticnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "id": "ba07c8db", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import ElasticNet\n", 15 | "from sklearn.preprocessing import StandardScaler\n", 16 | "\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 30, 24 | "id": "df1b1eba", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 194 | "

5 rows × 31 columns

\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 199 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 200 | "1 1 20.57 17.77 132.90 1326.0 \n", 201 | "2 1 19.69 21.25 130.00 1203.0 \n", 202 | "3 1 11.42 20.38 77.58 386.1 \n", 203 | "4 1 20.29 14.34 135.10 1297.0 \n", 204 | "\n", 205 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 206 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 207 | "1 0.08474 0.07864 0.0869 0.07017 \n", 208 | "2 0.10960 0.15990 0.1974 0.12790 \n", 209 | "3 0.14250 0.28390 0.2414 0.10520 \n", 210 | "4 0.10030 0.13280 0.1980 0.10430 \n", 211 | "\n", 212 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 213 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 214 | "1 0.1812 ... 24.99 23.41 158.80 \n", 215 | "2 0.2069 ... 23.57 25.53 152.50 \n", 216 | "3 0.2597 ... 14.91 26.50 98.87 \n", 217 | "4 0.1809 ... 22.54 16.67 152.20 \n", 218 | "\n", 219 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 220 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 221 | "1 1956.0 0.1238 0.1866 0.2416 \n", 222 | "2 1709.0 0.1444 0.4245 0.4504 \n", 223 | "3 567.7 0.2098 0.8663 0.6869 \n", 224 | "4 1575.0 0.1374 0.2050 0.4000 \n", 225 | "\n", 226 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 227 | "0 0.2654 0.4601 0.11890 \n", 228 | "1 0.1860 0.2750 0.08902 \n", 229 | "2 0.2430 0.3613 0.08758 \n", 230 | "3 0.2575 0.6638 0.17300 \n", 231 | "4 0.1625 0.2364 0.07678 \n", 232 | "\n", 233 | "[5 rows x 31 columns]" 234 | ] 235 | }, 236 | "execution_count": 30, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 243 | "df.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 44, 249 | "id": "4c203fad", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X = df.drop('diagnosis',axis=1)\n", 254 | "y = df.diagnosis" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 91, 260 | "id": "a7e5715a", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "scaler = StandardScaler()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 93, 270 | "id": "79f95af2", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "X_scaled = scaler.fit_transform(X)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 94, 280 | "id": "60d530ad", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 95, 290 | "id": "0dadfa19", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "elasticReg = ElasticNet(alpha=0.1,l1_ratio=0.5,random_state=42)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 96, 300 | "id": "9109c9bb", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
" 307 | ], 308 | "text/plain": [ 309 | "ElasticNet(alpha=0.1, random_state=42)" 310 | ] 311 | }, 312 | "execution_count": 96, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "elasticReg.fit(X_train,y_train)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 97, 324 | "id": "f8269ac7", 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "array([ 0. , 0. , 0. , 0. , 0. ,\n", 331 | " 0. , 0. , 0.07817021, 0. , -0. ,\n", 332 | " 0. , 0. , 0. , 0. , 0. ,\n", 333 | " 0. , -0. , 0. , 0. , -0. ,\n", 334 | " 0.08932816, 0.04304261, 0.04480352, 0. , 0. ,\n", 335 | " 0. , 0.01540593, 0.11816886, 0.01985054, 0. ])" 336 | ] 337 | }, 338 | "execution_count": 97, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "elasticReg.coef_" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 99, 350 | "id": "bb6c325a", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/html": [ 356 | "
\n", 357 | "\n", 370 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | "
featuresimportance
27concave points_worst0.118169
20radius_worst0.089328
7concave points_mean0.078170
22perimeter_worst0.044804
21texture_worst0.043043
28symmetry_worst0.019851
26concavity_worst0.015406
0radius_mean0.000000
16concavity_se-0.000000
25compactness_worst0.000000
24smoothness_worst0.000000
23area_worst0.000000
19fractal_dimension_se-0.000000
18symmetry_se0.000000
17concave points_se0.000000
15compactness_se0.000000
1texture_mean0.000000
14smoothness_se0.000000
13area_se0.000000
12perimeter_se0.000000
11texture_se0.000000
10radius_se0.000000
9fractal_dimension_mean-0.000000
8symmetry_mean0.000000
6concavity_mean0.000000
5compactness_mean0.000000
4smoothness_mean0.000000
3area_mean0.000000
2perimeter_mean0.000000
29fractal_dimension_worst0.000000
\n", 531 | "
" 532 | ], 533 | "text/plain": [ 534 | " features importance\n", 535 | "27 concave points_worst 0.118169\n", 536 | "20 radius_worst 0.089328\n", 537 | "7 concave points_mean 0.078170\n", 538 | "22 perimeter_worst 0.044804\n", 539 | "21 texture_worst 0.043043\n", 540 | "28 symmetry_worst 0.019851\n", 541 | "26 concavity_worst 0.015406\n", 542 | "0 radius_mean 0.000000\n", 543 | "16 concavity_se -0.000000\n", 544 | "25 compactness_worst 0.000000\n", 545 | "24 smoothness_worst 0.000000\n", 546 | "23 area_worst 0.000000\n", 547 | "19 fractal_dimension_se -0.000000\n", 548 | "18 symmetry_se 0.000000\n", 549 | "17 concave points_se 0.000000\n", 550 | "15 compactness_se 0.000000\n", 551 | "1 texture_mean 0.000000\n", 552 | "14 smoothness_se 0.000000\n", 553 | "13 area_se 0.000000\n", 554 | "12 perimeter_se 0.000000\n", 555 | "11 texture_se 0.000000\n", 556 | "10 radius_se 0.000000\n", 557 | "9 fractal_dimension_mean -0.000000\n", 558 | "8 symmetry_mean 0.000000\n", 559 | "6 concavity_mean 0.000000\n", 560 | "5 compactness_mean 0.000000\n", 561 | "4 smoothness_mean 0.000000\n", 562 | "3 area_mean 0.000000\n", 563 | "2 perimeter_mean 0.000000\n", 564 | "29 fractal_dimension_worst 0.000000" 565 | ] 566 | }, 567 | "execution_count": 99, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "featureImp = [(feature,coef) for feature, coef in zip(X.columns,elasticReg.coef_)]\n", 574 | "impDF = pd.DataFrame(featureImp,columns=['features','importance'])\n", 575 | "impDF.sort_values(by='importance',ascending=False)" 576 | ] 577 | } 578 | ], 579 | "metadata": { 580 | "kernelspec": { 581 | "display_name": "Python 3 (ipykernel)", 582 | "language": "python", 583 | "name": "python3" 584 | }, 585 | "language_info": { 586 | "codemirror_mode": { 587 | "name": "ipython", 588 | "version": 3 589 | }, 590 | "file_extension": ".py", 591 | "mimetype": "text/x-python", 592 | "name": "python", 593 | "nbconvert_exporter": "python", 594 | "pygments_lexer": "ipython3", 595 | "version": "3.9.7" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 5 600 | } 601 | -------------------------------------------------------------------------------- /Scripts/google_interview_question_Medium.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | 7 | import pandas as pd 8 | 9 | pd.options.display.max_colwidth = 1000 10 | 11 | 12 | # In[4]: 13 | 14 | 15 | df = pd.read_excel(r"D:\Learnerea\Tables\google_file_store.xlsx") 16 | df 17 | 18 | 19 | # In[12]: 20 | 21 | 22 | df['bull'] = df['contents'].apply(lambda x:x.count('bull')) 23 | df['bear'] = df['contents'].apply(lambda x:x.count('bear')) 24 | df['word'] = 'netry' 25 | df 26 | 27 | 28 | # In[16]: 29 | 30 | 31 | df.pivot_table(values=['bull','bear'],columns='word',aggfunc='sum').reset_index().sort_values(by='netry',ascending=False) 32 | 33 | -------------------------------------------------------------------------------- /Scripts/gradient_boosint_complete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[31]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.ensemble import GradientBoostingRegressor 11 | from sklearn.model_selection import GridSearchCV 12 | 13 | 14 | # In[2]: 15 | 16 | 17 | data = {"Income_K":[75,67,70,90,80] 18 | 
,"Profession":["Salaried","Business","Salaried","Business","Salaried"] 19 | ,"Loan_Lakhs":[3,2,5,7,4]} 20 | 21 | df = pd.DataFrame(data) 22 | df 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | le = LabelEncoder() 29 | 30 | 31 | # In[5]: 32 | 33 | 34 | df['prof_enc'] = le.fit_transform(df['Profession']) 35 | df 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | x = df[['Income_K','prof_enc']] 42 | y = df['Loan_Lakhs'] 43 | 44 | 45 | # In[27]: 46 | 47 | 48 | model = GradientBoostingRegressor(n_estimators=5) 49 | 50 | 51 | # In[28]: 52 | 53 | 54 | model.fit(x,y) 55 | 56 | 57 | # In[29]: 58 | 59 | 60 | prediction = model.predict(x) 61 | 62 | 63 | # In[30]: 64 | 65 | 66 | sum((y-prediction)**2)/len(y) 67 | 68 | 69 | # In[32]: 70 | 71 | 72 | params = {'n_estimators':range(1,200)} 73 | grid = GridSearchCV(param_grid=params,estimator=model,cv=2, scoring='neg_mean_squared_error') 74 | 75 | 76 | # In[33]: 77 | 78 | 79 | grid.fit(x,y) 80 | 81 | 82 | # In[34]: 83 | 84 | 85 | grid.best_estimator_ 86 | 87 | 88 | # In[35]: 89 | 90 | 91 | gb = grid.best_estimator_ 92 | 93 | 94 | # In[36]: 95 | 96 | 97 | gb.fit(x,y) 98 | 99 | 100 | # In[37]: 101 | 102 | 103 | preodiction = gb.predict(x) 104 | 105 | 106 | # In[38]: 107 | 108 | 109 | sum((y-prediction)**2)/len(y) 110 | 111 | -------------------------------------------------------------------------------- /Scripts/logisticPDmodel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/logisticPDmodel.pkl -------------------------------------------------------------------------------- /Scripts/smote_logistic.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/smote_logistic.pkl -------------------------------------------------------------------------------- /Scripts/testing.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Scripts/time_series_air_passengers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | sns.get_dataset_names() 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | df = sns.load_dataset('flights') 23 | df['yearMonth'] = pd.to_datetime("01-"+df['month'].astype(str)+"-"+df['year'].astype(str)) 24 | df.set_index('yearMonth',inplace=True) 25 | df.head() 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | plt.figure(figsize=(10,5)) 32 | sns.lineplot(data=df,x=df.index,y=df.passengers) 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df['rollMean'] = df.passengers.rolling(window=12).mean() 39 | df['rollStd'] = df.passengers.rolling(window=12).std() 40 | 41 | 42 | # In[6]: 43 | 44 | 45 | plt.figure(figsize=(10,5)) 46 | sns.lineplot(data=df,x=df.index,y=df.passengers) 47 | sns.lineplot(data=df,x=df.index,y=df.rollMean) 48 | sns.lineplot(data=df,x=df.index,y=df.rollStd) 49 | 50 | 51 | # In[7]: 52 | 53 | 54 | from statsmodels.tsa.stattools import adfuller 55 | 56 | 57 | # In[8]: 58 | 59 | 60 | adfTest = adfuller(df['passengers'],autolag='AIC',) 61 | 62 | 63 | # In[9]: 64 | 65 | 66 | adfTest 67 | 68 | 69 | # In[10]: 70 | 71 | 72 | stats = 
pd.Series(adfTest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used']) 73 | stats 74 | 75 | 76 | # In[11]: 77 | 78 | 79 | for key, values in adfTest[4].items(): 80 | print('criticality',key,":",values) 81 | 82 | 83 | # In[12]: 84 | 85 | 86 | def test_stationarity(dataFrame, var): 87 | dataFrame['rollMean'] = dataFrame[var].rolling(window=12).mean() 88 | dataFrame['rollStd'] = dataFrame[var].rolling(window=12).std() 89 | 90 | from statsmodels.tsa.stattools import adfuller 91 | adfTest = adfuller(dataFrame[var],autolag='AIC') 92 | stats = pd.Series(adfTest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used']) 93 | print(stats) 94 | 95 | for key, values in adfTest[4].items(): 96 | print('criticality',key,":",values) 97 | 98 | sns.lineplot(data=dataFrame,x=dataFrame.index,y=var) 99 | sns.lineplot(data=dataFrame,x=dataFrame.index,y='rollMean') 100 | sns.lineplot(data=dataFrame,x=dataFrame.index,y='rollStd') 101 | 102 | 103 | # In[13]: 104 | 105 | 106 | air_df = df[['passengers']] 107 | air_df.head() 108 | 109 | 110 | # In[14]: 111 | 112 | 113 | # time shift 114 | 115 | air_df['shift'] = air_df.passengers.shift() 116 | air_df['shiftDiff'] = air_df['passengers'] - air_df['shift'] 117 | air_df.head() 118 | 119 | 120 | # In[15]: 121 | 122 | 123 | test_stationarity(air_df.dropna(),'shiftDiff') 124 | 125 | 126 | # In[16]: 127 | 128 | 129 | log_df = df[['passengers']] 130 | log_df['log'] = np.log(log_df['passengers']) 131 | log_df.head() 132 | 133 | 134 | # In[17]: 135 | 136 | 137 | test_stationarity(log_df,'log') 138 | 139 | 140 | # In[18]: 141 | 142 | 143 | sqrt_df = df[['passengers']] 144 | sqrt_df['sqrt'] = np.sqrt(df['passengers']) 145 | sqrt_df.head() 146 | 147 | 148 | # In[19]: 149 | 150 | 151 | test_stationarity(sqrt_df,'sqrt') 152 | 153 | 154 | # In[20]: 155 | 156 | 157 | cbrt_df = df[['passengers']] 158 | cbrt_df['cbrt'] = np.cbrt(cbrt_df['passengers']) 159 | cbrt_df.head() 160 | 161 | 162 | # In[21]: 163 | 164 | 165 | test_stationarity(cbrt_df,'cbrt') 166 | 167 | 168 | # In[22]: 169 | 170 | 171 | log_df2 = log_df[['passengers','log']] 172 | log_df2['log_sqrt'] = np.sqrt(log_df['log']) 173 | log_df2['logShiftDiff'] = log_df2['log_sqrt'] - log_df2['log_sqrt'].shift() 174 | log_df2.head() 175 | 176 | 177 | # In[23]: 178 | 179 | 180 | test_stationarity(log_df2.dropna(),'logShiftDiff') 181 | 182 | 183 | # In[24]: 184 | 185 | 186 | log_shift = df[['passengers']].copy(deep=True) 187 | log_shift['log'] = np.log(log_shift['passengers']) 188 | log_shift['logShift'] = log_shift['log'].shift() 189 | log_shift['logShiftDiff'] = log_shift['log'] - log_shift['logShift'] 190 | log_shift.head() 191 | 192 | 193 | # In[25]: 194 | 195 | 196 | test_stationarity(log_shift.dropna(),'logShiftDiff') 197 | 198 | 199 | # # Next - 2 200 | 201 | # In[80]: 202 | 203 | 204 | airP = df[['passengers']].copy(deep=True) 205 | airP['firstDiff'] = airP['passengers'].diff() 206 | airP['Diff12'] = airP['passengers'].diff(12) 207 | 208 | 209 | # In[81]: 210 | 211 | 212 | airP.head() 213 | 214 | 215 | # In[84]: 216 | 217 | 218 | from statsmodels.tsa.arima_model import ARIMA 219 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 220 | 221 | 222 | # In[86]: 223 | 224 | 225 | plot_pacf(airP['firstDiff'].dropna(),lags=20); 226 | 227 | 228 | # In[87]: 229 | 230 | 231 | plot_acf(airP['firstDiff'].dropna(),lags=20); 232 | 233 | 234 | # In[88]: 235 | 236 | 237 | # p = 1, q = 3, d =1 238 | 239 | 240 | # In[94]: 241 | 242 | 243 | train = airP[:round(len(airP)*70/100)] 
244 | test = airP[round(len(airP)*70/100):] 245 | test.head() 246 | 247 | 248 | # In[100]: 249 | 250 | 251 | model = ARIMA(train['passengers'],order=(1,1,3)) 252 | model_fit = model.fit() 253 | prediction = model_fit.predict(start=test.index[0],end=test.index[-1]) 254 | airP['arimaPred'] = prediction 255 | airP.tail() 256 | 257 | 258 | # In[121]: 259 | 260 | 261 | airP.dropna() 262 | sns.lineplot(data=airP,x=airP.index,y='passengers') 263 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 264 | 265 | 266 | # In[122]: 267 | 268 | 269 | from sklearn.metrics import mean_squared_error 270 | 271 | 272 | # In[124]: 273 | 274 | 275 | np.sqrt(mean_squared_error(test['passengers'],prediction)) 276 | 277 | 278 | # In[126]: 279 | 280 | 281 | from statsmodels.tsa.statespace.sarimax import SARIMAX 282 | 283 | 284 | # In[141]: 285 | 286 | 287 | plot_pacf(airP['Diff12'].dropna(),lags=20); 288 | plot_acf(airP['Diff12'].dropna(),lags=20); 289 | 290 | 291 | # In[147]: 292 | 293 | 294 | model = SARIMAX(train['passengers'],order=(1,1,3),seasonal_order=(2,1,2,12)) 295 | model_fit = model.fit() 296 | prediction = model_fit.predict(start=test.index[0],end=test.index[-1]) 297 | airP['sarimaxPred'] = prediction 298 | 299 | 300 | # In[196]: 301 | 302 | 303 | airP.dropna() 304 | sns.lineplot(data=airP,x=airP.index,y='passengers') 305 | sns.lineplot(data=airP,x=airP.index,y='sarimaxPred') 306 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 307 | # model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]).plot(color='black') 308 | 309 | 310 | # In[144]: 311 | 312 | 313 | np.sqrt(mean_squared_error(test['passengers'],prediction)) 314 | 315 | 316 | # In[188]: 317 | 318 | 319 | futureDate = pd.DataFrame(pd.date_range(start='1961-01-01', end='1962-12-01',freq='MS'),columns=['Dates']) 320 | futureDate.set_index('Dates',inplace=True) 321 | futureDate.head() 322 | 323 | 324 | # In[192]: 325 | 326 | 327 | model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]) 328 | 329 | 330 | # In[195]: 331 | 332 | 333 | airP.dropna() 334 | sns.lineplot(data=airP,x=airP.index,y='passengers') 335 | sns.lineplot(data=airP,x=airP.index,y='sarimaxPred') 336 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 337 | model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]).plot(color='black') 338 | 339 | 340 | # # Next - 3 341 | 342 | # In[201]: 343 | 344 | 345 | checkDf = df[['passengers']] 346 | checkDf['diff1'] = checkDf.diff() 347 | # checkDf['diffInv'] = checkDf['diff1'].diffinv() 348 | checkDf.head() 349 | 350 | --------------------------------------------------------------------------------
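A note on the commented-out diffInv line in the final cell of time_series_air_passengers.py: pandas Series have no diffinv() method, so the first difference has to be inverted manually. A minimal sketch, assuming it runs right after that last cell so checkDf already holds the 'passengers' and 'diff1' columns (this helper is not part of the original script):

# Seed the leading NaN of the differenced series with the original starting
# value, then take a cumulative sum to rebuild the level series.
checkDf['diffInv'] = checkDf['diff1'].fillna(checkDf['passengers'].iloc[0]).cumsum()
# checkDf['diffInv'] should now reproduce checkDf['passengers'] row for row.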