├── Data ├── Salary.xlsx ├── Telco_data_chi2_test.csv ├── airbnb_contacts.xlsx ├── airbnb_host_searches.xlsx ├── cancer_data.csv ├── cancer_data_cleaned.xlsx ├── credit_risk_dataset.csv ├── dataforAnalysis.xlsx ├── facebook_web_log.xlsx ├── fb_friend_requests.xlsx ├── google_file_store.xlsx ├── loan_data_sample.xlsx ├── stratascratch_amazon_active_returning_customers.xlsx ├── stratascratch_amazon_pct_diff.xlsx ├── testing.txt ├── to_build_the_model.xlsx └── to_test_the_model.xlsx └── Scripts ├── Amazon_interview_question_Hard.py ├── Bag_Of_Words.py ├── Credit_Risk_PD_Model.ipynb ├── Decision_Tree.py ├── Exploratory Data Analysis - Loan Defaulter Segmentation.py ├── FastText_Setup_Prediction.py ├── Feature Selection with ANOVA Method.ipynb ├── Feature Selection with Chi2 Method.ipynb ├── Feature Selection with Correlation Method.ipynb ├── FraudDetectionModelDataPrep.py ├── GloVe.py ├── LinearRegression_Live.py ├── LogisticLive.py ├── NER.py ├── PD_credit_risk_model_dev.py ├── Payment_Fraud_Detection_Model_Logistic_Reg.py ├── Payment_Fraud_Detection_Model_Logistic_and_Decision_Tree.py ├── PoS.py ├── PreProcessing_PD_Model.ipynb ├── PreProcessing_PD_Model.py ├── RandomForesPDmodel.zip ├── Recursive Feature Elimination.ipynb ├── UK_PM_Sentiment_Analysis.py ├── Understand_differences_in_numbers.ipynb ├── UsingAutomatedModulesAndModels.ipynb ├── XGBpdModel.pkl ├── airbnb_Host_Popularity_Rental_Prices_Hard.py ├── airbnb_Ranking_Most_Active_Guests.py ├── amazon_interview_question.py ├── backward_elimintation_method.ipynb ├── balancing_live.ipynb ├── ensemble_model.zip ├── facebook_Acceptance_Rate_By_Date.py ├── facebook_interview_question.py ├── feature_selection_with_LASSO.ipynb ├── feature_selection_with_Ridge_Regression.ipynb ├── feature_selection_with_decision_tree_or_random_forest.ipynb ├── feature_selection_with_elasticnet.ipynb ├── forward_selection_method.ipynb ├── google_interview_question_Medium.py ├── gradient_boosint_complete.py ├── logisticPDmodel.pkl ├── smote_logistic.pkl ├── testing.txt └── time_series_air_passengers.py /Data/Salary.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/Salary.xlsx -------------------------------------------------------------------------------- /Data/airbnb_contacts.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/airbnb_contacts.xlsx -------------------------------------------------------------------------------- /Data/airbnb_host_searches.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/airbnb_host_searches.xlsx -------------------------------------------------------------------------------- /Data/cancer_data_cleaned.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/cancer_data_cleaned.xlsx -------------------------------------------------------------------------------- /Data/dataforAnalysis.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/dataforAnalysis.xlsx 
-------------------------------------------------------------------------------- /Data/facebook_web_log.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/facebook_web_log.xlsx -------------------------------------------------------------------------------- /Data/fb_friend_requests.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/fb_friend_requests.xlsx -------------------------------------------------------------------------------- /Data/google_file_store.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/google_file_store.xlsx -------------------------------------------------------------------------------- /Data/loan_data_sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/loan_data_sample.xlsx -------------------------------------------------------------------------------- /Data/stratascratch_amazon_active_returning_customers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/stratascratch_amazon_active_returning_customers.xlsx -------------------------------------------------------------------------------- /Data/stratascratch_amazon_pct_diff.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/stratascratch_amazon_pct_diff.xlsx -------------------------------------------------------------------------------- /Data/testing.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/to_build_the_model.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/to_build_the_model.xlsx -------------------------------------------------------------------------------- /Data/to_test_the_model.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Data/to_test_the_model.xlsx -------------------------------------------------------------------------------- /Scripts/Amazon_interview_question_Hard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | amazon = pd.read_excel(r"D:\Learnerea\Tables\amazon_pct_diff.xlsx") 9 | amazon.head() 10 | 11 | 12 | # In[14]: 13 | 14 | 15 | amazon['yearMonth'] = amazon['created_at'].dt.strftime('%Y-%m') 16 | amazon_agg = amazon.pivot_table(index='yearMonth',values='value',aggfunc='sum').reset_index() 17 | amazon_agg['prev_month'] = amazon_agg['value'].shift() 18 | amazon_agg['rev_diff'] = round(((amazon_agg['value'] - 
amazon_agg['prev_month'])/amazon_agg['prev_month'])*100,2) 19 | amazon_agg 20 | 21 | -------------------------------------------------------------------------------- /Scripts/Bag_Of_Words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | pd.set_option('display.max_colwidth',0) 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | df = pd.read_csv(r"D:\Learnerea\Tables\imdb_master.csv", encoding="ISO-8859-1").drop(['Unnamed: 0',"file"],axis=1) 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | display(df.shape) 23 | display(df.head()) 24 | display(df.type.value_counts()) 25 | display(df.label.value_counts()) 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | filtered = df.query("label!='unsup'") 32 | 33 | 34 | # In[5]: 35 | 36 | 37 | filtered['label'] = filtered['label'].apply(lambda x: 0 if x == "neg" else 1) 38 | 39 | 40 | # In[6]: 41 | 42 | 43 | filtered.label.value_counts() 44 | 45 | 46 | # # Cleaning starts 47 | 48 | # In[7]: 49 | 50 | 51 | filtered['review_lower'] = filtered['review'].str.lower() 52 | 53 | 54 | # In[8]: 55 | 56 | 57 | from nltk.corpus import stopwords 58 | 59 | 60 | # In[9]: 61 | 62 | 63 | engStopWords = stopwords.words("english") 64 | 65 | 66 | # In[10]: 67 | 68 | 69 | filtered['review_no_stopW'] = filtered['review_lower'].apply(lambda x:" ".join(word for word in x.split() if word not in engStopWords)) 70 | 71 | 72 | # In[11]: 73 | 74 | 75 | # filtered.head(2) 76 | 77 | 78 | # In[12]: 79 | 80 | 81 | from nltk.stem import WordNetLemmatizer 82 | 83 | 84 | # In[13]: 85 | 86 | 87 | lemm = WordNetLemmatizer() 88 | 89 | 90 | # In[14]: 91 | 92 | 93 | filtered['lemmatized_review'] = filtered['review_no_stopW'].apply(lambda x: " ".join(lemm.lemmatize(word) for word in x.split())) 94 | 95 | 96 | # In[15]: 97 | 98 | 99 | filtered.head(1) 100 | 101 | 102 | # In[16]: 103 | 104 | 105 | filtered = filtered.drop(['review','review_lower','review_no_stopW'],axis=1) 106 | 107 | 108 | # In[17]: 109 | 110 | 111 | filtered = filtered.rename({'lemmatized_review':'review'},axis=1) 112 | 113 | 114 | # In[18]: 115 | 116 | 117 | filtered.head() 118 | 119 | 120 | # # Cleaning Ends 121 | 122 | # In[19]: 123 | 124 | 125 | train = filtered.query("type=='train'").drop(['type'],axis=1) 126 | test = filtered.query("type=='test'").drop(['type'],axis=1) 127 | 128 | 129 | # In[20]: 130 | 131 | 132 | x_train = train['review'].values 133 | y_train = train['label'].values 134 | 135 | 136 | # In[21]: 137 | 138 | 139 | x_test = test['review'].values 140 | y_test = test['label'].values 141 | 142 | 143 | # In[22]: 144 | 145 | 146 | from sklearn.feature_extraction.text import CountVectorizer 147 | 148 | 149 | # In[23]: 150 | 151 | 152 | x_train[0] 153 | 154 | 155 | # In[24]: 156 | 157 | 158 | vector = CountVectorizer() 159 | 160 | 161 | # In[28]: 162 | 163 | 164 | trained_vector = vector.transform(x_train) 165 | 166 | 167 | # In[29]: 168 | 169 | 170 | trained_vector[50].toarray()[:50] 171 | 172 | 173 | # In[30]: 174 | 175 | 176 | vector.get_feature_names()[500:][:4] 177 | 178 | 179 | # In[31]: 180 | 181 | 182 | train[train.review.str.contains('20minutes')] 183 | 184 | 185 | # In[32]: 186 | 187 | 188 | train.shape 189 | 190 | 191 | # In[33]: 192 | 193 | 194 | from sklearn.naive_bayes import MultinomialNB 195 | 196 | 197 | # In[34]: 198 | 199 | 200 | model = MultinomialNB() 201 | 202 | 203 | # In[35]: 204 | 205 | 206 | model.fit(trained_vector, y_train) 207 | 208 | 209 | # In[36]: 210 | 211 | 
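# The CountVectorizer must learn its vocabulary from the training reviews before either split can
# be encoded; a minimal sketch assuming the same `vector` and `x_train` objects from the cells above:
trained_vector = vector.fit_transform(x_train)  # fit on the training corpus and encode it; the learned vocabulary is then reused on x_test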
212 | test_vector = vector.transform(x_test) 213 | 214 | 215 | # In[37]: 216 | 217 | 218 | predicted = model.predict(test_vector) 219 | 220 | 221 | # In[38]: 222 | 223 | 224 | from sklearn.metrics import classification_report 225 | 226 | 227 | # In[39]: 228 | 229 | 230 | print(classification_report(y_test,predicted)) 231 | 232 | 233 | # In[76]: 234 | 235 | 236 | final_sentiment = pd.DataFrame(zip(x_test,y_test,predicted),columns=['review','act_sent','pred_sent']) 237 | final_sentiment.head(2) 238 | 239 | 240 | # In[78]: 241 | 242 | 243 | final_sentiment.query("act_sent != pred_sent").head(2) 244 | 245 | 246 | # In[85]: 247 | 248 | 249 | review = np.array(["that movie was aweful"]) 250 | 251 | 252 | # In[86]: 253 | 254 | 255 | text_to_pred = vector.transform(review) 256 | 257 | 258 | # In[87]: 259 | 260 | 261 | model.predict(text_to_pred) 262 | 263 | 264 | # In[100]: 265 | 266 | 267 | def sentiment_check(input_text): 268 | test = np.array(input_text) 269 | text_to_pred = vector.transform(test) 270 | predicted_val = model.predict(text_to_pred) 271 | if predicted_val == 1: 272 | print("the review is POSITIVE") 273 | else: 274 | print("the review is NEGATIVE") 275 | 276 | 277 | # In[102]: 278 | 279 | 280 | sentiment_check(['the movie was aweful']) 281 | 282 | -------------------------------------------------------------------------------- /Scripts/Decision_Tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[37]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.metrics import classification_report, confusion_matrix 14 | 15 | 16 | # In[38]: 17 | 18 | 19 | perf_data = pd.read_excel(r"D:\Learnerea\Tables\data_decision_tree.xlsx") 20 | 21 | 22 | # In[39]: 23 | 24 | 25 | perf_data.head() 26 | 27 | 28 | # In[40]: 29 | 30 | 31 | encoder = LabelEncoder() 32 | 33 | 34 | # In[42]: 35 | 36 | 37 | perf_data['cert_encoded'] = encoder.fit_transform(perf_data['Certification']) 38 | perf_data['proj_encoded'] = encoder.fit_transform(perf_data['Projects']) 39 | perf_data['supt_encoded'] = encoder.fit_transform(perf_data['Supported_other_projects']) 40 | perf_data_new = perf_data.drop(['Certification','Projects','Supported_other_projects'],axis=1) 41 | perf_data_new.head() 42 | 43 | 44 | # In[43]: 45 | 46 | 47 | inputs = perf_data_new.drop(['Hike_ge_20pct'],axis=1) 48 | result = perf_data_new['Hike_ge_20pct'] 49 | 50 | 51 | # In[56]: 52 | 53 | 54 | inputs_train, inputs_test, result_train, result_test = train_test_split(inputs,result,test_size=0.2,random_state=False) 55 | 56 | 57 | # In[57]: 58 | 59 | 60 | myTree = DecisionTreeClassifier() 61 | 62 | 63 | # In[58]: 64 | 65 | 66 | myTree.fit(inputs_train,result_train) 67 | 68 | 69 | # In[63]: 70 | 71 | 72 | prediction=myTree.predict(inputs_test) 73 | 74 | 75 | # In[60]: 76 | 77 | 78 | result_test 79 | 80 | 81 | # In[61]: 82 | 83 | 84 | myTree.score(inputs_test,result_test) 85 | 86 | 87 | # In[67]: 88 | 89 | 90 | cm = confusion_matrix(result_test,prediction) 91 | 92 | 93 | # In[70]: 94 | 95 | 96 | print(classification_report(result_test,prediction)) 97 | 98 | -------------------------------------------------------------------------------- /Scripts/Exploratory Data Analysis - Loan Defaulter Segmentation.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Data import and basic exploration 5 | 6 | # In[1]: 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | 14 | pd.options.display.max_columns = None 15 | pd.options.display.max_rows = None 16 | 17 | 18 | # In[2]: 19 | 20 | 21 | app = pd.read_csv(r"D:\Learnerea\Tables\loan_default_data\application_data.csv") 22 | prev_app = pd.read_csv(r"D:\Learnerea\Tables\loan_default_data\previous_application.csv") 23 | 24 | 25 | # In[3]: 26 | 27 | 28 | app.head() 29 | 30 | 31 | # # Feature selection 32 | 33 | # In[4]: 34 | 35 | 36 | app.columns 37 | 38 | 39 | # In[5]: 40 | 41 | 42 | app.shape 43 | 44 | 45 | # In[6]: 46 | 47 | 48 | msng_info = pd.DataFrame(app.isnull().sum().sort_values()).reset_index() 49 | msng_info.rename(columns={'index':'col_name',0:'null_count'},inplace=True) 50 | msng_info.head() 51 | 52 | 53 | # In[7]: 54 | 55 | 56 | msng_info['msng_pct'] = msng_info['null_count']/app.shape[0]*100 57 | # msng_info.to_excel(r"D:\Learnerea\Tables\loan_default_data\missing_info.xlsx",index=False) 58 | msng_info.head() 59 | 60 | 61 | # In[8]: 62 | 63 | 64 | msng_col = msng_info[msng_info['msng_pct']>=40]['col_name'].to_list() 65 | app_msng_rmvd = app.drop(labels=msng_col,axis=1) 66 | app_msng_rmvd.shape 67 | 68 | 69 | # In[9]: 70 | 71 | 72 | app_msng_rmvd.head() 73 | 74 | 75 | # In[10]: 76 | 77 | 78 | flag_col = [] 79 | 80 | for col in app_msng_rmvd.columns: 81 | if col.startswith("FLAG_"): 82 | flag_col.append(col) 83 | 84 | len(flag_col) 85 | 86 | 87 | # In[11]: 88 | 89 | 90 | flag_col 91 | 92 | 93 | # In[12]: 94 | 95 | 96 | flag_tgt_col = app_msng_rmvd[flag_col+['TARGET']] 97 | flag_tgt_col.head() 98 | 99 | 100 | # In[13]: 101 | 102 | 103 | plt.figure(figsize=(20,25)) 104 | 105 | for i, col in enumerate(flag_col): 106 | plt.subplot(7,4,i+1) 107 | sns.countplot(data=flag_tgt_col,x=col,hue='TARGET') 108 | 109 | 110 | # In[14]: 111 | 112 | 113 | flg_corr = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 114 | 'FLAG_PHONE', 'FLAG_EMAIL','TARGET'] 115 | flag_corr_df = app_msng_rmvd[flg_corr] 116 | 117 | 118 | # In[15]: 119 | 120 | 121 | flag_corr_df.groupby(['FLAG_OWN_CAR']).size() 122 | 123 | 124 | # In[16]: 125 | 126 | 127 | flag_corr_df['FLAG_OWN_CAR'] = flag_corr_df['FLAG_OWN_CAR'].replace({'N':0,'Y':1}) 128 | flag_corr_df['FLAG_OWN_REALTY'] = flag_corr_df['FLAG_OWN_REALTY'].replace({'N':0,'Y':1}) 129 | 130 | flag_corr_df.groupby(['FLAG_OWN_CAR']).size() 131 | 132 | 133 | # In[17]: 134 | 135 | 136 | corr_df = round(flag_corr_df.corr(),2) 137 | 138 | plt.figure(figsize=(10,5)) 139 | sns.heatmap(corr_df,cmap='coolwarm',linewidths=.5,annot=True) 140 | 141 | 142 | # In[18]: 143 | 144 | 145 | app_flag_rmvd = app_msng_rmvd.drop(labels =flag_col,axis=1) 146 | app_flag_rmvd.shape 147 | 148 | 149 | # In[19]: 150 | 151 | 152 | sns.heatmap(data=round(app_flag_rmvd[['EXT_SOURCE_2','EXT_SOURCE_3','TARGET']].corr(),2),cmap='coolwarm',linewidths=.5,annot=True) 153 | 154 | 155 | # In[20]: 156 | 157 | 158 | app_score_col_rmvd = app_flag_rmvd.drop(['EXT_SOURCE_2','EXT_SOURCE_3'],axis=1) 159 | app_score_col_rmvd.shape 160 | 161 | 162 | # # Feature engineering 163 | 164 | # In[21]: 165 | 166 | 167 | app_score_col_rmvd.isnull().sum().sort_values()/app_score_col_rmvd.shape[0] 168 | 169 | 170 | # ### Missing imputation 171 | 172 | # In[22]: 173 | 174 | 175 | 
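# The cells below impute one column at a time: categorical gaps with the mode, numeric gaps with
# the mean or median. A compact sketch of the same idea as a helper (hypothetical name, applied
# to the same app_score_col_rmvd frame):
def fill_missing(df, col, strategy='mode'):
    """Fill NaNs in a single column with its mode, median or mean."""
    if strategy == 'mode':
        value = df[col].mode()[0]
    elif strategy == 'median':
        value = df[col].median()
    else:
        value = df[col].mean()
    df[col] = df[col].fillna(value)
    return df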
app_score_col_rmvd['CNT_FAM_MEMBERS'] = app_score_col_rmvd['CNT_FAM_MEMBERS'].fillna((app_score_col_rmvd['CNT_FAM_MEMBERS'].mode()[0])) 176 | 177 | 178 | # In[23]: 179 | 180 | 181 | app_score_col_rmvd['CNT_FAM_MEMBERS'].isnull().sum() 182 | 183 | 184 | # In[24]: 185 | 186 | 187 | app_score_col_rmvd['OCCUPATION_TYPE'] = app_score_col_rmvd['OCCUPATION_TYPE'].fillna((app_score_col_rmvd['OCCUPATION_TYPE'].mode()[0])) 188 | # app_score_col_rmvd['OCCUPATION_TYPE'].mode()[0] 189 | 190 | 191 | # In[25]: 192 | 193 | 194 | app_score_col_rmvd['OCCUPATION_TYPE'].isnull().sum() 195 | 196 | 197 | # In[26]: 198 | 199 | 200 | app_score_col_rmvd['NAME_TYPE_SUITE'] = app_score_col_rmvd['NAME_TYPE_SUITE'].fillna((app_score_col_rmvd['NAME_TYPE_SUITE'].mode()[0])) 201 | 202 | 203 | # In[27]: 204 | 205 | 206 | app_score_col_rmvd['NAME_TYPE_SUITE'].isnull().sum() 207 | 208 | 209 | # In[28]: 210 | 211 | 212 | app_score_col_rmvd['AMT_ANNUITY'] = app_score_col_rmvd['AMT_ANNUITY'].fillna((app_score_col_rmvd['AMT_ANNUITY'].mean())) 213 | 214 | 215 | # In[29]: 216 | 217 | 218 | app_score_col_rmvd['AMT_ANNUITY'].isnull().sum() 219 | 220 | 221 | # In[30]: 222 | 223 | 224 | # app_score_col_rmvd['AMT_REQ_CREDIT_BUREAU_HOUR'] 225 | 226 | 227 | # In[31]: 228 | 229 | 230 | amt_req_col = [] 231 | 232 | for col in app_score_col_rmvd.columns: 233 | if col.startswith("AMT_REQ_CREDIT_BUREAU"): 234 | amt_req_col.append(col) 235 | 236 | amt_req_col 237 | # app_score_col_rmvd['AMT_REQ_CREDIT_BUREAU_HOUR'].unique() 238 | 239 | 240 | # In[32]: 241 | 242 | 243 | for col in amt_req_col: 244 | app_score_col_rmvd[col] = app_score_col_rmvd[col].fillna((app_score_col_rmvd[col].median())) 245 | 246 | 247 | # In[33]: 248 | 249 | 250 | # app_score_col_rmvd.isnull().sum().sort_values() 251 | 252 | 253 | # In[34]: 254 | 255 | 256 | app_score_col_rmvd['AMT_GOODS_PRICE'] = app_score_col_rmvd['AMT_GOODS_PRICE'].fillna((app_score_col_rmvd['AMT_GOODS_PRICE'].median())) 257 | 258 | 259 | # In[35]: 260 | 261 | 262 | app_score_col_rmvd['AMT_GOODS_PRICE'].isnull().sum() 263 | 264 | 265 | # In[36]: 266 | 267 | 268 | app_score_col_rmvd.head() 269 | 270 | 271 | # ### Value modifiction 272 | 273 | # In[37]: 274 | 275 | 276 | days_col = [] 277 | 278 | for col in app_score_col_rmvd.columns: 279 | if col.startswith("DAYS"): 280 | days_col.append(col) 281 | 282 | days_col 283 | 284 | 285 | # In[38]: 286 | 287 | 288 | for col in days_col: 289 | app_score_col_rmvd[col] = abs(app_score_col_rmvd[col]) 290 | 291 | 292 | # In[39]: 293 | 294 | 295 | app_score_col_rmvd.head() 296 | 297 | 298 | # In[40]: 299 | 300 | 301 | app_score_col_rmvd.nunique().sort_values() 302 | 303 | 304 | # In[41]: 305 | 306 | 307 | app_score_col_rmvd['OBS_30_CNT_SOCIAL_CIRCLE'].unique() 308 | 309 | 310 | # ### Outlier detection & treatment 311 | 312 | # In[42]: 313 | 314 | 315 | app_score_col_rmvd['AMT_GOODS_PRICE'].agg(['min','max','median']) 316 | 317 | 318 | # In[43]: 319 | 320 | 321 | sns.kdeplot(data=app_score_col_rmvd,x='AMT_GOODS_PRICE') 322 | 323 | 324 | # In[44]: 325 | 326 | 327 | sns.boxenplot(data=app_score_col_rmvd,x='AMT_GOODS_PRICE') 328 | 329 | 330 | # In[45]: 331 | 332 | 333 | app_score_col_rmvd['AMT_GOODS_PRICE'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 334 | 335 | 336 | # In[46]: 337 | 338 | 339 | bins = [0,100000,200000,300000,400000,500000,600000,700000,800000,900000,4050000] 340 | ranges = ['0-100K','100k-200K','200K-300K','300K-400K','400K-500K','500K-600K','600K-700K' 341 | ,'700K-800K','800K-900K','Above 900K'] 342 | 343 | 
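# pd.cut below maps each price into one of the labelled bins; bins are right-inclusive by default,
# so with the edges above a value of exactly 100000 still lands in '0-100K'. A tiny illustration
# with hypothetical values:
demo_bins = pd.cut(pd.Series([50_000, 100_000, 150_000]), bins=[0, 100_000, 200_000], labels=['low', 'high'])
# demo_bins -> ['low', 'low', 'high']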
app_score_col_rmvd['AMT_GOODS_PRICE_RANGE'] = pd.cut(app_score_col_rmvd['AMT_GOODS_PRICE'],bins,labels=ranges) 344 | 345 | 346 | # In[47]: 347 | 348 | 349 | app_score_col_rmvd.groupby(['AMT_GOODS_PRICE_RANGE']).size() 350 | 351 | 352 | # In[48]: 353 | 354 | 355 | app_score_col_rmvd['AMT_INCOME_TOTAL'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 356 | 357 | 358 | # In[49]: 359 | 360 | 361 | app_score_col_rmvd['AMT_INCOME_TOTAL'].max() 362 | 363 | 364 | # In[50]: 365 | 366 | 367 | bins = [0,100000,150000,200000,250000,300000,350000,400000,117000000] 368 | ranges = ['0-100K','100K-150K','150K-200K','200K-250K','250K-300K','300K-350K','350K-400K' 369 | ,'Above 400K'] 370 | 371 | app_score_col_rmvd['AMT_INCOME_TOTAL_RANGE'] = pd.cut(app_score_col_rmvd['AMT_INCOME_TOTAL'],bins,labels=ranges) 372 | 373 | 374 | # In[51]: 375 | 376 | 377 | app_score_col_rmvd.groupby(['AMT_INCOME_TOTAL_RANGE']).size() 378 | 379 | 380 | # In[52]: 381 | 382 | 383 | app_score_col_rmvd['AMT_INCOME_TOTAL_RANGE'].isnull().sum() 384 | 385 | 386 | # In[53]: 387 | 388 | 389 | app_score_col_rmvd['AMT_CREDIT'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 390 | 391 | 392 | # In[54]: 393 | 394 | 395 | app_score_col_rmvd['AMT_CREDIT'].max() 396 | 397 | 398 | # In[55]: 399 | 400 | 401 | bins = [0,200000,400000,600000,800000,900000,1000000,2000000,3000000,4050000] 402 | ranges = ['0-200K','200K-400K','400K-600K','600K-800K','800K-900K','900K-1M','1M-2M','2M-3M','Above 3M'] 403 | 404 | app_score_col_rmvd['AMT_CREDIT_RANGE'] = pd.cut(app_score_col_rmvd['AMT_CREDIT'],bins,labels=ranges) 405 | 406 | 407 | # In[56]: 408 | 409 | 410 | app_score_col_rmvd.groupby(['AMT_CREDIT_RANGE']).size() 411 | 412 | 413 | # In[57]: 414 | 415 | 416 | app_score_col_rmvd['AMT_CREDIT'].isnull().sum() 417 | 418 | 419 | # In[58]: 420 | 421 | 422 | app_score_col_rmvd['AMT_ANNUITY'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99]) 423 | 424 | 425 | # In[59]: 426 | 427 | 428 | app_score_col_rmvd['AMT_ANNUITY'].max() 429 | 430 | 431 | # In[60]: 432 | 433 | 434 | bins = [0,25000,50000,100000,150000,200000,258025.5] 435 | ranges = ['0-25K','25K-50K','50K-100K','100K-150K','150K-200K','Above 200K'] 436 | 437 | app_score_col_rmvd['AMT_ANNUITY_RANGE'] = pd.cut(app_score_col_rmvd['AMT_ANNUITY'],bins,labels=ranges) 438 | 439 | 440 | # In[61]: 441 | 442 | 443 | app_score_col_rmvd.groupby(['AMT_ANNUITY_RANGE']).size() 444 | 445 | 446 | # In[62]: 447 | 448 | 449 | app_score_col_rmvd['AMT_ANNUITY_RANGE'].isnull().sum() 450 | 451 | 452 | # In[63]: 453 | 454 | 455 | app_score_col_rmvd['DAYS_EMPLOYED'].agg(['min','max','median']) 456 | 457 | 458 | # In[64]: 459 | 460 | 461 | app_score_col_rmvd['DAYS_EMPLOYED'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.81,0.85,0.9,0.95,0.99]) 462 | 463 | 464 | # In[65]: 465 | 466 | 467 | app_score_col_rmvd[app_score_col_rmvd['DAYS_EMPLOYED']=40]['var']) 740 | var_msng_ge_40 741 | 742 | 743 | # In[102]: 744 | 745 | 746 | nva_cols = var_msng_ge_40+['WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','FLAG_LAST_APPL_PER_CONTRACT','NFLAG_LAST_APPL_IN_DAY'] 747 | len(nva_cols) 748 | 749 | 750 | # In[103]: 751 | 752 | 753 | len(prev_app.columns) 754 | 755 | 756 | # In[104]: 757 | 758 | 759 | prev_app_nva_col_rmvd = prev_app.drop(labels=nva_cols,axis=1) 760 | 761 | 762 | len(prev_app_nva_col_rmvd.columns) 763 | 764 | 765 | # In[105]: 766 | 767 | 768 | prev_app_nva_col_rmvd.columns 769 | 770 | 771 | # In[106]: 772 | 773 | 774 | prev_app_nva_col_rmvd.head() 775 | 776 | 777 | # In[107]: 778 | 779 | 780 | 
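# The next cell repeats the missing-percentage check run on `app` earlier; the same pattern as a
# small reusable sketch (hypothetical helper name, works on any DataFrame):
def missing_pct(df):
    """Return the percentage of missing values per column, highest first."""
    return (df.isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)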
prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False)/prev_app_nva_col_rmvd.shape[0]*100 781 | 782 | 783 | # In[108]: 784 | 785 | 786 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].agg(func=['mean','median']) 787 | 788 | 789 | # In[109]: 790 | 791 | 792 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MEDIAN'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].median()) 793 | 794 | 795 | # In[110]: 796 | 797 | 798 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MEAN'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].mean()) 799 | 800 | 801 | # In[111]: 802 | 803 | 804 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE_MODE'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].mode()[0]) 805 | 806 | 807 | # In[112]: 808 | 809 | 810 | gp_cols = ['AMT_GOODS_PRICE','AMT_GOODS_PRICE_MEDIAN','AMT_GOODS_PRICE_MEAN','AMT_GOODS_PRICE_MODE'] 811 | 812 | 813 | # In[113]: 814 | 815 | 816 | plt.figure(figsize=(10,5)) 817 | 818 | for i, col in enumerate(gp_cols): 819 | plt.subplot(2,2,i+1) 820 | sns.kdeplot(data=prev_app_nva_col_rmvd,x=col) 821 | plt.subplots_adjust(wspace=0.5,hspace=0.5) 822 | 823 | 824 | # In[114]: 825 | 826 | 827 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'] = prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].fillna(prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].median()) 828 | 829 | 830 | # In[115]: 831 | 832 | 833 | prev_app_nva_col_rmvd['AMT_GOODS_PRICE'].isnull().sum() 834 | 835 | 836 | # In[116]: 837 | 838 | 839 | prev_app_nva_col_rmvd['AMT_ANNUITY'].agg(func=['mean','median','max']) 840 | 841 | 842 | # In[117]: 843 | 844 | 845 | prev_app_nva_col_rmvd['AMT_ANNUITY'] = prev_app_nva_col_rmvd['AMT_ANNUITY'].fillna(prev_app_nva_col_rmvd['AMT_ANNUITY'].median()) 846 | 847 | 848 | # In[118]: 849 | 850 | 851 | prev_app_nva_col_rmvd['PRODUCT_COMBINATION'] = prev_app_nva_col_rmvd['PRODUCT_COMBINATION'].fillna(prev_app_nva_col_rmvd['PRODUCT_COMBINATION'].mode()[0]) 852 | 853 | 854 | # In[119]: 855 | 856 | 857 | prev_app_nva_col_rmvd['CNT_PAYMENT'].agg(func=['mean','median','max']) 858 | 859 | 860 | # In[120]: 861 | 862 | 863 | prev_app_nva_col_rmvd[prev_app_nva_col_rmvd['CNT_PAYMENT'].isnull()].groupby(['NAME_CONTRACT_STATUS']).size().sort_values(ascending=False) 864 | 865 | 866 | # In[121]: 867 | 868 | 869 | prev_app_nva_col_rmvd['CNT_PAYMENT'] = prev_app_nva_col_rmvd['CNT_PAYMENT'].fillna(0) 870 | 871 | 872 | # In[122]: 873 | 874 | 875 | prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False) 876 | 877 | 878 | # In[123]: 879 | 880 | 881 | prev_app_nva_col_rmvd = prev_app_nva_col_rmvd.drop(labels=['AMT_GOODS_PRICE_MEDIAN','AMT_GOODS_PRICE_MEAN','AMT_GOODS_PRICE_MODE'],axis=1) 882 | 883 | 884 | # In[124]: 885 | 886 | 887 | prev_app_nva_col_rmvd.isnull().sum().sort_values(ascending=False) 888 | 889 | 890 | # In[125]: 891 | 892 | 893 | len(prev_app_nva_col_rmvd.columns) 894 | 895 | 896 | # In[126]: 897 | 898 | 899 | prev_app_nva_col_rmvd.head() 900 | 901 | 902 | # In[137]: 903 | 904 | 905 | merged_df = pd.merge(app_score_col_rmvd,prev_app_nva_col_rmvd,how='inner',on='SK_ID_CURR') 906 | merged_df.head() 907 | 908 | 909 | # In[238]: 910 | 911 | 912 | plt.figure(figsize=(15,5)) 913 | 914 | sns.countplot(data=merged_df,x='NAME_CASH_LOAN_PURPOSE',hue='NAME_CONTRACT_STATUS') 915 | plt.xticks(rotation=90) 916 | plt.yscale('log') 917 | 918 | 919 | # In[239]: 920 | 921 | 922 | sns.countplot(data=merged_df,x='NAME_CONTRACT_STATUS',hue='TARGET') 923 | 924 | 925 | # In[248]: 926 | 927 | 928 | merged_agg = 
merged_df.groupby(['NAME_CONTRACT_STATUS','TARGET']).size().reset_index().rename(columns={0:'counts'}) 929 | sum_df = merged_agg.groupby(['NAME_CONTRACT_STATUS'])['counts'].sum().reset_index() 930 | 931 | merged_agg_2 = pd.merge(merged_agg,sum_df,how='left',on='NAME_CONTRACT_STATUS') 932 | merged_agg_2['pct'] = round(merged_agg_2['counts_x']/merged_agg_2['counts_y']*100,2) 933 | merged_agg_2 934 | 935 | 936 | # In[250]: 937 | 938 | 939 | sns.lineplot(data=merged_df,x='NAME_CONTRACT_STATUS',y='AMT_INCOME_TOTAL',ci=None,hue='TARGET') 940 | 941 | 942 | # In[251]: 943 | 944 | 945 | len(merged_df.columns) 946 | 947 | 948 | # # All the analysis 949 | 950 | # most of the customers have taken cash loans 951 | # customers who have taken cash loans are less likely to default 952 | # 953 | # CODE_GENDER - 954 | # 955 | # most of the loans have been taken by females 956 | # the default rate for females is just ~7%, which is safer and lower than for males 957 | # 958 | # NAME_TYPE_SUITE - 959 | # 960 | # unaccompanied people have taken most of the loans and the default rate is ~8.5%, which is still okay 961 | # 962 | # NAME_INCOME_TYPE - 963 | # 964 | # the safest segments are working, commercial associates and pensioners 965 | # 966 | # NAME_EDUCATION_TYPE - 967 | # 968 | # Higher education is the safest segment to give the loan to, with a default rate of less than 5% 969 | # 970 | # NAME_FAMILY_STATUS - 971 | # 972 | # Married people are safe to target; the default rate is 8% 973 | # 974 | # 975 | # NAME_HOUSING_TYPE - 976 | # 977 | # People having a house/apartment are safe to give the loan to, with a default rate of ~8% 978 | # 979 | # OCCUPATION_TYPE - 980 | # 981 | # Low-skill Laborers and drivers are the highest defaulters 982 | # Accountants are the least likely to default 983 | # Core staff, Managers and Laborers are safer to target, with a default rate of <= 7.5 to 10% 984 | # 985 | # ORGANIZATION_TYPE - 986 | # 987 | # Transport type 3 has the highest default rate 988 | # Others, Business Entity Type 3, Self Employed are good to go with a default rate around 10% 989 | # 990 | # =======univariate numeric variables analysis======== 991 | # 992 | # >> most of the loans were given for goods prices ranging between 0 and 1 million 993 | # >> most of the loans were given for a credit amount of 0 to 1 million 994 | # >> most of the customers are paying an annuity of 0 to 50K 995 | # >> most of the customers have income between 0 and 1 million 996 | # 997 | # =============bivariate analysis================== 998 | # 999 | # >> AMT_CREDIT and AMT_GOODS_PRICE are linearly correlated; as AMT_CREDIT increases, the default rate decreases 1000 | # >> people having income less than or equal to 1 million are more likely to take loans; among them, those taking a loan of less than 1.5 million could turn out to be defaulters. we can target income below 1 million and loan amount greater than 1.5 million 1001 | # >> people having 1 to fewer than 5 children are safer to give the loan to 1002 | # >> People who can pay an annuity of 100K are more likely to get the loan, and that holds up to less than 2 million (safer segment) 1003 | # 1004 | # ============analysis on merged data============== 1005 | # 1006 | # >> most previous applications were for the repairing purpose, and the same purpose has the most cancellations 1007 | # >> of the applications which were previously either canceled or refused, 80-90% are repayers in the current data 1008 | # >> offers which were unused previously now have the maximum number of defaulters despite having customers in the high income band 1009 | 1010 | # # Final Conclusion/Insights 1011 | 1012 | # The bank should target customers 1013 | # 1014 | # >> having low income, i.e. below 1 million 1015 | # >> working in Others, Business Entity Type 3, Self Employed org. types 1016 | # >> working as Accountants, Core staff, Managers and Laborers 1017 | # >> having a house/apartment, married, and having no more than 5 children 1018 | # >> Highly educated 1019 | # >> preferably female 1020 | # 1021 | # >> unaccompanied people can be safer - the default rate is ~8.5% 1022 | # 1023 | # Amount segment recommended - 1024 | # 1025 | # >> the credit amount should not be more than 1 million 1026 | # >> the annuity can be kept at 50K (depending on eligibility) 1027 | # >> the income bracket could be below 1 million 1028 | # 1029 | # >> 80-90% of the customers who were previously canceled/refused are repayers. The bank can do the analysis and consider giving loans to these segments 1030 | # 1031 | # 1032 | # ====================precautions=============== 1033 | # 1034 | # >> org. type Transport type 3 should be avoided 1035 | # >> Low-skill Laborers and drivers should be avoided 1036 | # >> offers previously unused with high-income customers should be avoided 1037 | # 1038 | -------------------------------------------------------------------------------- /Scripts/FastText_Setup_Prediction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # pip install fasttext 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import fasttext 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | fModel = fasttext.load_model(r"C:\Users\smrvr\Downloads\cc.en.300.bin") 20 | 21 | 22 | # In[6]: 23 | 24 | 25 | fModel.get_nearest_neighbors("love") 26 | 27 | 28 | # In[10]: 29 | 30 | 31 | fModel.get_subwords('leanerea') 32 | 33 | 34 | # In[12]: 35 | 36 | 37 | fModel.get_analogies('delhi','noida','tokyo',k=10) 38 | 39 | 40 | # # Training model on own data 41 | 42 | # In[13]: 43 | 44 | 45 | import pandas as pd 46 | pd.set_option('display.max_colwidth',0) 47 | 48 | 49 | # In[18]: 50 | 51 | 52 | df = pd.read_csv(r"D:\Learnerea\Tables\imdb_master.csv", encoding="ISO-8859-1")[['review','label']].query("label!='unsup'") 53 | df.groupby('label').count() 54 | 55 | 56 | # In[21]: 57 | 58 | 59 | myText = df.review[5] 60 | myText 61 | 62 | 63 | # In[22]: 64 | 65 | 66 | import re 67 | 68 | 69 | # In[28]: 70 | 71 | 72 | def preProcessing(text): 73 | text = re.sub(r'[^\w\s\']',"",text) 74 | text = re.sub(' +',' ',text).lower() 75 | return text 76 | 77 | 78 | # In[29]: 79 | 80 | 81 | preProcessing(myText) 82 | 83 | 84 | # In[31]: 85 | 86 | 87 | df['processed_review'] = df['review'].map(preProcessing) 88 | 89 | 90 | # In[32]: 91 | 92 | 93 | df.head(1) 94 | 95 | 96 | # In[33]: 97 | 98 | 99 | df['final_data'] = "__label__"+df['label']+" "+df['processed_review'] 100 | 101 | 102 | # In[39]: 103 | 104 | 105 | df.head(1) 106 | 107 | 108 | # In[36]: 109 | 110 | 111 | from sklearn.model_selection import train_test_split 112 | 113 | 114 | # In[37]: 115 | 116 | 117 | train, test = train_test_split(df, test_size=0.3) 118 | 119 | 120 | # In[38]: 121 | 122 | 123 | display(train.shape) 124 | display(test.shape) 125 | 126 | 127 | # In[40]: 128 | 129 | 130 | train.to_csv(r"D:\Learnerea\Tables\fastText\new\train_reviews.csv",columns=['final_data'],index=False,header=False) 131 |
test.to_csv(r"D:\Learnerea\Tables\fastText\new\test_reviews.csv",columns=['final_data'],index=False,header=False) 132 | 133 | 134 | # In[41]: 135 | 136 | 137 | myModel = fasttext.train_supervised(input=r"D:\Learnerea\Tables\fastText\new\train_reviews.csv") 138 | 139 | 140 | # In[42]: 141 | 142 | 143 | myModel.test(r"D:\Learnerea\Tables\fastText\new\test_reviews.csv") 144 | 145 | 146 | # In[53]: 147 | 148 | 149 | myModel.predict("Thanks for sharing this ...it will really prove to be very helpful") 150 | 151 | -------------------------------------------------------------------------------- /Scripts/Feature Selection with ANOVA Method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "f97af190", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "import seaborn as sns\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from sklearn.feature_selection import SelectKBest, f_classif, f_regression\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.preprocessing import LabelEncoder\n", 19 | "\n", 20 | "le = LabelEncoder()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "id": "e0215435", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
agegendereducationincomeloan_amount
050+MalePhD<20k30925.132461
130-40MalePostGraduate60k-80k37081.630673
220-30MaleHighSchool80k+25218.164446
3<20FemalePostGraduate20k-40k22508.062994
450+MalePostGraduate60k-80k53153.715666
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " age gender education income loan_amount\n", 104 | "0 50+ Male PhD <20k 30925.132461\n", 105 | "1 30-40 Male PostGraduate 60k-80k 37081.630673\n", 106 | "2 20-30 Male HighSchool 80k+ 25218.164446\n", 107 | "3 <20 Female PostGraduate 20k-40k 22508.062994\n", 108 | "4 50+ Male PostGraduate 60k-80k 53153.715666" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\loan_data_sample.xlsx\")\n", 118 | "df.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "id": "de952911", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "features = df.drop('loan_amount',axis=1)\n", 129 | "target = df.loan_amount" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "id": "f34657d1", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "encoded = pd.DataFrame()\n", 140 | "\n", 141 | "for i in features.columns:\n", 142 | " encoded[i]=le.fit_transform(features[i])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "id": "aa007bf9", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
agegendereducationincome
03114
11122
20103
34020
43122
\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " age gender education income\n", 220 | "0 3 1 1 4\n", 221 | "1 1 1 2 2\n", 222 | "2 0 1 0 3\n", 223 | "3 4 0 2 0\n", 224 | "4 3 1 2 2" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "encoded.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "id": "d6e3b2b2", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "x_train, x_test, y_train, y_test = train_test_split(encoded,target,test_size=0.2,random_state=123)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 12, 249 | "id": "c32949b3", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "best_features = SelectKBest(score_func=f_regression,k='all')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 14, 259 | "id": "7d663462", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "selection = best_features.fit(x_train,y_train)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 17, 269 | "id": "698ef2e8", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "array([14.04421628, 1.44327701, 0.01548106, 10.88645751])" 276 | ] 277 | }, 278 | "execution_count": 17, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "selection.scores_" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 18, 290 | "id": "4123f5fc", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "anova_score = pd.DataFrame(zip(features.columns,selection.scores_,selection.pvalues_),columns=['features','score','pValues'])" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 20, 300 | "id": "816cc320", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
featuresscorepValues
0age14.0442160.000191
3income10.8864580.001012
1gender1.4432770.229966
2education0.0154810.901012
\n", 356 | "
" 357 | ], 358 | "text/plain": [ 359 | " features score pValues\n", 360 | "0 age 14.044216 0.000191\n", 361 | "3 income 10.886458 0.001012\n", 362 | "1 gender 1.443277 0.229966\n", 363 | "2 education 0.015481 0.901012" 364 | ] 365 | }, 366 | "execution_count": 20, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "anova_score.sort_values(by='score',ascending=False)" 373 | ] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3 (ipykernel)", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.9.7" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 5 397 | } 398 | -------------------------------------------------------------------------------- /Scripts/FraudDetectionModelDataPrep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.model_selection import train_test_split, StratifiedKFold 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.metrics import confusion_matrix, classification_report 13 | import matplotlib.pyplot as plt 14 | import seaborn as sns 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | def dataPrepAutomation(inputDF): 21 | new_df = inputDF.drop(columns=['nameOrig','nameDest'],axis=1) 22 | dummies = pd.get_dummies(inputDF['type']) 23 | processedDf = pd.concat([new_df.drop('type',axis=1),dummies],axis=1) 24 | independent = processedDf.drop('isFraud',axis=1) 25 | dependent = processedDf['isFraud'] 26 | return independent, dependent 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | def smoteBalancing(features, targets): 33 | from imblearn.over_sampling import SMOTE 34 | smote = SMOTE() 35 | x_smote, y_smote = smote.fit_resample(features, targets) 36 | X_train, X_test, y_train, y_test = train_test_split(x_smote,y_smote, test_size=0.30, random_state=True) 37 | return X_train, X_test, y_train, y_test 38 | 39 | -------------------------------------------------------------------------------- /Scripts/GloVe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[118]: 5 | 6 | 7 | my_vocab = ['apple','orange','shimla','banana','maruti','mumbai','china','india','husband' 8 | ,'wife','brother','sister','red','yellow','computer','mobile','pear','guava'] 9 | 10 | 11 | # In[36]: 12 | 13 | 14 | import gensim 15 | from sklearn.manifold import TSNE 16 | 17 | 18 | # In[2]: 19 | 20 | 21 | import gensim.downloader as api 22 | 23 | 24 | # In[3]: 25 | 26 | 27 | glove_model = api.load('glove-wiki-gigaword-300') 28 | 29 | 30 | # In[130]: 31 | 32 | 33 | glove_model.most_similar('love',topn=5) 34 | 35 | 36 | # In[133]: 37 | 38 | 39 | # husband - man + woman = wife 40 | 41 | 42 | # In[132]: 43 | 44 | 45 | glove_model.most_similar(positive= ['woman', 'husband'], negative=['man'],topn=1) 46 | 47 | 48 | # In[135]: 49 | 50 | 51 | words = [] 52 | vectors = [] 53 | 54 | 55 | for word in my_vocab: 56 | words.append(word) 57 | vectors.append(glove_model[word]) 58 | 59 | 60 | # In[138]: 61 | 62 | 63 | dicts = 
zip(words,vectors) 64 | 65 | 66 | # In[139]: 67 | 68 | 69 | import pandas as pd 70 | 71 | 72 | # In[141]: 73 | 74 | 75 | dim_model = TSNE(n_components=2, perplexity=3, init='pca', random_state=45) 76 | 77 | 78 | # In[143]: 79 | 80 | 81 | fit_model = dim_model.fit_transform(vectors) 82 | 83 | 84 | # In[145]: 85 | 86 | 87 | import matplotlib.pyplot as plt 88 | 89 | 90 | # In[146]: 91 | 92 | 93 | fit_model 94 | 95 | 96 | # In[148]: 97 | 98 | 99 | x = [] 100 | y = [] 101 | 102 | for i in fit_model: 103 | x.append(i[0]) 104 | y.append(i[1]) 105 | 106 | 107 | # In[167]: 108 | 109 | 110 | plt.figure(figsize=(8,8)) 111 | 112 | for i in range(len(x)): 113 | plt.scatter(x[i],y[i]) 114 | plt.annotate(words[i], xy=(x[i],y[i]), 115 | xytext=(2, 2), 116 | textcoords='offset points', 117 | ha='right', 118 | va='bottom' 119 | ) 120 | 121 | 122 | # In[ ]: 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /Scripts/LinearRegression_Live.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LinearRegression 11 | 12 | 13 | # In[28]: 14 | 15 | 16 | salary = pd.read_excel(r"D:\Learnerea\Tables\Salary.xlsx",sheet_name="Salary") 17 | salary.head() 18 | 19 | 20 | # In[7]: 21 | 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(salary[['YearsExperience']],salary.Salary,test_size=0.2,random_state=False) 24 | 25 | 26 | # In[13]: 27 | 28 | 29 | y_test.shape 30 | 31 | 32 | # In[14]: 33 | 34 | 35 | model = LinearRegression() 36 | 37 | 38 | # In[15]: 39 | 40 | 41 | model.fit(X_train,y_train) 42 | 43 | 44 | # In[16]: 45 | 46 | 47 | model.predict(X_test) 48 | 49 | 50 | # In[17]: 51 | 52 | 53 | y_test 54 | 55 | 56 | # In[18]: 57 | 58 | 59 | model.score(X_test,y_test) 60 | 61 | 62 | # In[21]: 63 | 64 | 65 | def salary_predict(exp): 66 | return model.predict([[exp]]) 67 | 68 | 69 | # In[22]: 70 | 71 | 72 | exp = float(input("Enter your years of exp: ")) 73 | print("The salary you can expect is: ", salary_predict(exp)) 74 | 75 | 76 | # In[25]: 77 | 78 | 79 | to_pred = pd.read_excel(r"D:\Learnerea\Tables\Salary.xlsx",sheet_name="to_predict") 80 | 81 | 82 | # In[26]: 83 | 84 | 85 | model.predict(to_pred) 86 | 87 | 88 | # In[27]: 89 | 90 | 91 | to_pred['predicted_salary']=model.predict(to_pred) 92 | to_pred 93 | 94 | -------------------------------------------------------------------------------- /Scripts/LogisticLive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | app = pd.read_excel("D:\Learnerea\Tables\loan_app_fraud.xlsx") 16 | app.head() 17 | 18 | 19 | # In[5]: 20 | 21 | 22 | sns.scatterplot(data=app, x="Age",y="Salary_KPM",hue="Fraud") 23 | 24 | 25 | # In[6]: 26 | 27 | 28 | from sklearn.model_selection import train_test_split 29 | 30 | 31 | # In[7]: 32 | 33 | 34 | X_train, X_test, y_train, y_test = train_test_split(app[["Age","Salary_KPM"]],app.Fraud,test_size=0.2, random_state=False) 35 | 36 | 37 | # In[12]: 38 | 39 | 40 | from sklearn.linear_model import LogisticRegression 41 | 42 | 43 | # In[13]: 44 | 45 | 46 | model = LogisticRegression() 47 | 48 | 49 | # In[15]: 50 | 51 | 52 | 
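# Beyond hard 0/1 labels, the fitted model also exposes class probabilities via predict_proba;
# a small sketch assuming the X_test split above and that the positive class is Fraud == 1
# (only valid once the fit in the next cell has run):
#
# probs = model.predict_proba(X_test)[:, 1]            # estimated P(Fraud) per applicant
# labels = (probs > 0.5).astype(int)                   # matches model.predict at the default threshold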
model.fit(X_train,y_train) 53 | 54 | 55 | # In[16]: 56 | 57 | 58 | model.predict(X_test) 59 | 60 | 61 | # In[17]: 62 | 63 | 64 | model.score(X_test,y_test) 65 | 66 | 67 | # In[18]: 68 | 69 | 70 | def fraud_pred_mode(age,salary): 71 | if model.predict([[age,salary]])==1: 72 | return "Fraud" 73 | else: 74 | return "Genuine" 75 | fraud_pred_mode(20,90) 76 | 77 | 78 | # In[20]: 79 | 80 | 81 | age = int(input("Enter the age of the applicant: ")) 82 | salary = int(input("Enter the Salary of the applicant: ")) 83 | 84 | print("The applicant seems to be ", fraud_pred_mode(age,salary)) 85 | 86 | -------------------------------------------------------------------------------- /Scripts/NER.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import spacy 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | nlp = spacy.load("en_core_web_sm") 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | news = nlp("TATA acquired BigBasket Ent. for $45 billion") 20 | 21 | 22 | # In[6]: 23 | 24 | 25 | for x in news.ents: 26 | print(x.text, "==>", x.label_,"==>", spacy.explain(x.label_)) 27 | 28 | 29 | # In[7]: 30 | 31 | 32 | from spacy import displacy 33 | 34 | 35 | # In[8]: 36 | 37 | 38 | displacy.render(news, style='ent') 39 | 40 | -------------------------------------------------------------------------------- /Scripts/PD_credit_risk_model_dev.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | # In[144]: 14 | 15 | 16 | train = pd.read_csv(r'D:\Learnerea\Tables\GiveMeSomeCredit\cs-training.csv').drop(['Unnamed: 0'],axis=1) 17 | test = pd.read_csv(r'D:\Learnerea\Tables\GiveMeSomeCredit\cs-test.csv').drop(['Unnamed: 0'],axis=1) 18 | 19 | 20 | # In[153]: 21 | 22 | 23 | train.shape 24 | 25 | 26 | # In[154]: 27 | 28 | 29 | test.shape 30 | 31 | 32 | # In[150]: 33 | 34 | 35 | train_redup = train.drop_duplicates() 36 | 37 | 38 | # In[169]: 39 | 40 | 41 | def findMiss(df): 42 | return round(df.isnull().sum()/df.shape[0]*100,2) 43 | 44 | 45 | # In[202]: 46 | 47 | 48 | train_redup.shape 49 | 50 | 51 | # In[170]: 52 | 53 | 54 | findMiss(train_redup) 55 | 56 | 57 | # In[175]: 58 | 59 | 60 | train_redup[train_redup.MonthlyIncome.isnull()].describe() 61 | 62 | 63 | # In[178]: 64 | 65 | 66 | train_redup['NumberOfDependents'].agg(['mode']) 67 | 68 | 69 | # In[193]: 70 | 71 | 72 | fam_miss = train_redup[train_redup.NumberOfDependents.isnull()] 73 | fam_nmiss = train_redup[train_redup.NumberOfDependents.notnull()] 74 | 75 | 76 | # In[194]: 77 | 78 | 79 | fam_miss['NumberOfDependents'] = fam_miss['NumberOfDependents'].fillna(0) 80 | fam_miss['MonthlyIncome'] = fam_miss['MonthlyIncome'].fillna(0) 81 | 82 | 83 | # In[195]: 84 | 85 | 86 | findMiss(fam_miss) 87 | 88 | 89 | # In[196]: 90 | 91 | 92 | findMiss(fam_nmiss) 93 | 94 | 95 | # In[197]: 96 | 97 | 98 | fam_nmiss['MonthlyIncome'].agg(['mean','median','min']) 99 | 100 | 101 | # In[198]: 102 | 103 | 104 | fam_nmiss['MonthlyIncome'] = fam_nmiss['MonthlyIncome'].fillna(fam_nmiss['MonthlyIncome'].median()) 105 | 106 | 107 | # In[199]: 108 | 109 | 110 | findMiss(fam_nmiss) 111 | 112 | 113 | # In[200]: 114 | 115 | 116 | filled_train = fam_nmiss.append(fam_miss) 117 | 118 | 119 | # In[203]: 120 | 121 | 122 | findMiss(filled_train) 123 | 124 | 125 | # In[204]: 126 | 127 | 128 | 
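# DataFrame.append, used above to stitch the two subsets back together, was deprecated in
# pandas 1.4 and removed in 2.0; on current pandas the same concatenation would be written as:
#
# filled_train = pd.concat([fam_nmiss, fam_miss])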
filled_train.head() 129 | 130 | 131 | # In[207]: 132 | 133 | 134 | filled_train.groupby(['SeriousDlqin2yrs']).size()/filled_train.shape[0] 135 | 136 | 137 | # In[208]: 138 | 139 | 140 | filled_train.RevolvingUtilizationOfUnsecuredLines.describe() 141 | 142 | 143 | # In[218]: 144 | 145 | 146 | filled_train['RevolvingUtilizationOfUnsecuredLines'].quantile([.99]) 147 | 148 | 149 | # In[237]: 150 | 151 | 152 | (filled_train[filled_train['RevolvingUtilizationOfUnsecuredLines'] > 10]).describe() 153 | 154 | 155 | # In[243]: 156 | 157 | 158 | util_droped = filled_train.drop(filled_train[filled_train['RevolvingUtilizationOfUnsecuredLines'] > 10].index) 159 | 160 | 161 | # In[249]: 162 | 163 | 164 | sns.boxplot(util_droped['age']) 165 | 166 | 167 | # In[251]: 168 | 169 | 170 | util_droped.groupby(['NumberOfTime30-59DaysPastDueNotWorse']).size() 171 | 172 | 173 | # In[252]: 174 | 175 | 176 | util_droped.groupby(['NumberOfTime60-89DaysPastDueNotWorse']).size() 177 | 178 | 179 | # In[253]: 180 | 181 | 182 | util_droped.groupby(['NumberOfTimes90DaysLate']).size() 183 | 184 | 185 | # In[258]: 186 | 187 | 188 | util_droped[util_droped['NumberOfTimes90DaysLate']>=96].groupby(['SeriousDlqin2yrs']).size() 189 | 190 | 191 | # In[260]: 192 | 193 | 194 | util_droped['DebtRatio'].describe() 195 | 196 | 197 | # In[262]: 198 | 199 | 200 | sns.kdeplot(util_droped['DebtRatio']) 201 | 202 | 203 | # In[289]: 204 | 205 | 206 | util_droped['DebtRatio'].quantile([.975]) 207 | 208 | 209 | # In[293]: 210 | 211 | 212 | util_droped[util_droped['DebtRatio']>3492][['SeriousDlqin2yrs','MonthlyIncome']].describe() 213 | 214 | 215 | # In[297]: 216 | 217 | 218 | temp = util_droped[(util_droped['DebtRatio']>3492) & (util_droped['SeriousDlqin2yrs']==util_droped['MonthlyIncome'])] 219 | 220 | 221 | # In[300]: 222 | 223 | 224 | dRatio = util_droped.drop(util_droped[(util_droped['DebtRatio']>3492) & (util_droped['SeriousDlqin2yrs']==util_droped['MonthlyIncome'])].index) 225 | 226 | 227 | # In[311]: 228 | 229 | 230 | # pip install Xgboost 231 | 232 | 233 | # In[327]: 234 | 235 | 236 | from xgboost import XGBClassifier 237 | from sklearn.metrics import accuracy_score 238 | from sklearn.metrics import confusion_matrix, classification_report 239 | 240 | 241 | # In[320]: 242 | 243 | 244 | model = XGBClassifier(tree_method = 'exact') 245 | 246 | 247 | # In[318]: 248 | 249 | 250 | x = dRatio.drop(['SeriousDlqin2yrs'],axis=1) 251 | y = dRatio['SeriousDlqin2yrs'] 252 | 253 | 254 | # In[322]: 255 | 256 | 257 | model.fit(x,y.values.ravel()) 258 | y_pred = model.predict(x) 259 | 260 | 261 | # In[323]: 262 | 263 | 264 | accuracy_score(y,y_pred) 265 | 266 | 267 | # In[325]: 268 | 269 | 270 | cm = confusion_matrix(y,y_pred) 271 | 272 | 273 | # In[326]: 274 | 275 | 276 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 277 | plt.xticks(np.arange(2)+.5,['No def','def']) 278 | plt.yticks(np.arange(2)+.5,['No def','def']) 279 | plt.xlabel("predicted") 280 | plt.ylabel("actuals") 281 | 282 | 283 | # In[328]: 284 | 285 | 286 | print(classification_report(y,y_pred)) 287 | 288 | -------------------------------------------------------------------------------- /Scripts/Payment_Fraud_Detection_Model_Logistic_Reg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[8]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.model_selection import 
train_test_split 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import confusion_matrix 14 | 15 | 16 | # In[9]: 17 | 18 | 19 | df = pd.read_csv(r"D:\Learnerea\Tables\PS_20174392719_1491204439457_log.csv") 20 | df.head() 21 | 22 | 23 | # In[10]: 24 | 25 | 26 | df.shape 27 | 28 | 29 | # In[11]: 30 | 31 | 32 | df3 = df.drop(['nameOrig','nameDest','isFlaggedFraud'],axis=1) 33 | 34 | 35 | # In[12]: 36 | 37 | 38 | df3.head() 39 | 40 | 41 | # In[15]: 42 | 43 | 44 | df3['type'].unique() 45 | 46 | 47 | # In[18]: 48 | 49 | 50 | dummies = pd.get_dummies(df3['type']).drop(['CASH_IN'],axis=1) 51 | dummies 52 | 53 | 54 | # In[30]: 55 | 56 | 57 | df4 = pd.concat([df3,dummies],axis=1).drop(['type'],axis=1) 58 | df4 59 | 60 | 61 | # In[31]: 62 | 63 | 64 | X_train, X_test, y_train, y_test = train_test_split(df4.drop(['isFraud'],axis=1),df4.isFraud,test_size=0.2,random_state=False) 65 | 66 | 67 | # In[32]: 68 | 69 | 70 | X_test.shape 71 | 72 | 73 | # In[33]: 74 | 75 | 76 | model = LogisticRegression() 77 | 78 | 79 | # In[34]: 80 | 81 | 82 | model.fit(X_train,y_train) 83 | 84 | 85 | # In[35]: 86 | 87 | 88 | model.score(X_test,y_test) 89 | 90 | 91 | # In[39]: 92 | 93 | 94 | df4['isFraud'].unique() 95 | 96 | 97 | # In[42]: 98 | 99 | 100 | df4.groupby('isFraud').sum() 101 | 102 | 103 | # In[43]: 104 | 105 | 106 | predict = model.predict(X_test) 107 | predict 108 | 109 | 110 | # In[45]: 111 | 112 | 113 | cm = confusion_matrix(y_test,predict) 114 | 115 | 116 | # In[55]: 117 | 118 | 119 | sns.heatmap(cm,cmap='Oranges',annot=True,fmt='d',cbar=False,linecolor='Black',linewidths=5) 120 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 121 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 122 | plt.xlabel("predicted") 123 | plt.ylabel("actuals") 124 | 125 | -------------------------------------------------------------------------------- /Scripts/Payment_Fraud_Detection_Model_Logistic_and_Decision_Tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[166]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.metrics import confusion_matrix, classification_report 15 | from sklearn.tree import DecisionTreeClassifier 16 | 17 | 18 | # In[104]: 19 | 20 | 21 | df = pd.read_csv(r"D:\Learnerea\Tables\PS_20174392719_1491204439457_log.csv") 22 | df.head() 23 | 24 | 25 | # In[105]: 26 | 27 | 28 | df.groupby('isFlaggedFraud').count() 29 | 30 | 31 | # In[106]: 32 | 33 | 34 | df.groupby('type').count() 35 | 36 | 37 | # In[116]: 38 | 39 | 40 | df.pivot_table(values='amount',index='type',columns='isFraud',aggfunc='count') 41 | 42 | 43 | # In[136]: 44 | 45 | 46 | dfFltrd = df[df.type.isin(['CASH_OUT','TRANSFER'])] 47 | dfFltrd.pivot_table(values='amount',index='type',columns='isFraud',aggfunc='count') 48 | 49 | 50 | # In[137]: 51 | 52 | 53 | encoder = LabelEncoder() 54 | 55 | 56 | # In[138]: 57 | 58 | 59 | dfFltrd['typeEncoded'] = encoder.fit_transform(dfFltrd['type']) 60 | dfFltrd.pivot_table(values='amount',index='typeEncoded',columns='isFraud',aggfunc='count') 61 | 62 | 63 | # In[139]: 64 | 65 | 66 | df2 = dfFltrd.drop(['step','type','nameOrig','nameDest','isFlaggedFraud'],axis=1) 67 | 68 | 69 | # In[146]: 70 | 71 | 72 | df2.head() 73 | 74 | 75 | # In[150]: 
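# The fraud label is heavily imbalanced, so a stratified split keeps the fraud rate comparable
# between train and test; a sketch of the same split with stratification added (random_state is
# an arbitrary choice here):
#
# X_train, X_test, y_train, y_test = train_test_split(
#     df2.drop(['isFraud'], axis=1), df2.isFraud,
#     test_size=0.2, stratify=df2.isFraud, random_state=42)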
76 | 77 | 78 | X_train, X_test, y_train, y_test = train_test_split(df2.drop(['isFraud'],axis=1),df2.isFraud,test_size=0.2,random_state=False) 79 | 80 | 81 | # In[151]: 82 | 83 | 84 | LogReg = LogisticRegression() 85 | 86 | 87 | # In[152]: 88 | 89 | 90 | LogReg.fit(X_train,y_train) 91 | 92 | 93 | # In[154]: 94 | 95 | 96 | predicted = LogReg.predict(X_test) 97 | 98 | 99 | # In[155]: 100 | 101 | 102 | LogReg.score(X_test,y_test) 103 | 104 | 105 | # In[157]: 106 | 107 | 108 | cm = confusion_matrix(y_test,predicted) 109 | cm 110 | 111 | 112 | # In[165]: 113 | 114 | 115 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 116 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 117 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 118 | plt.xlabel("predicted") 119 | plt.ylabel("actuals") 120 | 121 | 122 | # In[164]: 123 | 124 | 125 | print(classification_report(y_test,predicted)) 126 | 127 | 128 | # In[167]: 129 | 130 | 131 | dModel = DecisionTreeClassifier() 132 | 133 | 134 | # In[168]: 135 | 136 | 137 | dModel.fit(X_train,y_train) 138 | 139 | 140 | # In[169]: 141 | 142 | 143 | dPredicted = dModel.predict(X_test) 144 | 145 | 146 | # In[170]: 147 | 148 | 149 | cm = confusion_matrix(y_test,dPredicted) 150 | 151 | 152 | # In[171]: 153 | 154 | 155 | sns.heatmap(cm,annot=True,fmt='d',cmap='Oranges',linewidths=0.5,linecolor='Black') 156 | plt.xticks(np.arange(2)+.5,['No Fraud','Fraud']) 157 | plt.yticks(np.arange(2)+.5,['No Fraud','Fraud']) 158 | plt.xlabel("predicted") 159 | plt.ylabel("actuals") 160 | 161 | 162 | # In[172]: 163 | 164 | 165 | print(classification_report(y_test,dPredicted)) 166 | 167 | -------------------------------------------------------------------------------- /Scripts/PoS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import nltk 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | sentence = "Tesla is not as great as we think" 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | token = nltk.word_tokenize(sentence) 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | posd = nltk.pos_tag(token) 26 | posd 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | nltk.help.upenn_tagset('VB') 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | # for i in range(len(posd)): 39 | # print(posd[i][0],"==>",posd[i][1],"==>",nltk.help.upenn_tagset(posd[i][1])) 40 | 41 | -------------------------------------------------------------------------------- /Scripts/PreProcessing_PD_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "id": "30d956d3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import seaborn as sns\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from sklearn.preprocessing import StandardScaler\n", 15 | "from imblearn.over_sampling import SMOTE\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.linear_model import LogisticRegression\n", 18 | "from sklearn.metrics import classification_report\n", 19 | "from sklearn.ensemble import RandomForestClassifier\n", 20 | "from xgboost import XGBClassifier\n", 21 | "from imblearn.over_sampling import SMOTE" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "id": "407126de", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "def ModePreProcessing(df):\n", 32 | "\n", 33 | "# Outliers 
Treatment\n", 34 | " cr_age_rmvd = df[df['person_age']<=70]\n", 35 | " cr_age_rmvd.reset_index(drop=True, inplace=True)\n", 36 | " person_emp_rmvd = cr_age_rmvd[cr_age_rmvd['person_emp_length']<=47]\n", 37 | " person_emp_rmvd.reset_index(drop=True, inplace=True)\n", 38 | " cr_data = person_emp_rmvd.copy()\n", 39 | " \n", 40 | "# Missing Values Treatment\n", 41 | " cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)\n", 42 | " cr_data_copy=cr_data.drop('loan_grade',axis=1)\n", 43 | " cr_data_cat_treated = cr_data_copy.copy()\n", 44 | " \n", 45 | "# Categorical Variables Treatment\n", 46 | " person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership'],drop_first=True).astype(int)\n", 47 | " loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent'],drop_first=True).astype(int)\n", 48 | " cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0)\n", 49 | " data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1)\n", 50 | "\n", 51 | "# Scaling the data\n", 52 | " scaler = StandardScaler()\n", 53 | " scaled_data = scaler.fit_transform(data_to_scale)\n", 54 | " scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt',\n", 55 | " 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'])\n", 56 | " scaled_data_combined = pd.concat([scaled_df,person_home_ownership,loan_intent],axis=1)\n", 57 | " scaled_data_combined['cb_person_default_on_file'] = cr_data_cat_treated['cb_person_default_on_file_binary']\n", 58 | " scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status']\n", 59 | " \n", 60 | "# Features and Target Creation \n", 61 | " target = scaled_data_combined['loan_status']\n", 62 | " features = scaled_data_combined.drop('loan_status',axis=1)\n", 63 | "\n", 64 | "# SMOTE Balancing\n", 65 | " smote = SMOTE()\n", 66 | " balanced_features, balanced_target = smote.fit_resample(features,target)\n", 67 | " \n", 68 | "# return the final datasets\n", 69 | " return data_to_scale, features, target, balanced_features, balanced_target" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 (ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.9.7" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 5 94 | } 95 | -------------------------------------------------------------------------------- /Scripts/PreProcessing_PD_Model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from sklearn.preprocessing import StandardScaler 12 | from imblearn.over_sampling import SMOTE 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.metrics import classification_report 16 | from sklearn.ensemble import RandomForestClassifier 17 | from xgboost import XGBClassifier 18 | from 
imblearn.over_sampling import SMOTE 19 | 20 | 21 | # In[3]: 22 | 23 | 24 | def ModePreProcessing(df): 25 | 26 | # Outliers Treatment 27 | cr_age_rmvd = df[df['person_age']<=70] 28 | cr_age_rmvd.reset_index(drop=True, inplace=True) 29 | person_emp_rmvd = cr_age_rmvd[cr_age_rmvd['person_emp_length']<=47] 30 | person_emp_rmvd.reset_index(drop=True, inplace=True) 31 | cr_data = person_emp_rmvd.copy() 32 | 33 | # Missing Values Treatment 34 | cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True) 35 | cr_data_copy=cr_data.drop('loan_grade',axis=1) 36 | cr_data_cat_treated = cr_data_copy.copy() 37 | 38 | # Categorical Variables Treatment 39 | person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership'],drop_first=True).astype(int) 40 | loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent'],drop_first=True).astype(int) 41 | cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0) 42 | data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1) 43 | 44 | # Scaling the data 45 | scaler = StandardScaler() 46 | scaled_data = scaler.fit_transform(data_to_scale) 47 | scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 48 | 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']) 49 | scaled_data_combined = pd.concat([scaled_df,person_home_ownership,loan_intent],axis=1) 50 | scaled_data_combined['cb_person_default_on_file'] = cr_data_cat_treated['cb_person_default_on_file_binary'] 51 | scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status'] 52 | 53 | # Features and Target Creation 54 | target = scaled_data_combined['loan_status'] 55 | features = scaled_data_combined.drop('loan_status',axis=1) 56 | 57 | # SMOTE Balancing 58 | smote = SMOTE() 59 | balanced_features, balanced_target = smote.fit_resample(features,target) 60 | 61 | # return the final datasets 62 | return data_to_scale, features, target, balanced_features, balanced_target 63 | 64 | -------------------------------------------------------------------------------- /Scripts/RandomForesPDmodel.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/RandomForesPDmodel.zip -------------------------------------------------------------------------------- /Scripts/Recursive Feature Elimination.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4b55c914", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "from sklearn.tree import DecisionTreeClassifier\n", 16 | "from sklearn.feature_selection import RFE, RFECV\n", 17 | "\n", 18 | "import warnings\n", 19 | "warnings.filterwarnings('ignore')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "8bdb5f01", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 195 | "

5 rows × 31 columns

\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 200 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 201 | "1 1 20.57 17.77 132.90 1326.0 \n", 202 | "2 1 19.69 21.25 130.00 1203.0 \n", 203 | "3 1 11.42 20.38 77.58 386.1 \n", 204 | "4 1 20.29 14.34 135.10 1297.0 \n", 205 | "\n", 206 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 207 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 208 | "1 0.08474 0.07864 0.0869 0.07017 \n", 209 | "2 0.10960 0.15990 0.1974 0.12790 \n", 210 | "3 0.14250 0.28390 0.2414 0.10520 \n", 211 | "4 0.10030 0.13280 0.1980 0.10430 \n", 212 | "\n", 213 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 214 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 215 | "1 0.1812 ... 24.99 23.41 158.80 \n", 216 | "2 0.2069 ... 23.57 25.53 152.50 \n", 217 | "3 0.2597 ... 14.91 26.50 98.87 \n", 218 | "4 0.1809 ... 22.54 16.67 152.20 \n", 219 | "\n", 220 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 221 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 222 | "1 1956.0 0.1238 0.1866 0.2416 \n", 223 | "2 1709.0 0.1444 0.4245 0.4504 \n", 224 | "3 567.7 0.2098 0.8663 0.6869 \n", 225 | "4 1575.0 0.1374 0.2050 0.4000 \n", 226 | "\n", 227 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 228 | "0 0.2654 0.4601 0.11890 \n", 229 | "1 0.1860 0.2750 0.08902 \n", 230 | "2 0.2430 0.3613 0.08758 \n", 231 | "3 0.2575 0.6638 0.17300 \n", 232 | "4 0.1625 0.2364 0.07678 \n", 233 | "\n", 234 | "[5 rows x 31 columns]" 235 | ] 236 | }, 237 | "execution_count": 2, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 244 | "df.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "id": "5dee842c", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis',axis=1), df.diagnosis, test_size=0.33, random_state=42)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 14, 260 | "id": "005707bd", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "rfe_select = RFE(DecisionTreeClassifier(),n_features_to_select=0.25).fit(X_train,y_train)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "id": "8c02d803", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "array([False, False, False, False, False, False, False, True, False,\n", 277 | " False, True, False, False, False, False, False, False, False,\n", 278 | " False, True, True, True, True, False, False, False, True,\n", 279 | " False, False, False])" 280 | ] 281 | }, 282 | "execution_count": 16, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "rfe_select.support_" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 17, 294 | "id": "289c7fff", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "important_features = pd.DataFrame(list(zip(X_train.columns,rfe_select.support_)),columns=['features','important'])" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 18, 304 | "id": "39055606", 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/html": [ 310 | "
\n", 311 | "\n", 324 | "\n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | "
featuresimportant
26concavity_worstTrue
7concave points_meanTrue
22perimeter_worstTrue
10radius_seTrue
21texture_worstTrue
20radius_worstTrue
19fractal_dimension_seTrue
0radius_meanFalse
16concavity_seFalse
28symmetry_worstFalse
27concave points_worstFalse
25compactness_worstFalse
24smoothness_worstFalse
23area_worstFalse
18symmetry_seFalse
17concave points_seFalse
15compactness_seFalse
1texture_meanFalse
14smoothness_seFalse
13area_seFalse
12perimeter_seFalse
11texture_seFalse
9fractal_dimension_meanFalse
8symmetry_meanFalse
6concavity_meanFalse
5compactness_meanFalse
4smoothness_meanFalse
3area_meanFalse
2perimeter_meanFalse
29fractal_dimension_worstFalse
\n", 485 | "
" 486 | ], 487 | "text/plain": [ 488 | " features important\n", 489 | "26 concavity_worst True\n", 490 | "7 concave points_mean True\n", 491 | "22 perimeter_worst True\n", 492 | "10 radius_se True\n", 493 | "21 texture_worst True\n", 494 | "20 radius_worst True\n", 495 | "19 fractal_dimension_se True\n", 496 | "0 radius_mean False\n", 497 | "16 concavity_se False\n", 498 | "28 symmetry_worst False\n", 499 | "27 concave points_worst False\n", 500 | "25 compactness_worst False\n", 501 | "24 smoothness_worst False\n", 502 | "23 area_worst False\n", 503 | "18 symmetry_se False\n", 504 | "17 concave points_se False\n", 505 | "15 compactness_se False\n", 506 | "1 texture_mean False\n", 507 | "14 smoothness_se False\n", 508 | "13 area_se False\n", 509 | "12 perimeter_se False\n", 510 | "11 texture_se False\n", 511 | "9 fractal_dimension_mean False\n", 512 | "8 symmetry_mean False\n", 513 | "6 concavity_mean False\n", 514 | "5 compactness_mean False\n", 515 | "4 smoothness_mean False\n", 516 | "3 area_mean False\n", 517 | "2 perimeter_mean False\n", 518 | "29 fractal_dimension_worst False" 519 | ] 520 | }, 521 | "execution_count": 18, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "important_features.sort_values(by='important',ascending=False)" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3 (ipykernel)", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.9.7" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /Scripts/UK_PM_Sentiment_Analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Environment setup and Log in to Twitter 5 | 6 | # In[7]: 7 | 8 | 9 | import selenium 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.common.keys import Keys 13 | from time import sleep 14 | import getpass 15 | 16 | 17 | # In[12]: 18 | 19 | 20 | my_user = "LearnereaBot" 21 | # my_pass = getpass.getpass() 22 | my_pass = "Jamesbond#0016" 23 | 24 | # In[16]: 25 | 26 | 27 | search_item = "Liz Truss" 28 | 29 | 30 | # In[9]: 31 | 32 | 33 | PATH = "C:\Program Files\drivers\chromedriver_103.exe" 34 | driver = webdriver.Chrome(PATH) 35 | driver.get("https://twitter.com/i/flow/login") 36 | # driver.maximize_window() 37 | sleep(3) 38 | 39 | 40 | # In[10]: 41 | 42 | 43 | user_id = driver.find_element(By.XPATH,"//input[@type='text']") 44 | user_id.send_keys(my_user) 45 | user_id.send_keys(Keys.ENTER) 46 | 47 | 48 | # In[13]: 49 | 50 | 51 | password = driver.find_element(By.XPATH,"//input[@type='password']") 52 | password.send_keys(my_pass) 53 | password.send_keys(Keys.ENTER) 54 | 55 | 56 | # # Scrape Tweets mentioning about Liz Truss 57 | 58 | # In[18]: 59 | 60 | 61 | search_box = driver.find_element(By.XPATH,"//input[@data-testid='SearchBox_Search_Input']") 62 | search_box.send_keys(search_item) 63 | search_box.send_keys(Keys.ENTER) 64 | 65 | 66 | # In[24]: 67 | 68 | 69 | all_tweets = set() 70 | 71 | 72 | tweets = 
driver.find_elements(By.XPATH,"//div[@data-testid='tweetText']") 73 | while True: 74 | for tweet in tweets: 75 | all_tweets.add(tweet.text) 76 | driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') 77 | sleep(3) 78 | tweets = driver.find_elements(By.XPATH,"//div[@data-testid='tweetText']") 79 | if len(all_tweets)>50: 80 | break 81 | 82 | 83 | # In[25]: 84 | 85 | 86 | all_tweets = list(all_tweets) 87 | all_tweets[0] 88 | 89 | 90 | # # Cleaning the Tweets 91 | 92 | # In[64]: 93 | 94 | 95 | import pandas as pd 96 | pd.options.display.max_colwidth = 1000 97 | import re 98 | import nltk 99 | nltk.download('punkt') 100 | nltk.download('stopwords') 101 | 102 | from nltk.corpus import stopwords 103 | from nltk.tokenize import word_tokenize 104 | 105 | 106 | # In[66]: 107 | 108 | 109 | stp_words = stopwords.words('english') 110 | print(stp_words) 111 | 112 | 113 | # In[56]: 114 | 115 | 116 | df = pd.DataFrame(all_tweets,columns=['tweets']) 117 | df.head() 118 | 119 | 120 | # In[60]: 121 | 122 | 123 | one_tweet=df.iloc[4]['tweets'] 124 | one_tweet 125 | 126 | 127 | # In[86]: 128 | 129 | 130 | from textblob import TextBlob 131 | from wordcloud import WordCloud 132 | 133 | def TweetCleaning(tweet): 134 | cleanTweet = re.sub(r"@[a-zA-Z0-9]+","",tweet) 135 | cleanTweet = re.sub(r"#[a-zA-Z0-9\s]+","",cleanTweet) 136 | cleanTweet = ' '.join(word for word in cleanTweet.split() if word not in stp_words) 137 | return cleanTweet 138 | 139 | def calPolarity(tweet): 140 | return TextBlob(tweet).sentiment.polarity 141 | 142 | def calSubjectivity(tweet): 143 | return TextBlob(tweet).sentiment.subjectivity 144 | 145 | def segmentation(tweet): 146 | if tweet > 0: 147 | return "positive" 148 | if tweet == 0: 149 | return "neutral" 150 | else: 151 | return "negative" 152 | 153 | 154 | # In[88]: 155 | 156 | 157 | df['cleanedTweets'] = df['tweets'].apply(TweetCleaning) 158 | df['tPolarity'] = df['cleanedTweets'].apply(calPolarity) 159 | df['tSubjectivity'] = df['cleanedTweets'].apply(calSubjectivity) 160 | df['segmentation'] = df['tPolarity'].apply(segmentation) 161 | df.head() 162 | 163 | 164 | # # Analysis and Visualization 165 | 166 | # In[91]: 167 | 168 | 169 | df.pivot_table(index=['segmentation'],aggfunc={'segmentation':'count'}) 170 | 171 | 172 | # In[94]: 173 | 174 | 175 | # top 3 most positive 176 | df.sort_values(by=['tPolarity'],ascending=False).head(3) 177 | 178 | 179 | # In[95]: 180 | 181 | 182 | # top 3 most negative 183 | df.sort_values(by=['tPolarity'],ascending=True).head(3) 184 | 185 | 186 | # In[116]: 187 | 188 | 189 | # 3 neutral 190 | df[df.tPolarity==0] 191 | 192 | 193 | # In[103]: 194 | 195 | 196 | import matplotlib.pyplot as plt 197 | 198 | consolidated = ' '.join(word for word in df['cleanedTweets']) 199 | 200 | wordCloud = WordCloud(width=400, height=200, random_state=20, max_font_size=119).generate(consolidated) 201 | 202 | plt.imshow(wordCloud, interpolation='bilinear') 203 | plt.axis('off') 204 | plt.show() 205 | 206 | 207 | # In[104]: 208 | 209 | 210 | import seaborn as sns 211 | 212 | 213 | # In[113]: 214 | 215 | 216 | df.groupby('segmentation').count() 217 | 218 | 219 | # In[115]: 220 | 221 | 222 | plt.figure(figsize=(10,5)) 223 | sns.set_style("whitegrid") 224 | sns.scatterplot(data=df, x='tPolarity',y='tSubjectivity',s=100,hue='segmentation') 225 | 226 | 227 | # In[117]: 228 | 229 | 230 | sns.countplot(data=df,x='segmentation') 231 | 232 | 233 | # In[118]: 234 | 235 | 236 | positive = round(len(df[df.segmentation == 'positive'])/len(df)*100,1) 237 | negative = 
round(len(df[df.segmentation == 'negative'])/len(df)*100,1) 238 | neutral = round(len(df[df.segmentation == 'neutral'])/len(df)*100,1) 239 | 240 | responses = [positive, negative, neutral] 241 | responses 242 | 243 | response = {'resp': ['mayWin', 'mayLoose', 'notSure'], 'pct':[positive, negative, neutral]} 244 | pd.DataFrame(response) 245 | 246 | -------------------------------------------------------------------------------- /Scripts/UsingAutomatedModulesAndModels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "600be598", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import FraudDetectionModelDataPrep as dataPrep" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "b8af91ab", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "['LabelEncoder',\n", 23 | " 'LogisticRegression',\n", 24 | " 'StratifiedKFold',\n", 25 | " '__builtins__',\n", 26 | " '__cached__',\n", 27 | " '__doc__',\n", 28 | " '__file__',\n", 29 | " '__loader__',\n", 30 | " '__name__',\n", 31 | " '__package__',\n", 32 | " '__spec__',\n", 33 | " 'classification_report',\n", 34 | " 'confusion_matrix',\n", 35 | " 'dataPrepAutomation',\n", 36 | " 'np',\n", 37 | " 'pd',\n", 38 | " 'plt',\n", 39 | " 'smoteBalancing',\n", 40 | " 'sns',\n", 41 | " 'train_test_split']" 42 | ] 43 | }, 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "dir(dataPrep)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "id": "3a7b6687", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "inputDF = dataPrep.pd.read_excel(r\"D:\\Learnerea\\Tables\\to_test_the_model.xlsx\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "id": "89006010", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "(4500, 11)" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "inputDF.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "id": "1cd83ecc", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "features, target = dataPrep.dataPrepAutomation(inputDF)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 9, 97 | "id": "d71a7250", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFlaggedFraudCASH_INCASH_OUTDEBITPAYMENTTRANSFER
030313363860.780.000.0013656437.6227020298.400FalseFalseFalseFalseTrue
1355340375.649758.510.004764447.255104822.890FalseFalseFalseFalseTrue
2404276737.580.000.00962518.441239256.030FalseTrueFalseFalseFalse
3258314027.77650090.00336062.230.00314027.770FalseTrueFalseFalseFalse
43633468.450.000.00149332.38182800.830FalseTrueFalseFalseFalse
\n", 213 | "
" 214 | ], 215 | "text/plain": [ 216 | " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \n", 217 | "0 303 13363860.78 0.00 0.00 13656437.62 \\\n", 218 | "1 355 340375.64 9758.51 0.00 4764447.25 \n", 219 | "2 404 276737.58 0.00 0.00 962518.44 \n", 220 | "3 258 314027.77 650090.00 336062.23 0.00 \n", 221 | "4 36 33468.45 0.00 0.00 149332.38 \n", 222 | "\n", 223 | " newbalanceDest isFlaggedFraud CASH_IN CASH_OUT DEBIT PAYMENT TRANSFER \n", 224 | "0 27020298.40 0 False False False False True \n", 225 | "1 5104822.89 0 False False False False True \n", 226 | "2 1239256.03 0 False True False False False \n", 227 | "3 314027.77 0 False True False False False \n", 228 | "4 182800.83 0 False True False False False " 229 | ] 230 | }, 231 | "execution_count": 9, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "features.head()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "id": "239d3bc7", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "import pickle" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "id": "4ecb9787", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "with open(\"ensemble_model.pkl\",\"rb\") as file:\n", 258 | " ensemble_model = pickle.load(file)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 14, 264 | "id": "81c2d452", 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "array([0, 0, 0, ..., 1, 1, 1], dtype=int64)" 271 | ] 272 | }, 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "prediction = ensemble_model.predict(features)\n", 280 | "prediction" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 17, 286 | "id": "b6e2ed5e", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | " precision recall f1-score support\n", 294 | "\n", 295 | " 0 1.00 0.99 0.99 4000\n", 296 | " 1 0.91 1.00 0.95 500\n", 297 | "\n", 298 | " accuracy 0.99 4500\n", 299 | " macro avg 0.96 0.99 0.97 4500\n", 300 | "weighted avg 0.99 0.99 0.99 4500\n", 301 | "\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "print(dataPrep.classification_report(target,prediction))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 18, 312 | "id": "9ee6aca0", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "with open(\"smote_logistic.pkl\",\"rb\") as file:\n", 317 | " smote_logistic = pickle.load(file)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 19, 323 | "id": "888f1eec", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "array([0, 0, 0, ..., 1, 1, 1], dtype=int64)" 330 | ] 331 | }, 332 | "execution_count": 19, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "smoteLogPred = smote_logistic.predict(features)\n", 339 | "smoteLogPred" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 20, 345 | "id": "192cf23d", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | " precision recall f1-score support\n", 353 | "\n", 354 | " 0 0.99 0.92 0.95 4000\n", 355 | " 1 0.58 0.90 0.71 500\n", 356 | "\n", 357 | " accuracy 0.92 4500\n", 358 | " macro avg 
0.79 0.91 0.83 4500\n", 359 | "weighted avg 0.94 0.92 0.92 4500\n", 360 | "\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "print(dataPrep.classification_report(target,smoteLogPred))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 21, 371 | "id": "4973418f", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "smoteLogProb = smote_logistic.predict_proba(features)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 24, 381 | "id": "270284c5", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "smoteLogProbdata = smoteLogProb[:,1]" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 27, 391 | "id": "8e19858c", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "finalData = dataPrep.pd.DataFrame(zip(target,prediction,smoteLogPred,smoteLogProbdata),columns=['isFraud','ensem_pred','smote_pred','smote_prob'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 28, 401 | "id": "5eb1e71a", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFlaggedFraudCASH_INCASH_OUTDEBITPAYMENTTRANSFERisFraudensem_predsmote_predsmote_prob
030313363860.780.000.0013656437.6227020298.400FalseFalseFalseFalseTrue0004.550288e-79
1355340375.649758.510.004764447.255104822.890FalseFalseFalseFalseTrue0004.131617e-03
2404276737.580.000.00962518.441239256.030FalseTrueFalseFalseFalse0008.529435e-03
3258314027.77650090.00336062.230.00314027.770FalseTrueFalseFalseFalse0019.799595e-01
43633468.450.000.00149332.38182800.830FalseTrueFalseFalseFalse0003.666744e-01
\n", 541 | "
" 542 | ], 543 | "text/plain": [ 544 | " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \n", 545 | "0 303 13363860.78 0.00 0.00 13656437.62 \\\n", 546 | "1 355 340375.64 9758.51 0.00 4764447.25 \n", 547 | "2 404 276737.58 0.00 0.00 962518.44 \n", 548 | "3 258 314027.77 650090.00 336062.23 0.00 \n", 549 | "4 36 33468.45 0.00 0.00 149332.38 \n", 550 | "\n", 551 | " newbalanceDest isFlaggedFraud CASH_IN CASH_OUT DEBIT PAYMENT \n", 552 | "0 27020298.40 0 False False False False \\\n", 553 | "1 5104822.89 0 False False False False \n", 554 | "2 1239256.03 0 False True False False \n", 555 | "3 314027.77 0 False True False False \n", 556 | "4 182800.83 0 False True False False \n", 557 | "\n", 558 | " TRANSFER isFraud ensem_pred smote_pred smote_prob \n", 559 | "0 True 0 0 0 4.550288e-79 \n", 560 | "1 True 0 0 0 4.131617e-03 \n", 561 | "2 False 0 0 0 8.529435e-03 \n", 562 | "3 False 0 0 1 9.799595e-01 \n", 563 | "4 False 0 0 0 3.666744e-01 " 564 | ] 565 | }, 566 | "execution_count": 28, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "finalOutcome = dataPrep.pd.concat([features,finalData],axis=1)\n", 573 | "finalOutcome.head()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 29, 579 | "id": "e0cc301b", 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "finalOutcome.to_excel(r\"D:\\Learnerea\\Tables\\dataforAnalysis.xlsx\")" 584 | ] 585 | } 586 | ], 587 | "metadata": { 588 | "kernelspec": { 589 | "display_name": "Python 3 (ipykernel)", 590 | "language": "python", 591 | "name": "python3" 592 | }, 593 | "language_info": { 594 | "codemirror_mode": { 595 | "name": "ipython", 596 | "version": 3 597 | }, 598 | "file_extension": ".py", 599 | "mimetype": "text/x-python", 600 | "name": "python", 601 | "nbconvert_exporter": "python", 602 | "pygments_lexer": "ipython3", 603 | "version": "3.9.7" 604 | } 605 | }, 606 | "nbformat": 4, 607 | "nbformat_minor": 5 608 | } 609 | -------------------------------------------------------------------------------- /Scripts/XGBpdModel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/XGBpdModel.pkl -------------------------------------------------------------------------------- /Scripts/airbnb_Host_Popularity_Rental_Prices_Hard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | pd.options.display.max_colwidth=1000 10 | 11 | 12 | # In[4]: 13 | 14 | 15 | ab = pd.read_excel(r"D:\Learnerea\Tables\airbnb_host_searches.xlsx").drop('id',axis=1) 16 | ab.head() 17 | 18 | 19 | # In[6]: 20 | 21 | 22 | ab.duplicated().value_counts() 23 | 24 | 25 | # In[7]: 26 | 27 | 28 | ab2=ab.drop_duplicates() 29 | ab2.duplicated().value_counts() 30 | 31 | 32 | # In[9]: 33 | 34 | 35 | ab2.dtypes 36 | 37 | 38 | # In[11]: 39 | 40 | 41 | ab2['host_id'] = str(ab2['price'])+ab2['room_type']+str(ab2['host_since'])+str(ab2['zipcode'])+str(ab2['number_of_reviews']) 42 | ab2.head() 43 | 44 | 45 | # In[12]: 46 | 47 | 48 | ab2['host_popularity'] = ab2['number_of_reviews'].apply(lambda x: 'New' if x==0 49 | else 'Rising' if x>=1 and x<=5 50 | else 'Trending Up' if x>=6 and x<=15 51 | else 'Popular' if x>=16 and x<=40 52 | else 'Hot' 53 | ) 54 | 55 | 56 | # In[17]: 57 | 58 | 59 | summary = 
ab2.pivot_table(index='host_popularity',values='price',aggfunc={'price':['min','mean','max']}).reset_index() 60 | summary.rename(columns={'max':'max_price', 61 | 'min':'min_price', 62 | 'mean':'avg_price'}) 63 | 64 | -------------------------------------------------------------------------------- /Scripts/airbnb_Ranking_Most_Active_Guests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[5]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[6]: 11 | 12 | 13 | ab = pd.read_excel(r"D:\Learnerea\Tables\airbnb_contacts.xlsx") 14 | ab.head() 15 | 16 | 17 | # In[14]: 18 | 19 | 20 | ab_agg=ab.pivot_table(index='id_guest',values='n_messages',aggfunc='sum').reset_index().sort_values(by='n_messages',ascending=False) 21 | ab_agg 22 | 23 | 24 | # In[21]: 25 | 26 | 27 | ab_agg['rank'] = ab_agg['n_messages'].rank(ascending=False,method='dense') 28 | ab_agg.head(10) 29 | 30 | -------------------------------------------------------------------------------- /Scripts/amazon_interview_question.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | amazon = pd.read_excel(r"D:\Learnerea\Tables\amazon_trans.xlsx") 14 | amazon.head() 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | amazon.sort_values(by=['user_id','created_at'],inplace=True) 21 | amazon.head() 22 | 23 | 24 | # In[8]: 25 | 26 | 27 | amazon['day_diff']=amazon.groupby('user_id')['created_at'].diff() 28 | amazon.head(10) 29 | 30 | 31 | # In[9]: 32 | 33 | 34 | amazon.info() 35 | 36 | 37 | # In[11]: 38 | 39 | 40 | amazon[amazon['day_diff']<= pd.Timedelta('7 days')]['user_id'].unique() 41 | 42 | 43 | # In[ ]: 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Scripts/backward_elimintation_method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "id": "585ac163", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "\n", 14 | "from sklearn.linear_model import LogisticRegression\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "\n", 17 | "from mlxtend.feature_selection import SequentialFeatureSelector\n", 18 | "\n", 19 | "warnings.filterwarnings('ignore')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 22, 25 | "id": "b3794176", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 23, 35 | "id": "7cca26cd", 36 | "metadata": { 37 | "scrolled": true 38 | }, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 207 | "

5 rows × 31 columns

\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 212 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 213 | "1 1 20.57 17.77 132.90 1326.0 \n", 214 | "2 1 19.69 21.25 130.00 1203.0 \n", 215 | "3 1 11.42 20.38 77.58 386.1 \n", 216 | "4 1 20.29 14.34 135.10 1297.0 \n", 217 | "\n", 218 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 219 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 220 | "1 0.08474 0.07864 0.0869 0.07017 \n", 221 | "2 0.10960 0.15990 0.1974 0.12790 \n", 222 | "3 0.14250 0.28390 0.2414 0.10520 \n", 223 | "4 0.10030 0.13280 0.1980 0.10430 \n", 224 | "\n", 225 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 226 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 227 | "1 0.1812 ... 24.99 23.41 158.80 \n", 228 | "2 0.2069 ... 23.57 25.53 152.50 \n", 229 | "3 0.2597 ... 14.91 26.50 98.87 \n", 230 | "4 0.1809 ... 22.54 16.67 152.20 \n", 231 | "\n", 232 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 233 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 234 | "1 1956.0 0.1238 0.1866 0.2416 \n", 235 | "2 1709.0 0.1444 0.4245 0.4504 \n", 236 | "3 567.7 0.2098 0.8663 0.6869 \n", 237 | "4 1575.0 0.1374 0.2050 0.4000 \n", 238 | "\n", 239 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 240 | "0 0.2654 0.4601 0.11890 \n", 241 | "1 0.1860 0.2750 0.08902 \n", 242 | "2 0.2430 0.3613 0.08758 \n", 243 | "3 0.2575 0.6638 0.17300 \n", 244 | "4 0.1625 0.2364 0.07678 \n", 245 | "\n", 246 | "[5 rows x 31 columns]" 247 | ] 248 | }, 249 | "execution_count": 23, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "df.head()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 24, 261 | "id": "e04fc892", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis',axis=1), df.diagnosis, test_size=0.33, random_state=42)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 25, 271 | "id": "1ce45e07", 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stderr", 276 | "output_type": "stream", 277 | "text": [ 278 | "[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.8s\n", 279 | "\n", 280 | "[2024-04-06 11:15:02] Features: 29/1 -- score: 0.9448735475051265[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 281 | "\n", 282 | "[2024-04-06 11:15:05] Features: 28/1 -- score: 0.9449077238550923[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 283 | "\n", 284 | "[2024-04-06 11:15:08] Features: 27/1 -- score: 0.9553315105946684[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.7s\n", 285 | "\n", 286 | "[2024-04-06 11:15:11] Features: 26/1 -- score: 0.9579630895420369[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.7s\n", 287 | "\n", 288 | "[2024-04-06 11:15:13] Features: 25/1 -- score: 0.9632604237867396[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 289 | "\n", 290 | "[2024-04-06 11:15:16] Features: 24/1 -- score: 0.9632946001367054[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 291 | "\n", 292 | "[2024-04-06 11:15:18] Features: 23/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.5s\n", 293 | "\n", 294 | "[2024-04-06 11:15:21] Features: 22/1 -- score: 0.9658578263841422[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.3s\n", 295 | "\n", 296 | "[2024-04-06 11:15:23] Features: 21/1 -- score: 0.9580314422419687[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 
1.6s\n", 297 | "\n", 298 | "[2024-04-06 11:15:25] Features: 20/1 -- score: 0.9606630211893371[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 299 | "\n", 300 | "[2024-04-06 11:15:27] Features: 19/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 301 | "\n", 302 | "[2024-04-06 11:15:29] Features: 18/1 -- score: 0.9684894053315105[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 303 | "\n", 304 | "[2024-04-06 11:15:30] Features: 17/1 -- score: 0.9632262474367739[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 305 | "[Parallel(n_jobs=1)]: Done 17 tasks | elapsed: 1.6s\n", 306 | "\n", 307 | "[2024-04-06 11:15:32] Features: 16/1 -- score: 0.9579972658920027\n", 308 | "[2024-04-06 11:15:34] Features: 15/1 -- score: 0.9606288448393713\n", 309 | "[2024-04-06 11:15:35] Features: 14/1 -- score: 0.9658578263841422\n", 310 | "[2024-04-06 11:15:37] Features: 13/1 -- score: 0.9632604237867396\n", 311 | "[2024-04-06 11:15:38] Features: 12/1 -- score: 0.9606288448393713\n", 312 | "[2024-04-06 11:15:39] Features: 11/1 -- score: 0.9606288448393713\n", 313 | "[2024-04-06 11:15:40] Features: 10/1 -- score: 0.9579972658920027\n", 314 | "[2024-04-06 11:15:41] Features: 9/1 -- score: 0.9606288448393713\n", 315 | "[2024-04-06 11:15:42] Features: 8/1 -- score: 0.960628844839371\n", 316 | "[2024-04-06 11:15:43] Features: 7/1 -- score: 0.9632262474367739\n", 317 | "[2024-04-06 11:15:44] Features: 6/1 -- score: 0.9606288448393713\n", 318 | "[2024-04-06 11:15:44] Features: 5/1 -- score: 0.9396445659603554\n", 319 | "[2024-04-06 11:15:44] Features: 4/1 -- score: 0.9396103896103896\n", 320 | "[2024-04-06 11:15:45] Features: 3/1 -- score: 0.902904989747095\n", 321 | "[2024-04-06 11:15:45] Features: 2/1 -- score: 0.8845181134654819\n", 322 | "[2024-04-06 11:15:45] Features: 1/1 -- score: 0.8556732740943268" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "feature_selector = SequentialFeatureSelector(LogisticRegression(),\n", 328 | "# k_features=5,\n", 329 | " forward=False,\n", 330 | " verbose=5,\n", 331 | " scoring='accuracy',\n", 332 | " cv=5\n", 333 | " ).fit(X_train,y_train)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 18, 339 | "id": "604c2cfe", 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "('area_se',)" 346 | ] 347 | }, 348 | "execution_count": 18, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "feature_selector.k_feature_names_" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3 (ipykernel)", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.9.7" 375 | } 376 | }, 377 | "nbformat": 4, 378 | "nbformat_minor": 5 379 | } 380 | -------------------------------------------------------------------------------- /Scripts/ensemble_model.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/ensemble_model.zip -------------------------------------------------------------------------------- /Scripts/facebook_Acceptance_Rate_By_Date.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | fb = pd.read_excel(r"D:\Learnerea\Tables\fb_friend_requests.xlsx").sort_values(['user_id_sender','date']) 14 | fb 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | sent = fb.query("action=='sent'") 21 | sent 22 | 23 | 24 | # In[4]: 25 | 26 | 27 | accepted = fb.query("action=='accepted'") 28 | accepted 29 | 30 | 31 | # In[6]: 32 | 33 | 34 | combined = sent.merge(accepted,on=['user_id_sender','user_id_receiver'],how='left') 35 | combined 36 | 37 | 38 | # In[14]: 39 | 40 | 41 | summary = combined.pivot_table(index='date_x',values=['action_x','action_y'],aggfunc='count').reset_index() 42 | summary['acceptance_rate'] = summary.action_y/summary.action_x 43 | summary[['date_x','acceptance_rate']] 44 | 45 | -------------------------------------------------------------------------------- /Scripts/facebook_interview_question.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | fb = pd.read_excel(r"D:\Learnerea\Tables\facebook_web_log.xlsx") 14 | fb.head() 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | fb = fb.query("action in ('page_load','page_exit')") 21 | fb['page_exit'] = fb.apply(lambda x: x['timestamp'] if x['action']=='page_exit' else None, axis=1) 22 | fb['page_load'] = fb.apply(lambda x: x['timestamp'] if x['action']=='page_load' else None, axis=1) 23 | fb['page_loadN']=fb['page_load'].fillna(method='ffill') 24 | fb = fb.dropna(subset=['page_exit']) 25 | 26 | 27 | # In[4]: 28 | 29 | 30 | fb = fb[['user_id','page_exit','page_loadN']] 31 | fb['sec_diff'] = (fb['page_exit']-fb['page_loadN']).dt.total_seconds() 32 | fb 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | fb.pivot_table(index='user_id',values='sec_diff',aggfunc='mean').reset_index() 39 | 40 | -------------------------------------------------------------------------------- /Scripts/feature_selection_with_LASSO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "f9eeffca", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import Lasso, Ridge \n", 15 | "from sklearn.preprocessing import StandardScaler\n", 16 | "\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "ce524d58", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 194 | "

5 rows × 31 columns

\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 199 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 200 | "1 1 20.57 17.77 132.90 1326.0 \n", 201 | "2 1 19.69 21.25 130.00 1203.0 \n", 202 | "3 1 11.42 20.38 77.58 386.1 \n", 203 | "4 1 20.29 14.34 135.10 1297.0 \n", 204 | "\n", 205 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 206 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 207 | "1 0.08474 0.07864 0.0869 0.07017 \n", 208 | "2 0.10960 0.15990 0.1974 0.12790 \n", 209 | "3 0.14250 0.28390 0.2414 0.10520 \n", 210 | "4 0.10030 0.13280 0.1980 0.10430 \n", 211 | "\n", 212 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 213 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 214 | "1 0.1812 ... 24.99 23.41 158.80 \n", 215 | "2 0.2069 ... 23.57 25.53 152.50 \n", 216 | "3 0.2597 ... 14.91 26.50 98.87 \n", 217 | "4 0.1809 ... 22.54 16.67 152.20 \n", 218 | "\n", 219 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 220 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 221 | "1 1956.0 0.1238 0.1866 0.2416 \n", 222 | "2 1709.0 0.1444 0.4245 0.4504 \n", 223 | "3 567.7 0.2098 0.8663 0.6869 \n", 224 | "4 1575.0 0.1374 0.2050 0.4000 \n", 225 | "\n", 226 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 227 | "0 0.2654 0.4601 0.11890 \n", 228 | "1 0.1860 0.2750 0.08902 \n", 229 | "2 0.2430 0.3613 0.08758 \n", 230 | "3 0.2575 0.6638 0.17300 \n", 231 | "4 0.1625 0.2364 0.07678 \n", 232 | "\n", 233 | "[5 rows x 31 columns]" 234 | ] 235 | }, 236 | "execution_count": 2, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 243 | "df.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 3, 249 | "id": "47d76b81", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X = df.drop('diagnosis',axis=1)\n", 254 | "y = df.diagnosis" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 4, 260 | "id": "a82c4233", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# Scaling the data\n", 265 | "scaler = StandardScaler()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 5, 271 | "id": "1d8cd46e", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "X_scaled = scaler.fit_transform(X)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "id": "842e7cd9", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 7, 291 | "id": "f9b65651", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "lasso_instance = Lasso(alpha=0.1)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 8, 301 | "id": "74cd0861", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/html": [ 307 | "
" 308 | ], 309 | "text/plain": [ 310 | "Lasso(alpha=0.1)" 311 | ] 312 | }, 313 | "execution_count": 8, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "lasso_instance.fit(X_train,y_train)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 9, 325 | "id": "753cc8f8", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "array([ 0. , 0. , 0. , 0. , 0. ,\n", 332 | " 0. , 0. , 0.03562742, 0. , -0. ,\n", 333 | " 0. , 0. , 0. , 0. , 0. ,\n", 334 | " 0. , 0. , 0. , 0. , -0. ,\n", 335 | " 0.11010092, 0.00635015, 0. , 0. , 0. ,\n", 336 | " 0. , 0. , 0.1696524 , 0. , 0. ])" 337 | ] 338 | }, 339 | "execution_count": 9, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "lasso_instance.coef_" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 33, 351 | "id": "0d85bd10", 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "['concave points_mean',\n", 358 | " 'radius_worst',\n", 359 | " 'texture_worst',\n", 360 | " 'concave points_worst']" 361 | ] 362 | }, 363 | "execution_count": 33, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "feature_importance = [feature for feature, coef in zip(X.columns,lasso_instance.coef_) if coef!=0]\n", 370 | "feature_importance" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 35, 376 | "id": "795d8619", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/html": [ 382 | "
\n", 383 | "\n", 396 | "\n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | "
featurecoef
20radius_worst0.295662
7concave points_mean0.235560
26concavity_worst0.174722
10radius_se0.173617
21texture_worst0.131943
0radius_mean0.105438
17concave points_se0.098285
25compactness_worst0.097376
2perimeter_mean0.072313
28symmetry_worst0.070803
14smoothness_se0.066666
6concavity_mean0.047451
29fractal_dimension_worst0.024626
4smoothness_mean0.023071
19fractal_dimension_se0.010603
12perimeter_se0.006057
9fractal_dimension_mean0.003472
18symmetry_se-0.012820
8symmetry_mean-0.015663
22perimeter_worst-0.020264
24smoothness_worst-0.020566
15compactness_se-0.029187
1texture_mean-0.039028
11texture_se-0.048607
27concave points_worst-0.093311
13area_se-0.113672
16concavity_se-0.129800
3area_mean-0.133755
23area_worst-0.208526
5compactness_mean-0.237037
\n", 557 | "
" 558 | ], 559 | "text/plain": [ 560 | " feature coef\n", 561 | "20 radius_worst 0.295662\n", 562 | "7 concave points_mean 0.235560\n", 563 | "26 concavity_worst 0.174722\n", 564 | "10 radius_se 0.173617\n", 565 | "21 texture_worst 0.131943\n", 566 | "0 radius_mean 0.105438\n", 567 | "17 concave points_se 0.098285\n", 568 | "25 compactness_worst 0.097376\n", 569 | "2 perimeter_mean 0.072313\n", 570 | "28 symmetry_worst 0.070803\n", 571 | "14 smoothness_se 0.066666\n", 572 | "6 concavity_mean 0.047451\n", 573 | "29 fractal_dimension_worst 0.024626\n", 574 | "4 smoothness_mean 0.023071\n", 575 | "19 fractal_dimension_se 0.010603\n", 576 | "12 perimeter_se 0.006057\n", 577 | "9 fractal_dimension_mean 0.003472\n", 578 | "18 symmetry_se -0.012820\n", 579 | "8 symmetry_mean -0.015663\n", 580 | "22 perimeter_worst -0.020264\n", 581 | "24 smoothness_worst -0.020566\n", 582 | "15 compactness_se -0.029187\n", 583 | "1 texture_mean -0.039028\n", 584 | "11 texture_se -0.048607\n", 585 | "27 concave points_worst -0.093311\n", 586 | "13 area_se -0.113672\n", 587 | "16 concavity_se -0.129800\n", 588 | "3 area_mean -0.133755\n", 589 | "23 area_worst -0.208526\n", 590 | "5 compactness_mean -0.237037" 591 | ] 592 | }, 593 | "execution_count": 35, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "features_importance_df.sort_values(by='coef',ascending=False)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "id": "30d2539f", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [] 609 | } 610 | ], 611 | "metadata": { 612 | "kernelspec": { 613 | "display_name": "Python 3 (ipykernel)", 614 | "language": "python", 615 | "name": "python3" 616 | }, 617 | "language_info": { 618 | "codemirror_mode": { 619 | "name": "ipython", 620 | "version": 3 621 | }, 622 | "file_extension": ".py", 623 | "mimetype": "text/x-python", 624 | "name": "python", 625 | "nbconvert_exporter": "python", 626 | "pygments_lexer": "ipython3", 627 | "version": "3.9.7" 628 | } 629 | }, 630 | "nbformat": 4, 631 | "nbformat_minor": 5 632 | } 633 | -------------------------------------------------------------------------------- /Scripts/feature_selection_with_elasticnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "id": "ba07c8db", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.linear_model import ElasticNet\n", 15 | "from sklearn.preprocessing import StandardScaler\n", 16 | "\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 30, 24 | "id": "df1b1eba", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 194 | "

5 rows × 31 columns

\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " diagnosis radius_mean texture_mean perimeter_mean area_mean \n", 199 | "0 1 17.99 10.38 122.80 1001.0 \\\n", 200 | "1 1 20.57 17.77 132.90 1326.0 \n", 201 | "2 1 19.69 21.25 130.00 1203.0 \n", 202 | "3 1 11.42 20.38 77.58 386.1 \n", 203 | "4 1 20.29 14.34 135.10 1297.0 \n", 204 | "\n", 205 | " smoothness_mean compactness_mean concavity_mean concave points_mean \n", 206 | "0 0.11840 0.27760 0.3001 0.14710 \\\n", 207 | "1 0.08474 0.07864 0.0869 0.07017 \n", 208 | "2 0.10960 0.15990 0.1974 0.12790 \n", 209 | "3 0.14250 0.28390 0.2414 0.10520 \n", 210 | "4 0.10030 0.13280 0.1980 0.10430 \n", 211 | "\n", 212 | " symmetry_mean ... radius_worst texture_worst perimeter_worst \n", 213 | "0 0.2419 ... 25.38 17.33 184.60 \\\n", 214 | "1 0.1812 ... 24.99 23.41 158.80 \n", 215 | "2 0.2069 ... 23.57 25.53 152.50 \n", 216 | "3 0.2597 ... 14.91 26.50 98.87 \n", 217 | "4 0.1809 ... 22.54 16.67 152.20 \n", 218 | "\n", 219 | " area_worst smoothness_worst compactness_worst concavity_worst \n", 220 | "0 2019.0 0.1622 0.6656 0.7119 \\\n", 221 | "1 1956.0 0.1238 0.1866 0.2416 \n", 222 | "2 1709.0 0.1444 0.4245 0.4504 \n", 223 | "3 567.7 0.2098 0.8663 0.6869 \n", 224 | "4 1575.0 0.1374 0.2050 0.4000 \n", 225 | "\n", 226 | " concave points_worst symmetry_worst fractal_dimension_worst \n", 227 | "0 0.2654 0.4601 0.11890 \n", 228 | "1 0.1860 0.2750 0.08902 \n", 229 | "2 0.2430 0.3613 0.08758 \n", 230 | "3 0.2575 0.6638 0.17300 \n", 231 | "4 0.1625 0.2364 0.07678 \n", 232 | "\n", 233 | "[5 rows x 31 columns]" 234 | ] 235 | }, 236 | "execution_count": 30, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df = pd.read_excel(r\"D:\\Learnerea\\Tables\\cancer_data_cleaned.xlsx\")\n", 243 | "df.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 44, 249 | "id": "4c203fad", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X = df.drop('diagnosis',axis=1)\n", 254 | "y = df.diagnosis" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 91, 260 | "id": "a7e5715a", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "scaler = StandardScaler()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 93, 270 | "id": "79f95af2", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "X_scaled = scaler.fit_transform(X)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 94, 280 | "id": "60d530ad", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 95, 290 | "id": "0dadfa19", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "elasticReg = ElasticNet(alpha=0.1,l1_ratio=0.5,random_state=42)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 96, 300 | "id": "9109c9bb", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
" 307 | ], 308 | "text/plain": [ 309 | "ElasticNet(alpha=0.1, random_state=42)" 310 | ] 311 | }, 312 | "execution_count": 96, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "elasticReg.fit(X_train,y_train)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 97, 324 | "id": "f8269ac7", 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "array([ 0. , 0. , 0. , 0. , 0. ,\n", 331 | " 0. , 0. , 0.07817021, 0. , -0. ,\n", 332 | " 0. , 0. , 0. , 0. , 0. ,\n", 333 | " 0. , -0. , 0. , 0. , -0. ,\n", 334 | " 0.08932816, 0.04304261, 0.04480352, 0. , 0. ,\n", 335 | " 0. , 0.01540593, 0.11816886, 0.01985054, 0. ])" 336 | ] 337 | }, 338 | "execution_count": 97, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "elasticReg.coef_" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 99, 350 | "id": "bb6c325a", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/html": [ 356 | "
\n", 357 | "\n", 370 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | "
featuresimportance
27concave points_worst0.118169
20radius_worst0.089328
7concave points_mean0.078170
22perimeter_worst0.044804
21texture_worst0.043043
28symmetry_worst0.019851
26concavity_worst0.015406
0radius_mean0.000000
16concavity_se-0.000000
25compactness_worst0.000000
24smoothness_worst0.000000
23area_worst0.000000
19fractal_dimension_se-0.000000
18symmetry_se0.000000
17concave points_se0.000000
15compactness_se0.000000
1texture_mean0.000000
14smoothness_se0.000000
13area_se0.000000
12perimeter_se0.000000
11texture_se0.000000
10radius_se0.000000
9fractal_dimension_mean-0.000000
8symmetry_mean0.000000
6concavity_mean0.000000
5compactness_mean0.000000
4smoothness_mean0.000000
3area_mean0.000000
2perimeter_mean0.000000
29fractal_dimension_worst0.000000
\n", 531 | "
" 532 | ], 533 | "text/plain": [ 534 | " features importance\n", 535 | "27 concave points_worst 0.118169\n", 536 | "20 radius_worst 0.089328\n", 537 | "7 concave points_mean 0.078170\n", 538 | "22 perimeter_worst 0.044804\n", 539 | "21 texture_worst 0.043043\n", 540 | "28 symmetry_worst 0.019851\n", 541 | "26 concavity_worst 0.015406\n", 542 | "0 radius_mean 0.000000\n", 543 | "16 concavity_se -0.000000\n", 544 | "25 compactness_worst 0.000000\n", 545 | "24 smoothness_worst 0.000000\n", 546 | "23 area_worst 0.000000\n", 547 | "19 fractal_dimension_se -0.000000\n", 548 | "18 symmetry_se 0.000000\n", 549 | "17 concave points_se 0.000000\n", 550 | "15 compactness_se 0.000000\n", 551 | "1 texture_mean 0.000000\n", 552 | "14 smoothness_se 0.000000\n", 553 | "13 area_se 0.000000\n", 554 | "12 perimeter_se 0.000000\n", 555 | "11 texture_se 0.000000\n", 556 | "10 radius_se 0.000000\n", 557 | "9 fractal_dimension_mean -0.000000\n", 558 | "8 symmetry_mean 0.000000\n", 559 | "6 concavity_mean 0.000000\n", 560 | "5 compactness_mean 0.000000\n", 561 | "4 smoothness_mean 0.000000\n", 562 | "3 area_mean 0.000000\n", 563 | "2 perimeter_mean 0.000000\n", 564 | "29 fractal_dimension_worst 0.000000" 565 | ] 566 | }, 567 | "execution_count": 99, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "featureImp = [(feature,coef) for feature, coef in zip(X.columns,elasticReg.coef_)]\n", 574 | "impDF = pd.DataFrame(featureImp,columns=['features','importance'])\n", 575 | "impDF.sort_values(by='importance',ascending=False)" 576 | ] 577 | } 578 | ], 579 | "metadata": { 580 | "kernelspec": { 581 | "display_name": "Python 3 (ipykernel)", 582 | "language": "python", 583 | "name": "python3" 584 | }, 585 | "language_info": { 586 | "codemirror_mode": { 587 | "name": "ipython", 588 | "version": 3 589 | }, 590 | "file_extension": ".py", 591 | "mimetype": "text/x-python", 592 | "name": "python", 593 | "nbconvert_exporter": "python", 594 | "pygments_lexer": "ipython3", 595 | "version": "3.9.7" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 5 600 | } 601 | -------------------------------------------------------------------------------- /Scripts/google_interview_question_Medium.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | 7 | import pandas as pd 8 | 9 | pd.options.display.max_colwidth = 1000 10 | 11 | 12 | # In[4]: 13 | 14 | 15 | df = pd.read_excel(r"D:\Learnerea\Tables\google_file_store.xlsx") 16 | df 17 | 18 | 19 | # In[12]: 20 | 21 | 22 | df['bull'] = df['contents'].apply(lambda x:x.count('bull')) 23 | df['bear'] = df['contents'].apply(lambda x:x.count('bear')) 24 | df['word'] = 'netry' 25 | df 26 | 27 | 28 | # In[16]: 29 | 30 | 31 | df.pivot_table(values=['bull','bear'],columns='word',aggfunc='sum').reset_index().sort_values(by='netry',ascending=False) 32 | 33 | -------------------------------------------------------------------------------- /Scripts/gradient_boosint_complete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[31]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.ensemble import GradientBoostingRegressor 11 | from sklearn.model_selection import GridSearchCV 12 | 13 | 14 | # In[2]: 15 | 16 | 17 | data = {"Income_K":[75,67,70,90,80] 18 | 
,"Profession":["Salaried","Business","Salaried","Business","Salaried"] 19 | ,"Loan_Lakhs":[3,2,5,7,4]} 20 | 21 | df = pd.DataFrame(data) 22 | df 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | le = LabelEncoder() 29 | 30 | 31 | # In[5]: 32 | 33 | 34 | df['prof_enc'] = le.fit_transform(df['Profession']) 35 | df 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | x = df[['Income_K','prof_enc']] 42 | y = df['Loan_Lakhs'] 43 | 44 | 45 | # In[27]: 46 | 47 | 48 | model = GradientBoostingRegressor(n_estimators=5) 49 | 50 | 51 | # In[28]: 52 | 53 | 54 | model.fit(x,y) 55 | 56 | 57 | # In[29]: 58 | 59 | 60 | prediction = model.predict(x) 61 | 62 | 63 | # In[30]: 64 | 65 | 66 | sum((y-prediction)**2)/len(y) 67 | 68 | 69 | # In[32]: 70 | 71 | 72 | params = {'n_estimators':range(1,200)} 73 | grid = GridSearchCV(param_grid=params,estimator=model,cv=2, scoring='neg_mean_squared_error') 74 | 75 | 76 | # In[33]: 77 | 78 | 79 | grid.fit(x,y) 80 | 81 | 82 | # In[34]: 83 | 84 | 85 | grid.best_estimator_ 86 | 87 | 88 | # In[35]: 89 | 90 | 91 | gb = grid.best_estimator_ 92 | 93 | 94 | # In[36]: 95 | 96 | 97 | gb.fit(x,y) 98 | 99 | 100 | # In[37]: 101 | 102 | 103 | preodiction = gb.predict(x) 104 | 105 | 106 | # In[38]: 107 | 108 | 109 | sum((y-prediction)**2)/len(y) 110 | 111 | -------------------------------------------------------------------------------- /Scripts/logisticPDmodel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/logisticPDmodel.pkl -------------------------------------------------------------------------------- /Scripts/smote_logistic.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEARNEREA/Data_Science/3c56acc99db1f53f41b33e65b1c4822877fdb52b/Scripts/smote_logistic.pkl -------------------------------------------------------------------------------- /Scripts/testing.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Scripts/time_series_air_passengers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | sns.get_dataset_names() 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | df = sns.load_dataset('flights') 23 | df['yearMonth'] = pd.to_datetime("01-"+df['month'].astype(str)+"-"+df['year'].astype(str)) 24 | df.set_index('yearMonth',inplace=True) 25 | df.head() 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | plt.figure(figsize=(10,5)) 32 | sns.lineplot(data=df,x=df.index,y=df.passengers) 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df['rollMean'] = df.passengers.rolling(window=12).mean() 39 | df['rollStd'] = df.passengers.rolling(window=12).std() 40 | 41 | 42 | # In[6]: 43 | 44 | 45 | plt.figure(figsize=(10,5)) 46 | sns.lineplot(data=df,x=df.index,y=df.passengers) 47 | sns.lineplot(data=df,x=df.index,y=df.rollMean) 48 | sns.lineplot(data=df,x=df.index,y=df.rollStd) 49 | 50 | 51 | # In[7]: 52 | 53 | 54 | from statsmodels.tsa.stattools import adfuller 55 | 56 | 57 | # In[8]: 58 | 59 | 60 | adfTest = adfuller(df['passengers'],autolag='AIC',) 61 | 62 | 63 | # In[9]: 64 | 65 | 66 | adfTest 67 | 68 | 69 | # In[10]: 70 | 71 | 72 | stats = 
pd.Series(adfTest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used']) 73 | stats 74 | 75 | 76 | # In[11]: 77 | 78 | 79 | for key, values in adfTest[4].items(): 80 | print('criticality',key,":",values) 81 | 82 | 83 | # In[12]: 84 | 85 | 86 | def test_stationarity(dataFrame, var): 87 | dataFrame['rollMean'] = dataFrame[var].rolling(window=12).mean() 88 | dataFrame['rollStd'] = dataFrame[var].rolling(window=12).std() 89 | 90 | from statsmodels.tsa.stattools import adfuller 91 | adfTest = adfuller(dataFrame[var],autolag='AIC') 92 | stats = pd.Series(adfTest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used']) 93 | print(stats) 94 | 95 | for key, values in adfTest[4].items(): 96 | print('criticality',key,":",values) 97 | 98 | sns.lineplot(data=dataFrame,x=dataFrame.index,y=var) 99 | sns.lineplot(data=dataFrame,x=dataFrame.index,y='rollMean') 100 | sns.lineplot(data=dataFrame,x=dataFrame.index,y='rollStd') 101 | 102 | 103 | # In[13]: 104 | 105 | 106 | air_df = df[['passengers']] 107 | air_df.head() 108 | 109 | 110 | # In[14]: 111 | 112 | 113 | # time shift 114 | 115 | air_df['shift'] = air_df.passengers.shift() 116 | air_df['shiftDiff'] = air_df['passengers'] - air_df['shift'] 117 | air_df.head() 118 | 119 | 120 | # In[15]: 121 | 122 | 123 | test_stationarity(air_df.dropna(),'shiftDiff') 124 | 125 | 126 | # In[16]: 127 | 128 | 129 | log_df = df[['passengers']] 130 | log_df['log'] = np.log(log_df['passengers']) 131 | log_df.head() 132 | 133 | 134 | # In[17]: 135 | 136 | 137 | test_stationarity(log_df,'log') 138 | 139 | 140 | # In[18]: 141 | 142 | 143 | sqrt_df = df[['passengers']] 144 | sqrt_df['sqrt'] = np.sqrt(df['passengers']) 145 | sqrt_df.head() 146 | 147 | 148 | # In[19]: 149 | 150 | 151 | test_stationarity(sqrt_df,'sqrt') 152 | 153 | 154 | # In[20]: 155 | 156 | 157 | cbrt_df = df[['passengers']] 158 | cbrt_df['cbrt'] = np.cbrt(cbrt_df['passengers']) 159 | cbrt_df.head() 160 | 161 | 162 | # In[21]: 163 | 164 | 165 | test_stationarity(cbrt_df,'cbrt') 166 | 167 | 168 | # In[22]: 169 | 170 | 171 | log_df2 = log_df[['passengers','log']] 172 | log_df2['log_sqrt'] = np.sqrt(log_df['log']) 173 | log_df2['logShiftDiff'] = log_df2['log_sqrt'] - log_df2['log_sqrt'].shift() 174 | log_df2.head() 175 | 176 | 177 | # In[23]: 178 | 179 | 180 | test_stationarity(log_df2.dropna(),'logShiftDiff') 181 | 182 | 183 | # In[24]: 184 | 185 | 186 | log_shift = df[['passengers']].copy(deep=True) 187 | log_shift['log'] = np.log(log_shift['passengers']) 188 | log_shift['logShift'] = log_shift['log'].shift() 189 | log_shift['logShiftDiff'] = log_shift['log'] - log_shift['logShift'] 190 | log_shift.head() 191 | 192 | 193 | # In[25]: 194 | 195 | 196 | test_stationarity(log_shift.dropna(),'logShiftDiff') 197 | 198 | 199 | # # Next - 2 200 | 201 | # In[80]: 202 | 203 | 204 | airP = df[['passengers']].copy(deep=True) 205 | airP['firstDiff'] = airP['passengers'].diff() 206 | airP['Diff12'] = airP['passengers'].diff(12) 207 | 208 | 209 | # In[81]: 210 | 211 | 212 | airP.head() 213 | 214 | 215 | # In[84]: 216 | 217 | 218 | from statsmodels.tsa.arima_model import ARIMA 219 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 220 | 221 | 222 | # In[86]: 223 | 224 | 225 | plot_pacf(airP['firstDiff'].dropna(),lags=20); 226 | 227 | 228 | # In[87]: 229 | 230 | 231 | plot_acf(airP['firstDiff'].dropna(),lags=20); 232 | 233 | 234 | # In[88]: 235 | 236 | 237 | # p = 1, q = 3, d =1 238 | 239 | 240 | # In[94]: 241 | 242 | 243 | train = airP[:round(len(airP)*70/100)] 
244 | test = airP[round(len(airP)*70/100):] 245 | test.head() 246 | 247 | 248 | # In[100]: 249 | 250 | 251 | model = ARIMA(train['passengers'],order=(1,1,3)) 252 | model_fit = model.fit() 253 | prediction = model_fit.predict(start=test.index[0],end=test.index[-1]) 254 | airP['arimaPred'] = prediction 255 | airP.tail() 256 | 257 | 258 | # In[121]: 259 | 260 | 261 | airP.dropna() 262 | sns.lineplot(data=airP,x=airP.index,y='passengers') 263 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 264 | 265 | 266 | # In[122]: 267 | 268 | 269 | from sklearn.metrics import mean_squared_error 270 | 271 | 272 | # In[124]: 273 | 274 | 275 | np.sqrt(mean_squared_error(test['passengers'],prediction)) 276 | 277 | 278 | # In[126]: 279 | 280 | 281 | from statsmodels.tsa.statespace.sarimax import SARIMAX 282 | 283 | 284 | # In[141]: 285 | 286 | 287 | plot_pacf(airP['Diff12'].dropna(),lags=20); 288 | plot_acf(airP['Diff12'].dropna(),lags=20); 289 | 290 | 291 | # In[147]: 292 | 293 | 294 | model = SARIMAX(train['passengers'],order=(1,1,3),seasonal_order=(2,1,2,12)) 295 | model_fit = model.fit() 296 | prediction = model_fit.predict(start=test.index[0],end=test.index[-1]) 297 | airP['sarimaxPred'] = prediction 298 | 299 | 300 | # In[196]: 301 | 302 | 303 | airP.dropna() 304 | sns.lineplot(data=airP,x=airP.index,y='passengers') 305 | sns.lineplot(data=airP,x=airP.index,y='sarimaxPred') 306 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 307 | # model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]).plot(color='black') 308 | 309 | 310 | # In[144]: 311 | 312 | 313 | np.sqrt(mean_squared_error(test['passengers'],prediction)) 314 | 315 | 316 | # In[188]: 317 | 318 | 319 | futureDate = pd.DataFrame(pd.date_range(start='1961-01-01', end='1962-12-01',freq='MS'),columns=['Dates']) 320 | futureDate.set_index('Dates',inplace=True) 321 | futureDate.head() 322 | 323 | 324 | # In[192]: 325 | 326 | 327 | model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]) 328 | 329 | 330 | # In[195]: 331 | 332 | 333 | airP.dropna() 334 | sns.lineplot(data=airP,x=airP.index,y='passengers') 335 | sns.lineplot(data=airP,x=airP.index,y='sarimaxPred') 336 | sns.lineplot(data=airP,x=airP.index,y='arimaPred') 337 | model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1]).plot(color='black') 338 | 339 | 340 | # # Next - 3 341 | 342 | # In[201]: 343 | 344 | 345 | checkDf = df[['passengers']] 346 | checkDf['diff1'] = checkDf.diff() 347 | # checkDf['diffInv'] = checkDf['diff1'].diffinv() 348 | checkDf.head() 349 | 350 | --------------------------------------------------------------------------------
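A note on the commented-out diffInv line in the final cell of time_series_air_passengers.py: pandas Series have no diffinv() method, so the first difference has to be inverted manually. A minimal sketch, assuming it runs right after that last cell so checkDf already holds the 'passengers' and 'diff1' columns (this helper is not part of the original script):

# Seed the leading NaN of the differenced series with the original starting
# value, then take a cumulative sum to rebuild the level series.
checkDf['diffInv'] = checkDf['diff1'].fillna(checkDf['passengers'].iloc[0]).cumsum()
# checkDf['diffInv'] should now reproduce checkDf['passengers'] row for row.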