├── .gitattributes ├── 9781484237861.jpg ├── Accounting_Model_Final.py ├── CASESTUDY_FINAL.py ├── CaseStudy2Chapter4.py ├── Contributing.md ├── DirectPythChatbot.py ├── ElectronicPatientRecord.py ├── LICENSE.txt ├── Predict_NextDay_Open_STOCK_PRICE.py ├── Quandl_Data_Getter.py ├── README.md ├── Yahoo_Data_Getter.py ├── errata.md ├── fivepointsummary.py └── retail2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484237861.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/machine-learning-applications-using-python/9548a69d23c18218180977f277a8d0f2855f70dc/9781484237861.jpg -------------------------------------------------------------------------------- /Accounting_Model_Final.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Sep 11 14:03:51 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | #Importing python libraries 9 | import pandas as pd 10 | from io import StringIO 11 | import requests 12 | import os 13 | import numpy as np 14 | os.getcwd() 15 | 16 | #Creating Visualization Functions which will be used throughout this model 17 | import matplotlib.pyplot as pl 18 | import matplotlib.patches as mpatches 19 | import numpy as np 20 | import pandas as pd 21 | from time import time 22 | from sklearn.metrics import f1_score, accuracy_score 23 | 24 | 25 | def distribution(data, transformed = False): 26 | """ 27 | Visualization code for displaying skewed distributions of features 28 | """ 29 | 30 | # Create figure 31 | fig = pl.figure(figsize = (11,5)); 32 | 33 | # Skewed feature plotting 34 | for i, feature in enumerate(['Amount','Month', 'Fiscal Year']): 35 | ax = fig.add_subplot(1, 3, i+1) 36 | ax.hist(data[feature], bins = 25, color = '#00A0A0') 37 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14) 38 | ax.set_xlabel("Value") 39 | ax.set_ylabel("Number of Records") 40 | ax.set_ylim((0, 2000)) 41 | ax.set_yticks([0, 500, 1000, 1500, 2000]) 42 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"]) 43 | 44 | # Plot aesthetics 45 | if transformed: 46 | fig.suptitle("Log-transformed Distributions of Continuous Payments Data Features", \ 47 | fontsize = 16, y = 1.03) 48 | else: 49 | fig.suptitle("Skewed Distributions of Continuous Payments Data Features", \ 50 | fontsize = 16, y = 1.03) 51 | 52 | fig.tight_layout() 53 | fig.show() 54 | #End of Distribution Visualization function 55 | 56 | 57 | # Plotting Feature Importances through this function 58 | def feature_plot(importances, X_train, y_train): 59 | 60 | # Display the five most important features 61 | indices = np.argsort(importances)[::-1] 62 | columns = X_train.columns.values[indices[:5]] 63 | values = importances[indices][:5] 64 | 65 | # Create the plot 66 | fig = pl.figure(figsize = (9,5)) 67 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16) 68 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \ 69 | label = "Feature Weight") 70 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \ 71 | label = "Cumulative Feature Weight") 72 | pl.xticks(np.arange(5), columns) 73 | pl.xlim((-0.5, 4.5)) 74 | pl.ylabel("Weight", fontsize = 12) 75 |
pl.xlabel("Feature", fontsize = 12) 76 | 77 | pl.legend(loc = 'upper center') 78 | pl.tight_layout() 79 | pl.show() 80 | 81 | #End of Feature Importances function 82 | 83 | #Loading the Dataset 84 | fname="C:/DATASETS/data.ct.gov/PaymentsDataset.csv" 85 | openledger= pd.read_csv(fname, low_memory=False, index_col=False) 86 | 87 | #Verify the data Loaded into memory 88 | print(openledger.head(1)) 89 | 90 | #Loading into a DataFrame for easier computation 91 | data= pd.DataFrame(openledger) 92 | 93 | #Look at the first record 94 | print(data.head(1)) 95 | #Check the shape of columns in the dataset 96 | print(data.shape) 97 | print(data.columns) 98 | data.dtypes 99 | 100 | #Data Cleanup 101 | #Check if there are any columns with empty/null dataset 102 | data.isnull().any() 103 | 104 | 105 | #No data cleanup needed, so skipping the decision to drop the NA value rows since there are very few of them 106 | #data=data.dropna() 107 | data.isnull().sum() 108 | 109 | #Total number of records 110 | n_records = len(data.index) 111 | 112 | #Number of records where payments are above 1.5 times of the upper quartile - upper outlier limit 113 | 114 | l=data[data['RedFlag'] == 2].index 115 | n_greater_quantile = len(l) 116 | 117 | #Number of records where payments are below 1.5 times of the lower quartile - lower outlier limit 118 | l=data[data['RedFlag'] == 1].index 119 | n_lower_quantile = len(l) 120 | 121 | #Percentage of Payments above Upper Outlier limit 122 | p=float(n_greater_quantile)/n_records*100.0 123 | greater_percent =p 124 | 125 | #Percentage of Payments below Lower Outlier limit 126 | p=float(n_lower_quantile)/n_records*100.0 127 | lower_percent =p 128 | 129 | # Print the results 130 | print("Total number of records: {}".format(n_records)) 131 | print("High value Payments above 1.5 times of 75th Percentile: {}".format(n_greater_quantile)) 132 | print("Low value Payments below 1.5 times of 25th Percentile: {}".format(n_lower_quantile)) 133 | print("Percentage of high value Payments: {:.2f}%".format(greater_percent)) 134 | print("Percentage of low value Payments: {:.2f}%".format(lower_percent)) 135 | 136 | # PREPARING DATA 137 | # Split the data into features and target label 138 | payment_raw = pd.DataFrame(data['RedFlag']) 139 | type(payment_raw) 140 | features_raw = data.drop('RedFlag', axis = 1) 141 | #Removing redundant columns from features_raw dataset 142 | features_raw.dtypes 143 | features_raw=features_raw.drop('TransactionNo', axis=1) 144 | features_raw=features_raw.drop('Department', axis=1) 145 | features_raw=features_raw.drop('Account', axis=1) 146 | features_raw=features_raw.drop('Expense Category', axis=1) 147 | features_raw=features_raw.drop('Vendor ID', axis=1) 148 | features_raw=features_raw.drop('Payment Method', axis=1) 149 | features_raw=features_raw.drop('Payment Date', axis=1) 150 | features_raw=features_raw.drop('Invoice ID', axis=1) 151 | features_raw=features_raw.drop('Invoice Date', axis=1) 152 | features_raw=features_raw.drop('Unnamed: 0', axis=1) 153 | features_raw.dtypes 154 | type(features_raw) 155 | 156 | # Visualize skewed continuous features of original data 157 | distribution(data) 158 | 159 | # Log-transform the skewed features 160 | #Replacing Null values with zero due to software data entry problem 161 | #Known issue: the software user screen takes null values; there is no check.
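# Note (illustrative aside; the toy values are hypothetical, not from the dataset): the transform applied just below, np.log(x + 1), keeps zero amounts defined while compressing the long right tail of 'Amount', and is equivalent to the numerically safer np.log1p, e.g. np.log1p([0.0, 150.0, 2.5e6]) gives roughly [0.0, 5.02, 14.73].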
162 | import warnings 163 | warnings.filterwarnings("ignore") 164 | features_raw.isnull().sum() 165 | 166 | skewed = ['Amount','Month', 'Fiscal Year'] 167 | features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1)) 168 | features_raw.isnull().sum() 169 | features_raw.fillna(0, inplace=True) 170 | features_raw.dtypes 171 | 172 | # Visualize the new log distributions 173 | distribution(features_raw, transformed = True) 174 | 175 | #Normalizing Numerical Features 176 | # Import sklearn.preprocessing.MinMaxScaler 177 | from sklearn.preprocessing import MinMaxScaler 178 | 179 | # Initialize a scaler, then apply it to the features 180 | scaler = MinMaxScaler() 181 | numerical = [ 'Amount','Month', 'Fiscal Year'] 182 | #features_raw[numerical] = scaler.fit_transform(data[numerical]) 183 | features_raw[numerical] = scaler.fit_transform(features_raw[numerical]) 184 | distribution(features_raw) 185 | from IPython.display import display 186 | # Look at record with scaling applied to see if everything is good 187 | display(features_raw.head(n = 1)) 188 | features_raw.columns 189 | #Implementation: Data Preprocessing 190 | # One-Hot Encoding 191 | data.columns 192 | data.dtypes 193 | data['Payment Status'] 194 | #data['Expense Category'] #drop from features_raw 195 | #data['Payment Method'] #drop from features_raw 196 | 197 | # Encoding the 'Non-Numeric' data to numerical values 198 | #Payment Status column 199 | d={"Paid-Reconciled": 0, "Paid-Unreconciled": 1} 200 | features_raw['Payment Status'] = features_raw['Payment Status'].map(d) 201 | 202 | # Printing the number of features after one-hot encoding 203 | encoded = list(features_raw.columns) 204 | print("{} total features after one-hot encoding.".format(len(encoded))) 205 | 206 | 207 | # Importing train_test_split 208 | from sklearn.model_selection import train_test_split 209 | payment_raw.columns 210 | # Splitting the 'features' and 'RedFlags' data into training and testing sets 211 | X_train, X_test, y_train, y_test = train_test_split(features_raw, payment_raw, test_size = 0.2, random_state = 0) 212 | X_train.columns 213 | X_test.columns 214 | y_train.columns 215 | y_test.columns 216 | # Showing the results of the split 217 | print("Training set has {} samples.".format(X_train.shape[0])) 218 | print("Testing set has {} samples.".format(X_test.shape[0])) 219 | 220 | #Checking to see if there are any empty rows 221 | X_train.isnull().any() 222 | y_train.isnull().any() 223 | X_train.isnull().sum() 224 | #Evaluating Model Performance 225 | #Establishing Benchmark performance indicator Naive Bayes 226 | #Naive Predictor Performance 227 | from sklearn.naive_bayes import GaussianNB 228 | #from sklearn.metrics import accuracy_score, fbeta_score 229 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix 230 | NB = GaussianNB() 231 | NB.fit(X_train,y_train) 232 | pred = NB.predict(X_test) 233 | 234 | #Calculating Accuracy Score 235 | #Calculating Beta Score 236 | accuracy_score(y_test,pred) 237 | print(f1_score(y_test,pred, average="macro")) 238 | print(precision_score(y_test, pred, average="macro")) 239 | print(recall_score(y_test, pred, average="macro")) 240 | 241 | # I AM GOING TO DO INITIAL MODEL EVALUATION NOW 242 | # Importing the supervised learning models from sklearn 243 | from sklearn.naive_bayes import GaussianNB 244 | from sklearn.tree import DecisionTreeClassifier 245 | from sklearn.linear_model import LogisticRegression 246 | from sklearn.linear_model import SGDClassifier 247 | from sklearn.ensemble import ExtraTreesClassifier
248 | from sklearn.ensemble import RandomForestClassifier 249 | 250 | # Initialize the six models 251 | clf_A = GaussianNB() 252 | clf_B = DecisionTreeClassifier(max_features=0.2, max_depth=2, min_samples_split=2,random_state=0) 253 | clf_C = LogisticRegression(random_state=0) 254 | clf_D = SGDClassifier(loss="hinge", penalty="l2") 255 | clf_E = ExtraTreesClassifier(n_estimators=2, max_depth=2,min_samples_split=2, random_state=0) 256 | clf_F = RandomForestClassifier(max_depth=2) 257 | 258 | # Calculate the number of samples for 1%, 10%, and 100% of the training data 259 | #Defining function since percent is required 3 times 260 | 261 | 262 | # Collect results on the learners 263 | learners=["Naive Bayes","Decision Tree","Logistic Regression","SGD Classifier","ExtraTrees Classifier","RandomForest Classifier"] 264 | cnt=0 265 | columns=['learner','train_time','pred_time','acc_train','acc_test','f1_score'] 266 | learningresults= pd.DataFrame(columns=columns) 267 | results = {} 268 | 269 | for learner in [clf_A, clf_B, clf_C,clf_D,clf_E,clf_F]: 270 | #print(learners[cnt]) 271 | results['learner']=learners[cnt] 272 | #Fitting the learner to the training data 273 | start = time() # Get start time 274 | learner.fit(X_train, y_train) 275 | #Calculating the total training time 276 | end = time() # Get end time 277 | results['train_time'] = end - start 278 | start = time() # Get start time 279 | predictions_test = learner.predict(X_test) 280 | predictions_train = learner.predict(X_train) 281 | end = time() # Get end time 282 | results['pred_time'] = end - start 283 | results['acc_train'] = accuracy_score(y_train, predictions_train) 284 | results['acc_test'] = accuracy_score(y_test, predictions_test) 285 | beta=0.5 286 | results['f1_score'] = f1_score(y_test, predictions_test, average="macro") 287 | print(results) 288 | learningresults.loc[cnt]=results 289 | cnt=cnt+1 290 | 291 | #Looking at the plots to determine the best Classifier for our Dataset 292 | print(learningresults) 293 | learningresults.columns 294 | learningresults.plot(kind='bar', x='learner', legend='reverse', title='Classifier Algorithms Compared- Accounts Payment Dataset',figsize=(10,10), fontsize=20) 295 | #learningresults.plot(kind='bar', x='learner').savefig('E:/BUSINESS/APRESS/ApplicationsOfMachineLearning/Chapter15/learner_performance.png') 296 | 297 | #--------------------------MODEL TUNING ------------------------ 298 | #Now I will implement Model tuning 299 | # Import 'GridSearchCV', 'make_scorer', and any other necessary libraries 300 | from sklearn.metrics import make_scorer 301 | from sklearn.model_selection import GridSearchCV 302 | from IPython.display import display 303 | import pickle, os.path 304 | from sklearn.linear_model import LogisticRegression 305 | from sklearn.metrics import fbeta_score 306 | 307 | def getscore(y_true, y_predict): 308 | return fbeta_score(y_true, y_predict, beta=beta) 309 | 310 | best_clf = None 311 | beta=0.5 312 | 313 | #Initialize the classifier 314 | 315 | clf_C = LogisticRegression(random_state=0) 316 | 317 | # Create the parameters list you wish to tune 318 | #parameters = {'n_estimators':range(10,20),'criterion':['gini','entropy'],'max_depth':range(1,5)} 319 | parameters = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],'C':range(1,10),'max_iter':range(50,100)} 320 | 321 | # Make an fbeta_score scoring object 322 | scorer = make_scorer(getscore) 323 | 324 | # Perform grid search on the classifier using 'scorer' as the scoring method
325 | #grid_obj = GridSearchCV(clf_C, parameters, scoring=scorer) 326 | 327 | 328 | #Note: the custom 'scorer' is not passed here, so GridSearchCV falls back to the estimator's default scoring 329 | grid_obj = GridSearchCV(clf_C, parameters) 330 | 331 | #grid_obj = GridSearchCV(clf_C, parameters) 332 | 333 | # Fit the grid search object to the training data and find the optimal parameters 334 | from datetime import datetime 335 | startTime = datetime.now() 336 | 337 | grid_fit = grid_obj.fit(X_train, y_train) 338 | CV_lr = GridSearchCV(estimator=clf_C, param_grid=parameters, cv= 5) # second, 5-fold grid search; its result is not used below 339 | CV_lr.fit(X_train, y_train) 340 | print(datetime.now() - startTime) 341 | 342 | # Get the estimator 343 | best_clf = grid_fit.best_estimator_ 344 | 345 | # Make predictions using the unoptimized and optimized models 346 | predictions = (clf_C.fit(X_train, y_train)).predict(X_test) 347 | best_predictions = best_clf.predict(X_test) 348 | 349 | # Report the before-and-after scores 350 | print("Unoptimized model\n------") 351 | print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))) 352 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5,average='micro'))) 353 | print("\nOptimized Model\n------") 354 | print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) 355 | print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5,average='micro'))) 356 | 357 | # Print the final parameters 358 | df = pd.DataFrame(grid_fit.cv_results_).sort_values('mean_test_score').tail() 359 | display(df) 360 | print("Parameters for the optimal model: {}".format(best_clf.get_params())) 361 | 362 | # Now Extracting Feature Importances 363 | # Importing a supervised learning model that has 'feature_importances_' 364 | from sklearn.ensemble import ExtraTreesClassifier 365 | from sklearn.feature_selection import SelectFromModel 366 | from sklearn.metrics import fbeta_score 367 | 368 | # Training the supervised model on the training set 369 | model = ExtraTreesClassifier() 370 | model.fit(X_train, y_train) 371 | 372 | # Extract the feature importances 373 | importances = model.feature_importances_ 374 | 375 | # Plot 376 | feature_plot(importances, X_train, y_train) 377 | 378 | # Feature Selection 379 | # Import functionality for cloning a model 380 | from sklearn.base import clone 381 | best_clf= clf_F # use the random forest trained in the earlier comparison as the final model 382 | # Reduce the feature space 383 | X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]] 384 | X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]] 385 | 386 | # Train a clone of the chosen model on the reduced feature set 387 | clf = (clone(best_clf)).fit(X_train_reduced, y_train) 388 | best_predictions = best_clf.predict(X_test) 389 | # Make new predictions 390 | reduced_predictions = clf.predict(X_test_reduced) 391 | 392 | # Report scores from the final model using both versions of data 393 | print("Final Model trained on full data\n------") 394 | print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) 395 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, average="macro", beta = 0.5))) 396 | print("\nFinal Model trained on reduced data\n------") 397 | print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions))) 398 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta = 0.5, average='macro'))) 399 | 400 | # Print the final parameters 401 | 402 | print("Parameters for the final model: {}".format(clf.get_params()))
-------------------------------------------------------------------------------- /CASESTUDY_FINAL.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 30 13:30:46 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Loading libraries 8 | import pandas as pd 9 | 10 | #Loading the retail dataset 11 | survey=pd.read_csv("C:/Puneet/All_DOCS/Prediction_Files/DataScience-Python3_UDEMY_COURSE/DataScience-Python3/ml-100k/aystsaga_002.csv") 12 | 13 | #Looking at the dataset one row 14 | print(survey.head(1)) 15 | #Shape of the dataset Rows and Columns 16 | print(survey.shape) 17 | 18 | #Looking at the columns of the dataset 19 | print(survey.columns) 20 | #What are the datatypes of each column? Do we need to convert any of them? 21 | print(survey.dtypes) 22 | #Are there any Null values in the dataset 23 | print(survey.isnull().any()) 24 | 25 | #Creating a pivot table for product ratings 26 | productRatings = survey.pivot_table(index=['user_id'],columns=['product_name'],values='rating') 27 | #Peeking into the pivot table 28 | productRatings.head(1) 29 | 30 | ##===========================Customer chooses a product====================================================== 31 | #Sample product rating view; assume here that a customer selected the product 'Dalmere Chocolate cup' 32 | #What recommendation are you going to give based on this selection 33 | #Users who bought 'Dalmere Chocolate cup' also bought: 34 | dalmereRatings = productRatings['Dalmere Chocolate cup'] 35 | dalmereRatings.head() 36 | 37 | #Now finding products with similar ratings 38 | similarProducts = productRatings.corrwith(dalmereRatings) 39 | #Dropping the NA values to get more meaningful data 40 | similarProducts = similarProducts.dropna() 41 | df = pd.DataFrame(similarProducts) 42 | df.head(10) 43 | 44 | similarProducts.sort_values(ascending=False) 45 | 46 | #Now let us get rid of spurious results in the similarities 47 | import numpy as np 48 | productStats = survey.groupby('product_name').agg({'rating': [np.size, np.mean]}) 49 | productStats.head() 50 | 51 | productStats.sort_values([('rating', 'mean')], ascending=False)[:15] 52 | 53 | #I am now keeping only products with at least 50 ratings to create meaningful results which matter to the business 54 | popularProducts = productStats['rating']['size'] >= 50 55 | productStats[popularProducts].sort_values([('rating', 'mean')], ascending=False)[:15] 56 | df = productStats[popularProducts].join(pd.DataFrame(similarProducts, columns=['similarity'])) 57 | df.head(15) 58 | df.sort_values(['similarity'], ascending=False)[:15] 59 | #Visualizing the Bar plot for similarities 60 | df['similarity'].plot(kind='bar') 61 | ax = df['similarity'].plot(kind='bar') 62 | x_offset = -0.03 63 | y_offset = 0.02 64 | for p in ax.patches: 65 | b = p.get_bbox() 66 | val = "{:+.2f}".format(b.y1 + b.y0) 67 | ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 + y_offset)) 68 | 69 | #===========================Customer chooses another product====================================================== 70 | #2nd Sample Product rating view 71 | aesoRatings = productRatings['Aeso napkins 10 PACK'] 72 | aesoRatings.head() 73 | 74 | #Now finding products with similar ratings 75 | similarProducts = productRatings.corrwith(aesoRatings) 76 | similarProducts = similarProducts.dropna() 77 | df = pd.DataFrame(similarProducts) 78 | df.head(10) 79 | 80 |
similarProducts.sort_values(ascending=False) 81 | 82 | #Now let us get rid of spurious results in the similarities 83 | import numpy as np 84 | productStats = survey.groupby('product_name').agg({'rating': [np.size, np.mean]}) 85 | productStats.head() 86 | 87 | productStats.sort_values([('rating', 'mean')], ascending=False)[:15] 88 | 89 | #I am now getting rid of ratings size greater than 20 to create meaning results which matter to the business 90 | popularProducts = productStats['rating']['size'] >= 50 91 | productStats[popularProducts].sort_values([('rating', 'mean')], ascending=False)[:15] 92 | df = productStats[popularProducts].join(pd.DataFrame(similarProducts, columns=['similarity'])) 93 | df.head(15) 94 | df.sort_values(['similarity'], ascending=False)[:15] 95 | 96 | #Visualization of similarities 97 | df['similarity'].plot(kind='bar') 98 | ax = df['similarity'].plot(kind='bar') 99 | x_offset = -0.03 100 | y_offset = 0.02 101 | for p in ax.patches: 102 | b = p.get_bbox() 103 | val = "{:+.2f}".format(b.y1 + b.y0) 104 | ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 + y_offset)) 105 | 106 | -------------------------------------------------------------------------------- /CaseStudy2Chapter4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 17 13:16:52 2018 4 | 5 | @author: PMAUTHOR 6 | """ 7 | 8 | import pandas as pd 9 | from io import StringIO 10 | import requests 11 | import os 12 | os.getcwd() 13 | 14 | fname="Food_Raw_Data.csv" 15 | hospitals= pd.read_csv(fname, low_memory=False, index_col=False) 16 | df= pd.DataFrame(hospitals) 17 | print(df.head(1)) 18 | 19 | print(df.size) 20 | print(df.shape) 21 | print(df.columns) 22 | df.dtypes 23 | 24 | #Check if there are any columns with empty/null dataset 25 | df.isnull().any() 26 | #Checking how many columns have null values 27 | df.info() 28 | 29 | #Using individual functions to do EDA 30 | #Checking out Statistical data Mean Median Mode correlation 31 | df.mean() 32 | df.median() 33 | df.mode() 34 | 35 | #How is the data distributed and detecting Outliers 36 | df.std() 37 | df.max() 38 | df.min() 39 | df.quantile(0.25)*1.5 40 | df.quantile(0.75)*1.5 41 | 42 | #How many Outliers in the Total Food ordered column 43 | df.columns 44 | df.dtypes 45 | df.set_index(['Hospital Name']) 46 | df['Total Food ordered'].loc[df['Total Food ordered'] <=238.5].count() 47 | df['Total Food ordered'].loc[df['Total Food ordered'] >=679.5].count() 48 | 49 | #Visualizing the dataset 50 | df.boxplot(figsize=(10, 6)) 51 | df.plot.box(vert=False) 52 | df.kurtosis() 53 | df.skew() 54 | import scipy.stats as sp 55 | sp.skew(df['Total Food ordered']) 56 | 57 | #Visualizing dataset 58 | df.plot() 59 | df.hist(figsize=(10, 6)) 60 | df.plot.area() 61 | df.plot.area(stacked=False) 62 | 63 | 64 | #Now look at correlation and patterns 65 | df.corr() 66 | 67 | #Change to dataset columns 68 | df.plot.scatter(x='Total Food Wasted', y='No of Guests with Inpatient',s=df['Total Food Wasted']*2) 69 | df.plot.hexbin(x='Total Food Wasted', y='No of Guests with Inpatient', gridsize=25) 70 | 71 | #Change to dataset columns 72 | #Look at crosstabulation to conclude EDA 73 | df.columns 74 | df.dtypes 75 | #Counting the Categorical variables 76 | my_tab = pd.crosstab(index=df["Feedback"], columns="Count") # Name the count column 77 | my_tab = pd.crosstab(index=df["Type of Hospital"], columns="Count") # Name the count column 78 | 79 | print(my_tab) 80 | my_tab=my_tab.sort_values('Count', 
ascending=[False]) 81 | print(my_tab) 82 | #my_tab.sum() 83 | data_counts = pd.DataFrame(my_tab) 84 | pd.DataFrame(data_counts).transpose().plot(kind='bar', stacked=False) 85 | 86 | #Data Preparation Steps 87 | #Step 1 Split data into features and target variable 88 | # Split the data into features and target label 89 | wastage = pd.DataFrame(df['Total Food Wasted']) 90 | 91 | dropp=df[['Total Food Wasted','Feedback','Type of Hospital','Total No of beds']] 92 | features= df.drop(dropp, axis=1) 93 | wastage.columns 94 | features.columns 95 | 96 | #Step 2 Shuffle & Split Final Dataset 97 | # Import train_test_split 98 | from sklearn.cross_validation import train_test_split 99 | from sklearn.utils import shuffle 100 | 101 | # Shuffle and split the data into training and testing subsets 102 | features=shuffle(features, random_state=0) 103 | wastage=shuffle(wastage, random_state=0) 104 | # Split the 'features' and 'income' data into training and testing sets 105 | X_train, X_test, y_train, y_test = train_test_split(features, wastage, test_size = 0.2, random_state = 0) 106 | 107 | # Show the results of the split 108 | print("Training set has {} samples.".format(X_train.shape[0])) 109 | print("Testing set has {} samples.".format(X_test.shape[0])) 110 | 111 | 112 | #Model Building & Evaluation 113 | #Creating the the Model for prediction 114 | 115 | #Loading model Libraries 116 | import matplotlib.pyplot as plt 117 | import numpy as np 118 | from sklearn import linear_model 119 | from sklearn.metrics import mean_squared_error, r2_score 120 | from sklearn.svm import LinearSVC 121 | 122 | 123 | #Creating Linear Regression object 124 | regr = linear_model.LinearRegression() 125 | linear_svm = LinearSVC().fit(X_train,y_train) 126 | 127 | regr.fit(X_train,y_train) 128 | y_pred= regr.predict(X_test) 129 | yy_pred= linear_svm.predict(X_test) 130 | #Printing Codfficients 131 | print('Coefficients: \n',regr.coef_) 132 | print(LinearSVC().fit(X_train,y_train).coef_) 133 | regr.score(X_train,y_train) 134 | #Mean squared error 135 | print("mean squared error: %.2f" %mean_squared_error(y_test,y_pred)) 136 | 137 | #Variance score 138 | print("Variance score: %2f" % r2_score(y_test, y_pred)) 139 | 140 | #Plot and visualize the Linear Regression plot 141 | plt.plot(X_test, y_pred, linewidth=3) 142 | plt.show() 143 | 144 | #Checking graphically the boundaries formed by Linear SVM 145 | line = np.linspace(-15, 15) 146 | for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_): 147 | plt.plot(line, -(line * coef[0] + intercept) / coef[1]) #HOW DO WE KNOW 148 | plt.ylim(-10, 15) 149 | plt.xlim(-10, 8) 150 | plt.show() 151 | 152 | predicted= regr.predict([[820,81,363,35]]) 153 | print(predicted) 154 | 155 | features.head(2) 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. 
Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /DirectPythChatbot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 24 10:20:18 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Loading tkinter libraries which will be used in the UI of chatbot 8 | import tkinter 9 | from tkinter import * 10 | from tkinter.scrolledtext import * 11 | from tkinter import ttk 12 | import time 13 | from PIL import ImageTk, Image 14 | import tkinter 15 | 16 | #Loading random for random choices in our chat program 17 | import random 18 | 19 | #Splash Screen 20 | splash = tkinter.Tk() 21 | 22 | splash.title("Welcome to Applications of Machine Learning in Healthcare, Retail & Finance") 23 | splash.geometry("1000x100") 24 | splash.configure(background='green') 25 | w = Label(splash, text="DirectPyth Diabetes Diagnostics Chatbot by Puneet Mathur\nLoading...",font=("Helvetica", 26),fg="white",bg="green") 26 | w.pack() 27 | 28 | splash.update() 29 | time.sleep(6) 30 | splash.deiconify() 31 | splash.destroy() 32 | 33 | #Initializing tkinter library for UI window showup 34 | window = tkinter.Tk() 35 | s = tkinter.Scrollbar(window) 36 | chatmsg = tkinter.Text(window) 37 | chatmsg.focus_set() 38 | s.pack(side=tkinter.RIGHT, fill=tkinter.Y) 39 | chatmsg.pack(side=tkinter.TOP, fill=tkinter.Y) 40 | s.config(command=chatmsg.yview) 41 | chatmsg.config(yscrollcommand=s.set) 42 | input_user = StringVar() 43 | input_field = Entry(window, text=input_user) 44 | input_field.pack(side=tkinter.BOTTOM, fill=tkinter.X) 45 | bot_text="Welcome to DirectPyth Diagnostic Center\n" 46 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 47 | bot_text = "Press enter to continue " 48 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 49 | chatmsg.focus() 50 | 51 | #Diagnostics Corpus for the chatbot 52 | greet=['Hello welcome to DirectPyth','Hi welcome to DirectPyth','Hey welcome to DirectPyth','Good day to you welcome to DirectPyth'] 53 | confirm=['yes','yay','yeah','yo'] 54 | memberid=['12345','12346','12347','12348','12349'] 55 | customer = ['hello','hi','hey'] 56 | answer = ['I uderstand you feel happy but please stay to the point and select one of the options',"I sympathize with you, However please do not deviate from the topic"] 57 | greetings = ['hola Welcome to DirectPyth again', 'hello Welcome to DirectPyth again', 'hi Welcome to DirectPyth again', 'Hi Welcome to DirectPyth again', 'hey! 
Welcome to DirectPyth again', 'hey Welcome to DirectPyth again'] 58 | question = ['how are you?', 'how are you doing?'] 59 | responses = ['Okay', "I'm fine"] 60 | tests=['Type 1 for hbA1c test', "Type 2 for Blood Viscosity test","Type 3 for Heart rate test","Type 4 for Blood Oxygen test","Type 5 for Blood Pressure"] 61 | testresponse= ['1','2','3','4','5','6'] 62 | 63 | #Global variable to check first time greeting 64 | firstswitch=1 65 | newid="12310" 66 | memid=0 67 | 68 | def chat(event): 69 | import time 70 | import random 71 | global memid 72 | condition="" 73 | #Greet for first time 74 | global firstswitch 75 | print(firstswitch) 76 | 77 | if (firstswitch==1): 78 | bot_text = random.choice(greet) 79 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 80 | bot_text = "If you are an existing Member of DirectPyth please enter your membershipid: or enter no if you are not a member" 81 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 82 | firstswitch=2 83 | if (firstswitch!=1): 84 | input_get = input_field.get().lower() 85 | if any(srchstr in input_get for srchstr in memberid): 86 | memid=input_get 87 | bot_text = "Thank you for being a loyal member of DirectPyth\n Please choose a test from following menu to continue\nType 1 for hbA1c test\nType 2 for Blood Viscosity test\nType 3 for Heart rate test\nType 4 for Blood Oxygen test\nType 5 for Blood pressure test\nType 6 to exit\n\n" 88 | elif (input_get=="no"): 89 | memid=newid 90 | bot_text = "Your new Memberid is: " + newid + " Please remember this for future reference.\n Please choose a test from following menu to continue\nType 1 for hbA1c test\nType 2 for Blood Viscosity test\nType 3 for Heart rate test\nType 4 for Blood Oxygen test\nType 5 for Blood pressure test\nType 6 to exit\n\n" 91 | elif any(srchstr in input_get for srchstr in testresponse): 92 | 93 | bot_text = "Please place any of your finger on the Finger panel above to conduct the test" 94 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 95 | delaycounter=0 96 | for delaycounter in range(0,10): 97 | bot_text = str(delaycounter) 98 | time.sleep(1) 99 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 100 | bot_text = "Please wait generating your report\n" 101 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 102 | time.sleep(2) 103 | if (input_get=="1"): 104 | hba1c=random.randint(4,10) 105 | bot_text = "MemberID: " + str(memid) + " Your hbA1c test result is: " + str(hba1c) 106 | if(hba1c>=4 and hba1c<=5.6): 107 | condition="You are don't have diabetes" 108 | elif(hba1c>=5.7 and hba1c<=6.4): 109 | condition="You are Prediabetic" 110 | elif(hba1c>=6.5): 111 | condition="You are Diabetic" 112 | bot_text=bot_text + " Your condition is: " + condition 113 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 114 | elif (input_get=="2"): 115 | viscosity=random.randint(20,60) 116 | bot_text = "MemberID: " + str(memid) + " Your Blood Viscosity level test result is: " + str(viscosity) 117 | elif (input_get=="3"): 118 | heartrate=random.randint(40,150) 119 | bot_text = "MemberID: " + str(memid) + " Your Heart rate test result is: " + str(heartrate) 120 | elif (input_get=="4"): 121 | oxygen=random.randint(90,100) 122 | bot_text = "MemberID: " + str(memid) + " Your Blood Oxygen level test result is: " + str(oxygen) 123 | elif (input_get=="5"): 124 | systolic=random.randint(80,200) 125 | diastolic=random.randint(80,110) 126 | bot_text = "MemberID: " + str(memid) + " Your Blood Pressure test result is: Systolic: " + str(systolic) + " Diastolic: " + str(diastolic) 127 | elif (input_get=="6"): 128 | import 
sys 129 | window.deiconify() 130 | window.destroy() 131 | sys.exit(0) 132 | else: 133 | from nltk.stem import WordNetLemmatizer 134 | import nltk 135 | if((not input_get) or (int(input_get)<=0)): 136 | print("did you just press Enter?") #print some info 137 | else: 138 | lemmatizer = WordNetLemmatizer() 139 | input_get = input_field.get().lower() 140 | lemvalue=lemmatizer.lemmatize(input_get) 141 | whatsentiment=getSentiment(lemvalue) 142 | if (whatsentiment=="pos"): 143 | bot_text = answer[0] 144 | #print("Positive Sentiment") 145 | elif (whatsentiment=="neg"): 146 | bot_text = answer[1] 147 | #print("Negative Sentiment") 148 | chatmsg.insert(INSERT, '%s\n' % lemvalue) 149 | #bot_text = "I did not understand what you said !" 150 | 151 | 152 | 153 | 154 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 155 | #label = Label(window, text=input_get) 156 | input_user.set('') 157 | #label.pack() 158 | return "break" 159 | 160 | #Sentiment Analyzer using NLP 161 | def getSentiment(text): 162 | import nltk 163 | from nltk.tokenize import word_tokenize 164 | 165 | #nltk.download('punkt') 166 | # Step 1 – Training data building the Diabetes corpus 167 | train = [("Thanks for an excellent report", "pos"), 168 | ("Your service is very quick and fast", "pos"), 169 | ("I am pleased with your service", "pos"), 170 | ("I did not know i was diabetic until you gave me this report", "neg"), 171 | ("Service - Little slow, probably because too many people.", "neg"), 172 | ("The place is not easy to locate", "neg"), 173 | ("The place is very easy to locate", "pos"), 174 | ("Not satisfied will take a second opinion", "neg"), 175 | ("No human contact everything is so robotic here", "neg"), 176 | ("can i talk to a human not convinced with your report", "neg"), 177 | ("good results", "pos"), 178 | ("good service", "pos"), 179 | ("great service", "pos"), 180 | ("excellent service", "pos"), 181 | ("amazing technology", "pos"), 182 | ("fast service and satisfying report", "pos"), 183 | ("your report sucks", "neg"), 184 | ("this report will cost me a fortune", "neg"), 185 | ("I have diabetes", "neg"), 186 | ("this report will cost me a fortune", "neg"), 187 | ("this report means i have a dreadful disease", "neg"), 188 | ("will i need to take new medication", "neg"), 189 | ("i need to take my insulin injections regularly", "neg"), 190 | ("my lipids are getting worst need to talk to the doctor", "neg"), 191 | ("oh my god very bad results", "neg"), 192 | ("bad service", "neg"), 193 | ("very bad service", "neg"), 194 | ("poor service", "neg"), 195 | ("very bad service", "neg"), 196 | ("slow service", "neg"), 197 | ("very slow service", "neg"), 198 | ("diabetes got worst is this report accurate", "neg"), 199 | ("i dont believe this report", "neg"), 200 | ("i dont like this report", "neg"), 201 | ("i am in a diabetic hell", "neg"), 202 | ] 203 | # Step 2 Tokenize the words to dictionary 204 | dictionary = set(word.lower() for passage in train for word in word_tokenize(passage[0])) 205 | # Step 3 Locate the word in training data 206 | t = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in train] 207 | # Step 4 – the classifier is trained with sample data 208 | classifier = nltk.NaiveBayesClassifier.train(t) 209 | test_data = "oh my god what is this" 210 | test_data_features = {word.lower(): (word in word_tokenize(test_data.lower())) for word in dictionary} 211 | print (classifier.classify(test_data_features)) 212 | return classifier.classify(test_data_features) 213 | 214 | #Start the program chat 
and put in loop 215 | input_field.bind("<Return>", chat) 216 | tkinter.mainloop() 217 | 218 | -------------------------------------------------------------------------------- /ElectronicPatientRecord.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 15 23:46:06 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Importing python libraries 8 | import pandas as pd 9 | from io import StringIO 10 | import requests 11 | import os 12 | os.getcwd() 13 | 14 | #Reading dataset from flat file 15 | fname="Diabetes_Dataset.csv" 16 | patients= pd.read_csv(fname, low_memory=False, index_col=False) 17 | df= pd.DataFrame(patients) 18 | 19 | #Look at the first record 20 | print(df.head(1)) 21 | #Check the shape size and columns in the dataset 22 | print(df.size) 23 | print(df.shape) 24 | print(df.columns) 25 | df.dtypes 26 | #Check if there are any columns with empty/null dataset 27 | df.isnull().any() 28 | #Checking how many columns have null values 29 | df.info() 30 | 31 | dfworking=df.drop('Patient ID',axis=1) 32 | #You could use the describe() method here, but since we have many columns 33 | #we will use individual functions to do EDA 34 | print(dfworking.describe()) 35 | 36 | #Using individual functions to do EDA 37 | #Checking out Statistical data Mean Median Mode correlation 38 | dfworking.mean() 39 | dfworking.median() 40 | dfworking.mode() 41 | 42 | 43 | #How is the data distributed and detecting Outliers 44 | dfworking.std() 45 | dfworking.max() 46 | dfworking.min() 47 | dfworking.quantile(0.25)*1.5 48 | dfworking.quantile(0.75)*1.5 49 | 50 | #How many Outliers in the BPSystolic column 51 | df.columns 52 | df.dtypes 53 | dfworking.set_index(['BPSystolic']) 54 | dfworking['BPSystolic'].loc[df['BPSystolic'] >=183.562500].count() 55 | df.set_index(['Patient ID']) 56 | 57 | 58 | dfworking.boxplot(figsize=(10, 6)) 59 | dfworking.plot.box(vert=False) 60 | dfworking.boxplot(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6)) 61 | dfworking.kurtosis() 62 | dfworking.skew() 63 | import scipy.stats as sp 64 | sp.skew(dfworking.A1cTEST) 65 | 66 | #Visualizing dataset 67 | dfworking.plot() 68 | dfworking.hist(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6)) 69 | dfworking.plot.area() 70 | dfworking.plot.area(stacked=False) 71 | 72 | 73 | #Now look at correlation and patterns 74 | dfworking.corr() 75 | dfworking.plot.scatter(x='A1cTEST', y='BPSystolic',s=dfworking['A1cTEST']*2) 76 | dfworking.plot.scatter(x='A1cTEST', y='BPSystolic',s=dfworking['BPSystolic']*0.13) 77 | dfworking.plot.hexbin(x='A1cTEST', y='BPSystolic', gridsize=25) 78 | 79 | #Look at crosstabulation to conclude EDA 80 | df.columns 81 | #Counting the Categorical variables 82 | my_tab = pd.crosstab(index=df["Gender"], columns="Count") # Name the count column 83 | my_tab = pd.crosstab(index=df["Type of diabetes"], columns="Count") # Name the count column 84 | my_tab = pd.crosstab(index=df["Diabetes status"], columns="Count") # Name the count column 85 | my_tab = pd.crosstab(index=df["FrozenShoulder"], columns="Count") # Name the count column 86 | my_tab = pd.crosstab(index=df["CarpalTunnelSynd"], columns="Count") # Name the count column 87 | my_tab = pd.crosstab(index=df["DuputrensCont"], columns="Count") # Name the count column 88 | print(my_tab) 89 | my_tab=my_tab.sort_values('Count', ascending=[False]) 90 | print(my_tab) 91 | my_tab.sum() 92 | data_counts = pd.DataFrame(my_tab) 93 |
pd.DataFrame(data_counts).transpose().plot(kind='bar', stacked=False) 94 | 95 | #Data Preparation Steps 96 | #Step 1 Split data into features and target variable 97 | # Split the data into features and target label 98 | diabetics = pd.DataFrame(dfworking['Diabetes status']) 99 | features = pd.DataFrame(dfworking.drop('Diabetes status', axis = 1)) 100 | diabetics.columns 101 | features.columns 102 | 103 | #Step 2 Standardize dataset 104 | # Import sklearn.preprocessing.MinMaxScaler 105 | from sklearn.preprocessing import MinMaxScaler 106 | from IPython.display import display 107 | # Initialize a scaler, then apply it to the features 108 | scaler = MinMaxScaler() 109 | dfworking.dtypes 110 | numerical = ['Age','A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI'] 111 | features[numerical] = scaler.fit_transform(dfworking[numerical]) 112 | 113 | # Show an example of a record with scaling applied 114 | display(features[numerical].head(n = 1)) 115 | 116 | # Step 3 One-hot encode the 'features' data using pandas.get_dummies() 117 | features = pd.get_dummies(features) 118 | features.columns 119 | 120 | #Checking output 121 | display(features.head(1),diabetics.head(1)) 122 | 123 | # Print the number of features after one-hot encoding 124 | encoded = list(features.columns) 125 | print("{} total features after one-hot encoding.".format(len(encoded))) 126 | 127 | # see the encoded feature names 128 | print(encoded) 129 | 130 | #Step 4 Shuffle & Split Final Dataset 131 | # Import train_test_split 132 | from sklearn.model_selection import train_test_split 133 | from sklearn.utils import shuffle 134 | 135 | # Shuffle and split the data into training and testing subsets 136 | features=shuffle(features, random_state=0) 137 | diabetics=shuffle(diabetics, random_state=0) 138 | # Split the 'features' and 'diabetics' data into training and testing sets 139 | X_train, X_test, y_train, y_test = train_test_split(features, diabetics, test_size = 0.2, random_state = 0) 140 | 141 | # Show the results of the split 142 | print("Training set has {} samples.".format(X_train.shape[0])) 143 | print("Testing set has {} samples.".format(X_test.shape[0])) 144 | 145 | #Model Building & Evaluation 146 | #Creating the Model for prediction 147 | 148 | #Loading model Libraries 149 | from sklearn import model_selection 150 | from sklearn.linear_model import LogisticRegression 151 | from sklearn.tree import DecisionTreeClassifier 152 | from sklearn.neighbors import KNeighborsClassifier 153 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 154 | from sklearn.naive_bayes import GaussianNB 155 | from sklearn.svm import SVC 156 | from sklearn.ensemble import RandomForestClassifier 157 | 158 | # prepare models 159 | seed = 7 160 | models = [] 161 | models.append(('LR', LogisticRegression())) 162 | models.append(('LDA', LinearDiscriminantAnalysis())) 163 | models.append(('KNN', KNeighborsClassifier())) 164 | models.append(('CART', DecisionTreeClassifier())) 165 | models.append(('NB', GaussianNB())) 166 | models.append(('SVM', SVC())) 167 | models.append(('RFC', RandomForestClassifier())) 168 | 169 | # evaluate each model in turn 170 | results = [] 171 | names = [] 172 | scoring = 'accuracy' 173 | 174 | import warnings 175 | warnings.filterwarnings("ignore") 176 | 177 | for name, model in models: 178 | kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed) 179 | cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring) 180 | results.append(cv_results) 181 | names.append(name) 182 |
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) 183 | print(msg) 184 | 185 | 186 | import matplotlib.pyplot as plt 187 | # boxplot algorithm comparison 188 | fig = plt.figure() 189 | fig.suptitle('Algorithm Comparison') 190 | ax = fig.add_subplot(111) 191 | plt.boxplot(results) 192 | ax.set_xticklabels(names) 193 | plt.show() 194 | 195 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Puneet Mathur 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /Predict_NextDay_Open_STOCK_PRICE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 17 20:30:01 2017 4 | 5 | @author: PUNEETMATHUR 6 | I am creating this script to predict next day Opening Price based on 7 | Today's Closing Price for any given stock 8 | """ 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | 14 | #Change your directory to wherever your dataset is stored 15 | os.chdir("E:\\BUSINESS\\APRESS\\ApplicationsOfMachineLearning\\Chapter16\\") 16 | 17 | #Loading the dataset of the company for which prediction is required 18 | df=pd.read_csv("BalmerLawrieColtd.csv",parse_dates=['Date']) 19 | print(df.head(1)) 20 | print(df.columns) 21 | df.shape 22 | 23 | #Selecting only relevant columns required for prediction 24 | cols=['Date','Open','Close'] 25 | df=df[cols] 26 | 27 | print(df.columns) 28 | print(df.head(5)) 29 | 30 | # Checking data if Cleaning up data is required 31 | df.isnull().any() 32 | #df=df.dropna() 33 | #df=df.replace("NA",0) 34 | df.dtypes 35 | 36 | #Sorting up data to plot historically ascending values in graph 37 | df = df.sort_values(by='Date',ascending=True) 38 | 39 | #Plotting the price of stock over the years 40 | #What story does it tell? 
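# Because the CSV was read with parse_dates=['Date'], the 'Date' column is of dtype datetime64; the masks used further below (e.g. df['Date'] > '2017-1-1') therefore compare real timestamps rather than plain strings, and the plots render proper dates on the x-axis.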
41 | import matplotlib.pyplot as plt 42 | plt.plot(df['Date'],df['Close']) 43 | 44 | #Now plot only for last one year and last 1 month 45 | df['Date'].dt.year==2017 46 | mask=(df['Date'] > '2017-1-1') & (df['Date'] <= '2017-12-31') 47 | print(df.loc[mask]) 48 | df2017=df.loc[mask] 49 | print(df2017.head(5)) 50 | import matplotlib.pyplot as plt 51 | plt.plot(df2017['Date'],df2017['Close']) 52 | 53 | #Plotting last 1 month data on stock 54 | mask=(df['Date'] > '2017-11-17') & (df['Date'] <= '2017-12-26') 55 | print(df.loc[mask]) 56 | dfnovdec2017=df.loc[mask] 57 | print(dfnovdec2017.head(5)) 58 | import matplotlib.pyplot as plt 59 | plt.xticks( rotation='vertical') 60 | plt.plot(dfnovdec2017['Date'],dfnovdec2017['Close']) 61 | 62 | #Now calculating the Simple Moving Average of the Stock 63 | #Simple Moving Average One Year 64 | df2017['SMA'] = df2017['Close'].rolling(window=20).mean() 65 | df2017.head(25) 66 | df2017[['SMA','Close']].plot() 67 | 68 | 69 | #Does the Open and Closing price of the stock follow very well? 70 | df2017[['Open','Close']].plot() 71 | 72 | #Checking the Correlation 73 | df2017.corr() 74 | 75 | #Simple Moving Average One Month 76 | dfnovdec2017['SMA'] = dfnovdec2017['Close'].rolling(window=2).mean() 77 | dfnovdec2017.head(25) 78 | dfnovdec2017[['SMA','Close']].plot() 79 | 80 | #Now creating NextDayOpen column for prediction 81 | ln=len(df) 82 | lnop=len(df['Open']) 83 | print(lnop) 84 | ii=0 85 | df['NextDayOpen']=df['Open'] 86 | df['NextDayOpen']=0 87 | for i in range(0,ln-1): 88 | print("Open Price: ",df['Open'][i]) 89 | if i!=0: 90 | ii=i-1 91 | df['NextDayOpen'][ii]=df['Open'][i] 92 | print(df['NextDayOpen'][ii]) 93 | 94 | print(df['NextDayOpen'].head()) 95 | 96 | #Checking on the new data 97 | print(df[['Open','NextDayOpen', 'Close']].head(5)) 98 | 99 | #Now checking if there is any correlation 100 | dfnew=df[['Close','NextDayOpen']] 101 | print(dfnew.head(5)) 102 | dfnew.corr() 103 | 104 | #Now Creating the Prediction model as correlation is very high 105 | #Importing the libraries 106 | from sklearn import cross_validation 107 | from sklearn.utils import shuffle 108 | from sklearn import linear_model 109 | from sklearn.metrics import mean_squared_error, r2_score 110 | 111 | 112 | #Creating the features and target dataframes 113 | price=dfnew['Close'] 114 | print(price) 115 | print(dfnew.columns) 116 | features=dfnew[['NextDayOpen']] 117 | #Shuffling the data 118 | price=shuffle(price, random_state=0) 119 | features=shuffle(features,random_state=0) 120 | 121 | #Dividing data into Train and Test 122 | X_train, X_test, y_train, y_test= cross_validation.train_test_split(features,price,test_size=0.2, random_state=0) 123 | 124 | #Linear Regression on Sensex data 125 | reg= linear_model.LinearRegression() 126 | 127 | X_train.shape 128 | reg.fit(X_train, y_train) 129 | 130 | y_pred= reg.predict(X_test) 131 | 132 | 133 | print("Coefficients: ", reg.coef_) 134 | 135 | #Mean squared error 136 | print("mean squared error: ",mean_squared_error(y_test,y_pred)) 137 | 138 | #Variance score 139 | print("Variance score: ", r2_score(y_test, y_pred)) 140 | 141 | #STANDARD DEVIATION 142 | standarddev=price.std() 143 | 144 | #Predict based on Opening BSE Sensex Index and Opening Volume 145 | #In the predict function below enter the first parameter Open forNSE and 2nd Volume in Crores 146 | sensexClosePredict=reg.predict([[269.05]]) 147 | #175 is the standard deviation of the Diff between Open and Close of sensex so this range 148 | print("Stock Likely to Open at: 
",sensexClosePredict , "(+-STD)") 149 | print("Stock Open between: ",sensexClosePredict+standarddev , " & " , sensexClosePredict-standarddev) 150 | 151 | 152 | -------------------------------------------------------------------------------- /Quandl_Data_Getter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Sep 22 23:43:13 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | import quandl 9 | quandl.ApiConfig.api_key = 'INSERT YOU API KEY HERE' 10 | 11 | # get the table for daily stock prices and, 12 | # filter the table for selected tickers, columns within a time range 13 | # set paginate to True because Quandl limits tables API to 10,000 rows per call 14 | 15 | data = quandl.get_table('WIKI/PRICES', ticker = ['AAPL', 'MSFT', 'WMT'], 16 | qopts = { 'columns': ['ticker', 'date', 'adj_close'] }, 17 | date = { 'gte': '2015-12-31', 'lte': '2016-12-31' }, 18 | paginate=True) 19 | data.head() 20 | 21 | # create a new dataframe with 'date' column as index 22 | new = data.set_index('date') 23 | 24 | # use pandas pivot function to sort adj_close by tickers 25 | clean_data = new.pivot(columns='ticker') 26 | 27 | # check the head of the output 28 | clean_data.head() 29 | 30 | #Below script gets you Data from National Stock Exchange for a stock known as Oil India Limited 31 | import quandl 32 | quandl.ApiConfig.api_key = 'z1bxBq27SVanESKoLJwa' 33 | quandl.ApiConfig.api_version = '2015-04-09' 34 | 35 | import quandl 36 | data = quandl.get('NSE/OIL') 37 | data.head() 38 | data.columns 39 | data.shape 40 | 41 | #Storing data in a flat file 42 | data.to_csv("NSE_OIL.csv") 43 | 44 | #A basic plot of the stocks data across the years 45 | data['Close'].plot() 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Machine Learning Applications Using Python*](https://www.apress.com/9781484237861) by Puneet Mathur (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484237861.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. 
-------------------------------------------------------------------------------- /Yahoo_Data_Getter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 3 23:32:32 2017 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | #import pandas.io.data as web 11 | from pandas_datareader import data, wb 12 | 13 | sp500= data.DataReader('^GSPC', data_source='yahoo', start='1/1/2000', end='1/12/2017') 14 | #sp500= data.DataReader('^GSPC', data_source='yahoo') 15 | sp500.ix['2010-01-04'] 16 | sp500.info() 17 | print(sp500) 18 | print(sp500.columns) 19 | print(sp500.shape) 20 | 21 | import matplotlib.pyplot as plt 22 | plt.plot(sp500['Close']) 23 | 24 | # now calculating the 42nd Days and 252 days trend for the index 25 | sp500['42d']= np.round(pd.rolling_mean(sp500['Close'], window=42),2) 26 | sp500['252d']= np.round(pd.rolling_mean(sp500['Close'], window=252),2) 27 | 28 | #Look at the data 29 | sp500[['Close','42d','252d']].tail() 30 | plt.plot(sp500[['Close','42d','252d']]) 31 | 32 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** -------------------------------------------------------------------------------- /fivepointsummary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Sep 04 15:07:13 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Baby Boomers Five Point Summary example 8 | import numpy as np 9 | bboomers = np.array([14230, 345, 1912, 472, 63, 861, 270, 713]) 10 | fivepoints= [np.min(bboomers), np.percentile(bboomers, 25, interpolation='midpoint'), np.median(bboomers),np.percentile(bboomers, 75, interpolation='midpoint'),np.max(bboomers)] 11 | for fivepointsummary in fivepoints: 12 | print(fivepointsummary) 13 | 14 | #Millenials Five Point Summary example 15 | import numpy as np 16 | millennials = np.array([12519, 845, 912, 72, 93, 615, 70, 538]) 17 | fivepoints= [np.min(millennials ), np.percentile(millennials , 25, interpolation='midpoint'), np.median(millennials ),np.percentile(millennials , 75, interpolation='midpoint'),np.max(millennials)] 18 | for fivepointsummary in fivepoints: 19 | print(fivepointsummary) 20 | 21 | -------------------------------------------------------------------------------- /retail2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 26 16:50:08 2018 4 | 5 | @author: PUNEETMATHUR Copyright 2018 All rights reserved 6 | DO NOT COPY WTTHOUT PERMISSION 7 | """ 8 | 9 | import warnings 10 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 11 | warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validation") 12 | 13 | # Display inline matplotlib plots with IPython 14 | from IPython import get_ipython 15 | get_ipython().run_line_magic('matplotlib', 'inline') 16 | ########################################### 17 | # Importing libraries 18 | import matplotlib.pyplot as plt 19 | import matplotlib.cm as cm 20 | import 
pandas as pd 21 | import numpy as np 22 | import pandas.plotting 23 | 24 | #################################################### 25 | #################################################### 26 | ######### FUNCTIONS FOR DATA VISUALIZATION ######### 27 | #################################################### 28 | #################################################### 29 | 30 | def pcaOutput(good_data, pca): 31 | ''' 32 | Visualizing the PCA results and calculating explained variance 33 | ''' 34 | 35 | # I am doing Dimension indexing through pca components 36 | dims = dims = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] 37 | 38 | # Creating the PCA components pandas dataframe from the dimensions 39 | comps = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys()) 40 | comps.index = dims 41 | 42 | # Calculating PCA explained variance for each component 43 | ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 44 | variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 45 | variance_ratios.index = dims 46 | 47 | # Creating a bar plot visualization for better understanding 48 | fig, ax = plt.subplots(figsize = (24,12)) 49 | 50 | # Plotting the feature weights as a function of the components 51 | comps.plot(ax = ax, kind = 'bar'); 52 | ax.set_ylabel("Feature Weights") 53 | ax.set_xticklabels(dims, rotation=0) 54 | 55 | 56 | # Displaying the explained variance ratios 57 | for i, ev in enumerate(pca.explained_variance_ratio_): 58 | ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) 59 | 60 | # Returning back a concatenated DataFrame 61 | return pd.concat([variance_ratios, comps], axis = 1) 62 | 63 | def clusterOutput(redData, preds, centers, pca_samples): 64 | ''' 65 | Visualizes the PCA-reduced cluster data in two dimensions 66 | Adds points for cluster centers for-selected sample data 67 | ''' 68 | 69 | preds = pd.DataFrame(preds, columns = ['Cluster']) 70 | plot_data = pd.concat([preds, redData], axis = 1) 71 | 72 | # I am Generating the cluster plot 73 | fig, ax = plt.subplots(figsize = (14,8)) 74 | 75 | # Creating the Color map to distinguish between clusters 76 | cmap = cm.get_cmap('gist_rainbow') 77 | 78 | # Coloring the points based on assigned cluster 79 | for i, cluster in plot_data.groupby('Cluster'): 80 | cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 81 | color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); 82 | 83 | # Plotting the centers with cross mark indicators 84 | for i, c in enumerate(centers): 85 | ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ 86 | alpha = 1, linewidth = 2, marker = 'o', s=200); 87 | ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); 88 | 89 | # Plotting transformed sample points 90 | ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ 91 | s = 150, linewidth = 4, color = 'black', marker = 'x'); 92 | 93 | # Plot title 94 | ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids with Numerical markingr\nTick marks denote Transformed Sample Data"); 95 | 96 | 97 | def biPlotter(cleanData, redData, pca): 98 | ''' 99 | Building a biplot for PCA of the reduced data and the projections of the original features. 100 | Variable cleanData: original data, before transformation. 
101 | Creating a pandas dataframe with valid column names 102 | redData: the reduced data (the first two dimensions are plotted) 103 | pca: pca object that contains the components_ attribute 104 | 105 | This function returns: a matplotlib AxesSubplot object (for any additional customization) 106 | This function is inspired by the script by Teddy Roland on Biplot in Python: 107 | https://github.com/teddyroland/python-biplot 108 | ''' 109 | 110 | fig, ax = plt.subplots(figsize = (14,8)) 111 | # scatterplot of the reduced data 112 | ax.scatter(x=redData.loc[:, 'Dimension 1'], y=redData.loc[:, 'Dimension 2'], 113 | facecolors='b', edgecolors='b', s=70, alpha=0.5) 114 | 115 | feature_vectors = pca.components_.T 116 | 117 | # we use scaling factors to make the arrows easier to see 118 | arrow_size, text_pos = 7.0, 8.0, 119 | 120 | # projections of the original features 121 | for i, v in enumerate(feature_vectors): 122 | ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1], 123 | head_width=0.2, head_length=0.2, linewidth=2, color='red') 124 | ax.text(v[0]*text_pos, v[1]*text_pos, cleanData.columns[i], color='black', 125 | ha='center', va='center', fontsize=18) 126 | 127 | ax.set_xlabel("Dimension 1", fontsize=14) 128 | ax.set_ylabel("Dimension 2", fontsize=14) 129 | ax.set_title("Principal Component plane with original feature projections.", fontsize=16); 130 | return ax 131 | 132 | 133 | def channelOutput(redData, outliers, pca_samples): 134 | ''' 135 | Here we are Visualizing the PCA-reduced cluster data in two dimensions using the full dataset 136 | Data is labeled by "Channel" points added for selected sample data 137 | ''' 138 | 139 | # Check that the dataset is loadable 140 | try: 141 | full_data = pd.read_csv("E:/retail2.csv") 142 | except: 143 | print "Dataset could not be loaded. Is the file missing?" 
144 | return False 145 | 146 | # Create the Channel DataFrame 147 | chl = pd.DataFrame(full_data['Channel'], columns = ['Channel']) 148 | chl = chl.drop(chl.index[outliers]).reset_index(drop = True) 149 | labeled = pd.concat([redData, chl], axis = 1) 150 | 151 | # Generate the cluster plot 152 | fig, ax = plt.subplots(figsize = (14,8)) 153 | 154 | # Color map 155 | cmap = cm.get_cmap('gist_rainbow') 156 | 157 | # Color the points based on assigned Channel 158 | labels = ['Segment 1/Segment 2/Segment3', 'Retail Customer'] 159 | grouped = labeled.groupby('Channel') 160 | for i, chl in grouped: 161 | chl.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 162 | color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); 163 | 164 | # Plot transformed sample points 165 | for i, sample in enumerate(pca_samples): 166 | ax.scatter(x = sample[0], y = sample[1], \ 167 | s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); 168 | ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); 169 | 170 | # Set plot title 171 | ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled"); 172 | 173 | ########################################################### 174 | ########################################################### 175 | ######### END OF FUNCTIONS FOR DATA VISUALIZATION ######### 176 | ########################################################### 177 | ########################################################### 178 | 179 | 180 | 181 | ########################################################################## 182 | ########################################################################## 183 | ####################### IMPLEMENTATION CODE ############################# 184 | ########################################################################## 185 | ########################################################################## 186 | 187 | ########################################### 188 | # I am Suppressing matplotlib and other module deprecation user warnings 189 | # This is Necessary for newer version of matplotlib 190 | import warnings 191 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 192 | warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validationb") 193 | warnings.filterwarnings("ignore", category=DeprecationWarning) 194 | import matplotlib.pyplot as plt 195 | import matplotlib.cm as cm 196 | import pandas as pd 197 | import numpy as np 198 | 199 | 200 | # Load the retail customers dataset 201 | try: 202 | data = pd.read_csv("E:\\retail2.csv") 203 | data.drop(['Region', 'Channel'], axis = 1, inplace = True) 204 | print "Retail customers dataset has {} samples with {} features each.".format(*data.shape) 205 | except: 206 | print "Dataset could not be loaded. Is the dataset missing?" 
207 | 208 | # Display a description of the dataset 209 | display(data.describe()) 210 | 211 | 212 | # Selecting three indices of my choice from the dataset 213 | indices = [225,182,400] 214 | 215 | # Creating a DataFrame of the chosen samples 216 | samples = pd.DataFrame(data.loc[indices], columns = data.keys()).reset_index(drop = True) 217 | print "Chosen samples of Retail customers dataset:" 218 | display(samples) 219 | 220 | #lOOKING AT THE PLOTS TO CHECK THE TREND 221 | pd.DataFrame(samples).transpose().plot(kind='bar', stacked=False, figsize=(8,8), title='Retail customers dataset') 222 | 223 | # Calculating deviations from Mean and Median 224 | #to check how much the 3 samples deviate from the Center. 225 | delta_mean = samples - data.mean().round() 226 | delta_median = samples - data.median().round() 227 | 228 | #display(delta_mean, delta_median) 229 | delta_mean.plot.bar(figsize=(30,10), title='Compared to MEAN', grid=True) 230 | delta_median.plot.bar(figsize=(30,10), title='Compared to MEDIAN', grid=True); 231 | 232 | #Importing Libraries 233 | from sklearn.cross_validation import train_test_split 234 | from sklearn.tree import DecisionTreeRegressor 235 | 236 | #Broke down the logic into a scoring function and a For loop 237 | def scorer(feature): 238 | # Make a copy of the DataFrame, using the 'drop' function to drop the given feature 239 | relevant_data = data.drop([feature], axis=1) 240 | 241 | # Split the data into training and testing sets using the given feature as the target 242 | X_train, X_test, y_train, y_test = train_test_split(relevant_data, data[feature], test_size=0.25, random_state=45) 243 | 244 | # Create a decision tree regressor and fit it to the training set 245 | regressor = DecisionTreeRegressor(random_state=45).fit(X_train, y_train) 246 | 247 | # Report the score of the prediction using the testing set 248 | score = regressor.score(X_test, y_test) 249 | 250 | print("The R2 score for feature {:16} is {:+.5f}".format(feature, score)) 251 | 252 | for feature in data.columns.values: 253 | scorer(feature) 254 | 255 | # Produce a scatter matrix for each pair of features in the data 256 | pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (24,12), diagonal = 'kde'); 257 | #checking Correlation table 258 | data.corr() 259 | 260 | # Scale the data using the natural logarithm 261 | log_data = np.log(data) 262 | # Scale the sample data using the natural logarithm 263 | log_samples = np.log(samples) 264 | # Display the log-transformed sample data 265 | display(log_samples) 266 | log_samples.corr() 267 | 268 | #Identifying Outliers 269 | #Using Counters 270 | from collections import Counter 271 | outliers_counter = Counter() 272 | 273 | # For each feature finding out the data points with extreme high or low values - Outliers 274 | outliers_scores = None 275 | 276 | for feature in log_data.keys(): 277 | 278 | # Calculate Q1 (25th percentile of the data) for the given feature 279 | Q1 = np.percentile(log_data[feature], 25) 280 | 281 | # Calculate Q3 (75th percentile of the data) for the given feature 282 | Q3 = np.percentile(log_data[feature], 75) 283 | 284 | # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range) 285 | step = 1.5 * (Q3 - Q1) 286 | 287 | empty = np.zeros(len(log_data[feature])) 288 | aboveQ3 = log_data[feature].values - Q3 - step 289 | belowQ3 = log_data[feature].values - Q1 + step 290 | current_outliers_scores = np.array(np.maximum(empty, aboveQ3) - np.minimum(empty, belowQ3)).reshape([-1,1]) 291 | outliers_scores 
= current_outliers_scores if outliers_scores is None else np.hstack([outliers_scores, current_outliers_scores]) 292 | 293 | # Display the outliers 294 | print("Data points considered outliers for the feature '{}':".format(feature)) 295 | current_outliers = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))] 296 | display(current_outliers) 297 | outliers_counter.update(current_outliers.index.values) 298 | 299 | # Select the indices for data points you wish to remove 300 | min_outliers_count = 2 301 | outliers = [x[0] for x in outliers_counter.items() if x[1] >= min_outliers_count] 302 | print("Data points considered outlier for more than 1 feature: {}".format(outliers)) 303 | 304 | # Remove the outliers, if any were specified 305 | good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True) 306 | 307 | 308 | # Apply PCA by fitting the good data with the same number of dimensions as features 309 | from sklearn.decomposition import PCA 310 | pca = PCA(n_components=len(good_data.columns)).fit(good_data) 311 | 312 | # Transform log_samples using the PCA fit above 313 | pca_samples =pca.transform(log_samples) 314 | 315 | # Generate PCA results plot 316 | pcaOutput = pcaOutput(good_data, pca) 317 | # show results 318 | display(pcaOutput) 319 | # Cumulative explained variance should add to 1 320 | display(pcaOutput['Explained Variance'].cumsum()) 321 | 322 | from sklearn.decomposition import PCA 323 | # Apply PCA by fitting the good data with only two dimensions 324 | pca = PCA(n_components=2).fit(good_data) 325 | 326 | # Transform the good data using the PCA fit above 327 | reduced_data = pca.transform(good_data) 328 | 329 | # Transform the sample log-data using the PCA fit above 330 | pca_samples = pca.transform(log_samples) 331 | 332 | # Create a DataFrame for the reduced data 333 | reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2']) 334 | 335 | # Display sample log-data after applying PCA transformation in two dimensions 336 | display(pd.DataFrame(np.round(pca_samples, 4), columns = ['Dimension 1', 'Dimension 2'])) 337 | 338 | # Create a biplot 339 | biPlotter(good_data, reduced_data, pca) 340 | 341 | #Import GMM and silhouette score library 342 | from sklearn.mixture import GMM 343 | from sklearn.metrics import silhouette_score 344 | 345 | #Divided the logic into Clustering and Scoring functions 346 | #Then Iterated through the clusters in the identified range 347 | 348 | #This function creates Cluster using GMM 349 | def clusterCreator(data, n_clusters): 350 | clusterer = GMM(n_components=n_clusters, covariance_type='full', random_state=45).fit(data) 351 | preds = clusterer.predict(data) 352 | centers = clusterer.means_ 353 | sample_preds = clusterer.predict(pca_samples) 354 | return preds, centers, sample_preds 355 | 356 | #Scoring after creating Clusters 357 | def scorer(data, n_clusters): 358 | preds, _, _ = clusterCreator(data, n_clusters) 359 | score = silhouette_score(data, preds) 360 | return score 361 | 362 | #Iterate in the clusters and Print silhouette score 363 | for n_clusters in range(2,10): 364 | score = scorer(reduced_data, n_clusters) 365 | print "For n_clusters = {}. 
The average silhouette_score is : {}".format(n_clusters, score) 366 | 367 | n_clusters=0 368 | 369 | # Resetting as due to loop run before we need to reset values again with 2 cluster components 370 | clusterer = GMM(n_components=2).fit(reduced_data) 371 | predictions = clusterer.predict(reduced_data) 372 | centers = clusterer.means_ 373 | sample_preds = clusterer.predict(pca_samples) 374 | # Display the results of the clustering from implementation 375 | clusterOutput(reduced_data, predictions, centers, pca_samples) 376 | 377 | log_centers = pca.inverse_transform(centers) 378 | 379 | # Exponentiate the centers 380 | true_centers = np.exp(log_centers) 381 | 382 | # Display the true centers 383 | segments = ['Segment {}'.format(i) for i in range(0,len(centers))] 384 | true_centers = pd.DataFrame(np.round(true_centers), columns = data.keys()) 385 | true_centers.index = segments 386 | display(true_centers) 387 | #Compare the True Centers from the Central values Mean and Median 388 | #display(true_centers - data.mean().round()) 389 | delta_mean=true_centers - data.mean().round() 390 | #display(true_centers - data.median().round()) 391 | delta_median=true_centers - data.median().round() 392 | delta_mean.plot.bar(figsize=(10,4), title='Compared to MEAN', grid=True) 393 | delta_median.plot.bar(figsize=(10,4), title='Compared to MEDIAN', grid=True); 394 | 395 | # Display the predictions 396 | for i, pred in enumerate(sample_preds): 397 | print "Sample point", i, "predicted to be in Cluster", pred 398 | samples 399 | 400 | # Display the clustering results based on 'Channel' data 401 | redData=reduced_data 402 | channelOutput(redData, outliers, pca_samples) 403 | 404 | ########################################################################## 405 | ########################################################################## 406 | ####################### END OF CODE ##################################### 407 | ########################################################################## 408 | ########################################################################## 409 | 410 | --------------------------------------------------------------------------------
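retail2.py checks feature relevance by dropping one column at a time, predicting it from the remaining columns with a DecisionTreeRegressor, and reporting the R-squared score. It imports train_test_split from sklearn.cross_validation, which was removed in scikit-learn 0.20. Here is a minimal sketch of the same idea against the current sklearn.model_selection API; the DataFrame and column names are synthetic stand-ins, since retail2.csv is read from a local path (E:\retail2.csv) and is not among the repository files listed above.

# Sketch only: "predict each feature from the others" relevance check,
# assuming scikit-learn >= 0.20 (train_test_split now lives in sklearn.model_selection).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(45)
# Synthetic stand-in for the retail spending table; column names are placeholders
data = pd.DataFrame({
    'Fresh': rng.lognormal(8, 1, 400),
    'Milk': rng.lognormal(7, 1, 400),
    'Grocery': rng.lognormal(7.5, 1, 400),
})
data['Detergents'] = 0.6 * data['Grocery'] + rng.lognormal(5, 1, 400)  # deliberately correlated

for feature in data.columns:
    X = data.drop(columns=[feature])
    y = data[feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45)
    score = DecisionTreeRegressor(random_state=45).fit(X_train, y_train).score(X_test, y_test)
    # A high R^2 means the feature is largely explained by the others, so it adds little on its own
    print("R^2 for {:12s}: {:+.3f}".format(feature, score))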
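The outlier section of retail2.py applies Tukey's rule feature by feature: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the log-transformed data are flagged, and rows flagged for at least min_outliers_count features are dropped before PCA. A compact sketch of that rule on synthetic stand-in data:

# Sketch only: Tukey / IQR outlier rule, mirroring the loop in retail2.py.
import numpy as np
import pandas as pd
from collections import Counter

rng = np.random.RandomState(0)
log_data = pd.DataFrame({          # stand-in for the log-transformed spending features
    'Fresh': rng.normal(8, 1, 200),
    'Milk': rng.normal(7, 1, 200),
})
log_data.iloc[0] = [15.0, -2.0]    # plant an obvious outlier in both features

flag_counts = Counter()
for feature in log_data.columns:
    Q1, Q3 = np.percentile(log_data[feature], [25, 75])
    step = 1.5 * (Q3 - Q1)                       # 1.5 times the interquartile range
    mask = (log_data[feature] < Q1 - step) | (log_data[feature] > Q3 + step)
    flag_counts.update(log_data[mask].index)

# Drop rows flagged as outliers in 2 or more features, as the script above does
outliers = [idx for idx, n in flag_counts.items() if n >= 2]
good_data = log_data.drop(index=outliers).reset_index(drop=True)
print("Dropped {} multi-feature outliers, {} rows remain".format(len(outliers), len(good_data)))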
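The PCA stage in retail2.py is fitted twice on purpose: once with as many components as features, purely to read explained_variance_ratio_ and confirm its cumulative sum reaches 1, and then again with n_components=2 for the biplot and clustering. The same two-stage pattern in isolation, on synthetic data:

# Sketch only: full PCA to inspect explained variance, then reduce to two dimensions.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.RandomState(1)
good_data = pd.DataFrame(rng.normal(size=(300, 4)),
                         columns=['Fresh', 'Milk', 'Grocery', 'Detergents'])  # placeholder names

full_pca = PCA(n_components=good_data.shape[1]).fit(good_data)
print(np.cumsum(full_pca.explained_variance_ratio_))   # cumulative variance, ends at 1.0

pca = PCA(n_components=2).fit(good_data)
reduced = pd.DataFrame(pca.transform(good_data), columns=['Dimension 1', 'Dimension 2'])
print(reduced.head())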
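sklearn.mixture.GMM, used for the clustering and the silhouette loop above, was deprecated in scikit-learn 0.18 and removed in 0.20; its replacement is GaussianMixture, which keeps the means_ attribute and predict method the script relies on. A sketch of the same cluster-count search with the current class, on synthetic blobs:

# Sketch only: choose the number of Gaussian mixture components by silhouette score,
# assuming scikit-learn >= 0.20 where GMM has been replaced by GaussianMixture.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

reduced_data, _ = make_blobs(n_samples=400, centers=3, n_features=2, random_state=45)

best = None
for n_clusters in range(2, 10):
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full',
                          random_state=45).fit(reduced_data)
    preds = gmm.predict(reduced_data)
    score = silhouette_score(reduced_data, preds)
    print("n_clusters = {}: silhouette = {:.4f}".format(n_clusters, score))
    if best is None or score > best[1]:
        best = (n_clusters, score)

# Refit with the best count and read the centers, as retail2.py does with clusterer.means_
final = GaussianMixture(n_components=best[0], covariance_type='full', random_state=45).fit(reduced_data)
print("Best n_clusters:", best[0])
print(final.means_)

One porting caveat: covariance_type='full' matches the argument already passed in clusterCreator(), but the old GMM defaulted to 'diag' while GaussianMixture defaults to 'full', so the final two-component refit near the end of retail2.py, which passes no covariance_type, may behave slightly differently if ported as-is.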