├── .gitattributes ├── 9781484237861.jpg ├── Accounting_Model_Final.py ├── CASESTUDY_FINAL.py ├── CaseStudy2Chapter4.py ├── Contributing.md ├── DirectPythChatbot.py ├── ElectronicPatientRecord.py ├── LICENSE.txt ├── Predict_NextDay_Open_STOCK_PRICE.py ├── Quandl_Data_Getter.py ├── README.md ├── Yahoo_Data_Getter.py ├── errata.md ├── fivepointsummary.py └── retail2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484237861.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/machine-learning-applications-using-python/9548a69d23c18218180977f277a8d0f2855f70dc/9781484237861.jpg -------------------------------------------------------------------------------- /Accounting_Model_Final.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Sep 11 14:03:51 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | #Importing python libraries 9 | import pandas as pd 10 | from io import StringIO 11 | import requests 12 | import os 13 | import numpy as np 14 | os.getcwd() 15 | 16 | #Creating Visualization Functions which will be used throughout this model 17 | import matplotlib.pyplot as pl 18 | import matplotlib.patches as mpatches 19 | import numpy as np 20 | import pandas as pd 21 | from time import time 22 | from sklearn.metrics import f1_score, accuracy_score 23 | 24 | 25 | def distribution(data, transformed = False): 26 | """ 27 | Visualization code for displaying skewed distributions of features 28 | """ 29 | 30 | # Create figure 31 | fig = pl.figure(figsize = (11,5)); 32 | 33 | # Skewed feature plotting 34 | for i, feature in enumerate(['Amount','Month', 'Fiscal Year']): 35 | ax = fig.add_subplot(1, 3, i+1) 36 | ax.hist(data[feature], bins = 25, color = '#00A0A0') 37 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14) 38 | ax.set_xlabel("Value") 39 | ax.set_ylabel("Number of Records") 40 | ax.set_ylim((0, 2000)) 41 | ax.set_yticks([0, 500, 1000, 1500, 2000]) 42 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"]) 43 | 44 | # Plot aesthetics 45 | if transformed: 46 | fig.suptitle("Log-transformed Distributions of Continuous Payments Data Features", \ 47 | fontsize = 16, y = 1.03) 48 | else: 49 | fig.suptitle("Skewed Distributions of Continuous Payments Data Features", \ 50 | fontsize = 16, y = 1.03) 51 | 52 | fig.tight_layout() 53 | fig.show() 54 | #End of Distribution Visualization function 55 | 56 | 57 | # Plotting Feature Importances through this function 58 | def feature_plot(importances, X_train, y_train): 59 | 60 | # Display the five most important features 61 | indices = np.argsort(importances)[::-1] 62 | columns = X_train.columns.values[indices[:5]] 63 | values = importances[indices][:5] 64 | 65 | # Create the plot 66 | fig = pl.figure(figsize = (9,5)) 67 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16) 68 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \ 69 | label = "Feature Weight") 70 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \ 71 | label = "Cumulative Feature Weight") 72 | pl.xticks(np.arange(5), columns) 73 | pl.xlim((-0.5, 4.5)) 74 | pl.ylabel("Weight", fontsize = 12) 75 |
pl.xlabel("Feature", fontsize = 12) 76 | 77 | pl.legend(loc = 'upper center') 78 | pl.tight_layout() 79 | pl.show() 80 | 81 | #End of Feature Importances function 82 | 83 | #Loading the Dataset 84 | fname="C:/DATASETS/data.ct.gov/PaymentsDataset.csv" 85 | openledger= pd.read_csv(fname, low_memory=False, index_col=False) 86 | 87 | #Verify the data Loaded into memory 88 | print(openledger.head(1)) 89 | 90 | #Loading into a DataFrame for easier computation 91 | data= pd.DataFrame(openledger) 92 | 93 | #Look at the first record 94 | print(data.head(1)) 95 | #Check the shape of columns in the dataset 96 | print(data.shape) 97 | print(data.columns) 98 | data.dtypes 99 | 100 | #Data Cleanup 101 | #Check if there are any columns with empty/null dataset 102 | data.isnull().any() 103 | 104 | 105 | #No data cleanup needed, so skipping the decision to drop the NA value rows since there are very few of them 106 | #data=data.dropna() 107 | data.isnull().sum() 108 | 109 | #Total number of records 110 | n_records = len(data.index) 111 | 112 | #Number of records where payments are above 1.5 times of the upper quartile - upper outlier limit 113 | 114 | l=data[data['RedFlag'] == 2].index 115 | n_greater_quantile = len(l) 116 | 117 | #Number of records where payments are below 1.5 times of the lower quartile - lower outlier limit 118 | l=data[data['RedFlag'] == 1].index 119 | n_lower_quantile = len(l) 120 | 121 | #Percentage of Payments above Upper Outlier limit 122 | p=float(n_greater_quantile)/n_records*100.0 123 | greater_percent =p 124 | 125 | #Percentage of Payments below Lower Outlier limit 126 | p=float(n_lower_quantile)/n_records*100.0 127 | lower_percent =p 128 | 129 | # Print the results 130 | print("Total number of records: {}".format(n_records)) 131 | print("High value Payments above 1.5 times of 75th Percentile: {}".format(n_greater_quantile)) 132 | print("Low value Payments below 1.5 times of 25th Percentile: {}".format(n_lower_quantile)) 133 | print("Percentage of high value Payments: {:.2f}%".format(greater_percent)) 134 | print("Percentage of low value Payments: {:.2f}%".format(lower_percent)) 135 | 136 | # PREPARING DATA 137 | # Split the data into features and target label 138 | payment_raw = pd.DataFrame(data['RedFlag']) 139 | type(payment_raw) 140 | features_raw = data.drop('RedFlag', axis = 1) 141 | #Removing redundant columns from features_raw dataset 142 | features_raw.dtypes 143 | features_raw=features_raw.drop('TransactionNo', axis=1) 144 | features_raw=features_raw.drop('Department', axis=1) 145 | features_raw=features_raw.drop('Account', axis=1) 146 | features_raw=features_raw.drop('Expense Category', axis=1) 147 | features_raw=features_raw.drop('Vendor ID', axis=1) 148 | features_raw=features_raw.drop('Payment Method', axis=1) 149 | features_raw=features_raw.drop('Payment Date', axis=1) 150 | features_raw=features_raw.drop('Invoice ID', axis=1) 151 | features_raw=features_raw.drop('Invoice Date', axis=1) 152 | features_raw=features_raw.drop('Unnamed: 0', axis=1) 153 | features_raw.dtypes 154 | type(features_raw) 155 | 156 | # Visualize skewed continuous features of original data 157 | distribution(data) 158 | 159 | # Log-transform the skewed features 160 | #Replacing Null values with zero due to software data entry problem 161 | #Known issue: the software user screen takes null values; there is no check.
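# Note (illustrative aside; the toy values are hypothetical, not from the dataset): the transform applied just below, np.log(x + 1), keeps zero amounts defined while compressing the long right tail of 'Amount', and is equivalent to the numerically safer np.log1p, e.g. np.log1p([0.0, 150.0, 2.5e6]) gives roughly [0.0, 5.02, 14.73].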
162 | import warnings 163 | warnings.filterwarnings("ignore") 164 | features_raw.isnull().sum() 165 | 166 | skewed = ['Amount','Month', 'Fiscal Year'] 167 | features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1)) 168 | features_raw.isnull().sum() 169 | features_raw.fillna(0, inplace=True) 170 | features_raw.dtypes 171 | 172 | # Visualize the new log distributions 173 | distribution(features_raw, transformed = True) 174 | 175 | #Normalizing Numerical Features 176 | # Import sklearn.preprocessing.MinMaxScaler 177 | from sklearn.preprocessing import MinMaxScaler 178 | 179 | # Initialize a scaler, then apply it to the features 180 | scaler = MinMaxScaler() 181 | numerical = [ 'Amount','Month', 'Fiscal Year'] 182 | #features_raw[numerical] = scaler.fit_transform(data[numerical]) 183 | features_raw[numerical] = scaler.fit_transform(features_raw[numerical]) 184 | distribution(features_raw) 185 | from IPython.display import display 186 | # Look at record with scaling applied to see if everything is good 187 | display(features_raw.head(n = 1)) 188 | features_raw.columns 189 | #Implementation: Data Preprocessing 190 | # One-Hot Encoding 191 | data.columns 192 | data.dtypes 193 | data['Payment Status'] 194 | #data['Expense Category'] #drop from features_raw 195 | #data['Payment Method'] #drop from features_raw 196 | 197 | # Encoding the 'Non-Numeric' data to numerical values 198 | #Payment Status column 199 | d={"Paid-Reconciled": 0, "Paid-Unreconciled": 1} 200 | features_raw['Payment Status'] = features_raw['Payment Status'].map(d) 201 | 202 | # Printing the number of features after one-hot encoding 203 | encoded = list(features_raw.columns) 204 | print("{} total features after one-hot encoding.".format(len(encoded))) 205 | 206 | 207 | # Importing train_test_split 208 | from sklearn.model_selection import train_test_split 209 | payment_raw.columns 210 | # Splitting the 'features' and 'RedFlags' data into training and testing sets 211 | X_train, X_test, y_train, y_test = train_test_split(features_raw, payment_raw, test_size = 0.2, random_state = 0) 212 | X_train.columns 213 | X_test.columns 214 | y_train.columns 215 | y_test.columns 216 | # Showing the results of the split 217 | print("Training set has {} samples.".format(X_train.shape[0])) 218 | print("Testing set has {} samples.".format(X_test.shape[0])) 219 | 220 | #Checking to see if there are any empty rows 221 | X_train.isnull().any() 222 | y_train.isnull().any() 223 | X_train.isnull().sum() 224 | #Evaluating Model Performance 225 | #Establishing Benchmark performance indicator Naive Bayes 226 | #Naive Predictor Performance 227 | from sklearn.naive_bayes import GaussianNB 228 | #from sklearn.metrics import accuracy_score, fbeta_score 229 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix 230 | NB = GaussianNB() 231 | NB.fit(X_train,y_train) 232 | pred = NB.predict(X_test) 233 | 234 | #Calculating Accuracy Score 235 | #Calculating Beta Score 236 | accuracy_score(y_test,pred) 237 | print(f1_score(y_test,pred, average="macro")) 238 | print(precision_score(y_test, pred, average="macro")) 239 | print(recall_score(y_test, pred, average="macro")) 240 | 241 | # I AM GOING TO DO INITIAL MODEL EVALUATION NOW 242 | # Importing the supervised learning models from sklearn 243 | from sklearn.naive_bayes import GaussianNB 244 | from sklearn.tree import DecisionTreeClassifier 245 | from sklearn.linear_model import LogisticRegression 246 | from sklearn.linear_model import SGDClassifier 247 | from sklearn.ensemble import ExtraTreesClassifier
248 | from sklearn.ensemble import RandomForestClassifier 249 | 250 | # Initialize the six models 251 | clf_A = GaussianNB() 252 | clf_B = DecisionTreeClassifier(max_features=0.2, max_depth=2, min_samples_split=2,random_state=0) 253 | clf_C = LogisticRegression(random_state=0) 254 | clf_D = SGDClassifier(loss="hinge", penalty="l2") 255 | clf_E = ExtraTreesClassifier(n_estimators=2, max_depth=2,min_samples_split=2, random_state=0) 256 | clf_F = RandomForestClassifier(max_depth=2) 257 | 258 | # Calculate the number of samples for 1%, 10%, and 100% of the training data 259 | #Defining function since percent is required 3 times 260 | 261 | 262 | # Collect results on the learners 263 | learners=["Naive Bayes","Decision Tree","Logistic Regression","SGD Classifier","ExtraTrees Classifier","RandomForest Classifier"] 264 | cnt=0 265 | columns=['learner','train_time','pred_time','acc_train','acc_test','f1_score'] 266 | learningresults= pd.DataFrame(columns=columns) 267 | results = {} 268 | 269 | for learner in [clf_A, clf_B, clf_C,clf_D,clf_E,clf_F]: 270 | #print(learners[cnt]) 271 | results['learner']=learners[cnt] 272 | #Fitting the learner to the training data 273 | start = time() # Get start time 274 | learner.fit(X_train, y_train) 275 | #Calculating the total training time 276 | end = time() # Get end time 277 | results['train_time'] = end - start 278 | start = time() # Get start time 279 | predictions_test = learner.predict(X_test) 280 | predictions_train = learner.predict(X_train) 281 | end = time() # Get end time 282 | results['pred_time'] = end - start 283 | results['acc_train'] = accuracy_score(y_train, predictions_train) 284 | results['acc_test'] = accuracy_score(y_test, predictions_test) 285 | beta=0.5 286 | results['f1_score'] = f1_score(y_test, predictions_test, average="macro") 287 | print(results) 288 | learningresults.loc[cnt]=results 289 | cnt=cnt+1 290 | 291 | #Looking at the plots to determine the best Classifier for our Dataset 292 | print(learningresults) 293 | learningresults.columns 294 | learningresults.plot(kind='bar', x='learner', legend='reverse', title='Classifier Algorithms Compared- Accounts Payment Dataset',figsize=(10,10), fontsize=20) 295 | #learningresults.plot(kind='bar', x='learner').savefig('E:/BUSINESS/APRESS/ApplicationsOfMachineLearning/Chapter15/learner_performance.png') 296 | 297 | #--------------------------MODEL TUNING ------------------------ 298 | #Now I will implement Model tuning 299 | # Import 'GridSearchCV', 'make_scorer', and any other necessary libraries 300 | from sklearn.metrics import make_scorer 301 | from sklearn.model_selection import GridSearchCV 302 | from IPython.display import display 303 | import pickle, os.path 304 | from sklearn.linear_model import LogisticRegression 305 | from sklearn.metrics import fbeta_score 306 | 307 | def getscore(y_true, y_predict): 308 | return fbeta_score(y_true, y_predict, beta=beta) 309 | 310 | best_clf = None 311 | beta=0.5 312 | 313 | #Initialize the classifier 314 | 315 | clf_C = LogisticRegression(random_state=0) 316 | 317 | # Create the parameters list you wish to tune 318 | #parameters = {'n_estimators':range(10,20),'criterion':['gini','entropy'],'max_depth':range(1,5)} 319 | parameters = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],'C':range(1,10),'max_iter':range(50,100)} 320 | 321 | # Make an fbeta_score scoring object 322 | scorer = make_scorer(getscore) 323 | 324 | # Perform grid search on the classifier using 'scorer' as the scoring method
325 | #grid_obj = GridSearchCV(clf_C, parameters, scoring=scorer) 326 | 327 | 328 | #Note: the custom 'scorer' is not passed here, so GridSearchCV falls back to the estimator's default scoring 329 | grid_obj = GridSearchCV(clf_C, parameters) 330 | 331 | #grid_obj = GridSearchCV(clf_C, parameters) 332 | 333 | # Fit the grid search object to the training data and find the optimal parameters 334 | from datetime import datetime 335 | startTime = datetime.now() 336 | 337 | grid_fit = grid_obj.fit(X_train, y_train) 338 | CV_lr = GridSearchCV(estimator=clf_C, param_grid=parameters, cv= 5) # second, 5-fold grid search; its result is not used below 339 | CV_lr.fit(X_train, y_train) 340 | print(datetime.now() - startTime) 341 | 342 | # Get the estimator 343 | best_clf = grid_fit.best_estimator_ 344 | 345 | # Make predictions using the unoptimized and optimized models 346 | predictions = (clf_C.fit(X_train, y_train)).predict(X_test) 347 | best_predictions = best_clf.predict(X_test) 348 | 349 | # Report the before-and-after scores 350 | print("Unoptimized model\n------") 351 | print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))) 352 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5,average='micro'))) 353 | print("\nOptimized Model\n------") 354 | print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) 355 | print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5,average='micro'))) 356 | 357 | # Print the final parameters 358 | df = pd.DataFrame(grid_fit.cv_results_).sort_values('mean_test_score').tail() 359 | display(df) 360 | print("Parameters for the optimal model: {}".format(best_clf.get_params())) 361 | 362 | # Now Extracting Feature Importances 363 | # Importing a supervised learning model that has 'feature_importances_' 364 | from sklearn.ensemble import ExtraTreesClassifier 365 | from sklearn.feature_selection import SelectFromModel 366 | from sklearn.metrics import fbeta_score 367 | 368 | # Training the supervised model on the training set 369 | model = ExtraTreesClassifier() 370 | model.fit(X_train, y_train) 371 | 372 | # Extract the feature importances 373 | importances = model.feature_importances_ 374 | 375 | # Plot 376 | feature_plot(importances, X_train, y_train) 377 | 378 | # Feature Selection 379 | # Import functionality for cloning a model 380 | from sklearn.base import clone 381 | best_clf= clf_F # use the random forest trained in the earlier comparison as the final model 382 | # Reduce the feature space 383 | X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]] 384 | X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]] 385 | 386 | # Train a clone of the chosen model on the reduced feature set 387 | clf = (clone(best_clf)).fit(X_train_reduced, y_train) 388 | best_predictions = best_clf.predict(X_test) 389 | # Make new predictions 390 | reduced_predictions = clf.predict(X_test_reduced) 391 | 392 | # Report scores from the final model using both versions of data 393 | print("Final Model trained on full data\n------") 394 | print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) 395 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, average="macro", beta = 0.5))) 396 | print("\nFinal Model trained on reduced data\n------") 397 | print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions))) 398 | print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta = 0.5, average='macro'))) 399 | 400 | # Print the final parameters 401 | 402 | print("Parameters for the final model: {}".format(clf.get_params()))
-------------------------------------------------------------------------------- /CASESTUDY_FINAL.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 30 13:30:46 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Loading libraries 8 | import pandas as pd 9 | 10 | #Loading the retail dataset 11 | survey=pd.read_csv("C:/Puneet/All_DOCS/Prediction_Files/DataScience-Python3_UDEMY_COURSE/DataScience-Python3/ml-100k/aystsaga_002.csv") 12 | 13 | #Looking at the dataset one row 14 | print(survey.head(1)) 15 | #Shape of the dataset Rows and Columns 16 | print(survey.shape) 17 | 18 | #Looking at the columns of the dataset 19 | print(survey.columns) 20 | #What are the datatypes of each column? Do we need to convert any of them? 21 | print(survey.dtypes) 22 | #Are there any Null values in the dataset 23 | print(survey.isnull().any()) 24 | 25 | #Creating a pivot table for product ratings 26 | productRatings = survey.pivot_table(index=['user_id'],columns=['product_name'],values='rating') 27 | #Peeking into the pivot table 28 | productRatings.head(1) 29 | 30 | ##===========================Customer chooses a product====================================================== 31 | #Sample product rating view; assume here that a customer selected the product 'Dalmere Chocolate cup' 32 | #What recommendation are you going to give based on this selection 33 | #Users who bought 'Dalmere Chocolate cup' also bought: 34 | dalmereRatings = productRatings['Dalmere Chocolate cup'] 35 | dalmereRatings.head() 36 | 37 | #Now finding products with similar ratings 38 | similarProducts = productRatings.corrwith(dalmereRatings) 39 | #Dropping the NA values to get more meaningful data 40 | similarProducts = similarProducts.dropna() 41 | df = pd.DataFrame(similarProducts) 42 | df.head(10) 43 | 44 | similarProducts.sort_values(ascending=False) 45 | 46 | #Now let us get rid of spurious results in the similarities 47 | import numpy as np 48 | productStats = survey.groupby('product_name').agg({'rating': [np.size, np.mean]}) 49 | productStats.head() 50 | 51 | productStats.sort_values([('rating', 'mean')], ascending=False)[:15] 52 | 53 | #I am now keeping only products with at least 50 ratings to create meaningful results which matter to the business 54 | popularProducts = productStats['rating']['size'] >= 50 55 | productStats[popularProducts].sort_values([('rating', 'mean')], ascending=False)[:15] 56 | df = productStats[popularProducts].join(pd.DataFrame(similarProducts, columns=['similarity'])) 57 | df.head(15) 58 | df.sort_values(['similarity'], ascending=False)[:15] 59 | #Visualizing the Bar plot for similarities 60 | df['similarity'].plot(kind='bar') 61 | ax = df['similarity'].plot(kind='bar') 62 | x_offset = -0.03 63 | y_offset = 0.02 64 | for p in ax.patches: 65 | b = p.get_bbox() 66 | val = "{:+.2f}".format(b.y1 + b.y0) 67 | ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 + y_offset)) 68 | 69 | #===========================Customer chooses another product====================================================== 70 | #2nd Sample Product rating view 71 | aesoRatings = productRatings['Aeso napkins 10 PACK'] 72 | aesoRatings.head() 73 | 74 | #Now finding products with similar ratings 75 | similarProducts = productRatings.corrwith(aesoRatings) 76 | similarProducts = similarProducts.dropna() 77 | df = pd.DataFrame(similarProducts) 78 | df.head(10) 79 | 80 |
similarProducts.sort_values(ascending=False) 81 | 82 | #Now let us get rid of spurious results in the similarities 83 | import numpy as np 84 | productStats = survey.groupby('product_name').agg({'rating': [np.size, np.mean]}) 85 | productStats.head() 86 | 87 | productStats.sort_values([('rating', 'mean')], ascending=False)[:15] 88 | 89 | #I am now getting rid of ratings size greater than 20 to create meaning results which matter to the business 90 | popularProducts = productStats['rating']['size'] >= 50 91 | productStats[popularProducts].sort_values([('rating', 'mean')], ascending=False)[:15] 92 | df = productStats[popularProducts].join(pd.DataFrame(similarProducts, columns=['similarity'])) 93 | df.head(15) 94 | df.sort_values(['similarity'], ascending=False)[:15] 95 | 96 | #Visualization of similarities 97 | df['similarity'].plot(kind='bar') 98 | ax = df['similarity'].plot(kind='bar') 99 | x_offset = -0.03 100 | y_offset = 0.02 101 | for p in ax.patches: 102 | b = p.get_bbox() 103 | val = "{:+.2f}".format(b.y1 + b.y0) 104 | ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 + y_offset)) 105 | 106 | -------------------------------------------------------------------------------- /CaseStudy2Chapter4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 17 13:16:52 2018 4 | 5 | @author: PMAUTHOR 6 | """ 7 | 8 | import pandas as pd 9 | from io import StringIO 10 | import requests 11 | import os 12 | os.getcwd() 13 | 14 | fname="Food_Raw_Data.csv" 15 | hospitals= pd.read_csv(fname, low_memory=False, index_col=False) 16 | df= pd.DataFrame(hospitals) 17 | print(df.head(1)) 18 | 19 | print(df.size) 20 | print(df.shape) 21 | print(df.columns) 22 | df.dtypes 23 | 24 | #Check if there are any columns with empty/null dataset 25 | df.isnull().any() 26 | #Checking how many columns have null values 27 | df.info() 28 | 29 | #Using individual functions to do EDA 30 | #Checking out Statistical data Mean Median Mode correlation 31 | df.mean() 32 | df.median() 33 | df.mode() 34 | 35 | #How is the data distributed and detecting Outliers 36 | df.std() 37 | df.max() 38 | df.min() 39 | df.quantile(0.25)*1.5 40 | df.quantile(0.75)*1.5 41 | 42 | #How many Outliers in the Total Food ordered column 43 | df.columns 44 | df.dtypes 45 | df.set_index(['Hospital Name']) 46 | df['Total Food ordered'].loc[df['Total Food ordered'] <=238.5].count() 47 | df['Total Food ordered'].loc[df['Total Food ordered'] >=679.5].count() 48 | 49 | #Visualizing the dataset 50 | df.boxplot(figsize=(10, 6)) 51 | df.plot.box(vert=False) 52 | df.kurtosis() 53 | df.skew() 54 | import scipy.stats as sp 55 | sp.skew(df['Total Food ordered']) 56 | 57 | #Visualizing dataset 58 | df.plot() 59 | df.hist(figsize=(10, 6)) 60 | df.plot.area() 61 | df.plot.area(stacked=False) 62 | 63 | 64 | #Now look at correlation and patterns 65 | df.corr() 66 | 67 | #Change to dataset columns 68 | df.plot.scatter(x='Total Food Wasted', y='No of Guests with Inpatient',s=df['Total Food Wasted']*2) 69 | df.plot.hexbin(x='Total Food Wasted', y='No of Guests with Inpatient', gridsize=25) 70 | 71 | #Change to dataset columns 72 | #Look at crosstabulation to conclude EDA 73 | df.columns 74 | df.dtypes 75 | #Counting the Categorical variables 76 | my_tab = pd.crosstab(index=df["Feedback"], columns="Count") # Name the count column 77 | my_tab = pd.crosstab(index=df["Type of Hospital"], columns="Count") # Name the count column 78 | 79 | print(my_tab) 80 | my_tab=my_tab.sort_values('Count', 
ascending=[False]) 81 | print(my_tab) 82 | #my_tab.sum() 83 | data_counts = pd.DataFrame(my_tab) 84 | pd.DataFrame(data_counts).transpose().plot(kind='bar', stacked=False) 85 | 86 | #Data Preparation Steps 87 | #Step 1 Split data into features and target variable 88 | # Split the data into features and target label 89 | wastage = pd.DataFrame(df['Total Food Wasted']) 90 | 91 | dropp=df[['Total Food Wasted','Feedback','Type of Hospital','Total No of beds']] 92 | features= df.drop(dropp, axis=1) 93 | wastage.columns 94 | features.columns 95 | 96 | #Step 2 Shuffle & Split Final Dataset 97 | # Import train_test_split 98 | from sklearn.cross_validation import train_test_split 99 | from sklearn.utils import shuffle 100 | 101 | # Shuffle and split the data into training and testing subsets 102 | features=shuffle(features, random_state=0) 103 | wastage=shuffle(wastage, random_state=0) 104 | # Split the 'features' and 'income' data into training and testing sets 105 | X_train, X_test, y_train, y_test = train_test_split(features, wastage, test_size = 0.2, random_state = 0) 106 | 107 | # Show the results of the split 108 | print("Training set has {} samples.".format(X_train.shape[0])) 109 | print("Testing set has {} samples.".format(X_test.shape[0])) 110 | 111 | 112 | #Model Building & Evaluation 113 | #Creating the the Model for prediction 114 | 115 | #Loading model Libraries 116 | import matplotlib.pyplot as plt 117 | import numpy as np 118 | from sklearn import linear_model 119 | from sklearn.metrics import mean_squared_error, r2_score 120 | from sklearn.svm import LinearSVC 121 | 122 | 123 | #Creating Linear Regression object 124 | regr = linear_model.LinearRegression() 125 | linear_svm = LinearSVC().fit(X_train,y_train) 126 | 127 | regr.fit(X_train,y_train) 128 | y_pred= regr.predict(X_test) 129 | yy_pred= linear_svm.predict(X_test) 130 | #Printing Codfficients 131 | print('Coefficients: \n',regr.coef_) 132 | print(LinearSVC().fit(X_train,y_train).coef_) 133 | regr.score(X_train,y_train) 134 | #Mean squared error 135 | print("mean squared error: %.2f" %mean_squared_error(y_test,y_pred)) 136 | 137 | #Variance score 138 | print("Variance score: %2f" % r2_score(y_test, y_pred)) 139 | 140 | #Plot and visualize the Linear Regression plot 141 | plt.plot(X_test, y_pred, linewidth=3) 142 | plt.show() 143 | 144 | #Checking graphically the boundaries formed by Linear SVM 145 | line = np.linspace(-15, 15) 146 | for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_): 147 | plt.plot(line, -(line * coef[0] + intercept) / coef[1]) #HOW DO WE KNOW 148 | plt.ylim(-10, 15) 149 | plt.xlim(-10, 8) 150 | plt.show() 151 | 152 | predicted= regr.predict([[820,81,363,35]]) 153 | print(predicted) 154 | 155 | features.head(2) 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. 
Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /DirectPythChatbot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 24 10:20:18 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Loading tkinter libraries which will be used in the UI of chatbot 8 | import tkinter 9 | from tkinter import * 10 | from tkinter.scrolledtext import * 11 | from tkinter import ttk 12 | import time 13 | from PIL import ImageTk, Image 14 | import tkinter 15 | 16 | #Loading random for random choices in our chat program 17 | import random 18 | 19 | #Splash Screen 20 | splash = tkinter.Tk() 21 | 22 | splash.title("Welcome to Applications of Machine Learning in Healthcare, Retail & Finance") 23 | splash.geometry("1000x100") 24 | splash.configure(background='green') 25 | w = Label(splash, text="DirectPyth Diabetes Diagnostics Chatbot by Puneet Mathur\nLoading...",font=("Helvetica", 26),fg="white",bg="green") 26 | w.pack() 27 | 28 | splash.update() 29 | time.sleep(6) 30 | splash.deiconify() 31 | splash.destroy() 32 | 33 | #Initializing tkinter library for UI window showup 34 | window = tkinter.Tk() 35 | s = tkinter.Scrollbar(window) 36 | chatmsg = tkinter.Text(window) 37 | chatmsg.focus_set() 38 | s.pack(side=tkinter.RIGHT, fill=tkinter.Y) 39 | chatmsg.pack(side=tkinter.TOP, fill=tkinter.Y) 40 | s.config(command=chatmsg.yview) 41 | chatmsg.config(yscrollcommand=s.set) 42 | input_user = StringVar() 43 | input_field = Entry(window, text=input_user) 44 | input_field.pack(side=tkinter.BOTTOM, fill=tkinter.X) 45 | bot_text="Welcome to DirectPyth Diagnostic Center\n" 46 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 47 | bot_text = "Press enter to continue " 48 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 49 | chatmsg.focus() 50 | 51 | #Diagnostics Corpus for the chatbot 52 | greet=['Hello welcome to DirectPyth','Hi welcome to DirectPyth','Hey welcome to DirectPyth','Good day to you welcome to DirectPyth'] 53 | confirm=['yes','yay','yeah','yo'] 54 | memberid=['12345','12346','12347','12348','12349'] 55 | customer = ['hello','hi','hey'] 56 | answer = ['I uderstand you feel happy but please stay to the point and select one of the options',"I sympathize with you, However please do not deviate from the topic"] 57 | greetings = ['hola Welcome to DirectPyth again', 'hello Welcome to DirectPyth again', 'hi Welcome to DirectPyth again', 'Hi Welcome to DirectPyth again', 'hey! 
Welcome to DirectPyth again', 'hey Welcome to DirectPyth again'] 58 | question = ['how are you?', 'how are you doing?'] 59 | responses = ['Okay', "I'm fine"] 60 | tests=['Type 1 for hbA1c test', "Type 2 for Blood Viscosity test","Type 3 for Heart rate test","Type 4 for Blood Oxygen test","Type 5 for Blood Pressure"] 61 | testresponse= ['1','2','3','4','5','6'] 62 | 63 | #Global variable to check first time greeting 64 | firstswitch=1 65 | newid="12310" 66 | memid=0 67 | 68 | def chat(event): 69 | import time 70 | import random 71 | global memid 72 | condition="" 73 | #Greet for first time 74 | global firstswitch 75 | print(firstswitch) 76 | 77 | if (firstswitch==1): 78 | bot_text = random.choice(greet) 79 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 80 | bot_text = "If you are an existing Member of DirectPyth please enter your membershipid: or enter no if you are not a member" 81 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 82 | firstswitch=2 83 | if (firstswitch!=1): 84 | input_get = input_field.get().lower() 85 | if any(srchstr in input_get for srchstr in memberid): 86 | memid=input_get 87 | bot_text = "Thank you for being a loyal member of DirectPyth\n Please choose a test from following menu to continue\nType 1 for hbA1c test\nType 2 for Blood Viscosity test\nType 3 for Heart rate test\nType 4 for Blood Oxygen test\nType 5 for Blood pressure test\nType 6 to exit\n\n" 88 | elif (input_get=="no"): 89 | memid=newid 90 | bot_text = "Your new Memberid is: " + newid + " Please remember this for future reference.\n Please choose a test from following menu to continue\nType 1 for hbA1c test\nType 2 for Blood Viscosity test\nType 3 for Heart rate test\nType 4 for Blood Oxygen test\nType 5 for Blood pressure test\nType 6 to exit\n\n" 91 | elif any(srchstr in input_get for srchstr in testresponse): 92 | 93 | bot_text = "Please place any of your finger on the Finger panel above to conduct the test" 94 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 95 | delaycounter=0 96 | for delaycounter in range(0,10): 97 | bot_text = str(delaycounter) 98 | time.sleep(1) 99 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 100 | bot_text = "Please wait generating your report\n" 101 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 102 | time.sleep(2) 103 | if (input_get=="1"): 104 | hba1c=random.randint(4,10) 105 | bot_text = "MemberID: " + str(memid) + " Your hbA1c test result is: " + str(hba1c) 106 | if(hba1c>=4 and hba1c<=5.6): 107 | condition="You are don't have diabetes" 108 | elif(hba1c>=5.7 and hba1c<=6.4): 109 | condition="You are Prediabetic" 110 | elif(hba1c>=6.5): 111 | condition="You are Diabetic" 112 | bot_text=bot_text + " Your condition is: " + condition 113 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 114 | elif (input_get=="2"): 115 | viscosity=random.randint(20,60) 116 | bot_text = "MemberID: " + str(memid) + " Your Blood Viscosity level test result is: " + str(viscosity) 117 | elif (input_get=="3"): 118 | heartrate=random.randint(40,150) 119 | bot_text = "MemberID: " + str(memid) + " Your Heart rate test result is: " + str(heartrate) 120 | elif (input_get=="4"): 121 | oxygen=random.randint(90,100) 122 | bot_text = "MemberID: " + str(memid) + " Your Blood Oxygen level test result is: " + str(oxygen) 123 | elif (input_get=="5"): 124 | systolic=random.randint(80,200) 125 | diastolic=random.randint(80,110) 126 | bot_text = "MemberID: " + str(memid) + " Your Blood Pressure test result is: Systolic: " + str(systolic) + " Diastolic: " + str(diastolic) 127 | elif (input_get=="6"): 128 | import 
sys 129 | window.deiconify() 130 | window.destroy() 131 | sys.exit(0) 132 | else: 133 | from nltk.stem import WordNetLemmatizer 134 | import nltk 135 | if((not input_get) or (int(input_get)<=0)): 136 | print("did you just press Enter?") #print some info 137 | else: 138 | lemmatizer = WordNetLemmatizer() 139 | input_get = input_field.get().lower() 140 | lemvalue=lemmatizer.lemmatize(input_get) 141 | whatsentiment=getSentiment(lemvalue) 142 | if (whatsentiment=="pos"): 143 | bot_text = answer[0] 144 | #print("Positive Sentiment") 145 | elif (whatsentiment=="neg"): 146 | bot_text = answer[1] 147 | #print("Negative Sentiment") 148 | chatmsg.insert(INSERT, '%s\n' % lemvalue) 149 | #bot_text = "I did not understand what you said !" 150 | 151 | 152 | 153 | 154 | chatmsg.insert(INSERT, 'Bot:%s\n' % bot_text) 155 | #label = Label(window, text=input_get) 156 | input_user.set('') 157 | #label.pack() 158 | return "break" 159 | 160 | #Sentiment Analyzer using NLP 161 | def getSentiment(text): 162 | import nltk 163 | from nltk.tokenize import word_tokenize 164 | 165 | #nltk.download('punkt') 166 | # Step 1 – Training data building the Diabetes corpus 167 | train = [("Thanks for an excellent report", "pos"), 168 | ("Your service is very quick and fast", "pos"), 169 | ("I am pleased with your service", "pos"), 170 | ("I did not know i was diabetic until you gave me this report", "neg"), 171 | ("Service - Little slow, probably because too many people.", "neg"), 172 | ("The place is not easy to locate", "neg"), 173 | ("The place is very easy to locate", "pos"), 174 | ("Not satisfied will take a second opinion", "neg"), 175 | ("No human contact everything is so robotic here", "neg"), 176 | ("can i talk to a human not convinced with your report", "neg"), 177 | ("good results", "pos"), 178 | ("good service", "pos"), 179 | ("great service", "pos"), 180 | ("excellent service", "pos"), 181 | ("amazing technology", "pos"), 182 | ("fast service and satisfying report", "pos"), 183 | ("your report sucks", "neg"), 184 | ("this report will cost me a fortune", "neg"), 185 | ("I have diabetes", "neg"), 186 | ("this report will cost me a fortune", "neg"), 187 | ("this report means i have a dreadful disease", "neg"), 188 | ("will i need to take new medication", "neg"), 189 | ("i need to take my insulin injections regularly", "neg"), 190 | ("my lipids are getting worst need to talk to the doctor", "neg"), 191 | ("oh my god very bad results", "neg"), 192 | ("bad service", "neg"), 193 | ("very bad service", "neg"), 194 | ("poor service", "neg"), 195 | ("very bad service", "neg"), 196 | ("slow service", "neg"), 197 | ("very slow service", "neg"), 198 | ("diabetes got worst is this report accurate", "neg"), 199 | ("i dont believe this report", "neg"), 200 | ("i dont like this report", "neg"), 201 | ("i am in a diabetic hell", "neg"), 202 | ] 203 | # Step 2 Tokenize the words to dictionary 204 | dictionary = set(word.lower() for passage in train for word in word_tokenize(passage[0])) 205 | # Step 3 Locate the word in training data 206 | t = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in train] 207 | # Step 4 – the classifier is trained with sample data 208 | classifier = nltk.NaiveBayesClassifier.train(t) 209 | test_data = "oh my god what is this" 210 | test_data_features = {word.lower(): (word in word_tokenize(test_data.lower())) for word in dictionary} 211 | print (classifier.classify(test_data_features)) 212 | return classifier.classify(test_data_features) 213 | 214 | #Start the program chat 
and put in loop 215 | input_field.bind("<Return>", chat) 216 | tkinter.mainloop() 217 | 218 | -------------------------------------------------------------------------------- /ElectronicPatientRecord.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 15 23:46:06 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Importing python libraries 8 | import pandas as pd 9 | from io import StringIO 10 | import requests 11 | import os 12 | os.getcwd() 13 | 14 | #Reading dataset from flat file 15 | fname="Diabetes_Dataset.csv" 16 | patients= pd.read_csv(fname, low_memory=False, index_col=False) 17 | df= pd.DataFrame(patients) 18 | 19 | #Look at the first record 20 | print(df.head(1)) 21 | #Check the shape size and columns in the dataset 22 | print(df.size) 23 | print(df.shape) 24 | print(df.columns) 25 | df.dtypes 26 | #Check if there are any columns with empty/null dataset 27 | df.isnull().any() 28 | #Checking how many columns have null values 29 | df.info() 30 | 31 | dfworking=df.drop('Patient ID',axis=1) 32 | #You could use the describe() method here, but since we have many columns 33 | #we will use individual functions to do EDA 34 | print(dfworking.describe()) 35 | 36 | #Using individual functions to do EDA 37 | #Checking out Statistical data Mean Median Mode correlation 38 | dfworking.mean() 39 | dfworking.median() 40 | dfworking.mode() 41 | 42 | 43 | #How is the data distributed and detecting Outliers 44 | dfworking.std() 45 | dfworking.max() 46 | dfworking.min() 47 | dfworking.quantile(0.25)*1.5 48 | dfworking.quantile(0.75)*1.5 49 | 50 | #How many Outliers in the BPSystolic column 51 | df.columns 52 | df.dtypes 53 | dfworking.set_index(['BPSystolic']) 54 | dfworking['BPSystolic'].loc[df['BPSystolic'] >=183.562500].count() 55 | df.set_index(['Patient ID']) 56 | 57 | 58 | dfworking.boxplot(figsize=(10, 6)) 59 | dfworking.plot.box(vert=False) 60 | dfworking.boxplot(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6)) 61 | dfworking.kurtosis() 62 | dfworking.skew() 63 | import scipy.stats as sp 64 | sp.skew(dfworking.A1cTEST) 65 | 66 | #Visualizing dataset 67 | dfworking.plot() 68 | dfworking.hist(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6)) 69 | dfworking.plot.area() 70 | dfworking.plot.area(stacked=False) 71 | 72 | 73 | #Now look at correlation and patterns 74 | dfworking.corr() 75 | dfworking.plot.scatter(x='A1cTEST', y='BPSystolic',s=dfworking['A1cTEST']*2) 76 | dfworking.plot.scatter(x='A1cTEST', y='BPSystolic',s=dfworking['BPSystolic']*0.13) 77 | dfworking.plot.hexbin(x='A1cTEST', y='BPSystolic', gridsize=25) 78 | 79 | #Look at crosstabulation to conclude EDA 80 | df.columns 81 | #Counting the Categorical variables 82 | my_tab = pd.crosstab(index=df["Gender"], columns="Count") # Name the count column 83 | my_tab = pd.crosstab(index=df["Type of diabetes"], columns="Count") # Name the count column 84 | my_tab = pd.crosstab(index=df["Diabetes status"], columns="Count") # Name the count column 85 | my_tab = pd.crosstab(index=df["FrozenShoulder"], columns="Count") # Name the count column 86 | my_tab = pd.crosstab(index=df["CarpalTunnelSynd"], columns="Count") # Name the count column 87 | my_tab = pd.crosstab(index=df["DuputrensCont"], columns="Count") # Name the count column 88 | print(my_tab) 89 | my_tab=my_tab.sort_values('Count', ascending=[False]) 90 | print(my_tab) 91 | my_tab.sum() 92 | data_counts = pd.DataFrame(my_tab) 93 |
pd.DataFrame(data_counts).transpose().plot(kind='bar', stacked=False) 94 | 95 | #Data Preparation Steps 96 | #Step 1 Split data into features and target variable 97 | # Split the data into features and target label 98 | diabetics = pd.DataFrame(dfworking['Diabetes status']) 99 | features = pd.DataFrame(dfworking.drop('Diabetes status', axis = 1)) 100 | diabetics.columns 101 | features.columns 102 | 103 | #Step 2 Standardize dataset 104 | # Import sklearn.preprocessing.MinMaxScaler 105 | from sklearn.preprocessing import MinMaxScaler 106 | from IPython.display import display 107 | # Initialize a scaler, then apply it to the features 108 | scaler = MinMaxScaler() 109 | dfworking.dtypes 110 | numerical = ['Age','A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI'] 111 | features[numerical] = scaler.fit_transform(dfworking[numerical]) 112 | 113 | # Show an example of a record with scaling applied 114 | display(features[numerical].head(n = 1)) 115 | 116 | # Step 3 One-hot encode the 'features' data using pandas.get_dummies() 117 | features = pd.get_dummies(features) 118 | features.columns 119 | 120 | #Checking output 121 | display(features.head(1),diabetics.head(1)) 122 | 123 | # Print the number of features after one-hot encoding 124 | encoded = list(features.columns) 125 | print("{} total features after one-hot encoding.".format(len(encoded))) 126 | 127 | # see the encoded feature names 128 | print(encoded) 129 | 130 | #Step 4 Shuffle & Split Final Dataset 131 | # Import train_test_split 132 | from sklearn.model_selection import train_test_split 133 | from sklearn.utils import shuffle 134 | 135 | # Shuffle and split the data into training and testing subsets 136 | features=shuffle(features, random_state=0) 137 | diabetics=shuffle(diabetics, random_state=0) 138 | # Split the 'features' and 'diabetics' data into training and testing sets 139 | X_train, X_test, y_train, y_test = train_test_split(features, diabetics, test_size = 0.2, random_state = 0) 140 | 141 | # Show the results of the split 142 | print("Training set has {} samples.".format(X_train.shape[0])) 143 | print("Testing set has {} samples.".format(X_test.shape[0])) 144 | 145 | #Model Building & Evaluation 146 | #Creating the Model for prediction 147 | 148 | #Loading model Libraries 149 | from sklearn import model_selection 150 | from sklearn.linear_model import LogisticRegression 151 | from sklearn.tree import DecisionTreeClassifier 152 | from sklearn.neighbors import KNeighborsClassifier 153 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 154 | from sklearn.naive_bayes import GaussianNB 155 | from sklearn.svm import SVC 156 | from sklearn.ensemble import RandomForestClassifier 157 | 158 | # prepare models 159 | seed = 7 160 | models = [] 161 | models.append(('LR', LogisticRegression())) 162 | models.append(('LDA', LinearDiscriminantAnalysis())) 163 | models.append(('KNN', KNeighborsClassifier())) 164 | models.append(('CART', DecisionTreeClassifier())) 165 | models.append(('NB', GaussianNB())) 166 | models.append(('SVM', SVC())) 167 | models.append(('RFC', RandomForestClassifier())) 168 | 169 | # evaluate each model in turn 170 | results = [] 171 | names = [] 172 | scoring = 'accuracy' 173 | 174 | import warnings 175 | warnings.filterwarnings("ignore") 176 | 177 | for name, model in models: 178 | kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed) 179 | cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring) 180 | results.append(cv_results) 181 | names.append(name) 182 |
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) 183 | print(msg) 184 | 185 | 186 | import matplotlib.pyplot as plt 187 | # boxplot algorithm comparison 188 | fig = plt.figure() 189 | fig.suptitle('Algorithm Comparison') 190 | ax = fig.add_subplot(111) 191 | plt.boxplot(results) 192 | ax.set_xticklabels(names) 193 | plt.show() 194 | 195 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Puneet Mathur 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /Predict_NextDay_Open_STOCK_PRICE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 17 20:30:01 2017 4 | 5 | @author: PUNEETMATHUR 6 | I am creating this script to predict next day Opening Price based on 7 | Today's Closing Price for any given stock 8 | """ 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | 14 | #Change your directory to wherever your dataset is stored 15 | os.chdir("E:\\BUSINESS\\APRESS\\ApplicationsOfMachineLearning\\Chapter16\\") 16 | 17 | #Loading the dataset of the company for which prediction is required 18 | df=pd.read_csv("BalmerLawrieColtd.csv",parse_dates=['Date']) 19 | print(df.head(1)) 20 | print(df.columns) 21 | df.shape 22 | 23 | #Selecting only relevant columns required for prediction 24 | cols=['Date','Open','Close'] 25 | df=df[cols] 26 | 27 | print(df.columns) 28 | print(df.head(5)) 29 | 30 | # Checking data if Cleaning up data is required 31 | df.isnull().any() 32 | #df=df.dropna() 33 | #df=df.replace("NA",0) 34 | df.dtypes 35 | 36 | #Sorting up data to plot historically ascending values in graph 37 | df = df.sort_values(by='Date',ascending=True) 38 | 39 | #Plotting the price of stock over the years 40 | #What story does it tell? 
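# Because the CSV was read with parse_dates=['Date'], the 'Date' column is of dtype datetime64; the masks used further below (e.g. df['Date'] > '2017-1-1') therefore compare real timestamps rather than plain strings, and the plots render proper dates on the x-axis.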
41 | import matplotlib.pyplot as plt 42 | plt.plot(df['Date'],df['Close']) 43 | 44 | #Now plot only for last one year and last 1 month 45 | df['Date'].dt.year==2017 46 | mask=(df['Date'] > '2017-1-1') & (df['Date'] <= '2017-12-31') 47 | print(df.loc[mask]) 48 | df2017=df.loc[mask] 49 | print(df2017.head(5)) 50 | import matplotlib.pyplot as plt 51 | plt.plot(df2017['Date'],df2017['Close']) 52 | 53 | #Plotting last 1 month data on stock 54 | mask=(df['Date'] > '2017-11-17') & (df['Date'] <= '2017-12-26') 55 | print(df.loc[mask]) 56 | dfnovdec2017=df.loc[mask] 57 | print(dfnovdec2017.head(5)) 58 | import matplotlib.pyplot as plt 59 | plt.xticks( rotation='vertical') 60 | plt.plot(dfnovdec2017['Date'],dfnovdec2017['Close']) 61 | 62 | #Now calculating the Simple Moving Average of the Stock 63 | #Simple Moving Average One Year 64 | df2017['SMA'] = df2017['Close'].rolling(window=20).mean() 65 | df2017.head(25) 66 | df2017[['SMA','Close']].plot() 67 | 68 | 69 | #Does the Open and Closing price of the stock follow very well? 70 | df2017[['Open','Close']].plot() 71 | 72 | #Checking the Correlation 73 | df2017.corr() 74 | 75 | #Simple Moving Average One Month 76 | dfnovdec2017['SMA'] = dfnovdec2017['Close'].rolling(window=2).mean() 77 | dfnovdec2017.head(25) 78 | dfnovdec2017[['SMA','Close']].plot() 79 | 80 | #Now creating NextDayOpen column for prediction 81 | ln=len(df) 82 | lnop=len(df['Open']) 83 | print(lnop) 84 | ii=0 85 | df['NextDayOpen']=df['Open'] 86 | df['NextDayOpen']=0 87 | for i in range(0,ln-1): 88 | print("Open Price: ",df['Open'][i]) 89 | if i!=0: 90 | ii=i-1 91 | df['NextDayOpen'][ii]=df['Open'][i] 92 | print(df['NextDayOpen'][ii]) 93 | 94 | print(df['NextDayOpen'].head()) 95 | 96 | #Checking on the new data 97 | print(df[['Open','NextDayOpen', 'Close']].head(5)) 98 | 99 | #Now checking if there is any correlation 100 | dfnew=df[['Close','NextDayOpen']] 101 | print(dfnew.head(5)) 102 | dfnew.corr() 103 | 104 | #Now Creating the Prediction model as correlation is very high 105 | #Importing the libraries 106 | from sklearn import cross_validation 107 | from sklearn.utils import shuffle 108 | from sklearn import linear_model 109 | from sklearn.metrics import mean_squared_error, r2_score 110 | 111 | 112 | #Creating the features and target dataframes 113 | price=dfnew['Close'] 114 | print(price) 115 | print(dfnew.columns) 116 | features=dfnew[['NextDayOpen']] 117 | #Shuffling the data 118 | price=shuffle(price, random_state=0) 119 | features=shuffle(features,random_state=0) 120 | 121 | #Dividing data into Train and Test 122 | X_train, X_test, y_train, y_test= cross_validation.train_test_split(features,price,test_size=0.2, random_state=0) 123 | 124 | #Linear Regression on Sensex data 125 | reg= linear_model.LinearRegression() 126 | 127 | X_train.shape 128 | reg.fit(X_train, y_train) 129 | 130 | y_pred= reg.predict(X_test) 131 | 132 | 133 | print("Coefficients: ", reg.coef_) 134 | 135 | #Mean squared error 136 | print("mean squared error: ",mean_squared_error(y_test,y_pred)) 137 | 138 | #Variance score 139 | print("Variance score: ", r2_score(y_test, y_pred)) 140 | 141 | #STANDARD DEVIATION 142 | standarddev=price.std() 143 | 144 | #Predict based on Opening BSE Sensex Index and Opening Volume 145 | #In the predict function below enter the first parameter Open forNSE and 2nd Volume in Crores 146 | sensexClosePredict=reg.predict([[269.05]]) 147 | #175 is the standard deviation of the Diff between Open and Close of sensex so this range 148 | print("Stock Likely to Open at: 
",sensexClosePredict , "(+-STD)") 149 | print("Stock Open between: ",sensexClosePredict+standarddev , " & " , sensexClosePredict-standarddev) 150 | 151 | 152 | -------------------------------------------------------------------------------- /Quandl_Data_Getter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Sep 22 23:43:13 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | import quandl 9 | quandl.ApiConfig.api_key = 'INSERT YOU API KEY HERE' 10 | 11 | # get the table for daily stock prices and, 12 | # filter the table for selected tickers, columns within a time range 13 | # set paginate to True because Quandl limits tables API to 10,000 rows per call 14 | 15 | data = quandl.get_table('WIKI/PRICES', ticker = ['AAPL', 'MSFT', 'WMT'], 16 | qopts = { 'columns': ['ticker', 'date', 'adj_close'] }, 17 | date = { 'gte': '2015-12-31', 'lte': '2016-12-31' }, 18 | paginate=True) 19 | data.head() 20 | 21 | # create a new dataframe with 'date' column as index 22 | new = data.set_index('date') 23 | 24 | # use pandas pivot function to sort adj_close by tickers 25 | clean_data = new.pivot(columns='ticker') 26 | 27 | # check the head of the output 28 | clean_data.head() 29 | 30 | #Below script gets you Data from National Stock Exchange for a stock known as Oil India Limited 31 | import quandl 32 | quandl.ApiConfig.api_key = 'z1bxBq27SVanESKoLJwa' 33 | quandl.ApiConfig.api_version = '2015-04-09' 34 | 35 | import quandl 36 | data = quandl.get('NSE/OIL') 37 | data.head() 38 | data.columns 39 | data.shape 40 | 41 | #Storing data in a flat file 42 | data.to_csv("NSE_OIL.csv") 43 | 44 | #A basic plot of the stocks data across the years 45 | data['Close'].plot() 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Machine Learning Applications Using Python*](https://www.apress.com/9781484237861) by Puneet Mathur (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484237861.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. 
-------------------------------------------------------------------------------- /Yahoo_Data_Getter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Dec 3 23:32:32 2017 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | #import pandas.io.data as web 11 | from pandas_datareader import data, wb 12 | 13 | sp500= data.DataReader('^GSPC', data_source='yahoo', start='1/1/2000', end='1/12/2017') 14 | #sp500= data.DataReader('^GSPC', data_source='yahoo') 15 | sp500.ix['2010-01-04'] 16 | sp500.info() 17 | print(sp500) 18 | print(sp500.columns) 19 | print(sp500.shape) 20 | 21 | import matplotlib.pyplot as plt 22 | plt.plot(sp500['Close']) 23 | 24 | # now calculating the 42nd Days and 252 days trend for the index 25 | sp500['42d']= np.round(pd.rolling_mean(sp500['Close'], window=42),2) 26 | sp500['252d']= np.round(pd.rolling_mean(sp500['Close'], window=252),2) 27 | 28 | #Look at the data 29 | sp500[['Close','42d','252d']].tail() 30 | plt.plot(sp500[['Close','42d','252d']]) 31 | 32 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** -------------------------------------------------------------------------------- /fivepointsummary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Sep 04 15:07:13 2018 4 | 5 | @author: PUNEETMATHUR 6 | """ 7 | #Baby Boomers Five Point Summary example 8 | import numpy as np 9 | bboomers = np.array([14230, 345, 1912, 472, 63, 861, 270, 713]) 10 | fivepoints= [np.min(bboomers), np.percentile(bboomers, 25, interpolation='midpoint'), np.median(bboomers),np.percentile(bboomers, 75, interpolation='midpoint'),np.max(bboomers)] 11 | for fivepointsummary in fivepoints: 12 | print(fivepointsummary) 13 | 14 | #Millenials Five Point Summary example 15 | import numpy as np 16 | millennials = np.array([12519, 845, 912, 72, 93, 615, 70, 538]) 17 | fivepoints= [np.min(millennials ), np.percentile(millennials , 25, interpolation='midpoint'), np.median(millennials ),np.percentile(millennials , 75, interpolation='midpoint'),np.max(millennials)] 18 | for fivepointsummary in fivepoints: 19 | print(fivepointsummary) 20 | 21 | -------------------------------------------------------------------------------- /retail2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 26 16:50:08 2018 4 | 5 | @author: PUNEETMATHUR Copyright 2018 All rights reserved 6 | DO NOT COPY WTTHOUT PERMISSION 7 | """ 8 | 9 | import warnings 10 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 11 | warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validation") 12 | 13 | # Display inline matplotlib plots with IPython 14 | from IPython import get_ipython 15 | get_ipython().run_line_magic('matplotlib', 'inline') 16 | ########################################### 17 | # Importing libraries 18 | import matplotlib.pyplot as plt 19 | import matplotlib.cm as cm 20 | import 
pandas as pd 21 | import numpy as np 22 | import pandas.plotting 23 | 24 | #################################################### 25 | #################################################### 26 | ######### FUNCTIONS FOR DATA VISUALIZATION ######### 27 | #################################################### 28 | #################################################### 29 | 30 | def pcaOutput(good_data, pca): 31 | ''' 32 | Visualizing the PCA results and calculating explained variance 33 | ''' 34 | 35 | # I am doing Dimension indexing through pca components 36 | dims = dims = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] 37 | 38 | # Creating the PCA components pandas dataframe from the dimensions 39 | comps = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys()) 40 | comps.index = dims 41 | 42 | # Calculating PCA explained variance for each component 43 | ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 44 | variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 45 | variance_ratios.index = dims 46 | 47 | # Creating a bar plot visualization for better understanding 48 | fig, ax = plt.subplots(figsize = (24,12)) 49 | 50 | # Plotting the feature weights as a function of the components 51 | comps.plot(ax = ax, kind = 'bar'); 52 | ax.set_ylabel("Feature Weights") 53 | ax.set_xticklabels(dims, rotation=0) 54 | 55 | 56 | # Displaying the explained variance ratios 57 | for i, ev in enumerate(pca.explained_variance_ratio_): 58 | ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) 59 | 60 | # Returning back a concatenated DataFrame 61 | return pd.concat([variance_ratios, comps], axis = 1) 62 | 63 | def clusterOutput(redData, preds, centers, pca_samples): 64 | ''' 65 | Visualizes the PCA-reduced cluster data in two dimensions 66 | Adds points for cluster centers for-selected sample data 67 | ''' 68 | 69 | preds = pd.DataFrame(preds, columns = ['Cluster']) 70 | plot_data = pd.concat([preds, redData], axis = 1) 71 | 72 | # I am Generating the cluster plot 73 | fig, ax = plt.subplots(figsize = (14,8)) 74 | 75 | # Creating the Color map to distinguish between clusters 76 | cmap = cm.get_cmap('gist_rainbow') 77 | 78 | # Coloring the points based on assigned cluster 79 | for i, cluster in plot_data.groupby('Cluster'): 80 | cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 81 | color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); 82 | 83 | # Plotting the centers with cross mark indicators 84 | for i, c in enumerate(centers): 85 | ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ 86 | alpha = 1, linewidth = 2, marker = 'o', s=200); 87 | ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); 88 | 89 | # Plotting transformed sample points 90 | ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ 91 | s = 150, linewidth = 4, color = 'black', marker = 'x'); 92 | 93 | # Plot title 94 | ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids with Numerical markingr\nTick marks denote Transformed Sample Data"); 95 | 96 | 97 | def biPlotter(cleanData, redData, pca): 98 | ''' 99 | Building a biplot for PCA of the reduced data and the projections of the original features. 100 | Variable cleanData: original data, before transformation. 
101 | Creating a pandas dataframe with valid column names 102 | redData: the reduced data (the first two dimensions are plotted) 103 | pca: pca object that contains the components_ attribute 104 | 105 | This function returns: a matplotlib AxesSubplot object (for any additional customization) 106 | This function is inspired by the script by Teddy Roland on Biplot in Python: 107 | https://github.com/teddyroland/python-biplot 108 | ''' 109 | 110 | fig, ax = plt.subplots(figsize = (14,8)) 111 | # scatterplot of the reduced data 112 | ax.scatter(x=redData.loc[:, 'Dimension 1'], y=redData.loc[:, 'Dimension 2'], 113 | facecolors='b', edgecolors='b', s=70, alpha=0.5) 114 | 115 | feature_vectors = pca.components_.T 116 | 117 | # we use scaling factors to make the arrows easier to see 118 | arrow_size, text_pos = 7.0, 8.0, 119 | 120 | # projections of the original features 121 | for i, v in enumerate(feature_vectors): 122 | ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1], 123 | head_width=0.2, head_length=0.2, linewidth=2, color='red') 124 | ax.text(v[0]*text_pos, v[1]*text_pos, cleanData.columns[i], color='black', 125 | ha='center', va='center', fontsize=18) 126 | 127 | ax.set_xlabel("Dimension 1", fontsize=14) 128 | ax.set_ylabel("Dimension 2", fontsize=14) 129 | ax.set_title("Principal Component plane with original feature projections.", fontsize=16); 130 | return ax 131 | 132 | 133 | def channelOutput(redData, outliers, pca_samples): 134 | ''' 135 | Here we are Visualizing the PCA-reduced cluster data in two dimensions using the full dataset 136 | Data is labeled by "Channel" points added for selected sample data 137 | ''' 138 | 139 | # Check that the dataset is loadable 140 | try: 141 | full_data = pd.read_csv("E:/retail2.csv") 142 | except: 143 | print "Dataset could not be loaded. Is the file missing?" 
144 | return False 145 | 146 | # Create the Channel DataFrame 147 | chl = pd.DataFrame(full_data['Channel'], columns = ['Channel']) 148 | chl = chl.drop(chl.index[outliers]).reset_index(drop = True) 149 | labeled = pd.concat([redData, chl], axis = 1) 150 | 151 | # Generate the cluster plot 152 | fig, ax = plt.subplots(figsize = (14,8)) 153 | 154 | # Color map 155 | cmap = cm.get_cmap('gist_rainbow') 156 | 157 | # Color the points based on assigned Channel 158 | labels = ['Segment 1/Segment 2/Segment3', 'Retail Customer'] 159 | grouped = labeled.groupby('Channel') 160 | for i, chl in grouped: 161 | chl.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 162 | color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); 163 | 164 | # Plot transformed sample points 165 | for i, sample in enumerate(pca_samples): 166 | ax.scatter(x = sample[0], y = sample[1], \ 167 | s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); 168 | ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); 169 | 170 | # Set plot title 171 | ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled"); 172 | 173 | ########################################################### 174 | ########################################################### 175 | ######### END OF FUNCTIONS FOR DATA VISUALIZATION ######### 176 | ########################################################### 177 | ########################################################### 178 | 179 | 180 | 181 | ########################################################################## 182 | ########################################################################## 183 | ####################### IMPLEMENTATION CODE ############################# 184 | ########################################################################## 185 | ########################################################################## 186 | 187 | ########################################### 188 | # I am Suppressing matplotlib and other module deprecation user warnings 189 | # This is Necessary for newer version of matplotlib 190 | import warnings 191 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 192 | warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validationb") 193 | warnings.filterwarnings("ignore", category=DeprecationWarning) 194 | import matplotlib.pyplot as plt 195 | import matplotlib.cm as cm 196 | import pandas as pd 197 | import numpy as np 198 | 199 | 200 | # Load the retail customers dataset 201 | try: 202 | data = pd.read_csv("E:\\retail2.csv") 203 | data.drop(['Region', 'Channel'], axis = 1, inplace = True) 204 | print "Retail customers dataset has {} samples with {} features each.".format(*data.shape) 205 | except: 206 | print "Dataset could not be loaded. Is the dataset missing?" 
207 | 208 | # Display a description of the dataset 209 | display(data.describe()) 210 | 211 | 212 | # Selecting three indices of my choice from the dataset 213 | indices = [225,182,400] 214 | 215 | # Creating a DataFrame of the chosen samples 216 | samples = pd.DataFrame(data.loc[indices], columns = data.keys()).reset_index(drop = True) 217 | print "Chosen samples of Retail customers dataset:" 218 | display(samples) 219 | 220 | #lOOKING AT THE PLOTS TO CHECK THE TREND 221 | pd.DataFrame(samples).transpose().plot(kind='bar', stacked=False, figsize=(8,8), title='Retail customers dataset') 222 | 223 | # Calculating deviations from Mean and Median 224 | #to check how much the 3 samples deviate from the Center. 225 | delta_mean = samples - data.mean().round() 226 | delta_median = samples - data.median().round() 227 | 228 | #display(delta_mean, delta_median) 229 | delta_mean.plot.bar(figsize=(30,10), title='Compared to MEAN', grid=True) 230 | delta_median.plot.bar(figsize=(30,10), title='Compared to MEDIAN', grid=True); 231 | 232 | #Importing Libraries 233 | from sklearn.cross_validation import train_test_split 234 | from sklearn.tree import DecisionTreeRegressor 235 | 236 | #Broke down the logic into a scoring function and a For loop 237 | def scorer(feature): 238 | # Make a copy of the DataFrame, using the 'drop' function to drop the given feature 239 | relevant_data = data.drop([feature], axis=1) 240 | 241 | # Split the data into training and testing sets using the given feature as the target 242 | X_train, X_test, y_train, y_test = train_test_split(relevant_data, data[feature], test_size=0.25, random_state=45) 243 | 244 | # Create a decision tree regressor and fit it to the training set 245 | regressor = DecisionTreeRegressor(random_state=45).fit(X_train, y_train) 246 | 247 | # Report the score of the prediction using the testing set 248 | score = regressor.score(X_test, y_test) 249 | 250 | print("The R2 score for feature {:16} is {:+.5f}".format(feature, score)) 251 | 252 | for feature in data.columns.values: 253 | scorer(feature) 254 | 255 | # Produce a scatter matrix for each pair of features in the data 256 | pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (24,12), diagonal = 'kde'); 257 | #checking Correlation table 258 | data.corr() 259 | 260 | # Scale the data using the natural logarithm 261 | log_data = np.log(data) 262 | # Scale the sample data using the natural logarithm 263 | log_samples = np.log(samples) 264 | # Display the log-transformed sample data 265 | display(log_samples) 266 | log_samples.corr() 267 | 268 | #Identifying Outliers 269 | #Using Counters 270 | from collections import Counter 271 | outliers_counter = Counter() 272 | 273 | # For each feature finding out the data points with extreme high or low values - Outliers 274 | outliers_scores = None 275 | 276 | for feature in log_data.keys(): 277 | 278 | # Calculate Q1 (25th percentile of the data) for the given feature 279 | Q1 = np.percentile(log_data[feature], 25) 280 | 281 | # Calculate Q3 (75th percentile of the data) for the given feature 282 | Q3 = np.percentile(log_data[feature], 75) 283 | 284 | # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range) 285 | step = 1.5 * (Q3 - Q1) 286 | 287 | empty = np.zeros(len(log_data[feature])) 288 | aboveQ3 = log_data[feature].values - Q3 - step 289 | belowQ3 = log_data[feature].values - Q1 + step 290 | current_outliers_scores = np.array(np.maximum(empty, aboveQ3) - np.minimum(empty, belowQ3)).reshape([-1,1]) 291 | outliers_scores 
= current_outliers_scores if outliers_scores is None else np.hstack([outliers_scores, current_outliers_scores]) 292 | 293 | # Display the outliers 294 | print("Data points considered outliers for the feature '{}':".format(feature)) 295 | current_outliers = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))] 296 | display(current_outliers) 297 | outliers_counter.update(current_outliers.index.values) 298 | 299 | # Select the indices for data points you wish to remove 300 | min_outliers_count = 2 301 | outliers = [x[0] for x in outliers_counter.items() if x[1] >= min_outliers_count] 302 | print("Data points considered outlier for more than 1 feature: {}".format(outliers)) 303 | 304 | # Remove the outliers, if any were specified 305 | good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True) 306 | 307 | 308 | # Apply PCA by fitting the good data with the same number of dimensions as features 309 | from sklearn.decomposition import PCA 310 | pca = PCA(n_components=len(good_data.columns)).fit(good_data) 311 | 312 | # Transform log_samples using the PCA fit above 313 | pca_samples =pca.transform(log_samples) 314 | 315 | # Generate PCA results plot 316 | pcaOutput = pcaOutput(good_data, pca) 317 | # show results 318 | display(pcaOutput) 319 | # Cumulative explained variance should add to 1 320 | display(pcaOutput['Explained Variance'].cumsum()) 321 | 322 | from sklearn.decomposition import PCA 323 | # Apply PCA by fitting the good data with only two dimensions 324 | pca = PCA(n_components=2).fit(good_data) 325 | 326 | # Transform the good data using the PCA fit above 327 | reduced_data = pca.transform(good_data) 328 | 329 | # Transform the sample log-data using the PCA fit above 330 | pca_samples = pca.transform(log_samples) 331 | 332 | # Create a DataFrame for the reduced data 333 | reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2']) 334 | 335 | # Display sample log-data after applying PCA transformation in two dimensions 336 | display(pd.DataFrame(np.round(pca_samples, 4), columns = ['Dimension 1', 'Dimension 2'])) 337 | 338 | # Create a biplot 339 | biPlotter(good_data, reduced_data, pca) 340 | 341 | #Import GMM and silhouette score library 342 | from sklearn.mixture import GMM 343 | from sklearn.metrics import silhouette_score 344 | 345 | #Divided the logic into Clustering and Scoring functions 346 | #Then Iterated through the clusters in the identified range 347 | 348 | #This function creates Cluster using GMM 349 | def clusterCreator(data, n_clusters): 350 | clusterer = GMM(n_components=n_clusters, covariance_type='full', random_state=45).fit(data) 351 | preds = clusterer.predict(data) 352 | centers = clusterer.means_ 353 | sample_preds = clusterer.predict(pca_samples) 354 | return preds, centers, sample_preds 355 | 356 | #Scoring after creating Clusters 357 | def scorer(data, n_clusters): 358 | preds, _, _ = clusterCreator(data, n_clusters) 359 | score = silhouette_score(data, preds) 360 | return score 361 | 362 | #Iterate in the clusters and Print silhouette score 363 | for n_clusters in range(2,10): 364 | score = scorer(reduced_data, n_clusters) 365 | print "For n_clusters = {}. 
The average silhouette_score is : {}".format(n_clusters, score) 366 | 367 | n_clusters=0 368 | 369 | # Resetting as due to loop run before we need to reset values again with 2 cluster components 370 | clusterer = GMM(n_components=2).fit(reduced_data) 371 | predictions = clusterer.predict(reduced_data) 372 | centers = clusterer.means_ 373 | sample_preds = clusterer.predict(pca_samples) 374 | # Display the results of the clustering from implementation 375 | clusterOutput(reduced_data, predictions, centers, pca_samples) 376 | 377 | log_centers = pca.inverse_transform(centers) 378 | 379 | # Exponentiate the centers 380 | true_centers = np.exp(log_centers) 381 | 382 | # Display the true centers 383 | segments = ['Segment {}'.format(i) for i in range(0,len(centers))] 384 | true_centers = pd.DataFrame(np.round(true_centers), columns = data.keys()) 385 | true_centers.index = segments 386 | display(true_centers) 387 | #Compare the True Centers from the Central values Mean and Median 388 | #display(true_centers - data.mean().round()) 389 | delta_mean=true_centers - data.mean().round() 390 | #display(true_centers - data.median().round()) 391 | delta_median=true_centers - data.median().round() 392 | delta_mean.plot.bar(figsize=(10,4), title='Compared to MEAN', grid=True) 393 | delta_median.plot.bar(figsize=(10,4), title='Compared to MEDIAN', grid=True); 394 | 395 | # Display the predictions 396 | for i, pred in enumerate(sample_preds): 397 | print "Sample point", i, "predicted to be in Cluster", pred 398 | samples 399 | 400 | # Display the clustering results based on 'Channel' data 401 | redData=reduced_data 402 | channelOutput(redData, outliers, pca_samples) 403 | 404 | ########################################################################## 405 | ########################################################################## 406 | ####################### END OF CODE ##################################### 407 | ########################################################################## 408 | ########################################################################## 409 | 410 | --------------------------------------------------------------------------------
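retail2.py checks feature relevance by dropping one column at a time, predicting it from the remaining columns with a DecisionTreeRegressor, and reporting the R-squared score. It imports train_test_split from sklearn.cross_validation, which was removed in scikit-learn 0.20. Here is a minimal sketch of the same idea against the current sklearn.model_selection API; the DataFrame and column names are synthetic stand-ins, since retail2.csv is read from a local path (E:\retail2.csv) and is not among the repository files listed above.

# Sketch only: "predict each feature from the others" relevance check,
# assuming scikit-learn >= 0.20 (train_test_split now lives in sklearn.model_selection).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(45)
# Synthetic stand-in for the retail spending table; column names are placeholders
data = pd.DataFrame({
    'Fresh': rng.lognormal(8, 1, 400),
    'Milk': rng.lognormal(7, 1, 400),
    'Grocery': rng.lognormal(7.5, 1, 400),
})
data['Detergents'] = 0.6 * data['Grocery'] + rng.lognormal(5, 1, 400)  # deliberately correlated

for feature in data.columns:
    X = data.drop(columns=[feature])
    y = data[feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45)
    score = DecisionTreeRegressor(random_state=45).fit(X_train, y_train).score(X_test, y_test)
    # A high R^2 means the feature is largely explained by the others, so it adds little on its own
    print("R^2 for {:12s}: {:+.3f}".format(feature, score))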
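The outlier section of retail2.py applies Tukey's rule feature by feature: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the log-transformed data are flagged, and rows flagged for at least min_outliers_count features are dropped before PCA. A compact sketch of that rule on synthetic stand-in data:

# Sketch only: Tukey / IQR outlier rule, mirroring the loop in retail2.py.
import numpy as np
import pandas as pd
from collections import Counter

rng = np.random.RandomState(0)
log_data = pd.DataFrame({          # stand-in for the log-transformed spending features
    'Fresh': rng.normal(8, 1, 200),
    'Milk': rng.normal(7, 1, 200),
})
log_data.iloc[0] = [15.0, -2.0]    # plant an obvious outlier in both features

flag_counts = Counter()
for feature in log_data.columns:
    Q1, Q3 = np.percentile(log_data[feature], [25, 75])
    step = 1.5 * (Q3 - Q1)                       # 1.5 times the interquartile range
    mask = (log_data[feature] < Q1 - step) | (log_data[feature] > Q3 + step)
    flag_counts.update(log_data[mask].index)

# Drop rows flagged as outliers in 2 or more features, as the script above does
outliers = [idx for idx, n in flag_counts.items() if n >= 2]
good_data = log_data.drop(index=outliers).reset_index(drop=True)
print("Dropped {} multi-feature outliers, {} rows remain".format(len(outliers), len(good_data)))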
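The PCA stage in retail2.py is fitted twice on purpose: once with as many components as features, purely to read explained_variance_ratio_ and confirm its cumulative sum reaches 1, and then again with n_components=2 for the biplot and clustering. The same two-stage pattern in isolation, on synthetic data:

# Sketch only: full PCA to inspect explained variance, then reduce to two dimensions.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.RandomState(1)
good_data = pd.DataFrame(rng.normal(size=(300, 4)),
                         columns=['Fresh', 'Milk', 'Grocery', 'Detergents'])  # placeholder names

full_pca = PCA(n_components=good_data.shape[1]).fit(good_data)
print(np.cumsum(full_pca.explained_variance_ratio_))   # cumulative variance, ends at 1.0

pca = PCA(n_components=2).fit(good_data)
reduced = pd.DataFrame(pca.transform(good_data), columns=['Dimension 1', 'Dimension 2'])
print(reduced.head())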
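sklearn.mixture.GMM, used for the clustering and the silhouette loop above, was deprecated in scikit-learn 0.18 and removed in 0.20; its replacement is GaussianMixture, which keeps the means_ attribute and predict method the script relies on. A sketch of the same cluster-count search with the current class, on synthetic blobs:

# Sketch only: choose the number of Gaussian mixture components by silhouette score,
# assuming scikit-learn >= 0.20 where GMM has been replaced by GaussianMixture.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

reduced_data, _ = make_blobs(n_samples=400, centers=3, n_features=2, random_state=45)

best = None
for n_clusters in range(2, 10):
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full',
                          random_state=45).fit(reduced_data)
    preds = gmm.predict(reduced_data)
    score = silhouette_score(reduced_data, preds)
    print("n_clusters = {}: silhouette = {:.4f}".format(n_clusters, score))
    if best is None or score > best[1]:
        best = (n_clusters, score)

# Refit with the best count and read the centers, as retail2.py does with clusterer.means_
final = GaussianMixture(n_components=best[0], covariance_type='full', random_state=45).fit(reduced_data)
print("Best n_clusters:", best[0])
print(final.means_)

One porting caveat: covariance_type='full' matches the argument already passed in clusterCreator(), but the old GMM defaulted to 'diag' while GaussianMixture defaults to 'full', so the final two-component refit near the end of retail2.py, which passes no covariance_type, may behave slightly differently if ported as-is.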