├── Chapter02
│   ├── testTree.json
│   ├── Data
│   │   ├── __pycache__
│   │   │   ├── TreeDict.cpython-35.pyc
│   │   │   └── __init__.cpython-35.pyc
│   │   ├── testData.csv
│   │   └── treeModel.json
│   ├── ID3_Test.py
│   ├── PracticalApplication.py
│   ├── SplitCheck.py
│   └── DecisionTree_ID3.py
├── Chapter05
│   ├── Data
│   │   ├── download.jpg
│   │   └── bcancer.csv
│   ├── AdaboostAlgorithmExample.py
│   ├── AdaBoostFaceDetection.py
│   └── Adaboost.py
├── Chapter07
│   ├── Data
│   │   └── train_modified.csv
│   ├── xgBoost.py
│   └── xgboost_param_tune.py
├── Chapter09
│   ├── Data
│   │   └── graph_feat_4.png
│   ├── FeatureSelection_PCA.py
│   ├── RecursiveFeatureElimination.py
│   ├── UnivariateFeatureSelection.py
│   ├── feature_reduction_impact.py
│   ├── SVM_Test.py
│   ├── SVM_KernelTrick.py
│   ├── stacking_spamdata.py
│   ├── RF_feature_selection.py
│   └── bcancer.csv
├── Chapter03
│   ├── RandomForest.py
│   ├── BinaryTree.py
│   ├── PracticalApplication.py
│   ├── DecisionTree.py
│   ├── DecisionTree_CART_RF.py
│   └── Data
│       └── bcancer.csv
├── LICENSE
├── Chapter01
│   ├── Boosting.py
│   ├── Bagging.py
│   ├── Stacking.py
│   └── kmeansClustering.py
├── Chapter04
│   ├── knnAlgoTest.py
│   ├── utilityFunctions.py
│   ├── SpamClassification.py
│   └── KNN.py
├── Chapter06
│   ├── RegressionTreeTest.py
│   └── RegressionTrees.py
├── README.md
├── Chapter10
│   ├── ANN.py
│   └── DigitClassification.py
└── Chapter08
    └── StackedGeneralization.py

/Chapter02/testTree.json:
--------------------------------------------------------------------------------
{"Salary": {"High":
--------------------------------------------------------------------------------
/Chapter05/Data/download.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning/HEAD/Chapter05/Data/download.jpg
--------------------------------------------------------------------------------
/Chapter07/Data/train_modified.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning/HEAD/Chapter07/Data/train_modified.csv
--------------------------------------------------------------------------------
/Chapter09/Data/graph_feat_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning/HEAD/Chapter09/Data/graph_feat_4.png
--------------------------------------------------------------------------------
/Chapter02/Data/__pycache__/TreeDict.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning/HEAD/Chapter02/Data/__pycache__/TreeDict.cpython-35.pyc
--------------------------------------------------------------------------------
/Chapter02/Data/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning/HEAD/Chapter02/Data/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/Chapter02/Data/testData.csv:
--------------------------------------------------------------------------------
,buying,maint,doors,persons,lug_boot,safety,Class
0,vhigh,med,2,2,med,low,unacc
1,low,high,5more,4,small,low,unacc
2,high,high,3,4,med,low,unacc
3,vhigh,high,2,2,big,low,unacc
4,vhigh,high,2,2,big,med,unacc
5,vhigh,med,4,more,small,med,unacc
6,low,med,5more,2,small,high,unacc
7,high,low,4,4,med,high,acc 10 | 8,low,med,3,2,big,high,unacc 11 | -------------------------------------------------------------------------------- /Chapter03/RandomForest.py: -------------------------------------------------------------------------------- 1 | from Chapter_03 import DecisionTree_CART_RF as rf 2 | filename = 'bcancer.csv' 3 | dataset = rf.load_csv(filename) 4 | # convert string attributes to integers 5 | for i in range(0, len(dataset[0])-1): 6 | rf.str_column_to_float(dataset, i) 7 | # convert class column to integers 8 | rf.str_column_to_int(dataset, len(dataset[0])-1) 9 | 10 | dataset_new = [] 11 | for row in dataset: 12 | dataset_new.append([row[i] for i in range(1,len(row))]) 13 | # # evaluate algorithm 14 | dataset = dataset_new 15 | n_folds = 5 16 | max_depth = 3 17 | min_size = 1 18 | sample_size = 0.5 19 | n_features = 5#int(sqrt(len(dataset[0])-1)) 20 | print("features: %d"%n_features) 21 | 22 | for n_trees in [1, 5, 10]: 23 | scores = rf.evaluate_algorithm(dataset, rf.random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features) 24 | print('Trees: %d' % n_trees) 25 | print('Scores: %s' % scores) 26 | print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))) -------------------------------------------------------------------------------- /Chapter09/FeatureSelection_PCA.py: -------------------------------------------------------------------------------- 1 | #Import the required packages 2 | 3 | #Import pandas to read csv 4 | import pandas 5 | 6 | #Import numpy for array related operations 7 | import numpy 8 | 9 | #Import sklearn's PCA algorithm 10 | from sklearn.decomposition import PCA 11 | 12 | #URL for loading the data set 13 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 14 | 15 | #Define the attribute names 16 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 17 | 18 | #Create pandas data frame by loading the data from URL 19 | dataframe = pandas.read_csv(url, names=names) 20 | 21 | #Create array from data values 22 | array = dataframe.values 23 | 24 | #Split the data into input and target 25 | X = array[:,0:8] 26 | Y = array[:,8] 27 | 28 | #Feature extraction 29 | pca = PCA(n_components=3) 30 | fit = pca.fit(X) 31 | 32 | #Summarize components 33 | print("Explained Variance: %s" % fit.explained_variance_ratio_) 34 | print(fit.components_) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter02/ID3_Test.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | #Lets Create the test dataset to build our tree 3 | dataset = {'Name':['Person 1','Person 2','Person 3','Person 4','Person 5','Person 6','Person 7','Person 8','Person 9','Person 10'], 4 | 'Salary':['Low','Med','Med','Med','Med','High','Low','High','Med','Low'], 5 | 'Sex':['Male','Male','Male','Female','Male','Female','Female','Male','Female','Male'], 6 | 'Marital':['Unmarried','Unmarried','Married','Married','Married','Unmarried','Unmarried','Unmarried','Unmarried','Married'], 7 | 'Class':['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']} 8 | from Chapter_02 import DecisionTree_ID3 as ID3 9 | #Preprocess data set 10 | df = ID3.preProcess(dataset) 11 | 12 | #Lets build the tree 13 | tree = ID3.buildTree(df) 14 | 15 | import pprint 16 | #print(tree) 17 | pprint.pprint(tree) 18 | 19 | #Select test instance 20 | inst = df.ix[2] 21 | 22 | #Remove its class attribute 23 | inst.pop('Class') 24 | 25 | #Get prediction 26 | prediction = ID3.predict(inst, tree) 27 | print("Prediction: %s"%prediction[0]) 28 | 29 | main() -------------------------------------------------------------------------------- /Chapter01/Boosting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 22, 2017 3 | 4 | @author: DX 5 | ''' 6 | # Import All the required packages from sklearn 7 | from sklearn import model_selection 8 | from sklearn.datasets import load_iris 9 | from sklearn.ensemble import AdaBoostClassifier # Boosting Algorithm 10 | from sklearn.tree import DecisionTreeClassifier 11 | 12 | import numpy as np 13 | 14 | 15 | #Load data 16 | iris = load_iris() 17 | X = iris.data 18 | Y = iris.target 19 | 20 | #Split data in training and testing set 21 | X_fit, X_eval, y_fit, y_test= model_selection.train_test_split( X, Y, test_size=0.20, random_state=1 ) 22 | 23 | #Define a decision tree classifier 24 | cart = DecisionTreeClassifier() 25 | num_trees = 25 26 | 27 | #Create classification model for bagging 28 | model = AdaBoostClassifier(base_estimator=cart, n_estimators=num_trees, learning_rate = 0.1) 29 | 30 | #Train Classification model 31 | model.fit(X_fit, y_fit) 32 | 33 | #Test trained model over test set 34 | pred_label = model.predict(X_eval) 35 | nnz = np.float(np.shape(y_test)[0] - np.count_nonzero(pred_label - y_test)) 36 | acc = 100*nnz/np.shape(y_test)[0] 37 | 38 | #Print accuracy of the model 39 | print('accuracy is: '+str(acc)) -------------------------------------------------------------------------------- /Chapter03/BinaryTree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 19-Jun-2017 3 | 4 | @author: aii32199 5 | ''' 6 | import numpy as np 7 | def getNewNode(data): 8 | node = {'data':[],'left':[],'right':[]} 9 | node['data'] = data 10 | print(node) 11 | return node 12 | 13 | def createBinaryTree(tree,data): 14 | 15 | #Check whether we have any node in the tree if not create one 16 | if not tree: 17 | tree = getNewNode(data) 18 | 19 | #Now if current value is less than parent node put it in left 20 
| elif data<=tree['data']: 21 | tree['left'] = createBinaryTree(tree['left'],data) 22 | #else put it in right 23 | else: 24 | tree['right'] = createBinaryTree(tree['right'],data) 25 | return tree 26 | 27 | 28 | # data = [0.7,0.65,0.83,0.54,0.9,0.11,0.44,0.35,0.75,0.3,0.78,0.15] 29 | data = [0.7,0.65,0.83,0.54,0.9,0.11,0.44,0.35,0.75,0.3,0.78,0.15] 30 | med = np.median(data) 31 | print("Median of array is: %.2f"%med) 32 | 33 | tree = [] 34 | tree = createBinaryTree(tree,med) 35 | for i in range(len(data)): 36 | value = data[i] 37 | tree = createBinaryTree(tree,value) 38 | 39 | import pprint 40 | pprint.pprint(tree) 41 | -------------------------------------------------------------------------------- /Chapter05/AdaboostAlgorithmExample.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 18-Sep-2017 3 | 4 | @author: DX 5 | ''' 6 | 7 | import numpy as np; 8 | from Chapter_05 import Adaboost as ad 9 | 10 | dataset = [[0.25000, 1.75000, 1.00000], 11 | [1.25000, 1.75000, -1.00000], 12 | [0.50000, 1.50000, 1.00000], 13 | [1.00000, 0.50000, -1.00000], 14 | [1.25000, 3.50000, 1.00000], 15 | [1.50000, 4.00000, 1.00000], 16 | [2.00000, 2.00000, -1.00000], 17 | [2.50000, 2.50000, 1.00000], 18 | [3.75000, 3.00000, -1.00000], 19 | [4.00000, 1.00000, -1.00000]] 20 | 21 | [weaks,alphas] = ad.AdaBoostAlgorithm(dataset,9) 22 | 23 | prediction=[] 24 | actual = [] 25 | for row in dataset: 26 | preds = [] 27 | for i in range(len(weaks)): 28 | p = alphas[i]*ad.predict(weaks[i], row) 29 | #p = predict(weaks[i], row) 30 | preds.append(p) 31 | final = np.sign(sum(preds)) 32 | #final = max(set(preds), key=preds.count) 33 | prediction.append(final) 34 | actual.append(row[-1]) 35 | print('Expected=%d, Got=%d' % (row[-1], final)) 36 | 37 | acc = ad.accuracy_metric(actual, prediction) 38 | print("accuracy: %.2f"%acc) -------------------------------------------------------------------------------- /Chapter03/PracticalApplication.py: -------------------------------------------------------------------------------- 1 | from Chapter_03 import DecisionTree_CART_RF as CART 2 | import pprint 3 | filename = 'bcancer.csv' 4 | dataset = CART.load_csv(filename) 5 | # convert string attributes to integers 6 | for i in range(0, len(dataset[0])): 7 | CART.str_column_to_float(dataset, i) 8 | 9 | #Now remove index column from the data set 10 | dataset_new = [] 11 | for row in dataset: 12 | dataset_new.append([row[i] for i in range(1,len(row))]) 13 | 14 | #Get training and testing data split 15 | training,testing = CART.getTrainTestData(dataset_new, 0.7) 16 | tree = CART.build_tree(training,11,5) 17 | pprint.pprint(tree) 18 | 19 | pre = [] 20 | act = [] 21 | for row in training: 22 | prediction = CART.predict(tree, row) 23 | pre.append(prediction) 24 | actual = act.append(row[-1]) 25 | # print('Expected=%d, Got=%d' % (row[-1], prediction)) 26 | # print_tree(tree) 27 | acc = CART.accuracy_metric(act, pre) 28 | 29 | print('training accuracy: %.2f'%acc) 30 | 31 | for row in testing: 32 | prediction = CART.predict(tree, row) 33 | pre.append(prediction) 34 | actual = act.append(row[-1]) 35 | acc = CART.accuracy_metric(act, pre) 36 | # pprint.pprint(tree) 37 | print('testing accuracy: %.2f'%acc) -------------------------------------------------------------------------------- /Chapter09/RecursiveFeatureElimination.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 02-Nov-2017 3 | 4 | @author: aii32199 5 | ''' 6 | 7 | #Import the 
required packages 8 | 9 | #Import pandas to read csv 10 | import pandas 11 | 12 | #Import numpy for array related operations 13 | import numpy 14 | 15 | #Import sklearn's feature selection algorithm 16 | from sklearn.feature_selection import RFE 17 | 18 | #Import LogisticRegression for performing chi square test 19 | from sklearn.linear_model import LogisticRegression 20 | 21 | #URL for loading the data set 22 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 23 | 24 | #Define the attribute names 25 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 26 | 27 | #Create pandas data frame by loading the data from URL 28 | dataframe = pandas.read_csv(url, names=names) 29 | 30 | #Create array from data values 31 | array = dataframe.values 32 | 33 | #Split the data into input and target 34 | X = array[:,0:8] 35 | Y = array[:,8] 36 | 37 | #Feature extraction 38 | model = LogisticRegression() 39 | rfe = RFE(model, 3) 40 | fit = rfe.fit(X, Y) 41 | 42 | print("Num Features: %d"% fit.n_features_) 43 | print("Selected Features: %s"% fit.support_) 44 | print("Feature Ranking: %s"% fit.ranking_) 45 | -------------------------------------------------------------------------------- /Chapter01/Bagging.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 20, 2017 3 | 4 | @author: DX 5 | ''' 6 | # Import All the required packages from sklearn 7 | import numpy as np 8 | from sklearn import model_selection 9 | from sklearn.ensemble import BaggingClassifier 10 | from sklearn.tree import DecisionTreeClassifier 11 | from sklearn.datasets import load_iris 12 | 13 | #Load data 14 | iris = load_iris() 15 | X = iris.data 16 | Y = iris.target 17 | 18 | #Split data in training and testing set 19 | X_fit, X_eval, y_fit, y_test= model_selection.train_test_split( X, Y, test_size=0.30, random_state=1 ) 20 | 21 | #Create random sub sample to train multiple models 22 | seed = 7 23 | kfold = model_selection.KFold(n_splits=10, random_state=seed) 24 | 25 | #Define a decision tree classifier 26 | cart = DecisionTreeClassifier() 27 | num_trees = 100 28 | 29 | #Create classification model for bagging 30 | model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed) 31 | 32 | #Train different models and print their accuracy 33 | results = model_selection.cross_val_score(model, X_fit, y_fit,cv=kfold) 34 | for i in range(len(results)): 35 | print("Model: "+str(i)+" Accuracy is: "+str(results[i])) 36 | 37 | print("Mean Accuracy is: "+str(results.mean())) 38 | 39 | model.fit(X_fit, y_fit) 40 | pred_label = model.predict(X_eval) 41 | nnz = np.shape(y_test)[0] - np.count_nonzero(pred_label - y_test) 42 | acc = 100*nnz/np.shape(y_test)[0] 43 | print('accuracy is: '+str(acc)) -------------------------------------------------------------------------------- /Chapter04/knnAlgoTest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 02-Sep-2017 3 | 4 | @author: DX 5 | ''' 6 | #Import math for calculations of square roots 7 | import numpy as np 8 | 9 | from Chapter_04 import KNN 10 | dataset = [[5.1, 3.5, 1.4, 0.2, 1], 11 | [4.9, 3.0, 1.4, 0.2, 1], 12 | [4.7, 3.2, 1.3, 0.2, 1], 13 | [4.6, 3.1, 1.5, 0.2, 1], 14 | [5.0, 3.6, 1.4, 0.2, 1], 15 | [7.0, 3.2, 4.7, 1.4, 2], 16 | [6.4, 6.2, 4.5, 1.5, 2], 17 | [6.9, 3.1, 4.9, 1.5, 2], 18 | [5.5, 2.3, 4.0, 1.3, 2], 19 | [6.5, 2.8, 4.6, 1.5, 2], 20 | [6.3, 3.3, 6.0, 
2.5, 3], 21 | [5.8, 2.7, 5.1, 1.9, 3], 22 | [7.1, 3.0, 5.9, 2.1, 3], 23 | [6.3, 2.9, 5.6, 1.8, 3], 24 | [6.5, 3.0, 5.8, 2.2, 3]] 25 | 26 | np.random.shuffle(dataset) 27 | 28 | #Lets put our test instance. 29 | testInstance=[4.8,3.1,3.0,1.3,1] 30 | 31 | #Now lets find out 3 neighbors for our test instance using getNeighbor 32 | k = 5 33 | neighbors = KNN.getNeighbors(dataset, testInstance, k) 34 | 35 | #Print neighbors 36 | print(neighbors) 37 | 38 | #Get the class prediction out of neighbors 39 | prediction = KNN.getPrediction(neighbors) 40 | 41 | #Print predicion 42 | print("Predicted class for the test instance is: %d"%prediction) -------------------------------------------------------------------------------- /Chapter02/PracticalApplication.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pprint 3 | import json 4 | import numpy as np 5 | np.random.seed(1337) # for reproducibility 6 | from Chapter_02 import DecisionTree_ID3 as DT 7 | 8 | datapath = 'E:/PyDevWorkSpaceTest/Ensembles/Chapter_02/Data/CarDataset.csv' 9 | path2save = 'E:/PyDevWorkSpaceTest/Ensembles/Chapter_02/Data/TreeModel.json' 10 | trainDataPath = 'E:/PyDevWorkSpaceTest/Ensembles/Chapter_02/Data/trainData.csv' 11 | testDataPath = 'E:/PyDevWorkSpaceTest/Ensembles/Chapter_02/Data/testData.csv' 12 | 13 | # testData = pd.read_csv(testDataPath) 14 | 15 | cardata = pd.read_csv(datapath) 16 | mat = cardata.as_matrix() 17 | df = pd.DataFrame(mat,columns=['buying','maint','doors','persons','lug_boot','safety','Class']) 18 | trainData,testData = DT.split_data(df, 0.995) 19 | 20 | trainData.to_csv(trainDataPath,columns=['buying','maint','doors','persons','lug_boot','safety','Class']) 21 | testData.to_csv(testDataPath,columns=['buying','maint','doors','persons','lug_boot','safety','Class']) 22 | 23 | tree = DT.buildTree(trainData) 24 | pprint.pprint(tree) 25 | 26 | with open(path2save,'w') as f: 27 | json.dump(tree,f) 28 | 29 | with open(path2save) as f: 30 | model = json.load(f) 31 | 32 | pprint.pprint(model) 33 | actualClass = testData['Class'] 34 | predictions = DT.BatchTest(testData, model) 35 | accuracy,match = DT.getAccuracy(actualClass, predictions) 36 | 37 | print("Accuracy of the model is: %.2f and matched results are %i out of %i"%(accuracy,match,len(actualClass))) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /Chapter09/UnivariateFeatureSelection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 02-Nov-2017 3 | 4 | @author: aii32199 5 | ''' 6 | # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification) 7 | 8 | #Import the required packages 9 | 10 | #Import pandas to read csv 11 | import pandas 12 | 13 | #Import numpy for array related operations 14 | import numpy 15 | 16 | #Import sklearn's feature selection algorithm 17 | from sklearn.feature_selection import SelectKBest 18 | 19 | #Import chi2 for performing chi square test 20 | from sklearn.feature_selection import chi2 21 | 22 | #URL for loading the data set 23 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 24 | 25 | #Define the attribute names 26 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 27 | 28 | #Create pandas data frame by loading the data from URL 29 | dataframe = pandas.read_csv(url, names=names) 30 | 31 | #Create array from data values 32 | array 
= dataframe.values 33 | 34 | #Split the data into input and target 35 | X = array[:,0:8] 36 | Y = array[:,8] 37 | 38 | #We will select the features using chi square 39 | test = SelectKBest(score_func=chi2, k=4) 40 | 41 | #Fit the function for ranking the features by score 42 | fit = test.fit(X, Y) 43 | 44 | #Summarize scores 45 | numpy.set_printoptions(precision=3) 46 | print(fit.scores_) 47 | 48 | #Apply the transformation on to data set 49 | features = fit.transform(X) 50 | 51 | #Summarize selected features 52 | print(features[0:5,:]) -------------------------------------------------------------------------------- /Chapter09/feature_reduction_impact.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 31-Oct-2017 3 | 4 | @author: DX 5 | ''' 6 | 7 | #Import Sklearn Datasets of IRIS flower classification 8 | import sklearn.datasets as datasets 9 | 10 | #Import Pandas library to create data frame from the data 11 | import pandas as pd 12 | 13 | #Load the data set 14 | iris=datasets.load_iris() 15 | 16 | #Extract data part from the data set 17 | data = iris.data 18 | 19 | #Select dimension of data 20 | data = data[:,2:4] 21 | 22 | #Load data set into the data frame 23 | df=pd.DataFrame(data) 24 | 25 | #Extract target variable from the data set 26 | y=iris.target 27 | 28 | #Import decision tree classifier from sklearn 29 | from sklearn.tree import DecisionTreeClassifier 30 | 31 | #We will create a tree with maximum depth of 5, other parameters will be default 32 | dtree=DecisionTreeClassifier(max_depth=5) 33 | 34 | #Train the classifier 35 | dtree.fit(df,y) 36 | 37 | #Import graphwiz from sklearn to create the graph out of tree 38 | from sklearn.tree import export_graphviz 39 | 40 | #We will use StringIO to create graph with all characters 41 | from sklearn.externals.six import StringIO 42 | dot_data = StringIO() 43 | 44 | #Import pydotplus to create tree as a graph and store it on the disk 45 | import pydotplus 46 | 47 | #Create Graph out of tree and store it on the disk 48 | export_graphviz(dtree, out_file=dot_data, 49 | filled=True, rounded=True, 50 | special_characters=True) 51 | graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 52 | graph.write_png("graph_feat_4.png") -------------------------------------------------------------------------------- /Chapter07/xgBoost.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 23-Oct-2017 3 | 4 | @author: aii32199 5 | ''' 6 | 7 | # First XGBoost model for Pima Indians dataset 8 | 9 | #Load the required libraries 10 | #Numpy for reading the csv file 11 | from numpy import loadtxt 12 | 13 | #Import XGBoost classifier 14 | from xgboost import XGBClassifier 15 | 16 | #We will use sklearn to divide our data set into training and test set 17 | from sklearn.model_selection import train_test_split 18 | 19 | #We will use sklearn's accuracy metric to evaluate the performance of the trained model 20 | from sklearn.metrics import accuracy_score 21 | 22 | #Let's load the dataset into the numpy array 23 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 24 | 25 | #split data into X (input variables)and y(output variable/Class) 26 | X = dataset[:,0:8] 27 | Y = dataset[:,8] 28 | 29 | #Create training and test set with 33% data in test set and 66% for the training of the model 30 | seed = 7 31 | test_size = 0.33 32 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 33 | 34 | #Train our 
first model on created training set 35 | model = XGBClassifier() 36 | model.fit(X_train, y_train) 37 | 38 | #Lets see the prediction from the trained model 39 | y_pred = model.predict(X_test) 40 | 41 | #Create a list of predictions for evaluation purpose 42 | predictions = [round(value) for value in y_pred] 43 | 44 | #Evaluate predictions using accuracy metric 45 | accuracy = accuracy_score(y_test, predictions) 46 | 47 | #Print the accuracy 48 | print("Accuracy of the trained model is: %.2f%%" % (accuracy * 100.0)) 49 | 50 | print(model) -------------------------------------------------------------------------------- /Chapter05/AdaBoostFaceDetection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 24-Nov-2017 3 | 4 | @author: aii32199 5 | ''' 6 | #So We will load required libraries numpy for matrix operations 7 | import numpy as np 8 | 9 | #Import OpenCV library, in python we can call it cv2 10 | import cv2 11 | 12 | #OpenCV have module cascade classifier which is based on haar cascade and 13 | #Adaboost algorithm, so we will call direct method. 14 | #First we will load the pre trained classifiers for frontal face and eye 15 | #detection, which are in the form of xml file. 16 | face_cascade = cv2.CascadeClassifier('E:/OpenCV/opencv/sources/data/haarcascades/haarcascade_frontalface_default.xml') 17 | eye_cascade = cv2.CascadeClassifier('E:/OpenCV/opencv/sources/data/haarcascades/haarcascade_eye.xml') 18 | 19 | #Now let us load an image from the local directory 20 | img = cv2.imread('download.jpg') 21 | 22 | #Let's convert image into gray 23 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 24 | 25 | #Here we will call the method which will find the faces in our input image 26 | faces = face_cascade.detectMultiScale(gray, 1.3, 5) 27 | #Lets run a loop to create sub images of faces from the input image using 28 | #cv2.rectangle function 29 | for (x,y,w,h) in faces: 30 | img = cv2.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2) 31 | roi_gray = gray[y:y+h, x:x+w] 32 | roi_color = img[y:y+h, x:x+w] 33 | 34 | #windows 35 | eyes = eye_cascade.detectMultiScale(roi_gray) 36 | #following function will create the rectangles around the eyes 37 | for (ex,ey,ew,eh) in eyes: 38 | cv2.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2) 39 | #Following Lines will show the detected face images 40 | cv2.imshow('img',img) 41 | cv2.waitKey(0) 42 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Chapter06/RegressionTreeTest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 03-Oct-2017 3 | 4 | @author: DX 5 | ''' 6 | import pprint 7 | from Chapter_06 import RegressionTrees as rg 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | 11 | #Create a Sine wave for demonstration of non-linearity 12 | 13 | #Set the number of samples 14 | N = 256 15 | 16 | #Create time value 17 | ix = np.arange(N) 18 | 19 | #Create the sine wave using the formula sin(2*pi*f) 20 | signal = np.sin(2*np.pi*ix/float(N/2)) 21 | 22 | #Combine both time and amplitude 23 | dataset = range(0,N) 24 | dataset = np.c_[ix,signal] 25 | dataset_ = dataset.copy() 26 | 27 | #Call Gradient boost 28 | weaks = rg.GradientBoost(dataset,5,1,100) 29 | 30 | prediction=[] 31 | actual = [] 32 | 33 | #Run a loop to extract each instance from the data set 34 | for row in dataset_: 35 | 36 | #Create a list to store predictions from different ckassifier for the test instance 37 | preds = 
[]

    #Feed the instance to each of the weak learners
    for i in range(len(weaks)):

        #Get the current weak learner's prediction for this instance
        p = rg.predict(weaks[i], row)

        #Add the prediction to the list
        preds.append(p)

    #Sum the outputs of all the weak learners to get the boosted prediction
    final = (sum(preds))

    #Append the final output to the prediction list and the actual output to the actual list
    prediction.append(final)
    actual.append(row[-1])

#Compute the residuals and the mean squared error between the actual and predicted signals
_,mse = rg.getResidual(actual, prediction)


#Let's plot the original signal against the boosted prediction
plt.figure()
plt.plot(ix,signal,marker='*',markersize=8)
plt.plot(ix,prediction,marker='+',markersize=8)
plt.show()
--------------------------------------------------------------------------------
/Chapter02/SplitCheck.py:
--------------------------------------------------------------------------------
'''
Created on Jun 24, 2017

@author: DX
'''

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):
    gini = 0.0
    for class_value in class_values:
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini

# Select the best split point for a dataset

def get_split(dataset):

    class_values = extractClasses(dataset)

    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini))
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

def extractClasses(dataset):

    class_values = []

    for rows in dataset:
        class_values.append(rows[-1])

    return class_values

dataset = [[0.50000, 1.50000, 1.00000],
           [1.00000, 0.50000, -1.00000],
           [1.25000, 3.50000, 1.00000],
           [1.50000, 4.00000, 1.00000],
           [2.00000, 2.00000, -1.00000],
           [2.50000, 2.50000, 1.00000],
           [3.75000, 3.00000, -1.00000],
           [4.00000, 1.00000, -1.00000]]
split = get_split(dataset)
print('Split: [X%d < %.3f]' % ((split['index']+1), split['value']))
--------------------------------------------------------------------------------
/Chapter01/Stacking.py:
--------------------------------------------------------------------------------
'''
Created on 24-May-2017

@author: aii32199
'''

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingClassifier
from sklearn import cross_validation
import numpy as np
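# Note: the sklearn.cross_validation module imported above was removed in scikit-learn 0.20;
# it is only needed by the commented-out cross-validation block at the end of this file.
# On current scikit-learn releases the equivalent (an assumption, not part of the original
# script) would be:
#     from sklearn import model_selection
#     scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')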
from sklearn.tree import DecisionTreeClassifier 16 | iris = datasets.load_iris() 17 | X, y = iris.data[:, 1:3], iris.target 18 | 19 | def CalculateAccuracy(y_test,pred_label): 20 | nnz = np.shape(y_test)[0] - np.count_nonzero(pred_label - y_test) 21 | acc = 100*nnz/float(np.shape(y_test)[0]) 22 | return acc 23 | 24 | clf1 = KNeighborsClassifier(n_neighbors=2) 25 | clf2 = RandomForestClassifier(n_estimators = 2,random_state=1) 26 | clf3 = GaussianNB() 27 | lr = LogisticRegression() 28 | 29 | clf1.fit(X, y) 30 | clf2.fit(X, y) 31 | clf3.fit(X, y) 32 | 33 | f1 = clf1.predict(X) 34 | acc1 = CalculateAccuracy(y, f1) 35 | print("accuracy from KNN: "+str(acc1) ) 36 | 37 | f2 = clf2.predict(X) 38 | acc2 = CalculateAccuracy(y, f2) 39 | print("accuracy from Random Forest: "+str(acc2) ) 40 | 41 | f3 = clf3.predict(X) 42 | acc3 = CalculateAccuracy(y, f3) 43 | print("accuracy from Naive Bays: "+str(acc3) ) 44 | 45 | f = [f1,f2,f3] 46 | f = np.transpose(f) 47 | 48 | lr.fit(f, y) 49 | final = lr.predict(f) 50 | 51 | acc4 = CalculateAccuracy(y, final) 52 | print("accuracy from Stacking: "+str(acc4) ) 53 | 54 | # accuracy from KNN: 96.66666666666667 55 | # accuracy from Random Forest: 94.66666666666667 56 | # accuracy from Naive Bays: 92.0 57 | # accuracy from Stacking: 97.33333333333333 58 | 59 | # sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 60 | # meta_classifier=lr) 61 | # 62 | # print('3-fold cross validation:\n') 63 | # 64 | # for clf, label in zip([clf1, clf2, clf3, sclf], 65 | # ['KNN', 66 | # 'Random Forest', 67 | # 'Naive Bayes', 68 | # 'StackingClassifier']): 69 | # 70 | # scores = cross_validation.cross_val_score(clf, X, y, 71 | # cv=3, scoring='accuracy') 72 | # print("Accuracy: %0.2f (+/- %0.2f) [%s]" 73 | # % (scores.mean(), scores.std(), label)) 74 | 75 | -------------------------------------------------------------------------------- /Chapter04/utilityFunctions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 02-Sep-2017 3 | 4 | @author: DX 5 | ''' 6 | from csv import reader 7 | from math import sqrt 8 | from random import seed 9 | from random import randrange 10 | import numpy as np 11 | 12 | # Load a CSV file 13 | def load_csv(filename): 14 | dataset = list() 15 | with open(filename, 'r') as file: 16 | csv_reader = reader(file) 17 | for row in csv_reader: 18 | if not row: 19 | continue 20 | dataset.append(row) 21 | return dataset 22 | 23 | def getTrainTestData(dataset,split): 24 | np.random.seed(0) 25 | training = [] 26 | testing = [] 27 | 28 | np.random.shuffle(dataset) 29 | shape = np.shape(dataset) 30 | trainlength = np.uint16(np.floor(split*shape[0])) 31 | 32 | for i in range(trainlength): 33 | training.append(dataset[i]) 34 | 35 | for i in range(trainlength,shape[0]): 36 | testing.append(dataset[i]) 37 | 38 | return training,testing 39 | 40 | # Convert string column to float 41 | def str_column_to_float(dataset, column,length): 42 | 43 | #for row in dataset: 44 | for i in range(length): 45 | row = dataset[i] 46 | if row[column]=='?': 47 | row[column] = 0 48 | else: 49 | row[column] = float(row[column].strip()) 50 | 51 | # Convert string column to integer 52 | def str_column_to_int(dataset, column,length): 53 | 54 | class_values=[] 55 | for i in range(length): 56 | row = dataset[i] 57 | class_values.append(row[column]) 58 | # class_values = [row[column] for row in dataset] 59 | unique = set(class_values) 60 | lookup = dict() 61 | for i, value in enumerate(unique): 62 | lookup[value] = i 63 | for i in 
range(length): 64 | row = dataset[i] 65 | row[column] = lookup[row[column]] 66 | return lookup 67 | 68 | # Split a dataset into k folds 69 | def cross_validation_split(dataset, n_folds): 70 | dataset_split = list() 71 | dataset_copy = list(dataset) 72 | fold_size = int(len(dataset) / n_folds) 73 | for i in range(n_folds): 74 | fold = list() 75 | while len(fold) < fold_size: 76 | index = randrange(len(dataset_copy)) 77 | fold.append(dataset_copy.pop(index)) 78 | dataset_split.append(fold) 79 | return dataset_split 80 | 81 | def subsample(dataset, n_sample): 82 | sample = list() 83 | #n_sample = round(len(dataset) * ratio) 84 | while len(sample) < n_sample: 85 | index = randrange(len(dataset)) 86 | sample.append(dataset[index]) 87 | return sample 88 | 89 | # Calculate accuracy percentage 90 | def accuracy_metric(actual, predicted): 91 | correct = 0 92 | for i in range(len(actual)): 93 | if actual[i] == predicted[i]: 94 | correct += 1 95 | return correct / float(len(actual)) * 100.0 -------------------------------------------------------------------------------- /Chapter04/SpamClassification.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 02-Sep-2017 3 | 4 | @author: DX 5 | ''' 6 | 7 | #Import math for calculations of square roots 8 | from Chapter_03.DecisionTree_CART_RF import load_csv, getTrainTestData, accuracy_metric, str_column_to_float 9 | from Chapter_04 import KNN 10 | import numpy as np 11 | 12 | 13 | #Read CSV file 14 | dataName = 'spamData.csv' 15 | 16 | #Use function load_csv from chapter 3 17 | dataset = load_csv(dataName) 18 | 19 | #Create an empty list to store the data set 20 | dataset_new = [] 21 | 22 | #We will remove incomplete instance from the data set 23 | for i in range(len(dataset)-1): 24 | dataset_new.append(dataset[i]) 25 | dataset = dataset_new 26 | 27 | #Use function str_column_to_float from chapter 3 to convert string values to float 28 | for i in range(0, len(dataset[0])-1): 29 | str_column_to_float(dataset, i) 30 | 31 | str_column_to_float(dataset, len(dataset[0])-1) 32 | 33 | #Split train and test data set using function getTrainTestData 34 | #We will use 80% of the data set as training set and rest for testing 35 | train,test = getTrainTestData(dataset,0.8) 36 | 37 | train = np.array(train) 38 | test = np.array(test) 39 | 40 | shape = np.shape(train) 41 | xtrain = train[:,0:shape[1]-1] 42 | ytrain = train[:,shape[1]-1] 43 | 44 | xtest = test[:,0:shape[1]-1] 45 | ytest = test[:,shape[1]-1] 46 | 47 | #Create empty list to store predictions and actual output 48 | testPredictions=[] 49 | testActual=[] 50 | 51 | #Select number of neighbors for each classifier 52 | k = 7 53 | 54 | #Select sample size 55 | sample_size = 500 56 | 57 | #Select number of random features 58 | n_features = 20 59 | 60 | #Calculate number of classifier on the basis of number of samples. 
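#For a sense of scale (assuming the UCI Spambase data of roughly 4,600 rows): the 80%
#training split leaves about 3,680 rows, so with sample_size = 500 this gives
#n_classifier = 7; note that the loop below iterates over range(1, n_classifier), i.e.
#6 KNN models, each trained on 500 randomly drawn rows and 20 randomly chosen features.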
n_classifier = np.uint8(len(train)/sample_size)

#Get a prediction for each test instance and store it in the list
for i in range(0,len(test)):
    predictions = []

    #Run a loop over each sample
    for cl in range(1,n_classifier):

        #Randomly shuffle the training set and create a sample out of it
        np.random.shuffle(train)
        sample = [train[row] for row in range(sample_size)]

        #Pick the test instance
        test_instance = test[i]

        #Get the neighbors and the prediction based on those neighbors
        neighbors = KNN.getNeighborsBagged(sample, test_instance, k,n_features)
        pred = KNN.getPrediction(neighbors)

        #Append the prediction made from this sample with random features
        predictions.append(pred)

    #Get the final prediction using majority voting over the classifiers
    fin_pred = max(set(predictions), key=predictions.count)
    testActual.append(test_instance[-1])
    testPredictions.append(fin_pred)
    print ("Actual: %s Predicted: %s"%(test_instance[-1],fin_pred))

#Use the accuracy_metric function to evaluate our results
accuracy = accuracy_metric(testActual,testPredictions)

#Print the accuracy
print("Accuracy of the classification: %0.2f"%accuracy)
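For comparison, a roughly equivalent ensemble can be assembled with scikit-learn's BaggingClassifier wrapped around a KNN base learner. The sketch below is an added illustration rather than part of the book's code; it reuses the xtrain/ytrain/xtest/ytest arrays prepared above and the base_estimator argument name used elsewhere in this repository (newer scikit-learn releases rename it to estimator).

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

#Seven 7-nearest-neighbour models, each fit on 500 sampled rows and 20 randomly chosen
#features, with their predictions aggregated across the ensemble - mirroring the manual loop above
bagged_knn = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=7),
                               n_estimators=7,
                               max_samples=500,
                               max_features=20,
                               random_state=0)
bagged_knn.fit(xtrain, ytrain)
print("Accuracy of the scikit-learn bagged KNN: %0.2f" % (100.0 * bagged_knn.score(xtest, ytest)))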
--------------------------------------------------------------------------------
/Chapter09/SVM_Test.py:
--------------------------------------------------------------------------------
'''
Created on 03-Nov-2017

@author: DX
'''
#To help us perform math operations
import numpy as np
#to plot our data and model visually
from matplotlib import pyplot as plt

#Step 1 - Define our data

#Input data - Of the form [X value, Y value, Bias term]
X = np.array([
    [-2,4,-1],
    [4,1,-1],
    [1, 6, -1],
    [2, 4, -1],
    [6, 2, -1],
])

#Associated output labels - First 2 examples are labeled '-1' and last 3 are labeled '+1'
y = np.array([-1,-1,1,1,1])

#Let's plot these examples on a 2D graph!
#For each example
for d, sample in enumerate(X):
    # Plot the negative samples (the first 2)
    if d < 2:
        plt.scatter(sample[0], sample[1], s=120, marker='_', linewidths=2)
    # Plot the positive samples (the last 3)
    else:
        plt.scatter(sample[0], sample[1], s=120, marker='+', linewidths=2)

# Plot a possible hyperplane that separates the two classes:
# we'll pick two points and draw the line between them (naive guess)
plt.plot([-2,6],[6,0.5])
plt.show()

#Let's perform stochastic gradient descent to learn the separating hyperplane between both classes

def svm_sgd_plot(X, Y):
    #Initialize our SVM's weight vector with zeros (3 values)
    w = np.zeros(len(X[0]))
    #The learning rate
    eta = 1
    #How many iterations to train for
    epochs = 100000
    #Store misclassifications so we can plot how they change over time
    errors = []

    #Training part: the gradient descent loop
    for epoch in range(1,epochs):
        error = 0
        for i, x in enumerate(X):
            #Misclassification
            if (Y[i]*np.dot(X[i], w)) < 1:
                #Misclassified: update our weights using the hinge-loss gradient
                w = w + eta * ( (X[i] * Y[i]) + (-2 *(1/epoch)* w) )
                error = 1
            else:
                #Correct classification: update our weights with the regularization term only
                w = w + eta * (-2 *(1/epoch)* w)
        errors.append(error)


    #Let's plot the rate of classification errors during training for our SVM
    plt.plot(errors, '|')
    plt.ylim(0.5,1.5)
    plt.axes().set_yticklabels([])
    plt.xlabel('Epoch')
    plt.ylabel('Misclassified')
    plt.show()

    return w

w = svm_sgd_plot(X,y)
for d, sample in enumerate(X):
    # Plot the negative samples
    if d < 2:
        plt.scatter(sample[0], sample[1], s=120, marker='_', linewidths=2)
    # Plot the positive samples
    else:
        plt.scatter(sample[0], sample[1], s=120, marker='+', linewidths=2)

# Add our test samples
plt.scatter(2,2, s=120, marker='_', linewidths=2, color='yellow')
plt.scatter(4,3, s=120, marker='+', linewidths=2, color='blue')

# Print the hyperplane calculated by svm_sgd_plot()
x2=[w[0],w[1],-w[1],w[0]]
x3=[w[0],w[1],w[1],-w[0]]

x2x3 =np.array([x2,x3])
X,Y,U,V = zip(*x2x3)
ax = plt.gca()
ax.quiver(X,Y,U,V,scale=1, color='blue')
plt.show()
--------------------------------------------------------------------------------
/Chapter03/DecisionTree.py:
--------------------------------------------------------------------------------
'''
Created on 21-Jun-2017

@author: aii32199
'''
import sys

import numpy as np


# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):

    #Initialize the Gini variable
    gini = 0.0

    #Calculate the proportion for each class
    for class_value in class_values:
        #Extract groups
        for group in groups:
            #Number of instances in the group
            size = len(group)
            if size == 0:
                continue
            #Initialize a list to store the class index of the instances
            r = []
            #Get the class of each instance in the group
            for row in group:
                r.append(row[-1])
            #Count the number of instances belonging to the current class
            class_count = r.count(class_value)
            #Calculate the class proportion
            proportion = class_count/float(size)
            #Calculate the Gini index
            gini += (proportion * (1.0 - proportion))
    return gini

def createSplit(attribute,threshold,dataset):

    #Initialize two lists to store the subsets
    lesser, greater = list(),list()

    #Loop through the attribute values and create subsets out of them
    for values in dataset:
        #Apply the threshold
        if values[attribute]