├── .gitignore
├── examples
│   ├── Eclat_TEST.py
│   ├── Apriori_TEST.py
│   ├── FP_growth_TEST.py
│   ├── GMM_TEST.py
│   ├── KMeans_TEST.py
│   ├── DBSCAN_TEST.py
│   ├── Logistic_TEST.py
│   ├── SVM_TEST.py
│   ├── AdaptiveBoost_TEST.py
│   ├── NaiveBayes_TEST.py
│   ├── KNN_TEST.py
│   ├── RandomForest_TEST.py
│   ├── Perceptron_TEST.py
│   ├── DecisionTree_TEST.py
│   ├── HMM_TEST.py
│   ├── PCA_TEST.py
│   ├── LDA_TEST.py
│   ├── LinearRegression_TEST.py
│   ├── TreeRegression_TEST.py
│   ├── Stacking_TEST.py
│   └── Blending_TEST.py
├── preProcess.py
├── dataset
│   ├── dataset2
│   │   ├── train.txt
│   │   └── test.txt
│   ├── dataset3
│   │   └── test.txt
│   ├── dataset5
│   │   ├── train.txt
│   │   └── test.txt
│   └── dataset1
│       ├── test.txt
│       └── train.txt
├── README.md
├── GradientBoostingDecisionTree.py
├── GMM.py
├── Blending.py
├── KNN.py
├── Stacking.py
├── DimensionReduction.py
├── FeatureCombination.py
├── LogisticRegression.py
├── NaiveBayes.py
├── Perceptron.py
├── AdaBoost.py
├── RandomForest.py
├── LinearRegression.py
├── TreeRegression.py
├── DecisionTree.py
├── LICENSE
└── SVM.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/

--------------------------------------------------------------------------------
/examples/Eclat_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: Eclat_TEST.py
@ Author: Ryuk
@ Create Date: 2019-06-02
@ Update Date: 2019-06-02
@ Description: Test Eclat
"""

from AssociationAnalysis import Eclat
import numpy as np
import pandas as pd
import time

trainData = [['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
             ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
             ['socks', 'gloves'],
             ['bread', 'milk', 'shoes', 'socks', 'eggs'],
             ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
             ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice']]

time_start1 = time.time()
clf1 = Eclat()
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of Eclat:", time_end1-time_start1)

--------------------------------------------------------------------------------
/examples/Apriori_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: Apriori_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-28
@ Update Date: 2019-05-31
@ Description: Test Apriori
"""

from AssociationAnalysis import Apriori
import numpy as np
import pandas as pd
import time

trainData = [['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
             ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
             ['socks', 'gloves'],
             ['bread', 'milk', 'shoes', 'socks', 'eggs'],
             ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
             ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice']]

time_start1 = time.time()
clf1 = Apriori()
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of Apriori:", time_end1-time_start1)

--------------------------------------------------------------------------------
/examples/FP_growth_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: FP_growth_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-30
@ Update Date: 2019-05-31
@ Description: Test FP-growth
"""

from AssociationAnalysis import FPgrowth
import numpy as np
import pandas as pd
import time

trainData = [['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
             ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
             ['socks', 'gloves'],
             ['bread', 'milk', 'shoes', 'socks', 'eggs'],
             ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
             ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice']]

time_start1 = time.time()
clf1 = FPgrowth()
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of FP-growth:", time_end1-time_start1)
--------------------------------------------------------------------------------
/examples/GMM_TEST.py:
--------------------------------------------------------------------------------
"""
@FileName: GMM_TEST.py
@Description: Test GMM
@Author: Ryuk
@CreateDate: 2021/06/03
@LastEditTime: 2021/06/03
@Version: v0.1
"""

from sklearn.mixture import GaussianMixture
from GMM import *
import matplotlib.pyplot as plt
import time
from sklearn.datasets import make_blobs


X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)

time_start1 = time.time()
clf1 = GaussianMixtureModel(K=4)
pred = clf1.train(X)
time_end1 = time.time()
print("Runtime of GMM:", time_end1-time_start1)


time_start2 = time.time()
clf2 = GaussianMixture(n_components=4)
pred2 = clf2.fit_predict(X)
time_end2 = time.time()
print("Runtime of Sklearn GMM:", time_end2-time_start2)
plt.scatter(X[:, 0], X[:, 1], c=pred2)
plt.title('Sklearn GMM')
plt.show()

--------------------------------------------------------------------------------
/examples/KMeans_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: KMeans_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-16
@ Update Date: 2019-05-28
@ Description: Test KMeans
"""
import matplotlib.pyplot as plt
from Cluster import KMeans as kmeans
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import time

trainData = pd.read_table('../dataset/dataset6/train.txt', header=None, encoding='gb2312', delim_whitespace=True)
trainData = np.array(trainData)

time_start1 = time.time()
clf1 = kmeans(k=4, cluster_type="KMeans")
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of KMeans:", time_end1-time_start1)

time_start2 = time.time()
clf2 = kmeans(k=4, cluster_type="biKMeans")
pred = clf2.train(trainData)
time_end2 = time.time()
print("Runtime of biKMeans:", time_end2-time_start2)

time_start3 = time.time()
clf3 = kmeans(k=4, cluster_type="KMeans++")
pred3 = clf3.train(trainData)
time_end3 = time.time()
print("Runtime of KMeans++:", time_end3-time_start3)
--------------------------------------------------------------------------------
/examples/DBSCAN_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: DBSCAN_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-20
@ Update Date: 2019-05-20
@ Description: Test DBSCAN
"""

from Cluster import KMeans as kmeans
from Cluster import DBSCAN as dbscan
from sklearn.cluster import DBSCAN

import time
import matplotlib.pyplot as plt
from sklearn import datasets

X1, y1 = datasets.make_circles(n_samples=5000, factor=.6, noise=.05)
trainData = X1[0:1000]

time_start1 = time.time()
clf1 = kmeans(k=4, cluster_type="KMeans")
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of KMeans:", time_end1-time_start1)

time_start2 = time.time()
clf2 = dbscan()
pred = clf2.train(trainData)
time_end2 = time.time()
print("Runtime of DBSCAN:", time_end2-time_start2)

time_start3 = time.time()
clf3 = DBSCAN(eps=0.1, min_samples=10)
clf3.fit(trainData)
pred3 = clf3.labels_
time_end3 = time.time()
plt.scatter(trainData[:, 0], trainData[:, 1], c=pred3)
plt.title('Sklearn DBSCAN')
plt.show()
print("Runtime of Sklearn DBSCAN:", time_end3-time_start3)

--------------------------------------------------------------------------------
/examples/Logistic_TEST.py:
--------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
from LogisticRegression import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset3/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset3/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = LogisticRegressionClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-LogisticRegression: %f" % score1)
print("Runtime of self-LogisticRegression:", time_end1-time_start1)

time_start = time.time()
clf = LogisticRegression()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-LogisticRegression: %f" % score)
print("Runtime of sklearn-LogisticRegression:", time_end-time_start)

--------------------------------------------------------------------------------
/examples/SVM_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: SVM_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-05
@ Update Date: 2019-05-05
@ Description: Test SVM
"""

from sklearn.svm import SVC
from SVM import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset2/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset2/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = SVMClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-SVM: %f" % score1)
print("Runtime of self-SVM:", time_end1-time_start1)

time_start = time.time()
clf = SVC()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-SVM: %f" % score)
print("Runtime of sklearn-SVM:", time_end-time_start)
--------------------------------------------------------------------------------
/examples/AdaptiveBoost_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: AdaptiveBoost_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-28
@ Update Date: 2019-05-31
@ Description: Test AdaBoost
"""
from AdaBoost import *
from SVM import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset2/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset2/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = SVMClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of SVM: %f" % score1)
print("Runtime of SVM:", time_end1-time_start1)

time_start2 = time.time()
clf2 = Adaboost()
clf2.train(trainData, trainLabel)
clf2.predict(testData)
score2 = clf2.accuarcy(testLabel)
time_end2 = time.time()
print("Accuracy of Adaboost: %f" % score2)
print("Runtime of Adaboost:", time_end2-time_start2)

--------------------------------------------------------------------------------
/examples/NaiveBayes_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: NaiveBayes_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-16
@ Update Date: 2019-05-16
@ Description: Test NaiveBayes
"""
from sklearn.naive_bayes import BernoulliNB
from NaiveBayes import *
import numpy as np
import pandas as pd
import time

trainData = pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True)
testData = pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1-time_start1)

time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end-time_start)
--------------------------------------------------------------------------------
/examples/KNN_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: KNN_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-16
@ Update Date: 2019-05-28
@ Description: Test KNN
"""
from sklearn.neighbors import KNeighborsClassifier
from KNN import *
import numpy as np
import pandas as pd
import time

trainData = pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True)
testData = pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = KNNClassifier(k=6)
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.showDetectionResult(testData, testLabel)
time_end1 = time.time()
print("Accuracy of self-KNN: %f" % score1)
print("Runtime of self-KNN:", time_end1-time_start1)

time_start = time.time()
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(trainData, trainLabel)
knn.predict(testData)
score = knn.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-KNN: %f" % score)
print("Runtime of sklearn-KNN:", time_end-time_start)

--------------------------------------------------------------------------------
/examples/RandomForest_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: RandomForest_TEST.py
@ Author: Ryuk
@ Create Date: 2019-07-10
@ Update Date: 2019-07-10
@ Description: Test RandomForest
"""
from RandomForest import RandomForestClassifier, RandomForestRegression
import numpy as np
import pandas as pd
import time
from DecisionTree import *

trainData = pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True)
testData = pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = DecisionTreeClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-DecisionTree: %f" % score1)
print("Runtime of self-DecisionTree:", time_end1-time_start1)

time_start = time.time()
clf = RandomForestClassifier()
clf.train(trainData, trainLabel)
clf.predict(testData)
score = clf.accuarcy(testLabel)
time_end = time.time()
print("Accuracy of RandomForest: %f" % score)
print("Runtime of RandomForest:", time_end-time_start)
--------------------------------------------------------------------------------
/preProcess.py:
--------------------------------------------------------------------------------
import numpy as np

'''
Function: Normalization
Description: Normalize input data. For vector x, the normalization process is given by
             normalization(x) = (x - min(x)) / (max(x) - min(x))
Input:  data      dataType: ndarray   description: input data
Output: normdata  dataType: ndarray   description: output data after normalization
'''

def Normalization(data):
    # get the max and min value of each column
    minValue = data.min(axis=0)
    maxValue = data.max(axis=0)
    diff = maxValue - minValue
    # normalization
    mindata = np.tile(minValue, (data.shape[0], 1))
    normdata = (data - mindata) / np.tile(diff, (data.shape[0], 1))
    return normdata

'''
Function: Standardization
Description: Standardize input data. For vector x, the standardization process is given by
             standardization(x) = (x - mean(x)) / std(x)
Input:  data          dataType: ndarray   description: input data
Output: standarddata  dataType: ndarray   description: output data after standardization
'''

def Standardization(data):
    # get the mean and the standard deviation of each column
    meanValue = data.mean(axis=0)
    stdValue = data.std(axis=0)
    standarddata = (data - np.tile(meanValue, (data.shape[0], 1))) / np.tile(stdValue, (data.shape[0], 1))
    return standarddata
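A minimal usage sketch for these two helpers; the toy matrix below is made up purely for illustration, and both functions treat columns as features:

```python
import numpy as np
from preProcess import Normalization, Standardization

# toy matrix: 3 samples x 2 features (illustrative values only)
data = np.array([[1.0, 200.0],
                 [2.0, 300.0],
                 [3.0, 400.0]])

print(Normalization(data))    # each column rescaled into [0, 1]
print(Standardization(data))  # each column shifted to zero mean, unit std
```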
--------------------------------------------------------------------------------
/examples/Perceptron_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: Perceptron_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-05
@ Update Date: 2019-05-05
@ Description: Test Perceptron
"""

from sklearn.neural_network import MLPClassifier
from Perceptron import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset3/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset3/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = PerceptronClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Perceptron: %f" % score1)
print("Runtime of self-Perceptron:", time_end1-time_start1)

time_start = time.time()
clf = MLPClassifier()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-MLP: %f" % score)
print("Runtime of sklearn-MLP:", time_end-time_start)

--------------------------------------------------------------------------------
/examples/DecisionTree_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: DecisionTree_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-16
@ Update Date: 2019-05-28
@ Description: Test DecisionTree
"""
from sklearn import tree
from DecisionTree import *
import numpy as np
import pandas as pd
import time

trainData = pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True)
testData = pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = DecisionTreeClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-DecisionTree: %f" % score1)
print("Runtime of self-DecisionTree:", time_end1-time_start1)

time_start = time.time()
clf = tree.DecisionTreeClassifier()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-DecisionTree: %f" % score)
print("Runtime of sklearn-DecisionTree:", time_end-time_start)

--------------------------------------------------------------------------------
/examples/HMM_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: HMM_TEST.py
@ Author: Ryuk
@ Create Date: 2019-06-12
@ Update Date: 2019-06-16
@ Description: Test HMM
"""

from HMM import HiddenMarkovModel
import numpy as np
import time

Q = np.array([0, 1])  # hot 0, cold 1
V = np.array([0, 1, 2])
O = np.array([[2, 2, 1], [0, 0, 1], [0, 1, 2]])
I = np.array([[0, 0, 1], [1, 1, 1], [1, 0, 0]])
test = np.array([0, 1, 2])

# supervised learning algorithm
time_start1 = time.time()
clf1 = HiddenMarkovModel(Q, V)
clf1.train(O, I)
time_end1 = time.time()
print("Supervised learning parameters:")
print("Transition probability matrix\n", clf1.A)
print("Observation probability matrix\n", clf1.B)
print("Initial state probability\n", clf1.Pi)
print("Prediction of supervised learning", clf1.predict(test))
print("Runtime of supervised learning:", time_end1-time_start1)
print("________________BOUNDARY_______________________________________")

# unsupervised learning algorithm
time_start2 = time.time()
clf2 = HiddenMarkovModel(Q, V)
clf2.train(O)
time_end2 = time.time()
print("Unsupervised learning parameters:")
print("Transition probability matrix\n", clf2.A)
print("Observation probability matrix\n", clf2.B)
print("Initial state probability\n", clf2.Pi)
print("Prediction of unsupervised learning", clf2.predict(test))
print("Runtime of unsupervised learning:", time_end2-time_start2)
--------------------------------------------------------------------------------
/examples/PCA_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: PCA_TEST.py
@ Author: Ryuk
@ Create Date: 2019-06-03
@ Update Date: 2019-06-06
@ Description: Test PCA
"""

from DimensionReduction import PCA
from sklearn.decomposition import PCA as pca
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
import pandas as pd

trainData = np.array(pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
train_y = trainData[:, -1]
train_x = np.delete(trainData, -1, axis=1)
test_y = testData[:, -1]
test_x = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = PCA()
clf1.train(train_x)
train_x = clf1.transformData(train_x)
test_x = clf1.transformData(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of PCA:", clf.score(test_x, test_y))
time_end1 = time.time()
print("Runtime of PCA:", time_end1-time_start1)

time_start2 = time.time()
# restore the raw features, which the transforms above overwrote
train_x = np.delete(trainData, -1, axis=1)
test_x = np.delete(testData, -1, axis=1)
clf2 = pca(n_components=1)
train_x = clf2.fit_transform(train_x)
test_x = clf2.transform(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of sklearn PCA:", clf.score(test_x, test_y))
time_end2 = time.time()
print("Runtime of sklearn PCA:", time_end2-time_start2)

--------------------------------------------------------------------------------
/examples/LDA_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: LDA_TEST.py
@ Author: Ryuk
@ Create Date: 2019-06-04
@ Update Date: 2019-06-04
@ Description: Test LDA
"""

from sklearn.model_selection import train_test_split
from DimensionReduction import LDA
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import time
from sklearn.linear_model import LogisticRegression
import pandas as pd

trainData = np.array(pd.read_table('../dataset/dataset1/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset1/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
train_y = trainData[:, -1]
train_x = np.delete(trainData, -1, axis=1)
test_y = testData[:, -1]
test_x = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = LDA()
clf1.train(train_x, train_y)
train_x = clf1.transformData(train_x)
test_x = clf1.transformData(test_x)
clf = LogisticRegression()
clf.fit(train_x, train_y)
print("Accuracy of LDA:", clf.score(test_x, test_y))
time_end1 = time.time()
print("Runtime of LDA:", time_end1-time_start1)


time_start2 = time.time()
# restore the raw features, which the transforms above overwrote
train_x = np.delete(trainData, -1, axis=1)
test_x = np.delete(testData, -1, axis=1)
clf2 = LinearDiscriminantAnalysis(n_components=1)
train_x = clf2.fit_transform(train_x, train_y)
test_x = clf2.transform(test_x)
clf = LogisticRegression()
clf.fit(train_x, train_y)
print("Accuracy of sklearn LDA:", clf.score(test_x, test_y))
time_end2 = time.time()
print("Runtime of sklearn LDA:", time_end2-time_start2)
--------------------------------------------------------------------------------
/examples/LinearRegression_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: LinearRegression_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-10
@ Update Date: 2019-05-10
@ Description: Test LinearRegression
"""


from LinearRegression import *
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

def plot(real_label, regression_label):
    # test_label = np.expand_dims(test_label, axis=1)
    plot1 = plt.plot(regression_label, 'r*', label='Regression values')
    plot2 = plt.plot(real_label, 'b', label='Real values')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend(loc=3)
    plt.title('Regression')
    plt.show()

trainData = np.array(pd.read_table('../dataset/dataset4/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset4/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = linear_model.LinearRegression()
clf1.fit(trainData, trainLabel)
regression_label = clf1.predict(testData)
time_end1 = time.time()
plot(testLabel, regression_label)
print("Runtime of Sklearn-linear regression:", time_end1-time_start1)

time_start2 = time.time()
clf2 = Regression()
clf2.train(trainData, trainLabel)
regression_label2 = clf2.predict(testData)
time_end2 = time.time()
plot(testLabel, regression_label2)
print("Runtime of self-linear regression:", time_end2-time_start2)

--------------------------------------------------------------------------------
/examples/TreeRegression_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: TreeRegression_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-13
@ Update Date: 2019-05-15
@ Description: Test TreeRegression
"""

import matplotlib.pyplot as plt
from sklearn import linear_model

from TreeRegression import *
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pandas as pd
import time

def plot(real_label, regression_label):
    # test_label = np.expand_dims(test_label, axis=1)
    plot1 = plt.plot(regression_label, 'r*', label='Regression values')
    plot2 = plt.plot(real_label, 'b', label='Real values')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend(loc=3)
    plt.title('Tree Regression')
    plt.show()


trainData = np.array(pd.read_table('../dataset/dataset5/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset5/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)


time_start1 = time.time()
clf1 = DecisionTreeRegressor()
clf1.fit(trainData, trainLabel)
regression_label = clf1.predict(testData)
time_end1 = time.time()
plot(testLabel, regression_label)
print("Runtime of Sklearn-tree regression:", time_end1-time_start1)

time_start2 = time.time()
clf2 = treeRegression()
clf2.train(trainData, trainLabel)
regression_label2 = clf2.predict(testData)
time_end2 = time.time()
plot(testLabel, regression_label2)
print("Runtime of self-tree regression:", time_end2-time_start2)
--------------------------------------------------------------------------------
/examples/Stacking_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: Stacking_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-05
@ Update Date: 2019-05-05
@ Description: Test Stacking
"""

from Stacking import *
from Perceptron import *
from LogisticRegression import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset3/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset3/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

clfs = [PerceptronClassifier(), PerceptronClassifier(), LogisticRegressionClassifier(), LogisticRegressionClassifier()]

time_start1 = time.time()
clf1 = StackingClassifier(classifier_set=clfs)
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Stacking: %f" % score1)
print("Runtime of self-Stacking:", time_end1-time_start1)

time_start2 = time.time()
clf2 = LogisticRegressionClassifier()
clf2.train(trainData, trainLabel)
clf2.predict(testData)
score2 = clf2.accuarcy(testLabel)
time_end2 = time.time()
print("Accuracy of self-Logistic: %f" % score2)
print("Runtime of self-Logistic:", time_end2-time_start2)

time_start3 = time.time()
clf3 = PerceptronClassifier()
clf3.train(trainData, trainLabel)
clf3.predict(testData)
score3 = clf3.accuarcy(testLabel)
time_end3 = time.time()
print("Accuracy of self-Perceptron: %f" % score3)
print("Runtime of self-Perceptron:", time_end3-time_start3)

--------------------------------------------------------------------------------
/examples/Blending_TEST.py:
--------------------------------------------------------------------------------
"""
@ Filename: Blending_TEST.py
@ Author: Ryuk
@ Create Date: 2019-05-04
@ Update Date: 2019-05-04
@ Description: Test Blending
"""

from Blending import *
from Perceptron import *
from LogisticRegression import *
import numpy as np
import pandas as pd
import time

trainData = np.array(pd.read_table('../dataset/dataset3/train.txt', header=None, encoding='gb2312', delim_whitespace=True))
testData = np.array(pd.read_table('../dataset/dataset3/test.txt', header=None, encoding='gb2312', delim_whitespace=True))
trainLabel = trainData[:, -1]
trainData = np.delete(trainData, -1, axis=1)
testLabel = testData[:, -1]
testData = np.delete(testData, -1, axis=1)

clfs = [PerceptronClassifier(), PerceptronClassifier(), LogisticRegressionClassifier(), LogisticRegressionClassifier()]

time_start1 = time.time()
clf1 = BlendingClassifier(classifier_set=clfs)
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Blending: %f" % score1)
print("Runtime of self-Blending:", time_end1-time_start1)

time_start2 = time.time()
clf2 = LogisticRegressionClassifier()
clf2.train(trainData, trainLabel)
clf2.predict(testData)
score2 = clf2.accuarcy(testLabel)
time_end2 = time.time()
print("Accuracy of self-Logistic: %f" % score2)
print("Runtime of self-Logistic:", time_end2-time_start2)

time_start3 = time.time()
clf3 = PerceptronClassifier()
clf3.train(trainData, trainLabel)
clf3.predict(testData)
score3 = clf3.accuarcy(testLabel)
time_end3 = time.time()
print("Accuracy of self-Perceptron: %f" % score3)
print("Runtime of 
self-Perceptron:", time_end3-time_start3) 51 | 52 | 53 | -------------------------------------------------------------------------------- /dataset/dataset2/train.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 -1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 -0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | -0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 -1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | 
-0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /dataset/dataset2/test.txt: -------------------------------------------------------------------------------- 1 | 0.676771 -0.486687 -1.000000 2 | 0.008473 0.186070 1.000000 3 | -0.727789 0.594062 -1.000000 4 | 0.112367 0.287852 1.000000 5 | 0.383633 -0.038068 1.000000 6 | -0.927138 -0.032633 -1.000000 7 | -0.842803 -0.423115 -1.000000 8 | -0.003677 -0.367338 1.000000 9 | 0.443211 -0.698469 -1.000000 10 | -0.473835 0.005233 1.000000 11 | 0.616741 0.590841 -1.000000 12 | 0.557463 -0.373461 -1.000000 13 | -0.498535 -0.223231 -1.000000 14 | -0.246744 0.276413 1.000000 15 | -0.761980 -0.244188 -1.000000 16 | 0.641594 -0.479861 -1.000000 17 | -0.659140 0.529830 -1.000000 18 | -0.054873 -0.238900 1.000000 19 | -0.089644 -0.244683 1.000000 20 | -0.431576 -0.481538 -1.000000 21 | -0.099535 0.728679 -1.000000 22 | -0.188428 0.156443 1.000000 23 | 0.267051 0.318101 1.000000 24 | 0.222114 -0.528887 -1.000000 25 | 0.030369 0.113317 1.000000 26 | 0.392321 0.026089 1.000000 27 | 0.298871 -0.915427 -1.000000 28 | -0.034581 -0.133887 1.000000 29 | 0.405956 0.206980 1.000000 30 | 0.144902 -0.605762 -1.000000 31 | 0.274362 -0.401338 1.000000 32 | 0.397998 -0.780144 -1.000000 33 | 0.037863 0.155137 1.000000 34 | -0.010363 -0.004170 1.000000 35 | 0.506519 0.486619 -1.000000 36 | 0.000082 -0.020625 1.000000 37 | 0.057761 -0.155140 1.000000 38 | 0.027748 -0.553763 -1.000000 39 | -0.413363 -0.746830 -1.000000 40 | 0.081500 -0.014264 1.000000 41 | 0.047137 -0.491271 1.000000 42 | -0.267459 0.024770 1.000000 43 | -0.148288 -0.532471 -1.000000 44 | -0.225559 -0.201622 1.000000 45 | 0.772360 -0.518986 -1.000000 46 | -0.440670 0.688739 -1.000000 47 | 0.329064 -0.095349 1.000000 48 | 0.970170 -0.010671 -1.000000 49 | -0.689447 -0.318722 -1.000000 50 | -0.465493 -0.227468 -1.000000 51 | -0.049370 0.405711 1.000000 52 | -0.166117 0.274807 1.000000 53 | 0.054483 0.012643 1.000000 54 | 0.021389 0.076125 1.000000 55 | -0.104404 -0.914042 -1.000000 56 | 0.294487 0.440886 -1.000000 57 | 0.107915 -0.493703 -1.000000 58 | 0.076311 0.438860 1.000000 59 | 0.370593 -0.728737 -1.000000 60 | 0.409890 0.306851 -1.000000 61 | 0.285445 0.474399 -1.000000 62 | -0.870134 -0.161685 -1.000000 63 | -0.654144 -0.675129 -1.000000 64 | 0.285278 -0.767310 -1.000000 65 | 0.049548 -0.000907 1.000000 66 | 0.030014 -0.093265 1.000000 67 | -0.128859 0.278865 1.000000 68 | 0.307463 0.085667 1.000000 69 | 0.023440 0.298638 1.000000 70 | 0.053920 0.235344 1.000000 71 | 0.059675 0.533339 -1.000000 72 | 0.817125 0.016536 -1.000000 73 | -0.108771 0.477254 1.000000 74 | -0.118106 0.017284 1.000000 75 | 0.288339 0.195457 1.000000 76 | 0.567309 -0.200203 -1.000000 77 | -0.202446 0.409387 1.000000 78 | -0.330769 -0.240797 1.000000 79 | -0.422377 0.480683 -1.000000 80 | -0.295269 0.326017 1.000000 81 | 0.261132 0.046478 1.000000 82 | -0.492244 -0.319998 -1.000000 83 | -0.384419 0.099170 1.000000 84 | 0.101882 -0.781145 -1.000000 85 | 0.234592 -0.383446 1.000000 86 | -0.020478 -0.901833 -1.000000 87 | 0.328449 0.186633 1.000000 88 | -0.150059 -0.409158 1.000000 89 | -0.155876 -0.843413 -1.000000 90 | -0.098134 -0.136786 1.000000 91 | 0.110575 -0.197205 1.000000 92 | 0.219021 0.054347 1.000000 93 | 0.030152 0.251682 1.000000 94 | 0.033447 -0.122824 1.000000 95 | -0.686225 -0.020779 -1.000000 96 | -0.911211 -0.262011 -1.000000 97 | 0.572557 0.377526 -1.000000 98 | -0.073647 -0.519163 -1.000000 99 | -0.281830 -0.797236 -1.000000 100 | -0.555263 
0.126232 -1.000000 101 | -------------------------------------------------------------------------------- /dataset/dataset3/test.txt: -------------------------------------------------------------------------------- 1 | 2 1 38.50 54 20 0 1 2 2 3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1 2 | 2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1 3 | 1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1 4 | 1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0 5 | 2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1 6 | 1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1 7 | 2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1 8 | 2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1 9 | 2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1 10 | 2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0 11 | 2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1 12 | 1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0 13 | 1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0 14 | 2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1 15 | 2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1 16 | 1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1 17 | 2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1 18 | 1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0 19 | 2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0 20 | 1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0 21 | 1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1 22 | 2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1 23 | 1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0 24 | 1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0 25 | 2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1 26 | 2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1 27 | 2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1 28 | 1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1 29 | 2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1 30 | 1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1 31 | 2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1 32 | 1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1 33 | 1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1 34 | 2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1 36 | 1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0 37 | 1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1 38 | 2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1 39 | 2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1 40 | 2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1 41 | 2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1 42 | 1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1 43 | 1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1 44 | 2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0 45 | 1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1 46 | 2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1 47 | 1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1 48 | 1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1 49 | 1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1 50 | 1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0 51 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0 52 | 2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0 53 | 1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0 54 | 1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1 55 | 2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1 56 | 2 
1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1
57 | 1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1
58 | 1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0
59 | 1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1
60 | 1 1 0 75 12 1 1 4 1 5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1
61 | 2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1
62 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
63 | 2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1
64 | 2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1
65 | 1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0
66 | 2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1
67 | 2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MachineLearning
[![GPL-3.0 Licensed](https://img.shields.io/crates/l/rustc-serialize)](https://opensource.org/licenses/GPL-3.0) [![Python Version](https://img.shields.io/badge/Python-3.x-blue.svg)](https://www.python.org/)

Machine learning algorithms implemented from scratch in Python 3.6

## What's in it?
+ **Classification**
    1. [AdaBoost](https://github.com/DandelionLau/MachineLearning/tree/master/AdaBoost.py)
    2. [Blending](https://github.com/DandelionLau/MachineLearning/blob/master/Blending.py)
    3. [DecisionTree](https://github.com/DandelionLau/MachineLearning/blob/master/Tree.py)
    4. [GBDT](https://github.com/DandelionLau/MachineLearning/blob/master/GradientBoostingDecisionTree.py)
    5. [KNN](https://github.com/DandelionLau/MachineLearning/blob/master/KNN.py)
    6. [LogisticRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LogisticRegression.py)
    7. [NaiveBayes](https://github.com/DandelionLau/MachineLearning/blob/master/NaiveBayes.py)
    8. [Perceptron](https://github.com/DandelionLau/MachineLearning/blob/master/Perceptron.py)
    9. [RandomForest](https://github.com/DandelionLau/MachineLearning/blob/master/RandomForest.py)
    10. [Stacking](https://github.com/DandelionLau/MachineLearning/blob/master/Stacking.py)
    11. [SVM](https://github.com/DandelionLau/MachineLearning/blob/master/SVM.py)

+ **Regression**
    1. [GBDT](https://github.com/DandelionLau/MachineLearning/blob/master/GradientBoostingDecisionTree.py)
    2. [LinearRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LinearRegression.py)
    3. [LocallyWeightedLinearRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LinearRegression.py)
    4. [LassoRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LinearRegression.py)
    5. [RandomForest](https://github.com/DandelionLau/MachineLearning/blob/master/RandomForest.py)
    6. [RidgeRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LinearRegression.py)
    7. [StepWiseRegression](https://github.com/DandelionLau/MachineLearning/blob/master/LinearRegression.py)
    8. [TreeRegression](https://github.com/DandelionLau/MachineLearning/blob/master/Tree.py)

+ **Cluster**
    1. [BiKmeans](https://github.com/DandelionLau/MachineLearning/blob/master/Cluster.py)
    2. [DBSCAN](https://github.com/DandelionLau/MachineLearning/blob/master/Cluster.py)
    3. [KMeans](https://github.com/DandelionLau/MachineLearning/blob/master/Cluster.py)
    4. [KMeans++](https://github.com/DandelionLau/MachineLearning/blob/master/Cluster.py)
    5. [GMM](https://github.com/Ryuk17/MachineLearning/blob/master/GMM.py)

+ **Association Analysis**
    1. [Apriori](https://github.com/DandelionLau/MachineLearning/blob/master/AssociationAnalysis.py)
    2. [Eclat](https://github.com/DandelionLau/MachineLearning/blob/master/AssociationAnalysis.py)
    3. [FP-growth](https://github.com/DandelionLau/MachineLearning/blob/master/AssociationAnalysis.py)

+ **Dimensionality Reduction**
    1. [LDA](https://github.com/DandelionLau/MachineLearning/blob/master/DimensionReduction.py)
    2. [PCA](https://github.com/DandelionLau/MachineLearning/blob/master/DimensionReduction.py)

+ **Others**
    1. [HMM](https://github.com/DandelionLau/MachineLearning/blob/master/HMM.py)


## Tutorials
Chinese tutorial: [从零实现机器学习算法](https://blog.csdn.net/sinat_35821976/category_9276758.html)
English tutorial: [Step-by-Step Guide To Implement Machine Learning](https://www.codeproject.com/script/Articles/MemberArticles.aspx?amid=14354398)

## Main References
1. [CS229: Machine Learning](http://cs229.stanford.edu/)
2. [Machine Learning IN ACTION](https://www.manning.com/books/machine-learning-in-action)
3. [统计学习方法 (Statistical Learning Methods)](https://baike.baidu.com/item/%E7%BB%9F%E8%AE%A1%E5%AD%A6%E4%B9%A0%E6%96%B9%E6%B3%95/10430179)

## Dependencies
1. Install [Python 3.6](https://www.python.org/)
2. Install [NumPy](http://www.numpy.org/)
3. Install [Scikit-learn](https://scikit-learn.org/)
4. Install [Pandas](https://pandas.pydata.org/)
5. Install [Matplotlib](https://matplotlib.org/)
--------------------------------------------------------------------------------
/GradientBoostingDecisionTree.py:
--------------------------------------------------------------------------------
"""
@ Filename: GradientBoostingDecisionTree.py
@ Author: Ryuk
@ Create Date: 2019-07-09
@ Update Date: 2019-07-10
@ Description: Implement GradientBoostingDecisionTree
"""

import numpy as np
from TreeRegression import RegressionTree
import pickle

class GBDTClassifier:
    def __init__(self, tree_num=10):
        self.tree_num = tree_num


class GBDTRegression:
    def __init__(self, tree_num=10, error_threshold=1, N=4, alpha=0.01, iterations=100):
        self.tree_num = tree_num
        self.error_threshold = error_threshold
        self.N = N
        self.alpha = alpha
        self.trees = []
        self.gamma = []                  # multiplier for each model
        self.residual = None
        self.iterations = iterations     # iterations for gamma
        self.last_prediction = None
        self.prediction = None

    '''
    Function: initializeModel
    Description: initialize the model with a constant first prediction
    Input: train_label    dataType: ndarray   description: train_label
    '''
    def initializeModel(self, train_label):
        # fit the constant initial prediction F0 by gradient descent on the squared loss
        x = np.mean(train_label)
        for i in range(self.iterations):
            error = train_label - x
            x = x + self.alpha * np.mean(error)
        self.residual = train_label - x
        self.last_prediction = x
        self.trees.append(x)
        self.gamma.append(1)

    '''
    Function: getGamma
    Description: get the multiplier gamma for the latest model
    Input: train_label         dataType: ndarray   description: labels
           last_prediction     dataType: ndarray   description: prediction of the previous ensemble
           current_prediction  dataType: ndarray   description: prediction of the new tree
    '''
    def getGamma(self, train_label, last_prediction, current_prediction):
        # fit gamma by gradient descent on the squared loss of the combined prediction
        gamma = 1.0
        for i in range(self.iterations):
            error = train_label - last_prediction - gamma * current_prediction
            gamma = gamma + self.alpha * np.mean(error * current_prediction)
        self.residual = train_label - last_prediction - gamma * current_prediction
        self.last_prediction = last_prediction + gamma * current_prediction
        self.gamma.append(gamma)

    '''
    Function: train
    Description: train the model
    Input: train_data     dataType: ndarray   description: features
           train_label    dataType: ndarray   description: labels
    Output: self          dataType: obj       description: the trained model
    '''
    def train(self, train_data, train_label):
        # initialize with the constant model
        self.initializeModel(train_label)

        # fit each tree to the current residuals
        for i in range(self.tree_num):
            clf = RegressionTree(self.error_threshold, self.N, self.alpha)
            clf.train(train_data, self.residual)
            prediction = clf.predict(train_data)
            self.trees.append(clf)
            self.getGamma(train_label, self.last_prediction, prediction)
        return self

    '''
    Function: predict
    Description: predict the testing set
    Input: test_data      dataType: ndarray   description: features
    Output: prediction    dataType: ndarray   description: the prediction results for testing set
    '''
    def predict(self, test_data):
        prediction = np.zeros(len(test_data))
        for i in range(len(self.trees)):
            if i == 0:
                # the first entry is the constant initial prediction
                prediction += self.gamma[i] * self.trees[i]
            else:
                clf_prediction = self.trees[i].predict(test_data)
                prediction += self.gamma[i] * clf_prediction

        self.prediction = prediction
        return prediction

    '''
    Function: save
    Description: save the model as pkl
    Input: filename    dataType: str   description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')
        model = {'trees': self.trees, 'gamma': self.gamma}
        pickle.dump(model, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input: filename    dataType: str   description: the path of the saved model
    Output: self       dataType: obj   description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        model = pickle.load(f)
        self.trees = model['trees']
        self.gamma = model['gamma']
        return self
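A minimal usage sketch for GBDTRegression, assuming it is run from the repository root, that dataset5 keeps the last-column-is-target layout used by the other regression examples, and that `TreeRegression.RegressionTree` behaves as it is used inside `train()`:

```python
import numpy as np
import pandas as pd
from GradientBoostingDecisionTree import GBDTRegression

# load a regression dataset: the last column is the target (assumed layout)
trainData = np.array(pd.read_table('dataset/dataset5/train.txt', header=None,
                                   encoding='gb2312', delim_whitespace=True))
train_y = trainData[:, -1]
train_x = np.delete(trainData, -1, axis=1)

clf = GBDTRegression(tree_num=10)
clf.train(train_x, train_y)
print(clf.predict(train_x)[:5])  # first few fitted values
```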
--------------------------------------------------------------------------------
/GMM.py:
--------------------------------------------------------------------------------
"""
@FileName: GMM.py
@Description: Implement GMM
@Author: Ryuk
@CreateDate: 2021/05/30
@LastEditTime: 2021/05/30
@Version: v0.1
"""

import numpy as np
import pickle
import preProcess
from tqdm import tqdm
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt

class GaussianMixtureModel:
    def __init__(self, K, D=2, iterations=100, norm_type="Normalization"):
        self.norm_type = norm_type
        self.iterations = iterations
        self.K = K
        self.D = D
        self.N = 0
        self.alpha = np.random.dirichlet(np.ones(self.K))
        self.mu = np.random.rand(K, D)
        self.sigma = np.array([np.eye(self.D)] * K)
        self.gamma = None
        self.label = None

    '''
    Function: GaussianPDF
    Description: evaluate a Gaussian density with the given mu and sigma at x
    Input: mu       dataType: ndarray   description: mean vector
           sigma    dataType: ndarray   description: covariance matrix
           x        dataType: ndarray   description: samples
    Output: pdf     dataType: ndarray   description: density of each sample
    '''
    def GaussianPDF(self, mu, sigma, x):
        gaussian = multivariate_normal(mu, sigma)
        return gaussian.pdf(x)

    '''
    Function: train
    Description: train the model with EM
    Input: train_data    dataType: ndarray   description: features
    Output: label        dataType: ndarray   description: cluster label of each sample
    '''
    def train(self, train_data, plotResult=True):
        self.N = len(train_data)
        self.gamma = np.zeros([self.N, self.K])

        # if self.norm_type == "Standardization":
        #     train_data = preProcess.Standardization(train_data)
        # else:
        #     train_data = preProcess.Normalization(train_data)

        for i in tqdm(range(self.iterations)):
            # E-step: responsibilities gamma[j, k] = alpha_k * N(x_j | mu_k, sigma_k), then normalize per sample
            for k in range(self.K):
                self.gamma[:, k] = self.alpha[k] * self.GaussianPDF(self.mu[k], self.sigma[k], train_data)

            for j in range(self.N):
                self.gamma[j, :] = self.gamma[j, :] / np.sum(self.gamma[j, :])

            # M-step: update the weight, mean and covariance of each component
            for k in range(self.K):
                gamma_sum = np.sum(self.gamma[:, k])
                self.mu[k] = np.sum(np.dot(self.gamma[None, :, k], train_data), axis=0) / gamma_sum
                self.sigma[k] = (train_data - self.mu[k]).T * np.multiply(np.mat(train_data - self.mu[k]), np.mat(self.gamma[:, k]).T) / gamma_sum
                self.alpha[k] = gamma_sum / self.N
        self.label = np.argmax(self.gamma, axis=1)

        if plotResult:
            self.plotResult(train_data)
        return self.label


    '''
    Function: predict
    Description: predict the test data
    Input: test_data    dataType: ndarray   description: features
    Output: label       dataType: ndarray   description: the predicted label
    '''
    def predict(self, test_data):
        self.N = len(test_data)
        self.gamma = np.zeros([self.N, self.K])

        # a single E-step under the trained parameters
        for k in range(self.K):
            self.gamma[:, k] = self.alpha[k] * self.GaussianPDF(self.mu[k], self.sigma[k], test_data)
        for j in range(self.N):
            self.gamma[j, :] = self.gamma[j, :] / np.sum(self.gamma[j, :])
        self.label = np.argmax(self.gamma, axis=1)
        return self.label

    '''
    Function: plotResult
    Description: show the clustering result
    '''
    def plotResult(self, train_data):
        plt.scatter(train_data[:, 0], train_data[:, 1], c=self.label)
        plt.title('GMM')
        plt.show()

    '''
    Function: save
    Description: save the model as pkl
    Input: filename    dataType: str   description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')
        model = {'alpha': self.alpha, 'mu': self.mu, 'sigma': self.sigma}
        pickle.dump(model, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input: filename    dataType: str   description: the path of the saved model
    Output: self       dataType: obj   description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        model = pickle.load(f)
        self.alpha = model['alpha']
        self.mu = model['mu']
        self.sigma = model['sigma']
        return self
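A short sketch of the save/load cycle, reusing the synthetic blobs from examples/GMM_TEST.py; the model filename here is arbitrary:

```python
from sklearn.datasets import make_blobs
from GMM import GaussianMixtureModel

X, _ = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)

clf = GaussianMixtureModel(K=4)
clf.train(X, plotResult=False)
clf.save('gmm_model.pkl')      # pickles alpha, mu and sigma

clf2 = GaussianMixtureModel(K=4)
clf2.load('gmm_model.pkl')
print(clf2.predict(X)[:10])    # E-step under the loaded parameters
```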
--------------------------------------------------------------------------------
/dataset/dataset5/train.txt:
--------------------------------------------------------------------------------
1 | 3.000000 46.852122
2 | 23.000000 178.676107
3 | 0.000000 86.154024
4 | 6.000000 68.707614
5 | 15.000000 139.737693
6 | 17.000000 141.988903
7 | 12.000000 94.477135
8 | 8.000000 86.083788
9 | 9.000000 97.265824
10 | 7.000000 80.400027
11 | 8.000000 83.414554
12 | 1.000000 52.525471
13 | 16.000000 127.060008
14 | 9.000000 101.639269
15 | 14.000000 146.412680
16 | 
15.000000 144.157101 17 | 17.000000 152.699910 18 | 19.000000 136.669023 19 | 21.000000 166.971736 20 | 21.000000 165.467251 21 | 3.000000 38.455193 22 | 6.000000 75.557721 23 | 4.000000 22.171763 24 | 5.000000 50.321915 25 | 0.000000 74.412428 26 | 5.000000 42.052392 27 | 1.000000 42.489057 28 | 14.000000 139.185416 29 | 21.000000 140.713725 30 | 5.000000 63.222944 31 | 5.000000 56.294626 32 | 9.000000 91.674826 33 | 22.000000 173.497655 34 | 17.000000 152.692482 35 | 9.000000 113.920633 36 | 1.000000 51.552411 37 | 9.000000 100.075315 38 | 16.000000 137.803868 39 | 18.000000 135.925777 40 | 3.000000 45.550762 41 | 16.000000 149.933224 42 | 2.000000 27.914173 43 | 6.000000 62.103546 44 | 20.000000 173.942381 45 | 12.000000 119.200505 46 | 6.000000 70.730214 47 | 16.000000 156.260832 48 | 15.000000 132.467643 49 | 19.000000 161.164086 50 | 17.000000 138.031844 51 | 23.000000 169.747881 52 | 11.000000 116.761920 53 | 4.000000 34.305905 54 | 6.000000 68.841160 55 | 10.000000 119.535227 56 | 20.000000 158.104763 57 | 18.000000 138.390511 58 | 5.000000 59.375794 59 | 7.000000 80.802300 60 | 11.000000 108.611485 61 | 10.000000 91.169028 62 | 15.000000 154.104819 63 | 5.000000 51.100287 64 | 3.000000 32.334330 65 | 15.000000 150.551655 66 | 10.000000 111.023073 67 | 0.000000 87.489950 68 | 2.000000 46.726299 69 | 7.000000 92.540440 70 | 15.000000 135.715438 71 | 19.000000 152.960552 72 | 19.000000 162.789223 73 | 21.000000 167.176240 74 | 22.000000 164.323358 75 | 12.000000 104.823071 76 | 1.000000 35.554328 77 | 11.000000 114.784640 78 | 1.000000 36.819570 79 | 12.000000 130.266826 80 | 12.000000 126.053312 81 | 18.000000 153.378289 82 | 7.000000 70.089159 83 | 15.000000 139.528624 84 | 19.000000 157.137999 85 | 23.000000 183.595248 86 | 7.000000 73.431043 87 | 11.000000 128.176167 88 | 22.000000 183.181247 89 | 13.000000 112.685801 90 | 18.000000 161.634783 91 | 6.000000 63.169478 92 | 7.000000 63.393975 93 | 19.000000 165.779578 94 | 14.000000 143.973398 95 | 22.000000 185.131852 96 | 3.000000 45.275591 97 | 6.000000 62.018003 98 | 0.000000 83.193398 99 | 7.000000 76.847802 100 | 19.000000 147.087386 101 | 7.000000 62.812086 102 | 1.000000 49.910068 103 | 11.000000 102.169335 104 | 11.000000 105.108121 105 | 6.000000 63.429817 106 | 12.000000 121.301542 107 | 17.000000 163.253962 108 | 13.000000 119.588698 109 | 0.000000 87.333807 110 | 20.000000 144.484066 111 | 21.000000 168.792482 112 | 23.000000 159.751246 113 | 20.000000 162.843592 114 | 14.000000 145.664069 115 | 19.000000 146.838515 116 | 12.000000 132.049377 117 | 18.000000 155.756119 118 | 22.000000 155.686345 119 | 7.000000 73.913958 120 | 1.000000 66.761881 121 | 7.000000 65.855450 122 | 6.000000 56.271026 123 | 19.000000 155.308523 124 | 12.000000 124.372873 125 | 17.000000 136.025960 126 | 14.000000 132.996861 127 | 21.000000 172.639791 128 | 17.000000 135.672594 129 | 8.000000 90.323742 130 | 5.000000 62.462698 131 | 16.000000 159.048794 132 | 14.000000 139.991227 133 | 3.000000 37.026678 134 | 9.000000 100.839901 135 | 9.000000 93.097395 136 | 15.000000 123.645221 137 | 15.000000 147.327185 138 | 1.000000 40.055830 139 | 0.000000 88.192829 140 | 17.000000 139.174517 141 | 22.000000 169.354493 142 | 17.000000 136.354272 143 | 9.000000 90.692829 144 | 7.000000 63.987997 145 | 14.000000 128.972231 146 | 10.000000 108.433394 147 | 2.000000 49.321034 148 | 19.000000 171.615671 149 | 9.000000 97.894855 150 | 0.000000 68.962453 151 | 9.000000 72.063371 152 | 22.000000 157.000070 153 | 12.000000 114.461754 154 | 6.000000 58.239465 155 
| 9.000000 104.601048 156 | 8.000000 90.772359 157 | 22.000000 164.428791 158 | 5.000000 34.804083 159 | 5.000000 37.089459 160 | 22.000000 177.987605 161 | 10.000000 89.439608 162 | 6.000000 70.711362 163 | 23.000000 181.731482 164 | 20.000000 151.538932 165 | 7.000000 66.067228 166 | 6.000000 61.565125 167 | 20.000000 184.441687 168 | 9.000000 91.569158 169 | 9.000000 98.833425 170 | 17.000000 144.352866 171 | 9.000000 94.498314 172 | 15.000000 121.922732 173 | 18.000000 166.408274 174 | 10.000000 89.571299 175 | 8.000000 75.373772 176 | 22.000000 161.001478 177 | 8.000000 90.594227 178 | 5.000000 57.180933 179 | 20.000000 161.643007 180 | 8.000000 87.197370 181 | 8.000000 95.584308 182 | 15.000000 126.207221 183 | 7.000000 84.528209 184 | 18.000000 161.056986 185 | 10.000000 86.762615 186 | 1.000000 33.325906 187 | 9.000000 105.095502 188 | 2.000000 22.440421 189 | 9.000000 93.449284 190 | 14.000000 106.249595 191 | 21.000000 163.254385 192 | 22.000000 161.746628 193 | 20.000000 152.973085 194 | 17.000000 122.918987 195 | 7.000000 58.536412 196 | 1.000000 45.013277 197 | 13.000000 137.294148 198 | 10.000000 88.123737 199 | 2.000000 45.847376 200 | 20.000000 163.385797 201 | -------------------------------------------------------------------------------- /dataset/dataset5/test.txt: -------------------------------------------------------------------------------- 1 | 12.000000 121.010516 2 | 19.000000 157.337044 3 | 12.000000 116.031825 4 | 15.000000 132.124872 5 | 2.000000 52.719612 6 | 6.000000 39.058368 7 | 3.000000 50.757763 8 | 20.000000 166.740333 9 | 11.000000 115.808227 10 | 21.000000 165.582995 11 | 3.000000 41.956087 12 | 3.000000 34.432370 13 | 13.000000 116.954676 14 | 1.000000 32.112553 15 | 7.000000 50.380243 16 | 7.000000 94.107791 17 | 23.000000 188.943179 18 | 18.000000 152.637773 19 | 9.000000 104.122082 20 | 18.000000 127.805226 21 | 0.000000 83.083232 22 | 15.000000 148.180104 23 | 3.000000 38.480247 24 | 8.000000 77.597839 25 | 7.000000 75.625803 26 | 11.000000 124.620208 27 | 13.000000 125.186698 28 | 5.000000 51.165922 29 | 3.000000 31.179113 30 | 15.000000 132.505727 31 | 19.000000 137.978043 32 | 9.000000 106.481123 33 | 20.000000 172.149955 34 | 11.000000 104.116556 35 | 4.000000 22.457996 36 | 20.000000 175.735047 37 | 18.000000 165.350412 38 | 22.000000 177.461724 39 | 16.000000 138.672986 40 | 17.000000 156.791788 41 | 19.000000 150.327544 42 | 19.000000 156.992196 43 | 23.000000 163.624262 44 | 8.000000 92.537227 45 | 3.000000 32.341399 46 | 16.000000 144.445614 47 | 11.000000 119.985586 48 | 16.000000 145.149335 49 | 12.000000 113.284662 50 | 5.000000 47.742716 51 | 11.000000 115.852585 52 | 3.000000 31.579325 53 | 1.000000 43.758671 54 | 1.000000 61.049125 55 | 13.000000 132.751826 56 | 23.000000 163.233087 57 | 12.000000 115.134296 58 | 8.000000 91.370839 59 | 8.000000 86.137955 60 | 14.000000 120.857934 61 | 3.000000 33.777477 62 | 10.000000 110.831763 63 | 10.000000 104.174775 64 | 20.000000 155.920696 65 | 4.000000 30.619132 66 | 0.000000 71.880474 67 | 7.000000 86.399516 68 | 7.000000 72.632906 69 | 5.000000 58.632985 70 | 18.000000 143.584511 71 | 23.000000 187.059504 72 | 6.000000 65.067119 73 | 6.000000 69.110280 74 | 19.000000 142.388056 75 | 15.000000 137.174489 76 | 21.000000 159.719092 77 | 9.000000 102.179638 78 | 20.000000 176.416294 79 | 21.000000 146.516385 80 | 18.000000 147.808343 81 | 23.000000 154.790810 82 | 16.000000 137.385285 83 | 18.000000 166.885975 84 | 15.000000 136.989000 85 | 20.000000 144.668679 86 | 14.000000 137.060671 87 | 
19.000000 140.468283 88 | 11.000000 98.344084 89 | 16.000000 132.497910 90 | 1.000000 59.143101 91 | 20.000000 152.299381 92 | 13.000000 134.487271 93 | 0.000000 77.805718 94 | 3.000000 28.543764 95 | 10.000000 97.751817 96 | 4.000000 41.223659 97 | 11.000000 110.017015 98 | 12.000000 119.391386 99 | 20.000000 158.872126 100 | 2.000000 38.776222 101 | 19.000000 150.496148 102 | 15.000000 131.505967 103 | 22.000000 179.856157 104 | 13.000000 143.090102 105 | 14.000000 142.611861 106 | 13.000000 120.757410 107 | 4.000000 27.929324 108 | 16.000000 151.530849 109 | 15.000000 148.149702 110 | 5.000000 44.188084 111 | 16.000000 141.135406 112 | 12.000000 119.817665 113 | 8.000000 80.991524 114 | 3.000000 29.308640 115 | 6.000000 48.203468 116 | 8.000000 92.179834 117 | 22.000000 162.720371 118 | 10.000000 91.971158 119 | 2.000000 33.481943 120 | 8.000000 88.528612 121 | 1.000000 54.042173 122 | 8.000000 92.002928 123 | 5.000000 45.614646 124 | 3.000000 34.319635 125 | 14.000000 129.140558 126 | 17.000000 146.807901 127 | 17.000000 157.694058 128 | 4.000000 37.080929 129 | 20.000000 169.942381 130 | 10.000000 114.675638 131 | 5.000000 34.913029 132 | 14.000000 137.889747 133 | 0.000000 79.043129 134 | 16.000000 139.084390 135 | 6.000000 53.340135 136 | 13.000000 142.772612 137 | 0.000000 73.103173 138 | 3.000000 37.717487 139 | 15.000000 134.116395 140 | 18.000000 138.748257 141 | 23.000000 180.779121 142 | 10.000000 93.721894 143 | 23.000000 166.958335 144 | 6.000000 74.473589 145 | 6.000000 73.006291 146 | 3.000000 34.178656 147 | 1.000000 33.395482 148 | 22.000000 149.933384 149 | 18.000000 154.858982 150 | 6.000000 66.121084 151 | 1.000000 60.816800 152 | 5.000000 55.681020 153 | 6.000000 61.251558 154 | 15.000000 125.452206 155 | 16.000000 134.310255 156 | 19.000000 167.999681 157 | 5.000000 40.074830 158 | 22.000000 162.658997 159 | 12.000000 109.473909 160 | 4.000000 44.743405 161 | 11.000000 122.419496 162 | 14.000000 139.852014 163 | 21.000000 160.045407 164 | 15.000000 131.999358 165 | 15.000000 135.577799 166 | 20.000000 173.494629 167 | 8.000000 82.497177 168 | 12.000000 123.122032 169 | 10.000000 97.592026 170 | 16.000000 141.345706 171 | 8.000000 79.588881 172 | 3.000000 54.308878 173 | 4.000000 36.112937 174 | 19.000000 165.005336 175 | 23.000000 172.198031 176 | 15.000000 127.699625 177 | 1.000000 47.305217 178 | 13.000000 115.489379 179 | 8.000000 103.956569 180 | 4.000000 53.669477 181 | 0.000000 76.220652 182 | 12.000000 114.153306 183 | 6.000000 74.608728 184 | 3.000000 41.339299 185 | 5.000000 21.944048 186 | 22.000000 181.455655 187 | 20.000000 171.691444 188 | 10.000000 104.299002 189 | 21.000000 168.307123 190 | 20.000000 169.556523 191 | 23.000000 175.960552 192 | 1.000000 42.554778 193 | 14.000000 137.286185 194 | 16.000000 136.126561 195 | 12.000000 119.269042 196 | 6.000000 63.426977 197 | 4.000000 27.728212 198 | 4.000000 32.687588 199 | 23.000000 151.153204 200 | 15.000000 129.767331 201 | -------------------------------------------------------------------------------- /Blending.py: -------------------------------------------------------------------------------- 1 | """ 2 | @ Filename: Blending .py 3 | @ Author: Ryuk 4 | @ Create Date: 2019-05-04 5 | @ Update Date: 2019-05-04 6 | @ Description: Implement Blending 7 | """ 8 | 9 | from sklearn.model_selection import StratifiedKFold, train_test_split 10 | from Perceptron import * 11 | import numpy as np 12 | import preProcess 13 | import pickle 14 | import random 15 | 16 | 17 | class BlendingClassifier: 18 | def 
__init__(self, norm_type="Normalization", classifier_set=None): 19 | self.norm_type = norm_type 20 | self.classifier_set = classifier_set 21 | self.k = len(self.classifier_set) # the number of classifiers 22 | self.layer1_classifier_set = None 23 | self.layer2_classifier = None 24 | self.prediction = None 25 | self.probability = None 26 | 27 | ''' 28 | Function: train 29 | Description: train the model 30 | Input: train_data dataType: ndarray description: features 31 | train_label dataType: ndarray description: labels 32 | Output: self dataType: obj description: the trained model 33 | ''' 34 | def train(self, train_data, train_label): 35 | if self.norm_type == "Standardization": 36 | train_data = preProcess.Standardization(train_data) 37 | else: 38 | train_data = preProcess.Normalization(train_data) 39 | 40 | train_data1, train_data2, train_label1, train_label2 = train_test_split(train_data, train_label, test_size=0.5, random_state=2019) 41 | # meta-features: predictions on the held-out half become the training set of the second layer 42 | train_predict_feature = np.zeros((train_data2.shape[0], self.k)) 43 | trained_model = [] 44 | 45 | # the first layer in Blending 46 | for j, clf in enumerate(self.classifier_set): 47 | # train each submodel on the first half 48 | print(j, clf) 49 | clf.train(train_data1, train_label1) 50 | train_predict_feature[:, j] = clf.predict(train_data2)[:, 0] 51 | # save the trained model in the first layer 52 | trained_model.append(clf) 53 | 54 | # the second layer in Blending 55 | layer2_clf = PerceptronClassifier() 56 | layer2_clf.train(train_predict_feature, train_label2) 57 | 58 | self.layer1_classifier_set = trained_model 59 | self.layer2_classifier = layer2_clf 60 | 61 | return self 62 | 63 | ''' 64 | Function: predict 65 | Description: predict the testing set 66 | Input: test_data dataType: ndarray description: features 67 | prob dataType: bool description: whether to return the probability instead of the hard label 68 | Output: prediction dataType: ndarray description: the prediction results for the testing set 69 | ''' 70 | 71 | def predict(self, test_data, prob=False): # bug fix: prob was the string "False", which is always truthy 72 | # Normalization 73 | if self.norm_type == "Standardization": 74 | test_data = preProcess.Standardization(test_data) 75 | else: 76 | test_data = preProcess.Normalization(test_data) 77 | 78 | test_predict_feature = np.zeros((test_data.shape[0], self.k)) 79 | # the first layer in Blending 80 | for j, clf in enumerate(self.layer1_classifier_set): 81 | test_predict_feature[:, j] = clf.predict(test_data)[:, 0] 82 | 83 | # the second layer in Blending 84 | probability = self.layer2_classifier.predict(test_predict_feature) 85 | prediction = (probability > 0.5)*1 86 | 87 | self.probability = probability 88 | self.prediction = prediction 89 | if prob: 90 | return probability 91 | else: 92 | return prediction 93 | 94 | ''' 95 | Function: accuracy 96 | Description: show detection result 97 | Input: test_label dataType: ndarray description: labels of test data 98 | Output: accuracy dataType: float description: detection accuracy 99 | ''' 100 | 101 | def accuarcy(self, test_label): 102 | test_label = np.expand_dims(test_label, axis=1) 103 | prediction = self.prediction 104 | accuarcy = sum(prediction == test_label) / len(test_label) 105 | return accuarcy 106 | 107 | ''' 108 | Function: save 109 | Description: save the model as pkl 110 | Input: filename dataType: str description: the path to save the model 111 | ''' 112 | 113 | def save(self, filename): 114 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 115 | model = {'layer1_classifiers': self.layer1_classifier_set, 'layer2_classifier': self.layer2_classifier} 116 | pickle.dump(model, f) 117 | f.close() 118 | 119 | ''' 120 | Function: load 121 | Description: load the model 122 | Input: filename dataType: str description: the path of the saved model 123 | Output: self dataType: obj description: the trained model 124 | ''' 125 | 126 | def load(self, filename): 127 | f = open(filename, 'rb') 128 | model = pickle.load(f) 129 | self.layer1_classifier_set = model['layer1_classifiers'] 130 | self.layer2_classifier = model['layer2_classifier'] 131 | return self
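Editor's note: a minimal usage sketch for BlendingClassifier above, assuming the repo's classifiers with their train/predict interface; the data is synthetic and purely illustrative.

import numpy as np
from Blending import BlendingClassifier
from Perceptron import PerceptronClassifier
from LogisticRegression import LogisticRegressionClassifier

X = np.random.rand(200, 5)
y = (X.sum(axis=1) > 2.5).astype(int)

clf = BlendingClassifier(classifier_set=[PerceptronClassifier(), LogisticRegressionClassifier()])
clf.train(X, y)
pred = clf.predict(X)      # hard 0/1 labels from the layer-2 perceptron
print(clf.accuarcy(y))     # note: the method name is spelled 'accuarcy' throughout this repo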
-------------------------------------------------------------------------------- /KNN.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Filename: KNN.py 3 | @Author: Ryuk 4 | @Create Date: 2019-04-29 5 | @Update Date: 2019-05-03 6 | @Description: Implementation of KNN 7 | """ 8 | 9 | import numpy as np 10 | import operator as op 11 | 12 | class KNNClassifier: 13 | def __init__(self, k=5, norm_type="Normalization"): # default k so callers such as AdaBoost can construct it without arguments 14 | self.k = k 15 | self.norm_type = norm_type # bug fix: the argument was previously ignored 16 | self.x_train = None 17 | self.y_train = None 18 | 19 | ''' 20 | Function: Normalization 21 | Description: Normalize input data. For vector x, the normalization process is given by 22 | normalization(x) = (x - min(x))/(max(x) - min(x)) 23 | Input: data dataType: ndarray description: input data 24 | Output: norm_data dataType: ndarray description: output data after normalization 25 | ''' 26 | def Normalization(self, data): 27 | # get the max and min value of each column 28 | min_value = data.min(axis=0) 29 | max_value = data.max(axis=0) 30 | diff = max_value - min_value 31 | # normalization 32 | min_data = np.tile(min_value, (data.shape[0], 1)) 33 | norm_data = (data - min_data)/np.tile(diff, (data.shape[0], 1)) 34 | return norm_data 35 | 36 | ''' 37 | Function: Standardization 38 | Description: Standardize input data.
For vector x, the standardization process is given by 39 | Standardization(x) = (x - mean(x))/std(x) 40 | Input: data dataType: ndarray description: input data 41 | Output: standard_data dataType: ndarray description: output data after standardization 42 | ''' 43 | def Standardization(self, data): 44 | # get the mean and the standard deviation of each column 45 | mean_value = data.mean(axis=0) 46 | std_value = data.std(axis=0) 47 | standard_data = (data - np.tile(mean_value, (data.shape[0], 1)))/np.tile(std_value, (data.shape[0], 1)) 48 | return standard_data 49 | 50 | ''' 51 | Function: train 52 | Description: train the model (KNN is lazy: it only normalizes and stores the data) 53 | Input: train_data dataType: ndarray description: features 54 | train_label dataType: ndarray description: labels 55 | Output: self dataType: obj description: the trained model 56 | ''' 57 | def train(self, train_data, train_label): 58 | if self.norm_type == "Standardization": # bug fix: the attribute is norm_type, not normType 59 | train_data = self.Standardization(train_data) 60 | else: 61 | train_data = self.Normalization(train_data) 62 | self.x_train = train_data 63 | self.y_train = train_label 64 | return self 65 | 66 | ''' 67 | Function: predict 68 | Description: give the predictions for test data 69 | Input: test_data dataType: ndarray description: data for testing 70 | Output: prediction dataType: ndarray description: predicted labels 71 | ''' 72 | def predict(self, test_data): 73 | # Normalization 74 | if self.norm_type == "Standardization": 75 | testData = self.Standardization(test_data) 76 | else: 77 | testData = self.Normalization(test_data) 78 | 79 | test_num = testData.shape[0] 80 | prediction = np.zeros([test_num, 1]) 81 | probability = np.zeros([test_num, 1]) 82 | # predict each sample in the test data 83 | for i in range(test_num): 84 | prediction[i], probability[i] = self.calculateDistance(testData[i], self.x_train, self.y_train, self.k) 85 | 86 | return prediction 87 | 88 | ''' 89 | Function: calculateDistance 90 | Description: calculate the distance between the input vector and the training data 91 | Input: input dataType: ndarray description: input vector 92 | train_data dataType: ndarray description: data for training 93 | train_label dataType: ndarray description: labels of train data 94 | k dataType: int description: select the first k distances 95 | Output: label dataType: int description: predicted label of the input vector 96 | prob dataType: float description: fraction of the k neighbours voting for that label 97 | ''' 98 | def calculateDistance(self, input, train_data, train_label, k): 99 | train_num = train_data.shape[0] 100 | # calculate the Euclidean distances 101 | distances = np.tile(input, (train_num, 1)) - train_data 102 | distances = distances**2 103 | distances = distances.sum(axis=1) 104 | distances = distances**0.5 105 | 106 | # get the labels of the first k distances 107 | disIndex = distances.argsort() 108 | labelCount = {} 109 | for i in range(k): 110 | label = train_label[disIndex[i]] 111 | labelCount[label] = labelCount.get(label, 0) + 1 112 | 113 | prediction = sorted(labelCount.items(), key=op.itemgetter(1), reverse=True) 114 | label = prediction[0][0] 115 | prob = prediction[0][1]/k 116 | return label, prob 117 | 118 | ''' 119 | Function: showDetectionResult 120 | Description: show detection result 121 | Input: test_data dataType: ndarray description: data for test 122 | test_label dataType: ndarray description: labels of test data 123 | Output: accuracy dataType: float description: detection accuracy 124 | ''' 125 | def showDetectionResult(self, test_data, test_label): 126 | test_label = np.expand_dims(test_label, axis=1) 127 | prediction = self.predict(test_data) 128 | accuarcy = sum(prediction == test_label)/len(test_label) 129 | return accuarcy
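Editor's note: an illustrative, vectorized alternative to the tile-based distance loop above (NumPy broadcasting); a sketch, not part of the original file.

import numpy as np

def knn_predict(x, train_data, train_label, k=5):
    # squared Euclidean distance from x to every training sample
    d2 = ((train_data - x) ** 2).sum(axis=1)
    nearest = np.argsort(d2)[:k]             # indices of the k nearest samples
    labels, counts = np.unique(train_label[nearest], return_counts=True)
    return labels[np.argmax(counts)]         # majority vote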
-------------------------------------------------------------------------------- /Stacking.py: -------------------------------------------------------------------------------- 1 | """ 2 | @ Filename: Stacking.py 3 | @ Author: Ryuk 4 | @ Create Date: 2019-05-05 5 | @ Update Date: 2019-05-05 6 | @ Description: Implement Stacking 7 | """ 8 | from sklearn.model_selection import StratifiedKFold, train_test_split 9 | from Perceptron import * 10 | import numpy as np 11 | import preProcess 12 | import pickle 13 | import copy # bug fix: was 'import random' (unused); copy is needed to snapshot fold models 14 | 15 | class StackingClassifier: 16 | def __init__(self, norm_type="Normalization", classifier_set=None, fusion_type="Weighing", n_folds=5): 17 | self.norm_type = norm_type 18 | self.classifier_set = classifier_set 19 | self.k = len(self.classifier_set) # the number of classifiers 20 | self.trained_classifier_set = None 21 | self.n_folds = n_folds # the number of folds for cross validation 22 | self.fusion_type = fusion_type # fusion method in the second layer 23 | self.prediction = None 24 | self.probability = None 25 | 26 | ''' 27 | Function: train 28 | Description: train the model 29 | Input: train_data dataType: ndarray description: features 30 | train_label dataType: ndarray description: labels 31 | Output: self dataType: obj description: the trained model 32 | ''' 33 | 34 | def train(self, train_data, train_label): 35 | if self.norm_type == "Standardization": 36 | train_data = preProcess.Standardization(train_data) 37 | else: 38 | train_data = preProcess.Normalization(train_data) 39 | 40 | skf = StratifiedKFold(self.n_folds) 41 | prediction_feature = np.zeros((train_data.shape[0], len(self.classifier_set))) 42 | trained_model = [] 43 | 44 | # the first layer in Stacking 45 | for j, clf in enumerate(self.classifier_set): 46 | # train each submodel 47 | subtrained_model = [] 48 | # cross validation 49 | for (train_index, test_index) in skf.split(train_data, train_label): 50 | X_train, X_test = train_data[train_index], train_data[test_index] 51 | y_train, y_test = train_label[train_index], train_label[test_index] 52 | # train on this fold's training split and keep a snapshot of the fitted model 53 | clf.train(X_train, y_train) 54 | subtrained_model.append(copy.deepcopy(clf)) # bug fix: appending clf itself would store the same object n_folds times 55 | # out-of-fold predictions become the meta-feature of submodel j 56 | prediction_feature[test_index, j] = clf.predict(X_test)[:, 0] 57 | # save the models 58 | trained_model.append(subtrained_model) 59 | 60 | self.trained_classifier_set = trained_model 61 | return self 62 | 63 | ''' 64 | Function: predict 65 | Description: predict the testing set 66 | Input: test_data dataType: ndarray description: features 67 | prob dataType: bool description: whether to return the probability instead of the hard label 68 | Output: prediction dataType: ndarray description: the prediction results for the testing set 69 | ''' 70 | 71 | def predict(self, test_data, prob=False): # bug fix: prob was the string "False", which is always truthy 72 | # Normalization 73 | if self.norm_type == "Standardization": 74 | test_data = preProcess.Standardization(test_data) 75 | else: 76 | test_data = preProcess.Normalization(test_data) 77 | 78 | pre_prediction = np.zeros((test_data.shape[0], self.k)) # bug fix: one column per base classifier (was n_folds) 79 | # the first layer in Stacking
80 | for j, sub_model in enumerate(self.trained_classifier_set): 81 | sub_prediction_feature = np.zeros((test_data.shape[0], self.n_folds)) 82 | i = 0 83 | for clf in sub_model: 84 | sub_prediction_feature[:, i] = clf.predict(test_data)[:, 0] 85 | i = i + 1 86 | pre_prediction[:, j] = sub_prediction_feature.mean(1) # average the n_folds fold-models of classifier j 87 | 88 | # the second layer in Stacking 89 | if self.fusion_type == "Averaging": 90 | probability = pre_prediction.mean(1) 91 | elif self.fusion_type == "Voting": 92 | probability = np.sum(pre_prediction, axis=1)/self.k 93 | elif self.fusion_type == "Weighing": 94 | w = [row/row.sum() for row in pre_prediction] # per-sample self-normalized weights 95 | probability = np.sum(np.multiply(pre_prediction, w), axis=1) 96 | 97 | prediction = (probability > 0.5) * 1 98 | self.probability = probability 99 | self.prediction = prediction 100 | if prob: 101 | return probability 102 | else: 103 | return prediction 104 | 105 | ''' 106 | Function: accuracy 107 | Description: show detection result 108 | Input: test_label dataType: ndarray description: labels of test data 109 | Output: accuracy dataType: float description: detection accuracy 110 | ''' 111 | 112 | def accuarcy(self, test_label): 113 | # test_label = np.expand_dims(test_label, axis=1) 114 | prediction = self.prediction 115 | accuarcy = sum(prediction == test_label) / len(test_label) 116 | return accuarcy 117 | 118 | ''' 119 | Function: save 120 | Description: save the model as pkl 121 | Input: filename dataType: str description: the path to save the model 122 | ''' 123 | 124 | def save(self, filename): 125 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 126 | pickle.dump(self.trained_classifier_set, f) 127 | f.close() 128 | 129 | ''' 130 | Function: load 131 | Description: load the model 132 | Input: filename dataType: str description: the path of the saved model 133 | Output: self dataType: obj description: the trained model 134 | ''' 135 | 136 | def load(self, filename): 137 | f = open(filename, 'rb') 138 | self.trained_classifier_set = pickle.load(f) 139 | return self
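Editor's note: a sketch of the out-of-fold idea behind StackingClassifier.train, using the repo's PerceptronClassifier on synthetic data (illustrative only).

import numpy as np
from sklearn.model_selection import StratifiedKFold
from Perceptron import PerceptronClassifier

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, 100)
oof = np.zeros(len(y))                      # one out-of-fold prediction per sample
for train_index, test_index in StratifiedKFold(5).split(X, y):
    clf = PerceptronClassifier()
    clf.train(X[train_index], y[train_index])
    oof[test_index] = clf.predict(X[test_index])[:, 0]
# 'oof' plays exactly the role of prediction_feature[:, j] above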
-------------------------------------------------------------------------------- /DimensionReduction.py: -------------------------------------------------------------------------------- 1 | """ 2 | @ Filename: DimensionReduction.py 3 | @ Author: Ryuk 4 | @ Create Date: 2019-06-02 5 | @ Update Date: 2019-06-06 6 | @ Description: Implement DimensionReduction 7 | """ 8 | import numpy as np 9 | import pickle 10 | import preProcess 11 | 12 | class PCA: 13 | def __init__(self, norm_type="Standardization", rate=0.9): 14 | self.norm_type = norm_type 15 | self.matrix = None 16 | self.contribute_rate = None 17 | self.acc_contribute_rate = None 18 | self.rate = rate # target accumulated contribution (explained variance) rate 19 | 20 | ''' 21 | Function: train 22 | Description: train the model 23 | Input: train_data dataType: ndarray description: features 24 | Output: self dataType: obj description: the trained model 25 | ''' 26 | def train(self, train_data): 27 | # centre the data 28 | data = train_data - train_data.mean(axis=0) 29 | 30 | # calculate the eigenvalues and eigenvectors of the covariance matrix 31 | covariance_matrix = np.cov(data, rowvar=False) 32 | eigenvalue, eigenvector = np.linalg.eig(covariance_matrix) 33 | index = np.argsort(-eigenvalue) 34 | eigenvalue = eigenvalue[index] 35 | eigenvector = eigenvector[:, index] 36 | 37 | # calculate the contribution rate of each component 38 | contribute_rate = np.zeros(len(index)) 39 | acc_contribute_rate = np.zeros(len(index)) 40 | value_sum = eigenvalue.sum() 41 | acc = 0 42 | for i in range(len(eigenvalue)): 43 | acc = acc + eigenvalue[i] 44 | contribute_rate[i] = eigenvalue[i]/value_sum 45 | acc_contribute_rate[i] = acc/value_sum 46 | self.contribute_rate = contribute_rate 47 | self.acc_contribute_rate = acc_contribute_rate 48 | 49 | # first index whose accumulated contribution reaches the target rate 50 | # (bug fix: the old i-1 comparison misfired at i = 0) 51 | k = int(np.argmax(acc_contribute_rate >= self.rate)) 52 | # keep the first k+1 components (bug fix: [:, k] kept a single column only) 53 | matrix = np.mat(eigenvector)[:, :k + 1] 54 | self.matrix = matrix 55 | return self 56 | 57 | ''' 58 | Function: transformData 59 | Description: transform data 60 | Input: data dataType: ndarray description: original data 61 | Output: transformed_data dataType: ndarray description: transformed data 62 | ''' 63 | def transformData(self, data): 64 | data = data - data.mean(axis=0) 65 | transformed_data = np.dot(data, self.matrix) 66 | return transformed_data 67 | 68 | ''' 69 | Function: save 70 | Description: save the model as pkl 71 | Input: filename dataType: str description: the path to save the model 72 | ''' 73 | def save(self, filename): 74 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 75 | pickle.dump(self.matrix, f) 76 | f.close() 77 | 78 | ''' 79 | Function: load 80 | Description: load the model 81 | Input: filename dataType: str description: the path of the saved model 82 | Output: self dataType: obj description: the trained model 83 | ''' 84 | def load(self, filename): 85 | f = open(filename, 'rb') 86 | self.matrix = pickle.load(f) 87 | return self 88 | 89 | 90 | class LDA: 91 | def __init__(self, norm_type="Standardization", rate=0.9): 92 | self.norm_type = norm_type 93 | self.matrix = None 94 | self.contribute_rate = None 95 | self.acc_contribute_rate = None 96 | self.rate = rate 97 | 98 | ''' 99 | Function: train 100 | Description: train the model 101 | Input: data dataType: ndarray description: features 102 | label dataType: ndarray description: labels 103 | Output: self dataType: obj description: the trained model 104 | ''' 105 | def train(self, data, label): 106 | # Normalization 107 | if self.norm_type == "Standardization": 108 | data = preProcess.Standardization(data) 109 | else: 110 | data = preProcess.Normalization(data) 111 | unique_label = np.unique(label) 112 | mu = np.mean(data, axis=0) 113 | 114 | Sw = 0 # within-class scatter 115 | Sb = 0 # between-class scatter 116 | for c in unique_label: 117 | index = np.where(label == c) 118 | Ni = len(index[0]) # bug fix: np.where returns a tuple, so take the index array 119 | xi = data[index] 120 | mui = np.mean(xi, axis=0) 121 | 122 | # calculate Sw 123 | Si = np.dot((xi - mui).T, xi - mui) 124 | Sw = Sw + Si 125 | 126 | # calculate Sb 127 | delta = np.expand_dims(mu - mui, axis=1) 128 | Sb = Sb + Ni * np.dot(delta, delta.T) 129 | 130 | # calculate the eigenvalues and eigenvectors of Sw^-1 * Sb 131 | eigenvalue, eigenvector = np.linalg.eig(np.dot(np.linalg.inv(Sw), Sb)) 132 | index = np.argsort(-eigenvalue) 133 | eigenvalue = eigenvalue[index] 134 | eigenvector = eigenvector[:, index] 135 | # calculate the contribution rate of each direction 136 | contribute_rate = np.zeros(len(index)) 137 | acc_contribute_rate = np.zeros(len(index)) 138 | value_sum = eigenvalue.sum() 139 | acc = 0 140 | for i in range(len(eigenvalue)): 141 | acc = acc + eigenvalue[i] 142 | contribute_rate[i] = eigenvalue[i] / value_sum 143 | acc_contribute_rate[i] = acc / value_sum 144 | self.contribute_rate = contribute_rate 145 | self.acc_contribute_rate = acc_contribute_rate 146 | 147 | # first index whose accumulated contribution reaches the target rate 148 | k = int(np.argmax(acc_contribute_rate >= self.rate)) 149 | # keep the first k+1 directions (bug fix: [:, k] kept a single column only) 150 | matrix = np.mat(eigenvector)[:, :k + 1] 151 | self.matrix = matrix 152 | return self 153 | 154 | ''' 155 | Function: transformData 156 | Description: transform data 157 | Input: data dataType: ndarray description: original data 158 | Output: transformed_data dataType: ndarray description: transformed data 159 | ''' 160 | def transformData(self, data): 161 | transformed_data = np.dot(data, self.matrix) 162 | return transformed_data
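Editor's note: a minimal usage sketch for the PCA class above (synthetic data, illustrative only).

import numpy as np
from DimensionReduction import PCA

X = np.random.rand(100, 6)
pca = PCA(rate=0.9)              # keep enough components for 90% of the variance
pca.train(X)
X_low = pca.transformData(X)     # project onto the retained directions
print(X_low.shape)               # (100, k+1) for the smallest sufficient k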
-------------------------------------------------------------------------------- /FeatureCombination.py: -------------------------------------------------------------------------------- 1 | """ 2 | @ Filename: FeatureCombination.py 3 | @ Author: Ryuk 4 | @ Create Date: 2019-11-18 5 | @ Update Date: 2019-11-20 6 | @ Description: Implement FM 7 | """ 8 | 9 | import numpy as np 10 | import preProcess 11 | import pickle 12 | 13 | class FM: 14 | def __init__(self, n, norm_type="Standardization", k=5): 15 | self.norm_type = norm_type 16 | self.n = n # the number of features 17 | self.k = k # the dimension of the latent factor vectors 18 | self.w_0 = 0 # bias term 19 | self.W = np.random.random([self.n, 1]) # first-order parameters 20 | self.V = np.random.random([self.n, self.k]) # second-order (latent factor) parameters 21 | self.sample_num = None # the number of training samples, set in train() 22 | 23 | ''' 24 | Function: sigmoid 25 | Description: sigmoid function 26 | Input: x dataType: ndarray description: input vector 27 | derivative dataType: bool description: whether to calculate the derivative of sigmoid 28 | Output: output dataType: float description: output 29 | ''' 30 | def sigmoid(self, x, derivative=False): 31 | output = 1/(1 + np.exp(-x)) 32 | if derivative: 33 | output = output * (1 - output) 34 | return output 35 |
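Editor's note: train() below relies on the standard FM identity sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [(sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2], which is what inter_1 and inter_2 compute in O(k*n). A NumPy check of the identity (illustrative):

import numpy as np

n, k = 6, 3
x, V = np.random.rand(n), np.random.rand(n, k)
brute = sum(V[i] @ V[j] * x[i] * x[j] for i in range(n) for j in range(i + 1, n))
fast = 0.5 * (((x @ V) ** 2) - ((x ** 2) @ (V ** 2))).sum()
assert np.isclose(brute, fast)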
36 | ''' 37 | Function: train 38 | Description: train the model with stochastic gradient descent 39 | Input: train_data dataType: ndarray description: features 40 | train_label dataType: ndarray description: labels, assumed to be in {-1, +1} for the logit loss 41 | alpha dataType: float description: learning rate 42 | iterations dataType: int description: the times of iteration 43 | Output: self dataType: obj description: the trained model 44 | ''' 45 | def train(self, train_data, train_label, alpha=0.01, iterations=100): 46 | if self.norm_type == "Standardization": 47 | train_data = preProcess.Standardization(train_data) 48 | else: 49 | train_data = preProcess.Normalization(train_data) 50 | 51 | self.sample_num = train_data.shape[0] # bug fix: was never set before being used below 52 | 53 | for epoch in range(iterations): 54 | for id in range(self.sample_num): 55 | x = train_data[id] 56 | # second-order term via the FM identity (see the editor's note above) 57 | inter_1 = x @ self.V # shape [k]: sum_i v_if * x_i 58 | inter_2 = (x ** 2) @ (self.V ** 2) # shape [k]: sum_i v_if^2 * x_i^2 59 | interaction = np.sum(inter_1 ** 2 - inter_2) / 2. 60 | 61 | # prediction result 62 | pred = self.w_0 + float(x @ self.W) + interaction 63 | 64 | # gradient base of the logit loss -log(sigmoid(y * pred)) w.r.t. pred 65 | # (bug fix: the old expression mixed the loss value into the update) 66 | base = (self.sigmoid(train_label[id] * pred) - 1) * train_label[id] 67 | 68 | # update the bias 69 | self.w_0 -= alpha * base 70 | 71 | for i in range(self.n): 72 | if x[i] != 0: 73 | # first-order update (bug fix: W was wrongly indexed as W[id, i]) 74 | self.W[i] -= alpha * base * x[i] 75 | for f in range(self.k): 76 | # latent-factor update: dpred/dv_if = x_i * inter_1[f] - v_if * x_i^2 77 | self.V[i, f] -= alpha * base * (x[i] * inter_1[f] - self.V[i, f] * x[i] ** 2) 78 | 79 | return self 80 | 81 | ''' 82 | Function: predict 83 | Description: predict the testing set 84 | Input: test_data dataType: ndarray description: features 85 | prob dataType: bool description: whether to return the probability instead of the hard label 86 | Output: prediction dataType: ndarray description: the prediction results for the testing set 87 | ''' 88 | def predict(self, test_data, prob=False): 89 | # Normalization 90 | if self.norm_type == "Standardization": 91 | test_data = preProcess.Standardization(test_data) 92 | else: 93 | test_data = preProcess.Normalization(test_data) 94 | 95 | test_num = test_data.shape[0] 96 | prediction = np.zeros([test_num, 1]) 97 | probability = np.zeros([test_num, 1]) 98 | for i in range(test_num): 99 | x = test_data[i] 100 | inter_1 = x @ self.V 101 | inter_2 = (x ** 2) @ (self.V ** 2) 102 | interaction = np.sum(inter_1 ** 2 - inter_2) / 2. 103 | pred = self.w_0 + float(x @ self.W) + interaction 104 | probability[i] = self.sigmoid(pred) # bug fix: the whole array was being overwritten by a scalar 105 | if probability[i] > 0.5: 106 | prediction[i] = 1 107 | else: 108 | prediction[i] = -1 # bug fix: was 0.5; the labels are {-1, +1} 109 | 110 | self.prediction = prediction 111 | self.probability = probability 112 | if prob: 113 | return probability 114 | else: 115 | return prediction 116 | 117 | ''' 118 | Function: accuracy 119 | Description: show detection result 120 | Input: test_label dataType: ndarray description: labels of test data 121 | Output: accuracy dataType: float description: detection accuracy 122 | ''' 123 | def accuarcy(self, test_label): 124 | test_label = np.expand_dims(test_label, axis=1) 125 | prediction = self.prediction 126 | accuarcy = sum(prediction == test_label)/len(test_label) 127 | return accuarcy 128 | 129 | ''' 130 | Function: save 131 | Description: save the model as pkl 132 | Input: filename dataType: str description: the path to save the model 133 | ''' 134 | def save(self, filename): 135 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 136 | model = {'w_0': self.w_0, 'W': self.W, 'V': self.V} # bug fix: FM has no attribute 'weights' 137 | pickle.dump(model, f) 138 | f.close() 139 | 140 | ''' 141 | Function: load 142 | Description: load the model 143 | Input: filename dataType: str description: the path of the saved model 144 | Output: self dataType: obj description: the trained model 145 | ''' 146 | def load(self, filename): 147 | f = open(filename, 'rb') 148 | model = pickle.load(f) 149 | self.w_0 = model['w_0'] 150 | self.W = model['W'] 151 | self.V = model['V'] 152 | return self
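Editor's note: a minimal usage sketch for the FM class above (synthetic data; labels in {-1, +1}, as the logit loss in train() assumes).

import numpy as np
from FeatureCombination import FM

X = np.random.rand(200, 8)
y = np.where(X[:, 0] * X[:, 1] > 0.25, 1, -1)   # a pairwise interaction to learn
fm = FM(n=8, k=4)
fm.train(X, y, alpha=0.01, iterations=50)
pred = fm.predict(X)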
-------------------------------------------------------------------------------- /LogisticRegression.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Filename: LogisticRegression.py 3 | @Author: Ryuk 4 | @Create Date: 2019-04-30 5 | @Update Date: 2019-05-03 6 | @Description: Implementation of logistic regression 7 | """ 8 | 9 | import numpy as np 10 | import preProcess 11 | import pickle 12 | import random 13 | 14 | 15 | class LogisticRegressionClassifier: 16 | def __init__(self, norm_type="Normalization"): 17 | self.norm_type = norm_type 18 | self.weights = None 19 | self.prediction = None 20 | self.probability = None 21 | 22 | ''' 23 | Function: sigmoid 24 | Description: sigmoid function 25 | Input: x dataType: ndarray description: input vector 26 | derivative dataType: bool description: whether to calculate the derivative of sigmoid 27 | Output: output dataType: float description: output 28 | ''' 29 | def sigmoid(self, x, derivative=False): 30 | output = 1/(1 + np.exp(-x)) 31 | if derivative: 32 | output = output * (1 - output) 33 | return output 34 | 35 | ''' 36 | Function: updateAlpha 37 | Description: update alpha after each sample 38 | Input: alpha dataType: float description: original alpha 39 | method dataType: int description: decay schedule of alpha 40 | Output: alpha dataType: float description: decayed alpha 41 | ''' 42 | def updateAlpha(self, alpha, epoch, method=1): 43 | if method == 1: 44 | alpha = 0.95 ** epoch * alpha 45 | elif method == 2: 46 | k = 3 47 | alpha = k/((epoch + 1) ** 0.5) * alpha # bug fix: epoch starts at 0, so +1 avoids division by zero 48 | elif method == 3: 49 | decay_rate = 0.001 50 | alpha = alpha / (1 + decay_rate * epoch) 51 | return alpha 52 | 53 | ''' 54 | Function: train 55 | Description: train the model 56 | Input: train_data dataType: ndarray description: features 57 | train_label dataType: ndarray description: labels 58 | method dataType: string description: "GA": Gradient Ascent; "SGA": Stochastic Gradient Ascent 59 | alpha dataType: float description: the learning rate 60 | iterations dataType: int description: the times of iteration 61 | Output: self dataType: obj description: the trained model 62 | ''' 63 | def train(self, train_data, train_label, method="GA", alpha=0.1, iterations=100): 64 | if self.norm_type == "Standardization": 65 | train_data = preProcess.Standardization(train_data) 66 | else: 67 | train_data = preProcess.Normalization(train_data) 68 | 69 | train_label = np.expand_dims(train_label, axis=1) 70 | feature_dim = len(train_data[1]) 71 | 72 | if method == "GA": 73 | weights = np.random.normal(0, 1, [feature_dim, 1]) 74 | for i in range(iterations): 75 | pred = self.sigmoid(np.dot(train_data, weights)) 76 | errors = train_label - pred 77 | # update the weights 78 | weights = weights + alpha * np.dot(train_data.T, errors) 79 | self.weights = weights 80 | return self 81 | 82 | if method == "SGA": 83 | weights = np.random.normal(0, 1, feature_dim) 84 | sample_num = len(train_data) 85 | random_index = np.random.randint(sample_num, size=sample_num) 86 | for i in range(iterations): 87 | for j in range(sample_num): 88 | alpha = self.updateAlpha(alpha, i, 1) 89 | pred = self.sigmoid(np.dot(train_data[random_index[j], :], weights)) 90 | sample_error = train_label[random_index[j]] - pred 91 | weights = weights + alpha * sample_error * train_data[random_index[j], :] 92 | 93 | self.weights = weights 94 | return self 95 |
96 | ''' 97 | Function: predict 98 | Description: predict the testing set 99 | Input: test_data dataType: ndarray description: features 100 | prob dataType: bool description: whether to return the probability instead of the hard label 101 | Output: prediction dataType: ndarray description: the prediction results for the testing set 102 | ''' 103 | def predict(self, test_data, prob=False): # bug fix: prob was the string "False", which is always truthy 104 | # Normalization 105 | if self.norm_type == "Standardization": 106 | test_data = preProcess.Standardization(test_data) 107 | else: 108 | test_data = preProcess.Normalization(test_data) 109 | 110 | test_num = test_data.shape[0] 111 | prediction = np.zeros([test_num, 1]) 112 | probability = np.zeros([test_num, 1]) 113 | for i in range(test_num): 114 | probability[i] = self.sigmoid(np.dot(test_data[i, :], self.weights)) 115 | if probability[i] > 0.5: 116 | prediction[i] = 1 117 | else: 118 | prediction[i] = 0 # bug fix: was 0.5, which is not a valid class label 119 | 120 | self.prediction = prediction 121 | self.probability = probability 122 | if prob: 123 | return probability 124 | else: 125 | return prediction 126 | 127 | ''' 128 | Function: accuracy 129 | Description: show detection result 130 | Input: test_label dataType: ndarray description: labels of test data 131 | Output: accuracy dataType: float description: detection accuracy 132 | ''' 133 | def accuarcy(self, test_label): 134 | test_label = np.expand_dims(test_label, axis=1) 135 | prediction = self.prediction 136 | accuarcy = sum(prediction == test_label)/len(test_label) 137 | return accuarcy 138 | 139 | ''' 140 | Function: save 141 | Description: save the model as pkl 142 | Input: filename dataType: str description: the path to save the model 143 | ''' 144 | def save(self, filename): 145 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 146 | pickle.dump(self.weights, f) 147 | f.close() 148 | 149 | ''' 150 | Function: load 151 | Description: load the model 152 | Input: filename dataType: str description: the path of the saved model 153 | Output: self dataType: obj description: the trained model 154 | ''' 155 | def load(self, filename): 156 | f = open(filename, 'rb') 157 | self.weights = pickle.load(f) 158 | return self
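Editor's note: a minimal usage sketch for LogisticRegressionClassifier above (synthetic 0/1-labelled data, illustrative only).

import numpy as np
from LogisticRegression import LogisticRegressionClassifier

X = np.random.rand(200, 3)
y = (X @ np.array([2.0, -1.0, 0.5]) > 0.7).astype(int)
clf = LogisticRegressionClassifier()
clf.train(X, y, method="GA", alpha=0.1, iterations=200)
pred = clf.predict(X)
print(clf.accuarcy(y))   # the method name is spelled 'accuarcy' in this repo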
-------------------------------------------------------------------------------- /NaiveBayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Filename: NaiveBayes.py 3 | @Author: Ryuk 4 | @Create Date: 2019-05-02 5 | @Update Date: 2019-05-03 6 | @Description: Implementation of naive Bayes 7 | """ 8 | 9 | import numpy as np 10 | import operator as op 11 | import preProcess 12 | import math 13 | import pickle 14 | 15 | 16 | class BayesClassifier: 17 | def __init__(self, norm_type="Normalization", laplace=1): 18 | self.norm_type = norm_type 19 | self.laplace = laplace 20 | self.label_value = None 21 | self.feature_value = None 22 | self.S = None 23 | self.prior_probability = None 24 | self.conditional_probability = None 25 | self.prediction = None 26 | self.probability = None 27 | ''' 28 | Function: train 29 | Description: train the model 30 | Input: train_data dataType: ndarray description: features 31 | train_label dataType: ndarray description: labels 32 | Output: self dataType: obj description: the trained model 33 | ''' 34 | def train(self, train_data, train_label): 35 | if self.norm_type == "Standardization": 36 | train_data = preProcess.Standardization(train_data) 37 | else: 38 | train_data = preProcess.Normalization(train_data) 39 | 40 | label_count = {} 41 | feature_dim = len(train_data[1]) 42 | 43 | # count the occurrences of each label 44 | for c in train_label: 45 | label_count[c] = label_count.get(c, 0) + 1 46 | label_value = sorted(label_count.items(), key=op.itemgetter(0), reverse=False) 47 | self.label_value = label_value 48 | 49 | K = len(label_value) # the number of unique labels 50 | N = len(train_label) # the number of samples 51 | 52 | # get the prior probability 53 | prior_probability = {} 54 | for key in range(len(label_value)): 55 | prior_probability[label_value[key][0]] = (label_value[key][1] + self.laplace) / (N + K * self.laplace) # Laplace smoothing 56 | self.prior_probability = prior_probability 57 | 58 | # get the value set of each feature 59 | feature_value = [] # distinct values of each feature 60 | S = [] # the number of unique values of each feature 61 | for feat in range(feature_dim): 62 | unique_feature = np.unique(train_data[:, feat]) 63 | S.append(len(unique_feature)) 64 | feature_value.append(unique_feature) 65 | self.S = S 66 | self.feature_value = feature_value 67 | 68 | # calculate the conditional probability 69 | prob = [] 70 | # count the co-occurrences (x = a & y = c) 71 | for j in range(feature_dim): 72 | count = np.zeros([S[j], len(label_count)]) # rows: feature values, columns: labels 73 | feature_temp = train_data[:, j] 74 | feature_value_temp = feature_value[j] 75 | for i in range(len(feature_temp)): 76 | for k in range(len(feature_value_temp)): 77 | for t in range(len(label_count)): 78 | if feature_temp[i] == feature_value_temp[k] and train_label[i] == label_value[t][0]: 79 | count[k][t] += 1 # x = value and y = label 80 | # calculate the conditional probability with Laplace smoothing 81 | for m in range(len(label_value)): 82 | count[:, m] = (count[:, m] + self.laplace) / (label_value[m][1] + self.laplace*S[j]) 83 | prob.append(count) 84 | self.conditional_probability = prob 85 | return self 86 | 87 | ''' 88 | Function: predict 89 | Description: predict the testing set 90 | Input: test_data dataType: ndarray description: features 91 | prob dataType: bool description: whether to return the posterior score instead of the hard label 92 | Output: prediction dataType: ndarray description: the prediction results for the testing set 93 | ''' 94 | def predict(self, test_data, prob=False): # bug fix: prob was the string "False", which is always truthy 95 | # Normalization 96 | if self.norm_type == "Standardization": 97 | test_data = preProcess.Standardization(test_data) 98 | else: 99 | test_data = preProcess.Normalization(test_data) 100 | 101 | test_num = test_data.shape[0] 102 | prediction = np.zeros([test_num, 1]) 103 | probability = np.zeros([test_num, 1]) 104 | for i in range(test_num): 105 | result = self.classify(test_data[i, :]) 106 | result = sorted(result.items(), key=op.itemgetter(1), reverse=True) 107 | prediction[i] = result[0][0] 108 | probability[i] = result[0][1] # unnormalized posterior score of the chosen label 109 | 110 | self.prediction = prediction 111 | self.probability = probability 112 | if prob: 113 | return probability 114 | else: 115 | return prediction 116 | ''' 117 | Function: classify 118 | Description: calculate the posterior score of each label for one sample 119 | Input: sample dataType: ndarray description: input vector to be classified 120 | Output: predict dataType: dict description: unnormalized posterior score per label 121 | ''' 122 | def classify(self, sample): 123 | predict = {} 124 | for m in range(len(self.label_value)): 125 | temp = self.prior_probability[self.label_value[m][0]] # the prior probability of the m-th label 126 | for n in range(len(sample)): 127 | if sample[n] in self.feature_value[n]: 128 | index = np.where(self.feature_value[n] == sample[n])[0][0] 129 | temp = temp * self.conditional_probability[n][index][m] 130 | else: 131 | temp = temp * self.laplace / (self.label_value[m][1] + self.laplace * self.S[n]) # bug fix: multiply into the running product (it was overwritten); unseen values get the Laplace-smoothed probability 132 | predict[self.label_value[m][0]] = temp 133 | return predict 134 | 135 | ''' 136 | Function: accuracy 137 | Description: show detection result 138 | Input: test_label dataType: ndarray description: labels of test data 139 | Output: accuracy dataType: float description: detection accuracy 140 | ''' 141 | def accuarcy(self, test_label): 142 | test_label = np.expand_dims(test_label, axis=1) 143 | prediction = self.prediction 144 | accuarcy = sum(prediction == test_label)/len(test_label) 145 | return accuarcy
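Editor's note: a minimal usage sketch for BayesClassifier above. The implementation enumerates the distinct values of every feature, so it effectively treats features as categorical; small integer-valued features (synthetic here, illustrative only) suit it best.

import numpy as np
from NaiveBayes import BayesClassifier

X = np.random.randint(0, 3, size=(200, 4)).astype(float)   # categorical features
y = (X[:, 0] == X[:, 1]).astype(int)
clf = BayesClassifier(norm_type="Normalization", laplace=1)
clf.train(X, y)
pred = clf.predict(X)
print(clf.accuarcy(y))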
149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Perceptron.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Filename: Perceptron.py 3 | @Author: Ryuk 4 | @Create Date: 2019-04-30 5 | @Update Date: 2019-05-03 6 | @Description: Implement of perceptron.py 7 | """ 8 | 9 | import numpy as np 10 | import preProcess 11 | import pickle 12 | import random 13 | 14 | class PerceptronClassifier: 15 | def __init__(self, norm_type="Normalization", iterations=500, learning_rate=0.01): 16 | self.norm_type = norm_type 17 | self.iterations = iterations 18 | self.learning_rate = learning_rate 19 | self.gradients = None 20 | self.loss = None 21 | self.w = None 22 | self.b = None 23 | self.prediction = None 24 | self.probability = None 25 | 26 | ''' 27 | Function: sigmoid 28 | Description: sigmoid function 29 | Input: x dataType: ndarray description: input vector 30 | derivative dataType: bool description: whether to calculate the derivative of sigmoid 31 | Output: output dataType: float description: output 32 | ''' 33 | def sigmoid(self, x, derivative=False): 34 | output = 1/(1 + np.exp(-x)) 35 | if derivative: 36 | output = output * (1 - output) 37 | return output 38 | 39 | ''' 40 | Function: initializeParameter 41 | Description: initialize parameter 42 | Input: feature_dim dataType: int description: feature dimension 43 | ''' 44 | def initializeParameter(self, feature_dim): 45 | w = np.random.normal(0, 1, [feature_dim, 1]) 46 | b = 0 47 | self.w = w 48 | self.b = b 49 | 50 | ''' 51 | Function: BackPropagate 52 | Description: BackPropagate function 53 | Input: w dataType: dict description: the weights in network 54 | b dataType: dict description: the bias in network 55 | train_data dataType: ndarray description: train data 56 | train_label dataType: ndarray description: train label 57 | Output: gradients dataType: dict description: gradients 58 | cost dataType: float description: loss 59 | ''' 60 | def backPropagate(self, train_data, train_label): 61 | num = train_label.shape[0] 62 | 63 | # forward 64 | A = self.sigmoid(np.dot(train_data, self.w) + self.b) 65 | cost = -1 / num * np.sum(train_label * np.log(A) + (1 - train_label) * np.log(1 - A)) 66 | 67 | # backward 68 | dw = 1 / num * np.dot(train_data.T, A - train_label) 69 | db = 1 / num * np.sum(A - train_label) 70 | 71 | # save gradients 72 | gradients = {"dw": dw, 73 | "db": db} 74 | return gradients, cost 75 | 76 | ''' 77 | Function: train 78 | Description: train the model 79 | Input: train_data dataType: ndarray description: features 80 | train_label dataType: ndarray description: labels 81 | Output: self dataType: obj description: the trained model 82 | ''' 83 | def train(self, train_data, train_label): 84 | if self.norm_type == "Standardization": 85 | train_data = preProcess.Standardization(train_data) 86 | else: 87 | train_data = preProcess.Normalization(train_data) 88 | 89 | feature_dim = len(train_data[1]) 90 | train_label = np.expand_dims(train_label, axis=1) 91 | self.initializeParameter(feature_dim) 92 | 93 | self.loss = [] 94 | # training process 95 | for i in range(self.iterations): 96 | gradients, cost = self.backPropagate(train_data, train_label) 97 | # get the derivative 98 | dw = gradients["dw"] 99 | db = gradients["db"] 100 | 101 | # update parameter 102 | self.w = self.w - self.learning_rate * dw 103 | self.b = self.b - self.learning_rate * db 104 | self.loss.append(cost) 105 | 106 | return self 107 | 108 | ''' 109 | 
Function: predict 110 | Description: predict the testing set 111 | Input: test_data dataType: ndarray description: features 112 | prob dataType: bool description: whether to return the probability instead of the hard label 113 | Output: prediction dataType: ndarray description: the prediction results for the testing set 114 | ''' 115 | 116 | def predict(self, test_data, prob=False): # bug fix: prob was the string "False", which is always truthy 117 | # Normalization 118 | if self.norm_type == "Standardization": 119 | test_data = preProcess.Standardization(test_data) 120 | else: 121 | test_data = preProcess.Normalization(test_data) 122 | 123 | test_num = test_data.shape[0] 124 | prediction = np.zeros([test_num, 1]) 125 | probability = np.zeros([test_num, 1]) 126 | for i in range(test_num): 127 | probability[i] = self.sigmoid(np.dot(self.w.T, test_data[i, :]) + self.b) # prediction = self.sigmoid(np.dot(self.w.T, test_data) + self.b) can speed this up 128 | if probability[i] > 0.5: # bug fix: sigmoid output is always > 0, so the old threshold of 0 labelled everything 1 129 | prediction[i] = 1 130 | else: 131 | prediction[i] = 0 # the model is trained on 0/1 labels (cross-entropy above), so predict 0 rather than -1 132 | 133 | self.prediction = prediction 134 | self.probability = probability 135 | if prob: 136 | return probability 137 | else: 138 | return prediction 139 | 140 | 141 | ''' 142 | Function: accuracy 143 | Description: show detection result 144 | Input: test_label dataType: ndarray description: labels of test data 145 | Output: accuracy dataType: float description: detection accuracy 146 | ''' 147 | def accuarcy(self, test_label): 148 | test_label = np.expand_dims(test_label, axis=1) 149 | prediction = self.prediction 150 | accuarcy = sum(prediction == test_label)/len(test_label) 151 | return accuarcy 152 | 153 | ''' 154 | Function: save 155 | Description: save the model as pkl 156 | Input: filename dataType: str description: the path to save the model 157 | ''' 158 | def save(self, filename): 159 | f = open(filename, 'wb') # bug fix: pickle requires binary mode 160 | model = {'w': self.w, 'b': self.b} 161 | pickle.dump(model, f) 162 | f.close() 163 | 164 | ''' 165 | Function: load 166 | Description: load the model 167 | Input: filename dataType: str description: the path of the saved model 168 | Output: self dataType: obj description: the trained model 169 | ''' 170 | def load(self, filename): 171 | f = open(filename, 'rb') 172 | model = pickle.load(f) 173 | self.w = model['w'] 174 | self.b = model['b'] 175 | return self
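Editor's note: despite its name, PerceptronClassifier above is a single sigmoid unit trained with cross-entropy (i.e. logistic regression by gradient descent). A minimal usage sketch on synthetic 0/1 data:

import numpy as np
from Perceptron import PerceptronClassifier

X = np.random.rand(200, 2)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
clf = PerceptronClassifier(iterations=500, learning_rate=0.01)
clf.train(X, y)
pred = clf.predict(X)    # 0/1 labels, thresholded at 0.5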
-------------------------------------------------------------------------------- /AdaBoost.py: -------------------------------------------------------------------------------- 1 | """ 2 | @Filename: AdaptiveBoost.py 3 | @Author: Ryuk 4 | @Create Date: 2019-05-03 5 | @Update Date: 2019-05-24 6 | @Description: Implementation of Adaptive Boosting 7 | """ 8 | 9 | import numpy as np 10 | import preProcess 11 | import pickle 12 | import copy # bug fix: was 'import random' (unused); copy is needed for fresh weak learners 13 | import SVM, KNN, DecisionTree, LogisticRegression, Perceptron # bug fix: the module is LogisticRegression, not Logistic 14 | import math 15 | 16 | class Adaboost: 17 | def __init__(self, norm_type="Normalization", iterations=5, base_classifier="SVM"): 18 | self.iterations = iterations 19 | self.norm_type = norm_type 20 | self.prediction = None 21 | self.probability = None 22 | self.classifier_set = None 23 | 24 | if base_classifier == "SVM": 25 | self.base_classifier = SVM.SVMClassifier() 26 | elif base_classifier == "KNN": 27 | self.base_classifier = KNN.KNNClassifier() 28 | elif base_classifier == "DecisionTree": 29 | self.base_classifier = DecisionTree.DecisionTreeClassifier() 30 | elif base_classifier == "Logistic": 31 | self.base_classifier = LogisticRegression.LogisticRegressionClassifier() 32 | elif base_classifier == "Perceptron": 33 | self.base_classifier = Perceptron.PerceptronClassifier() 34 | 35 | ''' 36 | Function: baseClassifier 37 | Description: generate a weak classifier 38 | Input: train_data dataType: ndarray description: train_data 39 | train_label dataType: ndarray description: train_label 40 | w dataType: ndarray description: sample weights 41 | Output: clf dataType: object description: weak classifier 42 | weighted_error dataType: float description: weighted error 43 | base_predictions dataType: object description: base predictions 44 | ''' 45 | def baseClassifier(self, train_data, train_label, w): 46 | sample_num = len(train_data) 47 | error_index = np.ones([sample_num, 1]) 48 | clf = copy.deepcopy(self.base_classifier) # bug fix: train a fresh copy each round instead of re-fitting one shared instance 49 | # note: the base learner itself is trained unweighted; w only enters through the weighted error and alpha 50 | clf.train(train_data, train_label) 51 | base_predictions = np.sign(clf.predict(train_data)) 52 | 53 | for i in range(sample_num): 54 | if base_predictions[i] == train_label[i]: 55 | error_index[i] = 0 56 | weighted_error = np.dot(w.T, error_index) 57 | return clf, weighted_error, base_predictions 58 | 59 | ''' 60 | Function: updateAlpha 61 | Description: update alpha 62 | Input: error dataType: float description: weighted error 63 | Output: new_alpha dataType: float description: new alpha 64 | ''' 65 | def updateAlpha(self, error): 66 | temp = (1.0 - error)/max(error, 10e-6) 67 | new_alpha = 1/2 * math.log(temp, math.e) 68 | return new_alpha 69 | 70 | ''' 71 | Function: train 72 | Description: train the model 73 | Input: train_data dataType: ndarray description: features 74 | train_label dataType: ndarray description: labels, assumed to be in {-1, +1} 75 | Output: clf_set dataType: list description: set of (alpha, classifier) pairs 76 | ''' 77 | def train(self, train_data, train_label): 78 | if self.norm_type == "Standardization": 79 | train_data = preProcess.Standardization(train_data) 80 | else: 81 | train_data = preProcess.Normalization(train_data) 82 | 83 | train_label = np.expand_dims(train_label, axis=1) 84 | sample_num = len(train_data) 85 | 86 | weak_classifier = [] 87 | 88 | # initialize the weights uniformly 89 | w = np.ones([sample_num, 1]) 90 | w = w/sample_num 91 | 92 | # predictions 93 | agg_predicts = np.zeros([sample_num, 1]) # aggregated value of the prediction 94 | 95 | # start training 96 | for i in range(self.iterations): 97 | base_clf, error, base_prediction = self.baseClassifier(train_data, train_label, w) 98 | alpha = self.updateAlpha(error) 99 | weak_classifier.append((alpha, base_clf)) 100 | 101 | # update the sample weights, cf. Eq.(8.4) on p.139 of the reference 102 | expon = np.multiply(-1 * alpha * train_label, base_prediction) 103 | w = np.multiply(w, np.exp(expon)) 104 | w = w/w.sum() 105 | 106 | # calculate the total error rate 107 | agg_predicts += alpha*base_prediction 108 | error_rate = np.multiply(np.sign(agg_predicts) != train_label, np.ones([sample_num, 1])) 109 | error_rate = error_rate.sum()/sample_num 110 | 111 | if error_rate == 0: 112 | break 113 | self.classifier_set = weak_classifier 114 | return weak_classifier 115 |
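Editor's note: a worked numeric example of the update used in train() above (pure NumPy, illustrative). With weighted error e, alpha = 0.5*ln((1-e)/e); each sample weight is then scaled by exp(-alpha * y * h(x)) and renormalized, so misclassified samples gain weight.

import numpy as np

y = np.array([1, 1, -1, -1])        # true labels
h = np.array([1, -1, -1, -1])       # a weak learner that gets one sample wrong
w = np.full(4, 0.25)                # uniform initial weights
e = w[h != y].sum()                 # weighted error = 0.25
alpha = 0.5 * np.log((1 - e) / max(e, 1e-6))   # ~0.549
w = w * np.exp(-alpha * y * h)
w = w / w.sum()                     # the misclassified sample now carries weight 0.5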
    '''
    Function: train
    Description: train the model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: clf_set      dataType: list     description: classifiers set
    '''
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        train_label = np.expand_dims(train_label, axis=1)
        sample_num = len(train_data)

        weak_classifier = []

        # initialize weights uniformly
        w = np.ones([sample_num, 1])
        w = w/sample_num

        # aggregate value of the ensemble prediction
        agg_predicts = np.zeros([sample_num, 1])

        # start training
        for i in range(self.iterations):
            base_clf, error, base_prediction = self.baseClassifier(train_data, train_label, w)
            alpha = self.updateAlpha(error)
            weak_classifier.append((alpha, base_clf))

            # update the sample weights, Eq.(8.4) on page 139
            expon = np.multiply(-1 * alpha * train_label, base_prediction)
            w = np.multiply(w, np.exp(expon))
            w = w/w.sum()

            # calculate the total error rate of the current ensemble
            agg_predicts += alpha * base_prediction
            error_rate = np.multiply(np.sign(agg_predicts) != train_label, np.ones([sample_num, 1]))
            error_rate = error_rate.sum()/sample_num

            if error_rate == 0:
                break
        self.classifier_set = weak_classifier
        return weak_classifier

    '''
    Function: predict
    Description: predict the testing set
    Input:  test_data  dataType: ndarray  description: features
            prob       dataType: bool     description: return probability of label
    Output: prediction dataType: ndarray  description: the prediction results for testing set
    '''
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        probability = np.zeros([test_num, 1])

        # weighted vote of the weak classifiers
        for alpha, clf in self.classifier_set:
            probability += alpha * clf.predict(test_data)

        # the sign of the weighted sum gives the ensemble label
        prediction = np.sign(probability)
        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction

    '''
    Function: accuarcy
    Description: show detection result
    Input:  test_label dataType: ndarray  description: labels of test data
    Output: accuracy   dataType: float    description: detection accuracy
    '''
    def accuarcy(self, test_label):   # name kept as-is for compatibility with the example scripts
        test_label = np.expand_dims(test_label, axis=1)
        prediction = self.prediction
        accuracy = sum(prediction == test_label)/len(test_label)
        return accuracy

    '''
    Function: save
    Description: save the model as pkl
    Input:  filename  dataType: str  description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')   # pickle needs binary mode
        pickle.dump(self.classifier_set, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input:  filename  dataType: str  description: the path of the saved model
    Output: self      dataType: obj  description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        self.classifier_set = pickle.load(f)
        return self

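A minimal end-to-end run of the class above — a sketch under the assumption that labels are in {-1, 1} (the convention predict() returns); it uses synthetic data, not one of the repo's example scripts.

```python
import numpy as np
from AdaBoost import Adaboost

# Two separable Gaussian blobs with labels in {-1, 1}
rng = np.random.RandomState(0)
X = np.vstack([rng.randn(40, 2) + 2, rng.randn(40, 2) - 2])
y = np.hstack([np.ones(40), -np.ones(40)])

clf = Adaboost(iterations=5, base_classifier="DecisionTree")
clf.train(X, y)
pred = clf.predict(X)
print(clf.accuarcy(y))   # method keeps the repo's original spelling
```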
--------------------------------------------------------------------------------
/RandomForest.py:
--------------------------------------------------------------------------------
"""
@ Filename: RandomForest.py
@ Author: Ryuk
@ Create Date: 2019-07-09
@ Update Date: 2019-07-09
@ Description: Implement RandomForest
"""
import numpy as np
import operator as op
import pickle
from DecisionTree import DecisionTreeClassifier
from TreeRegression import treeRegression   # TreeRegression.py defines the class treeRegression

class RandomForestClassifier:
    def __init__(self, tree_num=10, alpha=1e-5):
        self.tree_num = tree_num
        self.alpha = alpha          # information-gain threshold handed to each tree
        self.trees = []
        self.prediction = None
        self.probability = None

    '''
    Function: bootstrap
    Description: bootstrap sampling and train a model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: clf          dataType: obj      description: a tree trained on the resample
    '''
    def bootstrap(self, train_data, train_label):
        # draw n indices with replacement so every tree sees a different resample
        index = np.random.randint(0, len(train_data), len(train_data))
        x = train_data[index]
        y = train_label[index]
        clf = DecisionTreeClassifier(t=self.alpha)
        clf.train(x, y)
        return clf

    '''
    Function: train
    Description: train the model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: self         dataType: obj      description: the trained model
    '''
    def train(self, train_data, train_label):
        for i in range(self.tree_num):
            clf = self.bootstrap(train_data, train_label)
            self.trees.append(clf)
        return self

    '''
    Function: vote
    Description: return the label of the majority
    Input:  labels  dataType: ndarray  description: labels
    Output: pred    dataType: int      description: prediction label of input vector
    '''
    def vote(self, labels):
        label_count = {}
        # get the counts of each label
        for c in labels:
            label_count[c] = label_count.get(c, 0) + 1
        # get the label of the majority
        prediction = sorted(label_count.items(), key=op.itemgetter(1), reverse=True)
        pred = prediction[0][0]
        return pred

    '''
    Function: predict
    Description: predict the testing set
    Input:  test_data   dataType: ndarray  description: features
    Output: prediction  dataType: ndarray  description: the prediction results for testing set
    '''
    def predict(self, test_data):
        labels = np.zeros([len(test_data), self.tree_num])
        for i in range(self.tree_num):
            clf = self.trees[i]
            labels[:, i] = clf.predict(test_data).reshape(len(test_data))

        # majority vote across the trees, sample by sample
        prediction = np.zeros([len(test_data)])
        for j in range(len(labels)):
            prediction[j] = self.vote(labels[j, :])

        self.prediction = prediction
        return prediction

    '''
    Function: accuarcy
    Description: show detection result
    Input:  test_label dataType: ndarray  description: labels of test data
    Output: accuracy   dataType: float    description: detection accuracy
    '''
    def accuarcy(self, test_label):   # name kept as-is for compatibility with the example scripts
        prediction = self.prediction
        accuracy = sum(prediction == test_label)/len(test_label)
        return accuracy

    '''
    Function: save
    Description: save the model as pkl
    Input:  filename  dataType: str  description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')   # pickle needs binary mode
        model = self.trees
        pickle.dump(model, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input:  filename  dataType: str  description: the path of the saved model
    Output: self      dataType: obj  description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        self.trees = pickle.load(f)
        return self

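Each call to bootstrap() above draws n indices with replacement, so a tree sees only about 63% of the distinct rows (the classic 1 − 1/e fraction); a quick standalone illustration.

```python
import numpy as np

n = 10000
index = np.random.randint(0, n, n)          # the same draw bootstrap() performs
unique_fraction = len(np.unique(index)) / n
print(round(unique_fraction, 3))            # ~0.632, i.e. 1 - 1/e
```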
class RandomForestRegression:
    def __init__(self, tree_num=10, error_threshold=1, N=4, alpha=0.01):
        self.tree_num = tree_num
        self.trees = []
        self.error_threshold = error_threshold   # the error threshold for a split
        self.N = N                               # the least number of samples for a split
        self.alpha = alpha                       # reserved for pruning
        self.tree_node = None
        self.prediction = None

    '''
    Function: bootstrap
    Description: bootstrap sampling and train a model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: clf          dataType: obj      description: a tree trained on the resample
    '''
    def bootstrap(self, train_data, train_label):
        index = np.random.randint(0, len(train_data), len(train_data))
        x = train_data[index]
        y = train_label[index]
        clf = treeRegression(error_threshold=self.error_threshold, N=self.N)
        clf.train(x, y)
        return clf

    '''
    Function: train
    Description: train the model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: self         dataType: obj      description: the trained model
    '''
    def train(self, train_data, train_label):
        for i in range(self.tree_num):
            clf = self.bootstrap(train_data, train_label)
            self.trees.append(clf)
        return self

    '''
    Function: vote
    Description: return the label of the majority (kept from the classifier;
                 predict() below averages the trees instead)
    Input:  labels  dataType: ndarray  description: labels
    Output: pred    dataType: int      description: prediction label of input vector
    '''
    def vote(self, labels):
        label_count = {}
        # get the counts of each label
        for c in labels:
            label_count[c] = label_count.get(c, 0) + 1
        # get the label of the majority
        prediction = sorted(label_count.items(), key=op.itemgetter(1), reverse=True)
        pred = prediction[0][0]
        return pred

    '''
    Function: predict
    Description: predict the testing set; the forest output is the mean of the trees
    Input:  test_data   dataType: ndarray  description: features
    Output: prediction  dataType: ndarray  description: the prediction results for testing set
    '''
    def predict(self, test_data):
        labels = np.zeros([len(test_data), self.tree_num])
        for i in range(self.tree_num):
            labels[:, i] = self.trees[i].predict(test_data).reshape(len(test_data))

        # average over the trees (axis=1), one value per sample
        prediction = np.mean(labels, axis=1)

        self.prediction = prediction
        return prediction

    '''
    Function: accuarcy
    Description: show detection result; exact-match accuracy is only meaningful
                 when the regression targets are discrete
    Input:  test_label dataType: ndarray  description: labels of test data
    Output: accuracy   dataType: float    description: detection accuracy
    '''
    def accuarcy(self, test_label):
        prediction = self.prediction
        accuracy = sum(prediction == test_label)/len(test_label)
        return accuracy

    '''
    Function: save
    Description: save the model as pkl
    Input:  filename  dataType: str  description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')   # pickle needs binary mode
        model = self.trees
        pickle.dump(model, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input:  filename  dataType: str  description: the path of the saved model
    Output: self      dataType: obj  description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        self.trees = pickle.load(f)
        return self

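For the regression forest, predictions are averaged rather than voted. The next file, LinearRegression.py, solves the normal equation w = (XᵀX)⁻¹Xᵀy in closed form; a quick sanity check of that identity against numpy's least-squares solver (random data, sketch only).

```python
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = X @ np.array([2.0, -1.0, 0.5]) + 0.01 * rng.randn(50)

w_closed = np.linalg.inv(X.T @ X) @ (X.T @ y)      # normal equation
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)    # reference solver
print(np.allclose(w_closed, w_lstsq))              # True
```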
--------------------------------------------------------------------------------
/LinearRegression.py:
--------------------------------------------------------------------------------
"""
@ Filename: LinearRegression.py
@ Author: Ryuk
@ Create Date: 2019-05-05
@ Update Date: 2019-05-06
@ Description: Implement linear regression
"""
import numpy as np
import preProcess
import pickle
import matplotlib.pyplot as plt

class Regression:
    def __init__(self, norm_type="Normalization", regression_type="Standard", k=1.0, lamda=0.2, learning_rate=0.01, iterations=100):
        self.norm_type = norm_type
        self.regression_type = regression_type
        self.k = k                            # parameter for locally weighted linear regression
        self.lamda = lamda                    # parameter for ridge and lasso regression
        self.learning_rate = learning_rate    # parameter for forward step regression
        self.iterations = iterations          # parameter for forward step and lasso regression
        self.x_train = None                   # kept for locally weighted regression
        self.y_train = None
        self.w = None
        self.parameters = None
        self.prediction = None
        self.probability = None

    '''
    Function: standardLinearRegression
    Description: standard linear regression, w = (X.T * X)^-1 * X.T * y
    Input:  x  dataType: ndarray  description: x
            y  dataType: ndarray  description: y
    Output: w  dataType: ndarray  description: weights
    '''
    def standardLinearRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        xTx = np.dot(x.T, x)
        if np.linalg.det(xTx) == 0:           # a singular matrix cannot be inverted
            print("Error: Singular Matrix !")
            return
        w = np.dot(np.linalg.inv(xTx), np.dot(x.T, y))
        return w

    '''
    Function: LWLinearRegression
    Description: locally weighted linear regression, w = (X.T * W * X)^-1 * X.T * W * y,
                 solved independently for each query sample
    Input:  x       dataType: ndarray  description: x
            y       dataType: ndarray  description: y
            sample  dataType: ndarray  description: query point the weights are centered on
    Output: result  dataType: ndarray  description: weights for this query point
    '''
    def LWLinearRegression(self, x, y, sample):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        sample_num = len(x)
        weights = np.eye(sample_num)
        for i in range(sample_num):
            # Gaussian kernel: closer samples get larger weights
            diff = sample - x[i, :]
            weights[i, i] = np.exp(np.dot(diff, diff.T)/(-2 * self.k ** 2))
        xTx = np.dot(x.T, np.dot(weights, x))
        if np.linalg.det(xTx) == 0:
            print("Error: Singular Matrix !")
            return
        result = np.dot(np.linalg.inv(xTx), np.dot(x.T, np.dot(weights, y)))
        return result

    '''
    Function: ridgeRegression
    Description: ridge linear regression, w = (X.T * X + lamda * I)^-1 * X.T * y
    Input:  x  dataType: ndarray  description: x
            y  dataType: ndarray  description: y
    Output: w  dataType: ndarray  description: weights
    '''
    def ridgeRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        feature_dim = len(x[0])
        xTx = np.dot(x.T, x)
        matrix = xTx + np.eye(feature_dim) * self.lamda   # regularize with the identity matrix
        if np.linalg.det(matrix) == 0:
            print("Error: Singular Matrix !")
            return
        w = np.dot(np.linalg.inv(matrix), np.dot(x.T, y))
        return w

    '''
    Function: lassoRegression
    Description: lasso linear regression via coordinate descent with soft-thresholding
    Input:  x  dataType: ndarray  description: x
            y  dataType: ndarray  description: y
    Output: w  dataType: ndarray  description: weights
    '''
    def lassoRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        y = np.reshape(y, (-1, 1))
        sample_num, feature_dim = np.shape(x)
        w = np.zeros([feature_dim, 1])
        for it in range(self.iterations):
            # coordinate descent: update one weight at a time
            for j in range(feature_dim):
                # residual with feature j's current contribution removed
                residual = y - np.dot(x, w) + x[:, j:j+1] * w[j]
                rho = float(np.dot(x[:, j], residual))
                z = float(np.dot(x[:, j], x[:, j]))
                if z == 0:
                    continue
                # soft-thresholding shrinks the update and zeroes small weights
                w[j] = np.sign(rho) * max(abs(rho) - self.lamda, 0) / z
        return w

    '''
    Function: forwardstepRegression
    Description: forward stepwise linear regression
    Input:  x       dataType: ndarray  description: x
            y       dataType: ndarray  description: y
    Output: best_w  dataType: ndarray  description: weights
    '''
    def forwardstepRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        y = np.reshape(y, (-1, 1))
        sample_num, feature_dim = np.shape(x)
        w = np.zeros([self.iterations, feature_dim])
        best_w = np.zeros([feature_dim, 1])
        for i in range(self.iterations):
            min_error = np.inf
            for j in range(feature_dim):
                for sign in [-1, 1]:
                    temp_w = best_w.copy()    # work on a copy so candidates do not alias best_w
                    temp_w[j] += sign * self.learning_rate
                    y_hat = np.dot(x, temp_w)
                    error = ((y - y_hat) ** 2).sum()   # squared error
                    if error < min_error:              # keep the best parameters
                        min_error = error
                        best_w = temp_w
            w[i, :] = best_w.T
        return best_w                        # w keeps the per-iteration path if needed
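lassoRegression() above relies on the soft-thresholding operator S(ρ, λ) = sign(ρ)·max(|ρ| − λ, 0), which is what produces exact zeros in the weight vector; a small standalone demo.

```python
import numpy as np

def soft_threshold(rho, lam):
    # shrink toward zero; exactly zero once |rho| <= lam
    return np.sign(rho) * np.maximum(np.abs(rho) - lam, 0.0)

rho = np.array([-3.0, -0.1, 0.05, 2.0])
print(soft_threshold(rho, lam=0.2))   # [-2.8  0.   0.   1.8]
```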
    '''
    Function: train
    Description: train the model
    Input:  train_data   dataType: ndarray  description: features
            train_label  dataType: ndarray  description: labels
    Output: self         dataType: obj      description: the trained model
    '''
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        if self.regression_type == "Standard":
            self.w = self.standardLinearRegression(train_data, train_label)
        elif self.regression_type == "Localweight":
            # locally weighted regression has no single weight vector: keep the
            # training set and fit around each query point in predict()
            self.x_train = train_data
            self.y_train = train_label
        elif self.regression_type == "Ridge":
            self.w = self.ridgeRegression(train_data, train_label)
        elif self.regression_type == "Lasso":
            self.w = self.lassoRegression(train_data, train_label)
        elif self.regression_type == "Forwardstep":
            self.w = self.forwardstepRegression(train_data, train_label)
        else:
            print("Error Regression Type!")
        return self

    '''
    Function: predict
    Description: predict the testing set
    Input:  x     dataType: ndarray  description: features
            prob  dataType: bool     description: return probability of label
    Output: y     dataType: ndarray  description: the prediction results for testing set
    '''
    def predict(self, x, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        if self.regression_type == "Localweight":
            # solve a locally weighted fit around every query sample
            y = np.zeros([len(x), 1])
            for i in range(len(x)):
                w = self.LWLinearRegression(self.x_train, self.y_train, x[i, :])
                y[i] = np.dot(x[i, :], w)
        else:
            y = np.dot(x, self.w)
        self.prediction = y
        return y

    '''
    Function: plot
    Description: show the regression result
    Input:  test_label  dataType: ndarray  description: labels of test data
    '''
    def plot(self, test_label):
        prediction = self.prediction
        plt.plot(test_label, 'r*', label='Real values')
        plt.plot(prediction, 'b', label='Predicted values')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.legend(loc=3)
        plt.title('Regression')
        plt.show()

    '''
    Function: save
    Description: save the model as pkl
    Input:  filename  dataType: str  description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')   # pickle needs binary mode
        pickle.dump(self.w, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input:  filename  dataType: str  description: the path of the saved model
    Output: self      dataType: obj  description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        self.w = pickle.load(f)
        return self

--------------------------------------------------------------------------------
/TreeRegression.py:
--------------------------------------------------------------------------------
1 | """
2 | @ Filename: TreeRegression.py
3 | @ Author: Ryuk
4 | @ Create Date: 2019-05-11
5 | @ Update Date: 2019-05-13
6 | @ Description: Implement TreeRegression
7 | """
8 | 
9 | import numpy as np
10 | import operator as op
11 | import preProcess
12 | import math
13 | import pickle
14 | 
15 | class treeNode():
16 |     def __init__(self, index=-1, value=None, result=None, right_tree=None, left_tree=None):
17 |         self.index = index
18 |         self.value = value
19 |         self.result = result
20 |         self.right_tree = right_tree
21 |         self.left_tree = left_tree
22 | 
23 | 
24 | class 
treeRegression: 25 | def __init__(self, norm_type="Normalization",iterations=100, error_threshold=1, N=4): 26 | self.norm_type = norm_type 27 | self.iterations = iterations 28 | self.error_threshold = error_threshold # the threshold of error 29 | self.N = N # the least number of sample for split 30 | self.tree_node = None 31 | self.prediction = None 32 | self.probability = None 33 | 34 | ''' 35 | Function: divideData 36 | Description: divide data into two parts 37 | Input: data dataType: ndarray description: feature and labels 38 | index dataType: int description: the column of feature 39 | value dataType: float description: the value of feature 40 | Output: left_set dataType: ndarray description: feature <= value 41 | right_set dataType: ndarray description: feature > value 42 | ''' 43 | def divideData(self, data, index, value): 44 | left_set = [] 45 | right_set = [] 46 | # select feature in index with value 47 | for temp in data: 48 | if temp[index] >= value: 49 | # delete this feature 50 | right_set.append(temp) 51 | else: 52 | left_set.append(temp) 53 | return np.array(left_set), np.array(right_set) 54 | 55 | ''' 56 | Function: getVariance 57 | Description: get the variance of the regression value, in page of 68 Eq.(5.19) 58 | Input: data dataType: ndarray description: feature and value, the last column is value 59 | Output: variance dataType: ndarray description: variance 60 | ''' 61 | def getVariance(self, data): 62 | variance = np.var(data) 63 | return variance*len(data) 64 | 65 | ''' 66 | Function: getMean 67 | Description: get the mean of the regression value,in page of 68 Eq.(5.17) 68 | Input: data dataType: ndarray description: feature and value, the last column is value 69 | Output: mean dataType: ndarray description: mean 70 | ''' 71 | def getMean(self, data): 72 | mean = np.mean(data) 73 | return mean 74 | 75 | ''' 76 | Function: createRegressionTree 77 | Description: create regression tree 78 | Input: data dataType: ndarray description: training set 79 | Output: w dataType: ndarray description: weights 80 | ''' 81 | def createRegressionTree(self, data): 82 | # if there is no feature 83 | if len(data) == 0: 84 | self.tree_node = treeNode(result=self.getMean(data[:, -1])) 85 | return self.tree_node 86 | 87 | sample_num, feature_dim = np.shape(data) 88 | 89 | best_criteria = None 90 | best_error = np.inf 91 | best_set = None 92 | initial_error = self.getVariance(data) 93 | 94 | # get the best split feature and value 95 | for index in range(feature_dim - 1): 96 | uniques = np.unique(data[:, index]) 97 | for value in uniques: 98 | left_set, right_set = self.divideData(data, index, value) 99 | if len(left_set) < self.N or len(right_set) < self.N: 100 | continue 101 | new_error = self.getVariance(left_set) + self.getVariance(right_set) 102 | if new_error < best_error: 103 | best_criteria = (index, value) 104 | best_error = new_error 105 | best_set = (left_set, right_set) 106 | 107 | if best_set is None: 108 | self.tree_node = treeNode(result=self.getMean(data[:, -1])) 109 | return self.tree_node 110 | # if the descent of error is small enough, return the mean of the data 111 | elif abs(initial_error - best_error) < self.error_threshold: 112 | self.tree_node = treeNode(result=self.getMean(data[:, -1])) 113 | return self.tree_node 114 | # if the split data is small enough, return the mean of the data 115 | elif len(best_set[0]) < self.N or len(best_set[1]) < self.N: 116 | self.tree_node = treeNode(result=self.getMean(data[:, -1])) 117 | return self.tree_node 118 | else: 119 | ltree = 
self.createRegressionTree(best_set[0]) 120 | rtree = self.createRegressionTree(best_set[1]) 121 | self.tree_node = treeNode(index=best_criteria[0], value=best_criteria[1], left_tree=ltree, right_tree=rtree) 122 | return self.tree_node 123 | 124 | ''' 125 | Function: train 126 | Description: train the model 127 | Input: train_data dataType: ndarray description: features 128 | train_label dataType: ndarray description: labels 129 | Output: self dataType: obj description: the trained model 130 | ''' 131 | def train(self, train_data, train_label, pruning=False, val_data=None, val_label=None): 132 | # if self.norm_type == "Standardization": 133 | # train_data = preProcess.Standardization(train_data) 134 | # else: 135 | # train_data = preProcess.Normalization(train_data) 136 | 137 | train_label = np.expand_dims(train_label, axis=1) 138 | data = np.hstack([train_data, train_label]) 139 | 140 | self.tree_node = self.createRegressionTree(data) 141 | #self.printTree(self.tree_node) 142 | return self 143 | 144 | ''' 145 | Function: printTree 146 | Description: show the structure of the decision tree 147 | Input: tree dataType: DecisionNode description: decision tree 148 | ''' 149 | def printTree(self, tree): 150 | # leaf node 151 | if tree.result != None: 152 | print(str(tree.result)) 153 | else: 154 | # print condition 155 | print(str(tree.index) + ":" + str(tree.value)) 156 | # print subtree 157 | print("R->", self.printTree(tree.right_tree)) 158 | print("L->", self.printTree(tree.left_tree)) 159 | 160 | ''' 161 | Function: predict 162 | Description: predict the testing set 163 | Input: train_data dataType: ndarray description: features 164 | prob dataType: bool description: return probaility of label 165 | Output: prediction dataType: ndarray description: the prediction results for testing set 166 | ''' 167 | def predict(self, test_data, prob="False"): 168 | # Normalization 169 | # if self.norm_type == "Standardization": 170 | # test_data = preProcess.Standardization(test_data) 171 | # else: 172 | # test_data = preProcess.Normalization(test_data) 173 | 174 | test_num = test_data.shape[0] 175 | prediction = np.zeros([test_num, 1]) 176 | probability = np.zeros([test_num, 1]) 177 | for i in range(test_num): 178 | prediction[i] = self.classify(test_data[i, :], self.tree_node) 179 | # probability[i] = result[0][1]/(result[0][1] + result[1][1]) 180 | self.prediction = prediction 181 | self.probability = probability 182 | 183 | return prediction 184 | 185 | ''' 186 | Function: classify 187 | Description: predict the testing set 188 | Input: sample dataType: ndarray description: input vector to be classified 189 | Output: label dataType: ndarray description: the prediction results of input 190 | ''' 191 | def classify(self, sample, tree): 192 | if tree.result is not None: 193 | return tree.result 194 | else: 195 | value = sample[tree.index] 196 | if value >= tree.value: 197 | branch = tree.right_tree 198 | else: 199 | branch = tree.left_tree 200 | return self.classify(sample, branch) 201 | 202 | ''' 203 | Function: pruning 204 | Description: pruning the regression tree 205 | Input: test_data dataType: ndarray description: features 206 | test_label dataType: ndarray description: labels 207 | Output: self dataType: obj description: the trained model 208 | ''' 209 | def pruning(self, tree, data, alpha): 210 | 211 | return 0 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | ''' 220 | Function: save 221 | Description: save the model as pkl 222 | Input: filename dataType: str description: the path to save model 
223 |     '''
224 | 
225 |     def save(self, filename):
226 |         f = open(filename, 'wb')   # pickle needs binary mode
227 |         pickle.dump(self.tree_node, f)
228 |         f.close()
229 | 
230 |     '''
231 |     Function: load
232 |     Description: load the model
233 |     Input: filename dataType: str description: the path of the saved model
234 |     Output: self dataType: obj description: the trained model
235 |     '''
236 | 
237 |     def load(self, filename):
238 |         f = open(filename, 'rb')
239 |         self.tree_node = pickle.load(f)
240 |         return self
241 | 
242 | 
243 | 
--------------------------------------------------------------------------------
/DecisionTree.py:
--------------------------------------------------------------------------------
"""
@Filename: DecisionTree.py
@Author: Ryuk
@Create Date: 2019-04-22
@Update Date: 2019-05-03
@Description: Implement of decision tree
"""

import numpy as np
import operator as op
import preProcess
import math
import pickle


class DecisionNode:
    def __init__(self, index=-1, value=None, results=None, right_tree=None, left_tree=None):
        self.index = index            # the index of the split feature
        self.value = value            # the split value of the feature at that index
        self.results = results        # label counts at a leaf, None for internal nodes
        self.right_tree = right_tree
        self.left_tree = left_tree


class DecisionTreeClassifier:
    def __init__(self, norm_type="Normalization", t=1e-5):
        self.norm_type = norm_type
        self.t = t                    # the threshold of information gain
        self.prediction = None
        self.probability = None
        self.tree_node = None

    '''
    Function: uniqueCount
    Description: calculate the count of unique labels
    Input:  labels       dataType: ndarray     description: labels of data
    Output: label_count  dataType: dictionary  description: {label: count}
    '''
    def uniqueCount(self, labels):
        label_count = {}
        for i in range(len(labels)):
            label_count[labels[i]] = label_count.get(labels[i], 0) + 1
        return label_count

    '''
    Function: getEntropy
    Description: calculate the Shannon entropy of the input labels
    Input:  labels   dataType: ndarray  description: labels of data
    Output: entropy  dataType: float    description: Shannon entropy, in bits
    '''
    def getEntropy(self, labels):
        labels_num = len(labels)
        label_count = self.uniqueCount(labels)

        entropy = 0.0
        for j in label_count:
            prop = label_count[j]/labels_num
            entropy = entropy + (-prop*math.log(prop, 2))

        return entropy
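getEntropy() returns exactly 1 bit for a balanced two-class node and 0 for a pure one; a quick check of the formula with hypothetical counts.

```python
import math

labels = [1, 1, -1, -1]
counts = {1: 2, -1: 2}
entropy = -sum((c / len(labels)) * math.log(c / len(labels), 2) for c in counts.values())
print(entropy)   # 1.0; a pure node such as [1, 1, 1, 1] would give 0.0
```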
    '''
    Function: divideData
    Description: divide data into two parts
    Input:  data   dataType: ndarray  description: feature and labels
            index  dataType: int      description: the column of feature
            value  dataType: float    description: the value of feature
    Output: left_set   dataType: ndarray  description: rows with feature < value
            right_set  dataType: ndarray  description: rows with feature >= value
    '''
    def divideData(self, data, index, value):
        left_set = []
        right_set = []
        # split on the feature at `index`; keep all columns so that the stored
        # split indices keep referring to positions in the original feature vector
        for temp in data:
            if temp[index] >= value:
                right_set.append(temp)
            else:
                left_set.append(temp)
        return np.array(left_set), np.array(right_set)

    '''
    Function: createDecisionTree
    Description: create decision tree by ID3
    Input:  data       dataType: ndarray       description: [feature, label]
    Output: tree_node  dataType: DecisionNode  description: root of the (sub)tree
    '''
    def createDecisionTree(self, data):
        # if there is no data left, stop the division
        if len(data) == 0:
            self.tree_node = DecisionNode()
            return self.tree_node

        best_gain = 0.0
        best_criteria = None
        best_set = None

        feature_num = len(data[0]) - 1
        sample_num = len(data[:, -1])
        init_entropy = self.getEntropy(data[:, -1])

        # get the best division
        for i in range(feature_num):
            uniques = np.unique(data[:, i])
            for value in uniques:
                left_set, right_set = self.divideData(data, i, value)
                # calculate the information gain
                ratio = float(len(left_set)/sample_num)
                if ratio == 0.0:
                    info_gain = init_entropy - (1 - ratio) * self.getEntropy(right_set[:, -1])
                elif ratio == 1.0:
                    info_gain = init_entropy - ratio*self.getEntropy(left_set[:, -1])
                else:
                    info_gain = init_entropy - ratio * self.getEntropy(left_set[:, -1]) - (1 - ratio) * self.getEntropy(right_set[:, -1])
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_criteria = (i, value)
                    best_set = (left_set, right_set)

        # create the decision tree
        if best_gain < self.t:
            # not enough gain: make a leaf that stores the label counts
            self.tree_node = DecisionNode(results=self.uniqueCount(data[:, -1]))
            return self.tree_node
        else:
            ltree = self.createDecisionTree(best_set[0])
            rtree = self.createDecisionTree(best_set[1])
            self.tree_node = DecisionNode(index=best_criteria[0], value=best_criteria[1], left_tree=ltree, right_tree=rtree)
            return self.tree_node

    '''
    Function: vote
    Description: return the label of the majority
    Input:  labels  dataType: ndarray  description: labels
    Output: pred    dataType: int      description: prediction label of input vector
    '''
    def vote(self, labels):
        labelCount = {}
        # get the counts of each label
        for c in labels:
            labelCount[c] = labelCount.get(c, 0) + 1
        # get the label of the majority
        prediction = sorted(labelCount.items(), key=op.itemgetter(1), reverse=True)
        pred = prediction[0][0]
        return pred

    '''
    Function: train
    Description: train the model
    Input:  trainData   dataType: ndarray  description: features
            trainLabel  dataType: ndarray  description: labels
    Output: self        dataType: obj      description: the trained model
    '''
    def train(self, trainData, trainLabel):
        if self.norm_type == "Standardization":
            trainData = preProcess.Standardization(trainData)
        else:
            trainData = preProcess.Normalization(trainData)

        trainLabel = np.expand_dims(trainLabel, axis=1)
        data = np.hstack([trainData, trainLabel])

        self.tree_node = self.createDecisionTree(data)
        return self

    '''
    Function: save
    Description: save the model as pkl
    Input:  filename  dataType: str  description: the path to save model
    '''
    def save(self, filename):
        f = open(filename, 'wb')   # pickle needs binary mode
        pickle.dump(self.tree_node, f)
        f.close()

    '''
    Function: load
    Description: load the model
    Input:  filename  dataType: str  description: the path of the saved model
    Output: self      dataType: obj  description: the trained model
    '''
    def load(self, filename):
        f = open(filename, 'rb')
        self.tree_node = pickle.load(f)
        return self
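createDecisionTree() keeps the split with the largest information gain, gain = H(D) − Σᵢ (|Dᵢ|/|D|)·H(Dᵢ); one worked number with hypothetical counts.

```python
import math

def H(pos, neg):
    # Shannon entropy of a two-class node, in bits
    total = pos + neg
    e = 0.0
    for c in (pos, neg):
        if c:
            p = c / total
            e -= p * math.log(p, 2)
    return e

# parent: 4 positive / 4 negative; split into (4+, 1-) and (0+, 3-)
gain = H(4, 4) - (5 / 8) * H(4, 1) - (3 / 8) * H(0, 3)
print(round(gain, 3))   # ~0.549 bits
```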
    '''
    Function: predict
    Description: predict the testing set
    Input:  test_data  dataType: ndarray  description: features
            prob       dataType: bool     description: return probability of label
    Output: prediction dataType: ndarray  description: the prediction results for testing set
    '''
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        for i in range(test_num):
            result = self.classify(test_data[i, :], self.tree_node)
            # the leaf stores label counts; take the most frequent label
            result = sorted(result.items(), key=op.itemgetter(1), reverse=True)
            prediction[i] = result[0][0]
            probability[i] = result[0][1]/sum(count for _, count in result)
        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction

    '''
    Function: classify
    Description: classify a single sample with the trained tree
    Input:  sample  dataType: ndarray       description: input vector to be classified
            tree    dataType: DecisionNode  description: (sub)tree to walk
    Output: results dataType: dict          description: label counts of the reached leaf
    '''
    def classify(self, sample, tree):
        if tree.results is not None:
            return tree.results
        else:
            value = sample[tree.index]
            if value >= tree.value:
                branch = tree.right_tree
            else:
                branch = tree.left_tree
            return self.classify(sample, branch)

    '''
    Function: printTree
    Description: show the structure of the decision tree
    Input:  tree  dataType: DecisionNode  description: decision tree
    '''
    def printTree(self, tree):
        # leaf node
        if tree.results is not None:
            print(str(tree.results))
        else:
            # print the split condition, then both subtrees
            print(str(tree.index) + ":" + str(tree.value) + "? ")
            print("R->")
            self.printTree(tree.right_tree)
            print("L->")
            self.printTree(tree.left_tree)

    '''
    Function: accuarcy
    Description: show detection result
    Input:  test_label dataType: ndarray  description: labels of test data
    Output: accuracy   dataType: float    description: detection accuracy
    '''
    def accuarcy(self, test_label):   # name kept as-is for compatibility with the example scripts
        test_label = np.expand_dims(test_label, axis=1)
        prediction = self.prediction
        accuracy = sum(prediction == test_label)/len(test_label)
        return accuracy

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /dataset/dataset1/test.txt: -------------------------------------------------------------------------------- 1 | 11188 6.649568 0.544233 2 2 | 56796 3.966325 0.850410 1 3 | 8571 1.924045 1.664782 2 4 | 4914 6.004812 0.280369 2 5 | 10784 0.000000 0.375849 2 6 | 39296 9.923018 0.092192 3 7 | 13113 2.389084 0.119284 2 8 | 70204 13.663189 0.133251 1 9 | 46813 11.434976 0.321216 3 10 | 11697 0.358270 1.292858 2 11 | 44183 9.598873 0.223524 3 12 | 2225 6.375275 0.608040 2 13 | 29066 11.580532 0.458401 3 14 | 4245 5.319324 1.598070 2 15 | 34379 4.324031 1.603481 1 16 | 44441 2.358370 1.273204 1 17 | 2022 0.000000 1.182708 2 18 | 26866 12.824376 0.890411 3 19 | 57070 1.587247 1.456982 1 20 | 32932 8.510324 1.520683 3 21 | 51967 10.428884 1.187734 3 22 | 44432 8.346618 0.042318 3 23 | 67066 7.541444 0.809226 1 24 | 17262 2.540946 1.583286 2 25 | 79728 9.473047 0.692513 1 26 | 14259 0.352284 0.474080 2 27 | 6122 0.000000 0.589826 2 28 | 76879 12.405171 0.567201 1 29 | 11426 4.126775 0.871452 2 30 | 2493 0.034087 0.335848 2 31 | 19910 1.177634 0.075106 2 32 | 10939 0.000000 0.479996 2 33 | 17716 0.994909 0.611135 2 34 | 31390 11.053664 1.180117 3 35 | 20375 0.000000 1.679729 2 36 | 26309 2.495011 1.459589 1 37 | 33484 11.516831 0.001156 3 38 | 45944 9.213215 0.797743 3 39 | 4249 5.332865 0.109288 2 40 | 6089 0.000000 1.689771 2 41 | 7513 0.000000 1.126053 2 42 | 27862 12.640062 1.690903 3 43 | 39038 2.693142 1.317518 1 44 | 19218 3.328969 0.268271 2 45 | 62911 7.193166 1.117456 1 46 | 77758 6.615512 1.521012 1 47 | 27940 8.000567 0.835341 3 48 | 2194 4.017541 0.512104 2 49 | 37072 13.245859 0.927465 3 50 | 15585 5.970616 0.813624 2 51 | 25577 11.668719 0.886902 3 52 | 8777 4.283237 1.272728 2 53 | 29016 10.742963 0.971401 3 54 | 21910 12.326672 1.592608 3 55 | 12916 0.000000 0.344622 2 56 | 10976 0.000000 0.922846 2 57 | 79065 10.602095 0.573686 1 58 | 36759 10.861859 1.155054 3 59 | 50011 1.229094 1.638690 1 60 | 1155 0.410392 1.313401 2 61 | 71600 14.552711 0.616162 1 62 | 30817 14.178043 0.616313 3 63 | 54559 14.136260 0.362388 1 64 | 29764 0.093534 1.207194 1 65 | 69100 10.929021 0.403110 1 66 | 47324 11.432919 0.825959 3 67 | 73199 9.134527 0.586846 1 68 | 44461 5.071432 1.421420 1 69 | 45617 11.460254 1.541749 3 70 | 28221 11.620039 1.103553 3 71 | 7091 4.022079 0.207307 2 72 | 6110 3.057842 1.631262 2 73 | 79016 7.782169 0.404385 1 74 | 18289 7.981741 0.929789 3 75 | 43679 4.601363 0.268326 1 76 | 22075 2.595564 1.115375 1 77 | 23535 10.049077 0.391045 3 78 | 25301 3.265444 1.572970 2 79 | 32256 11.780282 1.511014 3 80 | 36951 3.075975 0.286284 1 81 | 31290 1.795307 0.194343 1 82 | 38953 11.106979 0.202415 3 83 | 35257 5.994413 0.800021 1 84 | 25847 9.706062 1.012182 3 85 | 32680 10.582992 0.836025 3 86 | 62018 7.038266 1.458979 1 87 | 9074 0.023771 0.015314 2 88 | 33004 12.823982 0.676371 3 89 | 44588 3.617770 0.493483 1 90 | 32565 8.346684 0.253317 3 91 | 38563 6.104317 0.099207 1 92 | 75668 16.207776 0.584973 1 93 | 9069 6.401969 1.691873 2 94 | 53395 2.298696 0.559757 1 
95 | 28631 7.661515 0.055981 3 96 | 71036 6.353608 1.645301 1 97 | 71142 10.442780 0.335870 1 98 | 37653 3.834509 1.346121 1 99 | 76839 10.998587 0.584555 1 100 | 9916 2.695935 1.512111 2 101 | 38889 3.356646 0.324230 1 102 | 39075 14.677836 0.793183 3 103 | 48071 1.551934 0.130902 1 104 | 7275 2.464739 0.223502 2 105 | 41804 1.533216 1.007481 1 106 | 35665 12.473921 0.162910 3 107 | 67956 6.491596 0.032576 1 108 | 41892 10.506276 1.510747 3 109 | 38844 4.380388 0.748506 1 110 | 74197 13.670988 1.687944 1 111 | 14201 8.317599 0.390409 2 112 | 3908 0.000000 0.556245 2 113 | 2459 0.000000 0.290218 2 114 | 32027 10.095799 1.188148 3 115 | 12870 0.860695 1.482632 2 116 | 9880 1.557564 0.711278 2 117 | 72784 10.072779 0.756030 1 118 | 17521 0.000000 0.431468 2 119 | 50283 7.140817 0.883813 3 120 | 33536 11.384548 1.438307 3 121 | 9452 3.214568 1.083536 2 122 | 37457 11.720655 0.301636 3 123 | 17724 6.374475 1.475925 3 124 | 43869 5.749684 0.198875 3 125 | 264 3.871808 0.552602 2 126 | 25736 8.336309 0.636238 3 127 | 39584 9.710442 1.503735 3 128 | 31246 1.532611 1.433898 1 129 | 49567 9.785785 0.984614 3 130 | 7052 2.633627 1.097866 2 131 | 35493 9.238935 0.494701 3 132 | 10986 1.205656 1.398803 2 133 | 49508 3.124909 1.670121 1 134 | 5734 7.935489 1.585044 2 135 | 65479 12.746636 1.560352 1 136 | 77268 10.732563 0.545321 1 137 | 28490 3.977403 0.766103 1 138 | 13546 4.194426 0.450663 2 139 | 37166 9.610286 0.142912 3 140 | 16381 4.797555 1.260455 2 141 | 10848 1.615279 0.093002 2 142 | 35405 4.614771 1.027105 1 143 | 15917 0.000000 1.369726 2 144 | 6131 0.608457 0.512220 2 145 | 67432 6.558239 0.667579 1 146 | 30354 12.315116 0.197068 3 147 | 69696 7.014973 1.494616 1 148 | 33481 8.822304 1.194177 3 149 | 43075 10.086796 0.570455 3 150 | 38343 7.241614 1.661627 3 151 | 14318 4.602395 1.511768 2 152 | 5367 7.434921 0.079792 2 153 | 37894 10.467570 1.595418 3 154 | 36172 9.948127 0.003663 3 155 | 40123 2.478529 1.568987 1 156 | 10976 5.938545 0.878540 2 157 | 12705 0.000000 0.948004 2 158 | 12495 5.559181 1.357926 2 159 | 35681 9.776654 0.535966 3 160 | 46202 3.092056 0.490906 1 161 | 11505 0.000000 1.623311 2 162 | 22834 4.459495 0.538867 1 163 | 49901 8.334306 1.646600 3 164 | 71932 11.226654 0.384686 1 165 | 13279 3.904737 1.597294 2 166 | 49112 7.038205 1.211329 3 167 | 77129 9.836120 1.054340 1 168 | 37447 1.990976 0.378081 1 169 | 62397 9.005302 0.485385 1 170 | 0 1.772510 1.039873 2 171 | 15476 0.458674 0.819560 2 172 | 40625 10.003919 0.231658 3 173 | 36706 0.520807 1.476008 1 174 | 28580 10.678214 1.431837 3 175 | 25862 4.425992 1.363842 1 176 | 63488 12.035355 0.831222 1 177 | 33944 10.606732 1.253858 3 178 | 30099 1.568653 0.684264 1 179 | 13725 2.545434 0.024271 2 180 | 36768 10.264062 0.982593 3 181 | 64656 9.866276 0.685218 1 182 | 14927 0.142704 0.057455 2 183 | 43231 9.853270 1.521432 3 184 | 66087 6.596604 1.653574 1 185 | 19806 2.602287 1.321481 2 186 | 41081 10.411776 0.664168 3 187 | 10277 7.083449 0.622589 2 188 | 7014 2.080068 1.254441 2 189 | 17275 0.522844 1.622458 2 190 | 31600 10.362000 1.544827 3 191 | 59956 3.412967 1.035410 1 192 | 42181 6.796548 1.112153 3 193 | 51743 4.092035 0.075804 1 194 | 5194 2.763811 1.564325 2 195 | 30832 12.547439 1.402443 3 196 | 7976 5.708052 1.596152 2 197 | 14602 4.558025 0.375806 2 198 | 41571 11.642307 0.438553 3 199 | 55028 3.222443 0.121399 1 200 | 5837 4.736156 0.029871 2 201 | 39808 10.839526 0.836323 3 202 | 20944 4.194791 0.235483 2 203 | 22146 14.936259 0.888582 3 204 | 42169 3.310699 1.521855 1 205 | 7010 2.971931 0.034321 2 
206 | 3807 9.261667 0.537807 2 207 | 29241 7.791833 1.111416 3 208 | 52696 1.480470 1.028750 1 209 | 42545 3.677287 0.244167 1 210 | 24437 2.202967 1.370399 1 211 | 16037 5.796735 0.935893 2 212 | 8493 3.063333 0.144089 2 213 | 68080 11.233094 0.492487 1 214 | 59016 1.965570 0.005697 1 215 | 11810 8.616719 0.137419 2 216 | 68630 6.609989 1.083505 1 217 | 7629 1.712639 1.086297 2 218 | 71992 10.117445 1.299319 1 219 | 13398 0.000000 1.104178 2 220 | 26241 9.824777 1.346821 3 221 | 11160 1.653089 0.980949 2 222 | 76701 18.178822 1.473671 1 223 | 32174 6.781126 0.885340 3 224 | 45043 8.206750 1.549223 3 225 | 42173 10.081853 1.376745 3 226 | 69801 6.288742 0.112799 1 227 | 41737 3.695937 1.543589 1 228 | 46979 6.726151 1.069380 3 229 | 79267 12.969999 1.568223 1 230 | 4615 2.661390 1.531933 2 231 | 32907 7.072764 1.117386 3 232 | 37444 9.123366 1.318988 3 233 | 569 3.743946 1.039546 2 234 | 8723 2.341300 0.219361 2 235 | 6024 0.541913 0.592348 2 236 | 52252 2.310828 1.436753 1 237 | 8358 6.226597 1.427316 2 238 | 26166 7.277876 0.489252 3 239 | 18471 0.000000 0.389459 2 240 | 3386 7.218221 1.098828 2 241 | 41544 8.777129 1.111464 3 242 | 10480 2.813428 0.819419 2 243 | 5894 2.268766 1.412130 2 244 | 7273 6.283627 0.571292 2 245 | 22272 7.520081 1.626868 3 246 | 31369 11.739225 0.027138 3 247 | 10708 3.746883 0.877350 2 248 | 69364 12.089835 0.521631 1 249 | 37760 12.310404 0.259339 3 250 | 13004 0.000000 0.671355 2 251 | 37885 2.728800 0.331502 1 252 | 52555 10.814342 0.607652 3 253 | 38997 12.170268 0.844205 3 254 | 69698 6.698371 0.240084 1 255 | 11783 3.632672 1.643479 2 256 | 47636 10.059991 0.892361 3 257 | 15744 1.887674 0.756162 2 258 | 69058 8.229125 0.195886 1 259 | 33057 7.817082 0.476102 3 260 | 28681 12.277230 0.076805 3 261 | 34042 10.055337 1.115778 3 262 | 29928 3.596002 1.485952 1 263 | 9734 2.755530 1.420655 2 264 | 7344 7.780991 0.513048 2 265 | 7387 0.093705 0.391834 2 266 | 33957 8.481567 0.520078 3 267 | 9936 3.865584 0.110062 2 268 | 36094 9.683709 0.779984 3 269 | 39835 10.617255 1.359970 3 270 | 64486 7.203216 1.624762 1 271 | 0 7.601414 1.215605 2 272 | 39539 1.386107 1.417070 1 273 | 66972 9.129253 0.594089 1 274 | 15029 1.363447 0.620841 2 275 | 44909 3.181399 0.359329 1 276 | 38183 13.365414 0.217011 3 277 | 37372 4.207717 1.289767 1 278 | 0 4.088395 0.870075 2 279 | 17786 3.327371 1.142505 2 280 | 39055 1.303323 1.235650 1 281 | 37045 7.999279 1.581763 3 282 | 6435 2.217488 0.864536 2 283 | 72265 7.751808 0.192451 1 284 | 28152 14.149305 1.591532 3 285 | 25931 8.765721 0.152808 3 286 | 7538 3.408996 0.184896 2 287 | 1315 1.251021 0.112340 2 288 | 12292 6.160619 1.537165 2 289 | 49248 1.034538 1.585162 1 290 | 9025 0.000000 1.034635 2 291 | 13438 2.355051 0.542603 2 292 | 69683 6.614543 0.153771 1 293 | 25374 10.245062 1.450903 3 294 | 55264 3.467074 1.231019 1 295 | 38324 7.487678 1.572293 3 296 | 69643 4.624115 1.185192 1 297 | 44058 8.995957 1.436479 3 298 | 41316 11.564476 0.007195 3 299 | 29119 3.440948 0.078331 1 300 | 51656 1.673603 0.732746 1 301 | 3030 4.719341 0.699755 2 302 | 35695 10.304798 1.576488 3 303 | 1537 2.086915 1.199312 2 304 | 9083 6.338220 1.131305 2 305 | 47744 8.254926 0.710694 3 306 | 71372 16.067108 0.974142 1 307 | 37980 1.723201 0.310488 1 308 | 42385 3.785045 0.876904 1 309 | 22687 2.557561 0.123738 1 310 | 39512 9.852220 1.095171 3 311 | 11885 3.679147 1.557205 2 312 | 4944 9.789681 0.852971 2 313 | 73230 14.958998 0.526707 1 314 | 17585 11.182148 1.288459 3 315 | 68737 7.528533 1.657487 1 316 | 13818 5.253802 1.378603 2 317 | 31662 
13.946752 1.426657 3 318 | 86686 15.557263 1.430029 1 319 | 43214 12.483550 0.688513 3 320 | 24091 2.317302 1.411137 1 321 | 52544 10.069724 0.766119 3 322 | 61861 5.792231 1.615483 1 323 | 47903 4.138435 0.475994 1 324 | 37190 12.929517 0.304378 3 325 | 6013 9.378238 0.307392 2 326 | 27223 8.361362 1.643204 3 327 | 69027 7.939406 1.325042 1 328 | 78642 10.735384 0.705788 1 329 | 30254 11.592723 0.286188 3 330 | 21704 10.098356 0.704748 3 331 | 34985 9.299025 0.545337 3 332 | 31316 11.158297 0.218067 3 333 | 76368 16.143900 0.558388 1 334 | 27953 10.971700 1.221787 3 335 | 152 0.000000 0.681478 2 336 | 9146 3.178961 1.292692 2 337 | 75346 17.625350 0.339926 1 338 | 26376 1.995833 0.267826 1 339 | 35255 10.640467 0.416181 3 340 | 19198 9.628339 0.985462 3 341 | 12518 4.662664 0.495403 2 342 | 25453 5.754047 1.382742 2 343 | 12530 0.000000 0.037146 2 344 | 62230 9.334332 0.198118 1 345 | 9517 3.846162 0.619968 2 346 | 71161 10.685084 0.678179 1 347 | 1593 4.752134 0.359205 2 348 | 33794 0.697630 0.966786 1 349 | 39710 10.365836 0.505898 3 350 | 16941 0.461478 0.352865 2 351 | 69209 11.339537 1.068740 1 352 | 4446 5.420280 0.127310 2 353 | 9347 3.469955 1.619947 2 354 | 55635 8.517067 0.994858 3 355 | 65889 8.306512 0.413690 1 356 | 10753 2.628690 0.444320 2 357 | 7055 0.000000 0.802985 2 358 | 7905 0.000000 1.170397 2 359 | 53447 7.298767 1.582346 3 360 | 9194 7.331319 1.277988 2 361 | 61914 9.392269 0.151617 1 362 | 15630 5.541201 1.180596 2 363 | 79194 15.149460 0.537540 1 364 | 12268 5.515189 0.250562 2 365 | 33682 7.728898 0.920494 3 366 | 26080 11.318785 1.510979 3 367 | 19119 3.574709 1.531514 2 368 | 30902 7.350965 0.026332 3 369 | 63039 7.122363 1.630177 1 370 | 51136 1.828412 1.013702 1 371 | 35262 10.117989 1.156862 3 372 | 42776 11.309897 0.086291 3 373 | 64191 8.342034 1.388569 1 374 | 15436 0.241714 0.715577 2 375 | 14402 10.482619 1.694972 2 376 | 6341 9.289510 1.428879 2 377 | 14113 4.269419 0.134181 2 378 | 6390 0.000000 0.189456 2 379 | 8794 0.817119 0.143668 2 380 | 43432 1.508394 0.652651 1 381 | 38334 9.359918 0.052262 3 382 | 34068 10.052333 0.550423 3 383 | 30819 11.111660 0.989159 3 384 | 22239 11.265971 0.724054 3 385 | 28725 10.383830 0.254836 3 386 | 57071 3.878569 1.377983 1 387 | 72420 13.679237 0.025346 1 388 | 28294 10.526846 0.781569 3 389 | 9896 0.000000 0.924198 2 390 | 65821 4.106727 1.085669 1 391 | 7645 8.118856 1.470686 2 392 | 71289 7.796874 0.052336 1 393 | 5128 2.789669 1.093070 2 394 | 13711 6.226962 0.287251 2 395 | 22240 10.169548 1.660104 3 396 | 15092 0.000000 1.370549 2 397 | 5017 7.513353 0.137348 2 398 | 10141 8.240793 0.099735 2 399 | 35570 14.612797 1.247390 3 400 | 46893 3.562976 0.445386 1 401 | 8178 3.230482 1.331698 2 402 | 55783 3.612548 1.551911 1 403 | 1148 0.000000 0.332365 2 404 | 10062 3.931299 0.487577 2 405 | 74124 14.752342 1.155160 1 406 | 66603 10.261887 1.628085 1 407 | 11893 2.787266 1.570402 2 408 | 50908 15.112319 1.324132 3 409 | 39891 5.184553 0.223382 3 410 | 65915 3.868359 0.128078 1 411 | 65678 3.507965 0.028904 1 412 | 62996 11.019254 0.427554 1 413 | 36851 3.812387 0.655245 1 414 | 36669 11.056784 0.378725 3 415 | 38876 8.826880 1.002328 3 416 | 26878 11.173861 1.478244 3 417 | 46246 11.506465 0.421993 3 418 | 12761 7.798138 0.147917 3 419 | 35282 10.155081 1.370039 3 420 | 68306 10.645275 0.693453 1 421 | 31262 9.663200 1.521541 3 422 | 34754 10.790404 1.312679 3 423 | 13408 2.810534 0.219962 2 424 | 30365 9.825999 1.388500 3 425 | 10709 1.421316 0.677603 2 426 | 24332 11.123219 0.809107 3 427 | 45517 13.402206 
0.661524 3 428 | 6178 1.212255 0.836807 2 429 | 10639 1.568446 1.297469 2 430 | 29613 3.343473 1.312266 1 431 | 22392 5.400155 0.193494 1 432 | 51126 3.818754 0.590905 1 433 | 53644 7.973845 0.307364 3 434 | 51417 9.078824 0.734876 3 435 | 24859 0.153467 0.766619 1 436 | 61732 8.325167 0.028479 1 437 | 71128 7.092089 1.216733 1 438 | 27276 5.192485 1.094409 3 439 | 30453 10.340791 1.087721 3 440 | 18670 2.077169 1.019775 2 441 | 70600 10.151966 0.993105 1 442 | 12683 0.046826 0.809614 2 443 | 81597 11.221874 1.395015 1 444 | 69959 14.497963 1.019254 1 445 | 8124 3.554508 0.533462 2 446 | 18867 3.522673 0.086725 2 447 | 80886 14.531655 0.380172 1 448 | 55895 3.027528 0.885457 1 449 | 31587 1.845967 0.488985 1 450 | 10591 10.226164 0.804403 3 451 | 70096 10.965926 1.212328 1 452 | 53151 2.129921 1.477378 1 453 | 11992 0.000000 1.606849 2 454 | 33114 9.489005 0.827814 3 455 | 7413 0.000000 1.020797 2 456 | 10583 0.000000 1.270167 2 457 | 58668 6.556676 0.055183 1 458 | 35018 9.959588 0.060020 3 459 | 70843 7.436056 1.479856 1 460 | 14011 0.404888 0.459517 2 461 | 35015 9.952942 1.650279 3 462 | 70839 15.600252 0.021935 1 463 | 3024 2.723846 0.387455 2 464 | 5526 0.513866 1.323448 2 465 | 5113 0.000000 0.861859 2 466 | 20851 7.280602 1.438470 2 467 | 40999 9.161978 1.110180 3 468 | 15823 0.991725 0.730979 2 469 | 35432 7.398380 0.684218 3 470 | 53711 12.149747 1.389088 3 471 | 64371 9.149678 0.874905 1 472 | 9289 9.666576 1.370330 2 473 | 60613 3.620110 0.287767 1 474 | 18338 5.238800 1.253646 2 475 | 22845 14.715782 1.503758 3 476 | 74676 14.445740 1.211160 1 477 | 34143 13.609528 0.364240 3 478 | 14153 3.141585 0.424280 2 479 | 9327 0.000000 0.120947 2 480 | 18991 0.454750 1.033280 2 481 | 9193 0.510310 0.016395 2 482 | 2285 3.864171 0.616349 2 483 | 9493 6.724021 0.563044 2 484 | 2371 4.289375 0.012563 2 485 | 13963 0.000000 1.437030 2 486 | 2299 3.733617 0.698269 2 487 | 5262 2.002589 1.380184 2 488 | 4659 2.502627 0.184223 2 489 | 17582 6.382129 0.876581 2 490 | 27750 8.546741 0.128706 3 491 | 9868 2.694977 0.432818 2 492 | 18333 3.951256 0.333300 2 493 | 3780 9.856183 0.329181 2 494 | 18190 2.068962 0.429927 2 495 | 11145 3.410627 0.631838 2 496 | 68846 9.974715 0.669787 1 497 | 26575 10.650102 0.866627 3 498 | 48111 9.134528 0.728045 3 499 | 43757 7.882601 1.332446 3 500 | 27884 8.855312 0.570684 3 -------------------------------------------------------------------------------- /dataset/dataset1/train.txt: -------------------------------------------------------------------------------- 1 | 40920 8.326976 0.953952 3 2 | 14488 7.153469 1.673904 2 3 | 26052 1.441871 0.805124 1 4 | 75136 13.147394 0.428964 1 5 | 38344 1.669788 0.134296 1 6 | 72993 10.141740 1.032955 1 7 | 35948 6.830792 1.213192 3 8 | 42666 13.276369 0.543880 3 9 | 67497 8.631577 0.749278 1 10 | 35483 12.273169 1.508053 3 11 | 50242 3.723498 0.831917 1 12 | 63275 8.385879 1.669485 1 13 | 5569 4.875435 0.728658 2 14 | 51052 4.680098 0.625224 1 15 | 77372 15.299570 0.331351 1 16 | 43673 1.889461 0.191283 1 17 | 61364 7.516754 1.269164 1 18 | 69673 14.239195 0.261333 1 19 | 15669 0.000000 1.250185 2 20 | 28488 10.528555 1.304844 3 21 | 6487 3.540265 0.822483 2 22 | 37708 2.991551 0.833920 1 23 | 22620 5.297865 0.638306 2 24 | 28782 6.593803 0.187108 3 25 | 19739 2.816760 1.686209 2 26 | 36788 12.458258 0.649617 3 27 | 5741 0.000000 1.656418 2 28 | 28567 9.968648 0.731232 3 29 | 6808 1.364838 0.640103 2 30 | 41611 0.230453 1.151996 1 31 | 36661 11.865402 0.882810 3 32 | 43605 0.120460 1.352013 1 33 | 15360 8.545204 1.340429 3 
34 | 63796 5.856649 0.160006 1 35 | 10743 9.665618 0.778626 2 36 | 70808 9.778763 1.084103 1 37 | 72011 4.932976 0.632026 1 38 | 5914 2.216246 0.587095 2 39 | 14851 14.305636 0.632317 3 40 | 33553 12.591889 0.686581 3 41 | 44952 3.424649 1.004504 1 42 | 17934 0.000000 0.147573 2 43 | 27738 8.533823 0.205324 3 44 | 29290 9.829528 0.238620 3 45 | 42330 11.492186 0.263499 3 46 | 36429 3.570968 0.832254 1 47 | 39623 1.771228 0.207612 1 48 | 32404 3.513921 0.991854 1 49 | 27268 4.398172 0.975024 1 50 | 5477 4.276823 1.174874 2 51 | 14254 5.946014 1.614244 2 52 | 68613 13.798970 0.724375 1 53 | 41539 10.393591 1.663724 3 54 | 7917 3.007577 0.297302 2 55 | 21331 1.031938 0.486174 2 56 | 8338 4.751212 0.064693 2 57 | 5176 3.692269 1.655113 2 58 | 18983 10.448091 0.267652 3 59 | 68837 10.585786 0.329557 1 60 | 13438 1.604501 0.069064 2 61 | 48849 3.679497 0.961466 1 62 | 12285 3.795146 0.696694 2 63 | 7826 2.531885 1.659173 2 64 | 5565 9.733340 0.977746 2 65 | 10346 6.093067 1.413798 2 66 | 1823 7.712960 1.054927 2 67 | 9744 11.470364 0.760461 3 68 | 16857 2.886529 0.934416 2 69 | 39336 10.054373 1.138351 3 70 | 65230 9.972470 0.881876 1 71 | 2463 2.335785 1.366145 2 72 | 27353 11.375155 1.528626 3 73 | 16191 0.000000 0.605619 2 74 | 12258 4.126787 0.357501 2 75 | 42377 6.319522 1.058602 1 76 | 25607 8.680527 0.086955 3 77 | 77450 14.856391 1.129823 1 78 | 58732 2.454285 0.222380 1 79 | 46426 7.292202 0.548607 3 80 | 32688 8.745137 0.857348 3 81 | 64890 8.579001 0.683048 1 82 | 8554 2.507302 0.869177 2 83 | 28861 11.415476 1.505466 3 84 | 42050 4.838540 1.680892 1 85 | 32193 10.339507 0.583646 3 86 | 64895 6.573742 1.151433 1 87 | 2355 6.539397 0.462065 2 88 | 0 2.209159 0.723567 2 89 | 70406 11.196378 0.836326 1 90 | 57399 4.229595 0.128253 1 91 | 41732 9.505944 0.005273 3 92 | 11429 8.652725 1.348934 3 93 | 75270 17.101108 0.490712 1 94 | 5459 7.871839 0.717662 2 95 | 73520 8.262131 1.361646 1 96 | 40279 9.015635 1.658555 3 97 | 21540 9.215351 0.806762 3 98 | 17694 6.375007 0.033678 2 99 | 22329 2.262014 1.022169 1 100 | 46570 5.677110 0.709469 1 101 | 42403 11.293017 0.207976 3 102 | 33654 6.590043 1.353117 1 103 | 9171 4.711960 0.194167 2 104 | 28122 8.768099 1.108041 3 105 | 34095 11.502519 0.545097 3 106 | 1774 4.682812 0.578112 2 107 | 40131 12.446578 0.300754 3 108 | 13994 12.908384 1.657722 3 109 | 77064 12.601108 0.974527 1 110 | 11210 3.929456 0.025466 2 111 | 6122 9.751503 1.182050 3 112 | 15341 3.043767 0.888168 2 113 | 44373 4.391522 0.807100 1 114 | 28454 11.695276 0.679015 3 115 | 63771 7.879742 0.154263 1 116 | 9217 5.613163 0.933632 2 117 | 69076 9.140172 0.851300 1 118 | 24489 4.258644 0.206892 1 119 | 16871 6.799831 1.221171 2 120 | 39776 8.752758 0.484418 3 121 | 5901 1.123033 1.180352 2 122 | 40987 10.833248 1.585426 3 123 | 7479 3.051618 0.026781 2 124 | 38768 5.308409 0.030683 3 125 | 4933 1.841792 0.028099 2 126 | 32311 2.261978 1.605603 1 127 | 26501 11.573696 1.061347 3 128 | 37433 8.038764 1.083910 3 129 | 23503 10.734007 0.103715 3 130 | 68607 9.661909 0.350772 1 131 | 27742 9.005850 0.548737 3 132 | 11303 0.000000 0.539131 2 133 | 0 5.757140 1.062373 2 134 | 32729 9.164656 1.624565 3 135 | 24619 1.318340 1.436243 1 136 | 42414 14.075597 0.695934 3 137 | 20210 10.107550 1.308398 3 138 | 33225 7.960293 1.219760 3 139 | 54483 6.317292 0.018209 1 140 | 18475 12.664194 0.595653 3 141 | 33926 2.906644 0.581657 1 142 | 43865 2.388241 0.913938 1 143 | 26547 6.024471 0.486215 3 144 | 44404 7.226764 1.255329 3 145 | 16674 4.183997 1.275290 2 146 | 8123 11.850211 1.096981 3 147 | 
42747 11.661797 1.167935 3 148 | 56054 3.574967 0.494666 1 149 | 10933 0.000000 0.107475 2 150 | 18121 7.937657 0.904799 3 151 | 11272 3.365027 1.014085 2 152 | 16297 0.000000 0.367491 2 153 | 28168 13.860672 1.293270 3 154 | 40963 10.306714 1.211594 3 155 | 31685 7.228002 0.670670 3 156 | 55164 4.508740 1.036192 1 157 | 17595 0.366328 0.163652 2 158 | 1862 3.299444 0.575152 2 159 | 57087 0.573287 0.607915 1 160 | 63082 9.183738 0.012280 1 161 | 51213 7.842646 1.060636 3 162 | 6487 4.750964 0.558240 2 163 | 4805 11.438702 1.556334 3 164 | 30302 8.243063 1.122768 3 165 | 68680 7.949017 0.271865 1 166 | 17591 7.875477 0.227085 2 167 | 74391 9.569087 0.364856 1 168 | 37217 7.750103 0.869094 3 169 | 42814 0.000000 1.515293 1 170 | 14738 3.396030 0.633977 2 171 | 19896 11.916091 0.025294 3 172 | 14673 0.460758 0.689586 2 173 | 32011 13.087566 0.476002 3 174 | 58736 4.589016 1.672600 1 175 | 54744 8.397217 1.534103 1 176 | 29482 5.562772 1.689388 1 177 | 27698 10.905159 0.619091 3 178 | 11443 1.311441 1.169887 2 179 | 56117 10.647170 0.980141 3 180 | 39514 0.000000 0.481918 1 181 | 26627 8.503025 0.830861 3 182 | 16525 0.436880 1.395314 2 183 | 24368 6.127867 1.102179 1 184 | 22160 12.112492 0.359680 3 185 | 6030 1.264968 1.141582 2 186 | 6468 6.067568 1.327047 2 187 | 22945 8.010964 1.681648 3 188 | 18520 3.791084 0.304072 2 189 | 34914 11.773195 1.262621 3 190 | 6121 8.339588 1.443357 2 191 | 38063 2.563092 1.464013 1 192 | 23410 5.954216 0.953782 1 193 | 35073 9.288374 0.767318 3 194 | 52914 3.976796 1.043109 1 195 | 16801 8.585227 1.455708 3 196 | 9533 1.271946 0.796506 2 197 | 16721 0.000000 0.242778 2 198 | 5832 0.000000 0.089749 2 199 | 44591 11.521298 0.300860 3 200 | 10143 1.139447 0.415373 2 201 | 21609 5.699090 1.391892 2 202 | 23817 2.449378 1.322560 1 203 | 15640 0.000000 1.228380 2 204 | 8847 3.168365 0.053993 2 205 | 50939 10.428610 1.126257 3 206 | 28521 2.943070 1.446816 1 207 | 32901 10.441348 0.975283 3 208 | 42850 12.478764 1.628726 3 209 | 13499 5.856902 0.363883 2 210 | 40345 2.476420 0.096075 1 211 | 43547 1.826637 0.811457 1 212 | 70758 4.324451 0.328235 1 213 | 19780 1.376085 1.178359 2 214 | 44484 5.342462 0.394527 1 215 | 54462 11.835521 0.693301 3 216 | 20085 12.423687 1.424264 3 217 | 42291 12.161273 0.071131 3 218 | 47550 8.148360 1.649194 3 219 | 11938 1.531067 1.549756 2 220 | 40699 3.200912 0.309679 1 221 | 70908 8.862691 0.530506 1 222 | 73989 6.370551 0.369350 1 223 | 11872 2.468841 0.145060 2 224 | 48463 11.054212 0.141508 3 225 | 15987 2.037080 0.715243 2 226 | 70036 13.364030 0.549972 1 227 | 32967 10.249135 0.192735 3 228 | 63249 10.464252 1.669767 1 229 | 42795 9.424574 0.013725 3 230 | 14459 4.458902 0.268444 2 231 | 19973 0.000000 0.575976 2 232 | 5494 9.686082 1.029808 3 233 | 67902 13.649402 1.052618 1 234 | 25621 13.181148 0.273014 3 235 | 27545 3.877472 0.401600 1 236 | 58656 1.413952 0.451380 1 237 | 7327 4.248986 1.430249 2 238 | 64555 8.779183 0.845947 1 239 | 8998 4.156252 0.097109 2 240 | 11752 5.580018 0.158401 2 241 | 76319 15.040440 1.366898 1 242 | 27665 12.793870 1.307323 3 243 | 67417 3.254877 0.669546 1 244 | 21808 10.725607 0.588588 3 245 | 15326 8.256473 0.765891 2 246 | 20057 8.033892 1.618562 3 247 | 79341 10.702532 0.204792 1 248 | 15636 5.062996 1.132555 2 249 | 35602 10.772286 0.668721 3 250 | 28544 1.892354 0.837028 1 251 | 57663 1.019966 0.372320 1 252 | 78727 15.546043 0.729742 1 253 | 68255 11.638205 0.409125 1 254 | 14964 3.427886 0.975616 2 255 | 21835 11.246174 1.475586 3 256 | 7487 0.000000 0.645045 2 257 | 8700 0.000000 
1.424017 2 258 | 26226 8.242553 0.279069 3 259 | 65899 8.700060 0.101807 1 260 | 6543 0.812344 0.260334 2 261 | 46556 2.448235 1.176829 1 262 | 71038 13.230078 0.616147 1 263 | 47657 0.236133 0.340840 1 264 | 19600 11.155826 0.335131 3 265 | 37422 11.029636 0.505769 3 266 | 1363 2.901181 1.646633 2 267 | 26535 3.924594 1.143120 1 268 | 47707 2.524806 1.292848 1 269 | 38055 3.527474 1.449158 1 270 | 6286 3.384281 0.889268 2 271 | 10747 0.000000 1.107592 2 272 | 44883 11.898890 0.406441 3 273 | 56823 3.529892 1.375844 1 274 | 68086 11.442677 0.696919 1 275 | 70242 10.308145 0.422722 1 276 | 11409 8.540529 0.727373 2 277 | 67671 7.156949 1.691682 1 278 | 61238 0.720675 0.847574 1 279 | 17774 0.229405 1.038603 2 280 | 53376 3.399331 0.077501 1 281 | 30930 6.157239 0.580133 1 282 | 28987 1.239698 0.719989 1 283 | 13655 6.036854 0.016548 2 284 | 7227 5.258665 0.933722 2 285 | 40409 12.393001 1.571281 3 286 | 13605 9.627613 0.935842 2 287 | 26400 11.130453 0.597610 3 288 | 13491 8.842595 0.349768 3 289 | 30232 10.690010 1.456595 3 290 | 43253 5.714718 1.674780 3 291 | 55536 3.052505 1.335804 1 292 | 8807 0.000000 0.059025 2 293 | 25783 9.945307 1.287952 3 294 | 22812 2.719723 1.142148 1 295 | 77826 11.154055 1.608486 1 296 | 38172 2.687918 0.660836 1 297 | 31676 10.037847 0.962245 3 298 | 74038 12.404762 1.112080 1 299 | 44738 10.237305 0.633422 3 300 | 17410 4.745392 0.662520 2 301 | 5688 4.639461 1.569431 2 302 | 36642 3.149310 0.639669 1 303 | 29956 13.406875 1.639194 3 304 | 60350 6.068668 0.881241 1 305 | 23758 9.477022 0.899002 3 306 | 25780 3.897620 0.560201 2 307 | 11342 5.463615 1.203677 2 308 | 36109 3.369267 1.575043 1 309 | 14292 5.234562 0.825954 2 310 | 11160 0.000000 0.722170 2 311 | 23762 12.979069 0.504068 3 312 | 39567 5.376564 0.557476 1 313 | 25647 13.527910 1.586732 3 314 | 14814 2.196889 0.784587 2 315 | 73590 10.691748 0.007509 1 316 | 35187 1.659242 0.447066 1 317 | 49459 8.369667 0.656697 3 318 | 31657 13.157197 0.143248 3 319 | 6259 8.199667 0.908508 2 320 | 33101 4.441669 0.439381 3 321 | 27107 9.846492 0.644523 3 322 | 17824 0.019540 0.977949 2 323 | 43536 8.253774 0.748700 3 324 | 67705 6.038620 1.509646 1 325 | 35283 6.091587 1.694641 3 326 | 71308 8.986820 1.225165 1 327 | 31054 11.508473 1.624296 3 328 | 52387 8.807734 0.713922 3 329 | 40328 0.000000 0.816676 1 330 | 34844 8.889202 1.665414 3 331 | 11607 3.178117 0.542752 2 332 | 64306 7.013795 0.139909 1 333 | 32721 9.605014 0.065254 3 334 | 33170 1.230540 1.331674 1 335 | 37192 10.412811 0.890803 3 336 | 13089 0.000000 0.567161 2 337 | 66491 9.699991 0.122011 1 338 | 15941 0.000000 0.061191 2 339 | 4272 4.455293 0.272135 2 340 | 48812 3.020977 1.502803 1 341 | 28818 8.099278 0.216317 3 342 | 35394 1.157764 1.603217 1 343 | 71791 10.105396 0.121067 1 344 | 40668 11.230148 0.408603 3 345 | 39580 9.070058 0.011379 3 346 | 11786 0.566460 0.478837 2 347 | 19251 0.000000 0.487300 2 348 | 56594 8.956369 1.193484 3 349 | 54495 1.523057 0.620528 1 350 | 11844 2.749006 0.169855 2 351 | 45465 9.235393 0.188350 3 352 | 31033 10.555573 0.403927 3 353 | 16633 6.956372 1.519308 2 354 | 13887 0.636281 1.273984 2 355 | 52603 3.574737 0.075163 1 356 | 72000 9.032486 1.461809 1 357 | 68497 5.958993 0.023012 1 358 | 35135 2.435300 1.211744 1 359 | 26397 10.539731 1.638248 3 360 | 7313 7.646702 0.056513 2 361 | 91273 20.919349 0.644571 1 362 | 24743 1.424726 0.838447 1 363 | 31690 6.748663 0.890223 3 364 | 15432 2.289167 0.114881 2 365 | 58394 5.548377 0.402238 1 366 | 33962 6.057227 0.432666 1 367 | 31442 10.828595 0.559955 3 368 | 
31044 11.318160 0.271094 3 369 | 29938 13.265311 0.633903 3 370 | 9875 0.000000 1.496715 2 371 | 51542 6.517133 0.402519 3 372 | 11878 4.934374 1.520028 2 373 | 69241 10.151738 0.896433 1 374 | 37776 2.425781 1.559467 1 375 | 68997 9.778962 1.195498 1 376 | 67416 12.219950 0.657677 1 377 | 59225 7.394151 0.954434 1 378 | 29138 8.518535 0.742546 3 379 | 5962 2.798700 0.662632 2 380 | 10847 0.637930 0.617373 2 381 | 70527 10.750490 0.097415 1 382 | 9610 0.625382 0.140969 2 383 | 64734 10.027968 0.282787 1 384 | 25941 9.817347 0.364197 3 385 | 2763 0.646828 1.266069 2 386 | 55601 3.347111 0.914294 1 387 | 31128 11.816892 0.193798 3 388 | 5181 0.000000 1.480198 2 389 | 69982 10.945666 0.993219 1 390 | 52440 10.244706 0.280539 3 391 | 57350 2.579801 1.149172 1 392 | 57869 2.630410 0.098869 1 393 | 56557 11.746200 1.695517 3 394 | 42342 8.104232 1.326277 3 395 | 15560 12.409743 0.790295 3 396 | 34826 12.167844 1.328086 3 397 | 8569 3.198408 0.299287 2 398 | 77623 16.055513 0.541052 1 399 | 78184 7.138659 0.158481 1 400 | 7036 4.831041 0.761419 2 401 | 69616 10.082890 1.373611 1 402 | 21546 10.066867 0.788470 3 403 | 36715 8.129538 0.329913 3 404 | 20522 3.012463 1.138108 2 405 | 42349 3.720391 0.845974 1 406 | 9037 0.773493 1.148256 2 407 | 26728 10.962941 1.037324 3 408 | 587 0.177621 0.162614 2 409 | 48915 3.085853 0.967899 1 410 | 9824 8.426781 0.202558 2 411 | 4135 1.825927 1.128347 2 412 | 9666 2.185155 1.010173 2 413 | 59333 7.184595 1.261338 1 414 | 36198 0.000000 0.116525 1 415 | 34909 8.901752 1.033527 3 416 | 47516 2.451497 1.358795 1 417 | 55807 3.213631 0.432044 1 418 | 14036 3.974739 0.723929 2 419 | 42856 9.601306 0.619232 3 420 | 64007 8.363897 0.445341 1 421 | 59428 6.381484 1.365019 1 422 | 13730 0.000000 1.403914 2 423 | 41740 9.609836 1.438105 3 424 | 63546 9.904741 0.985862 1 425 | 30417 7.185807 1.489102 3 426 | 69636 5.466703 1.216571 1 427 | 64660 0.000000 0.915898 1 428 | 14883 4.575443 0.535671 2 429 | 7965 3.277076 1.010868 2 430 | 68620 10.246623 1.239634 1 431 | 8738 2.341735 1.060235 2 432 | 7544 3.201046 0.498843 2 433 | 6377 6.066013 0.120927 2 434 | 36842 8.829379 0.895657 3 435 | 81046 15.833048 1.568245 1 436 | 67736 13.516711 1.220153 1 437 | 32492 0.664284 1.116755 1 438 | 39299 6.325139 0.605109 3 439 | 77289 8.677499 0.344373 1 440 | 33835 8.188005 0.964896 3 441 | 71890 9.414263 0.384030 1 442 | 32054 9.196547 1.138253 3 443 | 38579 10.202968 0.452363 3 444 | 55984 2.119439 1.481661 1 445 | 72694 13.635078 0.858314 1 446 | 42299 0.083443 0.701669 1 447 | 26635 9.149096 1.051446 3 448 | 8579 1.933803 1.374388 2 449 | 37302 14.115544 0.676198 3 450 | 22878 8.933736 0.943352 3 451 | 4364 2.661254 0.946117 2 452 | 4985 0.988432 1.305027 2 453 | 37068 2.063741 1.125946 1 454 | 41137 2.220590 0.690754 1 455 | 67759 6.424849 0.806641 1 456 | 11831 1.156153 1.613674 2 457 | 34502 3.032720 0.601847 1 458 | 4088 3.076828 0.952089 2 459 | 15199 0.000000 0.318105 2 460 | 17309 7.750480 0.554015 3 461 | 42816 10.958135 1.482500 3 462 | 43751 10.222018 0.488678 3 463 | 58335 2.367988 0.435741 1 464 | 75039 7.686054 1.381455 1 465 | 42878 11.464879 1.481589 3 466 | 42770 11.075735 0.089726 3 467 | 8848 3.543989 0.345853 2 468 | 31340 8.123889 1.282880 3 469 | 41413 4.331769 0.754467 3 470 | 12731 0.120865 1.211961 2 471 | 22447 6.116109 0.701523 3 472 | 33564 7.474534 0.505790 3 473 | 48907 8.819454 0.649292 3 474 | 8762 6.802144 0.615284 2 475 | 46696 12.666325 0.931960 3 476 | 36851 8.636180 0.399333 3 477 | 67639 11.730991 1.289833 1 478 | 171 8.132449 0.039062 2 479 | 
26674 10.296589 1.496144 3 480 | 8739 7.583906 1.005764 2 481 | 66668 9.777806 0.496377 1 482 | 68732 8.833546 0.513876 1 483 | 69995 4.907899 1.518036 1 484 | 82008 8.362736 1.285939 1 485 | 25054 9.084726 1.606312 3 486 | 33085 14.164141 0.560970 3 487 | 41379 9.080683 0.989920 3 488 | 39417 6.522767 0.038548 3 489 | 12556 3.690342 0.462281 2 490 | 39432 3.563706 0.242019 1 491 | 38010 1.065870 1.141569 1 492 | 69306 6.683796 1.456317 1 493 | 38000 1.712874 0.243945 1 494 | 46321 13.109929 1.280111 3 495 | 66293 11.327910 0.780977 1 496 | 22730 4.545711 1.233254 1 497 | 5952 3.367889 0.468104 2 498 | 72308 8.326224 0.567347 1 499 | 60338 8.978339 1.442034 1 500 | 13301 5.655826 1.582159 2
--------------------------------------------------------------------------------
/SVM.py:
--------------------------------------------------------------------------------
"""
@Filename: SVM.py
@Author: Ryuk
@Create Date: 2019-04-29
@Update Date: 2019-05-03
@Description: Implement of SVM
"""

import numpy as np
import preProcess
import pickle
import random


class SVMClassifier:
    def __init__(self, norm_type="Normalization", C=200, kernel="rbf", threshold=10e-3, g=0.1, c=0, n=3, max_iteration=100):
        self.norm_type = norm_type
        self.prediction = None
        self.probability = None
        self.train_data = None
        self.train_label = None
        self.sample_num = None
        self.max_iteration = max_iteration      # max iterations of SMO
        self.K = None
        self.alphas = None
        self.w = None                           # the weight of the hyperplane
        self.b = None                           # the bias of the hyperplane
        self.errors = None                      # cached prediction errors
        self.C = C                              # penalty coefficient
        self.threshold = threshold              # tolerance for the KKT conditions
        self.kernel = kernel                    # kernel function
        self.g = g                              # gamma for the rbf/sigmoid/poly kernels
        self.n = n                              # degree of the poly kernel
        self.c = c                              # bias term of the sigmoid/poly kernels

    '''
    Function: labelTransformation
    Description: transform {0, 1} into {-1, 1}, list to ndarray
    Input: labels dataType: List description: original labels
    Output: new_labels dataType: ndarray description: new labels
    '''
    def labelTransformation(self, labels):
        new_labels = np.zeros([len(labels), 1])
        for i in range(len(labels)):
            if labels[i] == 0:
                new_labels[i] = -1      # map label 0 to -1
            else:
                new_labels[i] = 1
        return new_labels

    '''
    Function: calculateErrors
    Description: calculate the prediction error of the k-th sample, LiHang, Statistical Learning Methods, P127, Eq.(7.105)
                 g(x) = sum_i[alpha_i * y_i * K(x_i, x)] + b
    Input: k dataType: int description: index of the k-th sample
    Output: Ek dataType: float description: prediction error of the k-th sample
    '''
    def calculateErrors(self, k):
        gap = np.dot(np.multiply(self.alphas, self.train_label).T, self.K[:, k]) + self.b
        Ek = gap - self.train_label[k]
        return Ek

    '''
    Function: selectAlpha2Rand
    Description: select alpha2 randomly
    Input: i dataType: int description: the index of alpha1
    Output: j dataType: int description: the index of alpha2
    '''
    def selectAlpha2Rand(self, i):
        j = i
        while j == i:
            j = random.randint(0, self.sample_num - 1)      # randint is inclusive on both ends
        return j

    '''
    Function: selectAlpha2
    Description: select the second alpha heuristically in the inner loop, maximizing |Ei - Ej|
    Input: i dataType: int description: the index of the first alpha
           Ei dataType: float description: the error of the first alpha
    Output: j dataType: int description: the index of the second alpha
            Ej dataType: float description: the error of the second alpha
    '''
    def selectAlpha2(self, i, Ei):
        max_k = -1
        max_delta = 0.0
        Ej = 0.0

        self.errors[i] = [1, float(Ei)]
        valid_errors_index = np.nonzero(self.errors[:, 0])[0]   # indices of samples with valid cached errors
        if len(valid_errors_index) > 1:
            for k in valid_errors_index:
                if k == i:
                    continue
                Ek = self.calculateErrors(k)
                delta_e = abs(Ei - Ek)
                if delta_e > max_delta:     # select j with the max |Ei - Ej|
                    max_k = k
                    max_delta = delta_e
                    Ej = Ek
            return max_k, Ej
        else:
            j = self.selectAlpha2Rand(i)
            Ej = self.calculateErrors(j)
            return j, Ej

    '''
    Function: updateError
    Description: update and cache the prediction error of the k-th sample
    Input: k dataType: int description: the index of the sample
    '''
    def updateError(self, k):
        Ek = self.calculateErrors(k)
        self.errors[k] = [1, float(Ek)]     # 1 means valid

    '''
    Function: updateAlpha2
    Description: clip alpha2 into [L, H], LiHang P127, Eq.(7.108)
    Input: alpha2 dataType: float description: unclipped alpha2
           L dataType: float description: lower bound of alpha2
           H dataType: float description: upper bound of alpha2
    Output: alpha2 dataType: float description: clipped alpha2
    '''
    def updateAlpha2(self, alpha2, L, H):
        if alpha2 > H:
            alpha2 = H
        if L > alpha2:
            alpha2 = L
        return alpha2

    '''
    Function: innerLoop
    Description: inner loop of Platt SMO
    Input: i dataType: int description: the index of the first alpha
    '''
    def innerLoop(self, i):
        Ei = self.calculateErrors(i)
        # check whether sample i violates the KKT conditions
        if ((self.train_label[i] * Ei < -self.threshold) and (self.alphas[i] < self.C)) or ((self.train_label[i] * Ei > self.threshold) and (self.alphas[i] > 0)):

            j, Ej = self.selectAlpha2(i, Ei)    # select alpha2 according to alpha1

            # copy alpha1 and alpha2 (.copy() is required: indexing an ndarray returns a view,
            # so without it the "old" values would silently follow the updates below)
            old_alpha1 = self.alphas[i].copy()
            old_alpha2 = self.alphas[j].copy()

            # determine the feasible range [L, H] of alpha2, LiHang P126
            # if y1 != y2: L = max(0, old_alpha2 - old_alpha1), H = min(C, C + old_alpha2 - old_alpha1)
            # if y1 == y2: L = max(0, old_alpha2 + old_alpha1 - C), H = min(C, old_alpha2 + old_alpha1)
            if self.train_label[i] != self.train_label[j]:
                L = max(0, old_alpha2 - old_alpha1)
                H = min(self.C, self.C + old_alpha2 - old_alpha1)
            else:
                L = max(0, old_alpha2 + old_alpha1 - self.C)
                H = min(self.C, old_alpha2 + old_alpha1)

            if L == H:
                # print("L == H")
                return 0

            # calculate eta, LiHang P127, Eq.(7.107): eta = K11 + K22 - 2K12
            K11 = self.K[i, i]
            K12 = self.K[i, j]
            K21 = self.K[j, i]
            K22 = self.K[j, j]
            eta = K11 + K22 - 2 * K12
            if eta <= 0:
                # print("eta <= 0")
                return 0

            # update alpha2 and its error, LiHang P127, Eq.(7.106) and Eq.(7.108)
            self.alphas[j] = old_alpha2 + self.train_label[j] * (Ei - Ej) / eta
            self.alphas[j] = self.updateAlpha2(self.alphas[j], L, H)
            new_alpha2 = self.alphas[j]
            self.updateError(j)

            # # if the step of alpha2 is not big enough, stop
            # if abs(self.alphas[j] - old_alpha2) < 0.01:
            #     return 0

            # update alpha1 and its error, LiHang P127, Eq.(7.109)
            # new_alpha1 = old_alpha1 + y1*y2*(old_alpha2 - new_alpha2)
            new_alpha1 = old_alpha1 + self.train_label[i] * self.train_label[j] * (old_alpha2 - new_alpha2)
            self.alphas[i] = new_alpha1
            self.updateError(i)

            # determine b, LiHang P130, Eq.(7.115) and Eq.(7.116)
            # new_b1 = -E1 - y1*K11*(new_alpha1 - old_alpha1) - y2*K21*(new_alpha2 - old_alpha2) + old_b
            # new_b2 = -E2 - y1*K12*(new_alpha1 - old_alpha1) - y2*K22*(new_alpha2 - old_alpha2) + old_b
            b1 = -Ei - self.train_label[i] * K11 * (self.alphas[i] - old_alpha1) - self.train_label[j] * K21 * (self.alphas[j] - old_alpha2) + self.b
            b2 = -Ej - self.train_label[i] * K12 * (self.alphas[i] - old_alpha1) - self.train_label[j] * K22 * (self.alphas[j] - old_alpha2) + self.b
            if (self.alphas[i] > 0) and (self.alphas[i] < self.C):
                self.b = b1
            elif (self.alphas[j] > 0) and (self.alphas[j] < self.C):
                self.b = b2
            else:
                self.b = (b1 + b2) / 2.0

            return 1
        else:
            return 0

    '''
    Function: SMO
    Description: implement of Platt SMO. Alternate between a pass over the entire set and passes over the
                 non-bound samples (0 < alpha < C); if no alpha pair changes, go back to the entire set
    '''
    def SMO(self):
        iter = 0
        entire_set = True
        alpha_pairs_changes = 0
        while (iter < self.max_iteration) and ((alpha_pairs_changes > 0) or entire_set):
            alpha_pairs_changes = 0
            if entire_set:
                for i in range(self.sample_num):
                    alpha_pairs_changes += self.innerLoop(i)
                    # print("Iteration:%d, Sample:%d, Pairs changed:%d" % (iter, i, alpha_pairs_changes))
                iter += 1
            else:
                non_bound_alpha = np.nonzero((self.alphas > 0) & (self.alphas < self.C))[0]     # LiHang P129, Eq.(7.112)
                for i in non_bound_alpha:
                    alpha_pairs_changes += self.innerLoop(i)
                    # print("Iteration:%d, Sample:%d, Pairs changed:%d" % (iter, i, alpha_pairs_changes))
                iter += 1
            if entire_set:
                entire_set = False
            elif alpha_pairs_changes == 0:
                entire_set = True

        # print("Iteration:%d" % iter)

    '''
    Function: kernelTransformation
    Description: compute the kernel values K(x_i, sample) between each row of data and the given sample
    Input: data dataType: ndarray description: data set
           sample dataType: ndarray description: a sample
           kernel dataType: str description: kernel type
    Output: K dataType: ndarray description: kernel values
    '''
    def kernelTransformation(self, data, sample, kernel):
        sample_num, feature_dim = np.shape(data)
        K = np.zeros([sample_num])
        if kernel == "linear":              # linear function
            K = np.dot(data, sample.T)
        elif kernel == "poly":              # polynomial function
            K = (np.dot(data, sample.T) + self.c) ** self.n
        elif kernel == "sigmoid":
            K = np.tanh(self.g * np.dot(data, sample.T) + self.c)
        elif kernel == "rbf":               # Gaussian function
            for i in range(sample_num):
                delta = data[i, :] - sample
                K[i] = np.dot(delta, delta.T)
            K = np.exp(-self.g * K)
        else:
            raise NameError('Unrecognized kernel function')
        return K

    '''
    Function: train
    Description: train the model
    Input: train_data dataType: ndarray description: features
           train_label dataType: ndarray description: labels
    Output: self dataType: obj description: the trained model
    '''
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        # initialization
        sample_num, feature_dim = np.shape(train_data)
        self.train_data = train_data
        self.train_label = self.labelTransformation(train_label)
        self.sample_num = sample_num
        self.K = np.zeros([self.sample_num, self.sample_num])
        self.alphas = np.zeros([self.sample_num, 1])
        self.errors = np.zeros([self.sample_num, 2])
        self.b = 0

        # kernel trick
        for i in range(self.sample_num):
            self.K[:, i] = self.kernelTransformation(self.train_data, self.train_data[i, :], self.kernel)

        # train the model
        self.SMO()
        return self

    '''
    Function: predict
    Description: predict the testing set
    Input: test_data dataType: ndarray description: features
           prob dataType: bool description: whether to return the decision value instead of the label
    Output: prediction dataType: ndarray description: the prediction results for the testing set
    '''
    def predict(self, test_data, prob=False):
        # normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])

        # find the support vectors and their corresponding labels and alphas
        support_vectors_index = np.nonzero(self.alphas > 0)[0]
        support_vectors = self.train_data[support_vectors_index]
        support_vectors_label = self.train_label[support_vectors_index]
        support_vectors_alphas = self.alphas[support_vectors_index]

        # predict the test samples, LiHang P122, Eq.(7.89)
        for i in range(test_num):
            kernel_data = self.kernelTransformation(support_vectors, test_data[i, :], self.kernel)
            probability[i] = np.dot(kernel_data.T, np.multiply(support_vectors_label, support_vectors_alphas)) + self.b
            if probability[i] > 0:
                prediction[i] = 1
            else:
                prediction[i] = -1

        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction

    '''
    Function: accuarcy
    Description: show the detection result
    Input: test_label dataType: ndarray description: labels of the test data, in {-1, 1}
    Output: accuarcy dataType: float description: detection accuracy
    '''
    def accuarcy(self, test_label):
        test_label = np.expand_dims(test_label, axis=1)
        prediction = self.prediction
        accuarcy = sum(prediction == test_label) / len(test_label)
        return accuarcy

    '''
    Function: save
    Description: save the model as pkl
    Input: filename dataType: str description: the path to save the model
    '''
    def save(self, filename):
        with open(filename, 'wb') as f:     # pickle requires binary mode
            model = {'b': self.b, 'alphas': self.alphas, 'train_label': self.train_label}
            pickle.dump(model, f)

    '''
    Function: load
    Description: load the model
    Input: filename dataType: str description: the path of the saved model
    Output: self dataType: obj description: the trained model
    '''
    def load(self, filename):
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        self.alphas = model['alphas']
        self.b = model['b']
        self.train_label = model['train_label']
        # note: predict() also needs self.train_data, which is not stored in the pkl;
        # train again or keep the training features alongside the saved model
        return self
--------------------------------------------------------------------------------
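
A minimal usage sketch for SVMClassifier on dataset1 (a hypothetical example, not a file in this repository). dataset1 carries three classes while this SVM is binary, so the sketch keeps only classes 1 and 2 and maps them to {0, 1} before training; the file paths and the load_binary helper are illustrative assumptions, not part of the repository.

import numpy as np
import pandas as pd
from SVM import SVMClassifier

def load_binary(path):
    # dataset1 rows: three features followed by a label in {1, 2, 3};
    # keep classes 1 and 2 and map class 1 -> 0, class 2 -> 1 (hypothetical filtering)
    data = pd.read_table(path, header=None, delim_whitespace=True).values
    mask = data[:, -1] != 3
    features = data[mask, :-1]
    labels = (data[mask, -1] == 2).astype(int)
    return features, labels

train_x, train_y = load_binary('../dataset/dataset1/train.txt')
test_x, test_y = load_binary('../dataset/dataset1/test.txt')

clf = SVMClassifier(C=200, kernel="rbf", g=0.1, max_iteration=100)
clf.train(train_x, train_y)
prediction = clf.predict(test_x)            # predicted labels in {-1, 1}
accuracy = clf.accuarcy(test_y * 2 - 1)     # map {0, 1} -> {-1, 1} to match the prediction
print("SVM accuracy:", accuracy)

Since train() maps {0, 1} labels to {-1, 1} internally but accuarcy() compares raw labels against the {-1, 1} predictions, the test labels must be remapped by the caller, as done above.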