├── .idea
│   ├── ML.iml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── AndrewNg
│   └── Linear Regression with One Variable
│       └── cost-function.py
├── MLFoundation
│   ├── ex1
│   │   ├── 15.py
│   │   ├── 16.py
│   │   ├── 17.py
│   │   ├── 18.py
│   │   ├── 19.py
│   │   ├── 20.py
│   │   ├── hw1_15_train.dat
│   │   ├── hw1_18_test.dat
│   │   └── hw1_18_train.dat
│   ├── ex2
│   │   ├── 17-18.py
│   │   ├── 19-20.py
│   │   ├── hw2_test.dat
│   │   └── hw2_train.dat
│   ├── ex3
│   │   ├── 13-15.py
│   │   ├── 18-20.py
│   │   ├── hw3_test.dat
│   │   └── hw3_train.dat
│   ├── ex4
│   │   ├── 13-20.py
│   │   ├── hw4_test.dat
│   │   └── hw4_train.dat
│   └── pdf
│       ├── 01_handout.pdf
│       ├── 02_handout.pdf
│       ├── 03_handout.pdf
│       ├── 04_handout.pdf
│       ├── 05_handout.pdf
│       ├── 06_handout.pdf
│       ├── 07_handout.pdf
│       ├── 08_handout.pdf
│       ├── 09_handout.pdf
│       ├── 10_handout.pdf
│       ├── 11_handout.pdf
│       ├── 12_handout.pdf
│       ├── 13_handout.pdf
│       ├── 14_handout.pdf
│       ├── 15_handout.pdf
│       └── 16_handout.pdf
├── README.md
├── StatisticalLearningMethod
│   ├── chapter2
│   │   └── Perceptron.py
│   ├── chapter3
│   │   ├── K-NN.py
│   │   ├── K-NN1.py
│   │   └── K-NN2.py
│   ├── chapter4
│   │   ├── naive_Bayes.py
│   │   └── naive_Bayes1.py
│   ├── chapter5
│   │   ├── C4.5.py
│   │   ├── CART.py
│   │   ├── ID3-1.py
│   │   └── ID3.py
│   ├── data
│   │   ├── train.csv
│   │   ├── train_binary.csv
│   │   ├── train_binary1.csv
│   │   └── train_binary2.csv
│   ├── errata.pdf
│   └── hog.xml
├── tensorflow
│   └── course
│       ├── data
│       │   └── fire_theft.xls
│       ├── feed.py
│       ├── fetch.py
│       ├── graph.py
│       ├── interactiveSession.py
│       ├── linearRegression.py
│       ├── load.py
│       ├── logisticRegression.py
│       ├── random.py
│       ├── shape.py
│       ├── test.py
│       ├── testt.py
│       └── variable.py
└── watermelon
    ├── ch3
    │   ├── 3.3
    │   │   ├── data
    │   │   │   └── watermelon_3a.csv
    │   │   ├── logistic_regression.py
    │   │   └── self_def.py
    │   ├── 3.4
    │   │   ├── cross_validation.py
    │   │   └── data
    │   │       ├── transfusion.data
    │   │       └── transfusion.names
    │   └── 3.5
    │       ├── LDA.py
    │       ├── data
    │       │   └── watermelon_3a.csv
    │       └── self_def.py
    └── ch4
        ├── 4.3
        │   ├── ID3_watermelon.py
        │   ├── data
        │   │   └── watermelon_3.csv
        │   └── decision_tree.py
        └── 4.4
            ├── CART_watermelon.py
            ├── data
            │   └── watermelon_2.csv
            └── decision_tree.py
--------------------------------------------------------------------------------
/AndrewNg/Linear Regression with One Variable/cost-function.py:
--------------------------------------------------------------------------------
# array1 = [1, 2, 2, 3, 3, 4, 5, 6, 6, 6, 8, 10]
# array2 = [890, -1411, -1560, -2220, -2091, -2878, -3537, -3268, -3920, -4163, -5471, -5157]

array1 = [3, 1, 0, 4]
array2 = [2, 2, 1, 3]


# squared-error cost J(a0, a1) = 1/(2m) * sum((a0 + a1*x - y)^2)
def cost_function(a0, a1):
    cost = 0
    for i in range(0, len(array1)):
        cost += (a0 + a1 * array1[i] - array2[i]) ** 2
    cost /= (2 * len(array1))
    print(cost)


# cost_function(-596.6, -530.9)
# cost_function(-1780.0, 530.9)
# cost_function(-596.6, 530.9)
# cost_function(-1780.0, -530.9)
cost_function(0, 1)
--------------------------------------------------------------------------------
/MLFoundation/ex1/15.py:
--------------------------------------------------------------------------------
import numpy


class NaiveCyclePLA(object):
    def __init__(self, dimension, count):
        self.__dimension = dimension
        self.__count = count

    # read the training data into feature / label matrices
    def train_matrix(self, path):
        training_set = open(path)
        x_train = numpy.zeros((self.__count, self.__dimension))
        y_train = numpy.zeros((self.__count, 1))
        x = []
        x_count = 0
        for line in training_set:
            # add the constant feature x0 = 1 manually
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    y_train[x_count, 0] = int(str.split('\t')[1].strip())
            x_train[x_count, :] = x
            x = []
            x_count += 1
        return x_train, y_train

    def iteration_count(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        # loop until all x are classified right
        while True:
            flag = 0
            for i in range(self.__count):
                if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                    w += y_train[i, :] * x_train[i, :].reshape(5, 1)
                    count += 1
                    flag = 1
            if flag == 0:
                break
        return count


if __name__ == '__main__':
    perceptron = NaiveCyclePLA(5, 400)
    print(perceptron.iteration_count("hw1_15_train.dat"))
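Note: 15.py walks the examples in a fixed cycle and applies the PLA update w <- w + y_n * x_n whenever sign(w.x_n) disagrees with y_n. Below is a minimal, self-contained sketch of that update rule on a made-up, linearly separable toy set (the toy points and the two-feature dimension are illustrative assumptions, not the format of hw1_15_train.dat):

import numpy as np

# Minimal PLA sketch on a tiny, linearly separable toy set.
# The data below is invented for illustration only.
X = np.array([[1.0, 2.0, 1.0],    # each row: [x0 = 1, x1, x2]
              [1.0, -1.0, -1.5],
              [1.0, 0.5, 2.5],
              [1.0, -2.0, -0.5]])
y = np.array([1, -1, 1, -1])

w = np.zeros(X.shape[1])
updates = 0
while True:
    mistakes = 0
    for xi, yi in zip(X, y):
        if np.sign(xi @ w) != yi:    # sign(0) == 0 also counts as a mistake
            w += yi * xi             # PLA update: w <- w + y_n * x_n
            updates += 1
            mistakes += 1
    if mistakes == 0:
        break

print("updates:", updates, "final w:", w)

On separable data this loop is guaranteed to halt by the PLA convergence theorem.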
--------------------------------------------------------------------------------
/MLFoundation/ex1/16.py:
--------------------------------------------------------------------------------
import numpy
import random


class RandomPLA(object):
    def __init__(self, dimension, count):
        self.__dimension = dimension
        self.__count = count

    # read the data and shuffle it into a random visiting order
    def random_matrix(self, path):
        training_set = open(path)
        random_list = []
        x = []
        x_count = 0
        for line in training_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    x.append(int(str.split('\t')[1].strip()))
            random_list.append(x)
            x = []
            x_count += 1
        random.shuffle(random_list)
        return random_list

    def train_matrix(self, path):
        x_train = numpy.zeros((self.__count, self.__dimension))
        y_train = numpy.zeros((self.__count, 1))
        random_list = self.random_matrix(path)
        for i in range(self.__count):
            for j in range(self.__dimension):
                x_train[i, j] = random_list[i][j]
            y_train[i, 0] = random_list[i][self.__dimension]
        return x_train, y_train

    def iteration_count(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        while True:
            flag = 0
            for i in range(self.__count):
                if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                    w += y_train[i, 0] * x_train[i, :].reshape(5, 1)
                    count += 1
                    flag = 1
            if flag == 0:
                break
        return count


if __name__ == '__main__':
    sum = 0
    for i in range(2000):
        perceptron = RandomPLA(5, 400)
        sum += perceptron.iteration_count('hw1_15_train.dat')
    print(sum / 2000.0)
--------------------------------------------------------------------------------
/MLFoundation/ex1/17.py:
--------------------------------------------------------------------------------
import numpy
import random


class RandomPLA(object):
    def __init__(self, dimension, count):
        self.__dimension = dimension
        self.__count = count

    def random_matrix(self, path):
        training_set = open(path)
        random_list = []
        x = []
        x_count = 0
        for line in training_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    x.append(int(str.split('\t')[1].strip()))
            random_list.append(x)
            x = []
            x_count += 1
        random.shuffle(random_list)
        return random_list

    def train_matrix(self, path):
        x_train = numpy.zeros((self.__count, self.__dimension))
        y_train = numpy.zeros((self.__count, 1))
        random_list = self.random_matrix(path)
        for i in range(self.__count):
            for j in range(self.__dimension):
                x_train[i, j] = random_list[i][j]
            y_train[i, 0] = random_list[i][self.__dimension]
        return x_train, y_train

    def iteration_count(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        while True:
            flag = 0
            for i in range(self.__count):
                if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                    w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(5, 1)
                    count += 1
                    flag = 1
            if flag == 0:
                break
        return count


sum = 0
for i in range(2000):
    perceptron = RandomPLA(5, 400)
    sum += perceptron.iteration_count('hw1_15_train.dat')
print(sum / 2000.0)
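Note: 17.py is 16.py with the update scaled by eta = 0.5. Because w starts at the zero vector, every intermediate w under eta = 0.5 is exactly half of the corresponding w under eta = 1, so sign(w.x) and hence the whole mistake sequence are unchanged; the two experiments should report the same average update count. A quick sketch that checks this equivalence on synthetic separable data (invented data, not the homework file):

import numpy as np

rng = np.random.default_rng(0)

def pla_updates(X, y, eta):
    """Count PLA updates with learning rate eta, cycling in a fixed order."""
    w = np.zeros(X.shape[1])
    updates = 0
    while True:
        mistaken = False
        for xi, yi in zip(X, y):
            if yi * (xi @ w) <= 0:
                w += eta * yi * xi
                updates += 1
                mistaken = True
        if not mistaken:
            break
    return updates

# random linearly separable data: labels come from a hidden weight vector
X = np.hstack([np.ones((100, 1)), rng.uniform(-1, 1, (100, 4))])
w_true = rng.normal(size=5)
y = np.sign(X @ w_true)

# identical update counts for eta = 1.0 and eta = 0.5 when w starts at zero
print(pla_updates(X, y, 1.0), pla_updates(X, y, 0.5))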
--------------------------------------------------------------------------------
/MLFoundation/ex1/18.py:
--------------------------------------------------------------------------------
import numpy
import random
import copy


class Pocket(object):
    def __init__(self, dimension, train_count, test_count):
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def random_matrix(self, path):
        training_set = open(path)
        random_list = []
        x = []
        x_count = 0
        for line in training_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    x.append(int(str.split('\t')[1].strip()))
            random_list.append(x)
            x = []
            x_count += 1
        random.shuffle(random_list)
        return random_list

    def train_matrix(self, path):
        x_train = numpy.zeros((self.__train_count, self.__dimension))
        y_train = numpy.zeros((self.__train_count, 1))
        random_list = self.random_matrix(path)
        for i in range(self.__train_count):
            for j in range(self.__dimension):
                x_train[i, j] = random_list[i][j]
            y_train[i, 0] = random_list[i][self.__dimension]
        return x_train, y_train

    def iteration(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        best_count = self.__train_count
        best_w = numpy.zeros((self.__dimension, 1))

        # Pocket algorithm: update the separating line (at most 50 times);
        # after every update, check on the training set whether the new line
        # is the best one seen so far
        for i in range(self.__train_count):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(5, 1)
                # one more update
                count += 1
                num = 0
                # validate on the training set
                for j in range(self.__train_count):
                    if numpy.dot(x_train[j, :], w)[0] * y_train[j, 0] <= 0:
                        num += 1
                if num < best_count:
                    best_count = num
                    best_w = copy.deepcopy(w)
                if count == 50:
                    break
        return best_w

    def test_matrix(self, test_path):
        x_test = numpy.zeros((self.__test_count, self.__dimension))
        y_test = numpy.zeros((self.__test_count, 1))
        test_set = open(test_path)
        x = []
        x_count = 0
        for line in test_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    y_test[x_count, 0] = (int(str.split('\t')[1].strip()))
            x_test[x_count, :] = x
            x = []
            x_count += 1
        return x_test, y_test

    # evaluate the pocket weights on the test set
    def test_error(self, train_path, test_path):
        w = self.iteration(train_path)
        x_test, y_test = self.test_matrix(test_path)
        count = 0.0
        for i in range(self.__test_count):
            if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
                count += 1
        return count / self.__test_count


if __name__ == '__main__':
    average_error_rate = 0
    for i in range(2000):
        my_Pocket = Pocket(5, 500, 500)
        average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
    print(average_error_rate / 2000.0)
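Note: the pocket step in 18.py pays a full O(N) pass over the training set after every update to decide whether the new w is the best seen so far. A condensed sketch of the same idea as a reusable function (synthetic usage data; max_updates stands in for the homework's 50-update budget):

import numpy as np

def pocket(X, y, eta=0.5, max_updates=50, rng=None):
    """PLA with a pocket: return the weights with the fewest training mistakes."""
    rng = rng or np.random.default_rng()
    w = np.zeros(X.shape[1])
    best_w = w.copy()
    best_err = np.sum(np.sign(X @ w) != y)
    updates = 0
    while updates < max_updates:
        mistaken = False
        for i in rng.permutation(len(X)):
            if y[i] * (X[i] @ w) <= 0:
                w = w + eta * y[i] * X[i]
                updates += 1
                mistaken = True
                err = np.sum(np.sign(X @ w) != y)   # O(N) re-check per update
                if err < best_err:
                    best_err, best_w = err, w.copy()
                if updates == max_updates:
                    break
        if not mistaken:            # a clean pass: the data is separated
            break
    return best_w, best_err

# tiny synthetic demo (invented data)
X = np.hstack([np.ones((200, 1)), np.random.default_rng(1).uniform(-1, 1, (200, 4))])
y = np.where(X[:, 1] + X[:, 2] > 0, 1, -1)
w, err = pocket(X, y)
print("pocket training mistakes:", err)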
--------------------------------------------------------------------------------
/MLFoundation/ex1/19.py:
--------------------------------------------------------------------------------
import numpy
import random
import copy


class Pocket(object):
    def __init__(self, dimension, train_count, test_count):
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def random_matrix(self, path):
        training_set = open(path)
        random_list = []
        x = []
        x_count = 0
        for line in training_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    x.append(int(str.split('\t')[1].strip()))
            random_list.append(x)
            x = []
            x_count += 1
        random.shuffle(random_list)
        return random_list

    def train_matrix(self, path):
        x_train = numpy.zeros((self.__train_count, self.__dimension))
        y_train = numpy.zeros((self.__train_count, 1))
        random_list = self.random_matrix(path)
        for i in range(self.__train_count):
            for j in range(self.__dimension):
                x_train[i, j] = random_list[i][j]
            y_train[i, 0] = random_list[i][self.__dimension]
        return x_train, y_train

    # unlike 18.py, return the weights as they stand after the 50th update
    def iteration(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        for i in range(self.__train_count):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(5, 1)
                count += 1
                if count == 50:
                    break
        return w

    def test_matrix(self, test_path):
        x_test = numpy.zeros((self.__test_count, self.__dimension))
        y_test = numpy.zeros((self.__test_count, 1))
        test_set = open(test_path)
        x = []
        x_count = 0
        for line in test_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    y_test[x_count, 0] = (int(str.split('\t')[1].strip()))
            x_test[x_count, :] = x
            x = []
            x_count += 1
        return x_test, y_test

    # evaluate on the test set
    def test_error(self, train_path, test_path):
        w = self.iteration(train_path)
        x_test, y_test = self.test_matrix(test_path)
        count = 0.0
        for i in range(self.__test_count):
            if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
                count += 1
        return count / self.__test_count


if __name__ == '__main__':
    average_error_rate = 0
    for i in range(2000):
        my_Pocket = Pocket(5, 500, 500)
        average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
    print(average_error_rate / 2000.0)
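Note: 19.py differs from 18.py only in what it returns — the weight vector as it stands after the 50th update, rather than the best-scoring pocket weights. Since the last update may well be a step away from a good hypothesis, its average test error should come out noticeably worse than 18.py's.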
--------------------------------------------------------------------------------
/MLFoundation/ex1/20.py:
--------------------------------------------------------------------------------
import numpy
import random
import copy


class Pocket(object):
    def __init__(self, dimension, train_count, test_count):
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def random_matrix(self, path):
        training_set = open(path)
        random_list = []
        x = []
        x_count = 0
        for line in training_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    x.append(int(str.split('\t')[1].strip()))
            random_list.append(x)
            x = []
            x_count += 1
        random.shuffle(random_list)
        return random_list

    def train_matrix(self, path):
        x_train = numpy.zeros((self.__train_count, self.__dimension))
        y_train = numpy.zeros((self.__train_count, 1))
        random_list = self.random_matrix(path)
        for i in range(self.__train_count):
            for j in range(self.__dimension):
                x_train[i, j] = random_list[i][j]
            y_train[i, 0] = random_list[i][self.__dimension]
        return x_train, y_train

    def iteration(self, path):
        count = 0
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        best_count = self.__train_count
        best_w = numpy.zeros((self.__dimension, 1))

        # Pocket algorithm: update the separating line (at most 100 times);
        # after every update, check on the training set whether the new line
        # is the best one seen so far
        for i in range(self.__train_count):
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(5, 1)
                count += 1
                num = 0
                for j in range(self.__train_count):
                    if numpy.dot(x_train[j, :], w)[0] * y_train[j, 0] <= 0:
                        num += 1
                if num < best_count:
                    best_count = num
                    best_w = copy.deepcopy(w)
                if count == 100:
                    break
        return best_w

    def test_matrix(self, test_path):
        x_test = numpy.zeros((self.__test_count, self.__dimension))
        y_test = numpy.zeros((self.__test_count, 1))
        test_set = open(test_path)
        x = []
        x_count = 0
        for line in test_set:
            x.append(1)
            for str in line.split(' '):
                if len(str.split('\t')) == 1:
                    x.append(float(str))
                else:
                    x.append(float(str.split('\t')[0]))
                    y_test[x_count, 0] = (int(str.split('\t')[1].strip()))
            x_test[x_count, :] = x
            x = []
            x_count += 1
        return x_test, y_test

    # evaluate the pocket weights on the test set
    def test_error(self, train_path, test_path):
        w = self.iteration(train_path)
        x_test, y_test = self.test_matrix(test_path)
        count = 0.0
        for i in range(self.__test_count):
            if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
                count += 1
        return count / self.__test_count


if __name__ == '__main__':
    average_error_rate = 0
    for i in range(2000):
        my_Pocket = Pocket(5, 500, 500)
        average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
    print(average_error_rate / 2000.0)
--------------------------------------------------------------------------------
/MLFoundation/ex1/hw1_15_train.dat:
--------------------------------------------------------------------------------
0.97681 0.10723 0.64385 0.29556 1
0.67194 0.2418 0.83075 0.42741 1
0.20619 0.23321 0.81004 0.98691 1
0.51583 0.055814 0.92274 0.75797 1
0.70893 0.10836 0.33951 0.77058 1
0.55743 0.67804 0.061044 0.72689 1
0.15654 0.75584 0.01122 0.42598 -1
0.50462 0.15137 0.33878 0.41881 1
0.22657 0.59272 0.24103 0.46221 -1
0.49174 0.65115 0.24622 0.24796 -1
0.59512 0.26994 0.74692 0.32784 1
0.32439 0.37294 0.11623 0.94499 1
0.4475 0.60183 0.41323 0.58492 1
0.41171 0.098584 0.4795 0.083842 -1
0.10059 0.37353 0.0057687 0.14313 -1
0.8182 0.70052 0.67561 0.22231 1
0.3221 0.95754 0.99328 0.50757 1
0.41469 0.48406 0.39832 0.53216 1
0.48364 0.36163 0.14351 0.3153 -1
0.5323 0.21312 0.40401 0.98252 1
0.71073 0.29015 0.15557 0.70588 1
0.68151 0.23617 0.085193 0.58718 1
0.069048 0.14742 0.92254 0.93918 1
0.19337 0.29606 0.72589 0.71993 1
0.62783 0.80021 0.69486 0.41697 1
0.94658 0.85253 0.75418 0.3027 1
0.54402 0.73303 0.29073 0.26307 -1
0.20166 0.96147 0.83956 0.76917 1
0.8416 0.22036 0.60311 0.34751 1
0.659 0.40341 0.16311 0.12612 -1
0.87845 0.46984 0.32142 0.00042772 -1
0.95971 0.7334 0.45993 0.76215 1
0.35449 0.22126 0.57224 0.4336 1
0.34263 0.81404 0.30048 0.1461 -1
0.7234 0.45707 0.44129 0.40039 1
0.39538 0.20276 0.67262 0.67505 1
0.45179 0.78087 0.4938 0.073425 -1
0.23881 0.7675 0.40806 0.074954 -1
0.91059 0.18045 0.089421 0.59719 1
0.30088 0.3124 0.30033 0.1078 -1
0.20636 0.25969 0.87208 0.075063 -1
0.84325 0.20161 0.018555 0.58518 1
0.33334 0.087671 0.078659 0.15274 -1
0.18111 0.11502 0.73474 0.65718 1
0.90105 0.69659 0.44014 0.28963 1
0.76096 0.17909 0.18557 0.86889 1
0.20359 0.77736 0.2176 0.071641 -1
0.42406 0.98081 0.99433 0.071268 -1
0.61642 0.060815 0.10835 0.85805 1
0.62755 0.47251 0.63101 0.86293 1
0.55335 0.10757 0.87192 0.8353 1
0.72356 0.088313 0.69772 0.091611 1
0.02084 0.66204 0.26704 0.93343 1
0.15623 0.39914 0.58355 0.9993 1
0.90115 0.74857 0.6048 0.54481 1
0.40522 0.34025 0.84438 0.30728 1
0.69053 0.70505 0.77211 0.50009 1
0.32972 0.36727 0.038398 0.24515 -1
0.087565 0.80164 0.10873 0.72862 -1
0.26626 0.19317 0.83732 0.96563 1
0.33161 0.23154 0.12297 0.17358
-1 62 | 0.8338 0.22029 0.62198 0.5915 1 63 | 0.38873 0.57979 0.75488 0.12437 -1 64 | 0.093349 0.084263 0.085754 0.19575 -1 65 | 0.3938 0.21727 0.59706 0.36985 1 66 | 0.14047 0.12652 0.89396 0.0056295 -1 67 | 0.34342 0.76697 0.82696 0.43354 1 68 | 0.2665 0.83265 0.28848 0.2337 -1 69 | 0.36046 0.36809 0.32623 0.25556 -1 70 | 0.99778 0.97657 0.674 0.51915 1 71 | 0.22303 0.028847 0.73739 0.41662 1 72 | 0.30179 0.44626 0.17371 0.73116 1 73 | 0.31285 0.25044 0.46658 0.12074 -1 74 | 0.24446 0.51992 0.80413 0.74044 1 75 | 0.31433 0.80511 0.6496 0.56248 1 76 | 0.10521 0.202 0.87425 0.90105 1 77 | 0.34385 0.5524 0.52835 0.833 1 78 | 0.52791 0.62401 0.56754 0.41641 1 79 | 0.77826 0.57861 0.49655 0.84074 1 80 | 0.26143 0.5512 0.38472 0.18668 -1 81 | 0.87326 0.96009 0.24922 0.65171 1 82 | 0.65069 0.96118 0.36716 0.6302 1 83 | 0.46037 0.98854 0.62971 0.62758 1 84 | 0.11105 0.93171 0.85023 0.022051 -1 85 | 0.32721 0.95939 0.9862 0.92881 1 86 | 0.54203 0.071898 0.79052 0.86281 1 87 | 0.18994 0.76582 0.21911 0.25161 -1 88 | 0.24274 0.9501 0.80862 0.68007 1 89 | 0.36659 0.57376 0.22493 0.94652 1 90 | 0.52105 0.45772 0.7153 0.91306 1 91 | 0.73745 0.045874 0.9518 0.90951 1 92 | 0.0054206 0.5803 0.92465 0.52961 1 93 | 0.61914 0.3734 0.45772 0.56601 1 94 | 0.68483 0.34833 0.6974 0.51117 1 95 | 0.31049 0.58616 0.78657 0.077121 -1 96 | 0.0077248 0.69259 0.98719 0.93702 1 97 | 0.45361 0.47903 0.1331 0.41037 -1 98 | 0.84801 0.7256 0.21409 0.88719 1 99 | 0.29968 0.17497 0.99655 0.15494 1 100 | 0.10789 0.090897 0.013157 0.45712 -1 101 | 0.72711 0.89662 0.048524 0.77902 1 102 | 0.50372 0.14179 0.8632 0.57913 1 103 | 0.22889 0.248 0.5324 0.58705 1 104 | 0.79724 0.4484 0.90201 0.19897 1 105 | 0.10663 0.49593 0.20231 0.05901 -1 106 | 0.15117 0.49039 0.8309 0.91627 1 107 | 0.95409 0.40038 0.82197 0.73251 1 108 | 0.35704 0.014972 0.47835 0.55573 1 109 | 0.4672 0.78532 0.63665 0.80891 1 110 | 0.51268 0.49317 0.37239 0.11229 -1 111 | 0.60983 0.54596 0.30924 0.45368 1 112 | 0.17321 0.67316 0.27675 0.53482 -1 113 | 0.5761 0.36533 0.44297 0.585 1 114 | 0.77885 0.92006 0.51157 0.42738 1 115 | 0.58168 0.7896 0.58292 0.11996 -1 116 | 0.7243 0.19231 0.12572 0.42981 1 117 | 0.27893 0.27538 0.82096 0.92758 1 118 | 0.79986 0.070765 0.099176 0.61674 1 119 | 0.65646 0.042222 0.039717 0.90227 1 120 | 0.2386 0.41482 0.16741 0.26592 -1 121 | 0.84494 0.53851 0.08783 0.74972 1 122 | 0.69721 0.29151 0.14566 0.092551 -1 123 | 0.085241 0.19873 0.11313 0.53704 -1 124 | 0.18871 0.093184 0.55176 0.047211 -1 125 | 0.21583 0.79506 0.30754 0.7987 1 126 | 0.050727 0.19674 0.73473 0.48999 1 127 | 0.077524 0.29589 0.012955 0.93278 1 128 | 0.87063 0.46914 0.22899 0.35294 1 129 | 0.84807 0.60812 0.42088 0.97709 1 130 | 0.045535 0.66219 0.76946 0.71987 1 131 | 0.64344 0.20442 0.20197 0.43431 1 132 | 0.33283 0.78383 0.0097152 0.13798 -1 133 | 0.091392 0.95801 0.30999 0.17345 -1 134 | 0.058002 0.42981 0.92919 0.40967 1 135 | 0.22095 0.66618 0.86801 0.61817 1 136 | 0.018695 0.21615 0.68387 0.069085 -1 137 | 0.79796 0.18841 0.12854 0.50856 1 138 | 0.67478 0.92791 0.025838 0.12608 -1 139 | 0.68964 0.92125 0.65626 0.76319 1 140 | 0.37004 0.0075887 0.99533 0.82581 1 141 | 0.4103 0.22978 0.2938 0.78125 1 142 | 0.46467 0.40583 0.26626 0.17288 -1 143 | 0.27347 0.38493 0.20575 0.80271 1 144 | 0.0037457 0.59585 0.85865 0.037211 -1 145 | 0.45059 0.83556 0.54132 0.21109 -1 146 | 0.055447 0.84199 0.62001 0.80487 1 147 | 0.016285 0.39547 0.12598 0.63249 -1 148 | 0.11982 0.90112 0.55878 0.19737 -1 149 | 0.77264 0.38371 0.61856 0.36306 1 150 | 0.68999 0.42401 0.43875 
0.98001 1 151 | 0.057837 0.86126 0.84096 0.6711 1 152 | 0.23792 0.066348 0.44791 0.9972 1 153 | 0.39259 0.89268 0.54155 0.0061404 -1 154 | 0.20604 0.19453 0.31621 0.71208 1 155 | 0.18058 0.37711 0.88283 0.65659 1 156 | 0.80745 0.24562 0.82253 0.98408 1 157 | 0.41828 0.36215 0.8516 0.68281 1 158 | 0.1323 0.39434 0.84215 0.91682 1 159 | 0.61753 0.09773 0.81467 0.40281 1 160 | 0.97318 0.19905 0.26089 0.68696 1 161 | 0.76135 0.65909 0.89342 0.21845 1 162 | 0.58691 0.6069 0.43123 0.042843 -1 163 | 0.34919 0.10586 0.50059 0.082363 -1 164 | 0.37798 0.23626 0.23852 0.14685 -1 165 | 0.9042 0.98451 0.019088 0.76116 1 166 | 0.84556 0.90166 0.072432 0.079249 -1 167 | 0.84747 0.64503 0.011196 0.53983 1 168 | 0.49067 0.78682 0.15697 0.089691 -1 169 | 0.92475 0.60457 0.64656 0.93019 1 170 | 0.63634 0.80437 0.44479 0.18618 -1 171 | 0.19157 0.60461 0.40676 0.95747 1 172 | 0.5551 0.89083 0.2496 0.65735 1 173 | 0.93298 0.76517 0.25749 0.035361 -1 174 | 0.2199 0.21024 0.10609 0.33801 -1 175 | 0.81888 0.42535 0.37241 0.74882 1 176 | 0.32533 0.40846 0.037799 0.004201 -1 177 | 0.4737 0.14999 0.66915 0.8465 1 178 | 0.16804 0.44428 0.51001 0.66228 1 179 | 0.86743 0.8456 0.17056 0.95574 1 180 | 0.28583 0.93363 0.91645 0.95502 1 181 | 0.83711 0.59571 0.3367 0.97731 1 182 | 0.32174 0.85545 0.71378 0.91737 1 183 | 0.52212 0.36278 0.66123 0.75587 1 184 | 0.21409 0.1191 0.11796 0.75938 1 185 | 0.38188 0.29273 0.27347 0.23086 -1 186 | 0.72916 0.73744 0.90535 0.13761 1 187 | 0.059381 0.25354 0.22097 0.83323 1 188 | 0.36486 0.91348 0.14745 0.57585 -1 189 | 0.68553 0.062004 0.70984 0.66362 1 190 | 0.93301 0.86593 0.17125 0.77453 1 191 | 0.61463 0.4409 0.75333 0.89446 1 192 | 0.12285 0.057161 0.58692 0.49092 1 193 | 0.56427 0.42429 0.41168 0.44017 1 194 | 0.29777 0.69766 0.8302 0.061072 -1 195 | 0.53183 0.69574 0.73405 0.90509 1 196 | 0.61368 0.29695 0.35748 0.841 1 197 | 0.85256 0.0045204 0.85749 0.38761 1 198 | 0.46745 0.45305 0.44254 0.72515 1 199 | 0.71941 0.19092 0.24009 0.89824 1 200 | 0.73892 0.44994 0.78128 0.18219 1 201 | 0.31277 0.92634 0.29642 0.46112 -1 202 | 0.11872 0.89219 0.794 0.28731 -1 203 | 0.54582 0.79468 0.18279 0.048142 -1 204 | 0.83241 0.46586 0.10901 0.048364 -1 205 | 0.89567 0.69597 0.89578 0.10248 1 206 | 0.24917 0.76999 0.20536 0.56092 -1 207 | 0.83858 0.81299 0.95404 0.62472 1 208 | 0.21222 0.21892 0.84233 0.83773 1 209 | 0.31804 0.5679 0.55799 0.15455 -1 210 | 0.81836 0.32376 0.50428 0.2733 1 211 | 0.74487 0.78055 0.18939 0.25642 -1 212 | 0.14736 0.74033 0.48418 0.0015921 -1 213 | 0.80975 0.072057 0.71856 0.86265 1 214 | 0.92345 0.37355 0.34499 0.89149 1 215 | 0.38189 0.089103 0.31269 0.72856 1 216 | 0.49649 0.25659 0.65471 0.94681 1 217 | 0.10242 0.27703 0.52294 0.85126 1 218 | 0.35479 0.17024 0.79189 0.86742 1 219 | 0.70429 0.69697 0.062243 0.964 1 220 | 0.29857 0.77505 0.65087 0.28314 -1 221 | 0.68766 0.51467 0.63235 0.44751 1 222 | 0.15416 0.83044 0.69105 0.027009 -1 223 | 0.83522 0.32071 0.52787 0.10613 1 224 | 0.83811 0.3915 0.57094 0.47851 1 225 | 0.57131 0.88752 0.53706 0.55403 1 226 | 0.93257 0.64968 0.24587 0.81109 1 227 | 0.29608 0.083328 0.74109 0.35551 1 228 | 0.46203 0.18142 0.063792 0.92144 1 229 | 0.41203 0.53101 0.77315 0.62032 1 230 | 0.36268 0.29523 0.71811 0.70884 1 231 | 0.39207 0.53465 0.28893 0.93615 1 232 | 0.95333 0.40831 0.29404 0.41991 1 233 | 0.94916 0.34266 0.87255 0.43527 1 234 | 0.19017 0.47568 0.14256 0.44132 -1 235 | 0.85894 0.9006 0.23357 0.80459 1 236 | 0.67525 0.86288 0.013998 0.28517 -1 237 | 0.88734 0.64802 0.36704 0.54815 1 238 | 0.84748 0.20105 0.89731 
0.59314 1 239 | 0.53217 0.98951 0.1954 0.27718 -1 240 | 0.47945 0.30232 0.45604 0.89163 1 241 | 0.99187 0.72996 0.77676 0.72478 1 242 | 0.8889 0.36558 0.82728 0.45772 1 243 | 0.27408 0.7204 0.65677 0.70424 1 244 | 0.52243 0.59938 0.6246 0.11785 -1 245 | 0.76399 0.025814 0.33736 0.20739 1 246 | 0.27187 0.74592 0.21669 0.41116 -1 247 | 0.90839 0.050892 0.67696 0.98549 1 248 | 0.60506 0.54448 0.84372 0.30577 1 249 | 0.10422 0.76155 0.83826 0.5412 1 250 | 0.78474 0.0066151 0.22536 0.50022 1 251 | 0.98582 0.68248 0.28302 0.45186 1 252 | 0.41665 0.81217 0.097022 0.32122 -1 253 | 0.90475 0.46776 0.88671 0.68763 1 254 | 0.033977 0.048415 0.60235 0.065179 -1 255 | 0.98983 0.48006 0.33899 0.29487 1 256 | 0.85168 0.59711 0.93749 0.35835 1 257 | 0.84725 0.020964 0.39386 0.88603 1 258 | 0.56072 0.91605 0.019558 0.42813 -1 259 | 0.11745 0.060389 0.021678 0.58085 -1 260 | 0.20919 0.79555 0.69939 0.78054 1 261 | 0.7171 0.28297 0.84921 0.74192 1 262 | 0.21242 0.32839 0.56807 0.53329 1 263 | 0.48941 0.0084562 0.51977 0.72383 1 264 | 0.98037 0.2035 0.32161 0.4112 1 265 | 0.35711 0.67505 0.11554 0.47356 -1 266 | 0.68983 0.09837 0.66985 0.62623 1 267 | 0.43838 0.026309 0.51285 0.86236 1 268 | 0.10529 0.68645 0.99395 0.63142 1 269 | 0.53952 0.99271 0.27649 0.9474 1 270 | 0.018782 0.74473 0.99206 0.87102 1 271 | 0.51718 0.67211 0.70828 0.31218 1 272 | 0.41189 0.56691 0.78364 0.67886 1 273 | 0.44772 0.18827 0.71978 0.36447 1 274 | 0.317 0.47494 0.54949 0.55973 1 275 | 0.21139 0.30158 0.65269 0.051723 -1 276 | 0.13736 0.51767 0.28234 0.79935 1 277 | 0.037048 0.10755 0.63398 0.76885 1 278 | 0.44087 0.89808 0.67844 0.48225 1 279 | 0.75841 0.78382 0.24322 0.72986 1 280 | 0.87597 0.89991 0.037972 0.2432 -1 281 | 0.60687 0.32885 0.54284 0.67944 1 282 | 0.43019 0.869 0.60879 0.90864 1 283 | 0.65513 0.39801 0.91845 0.53552 1 284 | 0.88689 0.65472 0.99466 0.69948 1 285 | 0.77567 0.94883 0.8498 0.18626 1 286 | 0.97233 0.1599 0.9329 0.089635 1 287 | 0.94461 0.72613 0.71317 0.46217 1 288 | 0.4605 0.97047 0.76531 0.3996 1 289 | 0.5502 0.37931 0.76456 0.80705 1 290 | 0.5828 0.16063 0.74013 0.11508 1 291 | 0.58966 0.49064 0.99596 0.25634 1 292 | 0.96575 0.2141 0.15024 0.98043 1 293 | 0.29939 0.2934 0.46088 0.74118 1 294 | 0.042301 0.51492 0.105 0.33518 -1 295 | 0.62395 0.45102 0.92252 0.77543 1 296 | 0.36607 0.35256 0.32267 0.3285 -1 297 | 0.96545 0.25132 0.064417 0.51374 1 298 | 0.63056 0.053806 0.14816 0.40033 1 299 | 0.48831 0.76017 0.61242 0.48176 1 300 | 0.5583 0.59146 0.24049 0.22209 -1 301 | 0.94304 0.96431 0.31249 0.10506 -1 302 | 0.011705 0.93889 0.25839 0.21194 -1 303 | 0.97164 0.22943 0.18083 0.88409 1 304 | 0.87546 0.6744 0.75024 0.25818 1 305 | 0.64631 0.32332 0.86857 0.40117 1 306 | 0.4276 0.81183 0.34678 0.98935 1 307 | 0.28472 0.82959 0.40054 0.87363 1 308 | 0.62037 0.31285 0.27722 0.64167 1 309 | 0.70482 0.629 0.6828 0.51672 1 310 | 0.83688 0.18413 0.37164 0.51392 1 311 | 0.19111 0.26472 0.19798 0.76058 1 312 | 0.24988 0.091229 0.19524 0.012353 -1 313 | 0.62081 0.11765 0.98492 0.019084 1 314 | 0.18157 0.22637 0.68213 0.74354 1 315 | 0.7659 0.28888 0.61728 0.1657 1 316 | 0.26463 0.45099 0.14001 0.47823 -1 317 | 0.90022 0.31697 0.73717 0.84918 1 318 | 0.85095 0.7647 0.26824 0.61702 1 319 | 0.33281 0.83714 0.21334 0.27535 -1 320 | 0.29159 0.13184 0.10133 0.33435 -1 321 | 0.46935 0.26674 0.023366 0.21269 -1 322 | 0.6042 0.23026 0.50198 0.67093 1 323 | 0.50244 0.31349 0.564 0.74072 1 324 | 0.12275 0.53116 0.37771 0.27835 -1 325 | 0.12977 0.61848 0.83557 0.087753 -1 326 | 0.60099 0.74051 0.046187 0.79207 1 327 | 
0.96669 0.37691 0.014413 0.026769 -1
0.24756 0.67287 0.053795 0.053087 -1
0.31767 0.63018 0.37828 0.27766 -1
0.60216 0.17537 0.1279 0.61092 1
0.087833 0.99196 0.77303 0.98091 1
0.36564 0.23189 0.64808 0.78337 1
0.21106 0.13959 0.20768 0.72656 1
0.6089 0.20358 0.9282 0.39475 1
0.079604 0.58299 0.46986 0.69636 1
0.25485 0.35519 0.26085 0.69246 1
0.67904 0.41069 0.49872 0.69857 1
0.40779 0.8325 0.16625 0.47396 -1
0.46199 0.50523 0.33119 0.92953 1
0.89327 0.56518 0.21383 0.61029 1
0.41033 0.38488 0.12862 0.8564 1
0.058138 0.62899 0.60946 0.99762 1
0.0073587 0.54418 0.26272 0.0063957 -1
0.91431 0.96241 0.89095 0.22206 1
0.97883 0.69139 0.23555 0.56506 1
0.79162 0.25942 0.20671 0.081687 -1
0.1136 0.19133 0.20443 0.44308 -1
0.5753 0.11082 0.96049 0.44523 1
0.66688 0.32664 0.058022 0.21483 -1
0.85187 0.53112 0.29813 0.91085 1
0.5679 0.7258 0.47001 0.49278 1
0.35162 0.85285 0.45142 0.22949 -1
0.2479 0.52952 0.79521 0.44092 1
0.4693 0.60065 0.90787 0.92907 1
0.31096 0.052271 0.25236 0.82934 1
0.55096 0.79786 0.71317 0.8198 1
0.99279 0.15139 0.27982 0.45122 1
0.66404 0.096739 0.26582 0.10294 -1
0.52803 0.1423 0.46639 0.57637 1
0.99328 0.14342 0.0087678 0.84295 1
0.5299 0.17308 0.0613 0.99353 1
0.81762 0.54861 0.87142 0.55873 1
0.68483 0.65517 0.49261 0.65511 1
0.24142 0.53478 0.92219 0.53656 1
0.66164 0.97376 0.61345 0.39626 1
0.049532 0.54176 0.98792 0.89908 1
0.038881 0.38398 0.6202 0.25135 -1
0.61624 0.084068 0.02411 0.65738 1
0.17096 0.41017 0.78869 0.71301 1
0.29773 0.63452 0.9311 0.57032 1
0.041402 0.64972 0.2671 0.15491 -1
0.28259 0.44665 0.57678 0.98452 1
0.16068 0.072643 0.31165 0.29832 -1
0.97714 0.77051 0.54517 0.72295 1
0.87151 0.86679 0.20841 0.69075 1
0.34734 0.25215 0.67884 0.69012 1
0.26408 0.11281 0.021935 0.17689 -1
0.69426 0.41539 0.27711 0.78669 1
0.84044 0.29512 0.56474 0.33757 1
0.39973 0.32958 0.34539 0.66934 1
0.58272 0.40829 0.30819 0.1299 -1
0.4527 0.40875 0.045895 0.41199 -1
0.29341 0.03832 0.7905 0.33916 1
0.92222 0.51471 0.13331 0.56679 1
0.18129 0.96248 0.79131 0.58486 1
0.45696 0.20427 0.69854 0.48235 1
0.96531 0.27775 0.95255 0.56022 1
0.50468 0.99699 0.75136 0.51681 1
0.55852 0.067689 0.666 0.98482 1
0.83188 0.66817 0.23403 0.72472 1
0.97959 0.40402 0.96303 0.28133 1
0.29634 0.4012 0.40266 0.67864 1
0.34922 0.99751 0.23234 0.52115 -1
0.65637 0.7181 0.72843 0.93113 1
0.079695 0.57218 0.70591 0.33812 -1
0.71206 0.51569 0.18168 0.5557 1
0.17528 0.2625 0.8306 0.029669 -1
0.93895 0.93941 0.72496 0.95655 1
0.046136 0.94413 0.038311 0.26812 -1
0.072491 0.2242 0.62592 0.67238 1
--------------------------------------------------------------------------------
/MLFoundation/ex2/17-18.py:
--------------------------------------------------------------------------------
import numpy as np


# generate input data with 20% flipping noise
def generate_input_data(time_seed):
    np.random.seed(time_seed)
    raw_X = np.sort(np.random.uniform(-1, 1, 20))
    # flip 20% of the labels as noise
    noised_y = np.sign(raw_X) * np.where(np.random.random(raw_X.shape[0]) < 0.2, -1, 1)
    return raw_X, noised_y


def calculate_Ein(x, y):
    # candidate thresholds: the midpoints of the sorted interval,
    # plus negative and positive infinity
    thetas = np.array([float("-inf")] + [(x[i] + x[i + 1]) / 2 for i in range(0, x.shape[0] - 1)] + [float("inf")])
    Ein = x.shape[0]
    sign = 1
    target_theta = 0.0
    # positive and negative rays
    for theta in thetas:
        y_positive = np.where(x > theta, 1, -1)
        y_negative = np.where(x < theta, 1, -1)
        error_positive = sum(y_positive != y)
        error_negative = sum(y_negative != y)
        if error_positive > error_negative:
            if Ein > error_negative:
                Ein = error_negative
                sign = -1
                target_theta = theta
        else:
            if Ein > error_positive:
                Ein = error_positive
                sign = 1
                target_theta = theta
    # two corner cases
    if target_theta == float("inf"):
        target_theta = 1.0
    if target_theta == float("-inf"):
        target_theta = -1.0
    return Ein, target_theta, sign


if __name__ == '__main__':
    T = 5000
    total_Ein = 0
    sum_Eout = 0
    for i in range(0, T):
        x, y = generate_input_data(i)
        curr_Ein, theta, sign = calculate_Ein(x, y)
        total_Ein = total_Ein + curr_Ein
        sum_Eout = sum_Eout + 0.5 + 0.3 * sign * (abs(theta) - 1)
    # 17
    print((total_Ein * 1.0) / (T * 20))
    # 18
    print((sum_Eout * 1.0) / T)
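Note: the expression 0.5 + 0.3*s*(|theta| - 1) used above is the closed-form E_out of the stump h(x) = s*sign(x - theta) against the noisy target: h disagrees with sign(x) on a region of probability |theta|/2, and with 20% label flipping E_out = 0.2 + 0.6*P[h != sign(x)]. A Monte-Carlo sanity check of that formula (an illustrative sketch, not part of the homework):

import numpy as np

rng = np.random.default_rng(0)

def eout_mc(s, theta, n=1_000_000):
    """Estimate E_out of h(x) = s*sign(x - theta) under 20% flipping noise."""
    x = rng.uniform(-1, 1, n)
    y = np.sign(x) * np.where(rng.random(n) < 0.2, -1, 1)   # noisy target
    h = s * np.sign(x - theta)
    return np.mean(h != y)

s, theta = 1, 0.3
print(eout_mc(s, theta), 0.5 + 0.3 * s * (abs(theta) - 1))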
--------------------------------------------------------------------------------
/MLFoundation/ex2/19-20.py:
--------------------------------------------------------------------------------
import numpy as np


def read_input_data(path):
    x = []
    y = []
    for line in open(path).readlines():
        items = line.strip().split(' ')
        tmp_x = []
        for i in range(0, len(items) - 1):
            tmp_x.append(float(items[i]))
        x.append(tmp_x)
        y.append(float(items[-1]))
    return np.array(x), np.array(y)


def calculate_Ein(x, y):
    # candidate thresholds: the midpoints of the sorted interval,
    # plus negative and positive infinity
    thetas = np.array([float("-inf")] + [(x[i] + x[i + 1]) / 2 for i in range(0, x.shape[0] - 1)] + [float("inf")])
    Ein = x.shape[0]
    sign = 1
    target_theta = 0.0
    # positive and negative rays
    for theta in thetas:
        y_positive = np.where(x > theta, 1, -1)
        y_negative = np.where(x < theta, 1, -1)
        error_positive = sum(y_positive != y)
        error_negative = sum(y_negative != y)
        if error_positive > error_negative:
            if Ein > error_negative:
                Ein = error_negative
                sign = -1
                target_theta = theta
        else:
            if Ein > error_positive:
                Ein = error_positive
                sign = 1
                target_theta = theta
    return Ein, target_theta, sign


if __name__ == '__main__':
    # 19
    x, y = read_input_data("hw2_train.dat")
    # record the optimal decision stump parameters
    Ein = x.shape[0]
    theta = 0
    sign = 1
    index = 0
    # pick the best stump over every input dimension
    for i in range(0, x.shape[1]):
        input_x = x[:, i]
        input_data = np.transpose(np.array([input_x, y]))
        input_data = input_data[np.argsort(input_data[:, 0])]
        curr_Ein, curr_theta, curr_sign = calculate_Ein(input_data[:, 0], input_data[:, 1])
        if Ein > curr_Ein:
            Ein = curr_Ein
            theta = curr_theta
            sign = curr_sign
            index = i
    print((Ein * 1.0) / x.shape[0])
    # 20
    # test process
    test_x, test_y = read_input_data("hw2_test.dat")
    test_x = test_x[:, index]
    predict_y = np.array([])
    if sign == 1:
        predict_y = np.where(test_x > theta, 1.0, -1.0)
    else:
        predict_y = np.where(test_x < theta, 1.0, -1.0)
    Eout = sum(predict_y != test_y)
    print((Eout * 1.0) / test_x.shape[0])
--------------------------------------------------------------------------------
/MLFoundation/ex2/hw2_train.dat:
--------------------------------------------------------------------------------
8.105 -3.500 4.769 4.541 -9.829 5.252 3.838 -3.408 -4.824 -1
-6.273 -2.097 9.404 1.143 3.487 -5.206 0.061 5.024 -6.687 1
1.624 -1.173 4.260 -3.607 -6.632 4.431 -8.355 7.206 -8.977 1
-10.000 7.758 -2.670 -8.880 -1.099 -9.183 -4.086 8.962 5.841 1
8.464 1.762 2.729 2.724 8.155 6.096 -2.844 9.800 3.302 -1
-0.135 6.193 7.705 7.195 7.313 -3.395 8.012 -6.773 -4.433 1
0.934 -8.379 -2.083 -6.337 4.346 -3.928 9.759 -8.499 -4.128 1
8.923 -0.018 -6.837 6.628 -2.823 -9.524 -6.767 -4.811 -6.296 1
-9.028 7.010 -9.063 -1.111 -9.328 5.282 4.960 -9.569 6.784 -1
-9.706 1.392 6.562 -6.543 -1.980 -6.261 -6.067 1.254 -1.071 1
-6.891 -4.157 1.057 -5.954 4.732 1.729 9.328 -0.308 2.160 1
-0.845 -5.858 -0.486 -4.282 -2.401 7.534 -0.543 1.531 -1.212 -1
-9.596 -3.929 9.556 1.461 0.117 4.288 -6.810 -0.555 -6.020 1
9.124 7.287 -7.506 -1.363 -6.995 0.093 -3.828 2.462 -8.376 1
7.514 7.608 -0.175 7.071 -0.931 9.942 1.359 2.259 -0.613 -1
-1.805 -2.265 -9.636 0.689 6.373 -6.631 -9.218 -7.456 5.831 -1
-3.048 8.819 -8.509 6.777 5.889 0.560 6.719 -2.752 -7.181 -1
-5.873 -9.376 -3.226 -5.509 1.313 -6.853 -2.140 2.095 -4.309 -1
4.250 -5.350 -6.683 5.741 -8.574 9.207 -3.699 8.145 -3.545 -1
8.587 -0.571 -7.906 -4.638 3.920 3.407 -1.491 -8.220 -4.498 1
-8.107 0.089 -7.650 -4.790 -4.171 -6.223 -5.583 2.130 -8.078 1
-8.616 9.386 -9.095 -6.522 -5.252 4.825 6.886 3.256 6.605 -1
-10.000 -3.258 -1.998 -7.559 1.952 3.832 -3.782 6.369 -4.038 1
-4.212 -1.462 -2.603 -3.308 2.016 2.144 -8.483 -1.099 -4.600 1
8.112 3.770 -5.551 -3.885 6.211 6.401 9.946 -7.571 2.770 -1
-8.868 0.669 5.703 -1.472 7.361 -2.282 -9.328 8.879 6.620 1
6.635 5.312 5.358 -8.916 -8.574 1.569 7.485 -8.628 3.998 1
7.432 -8.466 -9.884 3.135 0.062 7.477 -9.147 0.734 6.355 -1
-3.031 2.371 -4.132 -7.674 3.454 -2.706 3.895 0.939 -1.334 1
-10.000 -1.108 7.883 -7.978 -7.973 -2.055 9.498 -7.120 8.679 1
10.000 2.703 -6.408 -4.365 5.029 7.046 2.929 -1.076 -2.015 -1
3.891 1.182 -0.468 1.774 3.203 1.559 9.719 2.702 4.439 -1
-4.895 7.533 3.229 -1.304 -6.832 -1.742 -4.258 6.097 7.182 1
-6.454 -0.875 4.457 3.077 -9.100 -2.340 -5.364 -9.381 -10.000 -1
4.393 8.004 -5.783 -2.378 -3.299 -2.615 5.880 2.443 -6.518 1
0.337 2.622 -4.467 -5.206 -4.301 -3.567 2.454 0.335 -2.949 1
-1.583 7.670 6.972 2.634 -4.708 -6.327 -9.980 -8.828 6.116 1
-8.917 1.634 -6.017 -3.384 6.428 -0.318 3.049 -1.118 -10.000 1
-4.864 1.848 0.375 -7.892 -5.517 5.667 -4.218 -5.498 6.839 -1
5.545 3.762 -5.996 9.528 -9.622 -9.568 -0.789 3.427 -0.686 -1
1.361 -5.169 -3.709 -8.264 -3.060 0.774 7.403 2.721 5.276 -1
7.686 4.347 -0.279 -8.310 3.875 0.099 -7.878 -6.914 -6.474 1
6.890 -7.670 -8.421 -6.819 -5.934 -1.481 3.954 -8.532 -8.760 1
-1.530 8.711 -0.993 8.191 -9.599 -7.117 -1.710 -7.477 -4.031 1
-4.384 3.295 1.583 -2.805 6.476 5.649 5.713 0.430 7.117 -1
-2.528 -9.359 2.564 6.479 8.832 2.966 9.362 -2.878 5.489 1
2.867 3.421 9.149 -5.550 -9.384 5.625 -9.901 6.329 -3.945 1
-6.103 3.564 8.529 6.461 0.044 7.361 -0.573 -0.595 -5.517 -1
-10.000 1.217 -5.353 9.365 5.667 -4.737 4.989 5.765
-8.408 -1 50 | -5.352 -3.079 4.530 -6.823 -6.618 -5.426 -9.462 2.809 3.979 1 51 | 9.667 2.303 8.283 -5.686 1.668 3.949 -0.423 -3.343 -0.286 1 52 | -2.993 9.110 2.642 -8.462 -7.713 6.024 -3.888 -7.175 -1.167 1 53 | 5.873 5.954 0.947 4.155 -9.732 -7.385 -1.896 -0.155 -0.728 1 54 | -3.765 4.062 0.545 8.877 5.600 2.833 4.901 -8.289 5.658 -1 55 | -1.065 -3.518 5.746 9.882 -9.363 6.014 -7.503 -1.259 -4.141 -1 56 | -9.823 3.309 -2.012 0.723 2.186 -6.412 -6.445 -2.913 -4.701 1 57 | -7.490 0.047 -5.807 8.256 -0.070 -5.170 4.271 2.427 3.572 -1 58 | -9.071 3.115 -9.485 -1.083 -6.162 2.701 2.505 -2.607 9.788 1 59 | -7.382 1.835 -8.231 -3.189 0.091 1.698 1.642 -5.638 -5.875 1 60 | 2.551 2.422 4.373 3.066 -8.661 8.210 -4.233 3.844 -4.397 -1 61 | -2.114 9.172 3.369 -0.345 -4.017 -6.540 -8.647 7.625 -2.178 1 62 | 5.056 -9.265 6.228 -0.571 3.801 7.567 -2.361 9.569 1.411 -1 63 | -3.013 -0.825 8.785 -9.643 8.830 -5.231 -6.183 -9.817 -7.606 1 64 | -2.241 4.515 4.151 -6.012 -6.056 -2.047 -8.445 1.584 -2.479 1 65 | 5.637 7.266 -6.890 4.422 7.623 -8.061 9.191 -8.560 -7.878 -1 66 | -9.766 -5.208 -8.244 4.386 -1.221 -4.299 -7.662 0.334 7.284 -1 67 | 6.440 4.960 -0.344 9.550 -0.618 -2.722 -8.511 -1.426 -1.281 -1 68 | 8.634 7.211 -6.378 -9.609 1.597 2.401 -3.909 3.935 -7.265 1 69 | 7.875 -7.259 -9.684 -2.469 -7.710 -0.301 4.809 -6.221 8.272 -1 70 | -5.843 7.417 -7.380 -2.221 7.808 4.217 -9.820 -6.101 -1.848 1 71 | 4.305 0.635 -9.011 4.622 8.166 -6.721 -5.679 2.975 -2.941 -1 72 | 6.433 -4.014 0.649 9.053 3.765 -1.543 3.269 3.946 2.356 -1 73 | 1.617 -9.885 -6.974 2.606 4.737 -8.808 5.885 9.057 4.168 -1 74 | 0.624 -0.892 8.487 -8.727 -1.840 2.252 -0.271 -8.570 -3.802 1 75 | 4.106 -2.164 -1.017 7.132 -9.558 -6.280 8.325 6.327 -7.223 1 76 | 5.663 -2.714 -3.790 4.150 -1.441 4.370 -3.598 8.288 5.800 -1 77 | -5.474 6.195 -7.293 3.509 3.328 -6.851 7.229 1.652 9.476 -1 78 | -8.465 -7.029 -7.304 -2.255 7.120 1.255 -7.885 -6.478 -0.456 1 79 | 1.437 6.306 -1.798 4.145 -0.185 -8.470 7.294 -2.956 3.182 1 80 | 0.927 3.018 -2.395 3.623 -9.236 -5.275 -5.121 -7.121 -1.753 1 81 | 6.346 -1.202 2.456 -5.452 -7.057 -7.729 -3.923 -9.763 -0.685 1 82 | -8.780 -6.548 -9.133 -1.175 7.075 -8.370 3.550 -8.046 -5.491 1 83 | -7.684 7.061 1.463 4.771 -8.391 4.406 7.042 -2.314 4.643 -1 84 | 0.571 -5.249 -2.373 1.438 3.575 -5.297 3.069 -2.875 -3.343 1 85 | -4.453 7.404 -9.191 7.010 2.175 -7.582 1.417 -0.783 0.104 -1 86 | -8.114 -1.131 -4.669 -0.486 -9.693 8.906 4.216 3.376 -3.969 -1 87 | -2.346 9.384 -2.555 -1.536 6.394 9.620 0.882 -2.189 -1.162 -1 88 | 8.614 3.468 1.580 -6.056 -7.018 1.887 -7.150 7.198 -4.737 -1 89 | 3.875 -0.368 -0.563 -8.680 8.095 -4.169 -9.060 -1.023 3.642 1 90 | 6.901 -3.390 2.563 -1.520 0.554 5.544 -9.633 3.405 2.742 -1 91 | 1.901 9.995 -7.577 -8.662 -8.685 -9.482 -2.830 -7.745 -0.505 1 92 | -2.580 -6.876 4.063 9.982 1.604 -5.383 5.527 1.971 8.022 -1 93 | 1.874 1.349 -3.578 4.296 2.687 -2.263 4.814 9.857 -0.008 -1 94 | 1.218 6.413 1.371 -4.719 6.396 -7.025 -0.102 1.922 4.946 1 95 | 4.655 1.148 -6.657 -8.923 -4.556 6.031 -1.186 -9.741 5.888 1 96 | -0.921 9.551 -8.037 -9.549 -5.168 8.359 -6.574 4.731 0.281 1 97 | -7.088 -4.467 -9.106 -3.745 -3.390 -3.662 -7.714 5.423 -3.404 1 98 | -9.721 -5.860 9.048 -7.758 -5.410 -6.119 -9.399 -1.984 8.611 1 99 | 1.099 -9.784 7.673 1.993 -3.529 -5.718 8.331 -1.243 9.706 -1 100 | 5.588 -8.062 3.135 4.636 -5.819 7.725 8.517 -5.218 -4.259 -1 101 | -------------------------------------------------------------------------------- /MLFoundation/ex3/13-15.py: 
--------------------------------------------------------------------------------
import random
import numpy as np


# target function f(x1, x2) = sign(x1^2 + x2^2 - 0.6)
def target_function(x1, x2):
    if (x1 * x1 + x2 * x2 - 0.6) >= 0:
        return 1
    else:
        return -1


# create train_set
def training_data_with_random_error(num=1000):
    features = np.zeros((num, 3))
    labels = np.zeros((num, 1))

    points_x1 = np.array([round(random.uniform(-1, 1), 2) for i in range(num)])
    points_x2 = np.array([round(random.uniform(-1, 1), 2) for i in range(num)])

    for i in range(num):
        # create random feature
        features[i, 0] = 1
        features[i, 1] = points_x1[i]
        features[i, 2] = points_x2[i]
        labels[i] = target_function(points_x1[i], points_x2[i])
        # flip the labels of the first 10% of points as noise
        # (the points themselves are random, so this is a random 10%)
        if i <= num * 0.1:
            if labels[i] < 0:
                labels[i] = 1
            else:
                labels[i] = -1
    return features, labels


def error_rate(features, labels, w):
    wrong = 0
    for i in range(len(labels)):
        if np.dot(features[i], w) * labels[i, 0] < 0:
            wrong += 1
    return wrong / (len(labels) * 1.0)


def linear_regression_closed_form(X, Y):
    """
    linear regression:
    model     : g(x) = Wt * X
    strategy  : squared error
    algorithm : closed form (matrix)
    result    : w = (Xt.X)^-1.Xt.Y
    the closed-form formula from Prof. Lin's lecture
    """
    return np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(Y)


def feature_transform(features):
    new = np.zeros((len(features), 6))
    new[:, 0:3] = features[:, :] * 1
    new[:, 3] = features[:, 1] * features[:, 2]
    new[:, 4] = features[:, 1] * features[:, 1]
    new[:, 5] = features[:, 2] * features[:, 2]
    return new


if __name__ == '__main__':

    # 13
    error_rate_array = []
    for i in range(1000):
        (features, labels) = training_data_with_random_error(1000)
        w13 = linear_regression_closed_form(features, labels)
        error_rate_array.append(error_rate(features, labels, w13))

    # error rate, approximately 0.5
    avr_err = sum(error_rate_array) / (len(error_rate_array) * 1.0)

    print("13--Linear regression for classification without feature transform:Average error--", avr_err)

    # 14
    (features, labels) = training_data_with_random_error(1000)
    new_features = feature_transform(features)
    w14 = linear_regression_closed_form(new_features, labels)
    min_error_in = float("inf")
    error_rate_array = []
    for i in range(1000):
        (features, labels) = training_data_with_random_error(1000)
        new_features = feature_transform(features)

        w = linear_regression_closed_form(new_features, labels)
        error_in = error_rate(new_features, labels, w)
        if error_in <= min_error_in:
            w14 = w
            min_error_in = error_in
        error_rate_array.append(error_in)

    print("w14", w14)

    # avr_err = sum(error_rate_array) / (len(error_rate_array) * 1.0)
    # print("14--Linear regression for classification with feature transform:Average error--", avr_err)

    # 15
    error_out = []
    for i in range(1000):
        (features, labels) = training_data_with_random_error(1000)
        new_features = feature_transform(features)
        error_out.append(error_rate(new_features, labels, w14))

    # bins = np.arange(-1, 1, 0.05)
    # plt.hist(error_out, bins, rwidth=0.8, histtype='bar')
    # plt.title("Error out(with feature transform)")
    # plt.show()

    print("15--Average of E_out is: ", sum(error_out) / (len(error_out) * 1.0))
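Note: linear_regression_closed_form inverts X'X directly; for badly conditioned design matrices, numpy.linalg.lstsq (or numpy.linalg.pinv) returns the same least-squares solution more stably. A small sketch on synthetic data:

import numpy as np

rng = np.random.default_rng(0)
X = np.hstack([np.ones((100, 1)), rng.uniform(-1, 1, (100, 2))])
y = rng.uniform(-1, 1, (100, 1))

w_inv = np.linalg.inv(X.T @ X) @ X.T @ y           # formula used in 13-15.py
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)    # equivalent, better conditioned

print(np.allclose(w_inv, w_lstsq))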
transform)") 114 | # plt.show() 115 | 116 | print("15--Average of E_out is: ", sum(error_out) / (len(error_out) * 1.0)) 117 | -------------------------------------------------------------------------------- /MLFoundation/ex3/18-20.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def data_load(file_path): 5 | # open file and read lines 6 | f = open(file_path) 7 | try: 8 | lines = f.readlines() 9 | finally: 10 | f.close() 11 | 12 | # create features and labels array 13 | example_num = len(lines) 14 | feature_dimension = len(lines[0].strip().split()) 15 | 16 | features = np.zeros((example_num, feature_dimension)) 17 | features[:, 0] = 1 18 | labels = np.zeros((example_num, 1)) 19 | 20 | for index, line in enumerate(lines): 21 | # items[0:-1]--features items[-1]--label 22 | items = line.strip().split(' ') 23 | # get features 24 | features[index, 1:] = [float(str_num) for str_num in items[0:-1]] 25 | 26 | # get label 27 | labels[index] = float(items[-1]) 28 | 29 | return features, labels 30 | 31 | 32 | # gradient descent 33 | def gradient_descent(X, y, w): 34 | # -YnWtXn 35 | tmp = -y * (np.dot(X, w)) 36 | 37 | # θ(-YnWtXn) = exp(tmp)/1+exp(tmp) 38 | # weight_matrix = np.array([math.exp(_)/(1+math.exp(_)) for _ in tmp]).reshape(len(X), 1) 39 | weight_matrix = np.exp(tmp) / ((1 + np.exp(tmp)) * 1.0) 40 | gradient = 1 / (len(X) * 1.0) * (sum(weight_matrix * -y * X).reshape(len(w), 1)) 41 | 42 | return gradient 43 | 44 | 45 | # gradient descent 46 | def stochastic_gradient_descent(X, y, w): 47 | # -YnWtXn 48 | tmp = -y * (np.dot(X, w)) 49 | 50 | # θ(-YnWtXn) = exp(tmp)/1+exp(tmp) 51 | # weight = math.exp(tmp[0])/((1+math.exp(tmp[0]))*1.0) 52 | weight = np.exp(tmp) / ((1 + np.exp(tmp)) * 1.0) 53 | 54 | gradient = weight * -y * X 55 | return gradient.reshape(len(gradient), 1) 56 | 57 | 58 | # LinearRegression Class 59 | class LinearRegression: 60 | 61 | def __init__(self): 62 | pass 63 | 64 | # fit model 65 | def fit(self, X, y, Eta=0.001, max_iteration=2000, sgd=False): 66 | # ∂E/∂w = 1/N * ∑θ(-YnWtXn)(-YnXn) 67 | self.__w = np.zeros((len(X[0]), 1)) 68 | 69 | # whether use stochastic gradient descent 70 | if not sgd: 71 | for i in range(max_iteration): 72 | self.__w = self.__w - Eta * gradient_descent(X, y, self.__w) 73 | else: 74 | index = 0 75 | for i in range(max_iteration): 76 | if (index >= len(X)): 77 | index = 0 78 | self.__w = self.__w - Eta * stochastic_gradient_descent(np.array(X[index]), y[index], self.__w) 79 | index += 1 80 | 81 | # predict 82 | def predict(self, X): 83 | binary_result = np.dot(X, self.__w) >= 0 84 | return np.array([(1 if _ > 0 else -1) for _ in binary_result]).reshape(len(X), 1) 85 | 86 | # get vector w 87 | def get_w(self): 88 | return self.__w 89 | 90 | # score(error rate) 91 | def score(self, X, y): 92 | predict_y = self.predict(X) 93 | return sum(predict_y != y) / (len(y) * 1.0) 94 | 95 | 96 | if __name__ == '__main__': 97 | # 18 98 | # training model 99 | (X, Y) = data_load("hw3_train.dat") 100 | lr = LinearRegression() 101 | lr.fit(X, Y, max_iteration=2000) 102 | 103 | # get 0/1 error in test data 104 | test_X, test_Y = data_load("hw3_test.dat") 105 | print("E_out: ", lr.score(test_X, test_Y)) 106 | 107 | # 19 108 | # training model 109 | (X, Y) = data_load("hw3_train.dat") 110 | lr_eta = LinearRegression() 111 | lr_eta.fit(X, Y, 0.01, 2000) 112 | 113 | # get 0/1 error in test data 114 | test_X, test_Y = data_load("hw3_test.dat") 115 | print("E_out: ", lr_eta.score(test_X, test_Y)) 116 | 117 
--------------------------------------------------------------------------------
/MLFoundation/ex4/13-20.py:
--------------------------------------------------------------------------------
import numpy as np


# load data
def load_data(filename):
    code = open(filename, "r")
    lines = code.readlines()
    # plain float/int here: np.float and np.int are deprecated aliases
    xn = np.zeros((len(lines), 3)).astype(float)
    yn = np.zeros((len(lines),)).astype(int)

    for i in range(0, len(lines)):
        line = lines[i]
        line = line.rstrip('\r\n').replace('\t', ' ').split(' ')
        xn[i, 0] = 1
        for j in range(1, len(xn[0])):
            xn[i, j] = float(line[j - 1])
        yn[i] = int(line[len(xn[0]) - 1])
    return xn, yn


# ridge-regression normal equation: w = (X'X + lambda*I)^-1 X'y
def calculate_w_reg(x, y, lambda_value):
    return np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x) + lambda_value * np.eye(x.shape[1])), x.transpose()), y)


# 0/1 error of w on (x, y)
def calculate_E(w, x, y):
    scores = np.dot(w, x.transpose())
    predicts = np.where(scores >= 0, 1.0, -1.0)
    E_out_num = sum(predicts != y)
    return (E_out_num * 1.0) / predicts.shape[0]


if __name__ == '__main__':
    # prepare train and test data
    train_x, train_y = load_data("hw4_train.dat")
    test_x, test_y = load_data("hw4_test.dat")

    # Q13
    lambda_value = 10
    W = calculate_w_reg(train_x, train_y, lambda_value)
    Ein = calculate_E(W, train_x, train_y)
    Eout = calculate_E(W, test_x, test_y)
    print('Q13: Ein = ', Ein, ', Eout= ', Eout)

    # Q14-Q15
    Ein_min = float("inf")
    optimal_Eout = 0
    optimal_lambda_Ein = 0

    Eout_min = float("inf")
    optimal_Ein = 0
    optimal_lambda_Eout = 0
    for lambda_value in range(2, -11, -1):
        # calculate ridge regression W
        w_reg = calculate_w_reg(train_x, train_y, pow(10, lambda_value))
        Ein = calculate_E(w_reg, train_x, train_y)
        Eout = calculate_E(w_reg, test_x, test_y)

        # update Ein, Eout, lambda
        if Ein_min > Ein:
            Ein_min = Ein
            optimal_lambda_Ein = lambda_value
            optimal_Eout = Eout

        if Eout_min > Eout:
            Eout_min = Eout
            optimal_lambda_Eout = lambda_value
            optimal_Ein = Ein
    # Q14
    print('Q14: log10lambda = ', optimal_lambda_Ein, ', Ein= ', Ein_min, ', Eout = ', optimal_Eout)
    # Q15
    print('Q15: log10lambda = ', optimal_lambda_Eout, ', Ein = ', optimal_Ein, ', Eout= ', Eout_min)

    # Q16-Q17
    Etrain_min = float("inf")
    Eval_min = float("inf")

    # values tracked alongside Etrain_min
    Eout_Etrain_min = 0
    Eval_Etrain_min = 0
    optimal_lambda_Etrain_min = 0

    # values tracked alongside Eval_min
    Etrain_Eval_min = 0
    Eout_Eval_min = 0
    optimal_lambda_Eval_min = 0

    split = 120

    for lambda_value in range(2, -11, -1):
        w_reg = calculate_w_reg(train_x[:split], train_y[:split], pow(10, lambda_value))
        Etrain = calculate_E(w_reg, train_x[:split], train_y[:split])
        Eval = calculate_E(w_reg, train_x[split:], train_y[split:])
        Eout = calculate_E(w_reg, test_x, test_y)

        if Etrain_min > Etrain:
            optimal_lambda_Etrain_min = lambda_value
            Etrain_min = Etrain
            Eout_Etrain_min = Eout
            Eval_Etrain_min = Eval

        if Eval_min > Eval:
            optimal_lambda_Eval_min = lambda_value
            Eout_Eval_min = Eout
            Eval_min = Eval
            Etrain_Eval_min = Etrain
    # Q16
    print('Q16: log10 = ', optimal_lambda_Etrain_min, ', Etrain= ', Etrain_min, ', Eval = ', Eval_Etrain_min,
          ', Eout = ', Eout_Etrain_min)
    # Q17
    print('Q17: log10 = ', optimal_lambda_Eval_min, ', Etrain= ', Etrain_Eval_min, ', Eval = ', Eval_min, ', Eout = ',
          Eout_Eval_min)

    # Q18
    # optimal_lambda_Eval_min is the optimal lambda found in Q17
    w_reg = calculate_w_reg(train_x, train_y, pow(10, optimal_lambda_Eval_min))
    optimal_Ein = calculate_E(w_reg, train_x, train_y)
    optimal_Eout = calculate_E(w_reg, test_x, test_y)
    print('Q18: Ein = ', optimal_Ein, ', Eout = ', optimal_Eout)

    # Q19
    folder_num = 5
    split_folder = 40

    Ecv_min = float("inf")
    optimal_lambda = 0
    for lambda_value in range(2, -11, -1):
        total_cv = 0
        for i in range(folder_num):
            # get the held-out fold
            test_data_x = train_x[i * split_folder:(i + 1) * split_folder, :]
            test_data_y = train_y[i * split_folder:(i + 1) * split_folder]

            # train_data = raw_data minus the held-out fold; the fold may sit
            # in the middle of the data or at either end
            if 0 < i < (folder_num - 1):
                train_data_x = np.concatenate((train_x[0:i * split_folder, :], train_x[(i + 1) * split_folder:, :]),
                                              axis=0)
                train_data_y = np.concatenate((train_y[0:i * split_folder], train_y[(i + 1) * split_folder:]), axis=0)
            elif i == 0:
                train_data_x = train_x[split_folder:, :]
                train_data_y = train_y[split_folder:]
            else:
                train_data_x = train_x[0:i * split_folder, :]
                train_data_y = train_y[0:i * split_folder]

            w_reg = calculate_w_reg(train_data_x, train_data_y, pow(10, lambda_value))
            Ecv = calculate_E(w_reg, test_data_x, test_data_y)
            total_cv += Ecv
        total_cv = total_cv * 1.0 / folder_num
        if Ecv_min > total_cv:
            Ecv_min = total_cv
            optimal_lambda = lambda_value

    print('Q19: log10=', optimal_lambda, ' Ecv=', Ecv_min)

    # Q20
    w_reg = calculate_w_reg(train_x, train_y, pow(10, optimal_lambda))
    Ein = calculate_E(w_reg, train_x, train_y)
    Eout = calculate_E(w_reg, test_x, test_y)
    print('Q20: Ein = ', Ein, 'Eout = ', Eout)
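Note: calculate_w_reg implements the regularized normal equation w_reg = (X'X + lambda*I)^-1 X'y; lambda = 0 recovers ordinary least squares, and growing lambda monotonically shrinks the norm of w_reg. A small sketch on synthetic data showing the shrinkage:

import numpy as np

rng = np.random.default_rng(0)
X = np.hstack([np.ones((50, 1)), rng.uniform(-1, 1, (50, 2))])
y = X @ np.array([0.5, -2.0, 3.0]) + 0.1 * rng.normal(size=50)

def ridge(X, y, lam):
    d = X.shape[1]
    return np.linalg.inv(X.T @ X + lam * np.eye(d)) @ X.T @ y

for lam in (0.0, 1.0, 100.0):
    w = ridge(X, y, lam)
    print(lam, np.linalg.norm(w))   # the weight norm shrinks as lambda grows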
--------------------------------------------------------------------------------
/MLFoundation/ex4/hw4_train.dat:
--------------------------------------------------------------------------------
0.568304 0.568283 1
0.310968 0.310956 -1
0.103376 0.103373 -1
0.0531882 0.053218 -1
0.97006 0.970064 1
0.0941873 0.0941707 -1
0.655902 0.655892 1
0.370821 0.370839 -1
0.558482 0.558476 1
0.849389 0.849383 1
0.796038 0.796051 1
0.723246 0.723252 1
0.571236 0.571254 1
0.385144 0.38512 -1
0.877176 0.877168 1
0.74655 0.746552 1
0.0676164 0.0676087 -1
0.0412524 0.0412649 -1
0.851637 0.851661 1
0.586989 0.58698 1
0.661014 0.660994 1
0.587988 0.587968 1
0.257615 0.257628 -1
0.680505 0.680485 1
0.895242 0.895257 1
0.381124 0.381139 -1
0.314332 0.31433 -1
0.157744 0.157747 -1
0.670923 0.670925 1
0.531716 0.531736 1
0.810956 0.810938 1
0.514937 0.51493 1
0.188567 0.188587 -1
0.778528 0.778527 1
0.904966 0.904955 1
0.563699 0.563708 1
0.599768 0.59978 1
0.619909 0.619928 1
0.650556 0.650556 1
0.131949 0.131967 -1
0.251546 0.251546 -1
0.690874
0.690863 1 43 | 0.381249 0.381284 -1 44 | 0.559231 0.559232 1 45 | 0.197361 0.197367 -1 46 | 0.784776 0.784781 1 47 | 0.620494 0.620499 1 48 | 0.229646 0.229647 -1 49 | 0.0891466 0.0891438 -1 50 | 0.981857 0.981861 1 51 | 0.64711 0.647102 1 52 | 0.725596 0.725592 1 53 | 0.614771 0.614764 1 54 | 0.976315 0.976321 1 55 | 0.250716 0.250708 -1 56 | 0.281071 0.281096 -1 57 | 0.550196 0.550187 1 58 | 0.955756 0.955751 1 59 | 0.251821 0.251838 -1 60 | 0.538196 0.538183 1 61 | 0.58285 0.582836 1 62 | 0.48367 0.48368 -1 63 | 0.481451 0.481471 -1 64 | 0.291576 0.291561 -1 65 | 0.181592 0.181596 -1 66 | 0.232746 0.232759 -1 67 | 0.488322 0.488349 -1 68 | 0.664499 0.664487 1 69 | 0.0420094 0.0420475 -1 70 | 0.950521 0.950524 1 71 | 0.445707 0.445706 -1 72 | 0.430385 0.430396 -1 73 | 0.747574 0.747583 1 74 | 0.245047 0.245078 -1 75 | 0.742838 0.742833 1 76 | 0.284625 0.284627 -1 77 | 0.0613909 0.061374 -1 78 | 0.612767 0.612754 1 79 | 0.378545 0.378555 -1 80 | 0.818764 0.818763 1 81 | 0.0507026 0.0507136 -1 82 | 0.882725 0.882731 1 83 | 0.0810847 0.0810796 -1 84 | 0.836278 0.836279 1 85 | 0.696709 0.696695 1 86 | 0.603346 0.603334 1 87 | 0.513718 0.513712 1 88 | 0.247789 0.247802 -1 89 | 0.704221 0.704213 1 90 | 0.546723 0.546724 1 91 | 0.881583 0.881592 1 92 | 0.13456 0.134545 -1 93 | 0.86883 0.868815 1 94 | 0.980909 0.980887 1 95 | 0.369986 0.369986 -1 96 | 0.194455 0.194457 -1 97 | 0.483858 0.483875 -1 98 | 0.43807 0.43808 -1 99 | 0.159602 0.159592 -1 100 | 0.923499 0.923504 1 101 | 0.419902 0.419906 -1 102 | 0.659252 0.659271 1 103 | 0.419546 0.419546 -1 104 | 0.935494 0.935512 1 105 | 0.712397 0.71239 1 106 | 0.952567 0.952549 1 107 | 0.915359 0.915379 1 108 | 0.182693 0.182675 -1 109 | 0.668527 0.668522 1 110 | 0.0965221 0.0965266 -1 111 | 0.984174 0.984197 1 112 | 0.7437 0.743702 1 113 | 0.213357 0.213341 -1 114 | 0.617402 0.617386 1 115 | 0.335604 0.335604 -1 116 | 0.632581 0.632597 1 117 | 0.515744 0.515757 1 118 | 0.786921 0.786912 1 119 | 0.502608 0.502599 1 120 | 0.164538 0.164537 -1 121 | 0.507454 0.507469 1 122 | 0.822809 0.822806 1 123 | 0.42883 0.428821 -1 124 | 0.157678 0.157693 -1 125 | 0.674884 0.674896 1 126 | 0.276618 0.276622 -1 127 | 0.374795 0.374795 -1 128 | 0.396781 0.396815 -1 129 | 0.132116 0.132101 -1 130 | 0.966203 0.966249 1 131 | 0.961164 0.961159 1 132 | 0.0140044 0.014014 -1 133 | 0.509361 0.509379 1 134 | 0.195082 0.195097 -1 135 | 0.853012 0.853012 1 136 | 0.852883 0.852896 1 137 | 0.574279 0.574282 1 138 | 0.316965 0.316939 -1 139 | 0.386753 0.386761 -1 140 | 0.764792 0.764815 1 141 | 0.680442 0.680428 1 142 | 0.125299 0.125304 -1 143 | 0.619824 0.619818 1 144 | 0.687672 0.687662 1 145 | 0.760271 0.760289 1 146 | 0.227148 0.22713 -1 147 | 0.224288 0.224295 -1 148 | 0.0150326 0.0150352 -1 149 | 0.585322 0.585314 1 150 | 0.732755 0.732777 1 151 | 0.864553 0.864569 1 152 | 0.0788415 0.0788569 -1 153 | 0.4326 0.432602 -1 154 | 0.804816 0.804801 1 155 | 0.50957 0.509589 1 156 | 0.405003 0.404988 -1 157 | 0.465702 0.465691 -1 158 | 0.368576 0.368574 -1 159 | 0.56202 0.562033 1 160 | 0.552361 0.552356 1 161 | 0.18263 0.182606 -1 162 | 0.672912 0.672906 1 163 | 0.642397 0.642413 1 164 | 0.816308 0.816316 1 165 | 0.264986 0.264978 -1 166 | 0.799168 0.799179 1 167 | 0.311442 0.311432 -1 168 | 0.715291 0.715278 1 169 | 0.913262 0.913265 1 170 | 0.703566 0.70358 1 171 | 0.0868818 0.0868856 -1 172 | 0.507828 0.507835 1 173 | 0.77619 0.776196 1 174 | 0.503254 0.503257 1 175 | 0.0585257 0.0585251 -1 176 | 0.668003 0.667995 1 177 | 0.409675 0.409686 -1 178 | 0.00104673 0.00105247 
-1 179 | 0.6743 0.674268 1 180 | 0.461383 0.461378 -1 181 | 0.957667 0.957677 1 182 | 0.386593 0.386566 -1 183 | 0.260177 0.260171 -1 184 | 0.208071 0.208076 -1 185 | 0.634661 0.634646 1 186 | 0.354351 0.354351 -1 187 | 0.135384 0.135381 -1 188 | 0.216718 0.216748 -1 189 | 0.606084 0.606096 1 190 | 0.443809 0.443801 -1 191 | 0.480428 0.480418 -1 192 | 0.886987 0.886995 1 193 | 0.0126171 0.012603 -1 194 | 0.578502 0.578495 1 195 | 0.0664441 0.0664438 -1 196 | 0.292442 0.292432 -1 197 | 0.487013 0.487008 -1 198 | 0.176237 0.176234 -1 199 | 0.496052 0.496044 -1 200 | 0.62186 0.621853 1 201 | -------------------------------------------------------------------------------- /MLFoundation/pdf/01_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/01_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/02_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/02_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/03_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/03_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/04_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/04_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/05_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/05_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/06_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/06_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/07_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/07_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/08_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/08_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/09_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/09_handout.pdf 
-------------------------------------------------------------------------------- /MLFoundation/pdf/10_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/10_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/11_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/11_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/12_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/12_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/13_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/13_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/14_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/14_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/15_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/15_handout.pdf -------------------------------------------------------------------------------- /MLFoundation/pdf/16_handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/16_handout.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LearningML 2 | 3 | 学习机器学习过程中一些课程习题,算法的实现 4 | 5 | [Coursera AndrewNg 机器学习 matlab实现](https://github.com/xjwhhh/AndrewNgMachineLearning) 6 | 7 | [Coursera 国立台湾大学 林轩田 机器学习基石](https://github.com/xjwhhh/LearningML/tree/master/MLFoundation) 8 | 9 | [李航 统计学习方法](https://github.com/xjwhhh/LearningML/tree/master/StatisticalLearningMethod) 10 | 11 | [周志华 机器学习](https://github.com/xjwhhh/LearningML/tree/master/watermelon) 12 | 13 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter2/Perceptron.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import random 3 | import time 4 | import logging 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | 9 | 10 | def log(func): 11 | def wrapper(*args, **kwargs): 12 | start_time = time.time() 13 | logging.debug('start %s()' % func.__name__) 14 | ret = func(*args, **kwargs) 15 | 16 | end_time = time.time() 17 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 18 | 19 | 
return ret 20 | 21 | return wrapper 22 | 23 | 24 | class Perceptron(object): 25 | 26 | def __init__(self): 27 | self.learning_step = 0.00001 28 | self.max_iteration = 5000 29 | 30 | def predict_(self, x): 31 | wx = 0 32 | for i in range(len(self.w)): 33 | wx += self.w[i] * x[i] 34 | 35 | return int(wx > 0) 36 | 37 | @log 38 | def train(self, features, labels): 39 | # (1) 40 | self.w = [0.0] * (len(features[0]) + 1) 41 | 42 | correct_count = 0 43 | 44 | while True: 45 | # (2) 46 | # 有可能随机生成相同的数字,使得correct_count对一个数据有重复计算,但无伤大雅 47 | index = random.randint(0, len(labels) - 1) 48 | x = list(features[index]) 49 | x.append(1.0) 50 | if labels[index] == 1: 51 | y = 1 52 | else: 53 | y = -1 54 | wx = 0 55 | for i in range(len(self.w)): 56 | wx += self.w[i] * x[i] 57 | 58 | # 验证正确 59 | if wx * y > 0: 60 | correct_count += 1 61 | # 训练集大约有两万多数据,这里可随意取适宜的值,用来跳出while循环 62 | if correct_count > 10000: 63 | break 64 | continue 65 | 66 | # (3) 67 | # 验证错误,修改w值 68 | for i in range(len(self.w)): 69 | self.w[i] += self.learning_step * (y * x[i]) 70 | 71 | @log 72 | def predict(self, features): 73 | predict_labels = [] 74 | for feature in features: 75 | x = list(feature) 76 | x.append(1) 77 | predict_labels.append(self.predict_(x)) 78 | return predict_labels 79 | 80 | 81 | if __name__ == '__main__': 82 | # 记录 83 | logger = logging.getLogger() 84 | logger.setLevel(logging.DEBUG) 85 | 86 | raw_data = pd.read_csv('../data/train_binary.csv', header=0) 87 | data = raw_data.values 88 | 89 | images = data[0:, 1:] 90 | labels = data[:, 0] 91 | 92 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 93 | train_features, test_features, train_labels, test_labels = train_test_split( 94 | images, labels, test_size=0.33, random_state=1) 95 | 96 | # 模型训练 97 | p = Perceptron() 98 | p.train(train_features, train_labels) 99 | 100 | # 使用测试集预测 101 | test_predict = p.predict(test_features) 102 | 103 | # 计算准确率 104 | # 因为是随机的,每次得到的准确率都不同 105 | score = accuracy_score(test_labels, test_predict) 106 | print("The accuracy score is ", score) 107 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter3/K-NN.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import cv2 4 | import logging 5 | import time 6 | 7 | from math import sqrt 8 | from collections import namedtuple 9 | 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import accuracy_score 12 | 13 | 14 | def log(func): 15 | def wrapper(*args, **kwargs): 16 | start_time = time.time() 17 | logging.debug('start %s()' % func.__name__) 18 | ret = func(*args, **kwargs) 19 | 20 | end_time = time.time() 21 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 22 | 23 | return ret 24 | 25 | return wrapper 26 | 27 | 28 | def get_hog_features(trainset): 29 | # 利用opencv获取图像hog特征 30 | 31 | features = [] 32 | 33 | hog = cv2.HOGDescriptor('../hog.xml') 34 | 35 | for img in trainset: 36 | img = np.reshape(img, (28, 28)) 37 | cv_img = img.astype(np.uint8) 38 | 39 | hog_feature = hog.compute(cv_img) 40 | # hog_feature = np.transpose(hog_feature) 41 | features.append(hog_feature) 42 | 43 | features = np.array(features) 44 | features = np.reshape(features, (-1, 324)) 45 | 46 | return features 47 | 48 | 49 | def predict(test_set, kd_tree): 50 | predict = [] 51 | 52 | for i in range(len(test_set)): 53 | predict.append(find_nearest(kd_tree, test_set[i]).label) 54 | 55 | return np.array(predict) 56 | 57 | 58 | # 
构造kdTree搜索 59 | # 现在的实现是最近邻, 60 | # 问题1:怎么保存每个结点对应的label,现在的实现似乎成功了,但我不确定 61 | # 问题2:速度非常慢 62 | 63 | class KdNode(object): 64 | def __init__(self, dom_elt, split, left, right, label): 65 | self.dom_elt = dom_elt # k维向量节点(k维空间中的一个样本点) 66 | self.split = split # 整数(进行分割维度的序号) 67 | self.left = left # 该结点分割超平面左子空间构成的kd-tree 68 | self.right = right # 该结点分割超平面右子空间构成的kd-tree 69 | self.label = label 70 | 71 | 72 | class KdTree(object): 73 | 74 | @log 75 | def __init__(self, data, labels): 76 | k = len(data[0]) # 数据维度 77 | 78 | def create_node(split, data_set, labels): # 按第split维划分数据集,创建KdNode 79 | 80 | # print(len(data_set)) 81 | if (len(data_set) == 0): 82 | return None 83 | 84 | sort_index = data_set[:, split].argsort() 85 | data_set = data_set[sort_index] 86 | labels = labels[sort_index] 87 | # print(data_set) 88 | 89 | split_pos = len(data_set) // 2 90 | # print(split_pos) 91 | median = data_set[split_pos] # 中位数分割点 92 | label = labels[split_pos] 93 | split_next = (split + 1) % k # cycle coordinates 94 | 95 | # 递归的创建kd树 96 | return KdNode(median, split, 97 | create_node(split_next, data_set[:split_pos], labels[:split_pos]), # 创建左子树 98 | create_node(split_next, data_set[split_pos + 1:], labels[split_pos + 1:]), # 创建右子树 99 | label) 100 | 101 | self.root = create_node(0, data, labels) # 从第0维分量开始构建kd树,返回根节点 102 | 103 | 104 | # 定义一个namedtuple,分别存放最近坐标点、最近距离和访问过的节点数 105 | result = namedtuple("Result_tuple", "nearest_point nearest_dist nodes_visited label") 106 | 107 | 108 | @log 109 | def find_nearest(tree, point): 110 | k = len(point) # 数据维度 111 | 112 | def travel(kd_node, target, max_dist): 113 | if kd_node is None: 114 | return result([0] * k, float("inf"), 0, 0) # python中用float("inf")和float("-inf")表示正负无穷 115 | 116 | nodes_visited = 1 117 | 118 | s = kd_node.split # 进行分割的维度 119 | pivot = kd_node.dom_elt # 进行分割的“轴” 120 | 121 | if target[s] <= pivot[s]: # 如果目标点第s维小于分割轴的对应值(目标离左子树更近) 122 | nearer_node = kd_node.left # 下一个访问节点为左子树根节点 123 | further_node = kd_node.right # 同时记录下右子树 124 | else: # 目标离右子树更近 125 | nearer_node = kd_node.right # 下一个访问节点为右子树根节点 126 | further_node = kd_node.left 127 | if (nearer_node is None): 128 | label = 0 129 | else: 130 | label = nearer_node.label 131 | 132 | temp1 = travel(nearer_node, target, max_dist) # 进行遍历找到包含目标点的区域 133 | 134 | nearest = temp1.nearest_point # 以此叶结点作为“当前最近点” 135 | dist = temp1.nearest_dist # 更新最近距离 136 | 137 | nodes_visited += temp1.nodes_visited 138 | 139 | if dist < max_dist: 140 | max_dist = dist # 最近点将在以目标点为球心,max_dist为半径的超球体内 141 | 142 | temp_dist = abs(pivot[s] - target[s]) # 第s维上目标点与分割超平面的距离 143 | if max_dist < temp_dist: # 判断超球体是否与超平面相交 144 | return result(nearest, dist, nodes_visited, temp1.label) # 不相交则可以直接返回,不用继续判断 145 | 146 | # ---------------------------------------------------------------------- 147 | # 计算目标点与分割点的欧氏距离 148 | temp_dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(pivot, target))) 149 | 150 | if temp_dist < dist: # 如果“更近” 151 | nearest = pivot # 更新最近点 152 | dist = temp_dist # 更新最近距离 153 | max_dist = dist # 更新超球体半径 154 | label = kd_node 155 | 156 | # 检查另一个子结点对应的区域是否有更近的点 157 | temp2 = travel(further_node, target, max_dist) 158 | 159 | nodes_visited += temp2.nodes_visited 160 | if temp2.nearest_dist < dist: # 如果另一个子结点内存在更近距离 161 | nearest = temp2.nearest_point # 更新最近点 162 | dist = temp2.nearest_dist # 更新最近距离 163 | label = temp2.label 164 | 165 | return result(nearest, dist, nodes_visited, label) 166 | 167 | return travel(tree.root, point, float("inf")) # 从根节点开始递归 168 | 169 | 170 | k = 10 171 | 172 | if __name__ == '__main__': 173 
| logger = logging.getLogger() 174 | logger.setLevel(logging.DEBUG) 175 | 176 | raw_data = pd.read_csv('../data/train.csv', header=0) 177 | data = raw_data.values 178 | 179 | images = data[0:, 1:] 180 | labels = data[:, 0] 181 | 182 | features = get_hog_features(images) 183 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 184 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, 185 | random_state=1) 186 | 187 | kd_tree = KdTree(train_features, train_labels) 188 | 189 | test_predict = predict(test_features, kd_tree) 190 | 191 | score = accuracy_score(test_labels, test_predict) 192 | print("The accuracy score is ", score) 193 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter3/K-NN1.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import cv2 6 | import random 7 | import time 8 | 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | 12 | 13 | # 利用opencv获取图像hog特征 14 | def get_hog_features(trainset): 15 | features = [] 16 | 17 | hog = cv2.HOGDescriptor('../hog.xml') 18 | 19 | for img in trainset: 20 | img = np.reshape(img, (28, 28)) 21 | cv_img = img.astype(np.uint8) 22 | 23 | hog_feature = hog.compute(cv_img) 24 | # hog_feature = np.transpose(hog_feature) 25 | features.append(hog_feature) 26 | 27 | features = np.array(features) 28 | features = np.reshape(features, (-1, 324)) 29 | 30 | return features 31 | 32 | 33 | def Predict(testset, trainset, train_labels): 34 | predict = [] 35 | count = 0 36 | 37 | # 线性搜索 38 | for test_vec in testset: 39 | # 输出当前运行的测试用例坐标,用于测试 40 | print(count) 41 | count += 1 42 | 43 | knn_list = [] # 当前k个最近邻居 44 | max_index = -1 # 当前k个最近邻居中距离最远点的坐标 45 | max_dist = 0 # 当前k个最近邻居中距离最远点的距离 46 | 47 | # 先将前k个点放入k个最近邻居中,填充满knn_list 48 | for i in range(k): 49 | label = train_labels[i] 50 | train_vec = trainset[i] 51 | 52 | dist = np.linalg.norm(train_vec - test_vec) # 计算两个点的欧氏距离 53 | 54 | knn_list.append((dist, label)) 55 | 56 | # 剩下的点 57 | for i in range(k, len(train_labels)): 58 | label = train_labels[i] 59 | train_vec = trainset[i] 60 | 61 | dist = np.linalg.norm(train_vec - test_vec) # 计算两个点的欧氏距离 62 | 63 | # 寻找10个邻近点钟距离最远的点 64 | if max_index < 0: 65 | for j in range(k): 66 | if max_dist < knn_list[j][0]: 67 | max_index = j 68 | max_dist = knn_list[max_index][0] 69 | 70 | # 如果当前k个最近邻居中存在点距离比当前点距离远,则替换 71 | if dist < max_dist: 72 | knn_list[max_index] = (dist, label) 73 | max_index = -1 74 | max_dist = 0 75 | 76 | # 统计选票 77 | class_total = 10 78 | class_count = [0 for i in range(class_total)] 79 | for dist, label in knn_list: 80 | class_count[label] += 1 81 | 82 | # 找出最大选票 83 | mmax = max(class_count) 84 | 85 | # 找出最大选票标签 86 | for i in range(class_total): 87 | if mmax == class_count[i]: 88 | predict.append(i) 89 | break 90 | 91 | return np.array(predict) 92 | 93 | 94 | k = 10 95 | 96 | if __name__ == '__main__': 97 | print('Start read data') 98 | 99 | time_1 = time.time() 100 | 101 | raw_data = pd.read_csv('../data/train.csv', header=0) 102 | data = raw_data.values 103 | 104 | imgs = data[0::, 1::] 105 | labels = data[::, 0] 106 | 107 | features = get_hog_features(imgs) 108 | 109 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 110 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, 111 | random_state=23323) 112 | # print train_features.shape 113 | # print 
train_features.shape 114 | 115 | time_2 = time.time() 116 | print('read data cost ', time_2 - time_1, ' second') 117 | 118 | print('Start training') 119 | print('knn do not need to train') 120 | time_3 = time.time() 121 | print('training cost ', time_3 - time_2, ' second') 122 | 123 | print('Start predicting') 124 | test_predict = Predict(test_features, train_features, train_labels) 125 | time_4 = time.time() 126 | print('predicting cost ', time_4 - time_3, ' second') 127 | 128 | score = accuracy_score(test_labels, test_predict) 129 | print("The accuracy score is ", score) 130 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter3/K-NN2.py: -------------------------------------------------------------------------------- 1 | # --*-- coding:utf-8 --*-- 2 | import numpy as np 3 | 4 | 5 | class Node: # 结点 6 | def __init__(self, data, lchild=None, rchild=None): 7 | self.data = data 8 | self.lchild = lchild 9 | self.rchild = rchild 10 | 11 | 12 | class KdTree: # kd树 13 | def __init__(self): 14 | self.kdTree = None 15 | 16 | def create(self, dataSet, depth): # 创建kd树,返回根结点 17 | if (len(dataSet) > 0): 18 | m, n = np.shape(dataSet) # 求出样本行,列 19 | midIndex = int(m / 2) # 中间数的索引位置 20 | axis = depth % n # 判断以哪个轴划分数据 21 | sortedDataSet = self.sort(dataSet, axis) # 进行排序 22 | node = Node(sortedDataSet[midIndex]) # 将节点数据域设置为中位数,具体参考下书本 23 | # print sortedDataSet[midIndex] 24 | leftDataSet = sortedDataSet[: midIndex] # 将中位数的左边创建2改副本 25 | rightDataSet = sortedDataSet[midIndex + 1:] 26 | print(leftDataSet) 27 | print(rightDataSet) 28 | print(123) 29 | node.lchild = self.create(leftDataSet, depth + 1) # 将中位数左边样本传入来递归创建树 30 | node.rchild = self.create(rightDataSet, depth + 1) 31 | return node 32 | else: 33 | return None 34 | 35 | def sort(self, dataSet, axis): # 采用冒泡排序,利用aixs作为轴进行划分 36 | sortDataSet = dataSet[:] # 由于不能破坏原样本,此处建立一个副本 37 | m, n = np.shape(sortDataSet) 38 | for i in range(m): 39 | for j in range(0, m - i - 1): 40 | if (sortDataSet[j][axis] > sortDataSet[j + 1][axis]): 41 | temp = sortDataSet[j] 42 | sortDataSet[j] = sortDataSet[j + 1] 43 | sortDataSet[j + 1] = temp 44 | print(sortDataSet) 45 | return sortDataSet 46 | 47 | def preOrder(self, node): # 前序遍历 48 | if node != None: 49 | print("tttt->%s" % node.data) 50 | self.preOrder(node.lchild) 51 | self.preOrder(node.rchild) 52 | 53 | def search(self, tree, x): # 搜索 54 | self.nearestPoint = None # 保存最近的点 55 | self.nearestValue = 0 # 保存最近的值 56 | 57 | def travel(node, depth=0): # 递归搜索 58 | if node != None: # 递归终止条件 59 | n = len(x) # 特征数 60 | axis = depth % n # 计算轴 61 | if x[axis] < node.data[axis]: # 如果数据小于结点,则往左结点找 62 | travel(node.lchild, depth + 1) 63 | else: 64 | travel(node.rchild, depth + 1) 65 | 66 | # 以下是递归完毕后,往父结点方向回朔,对应算法3.3(3) 67 | print(3) 68 | print(node.data) 69 | distNodeAndX = self.dist(x, node.data) # 目标和节点的距离判断 70 | if (self.nearestPoint == None): # 确定当前点,更新最近的点和最近的值,对应算法3.3(3)(a) 71 | self.nearestPoint = node.data 72 | self.nearestValue = distNodeAndX 73 | elif (self.nearestValue > distNodeAndX): 74 | print("t") 75 | self.nearestPoint = node.data 76 | self.nearestValue = distNodeAndX 77 | 78 | print(axis) 79 | print(node.data, depth, self.nearestValue, node.data[axis], x[axis]) 80 | if (abs(x[axis] - node.data[axis]) <= self.nearestValue): # 确定是否需要去子节点的区域去找(圆的判断),对应算法3.3(3)(b) 81 | if x[axis] < node.data[axis]: 82 | print(1) 83 | travel(node.rchild, depth + 1) 84 | else: 85 | print(2) 86 | travel(node.lchild, depth + 1) 87 | 88 | travel(tree) 89 | return 
self.nearestPoint 90 | 91 | def dist(self, x1, x2): # 欧式距离的计算 92 | return ((np.array(x1) - np.array(x2)) ** 2).sum() ** 0.5 93 | 94 | 95 | if __name__ == '__main__': 96 | dataSet = [[2, 3], 97 | [5, 4], 98 | [9, 6], 99 | [4, 7], 100 | [8, 1], 101 | [7, 2]] 102 | x = [5, 3] 103 | kdtree = KdTree() 104 | tree = kdtree.create(dataSet, 0) 105 | kdtree.preOrder(tree) 106 | print(kdtree.search(tree, x)) 107 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter4/naive_Bayes.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import logging 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | 10 | 11 | def log(func): 12 | def wrapper(*args, **kwargs): 13 | start_time = time.time() 14 | logging.debug('start %s()' % func.__name__) 15 | ret = func(*args, **kwargs) 16 | 17 | end_time = time.time() 18 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 19 | 20 | return ret 21 | 22 | return wrapper 23 | 24 | 25 | # 二值化,将图片进行二值化的目的是确定每个特征可选的值只有两种,对应于train方法里conditional_probability最后一个维度的长度2 26 | def binaryzation(img): 27 | cv_img = img.astype(np.uint8) 28 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img) 29 | return cv_img 30 | 31 | 32 | @log 33 | def train(train_set, train_labels): 34 | class_num = len(set(train_labels)) 35 | feature_num = len(train_set[0]) 36 | prior_probability = np.zeros(class_num) # 先验概率 37 | conditional_probability = np.zeros((class_num, feature_num, 2)) # 条件概率 38 | print(conditional_probability.shape) 39 | 40 | for i in range(len(train_labels)): 41 | img = binaryzation(train_set[i]) # 图片二值化 42 | label = train_labels[i] 43 | 44 | prior_probability[label] += 1 45 | 46 | for j in range(feature_num): 47 | conditional_probability[label][j][img[j]] += 1 48 | 49 | # 贝叶斯估计,因为分母都相同,所以先验概率和条件概率都不用除以分母 50 | prior_probability += 1 51 | for label in set(train_labels): 52 | for j in range(feature_num): 53 | conditional_probability[label][j][0] += 1 54 | conditional_probability[label][j][0] /= (len(train_labels[train_labels == label]) + 2 * 1) 55 | conditional_probability[label][j][1] += 1 56 | conditional_probability[label][j][1] /= (len(train_labels[train_labels == label]) + 2 * 1) 57 | 58 | # print(prior_probability) 59 | # print(conditional_probability) 60 | return prior_probability, conditional_probability 61 | 62 | 63 | @log 64 | def predict(test_features, prior_probability, conditional_probability): 65 | result = [] 66 | for test in test_features: 67 | img = binaryzation(test) 68 | 69 | max_label = 0 70 | max_probability = 0 71 | 72 | for i in range(len(prior_probability)): 73 | 74 | # print("label",i) 75 | probability = prior_probability[i] 76 | for j in range(len(img)): # 特征长度 77 | # print("j",j) 78 | probability *= int(conditional_probability[i][j][img[j]]) 79 | if max_probability < probability: 80 | max_probability = probability 81 | max_label = i 82 | result.append(max_label) 83 | return np.array(result) 84 | 85 | 86 | if __name__ == '__main__': 87 | logger = logging.getLogger() 88 | logger.setLevel(logging.DEBUG) 89 | 90 | raw_data = pd.read_csv('../data/train.csv', header=0) 91 | data = raw_data.values 92 | 93 | imgs = data[0:2000, 1:] 94 | labels = data[0:2000, 0] 95 | 96 | # print(imgs.shape) 97 | 98 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 99 | train_features, test_features, train_labels, test_labels = 
train_test_split(imgs, labels, test_size=0.33, 100 | random_state=1) 101 | 102 | prior_probability, conditional_probability = train(train_features, train_labels) 103 | test_predict = predict(test_features, prior_probability, conditional_probability) 104 | score = accuracy_score(test_labels, test_predict) 105 | print("The accuracy score is ", score) 106 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter4/naive_Bayes1.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import cv2 5 | import time 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | 10 | 11 | # 二值化 12 | def binaryzation(img): 13 | cv_img = img.astype(np.uint8) 14 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img) 15 | return cv_img 16 | 17 | 18 | def Train(trainset, train_labels): 19 | prior_probability = np.zeros(class_num) # 先验概率 20 | conditional_probability = np.zeros((class_num, feature_len, 2)) # 条件概率 21 | 22 | # 计算先验概率及条件概率 23 | for i in range(len(train_labels)): 24 | img = binaryzation(trainset[i]) # 图片二值化 25 | label = train_labels[i] 26 | 27 | prior_probability[label] += 1 28 | 29 | for j in range(feature_len): 30 | conditional_probability[label][j][img[j]] += 1 31 | 32 | # 将概率归到[1.10001] 33 | for i in range(class_num): 34 | for j in range(feature_len): 35 | # 经过二值化后图像只有0,1两种取值 36 | pix_0 = conditional_probability[i][j][0] 37 | pix_1 = conditional_probability[i][j][1] 38 | 39 | # 计算0,1像素点对应的条件概率 40 | probalility_0 = (float(pix_0) / float(pix_0 + pix_1)) * 1000000 + 1 41 | probalility_1 = (float(pix_1) / float(pix_0 + pix_1)) * 1000000 + 1 42 | 43 | conditional_probability[i][j][0] = probalility_0 44 | conditional_probability[i][j][1] = probalility_1 45 | 46 | print(conditional_probability) 47 | 48 | return prior_probability, conditional_probability 49 | 50 | 51 | # 计算概率 52 | def calculate_probability(img, label): 53 | probability = int(prior_probability[label]) 54 | 55 | for i in range(len(img)): 56 | probability *= int(conditional_probability[label][i][img[i]]) 57 | 58 | return probability 59 | 60 | 61 | def Predict(testset, prior_probability, conditional_probability): 62 | predict = [] 63 | 64 | for img in testset: 65 | 66 | # 图像二值化 67 | img = binaryzation(img) 68 | 69 | max_label = 0 70 | max_probability = calculate_probability(img, 0) 71 | 72 | for j in range(1, 10): 73 | probability = calculate_probability(img, j) 74 | 75 | if max_probability < probability: 76 | max_label = j 77 | max_probability = probability 78 | 79 | predict.append(max_label) 80 | 81 | return np.array(predict) 82 | 83 | 84 | class_num = 10 85 | feature_len = 784 86 | 87 | if __name__ == '__main__': 88 | print('Start read data') 89 | 90 | time_1 = time.time() 91 | 92 | raw_data = pd.read_csv('../data/train.csv', header=0) 93 | data = raw_data.values 94 | 95 | imgs = data[0::, 1::] 96 | labels = data[::, 0] 97 | 98 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 99 | train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, 100 | random_state=23323) 101 | # print train_features.shape 102 | # print train_features.shape 103 | 104 | time_2 = time.time() 105 | print('read data cost ', time_2 - time_1, ' second', '\n') 106 | 107 | print('Start training') 108 | prior_probability, conditional_probability = Train(train_features, train_labels) 109 | time_3 = time.time() 110 | print('training cost ', 
time_3 - time_2, ' second', '\n') 111 | 112 | print('Start predicting') 113 | test_predict = Predict(test_features, prior_probability, conditional_probability) 114 | time_4 = time.time() 115 | print('predicting cost ', time_4 - time_3, ' second', '\n') 116 | 117 | score = accuracy_score(test_labels, test_predict) 118 | print("The accuracy score is ", score) 119 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter5/C4.5.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | import time 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | 12 | # 与ID3的实现类似,只有在特征选择时使用的标准不同 13 | # 问题也类似 14 | 15 | 16 | total_class = 10 17 | 18 | 19 | def log(func): 20 | def wrapper(*args, **kwargs): 21 | start_time = time.time() 22 | logging.debug('start %s()' % func.__name__) 23 | ret = func(*args, **kwargs) 24 | 25 | end_time = time.time() 26 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 27 | 28 | return ret 29 | 30 | return wrapper 31 | 32 | 33 | class Tree(object): 34 | def __init__(self, node_type, Class=None, feature=None): 35 | self.node_type = node_type 36 | self.dict = {} 37 | self.Class = Class 38 | self.feature = feature 39 | 40 | def add_tree(self, val, tree): 41 | self.dict[val] = tree 42 | 43 | def predict(self, features): 44 | if self.node_type == 'leaf': 45 | return self.Class 46 | if (features[self.feature] in self.dict.keys()): 47 | tree = self.dict[features[self.feature]] 48 | else: 49 | if (self.Class is None): 50 | return random.randint(0, 1) 51 | else: 52 | return self.Class 53 | return tree.predict(features) 54 | 55 | 56 | def calc_ent(x): 57 | """ 58 | calculate empirical entropy of x 59 | """ 60 | 61 | x_value_list = set(x) 62 | ent = 0.0 63 | for x_value in x_value_list: 64 | p = float(x[x == x_value].shape[0]) / x.shape[0] 65 | logp = np.log2(p) 66 | ent -= p * logp 67 | 68 | return ent 69 | 70 | 71 | def calc_condition_ent(train_feature, train_label): 72 | """ 73 | calculate empirical entropy H(y|x) 74 | """ 75 | 76 | # calc ent(y|x) 77 | 78 | ent = 0 79 | train_feature_set = set(train_feature) 80 | # print("train_feature_set", train_feature_set) 81 | for train_feature_value in train_feature_set: 82 | Di = train_feature[train_feature == train_feature_value] 83 | label_i = train_label[train_feature == train_feature_value] 84 | # print("Di", Di) 85 | train_label_set = set(train_label) 86 | temp = 0 87 | # print("train_label_set", train_label_set) 88 | for train_label_value in train_label_set: 89 | Dik = Di[label_i == train_label_value] 90 | # print(Dik) 91 | if (len(Dik) != 0): 92 | p = float(len(Dik) / len(Di)) 93 | logp = np.log2(p) 94 | temp -= p * logp 95 | ent += (len(Di) / len(train_feature)) * temp 96 | return ent 97 | 98 | 99 | def recurse_train(train_set, train_label, features, epsilon): 100 | 101 | LEAF = 'leaf' 102 | INTERNAL = 'internal' 103 | 104 | # 步骤1——如果train_set中的所有实例都属于同一类Ck 105 | label_set = set(train_label) 106 | # print(label_set) 107 | if len(label_set) == 1: 108 | return Tree(LEAF, Class=label_set.pop()) 109 | 110 | # 步骤2——如果features为空 111 | 112 | class_count0 = 0 113 | class_count1 = 0 114 | 115 | for i in range(len(train_label)): 116 | if (train_label[i] == 1): 117 | class_count1 += 1 118 | else: 119 | class_count0 += 1 120 | 121 | if (class_count0 >= 
class_count1):
122 |         max_class = 0
123 |     else:
124 |         max_class = 1  # fixed: the else branch must pick class 1 (the original assigned 0 in both branches)
125 | 
126 |     if features is None:
127 |         return Tree(LEAF, Class=max_class)
128 | 
129 |     if len(features) == 0:
130 |         return Tree(LEAF, Class=max_class)
131 | 
132 |     # Step 3 -- compute the information gain ratio of every remaining feature
133 |     max_feature = 0
134 |     max_grda = 0
135 | 
136 |     D = train_label
137 |     HD = calc_ent(D)
138 |     for feature in features:
139 |         A = np.array(train_set[:, feature].flat)
140 |         gda = HD - calc_condition_ent(A, D)
141 |         had = calc_ent(A)
142 |         grda = gda / had if had > 0 else 0.0  # guard: a constant feature has H(A) = 0
143 | 
144 |         if grda > max_grda:
145 |             max_grda, max_feature = grda, feature
146 | 
147 |     # Step 4 -- maximum gain ratio below the threshold: return a leaf with the majority class
148 |     if max_grda < epsilon:
149 |         return Tree(LEAF, Class=max_class)
150 | 
151 |     # Step 5 -- split on the best feature and recurse on the non-empty subsets
152 |     sub_features = [f for f in features if f != max_feature]  # fixed: list.remove() returns None, so build the reduced feature list explicitly
153 |     tree = Tree(INTERNAL, feature=max_feature)
154 | 
155 |     feature_col = np.array(train_set[:, max_feature].flat)
156 |     feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
157 |     for feature_value in feature_value_list:
158 | 
159 |         index = []
160 |         for i in range(len(train_label)):
161 |             if train_set[i][max_feature] == feature_value:
162 |                 index.append(i)
163 | 
164 |         sub_train_set = train_set[index]
165 |         sub_train_label = train_label[index]
166 | 
167 |         sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon)
168 |         tree.add_tree(feature_value, sub_tree)
169 | 
170 |     return tree
171 | 
172 | 
173 | @log
174 | def train(train_set, train_label, features, epsilon):
175 |     # print(features)
176 |     return recurse_train(train_set, train_label, features, epsilon)
177 | 
178 | 
179 | @log
180 | def predict(test_set, tree):
181 |     result = []
182 |     for features in test_set:
183 |         tmp_predict = tree.predict(features)
184 |         result.append(tmp_predict)
185 |     return np.array(result)
186 | 
187 | 
188 | if __name__ == '__main__':
189 |     logger = logging.getLogger()
190 |     logger.setLevel(logging.DEBUG)
191 | 
192 |     raw_data = pd.read_csv('../data/train_binary2.csv', header=0)
193 |     data = raw_data.values
194 | 
195 |     images = data[0:, 1:]
196 |     labels = data[:, 0]
197 | 
198 |     # use 2/3 of the data for training and 1/3 for testing
199 |     train_features, test_features, train_labels, test_labels = train_test_split(images, labels, test_size=0.33,
200 |                                                                                 random_state=1)
201 | 
202 |     print(train_features.shape)
203 |     tree = train(train_features, train_labels, [i for i in range(99)], 0.1)
204 |     test_predict = predict(test_features, tree)
205 |     print(test_predict)
206 |     score = accuracy_score(test_labels, test_predict)
207 | 
208 |     print("The accuracy score is ", score)
209 | 
-------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter5/CART.py: --------------------------------------------------------------------------------
1 | import cv2
2 | import time
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import accuracy_score
9 | 
10 | total_class = 10
11 | 
12 | 
13 | # a relatively small dataset is used here, because a larger one makes the recursion overflow the stack
14 | 
15 | 
16 | def log(func):
17 |     def wrapper(*args, **kwargs):
18 |         start_time = time.time()
19 |         logging.debug('start %s()' % func.__name__)
20 |         ret = func(*args, **kwargs)
21 | 
22 |         end_time = time.time()
23 |         logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
24 | 
25 |         return ret
26 | 
27 |     return wrapper
28 | 
29 | 
30 | # binarization
31 | def binaryzation(img):
32 |     cv_img = img.astype(np.uint8)
33 |     cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
34 |     return cv_img
35 | 
36 | 
37 | 
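# For reference: the splitting criterion used throughout this file is the Gini index,
#     Gini(D) = sum_k p_k * (1 - p_k) = 1 - sum_k p_k^2,
# where p_k is the fraction of samples in D carrying class k. A minimal standalone
# sketch (illustration only, not called by the code below):
#
#     def gini(labels):
#         values = set(labels)
#         n = float(len(labels))
#         return 1.0 - sum((list(labels).count(v) / n) ** 2 for v in values)
#
# e.g. gini([1, 1, 1, 1]) == 0.0 (a pure node) and gini([1, 1, 0, 0]) == 0.5.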
@log 38 | def binaryzation_features(trainset): 39 | features = [] 40 | 41 | for img in trainset: 42 | img = np.reshape(img, (28, 28)) 43 | cv_img = img.astype(np.uint8) 44 | 45 | img_b = binaryzation(cv_img) 46 | features.append(img_b) 47 | 48 | features = np.array(features) 49 | features = np.reshape(features, (-1, 784)) 50 | 51 | return features 52 | 53 | 54 | class TreeNode(object): 55 | """决策树节点""" 56 | 57 | def __init__(self, **kwargs): 58 | ''' 59 | attr_index: 属性编号 60 | attr: 属性值 61 | label: 类别(y) 62 | left_chuld: 左子结点 63 | right_child: 右子节点 64 | ''' 65 | self.attr_index = kwargs.get('attr_index') 66 | self.attr = kwargs.get('attr') 67 | self.label = kwargs.get('label') 68 | self.left_child = kwargs.get('left_child') 69 | self.right_child = kwargs.get('right_child') 70 | 71 | 72 | # 计算数据集的基尼指数 73 | def gini_train_set(train_label): 74 | train_label_value = set(train_label) 75 | gini = 0.0 76 | for i in train_label_value: 77 | train_label_temp = train_label[train_label == i] 78 | pk = float(len(train_label_temp)) / len(train_label) 79 | gini += pk * (1 - pk) 80 | return gini 81 | 82 | 83 | # 计算一个特征不同切分点的基尼指数,并返回最小的 84 | def gini_feature(train_feature, train_label): 85 | train_feature_value = set(train_feature) 86 | min_gini = float('inf') 87 | return_feature_value = 0 88 | for i in train_feature_value: 89 | train_feature_class1 = train_feature[train_feature == i] 90 | label_class1 = train_label[train_feature == i] 91 | # train_feature_class2 = train_feature[train_feature != i] 92 | label_class2 = train_label[train_feature != i] 93 | D1 = float(len(train_feature_class1)) / len(train_feature) 94 | D2 = 1 - D1 95 | if (len(label_class1) == 0): 96 | p1 = 0 97 | else: 98 | p1 = float(len(label_class1[label_class1 == label_class1[0]])) / len(label_class1) 99 | if (len(label_class2) == 0): 100 | p2 = 0 101 | else: 102 | p2 = float(len(label_class2[label_class2 == label_class2[0]])) / len(label_class2) 103 | gini = D1 * 2 * p1 * (1 - p1) + D2 * 2 * p2 * (1 - p2) 104 | if min_gini > gini: 105 | min_gini = gini 106 | return_feature_value = i 107 | return min_gini, return_feature_value 108 | 109 | 110 | def get_best_index(train_set, train_label, feature_indexes): 111 | ''' 112 | :param train_set: 给定数据集 113 | :param train_label: 数据集对应的标记 114 | :return: 最佳切分点,最佳切分变量 115 | 求给定切分点集合中的最佳切分点和其对应的最佳切分变量 116 | ''' 117 | min_gini = float('inf') 118 | feature_index = 0 119 | return_feature_value = 0 120 | for i in range(len(train_set[0])): 121 | if i in feature_indexes: 122 | train_feature = train_set[:, i] 123 | gini, feature_value = gini_feature(train_feature, train_label) 124 | if gini < min_gini: 125 | min_gini = gini 126 | feature_index = i 127 | return_feature_value = feature_value 128 | return feature_index, return_feature_value 129 | 130 | 131 | # 根据最有特征和最优切分点划分数据集 132 | def divide_train_set(train_set, train_label, feature_index, feature_value): 133 | left = [] 134 | right = [] 135 | left_label = [] 136 | right_label = [] 137 | for i in range(len(train_set)): 138 | line = train_set[i] 139 | if line[feature_index] == feature_value: 140 | left.append(line) 141 | left_label.append(train_label[i]) 142 | else: 143 | right.append(line) 144 | right_label.append(train_label[i]) 145 | return np.array(left), np.array(right), np.array(left_label), np.array(right_label) 146 | 147 | 148 | @log 149 | def build_tree(train_set, train_label, feature_indexes): 150 | # 查看是否满足停止条件 151 | train_label_value = set(train_label) 152 | if len(train_label_value) == 1: 153 | print("a") 154 | return 
TreeNode(label=train_label[0]) 155 | 156 | if feature_indexes is None: 157 | print("b") 158 | return TreeNode(label=train_label[0]) 159 | 160 | if len(feature_indexes) == 0: 161 | print("c") 162 | return TreeNode(label=train_label[0]) 163 | 164 | feature_index, feature_value = get_best_index(train_set, train_label, feature_indexes) 165 | # print("feature_index",feature_index) 166 | 167 | left, right, left_label, right_label = divide_train_set(train_set, train_label, feature_index, feature_value) 168 | 169 | feature_indexes.remove(feature_index) 170 | # print("feature_indexes",feature_indexes) 171 | 172 | left_branch = build_tree(left, left_label, feature_indexes) 173 | right_branch = build_tree(right, right_label, feature_indexes) 174 | return TreeNode(left_child=left_branch, 175 | right_child=right_branch, 176 | attr_index=feature_index, 177 | attr=feature_value) 178 | 179 | # @log 180 | # def prune(tree): 181 | 182 | 183 | def predict_one(node, test): 184 | while node.label is None: 185 | if test[node.attr_index] == node.attr: 186 | node = node.left_child 187 | else: 188 | node = node.right_child 189 | return node.label 190 | 191 | 192 | @log 193 | def predict(tree, test_set): 194 | result = [] 195 | for test in test_set: 196 | label = predict_one(tree, test) 197 | result.append(label) 198 | return result 199 | 200 | 201 | if __name__ == '__main__': 202 | logger = logging.getLogger() 203 | logger.setLevel(logging.DEBUG) 204 | 205 | raw_data = pd.read_csv('../data/train_binary1.csv', header=0) 206 | data = raw_data.values 207 | 208 | imgs = data[0:, 1:] 209 | labels = data[:, 0] 210 | 211 | print(imgs.shape) 212 | 213 | # 图片二值化 214 | # features = binaryzation_features(imgs) 215 | 216 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 217 | train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, 218 | random_state=23323) 219 | 220 | print(type(train_features)) 221 | tree = build_tree(train_features, train_labels, [i for i in range(784)]) 222 | test_predict = predict(tree, test_features) 223 | score = accuracy_score(test_labels, test_predict) 224 | 225 | print("The accuracy score is ", score) 226 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter5/ID3-1.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | import cv2 4 | import time 5 | import logging 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | 12 | total_class = 10 13 | 14 | 15 | def log(func): 16 | def wrapper(*args, **kwargs): 17 | start_time = time.time() 18 | logging.debug('start %s()' % func.__name__) 19 | ret = func(*args, **kwargs) 20 | 21 | end_time = time.time() 22 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 23 | 24 | return ret 25 | 26 | return wrapper 27 | 28 | 29 | # 二值化 30 | def binaryzation(img): 31 | cv_img = img.astype(np.uint8) 32 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img) 33 | return cv_img 34 | 35 | 36 | @log 37 | def binaryzation_features(trainset): 38 | features = [] 39 | 40 | for img in trainset: 41 | img = np.reshape(img, (28, 28)) 42 | cv_img = img.astype(np.uint8) 43 | 44 | img_b = binaryzation(cv_img) 45 | # hog_feature = np.transpose(hog_feature) 46 | features.append(img_b) 47 | 48 | features = np.array(features) 49 | features = np.reshape(features, (-1, 784)) 50 | 51 | return 
features 52 | 53 | 54 | class Tree(object): 55 | def __init__(self, node_type, Class=None, feature=None): 56 | self.node_type = node_type 57 | self.dict = {} 58 | self.Class = Class 59 | self.feature = feature 60 | 61 | def add_tree(self, val, tree): 62 | self.dict[val] = tree 63 | 64 | def predict(self, features): 65 | if self.node_type == 'leaf': 66 | return self.Class 67 | 68 | tree = self.dict[features[self.feature]] 69 | return tree.predict(features) 70 | 71 | 72 | def calc_ent(x): 73 | """ 74 | calculate shanno ent of x 75 | """ 76 | 77 | x_value_list = set([x[i] for i in range(x.shape[0])]) 78 | ent = 0.0 79 | for x_value in x_value_list: 80 | p = float(x[x == x_value].shape[0]) / x.shape[0] 81 | logp = np.log2(p) 82 | ent -= p * logp 83 | 84 | return ent 85 | 86 | 87 | def calc_condition_ent(x, y): 88 | """ 89 | calculate ent H(y|x) 90 | """ 91 | 92 | # calc ent(y|x) 93 | x_value_list = set([x[i] for i in range(x.shape[0])]) 94 | ent = 0.0 95 | for x_value in x_value_list: 96 | sub_y = y[x == x_value] 97 | temp_ent = calc_ent(sub_y) 98 | ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent 99 | 100 | return ent 101 | 102 | 103 | def calc_ent_grap(x, y): 104 | """ 105 | calculate ent grap 106 | """ 107 | 108 | base_ent = calc_ent(y) 109 | condition_ent = calc_condition_ent(x, y) 110 | ent_grap = base_ent - condition_ent 111 | 112 | return ent_grap 113 | 114 | 115 | def recurse_train(train_set, train_label, features, epsilon): 116 | global total_class 117 | 118 | LEAF = 'leaf' 119 | INTERNAL = 'internal' 120 | 121 | # 步骤1——如果train_set中的所有实例都属于同一类Ck 122 | label_set = set(train_label) 123 | if len(label_set) == 1: 124 | return Tree(LEAF, Class=label_set.pop()) 125 | 126 | # 步骤2——如果features为空 127 | class_count0 = 0 128 | class_count1 = 0 129 | 130 | for i in range(len(train_label)): 131 | if (train_label[i] == 1): 132 | class_count1 += 1 133 | else: 134 | class_count0 += 1 135 | 136 | if (class_count0 >= class_count1): 137 | max_class = 0 138 | else: 139 | max_class = 0 140 | 141 | if features is None: 142 | return Tree(LEAF, Class=max_class) 143 | 144 | if len(features) == 0: 145 | return Tree(LEAF, Class=max_class) 146 | 147 | # 步骤3——计算信息增益 148 | max_feature = 0 149 | max_gda = 0 150 | 151 | D = train_label 152 | HD = calc_ent(D) 153 | for feature in features: 154 | A = np.array(train_set[:, feature].flat) 155 | gda = HD - calc_condition_ent(A, D) 156 | 157 | if gda > max_gda: 158 | max_gda, max_feature = gda, feature 159 | 160 | # 步骤4——小于阈值 161 | if max_gda < epsilon: 162 | return Tree(LEAF, Class=max_class) 163 | 164 | # 步骤5——构建非空子集 165 | sub_features = features.remove(max_feature) 166 | tree = Tree(INTERNAL, feature=max_feature) 167 | 168 | feature_col = np.array(train_set[:, max_feature].flat) 169 | feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])]) 170 | for feature_value in feature_value_list: 171 | 172 | index = [] 173 | for i in range(len(train_label)): 174 | if train_set[i][max_feature] == feature_value: 175 | index.append(i) 176 | 177 | sub_train_set = train_set[index] 178 | sub_train_label = train_label[index] 179 | 180 | sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon) 181 | tree.add_tree(feature_value, sub_tree) 182 | 183 | return tree 184 | 185 | 186 | @log 187 | def train(train_set, train_label, features, epsilon): 188 | return recurse_train(train_set, train_label, features, epsilon) 189 | 190 | 191 | @log 192 | def predict(test_set, tree): 193 | result = [] 194 | for features in test_set: 195 | tmp_predict = 
tree.predict(features) 196 | result.append(tmp_predict) 197 | return np.array(result) 198 | 199 | 200 | if __name__ == '__main__': 201 | logger = logging.getLogger() 202 | logger.setLevel(logging.DEBUG) 203 | 204 | raw_data = pd.read_csv('../data/train.csv', header=0) 205 | data = raw_data.values 206 | 207 | imgs = data[0::, 1::] 208 | labels = data[::, 0] 209 | 210 | # 图片二值化 211 | features = binaryzation_features(imgs) 212 | 213 | # print(features) 214 | 215 | # aa=features.tolist() 216 | # print(aa) 217 | 218 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 219 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, 220 | random_state=23323) 221 | 222 | tree = train(train_features, train_labels, [i for i in range(784)], 0.1) 223 | test_predict = predict(test_features, tree) 224 | score = accuracy_score(test_labels, test_predict) 225 | 226 | print("The accuracy score is ", score) 227 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/chapter5/ID3.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import cv2 3 | import time 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | 12 | # 参考了别人的实现 13 | # 问题1:可能是因为没有进行二值化?算出来的正确率很低,只有10%?还不如瞎猜! 14 | # 直接使用别人的代码(将二值化注释)得到的正确率也很低,但看博文达到了89%,不知其解 15 | # 问题2:我觉得参考的代码在实现经验条件熵的时候是有问题的,改成了自己实现的 16 | # 问题3:在tree.predict中可能出现keyError的问题,不得已进行了键是否存在的检测,并随机返回值。我没有找到原因 17 | # 问题4:但就我所看,问题3的情况不多,对最后结果产生的影响应该不大,但正确率还是很低 18 | # 问题5:使用原来的train_binary数据集计算很慢,用了小一点的数据集 19 | # 虽然有这么多问题,但关键步骤的代码,如经验熵和经验条件熵的计算,ID3算法的各个步骤应该都是正确的 20 | 21 | # 好吧发现不是二值化的问题!博主原来的代码我运行了也是很低的正确率,不得其解 22 | 23 | total_class = 10 24 | 25 | 26 | def log(func): 27 | def wrapper(*args, **kwargs): 28 | start_time = time.time() 29 | logging.debug('start %s()' % func.__name__) 30 | ret = func(*args, **kwargs) 31 | 32 | end_time = time.time() 33 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time)) 34 | 35 | return ret 36 | 37 | return wrapper 38 | 39 | 40 | # 二值化 41 | def binaryzation(img): 42 | cv_img = img.astype(np.uint8) 43 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY, cv_img) 44 | return cv_img 45 | 46 | 47 | @log 48 | def binaryzation_features(trainset): 49 | features = [] 50 | 51 | for img in trainset: 52 | img = np.reshape(img, (10, 10)) 53 | cv_img = img.astype(np.uint8) 54 | 55 | img_b = binaryzation(cv_img) 56 | # hog_feature = np.transpose(hog_feature) 57 | features.append(img_b) 58 | 59 | features = np.array(features) 60 | features = np.reshape(features, (-1, 100)) 61 | 62 | return features 63 | 64 | 65 | class Tree(object): 66 | def __init__(self, node_type, Class=None, feature=None): 67 | self.node_type = node_type 68 | self.dict = {} 69 | self.Class = Class 70 | self.feature = feature 71 | 72 | def add_tree(self, val, tree): 73 | self.dict[val] = tree 74 | 75 | def predict(self, features): 76 | if self.node_type == 'leaf': 77 | return self.Class 78 | if (features[self.feature] in self.dict.keys()): 79 | tree = self.dict[features[self.feature]] 80 | else: 81 | if (self.Class is None): 82 | return random.randint(0, 1) 83 | else: 84 | return self.Class 85 | return tree.predict(features) 86 | 87 | 88 | def calc_ent(x): 89 | """ 90 | calculate empirical entropy of x 91 | """ 92 | 93 | x_value_list = set([x[i] for i in range(x.shape[0])]) 94 | ent = 0.0 95 | 
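# empirical entropy: H(x) = -sum_k p_k * log2(p_k), where p_k is the frequency of
# value k in x; each pass of the loop below accumulates one -p * log2(p) term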
for x_value in x_value_list:
96 |         p = float(x[x == x_value].shape[0]) / x.shape[0]
97 |         logp = np.log2(p)
98 |         ent -= p * logp
99 | 
100 |     return ent
101 | 
102 | 
103 | def calc_condition_ent(train_feature, train_label):
104 |     """
105 |     calculate empirical conditional entropy H(y|x)
106 |     """
107 | 
108 |     # calc ent(y|x)
109 | 
110 |     ent = 0
111 |     train_feature_set = set(train_feature)
112 |     # print("train_feature_set", train_feature_set)
113 |     for train_feature_value in train_feature_set:
114 |         Di = train_feature[train_feature == train_feature_value]
115 |         label_i = train_label[train_feature == train_feature_value]
116 |         # print("Di", Di)
117 |         train_label_set = set(train_label)
118 |         temp = 0
119 |         # print("train_label_set", train_label_set)
120 |         for train_label_value in train_label_set:
121 |             Dik = Di[label_i == train_label_value]
122 |             # print(Dik)
123 |             if (len(Dik) != 0):
124 |                 p = float(len(Dik)) / len(Di)
125 |                 logp = np.log2(p)
126 |                 temp -= p * logp
127 |         ent += float(len(Di)) / len(train_feature) * temp
128 |     return ent
129 | 
130 | 
131 | def recurse_train(train_set, train_label, features, epsilon):
132 |     global total_class
133 | 
134 |     LEAF = 'leaf'
135 |     INTERNAL = 'internal'
136 | 
137 |     # Step 1 -- if every instance in train_set belongs to the same class Ck, return a leaf of that class
138 |     label_set = set(train_label)
139 |     # print(label_set)
140 |     if len(label_set) == 1:
141 |         return Tree(LEAF, Class=label_set.pop())
142 | 
143 |     # Step 2 -- if features is empty, return a leaf labelled with the majority class
144 | 
145 |     class_count0 = 0
146 |     class_count1 = 0
147 | 
148 |     for i in range(len(train_label)):
149 |         if (train_label[i] == 1):
150 |             class_count1 += 1
151 |         else:
152 |             class_count0 += 1
153 | 
154 |     if (class_count0 >= class_count1):
155 |         max_class = 0
156 |     else:
157 |         max_class = 1  # fixed: the else branch must pick class 1 (the original assigned 0 in both branches, which caps accuracy near chance)
158 | 
159 |     if features is None:
160 |         return Tree(LEAF, Class=max_class)
161 | 
162 |     if len(features) == 0:
163 |         return Tree(LEAF, Class=max_class)
164 | 
165 |     # Step 3 -- compute the information gain of every remaining feature
166 |     max_feature = 0
167 |     max_gda = 0
168 | 
169 |     D = train_label
170 |     HD = calc_ent(D)
171 |     for feature in features:
172 |         A = np.array(train_set[:, feature].flat)
173 |         gda = HD - calc_condition_ent(A, D)
174 | 
175 |         if gda > max_gda:
176 |             max_gda, max_feature = gda, feature
177 | 
178 |     # Step 4 -- maximum gain below the threshold: return a leaf with the majority class
179 |     if max_gda < epsilon:
180 |         return Tree(LEAF, Class=max_class)
181 | 
182 |     # Step 5 -- split on the best feature and recurse on the non-empty subsets
183 |     sub_features = [f for f in features if f != max_feature]  # fixed: list.remove() returns None, so build the reduced feature list explicitly
184 |     tree = Tree(INTERNAL, feature=max_feature)
185 | 
186 |     feature_col = np.array(train_set[:, max_feature].flat)
187 |     feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
188 |     for feature_value in feature_value_list:
189 | 
190 |         index = []
191 |         for i in range(len(train_label)):
192 |             if train_set[i][max_feature] == feature_value:
193 |                 index.append(i)
194 | 
195 |         sub_train_set = train_set[index]
196 |         sub_train_label = train_label[index]
197 | 
198 |         sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon)
199 |         tree.add_tree(feature_value, sub_tree)
200 | 
201 |     return tree
202 | 
203 | 
204 | @log
205 | def train(train_set, train_label, features, epsilon):
206 |     # print(features)
207 |     return recurse_train(train_set, train_label, features, epsilon)
208 | 
209 | 
210 | @log
211 | def predict(test_set, tree):
212 |     result = []
213 |     for features in test_set:
214 |         tmp_predict = tree.predict(features)
215 |         result.append(tmp_predict)
216 |     return np.array(result)
217 | 
218 | 
219 | if __name__ == '__main__':
220 |     logger = logging.getLogger()
221 |     logger.setLevel(logging.DEBUG)
222 | 
223 |     raw_data = pd.read_csv('../data/train_binary2.csv', header=0)
224 |     data = 
raw_data.values 225 | 226 | images = data[0:, 1:] 227 | labels = data[:, 0] 228 | 229 | # 图片二值化 230 | features = binaryzation_features(images) 231 | 232 | # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 233 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, 234 | random_state=1) 235 | 236 | # print(train_features.shape) 237 | tree = train(train_features, train_labels, [i for i in range(99)], 0.1) 238 | test_predict = predict(test_features, tree) 239 | # print(test_predict) 240 | score = accuracy_score(test_labels, test_predict) 241 | 242 | print("The accuracy score is ", score) 243 | -------------------------------------------------------------------------------- /StatisticalLearningMethod/errata.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/StatisticalLearningMethod/errata.pdf -------------------------------------------------------------------------------- /StatisticalLearningMethod/hog.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 28 28 6 | 7 | 14 14 8 | 9 | 7 7 10 | 11 | 7 7 12 | 9 13 | 1 14 | 4. 15 | 0 16 | 2.0000000000000001e-001 17 | 1 18 | 64 19 | 20 | -------------------------------------------------------------------------------- /tensorflow/course/data/fire_theft.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/tensorflow/course/data/fire_theft.xls -------------------------------------------------------------------------------- /tensorflow/course/feed.py: -------------------------------------------------------------------------------- 1 | # 上述示例在计算图中引入了 tensor, 以常量或变量的形式存储. TensorFlow 还提供了 feed 机制, 2 | # 该机制 可以临时替代图中的任意操作中的 tensor 可以对图中任何操作提交补丁, 直接插入一个 tensor. 3 | # feed 使用一个 tensor 值临时替换一个操作的输出结果. 你可以提供 feed 数据作为 run() 调用的参数. 4 | # feed 只在调用它的方法内有效, 方法结束, feed 就会消失. 5 | # 最常见的用例是将某些特殊的操作指定为 "feed" 操作, 标记的方法是使用 tf.placeholder() 为这些操作创建占位符. 6 | import tensorflow as tf 7 | 8 | input1 = tf.placeholder(tf.float32) 9 | input2 = tf.placeholder(tf.float32) 10 | output = tf.multiply(input1, input2) 11 | 12 | with tf.Session() as sess: 13 | print(sess.run([output], feed_dict={input1: [7.], input2: [2.]})) 14 | 15 | # 输出: 16 | # [array([ 14.], dtype=float32)] 17 | -------------------------------------------------------------------------------- /tensorflow/course/fetch.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | input1 = tf.constant(3.0) 4 | input2 = tf.constant(2.0) 5 | input3 = tf.constant(5.0) 6 | intermed = tf.add(input2, input3) 7 | mul = tf.multiply(input1, intermed) 8 | 9 | with tf.Session() as sess: 10 | result = sess.run([mul, intermed]) 11 | print(result) 12 | 13 | # 输出: 14 | # [array([ 21.], dtype=float32), array([ 7.], dtype=float32)] 15 | -------------------------------------------------------------------------------- /tensorflow/course/graph.py: -------------------------------------------------------------------------------- 1 | # 构建图的第一步, 是创建源 op (source op). 源 op 不需要任何输入, 例如 常量 (Constant). 源 op 的输出被传递给其它 op 做运算. 2 | # 3 | # Python 库中, op 构造器的返回值代表被构造出的 op 的输出, 这些返回值可以传递给其它 op 构造器作为输入. 4 | 5 | import tensorflow as tf 6 | 7 | # 创建一个常量 op, 产生一个 1x2 矩阵. 这个 op 被作为一个节点 8 | # 加到默认图中. 9 | # 10 | # 构造器的返回值代表该常量 op 的返回值. 
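# shape check: matrix1 is 1x2 and matrix2 is 2x1, so tf.matmul below yields a 1x1 result, [[3*2 + 3*2]] = [[12]]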
11 | matrix1 = tf.constant([[3., 3.]]) 12 | 13 | # Create another constant op that produces a 2x1 matrix. 14 | matrix2 = tf.constant([[2.], [2.]]) 15 | 16 | # Create a matmul op that takes 'matrix1' and 'matrix2' as inputs. 17 | # The return value 'product' represents the result of the matrix multiplication. 18 | product = tf.matmul(matrix1, matrix2) 19 | 20 | # The default graph now has three nodes: two constant() ops and one matmul() op. To actually multiply the matrices and get the result, you must launch the graph in a session. 21 | 22 | # The graph can only be launched once the construction phase is complete. The first step in launching it is to create a Session object; with no construction arguments, the session constructor launches the default graph. 23 | 24 | # Launch the default graph. 25 | sess = tf.Session() 26 | 27 | # Call the session's 'run()' method to execute the matmul op, passing 'product' as the argument. 28 | # As mentioned above, 'product' represents the output of the matmul op, and passing it tells the 29 | # method that we want to fetch that output back. 30 | # 31 | # The whole execution is automatic: the session takes care of passing along all the inputs each op needs. Ops are generally executed concurrently. 32 | # 33 | # The call 'run(product)' triggers the execution of all three ops in the graph (two constant ops and one matmul op). 34 | # 35 | # The return value 'result' is a numpy `ndarray` object. 36 | result = sess.run(product) 37 | print(result) 38 | # ==> [[ 12.]] 39 | 40 | # Done; close the session. 41 | sess.close() 42 | -------------------------------------------------------------------------------- /tensorflow/course/interactiveSession.py: -------------------------------------------------------------------------------- 1 | # For convenience in interactive Python environments such as IPython, 2 | # you can use InteractiveSession in place of the Session class, and the Tensor.eval() and Operation.run() methods in place of Session.run(). 3 | # This avoids having to keep a variable that holds the session. 4 | 5 | # Enter an interactive TensorFlow session. 6 | import tensorflow as tf 7 | 8 | sess = tf.InteractiveSession() 9 | 10 | x = tf.Variable([1.0, 2.0]) 11 | a = tf.constant([3.0, 3.0]) 12 | 13 | # Initialize 'x' by calling the run() method of its initializer op 14 | x.initializer.run() 15 | 16 | # Add a subtraction op that subtracts 'a' from 'x'; run it and print the result 17 | sub = tf.subtract(x, a) 18 | print(sub.eval()) 19 | # ==> [-2. -1.] 20 | -------------------------------------------------------------------------------- /tensorflow/course/linearRegression.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import tensorflow as tf 5 | import xlrd 6 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 7 | 8 | 9 | DATA_FILE = 'data/fire_theft.xls' 10 | 11 | # Phase 1: Assemble the graph 12 | # Step 1: read in data from the .xls file 13 | book = xlrd.open_workbook(DATA_FILE, encoding_override='utf-8') 14 | sheet = book.sheet_by_index(0) 15 | data = np.asarray( 16 | [sheet.row_values(i) for i in range(1, sheet.nrows)], dtype=np.float32) 17 | n_samples = sheet.nrows - 1 18 | 19 | # Step 2: create placeholders for input X (number of fires) and label Y (number of thefts) 20 | # Both have the type float32 21 | X = tf.placeholder(tf.float32, shape=[], name='input') 22 | Y = tf.placeholder(tf.float32, shape=[], name='label') 23 | # Step 3: create weight and bias (w gets a truncated-normal initializer, b zeros) 24 | # name your variables w and b 25 | w = tf.get_variable( 26 | 'weight', shape=[], initializer=tf.truncated_normal_initializer()) 27 | b = tf.get_variable('bias', shape=[], initializer=tf.zeros_initializer()) 28 | # Step 4: predict Y (number of thefts) from the number of fires 29 | # name your variable Y_predicted 30 | Y_predicted = w * X + b 31 | # Step 5: use the square error as the loss function 32 | # name your variable loss 33 | loss = tf.square(Y - Y_predicted, name='loss') 34 | 35 | 36 | def huber_loss(labels, predictions, delta=1.0): 37 | residual = tf.abs(predictions - labels) 38 | condition = tf.less(residual, delta) 39 | small_res = 0.5 * residual ** 2 40 | large_res = delta * residual - 0.5 * delta ** 2 41 | return tf.where(condition, small_res, large_res) 42 | 43 | 44 | h_loss = huber_loss(Y, Y_predicted) 45 | # Step
6: use gradient descent with a learning rate of 1e-3 to minimize the loss 46 | optimizer = tf.train.GradientDescentOptimizer( 47 | learning_rate=1e-3).minimize(loss) 48 | 49 | # Phase 2: Train our model 50 | init = tf.global_variables_initializer() 51 | with tf.Session() as sess: 52 | # Step 7: initialize the necessary variables, in this case, w and b 53 | writer = tf.summary.FileWriter('./linear_log', graph=sess.graph) 54 | sess.run(init) 55 | # Step 8: train the model 56 | for i in range(100): 57 | total_loss = 0 58 | for x, y in data: 59 | # Session runs the optimizer (which minimizes the squared loss) and fetches the Huber loss h_loss for reporting. Name the received value l 60 | _, l = sess.run([optimizer, h_loss], feed_dict={X: x, Y: y}) 61 | total_loss += l 62 | print("Epoch {0}: {1}".format(i, total_loss / n_samples)) 63 | w, b = sess.run([w, b]) 64 | writer.close() 65 | # plot the results 66 | X, Y = data.T[0], data.T[1] 67 | plt.plot(X, Y, 'bo', label='Real data') 68 | plt.plot(X, X * w + b, 'r', label='Predicted data') 69 | plt.legend() 70 | plt.show() 71 | -------------------------------------------------------------------------------- /tensorflow/course/load.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | v = tf.Variable(initial_value=[1, 2]) 6 | init = tf.global_variables_initializer() 7 | 8 | with tf.Session() as sess: 9 | sess.run(init) 10 | # pass the session into the function explicitly 11 | v.load(value=[3, 4], session=sess) 12 | print(v.eval(session=sess)) 13 | -------------------------------------------------------------------------------- /tensorflow/course/logisticRegression.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.examples.tutorials.mnist import input_data 5 | import time 6 | 7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 8 | 9 | # Define parameters for the model 10 | learning_rate = 0.01 11 | batch_size = 128 12 | n_epochs = 10 13 | 14 | # Step 1: Read in data 15 | # using TF Learn's built-in function to load MNIST data to the folder data/mnist 16 | mnist = input_data.read_data_sets('./data/mnist', one_hot=True) 17 | 18 | # Step 2: create placeholders for features and labels 19 | # each image in the MNIST data is of shape 28*28 = 784 20 | # therefore, each image is represented with a 1x784 tensor 21 | # there are 10 classes for each image, corresponding to digits 0 - 9. 22 | # Features are of the type float, and labels are of the type int 23 | x = tf.placeholder(tf.float32, shape=[None, 784], name='image') 24 | y = tf.placeholder(tf.int32, shape=[None, 10], name='label') 25 | 26 | # Step 3: create weights and bias 27 | # w is initialized from a truncated normal, b to zeros 28 | # shape of w depends on the dimension of X and Y so that Y = X * w + b 29 | # shape of b depends on Y 30 | 31 | w = tf.get_variable( 32 | 'weight', shape=[784, 10], initializer=tf.truncated_normal_initializer()) 33 | b = tf.get_variable('bias', shape=[10], initializer=tf.zeros_initializer()) 34 | 35 | # Step 4: build model 36 | # the model that returns the logits.
37 | # this logits will be later passed through softmax layer 38 | # to get the probability distribution of possible label of the image 39 | # DO NOT DO SOFTMAX HERE 40 | logits = tf.matmul(x, w) + b 41 | # Step 5: define loss function 42 | # use cross entropy loss of the real labels with the softmax of logits 43 | # use the method: 44 | entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits) 45 | # then use tf.reduce_mean to get the mean loss of the batch 46 | loss = tf.reduce_mean(entropy, axis=0) 47 | # test model 48 | preds = tf.nn.softmax(logits) 49 | correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(y, 1)) 50 | accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32), axis=0) 51 | 52 | # Step 6: define training op 53 | # using gradient descent to minimize loss 54 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss) 55 | 56 | with tf.Session() as sess: 57 | writer = tf.summary.FileWriter('./logistic_log', sess.graph) 58 | start_time = time.time() 59 | sess.run(tf.global_variables_initializer()) 60 | n_batches = int(mnist.train.num_examples / batch_size) 61 | for i in range(n_epochs): # train the model n_epochs times 62 | total_loss = 0 63 | for _ in range(n_batches): 64 | X_batch, Y_batch = mnist.train.next_batch(batch_size) 65 | _, loss_batch = sess.run( 66 | [optimizer, loss], feed_dict={x: X_batch, 67 | y: Y_batch}) 68 | total_loss += loss_batch 69 | print('Average loss epoch {0}: {1}'.format(i, total_loss / n_batches)) 70 | 71 | print('Total time: {0} seconds'.format(time.time() - start_time)) 72 | 73 | print('Optimization Finished!') # should be around 0.35 after 25 epochs 74 | 75 | # test the model 76 | n_batches = int(mnist.test.num_examples / batch_size) 77 | total_correct_preds = 0 78 | 79 | for i in range(n_batches): 80 | X_batch, Y_batch = mnist.test.next_batch(batch_size) 81 | accuracy_batch = sess.run(accuracy, feed_dict={x: X_batch, y: Y_batch}) 82 | total_correct_preds += accuracy_batch 83 | 84 | print('Accuracy {0}'.format(total_correct_preds / mnist.test.num_examples)) 85 | -------------------------------------------------------------------------------- /tensorflow/course/random.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # Create a tensor of shape [2, 3] consisting of random normal values, with mean -1 and standard deviation 4. 4 | norm = tf.random_normal([2, 3], mean=-1, stddev=4) 5 | 6 | # Shuffle the first dimension of a tensor 7 | c = tf.constant([[1, 2], [3, 4], [5, 6]]) 8 | shuff = tf.random_shuffle(c) 9 | 10 | # Each time we run these ops, different results are generated 11 | sess = tf.Session() 12 | print(sess.run(norm)) 13 | print(sess.run(norm)) 14 | 15 | # Set an op-level seed to generate repeatable sequences across sessions. 
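# (The seed below is op-level: it makes only this op repeatable across sessions. tf.set_random_seed(1234) would instead set a graph-level seed affecting all ops.)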
16 | norm = tf.random_normal([2, 3], seed=1234) 17 | sess = tf.Session() 18 | print(sess.run(norm)) 19 | print(sess.run(norm)) 20 | sess = tf.Session() 21 | print(sess.run(norm)) 22 | print(sess.run(norm)) 23 | -------------------------------------------------------------------------------- /tensorflow/course/shape.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import tensorflow as tf 4 | 5 | import numpy as np 6 | 7 | batch_dim_1 = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]]) 8 | 9 | print("batch_dim:\n", batch_dim_1) 10 | 11 | batch_dim_2 = np.array([[3, 4, 5, 6], [9, 10, 11, 12], [13, 14, 15, 16]]) 12 | 13 | print("batch_dim:\n", batch_dim_2) 14 | 15 | graph = tf.Graph() 16 | 17 | with graph.as_default(): 18 | a = tf.Variable(initial_value=batch_dim_1) 19 | 20 | b = tf.Variable(initial_value=batch_dim_2) 21 | 22 | result = (a, b) 23 | 24 | print("result:", result) 25 | 26 | result = tf.concat(values=[a, b], axis=0) 27 | 28 | print(result) 29 | 30 | result2 = tf.reshape(tensor=result, shape=(2, 3, -1)) 31 | 32 | print("result2:", result2) 33 | 34 | result3 = tf.transpose(a=result2, perm=(1, 0, 2)) 35 | 36 | print("result3:", result3) 37 | 38 | shape = result3.get_shape().as_list() 39 | 40 | print(shape) 41 | 42 | init = tf.global_variables_initializer() 43 | 44 | with tf.Session(graph=graph) as sess: 45 | sess.run(init) 46 | 47 | print("result:\n", sess.run(result)) 48 | 49 | print("result2:\n", sess.run(result2)) 50 | 51 | print("result3:\n", sess.run(result3)) 52 | 53 | # define graph 54 | graph = tf.Graph() 55 | with graph.as_default(): 56 | c1 = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=tf.float32, name="c1") 57 | c2 = tf.constant([1, 2, 3, 4, 5, 6], dtype=tf.float32, name="c2") 58 | c3 = tf.random_normal(shape=(3, 2, 3)) 59 | shape_c1 = tf.shape(c1) 60 | # shape_nc1=tf.shape_n(c1) 61 | shape_c2 = tf.shape(c2) 62 | shape_c3 = tf.shape(c3) 63 | 64 | # run graph 65 | with tf.Session(graph=graph) as sess: 66 | _shape_c1, _shape_c2, _shape_c3, c3 = sess.run([shape_c1, shape_c2, shape_c3, c3]) 67 | print("shape of c1:", _shape_c1) 68 | # print ("shape of n_c1:",_shape_nc1) 69 | print("c3:", c3) 70 | 71 | # size test 72 | size = sess.run(tf.size(c3)) 73 | print("size of c3:", size) 74 | 75 | # rank test 76 | rank = sess.run(tf.rank(c3)) 77 | print("rank of c3:", rank) 78 | -------------------------------------------------------------------------------- /tensorflow/course/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | xs = tf.placeholder(tf.float32, [None, 1], name="xs") 4 | ys = tf.placeholder(tf.float32, [None, 1], name="ys") 5 | Weights1 = tf.Variable(tf.constant([[0.0004]]), dtype=tf.float32, name="Weights1") 6 | Biases1 = tf.Variable(tf.zeros([1, 1]) + 0.1, dtype=tf.float32, name="Biases1") 7 | Wx_plus_b1 = tf.add(tf.matmul(xs, Weights1, name="matmul"), Biases1, name="add") 8 | l1 = tf.nn.sigmoid(Wx_plus_b1) 9 | 10 | Weights2 = tf.Variable(tf.constant([[10000.0]]), dtype=tf.float32, name="Weights2") 11 | Biases2 = tf.Variable(tf.constant(-4999.5), dtype=tf.float32, name="Biases2") 12 | Wx_plus_b2 = tf.add(tf.matmul(l1, Weights2, name="matmul"), Biases2, name="add") 13 | 14 | prediction = Wx_plus_b2 15 | 16 | loss = tf.reduce_mean(tf.square(tf.subtract(ys, prediction, name="Sub"), name="Square"), name="ReduceMean") 17 | 18 | # tf.train.GradientDescentOptimizer,实现梯度下降算法的优化器 19 | train_step = 
tf.train.GradientDescentOptimizer(0.1).minimize(loss, 20 | name="minimize") 21 | init_op = tf.global_variables_initializer() 22 | with tf.Session() as sess: 23 | sess.run(init_op) 24 | writer = tf.summary.FileWriter('./graphs', sess.graph) 25 | for i in range(1000): 26 | a, b, c, d, e, f = sess.run([train_step, loss, Weights1, Biases1, Weights2, Biases2], 27 | feed_dict={xs: [[i], [i]], ys: [[i], [i]]}) 28 | print(b) 29 | print(c) 30 | print(d) 31 | print(e) 32 | print(f) 33 | print("22222") 34 | writer.close() 35 | -------------------------------------------------------------------------------- /tensorflow/course/testt.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | a = tf.constant(2) 3 | b = tf.constant(3) 4 | x = tf.add(a, b) 5 | with tf.Session() as sess: 6 | writer = tf.summary.FileWriter('./graphs', sess.graph) 7 | print(sess.run(x)) 8 | writer.close() # close the writer when you're done using it -------------------------------------------------------------------------------- /tensorflow/course/variable.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # Create a variable initialized to the scalar 0. 4 | state = tf.Variable(0, name="counter") 5 | 6 | # Create an op whose job is to increment state by 1 7 | 8 | one = tf.constant(1) 9 | new_value = tf.add(state, one) 10 | update = tf.assign(state, new_value) 11 | 12 | # After the graph is launched, variables must first be initialized by an `init` op, 13 | # so an `init` op has to be added to the graph first. 14 | # init_op = tf.initialize_all_variables() 15 | init_op = tf.global_variables_initializer() 16 | 17 | # Launch the graph and run the ops 18 | with tf.Session() as sess: 19 | # Run the 'init' op 20 | sess.run(init_op) 21 | # Print the initial value of 'state' 22 | print(sess.run(state)) 23 | # Run the update op and print 'state' after each step 24 | for _ in range(3): 25 | sess.run(update) 26 | print(sess.run(state)) 27 | 28 | # Output: 29 | 30 | # 0 31 | # 1 32 | # 2 33 | # 3 34 | -------------------------------------------------------------------------------- /watermelon/ch3/3.3/data/watermelon_3a.csv: -------------------------------------------------------------------------------- 1 | 1,0.697,0.46,1 2 | 2,0.774,0.376,1 3 | 3,0.634,0.264,1 4 | 4,0.608,0.318,1 5 | 5,0.556,0.215,1 6 | 6,0.403,0.237,1 7 | 7,0.481,0.149,1 8 | 8,0.437,0.211,1 9 | 9,0.666,0.091,0 10 | 10,0.243,0.267,0 11 | 11,0.245,0.057,0 12 | 12,0.343,0.099,0 13 | 13,0.639,0.161,0 14 | 14,0.657,0.198,0 15 | 15,0.36,0.37,0 16 | 16,0.593,0.042,0 17 | 17,0.719,0.103,0 18 | -------------------------------------------------------------------------------- /watermelon/ch3/3.3/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import model_selection 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn import metrics 6 | import matplotlib.pylab as pl 7 | import self_def 8 | 9 | 10 | # load the CSV file as a numpy matrix 11 | dataSet = np.loadtxt('data/watermelon_3a.csv', delimiter=",") 12 | 13 | # separate the data from the target attributes 14 | X = dataSet[:, 1:3] 15 | y = dataSet[:, 3] 16 | 17 | # draw scatter diagram to show the raw data 18 | f1 = plt.figure(1) 19 | plt.title('watermelon_3a') 20 | plt.xlabel('density') 21 | plt.ylabel('ratio_sugar') 22 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad') 23 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good') 24 | plt.legend(loc='upper right') 25 | plt.show() 26 |
''' 28 | using sklearn lib for logistic regression 29 | ''' 30 | 31 | # generalization of test and train set 32 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=0) 33 | 34 | # model training 35 | log_model = LogisticRegression() 36 | log_model.fit(X_train, y_train) 37 | 38 | # model testing 39 | y_pred = log_model.predict(X_test) 40 | 41 | # summarize the accuracy of fitting 42 | print(metrics.confusion_matrix(y_test, y_pred)) 43 | print(metrics.classification_report(y_test, y_pred)) 44 | 45 | precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred) 46 | 47 | # show decision boundary in plt 48 | # X - some data in 2dimensional np.array 49 | f2 = plt.figure(2) 50 | h = 0.001 51 | x0_min, x0_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1 52 | x1_min, x1_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1 53 | x0, x1 = np.meshgrid(np.arange(x0_min, x0_max, h), 54 | np.arange(x1_min, x1_max, h)) 55 | 56 | # here "model" is your model's prediction (classification) function 57 | z = log_model.predict(np.c_[x0.ravel(), x1.ravel()]) 58 | 59 | # Put the result into a color plot 60 | z = z.reshape(x0.shape) 61 | plt.contourf(x0, x1, z, cmap=pl.cm.Paired) 62 | 63 | # Plot also the training pointsplt.title('watermelon_3a') 64 | plt.title('watermelon_3a') 65 | plt.xlabel('density') 66 | plt.ylabel('ratio_sugar') 67 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad') 68 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good') 69 | # plt.show() 70 | 71 | ''' 72 | coding to implement logistic regression 73 | ''' 74 | 75 | # X_train, X_test, y_train, y_test 76 | # np.ones(n) 77 | m, n = np.shape(X) 78 | X_ex = np.c_[X, np.ones(m)] # extend the variable matrix to [x, 1] 79 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X_ex, y, test_size=0.5, random_state=0) 80 | 81 | # using gradDescent to get the optimal parameter beta = [w, b] in page-59 82 | beta = self_def.gradDscent_2(X_train, y_train) 83 | 84 | # prediction, beta mapping to the model 85 | y_pred = self_def.predict(X_test, beta) 86 | 87 | m_test = np.shape(X_test)[0] 88 | # calculation of confusion_matrix and prediction accuracy 89 | cfmat = np.zeros((2, 2)) 90 | for i in range(m_test): 91 | if y_pred[i] == y_test[i] == 0: 92 | cfmat[0, 0] += 1 93 | elif y_pred[i] == y_test[i] == 1: 94 | cfmat[1, 1] += 1 95 | elif y_pred[i] == 0: 96 | cfmat[1, 0] += 1 97 | elif y_pred[i] == 1: 98 | cfmat[0, 1] += 1 99 | 100 | print(cfmat) 101 | -------------------------------------------------------------------------------- /watermelon/ch3/3.3/self_def.py: -------------------------------------------------------------------------------- 1 | # object likelihood function 2 | import numpy as np 3 | 4 | 5 | def likelihood_sub(x, y, beta): 6 | ''' 7 | @param X: one sample variables 8 | @param y: one sample label 9 | @param beta: the parameter vector in 3.27 10 | @return: the sub_log-likelihood of 3.27 11 | ''' 12 | return -y * np.dot(beta, x.T) + np.math.log(1 + np.math.exp(np.dot(beta, x.T))) 13 | 14 | 15 | def likelihood(X, y, beta): 16 | ''' 17 | @param X: the sample variables matrix 18 | @param y: the sample label matrix 19 | @param beta: the parameter vector in 3.27 20 | @return: the log-likelihood of 3.27 21 | ''' 22 | sum = 0 23 | m, n = np.shape(X) 24 | 25 | for i in range(m): 26 | sum += likelihood_sub(X[i], y[i], beta) 27 | 28 | return sum 29 | 30 | 31 | def partial_derivative(X, y, beta): # refer to 3.30 on 
book page 60 32 | ''' 33 | @param X: the sample variables matrix 34 | @param y: the sample label matrix 35 | @param beta: the parameter vector in 3.27 36 | @return: the partial derivative of beta [j] 37 | ''' 38 | 39 | m, n = np.shape(X) 40 | pd = np.zeros(n) 41 | 42 | for i in range(m): 43 | tmp = y[i] - sigmoid(X[i], beta) 44 | for j in range(n): 45 | pd[j] += X[i][j] * (tmp) 46 | return pd 47 | 48 | 49 | def gradDscent_1(X, y): # implementation of fundational gradDscent algorithms 50 | ''' 51 | @param X: X is the variable matrix 52 | @param y: y is the label array 53 | @return: the best parameter estimate of 3.27 54 | ''' 55 | import matplotlib.pyplot as plt 56 | 57 | h = 0.1 # step length of iterator 58 | max_times = 500 # give the iterative times limit 59 | m, n = np.shape(X) 60 | 61 | b = np.zeros((n, max_times)) # for show convergence curve of parameter 62 | beta = np.zeros(n) # parameter and initial 63 | delta_beta = np.ones(n) * h 64 | llh = 0 65 | llh_temp = 0 66 | 67 | for i in range(max_times): 68 | beta_temp = beta 69 | 70 | for j in range(n): 71 | # for partial derivative 72 | beta[j] += delta_beta[j] 73 | llh_tmp = likelihood(X, y, beta) 74 | delta_beta[j] = -h * (llh_tmp - llh) / delta_beta[j] 75 | 76 | b[j, i] = beta[j] 77 | 78 | beta[j] = beta_temp[j] 79 | 80 | beta += delta_beta 81 | llh = likelihood(X, y, beta) 82 | 83 | t = np.arange(max_times) 84 | 85 | f2 = plt.figure(3) 86 | 87 | p1 = plt.subplot(311) 88 | p1.plot(t, b[0]) 89 | plt.ylabel('w1') 90 | 91 | p2 = plt.subplot(312) 92 | p2.plot(t, b[1]) 93 | plt.ylabel('w2') 94 | 95 | p3 = plt.subplot(313) 96 | p3.plot(t, b[2]) 97 | plt.ylabel('b') 98 | 99 | plt.show() 100 | return beta 101 | 102 | 103 | def gradDscent_2(X, y): # implementation of stochastic gradDscent algorithms 104 | ''' 105 | @param X: X is the variable matrix 106 | @param y: y is the label array 107 | @return: the best parameter estimate of 3.27 108 | ''' 109 | import matplotlib.pyplot as plt 110 | 111 | m, n = np.shape(X) 112 | h = 0.5 # step length of iterator and initial 113 | beta = np.zeros(n) # parameter and initial 114 | delta_beta = np.ones(n) * h 115 | llh = 0 116 | llh_temp = 0 117 | b = np.zeros((n, m)) # for show convergence curve of parameter 118 | 119 | for i in range(m): 120 | beta_temp = beta 121 | 122 | for j in range(n): 123 | # for partial derivative 124 | h = 0.5 * 1 / (1 + i + j) # change step length of iterator 125 | beta[j] += delta_beta[j] 126 | 127 | b[j, i] = beta[j] 128 | 129 | llh_tmp = likelihood_sub(X[i], y[i], beta) 130 | delta_beta[j] = -h * (llh_tmp - llh) / delta_beta[j] 131 | 132 | beta[j] = beta_temp[j] 133 | 134 | beta += delta_beta 135 | llh = likelihood_sub(X[i], y[i], beta) 136 | 137 | t = np.arange(m) 138 | 139 | f2 = plt.figure(3) 140 | 141 | p1 = plt.subplot(311) 142 | p1.plot(t, b[0]) 143 | plt.ylabel('w1') 144 | 145 | p2 = plt.subplot(312) 146 | p2.plot(t, b[1]) 147 | plt.ylabel('w2') 148 | 149 | p3 = plt.subplot(313) 150 | p3.plot(t, b[2]) 151 | plt.ylabel('b') 152 | 153 | plt.show() 154 | 155 | return beta 156 | 157 | 158 | def sigmoid(x, beta): 159 | ''' 160 | @param x: is the predict variable 161 | @param beta: is the parameter 162 | @return: the sigmoid function value 163 | ''' 164 | return 1.0 / (1 + np.math.exp(- np.dot(beta, x.T))) 165 | 166 | 167 | def predict(X, beta): 168 | ''' 169 | prediction the class lable using sigmoid 170 | @param X: data sample form like [x, 1] 171 | @param beta: the parameter of sigmoid form like [w, b] 172 | @return: the class lable array 173 | ''' 174 | m, n = 
np.shape(X) 175 | y = np.zeros(m) 176 | 177 | for i in range(m): 178 | if sigmoid(X[i], beta) > 0.5: y[i] = 1 179 | return y 180 | 181 | 182 | -------------------------------------------------------------------------------- /watermelon/ch3/3.4/cross_validation.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import seaborn as sns 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn import metrics 6 | from sklearn.model_selection import cross_val_predict 7 | 8 | ''' 9 | 1-st. iris data set importing and visualization using seaborn 10 | ''' 11 | 12 | sns.set(style="white", color_codes=True) 13 | iris = sns.load_dataset("iris") 14 | X = iris.values[50:150, 0:4] 15 | y = iris.values[50:150, 4] 16 | 17 | # iris.plot(kind="scatter", x="sepal_length", y="sepal_width") 18 | # sns.pairplot(iris,hue='species') 19 | # sns.plt.show() 20 | 21 | ''' 22 | 2-nd logistic regression using sklearn 23 | ''' 24 | 25 | 26 | # log-regression lib model 27 | log_model = LogisticRegression() 28 | m = np.shape(X)[0] 29 | 30 | # 10-fold cross validation 31 | y_pred = cross_val_predict(log_model, X, y, cv=10) 32 | print(metrics.accuracy_score(y, y_pred)) 33 | 34 | # LOOCV (leave-one-out cross validation) 35 | from sklearn.model_selection import LeaveOneOut 36 | 37 | loo = LeaveOneOut() 38 | accuracy = 0 39 | for train, test in loo.split(X): 40 | log_model.fit(X[train], y[train]) # fitting 41 | y_p = log_model.predict(X[test]) 42 | if y_p == y[test]: accuracy += 1 43 | print(accuracy / np.shape(X)[0]) 44 | 45 | ''' 46 | transfusion-blood data set analysis 47 | ''' 48 | 49 | dataset_transfusion = np.loadtxt('data/transfusion.data', delimiter=",", skiprows=1) 50 | X2 = dataset_transfusion[:, 0:4] 51 | y2 = dataset_transfusion[:, 4] 52 | 53 | 54 | # log-regression lib model 55 | log_model = LogisticRegression() 56 | m = np.shape(X2)[0] 57 | 58 | # 10-fold CV 59 | y2_pred = cross_val_predict(log_model, X2, y2, cv=10) 60 | print(metrics.accuracy_score(y2, y2_pred)) 61 | 62 | # LOOCV 63 | # from sklearn.model_selection import LeaveOneOut 64 | loo = LeaveOneOut() 65 | accuracy = 0 66 | for train, test in loo.split(X2): 67 | log_model.fit(X2[train], y2[train]) # fitting 68 | y2_p = log_model.predict(X2[test]) 69 | if y2_p == y2[test]: accuracy += 1 70 | print(accuracy / np.shape(X2)[0]) 71 | 72 | ''' 73 | Against the overfitting risk of empirical-risk-minimization algorithms, cross validation is the usual remedy and is very common in classification tasks: 74 | 1. Simple cross validation, step by step: 75 | 1) From the full training data S, randomly select s examples as the training set train, keeping the rest as the test set test. 76 | 2) Train on the training set to obtain a hypothesis function or model. 77 | 3) Use that model to predict the class label of every test example and compute the classification accuracy. 78 | 4) Choose the model or hypothesis with the highest accuracy. 79 | This method is called hold-out cross validation, or simple cross validation. Because the test and training sets are kept separate, it guards against overfitting. 80 | 81 | 2. k-fold cross validation: 82 | 1) Split the full training set S into k disjoint subsets; if S has m training examples, each subset holds m/k of them; call the subsets {s1, s2, ..., sk}. 83 | 2) In each round, take one subset as the test set and the remaining k-1 as the training set. 84 | 3) Train a model or hypothesis function on the training part. 85 | 4) Evaluate that model on the held-out subset to get a classification accuracy. 86 | 5) Average the k accuracies as the true classification accuracy of the model or hypothesis. 87 | This method makes full use of all samples, but it is computationally heavier: k trainings and k tests are needed. 88 | 89 | 3. Leave-one-out cross validation: 90 | Leave-one-out keeps a single sample as the test set each round and trains on all the others; with k samples this means training k times and testing k times. 91 | It is the most expensive variant, but it uses the samples most fully and suits small data sets. 92 | 93 | ''' 94 | 95 | ''' 96 | The two cross-validation schemes give similar results, but because the Blood Transfusion Service Center Data Set separates its classes less clearly than iris, the scores are also somewhat worse. 97 | The runs also show that LOOCV takes noticeably longer, and the gap grows as the data set gets larger. 98 | So in general, k-fold cross validation already meets the accuracy requirement while being much cheaper to compute 99 | ''' 100 | -------------------------------------------------------------------------------- /watermelon/ch3/3.4/data/transfusion.data: -------------------------------------------------------------------------------- 1 | Recency
(months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007" 2 | 2 ,50,12500,98 ,1 3 | 0 ,13,3250,28 ,1 4 | 1 ,16,4000,35 ,1 5 | 2 ,20,5000,45 ,1 6 | 1 ,24,6000,77 ,0 7 | 4 ,4,1000,4 ,0 8 | 2 ,7,1750,14 ,1 9 | 1 ,12,3000,35 ,0 10 | 2 ,9,2250,22 ,1 11 | 5 ,46,11500,98 ,1 12 | 4 ,23,5750,58 ,0 13 | 0 ,3,750,4 ,0 14 | 2 ,10,2500,28 ,1 15 | 1 ,13,3250,47 ,0 16 | 2 ,6,1500,15 ,1 17 | 2 ,5,1250,11 ,1 18 | 2 ,14,3500,48 ,1 19 | 2 ,15,3750,49 ,1 20 | 2 ,6,1500,15 ,1 21 | 2 ,3,750,4 ,1 22 | 2 ,3,750,4 ,1 23 | 4 ,11,2750,28 ,0 24 | 2 ,6,1500,16 ,1 25 | 2 ,6,1500,16 ,1 26 | 9 ,9,2250,16 ,0 27 | 4 ,14,3500,40 ,0 28 | 4 ,6,1500,14 ,0 29 | 4 ,12,3000,34 ,1 30 | 4 ,5,1250,11 ,1 31 | 4 ,8,2000,21 ,0 32 | 1 ,14,3500,58 ,0 33 | 4 ,10,2500,28 ,1 34 | 4 ,10,2500,28 ,1 35 | 4 ,9,2250,26 ,1 36 | 2 ,16,4000,64 ,0 37 | 2 ,8,2000,28 ,1 38 | 2 ,12,3000,47 ,1 39 | 4 ,6,1500,16 ,1 40 | 2 ,14,3500,57 ,1 41 | 4 ,7,1750,22 ,1 42 | 2 ,13,3250,53 ,1 43 | 2 ,5,1250,16 ,0 44 | 2 ,5,1250,16 ,1 45 | 2 ,5,1250,16 ,0 46 | 4 ,20,5000,69 ,1 47 | 4 ,9,2250,28 ,1 48 | 2 ,9,2250,36 ,0 49 | 2 ,2,500,2 ,0 50 | 2 ,2,500,2 ,0 51 | 2 ,2,500,2 ,0 52 | 2 ,11,2750,46 ,0 53 | 2 ,11,2750,46 ,1 54 | 2 ,6,1500,22 ,0 55 | 2 ,12,3000,52 ,0 56 | 4 ,5,1250,14 ,1 57 | 4 ,19,4750,69 ,1 58 | 4 ,8,2000,26 ,1 59 | 2 ,7,1750,28 ,1 60 | 2 ,16,4000,81 ,0 61 | 3 ,6,1500,21 ,0 62 | 2 ,7,1750,29 ,0 63 | 2 ,8,2000,35 ,1 64 | 2 ,10,2500,49 ,0 65 | 4 ,5,1250,16 ,1 66 | 2 ,3,750,9 ,1 67 | 3 ,16,4000,74 ,0 68 | 2 ,4,1000,14 ,1 69 | 0 ,2,500,4 ,0 70 | 4 ,7,1750,25 ,0 71 | 1 ,9,2250,51 ,0 72 | 2 ,4,1000,16 ,0 73 | 2 ,4,1000,16 ,0 74 | 4 ,17,4250,71 ,1 75 | 2 ,2,500,4 ,0 76 | 2 ,2,500,4 ,1 77 | 2 ,2,500,4 ,1 78 | 2 ,4,1000,16 ,1 79 | 2 ,2,500,4 ,0 80 | 2 ,2,500,4 ,0 81 | 2 ,2,500,4 ,0 82 | 4 ,6,1500,23 ,1 83 | 2 ,4,1000,16 ,0 84 | 2 ,4,1000,16 ,0 85 | 2 ,4,1000,16 ,0 86 | 2 ,6,1500,28 ,1 87 | 2 ,6,1500,28 ,0 88 | 4 ,2,500,4 ,0 89 | 4 ,2,500,4 ,0 90 | 4 ,2,500,4 ,0 91 | 2 ,7,1750,35 ,1 92 | 4 ,2,500,4 ,1 93 | 4 ,2,500,4 ,0 94 | 4 ,2,500,4 ,0 95 | 4 ,2,500,4 ,0 96 | 12 ,11,2750,23 ,0 97 | 4 ,7,1750,28 ,0 98 | 3 ,17,4250,86 ,0 99 | 4 ,9,2250,38 ,1 100 | 4 ,4,1000,14 ,1 101 | 5 ,7,1750,26 ,1 102 | 4 ,8,2000,34 ,1 103 | 2 ,13,3250,76 ,1 104 | 4 ,9,2250,40 ,0 105 | 2 ,5,1250,26 ,0 106 | 2 ,5,1250,26 ,0 107 | 6 ,17,4250,70 ,0 108 | 0 ,8,2000,59 ,0 109 | 3 ,5,1250,26 ,0 110 | 2 ,3,750,14 ,0 111 | 2 ,10,2500,64 ,0 112 | 4 ,5,1250,23 ,1 113 | 4 ,9,2250,46 ,0 114 | 4 ,5,1250,23 ,0 115 | 4 ,8,2000,40 ,1 116 | 2 ,12,3000,82 ,0 117 | 11 ,24,6000,64 ,0 118 | 2 ,7,1750,46 ,1 119 | 4 ,11,2750,61 ,0 120 | 1 ,7,1750,57 ,0 121 | 2 ,11,2750,79 ,1 122 | 2 ,3,750,16 ,1 123 | 4 ,5,1250,26 ,1 124 | 2 ,6,1500,41 ,1 125 | 2 ,5,1250,33 ,1 126 | 2 ,4,1000,26 ,0 127 | 2 ,5,1250,34 ,0 128 | 4 ,8,2000,46 ,1 129 | 2 ,4,1000,26 ,0 130 | 4 ,8,2000,48 ,1 131 | 2 ,2,500,10 ,1 132 | 4 ,5,1250,28 ,0 133 | 2 ,12,3000,95 ,0 134 | 2 ,2,500,10 ,0 135 | 4 ,6,1500,35 ,0 136 | 2 ,11,2750,88 ,0 137 | 2 ,3,750,19 ,0 138 | 2 ,5,1250,37 ,0 139 | 2 ,12,3000,98 ,0 140 | 9 ,5,1250,19 ,0 141 | 2 ,2,500,11 ,0 142 | 2 ,9,2250,74 ,0 143 | 5 ,14,3500,86 ,0 144 | 4 ,3,750,16 ,0 145 | 4 ,3,750,16 ,0 146 | 4 ,2,500,9 ,1 147 | 4 ,3,750,16 ,1 148 | 6 ,3,750,14 ,0 149 | 2 ,2,500,11 ,0 150 | 2 ,2,500,11 ,1 151 | 2 ,2,500,11 ,0 152 | 2 ,7,1750,58 ,1 153 | 4 ,6,1500,39 ,0 154 | 4 ,11,2750,78 ,0 155 | 2 ,1,250,2 ,1 156 | 2 ,1,250,2 ,0 157 | 2 ,1,250,2 ,0 158 | 2 ,1,250,2 ,0 159 | 2 ,1,250,2 ,0 160 | 2 ,1,250,2 ,0 161 | 2 ,1,250,2 ,0 162 | 2 ,1,250,2 ,0 163 | 2 ,1,250,2 ,0 164 | 2 
,1,250,2 ,0 165 | 2 ,1,250,2 ,1 166 | 2 ,1,250,2 ,1 167 | 2 ,1,250,2 ,1 168 | 2 ,1,250,2 ,0 169 | 2 ,1,250,2 ,0 170 | 2 ,1,250,2 ,0 171 | 2 ,1,250,2 ,0 172 | 2 ,1,250,2 ,0 173 | 2 ,1,250,2 ,0 174 | 2 ,1,250,2 ,0 175 | 2 ,1,250,2 ,0 176 | 2 ,1,250,2 ,0 177 | 11 ,10,2500,35 ,0 178 | 11 ,4,1000,16 ,1 179 | 4 ,5,1250,33 ,1 180 | 4 ,6,1500,41 ,1 181 | 2 ,3,750,22 ,0 182 | 4 ,4,1000,26 ,1 183 | 10 ,4,1000,16 ,0 184 | 2 ,4,1000,35 ,0 185 | 4 ,12,3000,88 ,0 186 | 13 ,8,2000,26 ,0 187 | 11 ,9,2250,33 ,0 188 | 4 ,5,1250,34 ,0 189 | 4 ,4,1000,26 ,0 190 | 8 ,15,3750,77 ,0 191 | 4 ,5,1250,35 ,1 192 | 4 ,7,1750,52 ,0 193 | 4 ,7,1750,52 ,0 194 | 2 ,4,1000,35 ,0 195 | 11 ,11,2750,42 ,0 196 | 2 ,2,500,14 ,0 197 | 2 ,5,1250,47 ,1 198 | 9 ,8,2000,38 ,1 199 | 4 ,6,1500,47 ,0 200 | 11 ,7,1750,29 ,0 201 | 9 ,9,2250,45 ,0 202 | 4 ,6,1500,52 ,0 203 | 4 ,7,1750,58 ,0 204 | 6 ,2,500,11 ,1 205 | 4 ,7,1750,58 ,0 206 | 11 ,9,2250,38 ,0 207 | 11 ,6,1500,26 ,0 208 | 2 ,2,500,16 ,0 209 | 2 ,7,1750,76 ,0 210 | 11 ,6,1500,27 ,0 211 | 11 ,3,750,14 ,0 212 | 4 ,1,250,4 ,0 213 | 4 ,1,250,4 ,0 214 | 4 ,1,250,4 ,0 215 | 4 ,1,250,4 ,0 216 | 4 ,1,250,4 ,0 217 | 4 ,1,250,4 ,1 218 | 4 ,1,250,4 ,0 219 | 4 ,1,250,4 ,0 220 | 4 ,1,250,4 ,0 221 | 4 ,1,250,4 ,0 222 | 4 ,1,250,4 ,0 223 | 4 ,1,250,4 ,1 224 | 4 ,1,250,4 ,1 225 | 4 ,1,250,4 ,0 226 | 4 ,1,250,4 ,1 227 | 4 ,1,250,4 ,1 228 | 4 ,1,250,4 ,0 229 | 4 ,3,750,24 ,0 230 | 4 ,1,250,4 ,0 231 | 4 ,1,250,4 ,0 232 | 4 ,1,250,4 ,0 233 | 4 ,1,250,4 ,1 234 | 4 ,1,250,4 ,0 235 | 10 ,8,2000,39 ,0 236 | 14 ,7,1750,26 ,0 237 | 8 ,10,2500,63 ,0 238 | 11 ,3,750,15 ,0 239 | 4 ,2,500,14 ,0 240 | 2 ,4,1000,43 ,0 241 | 8 ,9,2250,58 ,0 242 | 8 ,8,2000,52 ,1 243 | 11 ,22,5500,98 ,0 244 | 4 ,3,750,25 ,1 245 | 11 ,17,4250,79 ,1 246 | 9 ,2,500,11 ,0 247 | 4 ,5,1250,46 ,0 248 | 11 ,12,3000,58 ,0 249 | 7 ,12,3000,86 ,0 250 | 11 ,2,500,11 ,0 251 | 11 ,2,500,11 ,0 252 | 11 ,2,500,11 ,0 253 | 2 ,6,1500,75 ,0 254 | 11 ,8,2000,41 ,1 255 | 11 ,3,750,16 ,1 256 | 12 ,13,3250,59 ,0 257 | 2 ,3,750,35 ,0 258 | 16 ,8,2000,28 ,0 259 | 11 ,7,1750,37 ,0 260 | 4 ,3,750,28 ,0 261 | 12 ,12,3000,58 ,0 262 | 4 ,4,1000,41 ,0 263 | 11 ,14,3500,73 ,1 264 | 2 ,2,500,23 ,0 265 | 2 ,3,750,38 ,1 266 | 4 ,5,1250,58 ,0 267 | 4 ,4,1000,43 ,1 268 | 3 ,2,500,23 ,0 269 | 11 ,8,2000,46 ,0 270 | 4 ,7,1750,82 ,0 271 | 13 ,4,1000,21 ,0 272 | 16 ,11,2750,40 ,0 273 | 16 ,7,1750,28 ,0 274 | 7 ,2,500,16 ,0 275 | 4 ,5,1250,58 ,0 276 | 4 ,5,1250,58 ,0 277 | 4 ,4,1000,46 ,0 278 | 14 ,13,3250,57 ,0 279 | 4 ,3,750,34 ,0 280 | 14 ,18,4500,78 ,0 281 | 11 ,8,2000,48 ,0 282 | 14 ,16,4000,70 ,0 283 | 14 ,4,1000,22 ,1 284 | 14 ,5,1250,26 ,0 285 | 8 ,2,500,16 ,0 286 | 11 ,5,1250,33 ,0 287 | 11 ,2,500,14 ,0 288 | 4 ,2,500,23 ,0 289 | 9 ,2,500,16 ,1 290 | 14 ,5,1250,28 ,1 291 | 14 ,3,750,19 ,1 292 | 14 ,4,1000,23 ,1 293 | 16 ,12,3000,50 ,0 294 | 11 ,4,1000,28 ,0 295 | 11 ,5,1250,35 ,0 296 | 11 ,5,1250,35 ,0 297 | 2 ,4,1000,70 ,0 298 | 14 ,5,1250,28 ,0 299 | 14 ,2,500,14 ,0 300 | 14 ,2,500,14 ,0 301 | 14 ,2,500,14 ,0 302 | 14 ,2,500,14 ,0 303 | 14 ,2,500,14 ,0 304 | 14 ,2,500,14 ,0 305 | 2 ,3,750,52 ,0 306 | 14 ,6,1500,34 ,0 307 | 11 ,5,1250,37 ,1 308 | 4 ,5,1250,74 ,0 309 | 11 ,3,750,23 ,0 310 | 16 ,4,1000,23 ,0 311 | 16 ,3,750,19 ,0 312 | 11 ,5,1250,38 ,0 313 | 11 ,2,500,16 ,0 314 | 12 ,9,2250,60 ,0 315 | 9 ,1,250,9 ,0 316 | 9 ,1,250,9 ,0 317 | 4 ,2,500,29 ,0 318 | 11 ,2,500,17 ,0 319 | 14 ,4,1000,26 ,0 320 | 11 ,9,2250,72 ,1 321 | 11 ,5,1250,41 ,0 322 | 15 ,16,4000,82 ,0 323 | 9 ,5,1250,51 ,1 324 | 11 ,4,1000,34 ,0 325 | 14 ,8,2000,50 ,1 326 | 16 ,7,1750,38 ,0 
327 | 14 ,2,500,16 ,0 328 | 2 ,2,500,41 ,0 329 | 14 ,16,4000,98 ,0 330 | 14 ,4,1000,28 ,1 331 | 16 ,7,1750,39 ,0 332 | 14 ,7,1750,47 ,0 333 | 16 ,6,1500,35 ,0 334 | 16 ,6,1500,35 ,1 335 | 11 ,7,1750,62 ,1 336 | 16 ,2,500,16 ,0 337 | 16 ,3,750,21 ,1 338 | 11 ,3,750,28 ,0 339 | 11 ,7,1750,64 ,0 340 | 11 ,1,250,11 ,1 341 | 9 ,3,750,34 ,0 342 | 14 ,4,1000,30 ,0 343 | 23 ,38,9500,98 ,0 344 | 11 ,6,1500,58 ,0 345 | 11 ,1,250,11 ,0 346 | 11 ,1,250,11 ,0 347 | 11 ,1,250,11 ,0 348 | 11 ,1,250,11 ,0 349 | 11 ,1,250,11 ,0 350 | 11 ,1,250,11 ,0 351 | 11 ,1,250,11 ,0 352 | 11 ,1,250,11 ,0 353 | 11 ,2,500,21 ,0 354 | 11 ,5,1250,50 ,0 355 | 11 ,2,500,21 ,0 356 | 16 ,4,1000,28 ,0 357 | 4 ,2,500,41 ,0 358 | 16 ,6,1500,40 ,0 359 | 14 ,3,750,26 ,0 360 | 9 ,2,500,26 ,0 361 | 21 ,16,4000,64 ,0 362 | 14 ,6,1500,51 ,0 363 | 11 ,2,500,24 ,0 364 | 4 ,3,750,71 ,0 365 | 21 ,13,3250,57 ,0 366 | 11 ,6,1500,71 ,0 367 | 14 ,2,500,21 ,1 368 | 23 ,15,3750,57 ,0 369 | 14 ,4,1000,38 ,0 370 | 11 ,2,500,26 ,0 371 | 16 ,5,1250,40 ,1 372 | 4 ,2,500,51 ,1 373 | 14 ,3,750,31 ,0 374 | 4 ,2,500,52 ,0 375 | 9 ,4,1000,65 ,0 376 | 14 ,4,1000,40 ,0 377 | 11 ,3,750,40 ,1 378 | 14 ,5,1250,50 ,0 379 | 14 ,1,250,14 ,0 380 | 14 ,1,250,14 ,0 381 | 14 ,1,250,14 ,0 382 | 14 ,1,250,14 ,0 383 | 14 ,1,250,14 ,0 384 | 14 ,1,250,14 ,0 385 | 14 ,1,250,14 ,0 386 | 14 ,1,250,14 ,0 387 | 14 ,7,1750,72 ,0 388 | 14 ,1,250,14 ,0 389 | 14 ,1,250,14 ,0 390 | 9 ,3,750,52 ,0 391 | 14 ,7,1750,73 ,0 392 | 11 ,4,1000,58 ,0 393 | 11 ,4,1000,59 ,0 394 | 4 ,2,500,59 ,0 395 | 11 ,4,1000,61 ,0 396 | 16 ,4,1000,40 ,0 397 | 16 ,10,2500,89 ,0 398 | 21 ,2,500,21 ,1 399 | 21 ,3,750,26 ,0 400 | 16 ,8,2000,76 ,0 401 | 21 ,3,750,26 ,1 402 | 18 ,2,500,23 ,0 403 | 23 ,5,1250,33 ,0 404 | 23 ,8,2000,46 ,0 405 | 16 ,3,750,34 ,0 406 | 14 ,5,1250,64 ,0 407 | 14 ,3,750,41 ,0 408 | 16 ,1,250,16 ,0 409 | 16 ,1,250,16 ,0 410 | 16 ,1,250,16 ,0 411 | 16 ,1,250,16 ,0 412 | 16 ,1,250,16 ,0 413 | 16 ,1,250,16 ,0 414 | 16 ,1,250,16 ,0 415 | 16 ,4,1000,45 ,0 416 | 16 ,1,250,16 ,0 417 | 16 ,1,250,16 ,0 418 | 16 ,1,250,16 ,0 419 | 16 ,1,250,16 ,0 420 | 16 ,1,250,16 ,0 421 | 16 ,2,500,26 ,0 422 | 21 ,2,500,23 ,0 423 | 16 ,2,500,27 ,0 424 | 21 ,2,500,23 ,0 425 | 21 ,2,500,23 ,0 426 | 14 ,4,1000,57 ,0 427 | 16 ,5,1250,60 ,0 428 | 23 ,2,500,23 ,0 429 | 14 ,5,1250,74 ,0 430 | 23 ,3,750,28 ,0 431 | 16 ,3,750,40 ,0 432 | 9 ,2,500,52 ,0 433 | 9 ,2,500,52 ,0 434 | 16 ,7,1750,87 ,1 435 | 14 ,4,1000,64 ,0 436 | 14 ,2,500,35 ,0 437 | 16 ,7,1750,93 ,0 438 | 21 ,2,500,25 ,0 439 | 14 ,3,750,52 ,0 440 | 23 ,14,3500,93 ,0 441 | 18 ,8,2000,95 ,0 442 | 16 ,3,750,46 ,0 443 | 11 ,3,750,76 ,0 444 | 11 ,2,500,52 ,0 445 | 11 ,3,750,76 ,0 446 | 23 ,12,3000,86 ,0 447 | 21 ,3,750,35 ,0 448 | 23 ,2,500,26 ,0 449 | 23 ,2,500,26 ,0 450 | 23 ,8,2000,64 ,0 451 | 16 ,3,750,50 ,0 452 | 23 ,3,750,33 ,0 453 | 21 ,3,750,38 ,0 454 | 23 ,2,500,28 ,0 455 | 21 ,1,250,21 ,0 456 | 21 ,1,250,21 ,0 457 | 21 ,1,250,21 ,0 458 | 21 ,1,250,21 ,0 459 | 21 ,1,250,21 ,0 460 | 21 ,1,250,21 ,0 461 | 21 ,1,250,21 ,0 462 | 21 ,1,250,21 ,0 463 | 21 ,1,250,21 ,0 464 | 21 ,1,250,21 ,1 465 | 21 ,1,250,21 ,0 466 | 21 ,1,250,21 ,0 467 | 21 ,5,1250,60 ,0 468 | 23 ,4,1000,45 ,0 469 | 21 ,4,1000,52 ,0 470 | 22 ,1,250,22 ,1 471 | 11 ,2,500,70 ,0 472 | 23 ,5,1250,58 ,0 473 | 23 ,3,750,40 ,0 474 | 23 ,3,750,41 ,0 475 | 14 ,3,750,83 ,0 476 | 21 ,2,500,35 ,0 477 | 26 ,5,1250,49 ,1 478 | 23 ,6,1500,70 ,0 479 | 23 ,1,250,23 ,0 480 | 23 ,1,250,23 ,0 481 | 23 ,1,250,23 ,0 482 | 23 ,1,250,23 ,0 483 | 23 ,1,250,23 ,0 484 | 23 ,1,250,23 ,0 485 | 23 ,1,250,23 ,0 486 | 23 
,1,250,23 ,0 487 | 23 ,4,1000,53 ,0 488 | 21 ,6,1500,86 ,0 489 | 23 ,3,750,48 ,0 490 | 21 ,2,500,41 ,0 491 | 21 ,3,750,64 ,0 492 | 16 ,2,500,70 ,0 493 | 21 ,3,750,70 ,0 494 | 23 ,4,1000,87 ,0 495 | 23 ,3,750,89 ,0 496 | 23 ,2,500,87 ,0 497 | 35 ,3,750,64 ,0 498 | 38 ,1,250,38 ,0 499 | 38 ,1,250,38 ,0 500 | 40 ,1,250,40 ,0 501 | 74 ,1,250,74 ,0 502 | 2 ,43,10750,86 ,1 503 | 6 ,22,5500,28 ,1 504 | 2 ,34,8500,77 ,1 505 | 2 ,44,11000,98 ,0 506 | 0 ,26,6500,76 ,1 507 | 2 ,41,10250,98 ,1 508 | 3 ,21,5250,42 ,1 509 | 2 ,11,2750,23 ,0 510 | 2 ,21,5250,52 ,1 511 | 2 ,13,3250,32 ,1 512 | 4 ,4,1000,4 ,1 513 | 2 ,11,2750,26 ,0 514 | 2 ,11,2750,28 ,0 515 | 3 ,14,3500,35 ,0 516 | 4 ,16,4000,38 ,1 517 | 4 ,6,1500,14 ,0 518 | 3 ,5,1250,12 ,1 519 | 4 ,33,8250,98 ,1 520 | 3 ,10,2500,33 ,1 521 | 4 ,10,2500,28 ,1 522 | 2 ,11,2750,40 ,1 523 | 2 ,11,2750,41 ,1 524 | 4 ,13,3250,39 ,1 525 | 1 ,10,2500,43 ,1 526 | 4 ,9,2250,28 ,0 527 | 2 ,4,1000,11 ,0 528 | 2 ,5,1250,16 ,1 529 | 2 ,15,3750,64 ,0 530 | 5 ,24,6000,79 ,0 531 | 2 ,6,1500,22 ,1 532 | 4 ,5,1250,16 ,1 533 | 2 ,4,1000,14 ,1 534 | 4 ,8,2000,28 ,0 535 | 2 ,4,1000,14 ,0 536 | 2 ,6,1500,26 ,0 537 | 4 ,5,1250,16 ,1 538 | 2 ,7,1750,32 ,1 539 | 2 ,6,1500,26 ,1 540 | 2 ,8,2000,38 ,1 541 | 2 ,2,500,4 ,1 542 | 2 ,6,1500,28 ,1 543 | 2 ,10,2500,52 ,0 544 | 4 ,16,4000,70 ,1 545 | 4 ,2,500,4 ,1 546 | 1 ,14,3500,95 ,0 547 | 4 ,2,500,4 ,1 548 | 7 ,14,3500,48 ,0 549 | 2 ,3,750,11 ,0 550 | 2 ,12,3000,70 ,1 551 | 4 ,7,1750,32 ,1 552 | 4 ,4,1000,16 ,0 553 | 2 ,6,1500,35 ,1 554 | 4 ,6,1500,28 ,1 555 | 2 ,3,750,14 ,0 556 | 2 ,4,1000,23 ,0 557 | 4 ,4,1000,18 ,0 558 | 5 ,6,1500,28 ,0 559 | 4 ,6,1500,30 ,0 560 | 14 ,5,1250,14 ,0 561 | 3 ,8,2000,50 ,0 562 | 4 ,11,2750,64 ,1 563 | 4 ,9,2250,52 ,0 564 | 4 ,16,4000,98 ,1 565 | 7 ,10,2500,47 ,0 566 | 4 ,14,3500,86 ,0 567 | 2 ,9,2250,75 ,0 568 | 4 ,6,1500,35 ,0 569 | 4 ,9,2250,55 ,0 570 | 4 ,6,1500,35 ,1 571 | 2 ,6,1500,45 ,0 572 | 2 ,6,1500,47 ,0 573 | 4 ,2,500,9 ,0 574 | 2 ,2,500,11 ,1 575 | 2 ,2,500,11 ,0 576 | 2 ,2,500,11 ,1 577 | 4 ,6,1500,38 ,1 578 | 3 ,4,1000,29 ,1 579 | 9 ,9,2250,38 ,0 580 | 11 ,5,1250,18 ,0 581 | 2 ,3,750,21 ,0 582 | 2 ,1,250,2 ,0 583 | 2 ,1,250,2 ,1 584 | 2 ,1,250,2 ,0 585 | 2 ,1,250,2 ,0 586 | 2 ,1,250,2 ,0 587 | 2 ,1,250,2 ,0 588 | 2 ,1,250,2 ,1 589 | 2 ,1,250,2 ,0 590 | 2 ,1,250,2 ,0 591 | 2 ,1,250,2 ,0 592 | 2 ,1,250,2 ,0 593 | 11 ,11,2750,38 ,0 594 | 2 ,3,750,22 ,0 595 | 9 ,11,2750,49 ,1 596 | 5 ,11,2750,75 ,0 597 | 3 ,5,1250,38 ,0 598 | 3 ,1,250,3 ,1 599 | 4 ,6,1500,43 ,0 600 | 2 ,3,750,24 ,0 601 | 12 ,11,2750,39 ,0 602 | 2 ,2,500,14 ,0 603 | 4 ,6,1500,46 ,0 604 | 9 ,3,750,14 ,0 605 | 14 ,8,2000,26 ,0 606 | 4 ,2,500,13 ,0 607 | 4 ,11,2750,95 ,0 608 | 2 ,7,1750,77 ,0 609 | 2 ,7,1750,77 ,0 610 | 4 ,1,250,4 ,0 611 | 4 ,1,250,4 ,0 612 | 4 ,1,250,4 ,0 613 | 4 ,1,250,4 ,0 614 | 4 ,1,250,4 ,1 615 | 4 ,1,250,4 ,0 616 | 4 ,1,250,4 ,0 617 | 4 ,1,250,4 ,0 618 | 4 ,1,250,4 ,0 619 | 4 ,1,250,4 ,0 620 | 4 ,1,250,4 ,1 621 | 4 ,1,250,4 ,0 622 | 4 ,7,1750,62 ,0 623 | 4 ,1,250,4 ,0 624 | 4 ,4,1000,34 ,1 625 | 11 ,6,1500,28 ,0 626 | 13 ,3,750,14 ,1 627 | 7 ,5,1250,35 ,0 628 | 9 ,9,2250,54 ,0 629 | 11 ,2,500,11 ,0 630 | 2 ,5,1250,63 ,0 631 | 7 ,11,2750,89 ,0 632 | 8 ,9,2250,64 ,0 633 | 2 ,2,500,22 ,0 634 | 6 ,3,750,26 ,0 635 | 12 ,15,3750,71 ,0 636 | 13 ,3,750,16 ,0 637 | 11 ,16,4000,89 ,0 638 | 4 ,5,1250,58 ,0 639 | 14 ,7,1750,35 ,0 640 | 11 ,4,1000,27 ,0 641 | 7 ,9,2250,89 ,1 642 | 11 ,8,2000,52 ,1 643 | 7 ,5,1250,52 ,0 644 | 11 ,6,1500,41 ,0 645 | 10 ,5,1250,38 ,0 646 | 14 ,2,500,14 ,1 647 | 14 ,2,500,14 ,0 648 | 14 
,2,500,14 ,0 649 | 2 ,2,500,33 ,0 650 | 11 ,3,750,23 ,0 651 | 14 ,8,2000,46 ,0 652 | 9 ,1,250,9 ,0 653 | 16 ,5,1250,27 ,0 654 | 14 ,4,1000,26 ,0 655 | 4 ,2,500,30 ,0 656 | 14 ,3,750,21 ,0 657 | 16 ,16,4000,77 ,0 658 | 4 ,2,500,31 ,0 659 | 14 ,8,2000,50 ,0 660 | 11 ,3,750,26 ,0 661 | 14 ,7,1750,45 ,0 662 | 15 ,5,1250,33 ,0 663 | 16 ,2,500,16 ,0 664 | 16 ,3,750,21 ,0 665 | 11 ,8,2000,72 ,0 666 | 11 ,1,250,11 ,0 667 | 11 ,1,250,11 ,0 668 | 11 ,1,250,11 ,0 669 | 11 ,1,250,11 ,1 670 | 11 ,1,250,11 ,0 671 | 2 ,3,750,75 ,1 672 | 2 ,3,750,77 ,0 673 | 16 ,4,1000,28 ,0 674 | 16 ,15,3750,87 ,0 675 | 16 ,14,3500,83 ,0 676 | 16 ,10,2500,62 ,0 677 | 16 ,3,750,23 ,0 678 | 14 ,3,750,26 ,0 679 | 23 ,19,4750,62 ,0 680 | 11 ,7,1750,75 ,0 681 | 14 ,3,750,28 ,0 682 | 20 ,14,3500,69 ,1 683 | 4 ,2,500,46 ,0 684 | 11 ,2,500,25 ,0 685 | 11 ,3,750,37 ,0 686 | 16 ,4,1000,33 ,0 687 | 21 ,7,1750,38 ,0 688 | 13 ,7,1750,76 ,0 689 | 16 ,6,1500,50 ,0 690 | 14 ,3,750,33 ,0 691 | 14 ,1,250,14 ,0 692 | 14 ,1,250,14 ,0 693 | 14 ,1,250,14 ,0 694 | 14 ,1,250,14 ,0 695 | 14 ,1,250,14 ,0 696 | 14 ,1,250,14 ,0 697 | 17 ,7,1750,58 ,1 698 | 14 ,3,750,35 ,0 699 | 14 ,3,750,35 ,0 700 | 16 ,7,1750,64 ,0 701 | 21 ,2,500,21 ,0 702 | 16 ,3,750,35 ,0 703 | 16 ,1,250,16 ,0 704 | 16 ,1,250,16 ,0 705 | 16 ,1,250,16 ,0 706 | 16 ,1,250,16 ,0 707 | 16 ,1,250,16 ,0 708 | 14 ,2,500,29 ,0 709 | 11 ,4,1000,74 ,0 710 | 11 ,2,500,38 ,1 711 | 21 ,6,1500,48 ,0 712 | 23 ,2,500,23 ,0 713 | 23 ,6,1500,45 ,0 714 | 14 ,2,500,35 ,1 715 | 16 ,6,1500,81 ,0 716 | 16 ,4,1000,58 ,0 717 | 16 ,5,1250,71 ,0 718 | 21 ,2,500,26 ,0 719 | 21 ,3,750,35 ,0 720 | 21 ,3,750,35 ,0 721 | 23 ,8,2000,69 ,0 722 | 21 ,3,750,38 ,0 723 | 23 ,3,750,35 ,0 724 | 21 ,3,750,40 ,0 725 | 23 ,2,500,28 ,0 726 | 21 ,1,250,21 ,0 727 | 21 ,1,250,21 ,0 728 | 25 ,6,1500,50 ,0 729 | 21 ,1,250,21 ,0 730 | 21 ,1,250,21 ,0 731 | 23 ,3,750,39 ,0 732 | 21 ,2,500,33 ,0 733 | 14 ,3,750,79 ,0 734 | 23 ,1,250,23 ,1 735 | 23 ,1,250,23 ,0 736 | 23 ,1,250,23 ,0 737 | 23 ,1,250,23 ,0 738 | 23 ,1,250,23 ,0 739 | 23 ,1,250,23 ,0 740 | 23 ,1,250,23 ,0 741 | 23 ,4,1000,52 ,0 742 | 23 ,1,250,23 ,0 743 | 23 ,7,1750,88 ,0 744 | 16 ,3,750,86 ,0 745 | 23 ,2,500,38 ,0 746 | 21 ,2,500,52 ,0 747 | 23 ,3,750,62 ,0 748 | 39 ,1,250,39 ,0 749 | 72 ,1,250,72 ,0 -------------------------------------------------------------------------------- /watermelon/ch3/3.4/data/transfusion.names: -------------------------------------------------------------------------------- 1 | Title: Blood Transfusion Service Center Data Set 2 | 3 | Abstract: Data taken from the Blood Transfusion Service Center in Hsin-Chu City 4 | in Taiwan -- this is a classification problem. 5 | 6 | 7 | ----------------------------------------------------- 8 | 9 | Data Set Characteristics: Multivariate 10 | Number of Instances: 748 11 | Area: Business 12 | Attribute Characteristics: Real 13 | Number of Attributes: 5 14 | Date Donated: 2008-10-03 15 | Associated Tasks: Classification 16 | Missing Values? N/A 17 | 18 | ----------------------------------------------------- 19 | 20 | Source: 21 | 22 | Original Owner and Donor 23 | Prof. I-Cheng Yeh 24 | Department of Information Management 25 | Chung-Hua University, 26 | Hsin Chu, Taiwan 30067, R.O.C. 
27 | e-mail:icyeh 'at' chu.edu.tw 28 | TEL:886-3-5186511 29 | 30 | Date Donated: October 3, 2008 31 | 32 | ----------------------------------------------------- 33 | 34 | Data Set Information: 35 | 36 | To demonstrate the RFMTC marketing model (a modified version of RFM), this study 37 | adopted the donor database of Blood Transfusion Service Center in Hsin-Chu City 38 | in Taiwan. The center passes their blood transfusion service bus to one 39 | university in Hsin-Chu City to gather blood donated about every three months. To 40 | build a FRMTC model, we selected 748 donors at random from the donor database. 41 | These 748 donor data, each one included R (Recency - months since last 42 | donation), F (Frequency - total number of donation), M (Monetary - total blood 43 | donated in c.c.), T (Time - months since first donation), and a binary variable 44 | representing whether he/she donated blood in March 2007 (1 stand for donating 45 | blood; 0 stands for not donating blood). 46 | 47 | ----------------------------------------------------- 48 | 49 | Attribute Information: 50 | 51 | Given is the variable name, variable type, the measurement unit and a brief 52 | description. The "Blood Transfusion Service Center" is a classification problem. 53 | The order of this listing corresponds to the order of numerals along the rows of 54 | the database. 55 | 56 | R (Recency - months since last donation), 57 | F (Frequency - total number of donation), 58 | M (Monetary - total blood donated in c.c.), 59 | T (Time - months since first donation), and 60 | a binary variable representing whether he/she donated blood in March 2007 (1 61 | stand for donating blood; 0 stands for not donating blood). 62 | 63 | 64 | Table 1 shows the descriptive statistics of the data. We selected 500 data at 65 | random as the training set, and the rest 248 as the testing set. 66 | 67 | Table 1. Descriptive statistics of the data 68 | 69 | Variable Data Type Measurement Description min max mean std 70 | Recency quantitative Months Input 0.03 74.4 9.74 8.07 71 | Frequency quantitative Times Input 1 50 5.51 5.84 72 | Monetary quantitative c.c. blood Input 250 12500 1378.68 1459.83 73 | Time quantitative Months Input 2.27 98.3 34.42 24.32 74 | Whether he/she donated blood in March 2007 binary 1=yes 0=no Output 0 1 1 (24%) 0 (76%) 75 | 76 | 77 | ----------------------------------------------------- 78 | 79 | Citation Request: 80 | 81 | NOTE: Reuse of this database is unlimited with retention of copyright notice for 82 | Prof. I-Cheng Yeh and the following published paper: 83 | 84 | Yeh, I-Cheng, Yang, King-Jang, and Ting, Tao-Ming, "Knowledge discovery on RFM 85 | model using Bernoulli sequence, "Expert Systems with Applications, 2008 86 | (doi:10.1016/j.eswa.2008.07.018). 
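A minimal sketch of loading this data set the way cross_validation.py above does (the variable names here are illustrative, not from the original repo):

import numpy as np

# skiprows=1 drops the header line; columns 0-3 are R, F, M, T and column 4 is the binary label
data = np.loadtxt('data/transfusion.data', delimiter=",", skiprows=1)
X, y = data[:, 0:4], data[:, 4]  # 748 samples, 4 numeric inputs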
87 | 88 | 89 | -------------------------------------------------------------------------------- /watermelon/ch3/3.5/LDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np # for matrix calculation 2 | import matplotlib.pyplot as plt 3 | from self_def import GetProjectivePoint_2D 4 | from sklearn import model_selection 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | from sklearn import metrics 7 | import matplotlib.pyplot as plt 8 | ''' 9 | data importion and pre-analysis 10 | ''' 11 | 12 | # load the CSV file as a numpy matrix 13 | data_file = open('data/watermelon_3a.csv') 14 | dataset = np.loadtxt(data_file, delimiter=",") 15 | 16 | # separate the data from the target attributes 17 | X = dataset[:, 1:3] 18 | y = dataset[:, 3] 19 | 20 | # draw scatter diagram to show the raw data 21 | f1 = plt.figure(1) 22 | plt.title('watermelon_3a') 23 | plt.xlabel('density') 24 | plt.ylabel('ratio_sugar') 25 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad') 26 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good') 27 | plt.legend(loc='upper right') 28 | # plt.show() 29 | 30 | ''' 31 | LDA via sklearn 32 | ''' 33 | # generalization of train and test set 34 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=0) 35 | 36 | # model fitting 37 | lda_model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None).fit(X_train, y_train) 38 | 39 | # model validation 40 | y_pred = lda_model.predict(X_test) 41 | 42 | # summarize the fit of the model 43 | print(metrics.confusion_matrix(y_test, y_pred)) 44 | print(metrics.classification_report(y_test, y_pred)) 45 | 46 | # draw the classfier decision boundary 47 | f2 = plt.figure(2) 48 | h = 0.001 49 | # x0_min, x0_max = X[:, 0].min()-0.1, X[:, 0].max()+0.1 50 | # x1_min, x1_max = X[:, 1].min()-0.1, X[:, 1].max()+0.1 51 | 52 | x0, x1 = np.meshgrid(np.arange(-1, 1, h), 53 | np.arange(-1, 1, h)) 54 | 55 | # x0, x1 = np.meshgrid(np.arange(x0_min, x0_max, h), 56 | # np.arange(x1_min, x1_max, h)) 57 | 58 | z = lda_model.predict(np.c_[x0.ravel(), x1.ravel()]) 59 | 60 | # Put the result into a color plot 61 | z = z.reshape(x0.shape) 62 | plt.contourf(x0, x1, z) 63 | 64 | # Plot also the training pointsplt.title('watermelon_3a') 65 | plt.title('watermelon_3a') 66 | plt.xlabel('density') 67 | plt.ylabel('ratio_sugar') 68 | plt.scatter(X[y == 0,0], X[y == 0,1], marker = 'o', color = 'k', s=100, label = 'bad') 69 | plt.scatter(X[y == 1,0], X[y == 1,1], marker = 'o', color = 'g', s=100, label = 'good') 70 | plt.show() 71 | 72 | 73 | ''' 74 | implementation of LDA based on self-coding 75 | ''' 76 | # 1-st. get the mean vector of each class 77 | 78 | u = [] 79 | for i in range(2): # two class 80 | u.append(np.mean(X[y == i], axis=0)) # column mean 81 | 82 | # 2-nd. computing the within-class scatter matrix, refer on book (3.33) 83 | m, n = np.shape(X) 84 | Sw = np.zeros((n, n)) 85 | for i in range(m): 86 | x_tmp = X[i].reshape(n, 1) # row -> cloumn vector 87 | if y[i] == 0: u_tmp = u[0].reshape(n, 1) 88 | if y[i] == 1: u_tmp = u[1].reshape(n, 1) 89 | Sw += np.dot(x_tmp - u_tmp, (x_tmp - u_tmp).T) 90 | 91 | Sw = np.mat(Sw) 92 | U, sigma, V = np.linalg.svd(Sw) 93 | 94 | Sw_inv = V.T * np.linalg.inv(np.diag(sigma)) * U.T 95 | # 3-th. 
computing the parameter w, following eq. (3.39) in the book 96 | w = np.dot(Sw_inv, (u[0] - u[1]).reshape(n, 1)) # Sw_inv is the SVD-based pseudo-inverse computed above 97 | 98 | print(w) 99 | 100 | # 4-th. draw the LDA line in the scatter figure 101 | 102 | # f2 = plt.figure(2) 103 | f3 = plt.figure(3) 104 | plt.xlim(-0.2, 1) 105 | plt.ylim(-0.5, 0.7) 106 | 107 | p0_x0 = -X[:, 0].max() 108 | p0_x1 = (w[1, 0] / w[0, 0]) * p0_x0 109 | p1_x0 = X[:, 0].max() 110 | p1_x1 = (w[1, 0] / w[0, 0]) * p1_x0 111 | 112 | plt.title('watermelon_3a - LDA') 113 | plt.xlabel('density') 114 | plt.ylabel('ratio_sugar') 115 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=10, label='bad') 116 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=10, label='good') 117 | plt.legend(loc='upper right') 118 | 119 | plt.plot([p0_x0, p1_x0], [p0_x1, p1_x1]) 120 | 121 | # draw the projective point of each sample on the line 122 | 123 | 124 | m, n = np.shape(X) 125 | for i in range(m): 126 | x_p = GetProjectivePoint_2D([X[i, 0], X[i, 1]], [w[1, 0] / w[0, 0], 0]) 127 | if y[i] == 0: 128 | plt.plot(x_p[0], x_p[1], 'ko', markersize=5) 129 | if y[i] == 1: 130 | plt.plot(x_p[0], x_p[1], 'go', markersize=5) 131 | plt.plot([x_p[0], X[i, 0]], [x_p[1], X[i, 1]], 'c--', linewidth=0.3) 132 | 133 | plt.show() 134 | 135 | ''' 136 | Because the data is not linearly separable, the class clusters overlap. 137 | Next, after inspecting the data, we delete sample 15, an outlier of the bad class, from the watermelon set; this greatly improves the linear separability of the data. 138 | implementation of LDA again after deleting the outlier (X[14]) 139 | ''' 140 | # computing the d-dimensional mean vectors 141 | # import numpy as np 142 | 143 | # 1-st. get the mean vector of each class 144 | X = np.delete(X, 14, 0) 145 | y = np.delete(y, 14, 0) 146 | 147 | u = [] 148 | for i in range(2): # two classes 149 | u.append(np.mean(X[y == i], axis=0)) # column mean 150 | 151 | # 2-nd. computing the within-class scatter matrix, following eq. (3.33) in the book 152 | m, n = np.shape(X) 153 | Sw = np.zeros((n, n)) 154 | for i in range(m): 155 | x_tmp = X[i].reshape(n, 1) # row -> column vector 156 | if y[i] == 0: u_tmp = u[0].reshape(n, 1) 157 | if y[i] == 1: u_tmp = u[1].reshape(n, 1) 158 | Sw += np.dot(x_tmp - u_tmp, (x_tmp - u_tmp).T) 159 | 160 | Sw = np.mat(Sw) 161 | U, sigma, V = np.linalg.svd(Sw) 162 | 163 | Sw_inv = V.T * np.linalg.inv(np.diag(sigma)) * U.T 164 | # 3-th.
computing the parameter w, following eq. (3.39) in the book 165 | w = np.dot(Sw_inv, (u[0] - u[1]).reshape(n, 1)) # Sw_inv is the SVD-based pseudo-inverse computed above 166 | 167 | print(w) 168 | 169 | # 4-th. draw the LDA line in the scatter figure 170 | 171 | # f2 = plt.figure(2) 172 | f4 = plt.figure(4) 173 | plt.xlim(-0.2, 1) 174 | plt.ylim(-0.5, 0.7) 175 | 176 | p0_x0 = -X[:, 0].max() 177 | p0_x1 = (w[1, 0] / w[0, 0]) * p0_x0 178 | p1_x0 = X[:, 0].max() 179 | p1_x1 = (w[1, 0] / w[0, 0]) * p1_x0 180 | 181 | plt.title('watermelon_3a - LDA') 182 | plt.xlabel('density') 183 | plt.ylabel('ratio_sugar') 184 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=10, label='bad') 185 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=10, label='good') 186 | plt.legend(loc='upper right') 187 | 188 | plt.plot([p0_x0, p1_x0], [p0_x1, p1_x1]) 189 | 190 | # draw the projective point of each sample on the line 191 | 192 | m, n = np.shape(X) 193 | for i in range(m): 194 | x_p = GetProjectivePoint_2D([X[i, 0], X[i, 1]], [w[1, 0] / w[0, 0], 0]) 195 | if y[i] == 0: 196 | plt.plot(x_p[0], x_p[1], 'ko', markersize=5) 197 | if y[i] == 1: 198 | plt.plot(x_p[0], x_p[1], 'go', markersize=5) 199 | plt.plot([x_p[0], X[i, 0]], [x_p[1], X[i, 1]], 'c--', linewidth=0.3) 200 | 201 | plt.show() 202 | 203 | ''' 204 | Because of the nonlinearity inherent in the watermelon data set, the line produced by LDA does not separate the class clusters well, 205 | which shows that the basic LDA model is a poor fit for linearly non-separable data. To extend it to the nonlinear case, SVM-style kernel tricks might be worth considering. 206 | ''' -------------------------------------------------------------------------------- /watermelon/ch3/3.5/data/watermelon_3a.csv: -------------------------------------------------------------------------------- 1 | 1,0.697,0.46,1 2 | 2,0.774,0.376,1 3 | 3,0.634,0.264,1 4 | 4,0.608,0.318,1 5 | 5,0.556,0.215,1 6 | 6,0.403,0.237,1 7 | 7,0.481,0.149,1 8 | 8,0.437,0.211,1 9 | 9,0.666,0.091,0 10 | 10,0.243,0.0267,0 11 | 11,0.245,0.057,0 12 | 12,0.343,0.099,0 13 | 13,0.639,0.161,0 14 | 14,0.657,0.198,0 15 | 15,0.36,0.37,0 16 | 16,0.593,0.042,0 17 | 17,0.719,0.103,0 18 | -------------------------------------------------------------------------------- /watermelon/ch3/3.5/self_def.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | ''' 4 | get the projective point (2D) of a point onto a line 5 | 6 | @param point: the coordinates of the point, of the form [a, b] 7 | @param line: the line parameters, of the form [k, t], meaning y = k*x + t 8 | @return: the coordinates of the projective point 9 | ''' 10 | 11 | 12 | def GetProjectivePoint_2D(point, line): 13 | a = point[0] 14 | b = point[1] 15 | k = line[0] 16 | t = line[1] 17 | 18 | if k == 0: 19 | return [a, t] 20 | elif k == np.inf: 21 | return [0, b] 22 | x = (a + k * b - k * t) / (k * k + 1) 23 | y = k * x + t 24 | return [x, y] 25 | -------------------------------------------------------------------------------- /watermelon/ch4/4.3/ID3_watermelon.py: -------------------------------------------------------------------------------- 1 | ''' 2 | import data and pre-analysis through data visualization 3 | ''' 4 | # use a pandas dataframe to read the .csv, which contains Chinese characters 5 | import pandas as pd 6 | import decision_tree 7 | 8 | data_file_encode = "gb18030" # watermelon_3.csv is encoded as gb18030 9 | with open("data/watermelon_3.csv", mode='r', encoding=data_file_encode) as data_file: 10 | df = pd.read_csv(data_file) 11 | 12 | # using seaborn for data visualization.
13 | # # load a Chinese font 14 | # import matplotlib as mpl 15 | # import matplotlib.pyplot as plt 16 | # import seaborn as sns 17 | # # sns.set(style="whitegrid", color_codes=True) 18 | # mpl.rcParams['font.sans-serif'] = ['Droid Sans Fallback'] # for Chinese character visualization 19 | # mpl.rcParams['axes.unicode_minus'] = False 20 | # sns.set_context("poster") 21 | # 22 | # f1 = plt.figure(1) 23 | # sns.FacetGrid(df, hue="好瓜", size=5).map(plt.scatter, "密度", "含糖率").add_legend() 24 | # sns.plt.show() 25 | # 26 | # f2 = plt.figure(2) 27 | # sns.plt.subplot(221) 28 | # sns.swarmplot(x = "纹理", y = '密度', hue = "好瓜", data = df) 29 | # sns.plt.subplot(222) 30 | # sns.swarmplot(x = "敲声", y = '密度', hue = "好瓜", data = df) 31 | # sns.plt.subplot(223) 32 | # sns.swarmplot(x = "色泽", y = '含糖率', hue = "好瓜", data = df) 33 | # sns.plt.subplot(224) 34 | # sns.swarmplot(x = "敲声", y = '含糖率', hue = "好瓜", data = df) 35 | # sns.plt.show() 36 | 37 | ''' 38 | implementation of ID3 39 | 40 | relies on decision_tree.py 41 | ''' 42 | 43 | root = decision_tree.TreeGenerate(df) 44 | 45 | # df = df.drop(['密度','含糖率'], 1) 46 | # df = df.drop(['色泽','根蒂','敲声','纹理','脐部','触感'], 1) 47 | 48 | accuracy_scores = [] 49 | 50 | ''' 51 | from random import sample 52 | for i in range(10): 53 | train = sample(range(len(df.index)), int(1*len(df.index)/2)) 54 | 55 | df_train = df.iloc[train] 56 | df_test = df.drop(train) 57 | # generate the tree 58 | root = decision_tree.TreeGenerate(df_train) 59 | # test the accuracy 60 | pred_true = 0 61 | for i in df_test.index: 62 | label = decision_tree.Predict(root, df[df.index == i]) 63 | if label == df_test[df_test.columns[-1]][i]: 64 | pred_true += 1 65 | 66 | accuracy = pred_true / len(df_test.index) 67 | accuracy_scores.append(accuracy) 68 | ''' 69 | 70 | # k-fold cross validation 71 | # (k = 5 folds) 72 | 73 | n = len(df.index) 74 | k = 5 75 | for i in range(k): 76 | m = int(n / k) 77 | test = [] 78 | for j in range(i * m, i * m + m): 79 | test.append(j) 80 | 81 | df_train = df.drop(test) 82 | df_test = df.iloc[test] 83 | root = decision_tree.TreeGenerate(df_train) # generate the tree 84 | 85 | # test the accuracy 86 | pred_true = 0 87 | for i in df_test.index: 88 | label = decision_tree.Predict(root, df[df.index == i]) 89 | if label == df_test[df_test.columns[-1]][i]: 90 | pred_true += 1 91 | 92 | accuracy = pred_true / len(df_test.index) 93 | accuracy_scores.append(accuracy) 94 | 95 | # print the prediction accuracy result 96 | accuracy_sum = 0 97 | print("accuracy: ", end="") 98 | for i in range(k): 99 | print("%.3f " % accuracy_scores[i], end="") 100 | accuracy_sum += accuracy_scores[i] 101 | print("\naverage accuracy: %.3f" % (accuracy_sum / k)) 102 | 103 | # decision tree visualization using pydotplus.graphviz 104 | root = decision_tree.TreeGenerate(df) 105 | 106 | decision_tree.DrawPNG(root, "decision_tree_ID3.png") 107 | -------------------------------------------------------------------------------- /watermelon/ch4/4.3/data/watermelon_3.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/watermelon/ch4/4.3/data/watermelon_3.csv -------------------------------------------------------------------------------- /watermelon/ch4/4.3/decision_tree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | definition of the decision node class 3 | 4 | attr: the attribute this node branches on 5 | attr_down: dict: {key,
5 | attr_down: dict: {key, value}
6 |         key:   categorical: categorical attr_value
7 |                continuous:  '<= div_value' for the small part
8 |                             '> div_value'  for the big part
9 |         value: children (Node class)
10 | label: class label (the majority of current sample labels)
11 | Decision-tree node class.
12 | A node stores its branching attribute, the attribute values of its sub-branches, and its class label (meaningful at leaf nodes; kept on every node for generality).
13 | '''
14 |
15 | import numpy as np
16 |
17 |
18 | class Node(object):
19 |     def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
20 |         self.attr = attr_init
21 |         self.label = label_init
22 |         self.attr_down = attr_down_init if attr_down_init is not None else {}  # avoid a shared mutable default
23 |
24 |
25 | '''
26 | Branching for decision tree using recursion
27 | recursive implementation of the tree-generation algorithm on p. 74
28 |
29 | @param df: the pandas dataframe of the data_set
30 | @return root: Node, the root node of decision tree
31 | '''
32 |
33 |
34 | def TreeGenerate(df):
35 |     # generating a new root node
36 |     new_node = Node(None, None, {})
37 |     label_arr = df[df.columns[-1]]
38 |
39 |     label_count = NodeLabel(label_arr)
40 |     if label_count:  # assert the label_count isn't empty
41 |         new_node.label = max(label_count, key=label_count.get)
42 |
43 |     # end if there is only 1 class in the current node data (all samples share one class)
44 |     # end if the sample set is empty
45 |     if len(label_count) == 1 or len(label_arr) == 0:
46 |         return new_node
47 |
48 |     # get the optimal attribute for a new branching
49 |     new_node.attr, div_value = OptAttr(df)
50 |
51 |     # recursion
52 |     if div_value == 0:  # categorical variable
53 |         value_count = ValueCount(df[new_node.attr])
54 |         for value in value_count:
55 |             df_v = df[df[new_node.attr].isin([value])]  # get sub set
56 |             # delete current attribute
57 |             df_v = df_v.drop(new_node.attr, axis=1)
58 |             new_node.attr_down[value] = TreeGenerate(df_v)
59 |
60 |     else:  # continuous variable # left and right child
61 |         value_l = "<=%.3f" % div_value
62 |         value_r = ">%.3f" % div_value
63 |         df_v_l = df[df[new_node.attr] <= div_value]  # get sub set
64 |         df_v_r = df[df[new_node.attr] > div_value]
65 |
66 |         new_node.attr_down[value_l] = TreeGenerate(df_v_l)
67 |         new_node.attr_down[value_r] = TreeGenerate(df_v_r)
68 |
69 |     return new_node
70 |
71 |
72 | '''
73 | make a prediction based on root
74 |
75 | @param root: Node, root Node of the decision tree
76 | @param df_sample: dataframe, a sample line
77 | '''
78 |
79 |
80 | def Predict(root, df_sample):
81 |     try:
82 |         import re  # using Regular Expression to get the number in string
83 |     except ImportError:
84 |         print("module re not found")
85 |
86 |     while root.attr is not None:
87 |         # continuous variable
88 |         if df_sample[root.attr].dtype.kind in 'fi':  # was `== (float, int)`, a comparison that never matches
89 |             # get the div_value from root.attr_down
90 |             for key in list(root.attr_down):
91 |                 num = re.findall(r"\d+\.?\d*", key)
92 |                 div_value = float(num[0])
93 |                 break
94 |             if df_sample[root.attr].values[0] <= div_value:
95 |                 key = "<=%.3f" % div_value
96 |                 root = root.attr_down[key]
97 |             else:
98 |                 key = ">%.3f" % div_value
99 |                 root = root.attr_down[key]
100 |
101 |         # categorical variable
102 |         else:
103 |             key = df_sample[root.attr].values[0]
104 |             # check whether the attr_value is in the child branch
105 |             if key in root.attr_down:
106 |                 root = root.attr_down[key]
107 |             else:
108 |                 break
109 |
110 |     return root.label
111 |
112 |
113 | '''
114 | count each label that appears and its occurrences
115 |
116 | @param label_arr: data array for class labels
117 | @return label_count: dict, each appeared label and its count
118 | '''
119 |
120 |
121 | def NodeLabel(label_arr):
122 |     label_count = {}  # store count of label
123 |
124 |     for label in label_arr:
125 |         if label in label_count:
126 |             label_count[label] += 1
127 |         else:
128 |             label_count[label] = 1
129 |
130 |     return label_count
131 |
132 |
133 | '''
134 | count each value that appears for a categorical attribute and its occurrences
135 |
136 | @param data_arr: data array for an attribute
137 | @return value_count: dict, each appeared value and its count
138 | '''
139 |
140 |
141 | def ValueCount(data_arr):
142 |     value_count = {}  # store count of value
143 |
144 |     for label in data_arr:
145 |         if label in value_count:
146 |             value_count[label] += 1
147 |         else:
148 |             value_count[label] = 1
149 |
150 |     return value_count
151 |
152 |
153 | '''
154 | find the optimal attribute of the current data_set
155 | pick the optimal splitting attribute
156 |
157 | @param df: the pandas dataframe of the data_set
158 | @return opt_attr: the optimal attribute for the branch
159 | @return div_value: for a discrete variable, value = 0
160 |                    for a continuous variable, value = t, the bisection divide value
161 | '''
162 |
163 |
164 | def OptAttr(df):
165 |     info_gain = 0
166 |
167 |     for attr_id in df.columns[1:-1]:
168 |         info_gain_tmp, div_value_tmp = InfoGain(df, attr_id)
169 |         if info_gain_tmp > info_gain:
170 |             info_gain = info_gain_tmp
171 |             opt_attr = attr_id
172 |             div_value = div_value_tmp
173 |
174 |     return opt_attr, div_value
175 |
176 |
177 | '''
178 | calculating the information gain of an attribute
179 | The optimal splitting attribute is chosen by maximising information gain; the main challenge is handling discrete and continuous attributes separately.
180 | Discrete (categorical) variables follow the book, pp. 75-77;
181 | continuous variables use the bisection method introduced on pp. 83-85.
182 |
183 | @param df: dataframe, the pandas dataframe of the data_set
184 | @param attr_id: the target attribute in df
185 | @return info_gain: the information gain of the current attribute
186 | @return div_value: for a discrete variable, value = 0
187 |                    for a continuous variable, value = t (the division value)
188 | '''
189 |
190 |
191 | # note: originally marked "todo: fails at runtime"; fixed below (dtype check, sort_values, right-subset slice)
192 | def InfoGain(df, index):
193 |     info_gain = InfoEnt(df.values[:, -1])  # info entropy of the whole label column
194 |     div_value = 0  # div_value for continuous attribute
195 |
196 |     n = len(df[index])  # the number of samples
197 |     # 1. for a continuous variable, use the method of bisection
198 |
199 |     if df[index].dtype.kind == 'f':  # was `type(df[index][0]) == np.float64`, which breaks on subsets whose index lacks label 0
200 |
201 |         sub_info_ent = {}  # store the div_value (div) and its subset entropy
202 |         # print(df)
203 |
204 |         df = df.sort_values(by=index, ascending=True)  # sorting via column; df.sort() was removed in newer pandas
205 |         df = df.reset_index(drop=True)
206 |
207 |         data_arr = df[index]
208 |         # print(data_arr)
209 |         label_arr = df[df.columns[-1]]
210 |
211 |         for i in range(n - 1):
212 |             div = (data_arr[i] + data_arr[i + 1]) / 2
213 |             sub_info_ent[div] = ((i + 1) * InfoEnt(label_arr[0:i + 1]) / n) \
214 |                                 + ((n - i - 1) * InfoEnt(label_arr[i + 1:]) / n)  # [i + 1:], not [i + 1:-1]: the right subset must include the last sample
215 |         # our goal is to get the min subset entropy sum and its divide value
216 |         div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
217 |         info_gain -= sub_info_ent_min
218 |
219 |     # 2. for a discrete variable (categorical variable)
220 |     else:
221 |         data_arr = df[index]
222 |         label_arr = df[df.columns[-1]]
223 |         value_count = ValueCount(data_arr)
224 |
225 |         # accumulate the information gain
226 |         for key in value_count:
227 |             key_label_arr = label_arr[data_arr == key]
228 |             info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
229 |
230 |     return info_gain, div_value
231 |
232 |
233 | '''
234 | calculating the information entropy of a label array
235 | compute the information entropy Ent(D)
236 |
237 | @param label_arr: ndarray, class label array of data_arr
238 | @return ent: the information entropy of the current attribute
239 | '''
240 |
241 |
242 | def InfoEnt(label_arr):
243 |     try:
244 |         from math import log2
245 |     except ImportError:
246 |         print("module math.log2 not found")
247 |
248 |     ent = 0
249 |     n = len(label_arr)
250 |     label_count = NodeLabel(label_arr)
251 |
252 |     for key in label_count:
253 |         ent -= (label_count[key] / n) * log2(label_count[key] / n)
254 |
255 |     return ent
256 |
257 |
258 | def DrawPNG(root, out_file):
259 |     '''
260 |     visualization of the decision tree from root.
261 |     @param root: Node, the root node of the tree.
262 |     @param out_file: str, name and path of the output file
263 |     '''
264 |     try:
265 |         from pydotplus import graphviz
266 |     except ImportError:
267 |         print("module pydotplus.graphviz not found")
268 |
269 |     g = graphviz.Dot()  # generation of new dot
270 |
271 |     TreeToGraph(0, g, root)
272 |     g2 = graphviz.graph_from_dot_data(g.to_string())
273 |
274 |     g2.write_png(out_file)
275 |
276 |
277 | def TreeToGraph(i, g, root):
278 |     '''
279 |     build a graph from root on
280 |     @param i: node number in this tree
281 |     @param g: pydotplus.graphviz.Dot() object
282 |     @param root: the root node
283 |
284 |     @return i: node number after modification
285 |     # @return g: pydotplus.graphviz.Dot() object after modification
286 |     @return g_node: the current root node in graphviz
287 |     '''
288 |     try:
289 |         from pydotplus import graphviz
290 |     except ImportError:
291 |         print("module pydotplus.graphviz not found")
292 |
293 |     if root.attr is None:
294 |         g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
295 |     else:
296 |         g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
297 |     g_node = i
298 |     g.add_node(graphviz.Node(g_node, label=g_node_label))
299 |
300 |     for value in list(root.attr_down):
301 |         i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
302 |         g.add_edge(graphviz.Edge(g_node, g_child, label=value))
303 |
304 |     return i, g_node
305 |
-------------------------------------------------------------------------------- /watermelon/ch4/4.4/CART_watermelon.py: --------------------------------------------------------------------------------
1 | '''
2 | import the data and pre-analyse it through visualization
3 | '''
4 | # use a pandas DataFrame to read the .csv file, which contains Chinese characters.
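# note (added, not in the original): pandas can also take the codec directly,
# which avoids the explicit file handle used below, e.g.:
#   df = pd.read_csv("data/watermelon_2.csv", encoding="gb18030")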
5 | import pandas as pd
6 | import decision_tree
7 |
8 | data_file_encode = "gb18030"  # codec of the watermelon_2.csv file
9 | with open("data/watermelon_2.csv", mode='r', encoding=data_file_encode) as data_file:
10 |     df = pd.read_csv(data_file)
11 |
12 | '''
13 | implementation of CART; relies on decision_tree.py
14 | '''
15 |
16 | # decision tree visualization using pydotplus.graphviz
17 | index_train = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
18 |
19 | df_train = df.iloc[index_train]
20 | df_test = df.drop(index_train)
21 |
22 | # generate a full tree
23 | root = decision_tree.TreeGenerate(df_train)
24 | decision_tree.DrawPNG(root, "decision_tree_full.png")
25 | print("accuracy of full tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
26 |
27 | # pre-pruning
28 | root = decision_tree.PrePrune(df_train, df_test)  # was PrePurn, which is not defined in decision_tree.py
29 | decision_tree.DrawPNG(root, "decision_tree_pre.png")
30 | print("accuracy of pre-pruning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
31 |
32 | # # post-pruning
33 | root = decision_tree.TreeGenerate(df_train)
34 | decision_tree.PostPrune(root, df_test)  # was PostPurn
35 | decision_tree.DrawPNG(root, "decision_tree_post.png")
36 | print("accuracy of post-pruning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
37 |
38 | # print the accuracy
39 | # k-fold cross prediction
40 | accuracy_scores = []
41 | n = len(df.index)
42 | k = 5
43 | for i in range(k):
44 |     m = int(n / k)
45 |     test = []
46 |     for j in range(i * m, i * m + m):
47 |         test.append(j)
48 |
49 |     df_train = df.drop(test)
50 |     df_test = df.iloc[test]
51 |     root = decision_tree.TreeGenerate(df_train)  # generate the tree
52 |     decision_tree.PostPrune(root, df_test)  # post-pruning
53 |
54 |     # test the accuracy
55 |     pred_true = 0
56 |     for idx in df_test.index:  # idx, not i: don't shadow the fold counter
57 |         label = decision_tree.Predict(root, df[df.index == idx])
58 |         if label == df_test[df_test.columns[-1]][idx]:
59 |             pred_true += 1
60 |
61 |     accuracy = pred_true / len(df_test.index)
62 |     accuracy_scores.append(accuracy)
63 |
64 | # print the prediction accuracy result
65 | accuracy_sum = 0
66 | print("accuracy: ", end="")
67 | for i in range(k):
68 |     print("%.3f " % accuracy_scores[i], end="")
69 |     accuracy_sum += accuracy_scores[i]
70 | print("\naverage accuracy: %.3f" % (accuracy_sum / k))
71 |
-------------------------------------------------------------------------------- /watermelon/ch4/4.4/data/watermelon_2.csv: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/watermelon/ch4/4.4/data/watermelon_2.csv
-------------------------------------------------------------------------------- /watermelon/ch4/4.4/decision_tree.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | '''
4 | Because the data set for this exercise is weak, the decision tree performs only moderately overall, and the cross-validation results fluctuate a lot.
5 | Pruning is an important way to improve a model's generalization; ignoring modelling cost, post-pruning generally outperforms pre-pruning.
6 | Besides pruning, constraints such as a maximum leaf depth are often used to preserve a decision tree's ability to generalize.
7 | '''
8 |
9 |
10 | class Node(object):
11 |
12 |     '''
13 |     definition of decision node class
14 |
15 |     attr: attribute used as parent for a new branching
16 |     attr_down: dict: {key, value}
17 |            key:   categorical: categorical attr_value
18 |                   continuous:  '<= div_value' for the small part
19 |                                '> div_value'  for the big part
20 |            value: children (Node class)
21 |     label: class label (the majority of current sample labels)
22 |     '''
23 |
24 |     def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
25 |         self.attr = attr_init
26 |         self.label = label_init
27 |         self.attr_down = attr_down_init if attr_down_init is not None else {}  # avoid a shared mutable default
28 |
29 |
30 | def TreeGenerate(df):
31 |     '''
32 |     Branching for decision tree using recursion
33 |
34 |     @param df: the pandas dataframe of the data_set
35 |     @return root: Node, the root node of decision tree
36 |     '''
37 |     # generating a new root node
38 |     new_node = Node(None, None, {})
39 |     label_arr = df[df.columns[-1]]
40 |
41 |     label_count = NodeLabel(label_arr)
42 |     if label_count:  # assert the label_count isn't empty
43 |         new_node.label = max(label_count, key=label_count.get)
44 |
45 |     # end if there is only 1 class in the current node data
46 |     # end if the sample set is empty
47 |     if len(label_count) == 1 or len(label_arr) == 0:
48 |         return new_node
49 |
50 |     # get the optimal attribute for a new branching
51 |     new_node.attr, div_value = OptAttr_Gini(df)  # via Gini index
52 |
53 |     # recursion
54 |     if div_value == 0:  # categorical variable
55 |         value_count = ValueCount(df[new_node.attr])
56 |         for value in value_count:
57 |             df_v = df[df[new_node.attr].isin([value])]  # get sub set
58 |             # delete current attribute
59 |             df_v = df_v.drop(new_node.attr, axis=1)
60 |             new_node.attr_down[value] = TreeGenerate(df_v)
61 |
62 |     else:  # continuous variable # left and right child
63 |         value_l = "<=%.3f" % div_value
64 |         value_r = ">%.3f" % div_value
65 |         df_v_l = df[df[new_node.attr] <= div_value]  # get sub set
66 |         df_v_r = df[df[new_node.attr] > div_value]
67 |
68 |         new_node.attr_down[value_l] = TreeGenerate(df_v_l)
69 |         new_node.attr_down[value_r] = TreeGenerate(df_v_r)
70 |
71 |     return new_node
72 |
73 |
74 | def Predict(root, df_sample):
75 |     '''
76 |     make a prediction based on root
77 |
78 |     @param root: Node, root Node of the decision tree
79 |     @param df_sample: dataframe, a sample line
80 |     '''
81 |     try:
82 |         import re  # using Regular Expression to get the number in string
83 |     except ImportError:
84 |         print("module re not found")
85 |
86 |     while root.attr is not None:
87 |         # continuous variable
88 |         if df_sample[root.attr].dtype.kind in 'fi':  # was `== (float, int)`, a comparison that never matches
89 |             # get the div_value from root.attr_down
90 |             for key in list(root.attr_down):
91 |                 num = re.findall(r"\d+\.?\d*", key)
92 |                 div_value = float(num[0])
93 |                 break
94 |             if df_sample[root.attr].values[0] <= div_value:
95 |                 key = "<=%.3f" % div_value
96 |                 root = root.attr_down[key]
97 |             else:
98 |                 key = ">%.3f" % div_value
99 |                 root = root.attr_down[key]
100 |
101 |         # categorical variable
102 |         else:
103 |             key = df_sample[root.attr].values[0]
104 |             # check whether the attr_value is in the child branch
105 |             if key in root.attr_down:
106 |                 root = root.attr_down[key]
107 |             else:
108 |                 break
109 |
110 |     return root.label
111 |
112 |
113 | def PredictAccuracy(root, df_test):
114 |     '''
115 |     calculating the accuracy of prediction on the test set
116 |
117 |     @param root: Node, root Node of the decision tree
118 |     @param df_test: dataframe, test data set
119 |     @return accuracy: float
120 |     '''
121 |     if len(df_test.index) == 0: return 0
122 |     pred_true = 0
123 |     for i in df_test.index:
124 |         label = Predict(root, df_test[df_test.index == i])
125 |         if label == df_test[df_test.columns[-1]][i]:
126 |             pred_true += 1
127 |     return pred_true / len(df_test.index)
128 |
129 |
130 | def PrePrune(df_train, df_test):
131 |     '''
132 |     pre-pruning while generating a decision tree
133 |     Pre-pruning.
134 |     By Occam's razor, this pruned tree should be preferable to the previous one;
135 |     because the data set is small, the advantage of pre-pruning is not obvious here, although in practice it does tend to improve models.
136 |     The resulting model is very simple and carries a serious risk of underfitting.
137 |
138 |     @param df_train: dataframe, the training set for generating a tree
139 |     @param df_test: dataframe, the testing set for pruning decisions
140 |     @return root: Node, root of the pruned tree
141 |     '''
142 |     # generating a new root node
143 |     new_node = Node(None, None, {})
144 |     label_arr = df_train[df_train.columns[-1]]
145 |
146 |     label_count = NodeLabel(label_arr)
147 |     if label_count:  # assert the label_count isn't empty
148 |         new_node.label = max(label_count, key=label_count.get)
149 |
150 |     # end if there is only 1 class in the current node data
151 |     # end if the sample set is empty
152 |     if len(label_count) == 1 or len(label_arr) == 0:
153 |         return new_node
154 |
155 |     # calculating the test accuracy up to the current node
156 |     a0 = PredictAccuracy(new_node, df_test)
157 |
158 |     # get the optimal attribute for a new branching
159 |     new_node.attr, div_value = OptAttr_Gini(df_train)  # via Gini index
160 |
161 |     # get the new branch
162 |     if div_value == 0:  # categorical variable
163 |         value_count = ValueCount(df_train[new_node.attr])
164 |         for value in value_count:
165 |             df_v = df_train[df_train[new_node.attr].isin([value])]  # get sub set
166 |             df_v = df_v.drop(new_node.attr, axis=1)
167 |             # for child node
168 |             new_node_child = Node(None, None, {})
169 |             label_arr_child = df_v[df_v.columns[-1]]  # majority label of the subset (was df_train, i.e. the whole training set)
170 |             label_count_child = NodeLabel(label_arr_child)
171 |             new_node_child.label = max(label_count_child, key=label_count_child.get)
172 |             new_node.attr_down[value] = new_node_child
173 |
174 |         # calculating to check whether further branching is needed
175 |         a1 = PredictAccuracy(new_node, df_test)
176 |         if a1 > a0:  # need branching
177 |             for value in value_count:
178 |                 df_v = df_train[df_train[new_node.attr].isin([value])]  # get sub set
179 |                 df_v = df_v.drop(new_node.attr, axis=1)
180 |                 new_node.attr_down[value] = TreeGenerate(df_v)
181 |         else:
182 |             new_node.attr = None
183 |             new_node.attr_down = {}
184 |
185 |     else:  # continuous variable # left and right child
186 |         value_l = "<=%.3f" % div_value
187 |         value_r = ">%.3f" % div_value
188 |         df_v_l = df_train[df_train[new_node.attr] <= div_value]  # get sub set
189 |         df_v_r = df_train[df_train[new_node.attr] > div_value]
190 |
191 |         # for child node
192 |         new_node_l = Node(None, None, {})
193 |         new_node_r = Node(None, None, {})
194 |         label_count_l = NodeLabel(df_v_l[df_v_l.columns[-1]])  # was df_v_r.columns; identical columns, but clearer this way
195 |         label_count_r = NodeLabel(df_v_r[df_v_r.columns[-1]])
196 |         new_node_l.label = max(label_count_l, key=label_count_l.get)
197 |         new_node_r.label = max(label_count_r, key=label_count_r.get)
198 |         new_node.attr_down[value_l] = new_node_l
199 |         new_node.attr_down[value_r] = new_node_r
200 |
201 |         # calculating to check whether further branching is needed
202 |         a1 = PredictAccuracy(new_node, df_test)
203 |         if a1 > a0:  # need branching
204 |             new_node.attr_down[value_l] = TreeGenerate(df_v_l)
205 |             new_node.attr_down[value_r] = TreeGenerate(df_v_r)
206 |         else:
207 |             new_node.attr = None
208 |             new_node.attr_down = {}
209 |
210 |     return new_node
211 |
212 |
213 | def PostPrune(root, df_test):
214 |     '''
215 |     post-pruning of a generated decision tree
216 |     Post-pruning.
217 |     Accuracy improves considerably over the previous trees, showing that post-pruning strengthens generalization while retaining enough tree size to fit well.
218 |
219 |     @param root: Node, root of the tree
220 |     @param df_test: dataframe, the testing set for pruning decisions
221 |     @return accuracy score through traversal of the tree
222 |     '''
223 |     # leaf node
224 |     if root.attr is None:
225 |         return PredictAccuracy(root, df_test)
226 |
227 |     # calculating the test accuracy on children nodes
228 |     a1 = 0
229 |     value_count = ValueCount(df_test[root.attr])
230 |     for value in list(value_count):
231 |         df_test_v = df_test[df_test[root.attr].isin([value])]  # get sub set
232 |         if value in root.attr_down:  # root has the value
233 |             a1_v = PostPrune(root.attr_down[value], df_test_v)
234 |         else:  # root doesn't have the value
235 |             a1_v = PredictAccuracy(root, df_test_v)
236 |         if a1_v == -1:  # -1 means no pruning back from this child
237 |             return -1
238 |         else:
239 |             a1 += a1_v * len(df_test_v.index) / len(df_test.index)
240 |
241 |     # calculating the test accuracy on this node
242 |     node = Node(None, root.label, {})
243 |     a0 = PredictAccuracy(node, df_test)
244 |
245 |     # check if pruning is needed
246 |     if a0 >= a1:
247 |         root.attr = None
248 |         root.attr_down = {}
249 |         return a0
250 |     else:
251 |         return -1
252 |
253 |
254 | def NodeLabel(label_arr):
255 |     '''
256 |     count each label that appears and its occurrences
257 |
258 |     @param label_arr: data array for class labels
259 |     @return label_count: dict, each appeared label and its count
260 |     '''
261 |     label_count = {}  # store count of label
262 |
263 |     for label in label_arr:
264 |         if label in label_count:
265 |             label_count[label] += 1
266 |         else:
267 |             label_count[label] = 1
268 |
269 |     return label_count
270 |
271 |
272 | def ValueCount(data_arr):
273 |     '''
274 |     count each value that appears for a categorical attribute and its occurrences
275 |
276 |     @param data_arr: data array for an attribute
277 |     @return value_count: dict, each appeared value and its count
278 |     '''
279 |     value_count = {}  # store count of value
280 |
281 |     for label in data_arr:
282 |         if label in value_count:
283 |             value_count[label] += 1
284 |         else:
285 |             value_count[label] = 1
286 |
287 |     return value_count
288 |
289 |
290 | '''
291 | optimal attribute selection in the CART algorithm, based on the Gini index
292 | '''
293 |
294 |
295 | def OptAttr_Gini(df):
296 |     '''
297 |     find the optimal attribute of the current data_set based on the Gini index
298 |
299 |     @param df: the pandas dataframe of the data_set
300 |     @return opt_attr: the optimal attribute for the branch
301 |     @return div_value: for a discrete variable, value = 0
302 |                        for a continuous variable, value = t, the bisection divide value
303 |     '''
304 |     gini_index = float('Inf')
305 |     for attr_id in df.columns[1:-1]:
306 |         gini_index_tmp, div_value_tmp = GiniIndex(df, attr_id)  # was InfoGain, which scores by information gain, not by Gini
307 |         if gini_index_tmp < gini_index:
308 |             gini_index = gini_index_tmp
309 |             opt_attr = attr_id
310 |             div_value = div_value_tmp
311 |
312 |     return opt_attr, div_value
313 |
314 |
315 | def GiniIndex(df, attr_id):
316 |     '''
317 |     calculating the Gini index of an attribute
318 |     (the split with the smallest Gini index is preferred)
319 |
320 |
321 |     @param df: dataframe, the pandas dataframe of the data_set
322 |     @param attr_id: the target attribute in df
323 |     @return gini_index: the Gini index of the current attribute
324 |     @return div_value: for a discrete variable, value = 0
325 |                        for a continuous variable, value = t (the division value)
326 |     '''
327 |     gini_index = 0  # accumulated Gini index of the split
328 |     div_value = 0  # div_value for continuous attribute
329 |
330 |     n = len(df[attr_id])  # the number of samples
331 |
332 |     # 1. for a continuous variable, use the method of bisection
333 |     if df[attr_id].dtype.kind == 'f':  # was `type(df[attr_id][0]) == np.float64`, which breaks on subsets whose index lacks label 0
334 |         sub_gini = {}  # store the div_value (div) and its subset Gini value
335 |
336 |         df = df.sort_values(by=attr_id, ascending=True)  # sorting via column; df.sort() was removed in newer pandas
337 |         df = df.reset_index(drop=True)
338 |
339 |         data_arr = df[attr_id]
340 |         label_arr = df[df.columns[-1]]
341 |
342 |         for i in range(n - 1):
343 |             div = (data_arr[i] + data_arr[i + 1]) / 2
344 |             sub_gini[div] = ((i + 1) * Gini(label_arr[0:i + 1]) / n) \
345 |                             + ((n - i - 1) * Gini(label_arr[i + 1:]) / n)  # [i + 1:], not [i + 1:-1]: include the last sample
346 |         # our goal is to get the min subset Gini sum and its divide value
347 |         div_value, gini_index = min(sub_gini.items(), key=lambda x: x[1])
348 |
349 |     # 2. for a discrete variable (categorical variable)
350 |     else:
351 |         data_arr = df[attr_id]
352 |         label_arr = df[df.columns[-1]]
353 |         value_count = ValueCount(data_arr)
354 |
355 |         for key in value_count:
356 |             key_label_arr = label_arr[data_arr == key]
357 |             gini_index += value_count[key] * Gini(key_label_arr) / n
358 |
359 |     return gini_index, div_value
360 |
361 |
362 | def Gini(label_arr):
363 |     '''
364 |     calculating the Gini value of a label array
365 |
366 |     @param label_arr: ndarray, class label array of data_arr
367 |     @return gini: the Gini value of the current attribute
368 |     '''
369 |     gini = 1
370 |
371 |     n = len(label_arr)
372 |     label_count = NodeLabel(label_arr)
373 |     for key in label_count:
374 |         gini -= (label_count[key] / n) * (label_count[key] / n)
375 |
376 |     return gini
377 |
378 |
379 | '''
380 | optimal attribute selection in the ID3 algorithm, based on information entropy
381 | '''
382 |
383 |
384 | def OptAttr_Ent(df):
385 |     '''
386 |     find the optimal attribute of the current data_set based on info entropy
387 |
388 |     @param df: the pandas dataframe of the data_set
389 |     @return opt_attr: the optimal attribute for the branch
390 |     @return div_value: for a discrete variable, value = 0
391 |                        for a continuous variable, value = t, the bisection divide value
392 |     '''
393 |     info_gain = 0
394 |
395 |     for attr_id in df.columns[1:-1]:
396 |         info_gain_tmp, div_value_tmp = InfoGain(df, attr_id)
397 |         if info_gain_tmp > info_gain:
398 |             info_gain = info_gain_tmp
399 |             opt_attr = attr_id
400 |             div_value = div_value_tmp
401 |
402 |     return opt_attr, div_value
403 |
404 |
405 | def InfoGain(df, attr_id):
406 |     '''
407 |     calculating the information gain of an attribute
408 |
409 |     @param df: dataframe, the pandas dataframe of the data_set
410 |     @param attr_id: the target attribute in df
411 |     @return info_gain: the information gain of the current attribute
412 |     @return div_value: for a discrete variable, value = 0
413 |                        for a continuous variable, value = t (the division value)
414 |     '''
415 |     info_gain = InfoEnt(df.values[:, -1])  # info entropy of the whole label column
416 |     div_value = 0  # div_value for continuous attribute
417 |
418 |     n = len(df[attr_id])  # the number of samples
419 |     # 1. for a continuous variable, use the method of bisection
420 |     if df[attr_id].dtype.kind == 'f':  # robust numeric check (see GiniIndex above)
421 |         sub_info_ent = {}  # store the div_value (div) and its subset entropy
422 |
423 |         df = df.sort_values(by=attr_id, ascending=True)  # sorting via column
424 |         df = df.reset_index(drop=True)
425 |
426 |         data_arr = df[attr_id]
427 |         label_arr = df[df.columns[-1]]
428 |
429 |         for i in range(n - 1):
430 |             div = (data_arr[i] + data_arr[i + 1]) / 2
431 |             sub_info_ent[div] = ((i + 1) * InfoEnt(label_arr[0:i + 1]) / n) \
432 |                                 + ((n - i - 1) * InfoEnt(label_arr[i + 1:]) / n)  # [i + 1:], not [i + 1:-1]: include the last sample
433 |         # our goal is to get the min subset entropy sum and its divide value
434 |         div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
435 |         info_gain -= sub_info_ent_min
436 |
437 |     # 2. for a discrete variable (categorical variable)
438 |     else:
439 |         data_arr = df[attr_id]
440 |         label_arr = df[df.columns[-1]]
441 |         value_count = ValueCount(data_arr)
442 |
443 |         for key in value_count:
444 |             key_label_arr = label_arr[data_arr == key]
445 |             info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
446 |
447 |     return info_gain, div_value
448 |
449 |
450 | def InfoEnt(label_arr):
451 |     '''
452 |     calculating the information entropy of a label array
453 |
454 |     @param label_arr: ndarray, class label array of data_arr
455 |     @return ent: the information entropy of the current attribute
456 |     '''
457 |     try:
458 |         from math import log2
459 |     except ImportError:
460 |         print("module math.log2 not found")
461 |
462 |     ent = 0
463 |     n = len(label_arr)
464 |     label_count = NodeLabel(label_arr)
465 |
466 |     for key in label_count:
467 |         ent -= (label_count[key] / n) * log2(label_count[key] / n)
468 |
469 |     return ent
470 |
471 |
472 | def DrawPNG(root, out_file):
473 |     '''
474 |     visualization of the decision tree from root.
475 |     @param root: Node, the root node of the tree.
476 |     @param out_file: str, name and path of the output file
477 |     '''
478 |     try:
479 |         from pydotplus import graphviz
480 |     except ImportError:
481 |         print("module pydotplus.graphviz not found")
482 |
483 |     g = graphviz.Dot()  # generation of new dot
484 |
485 |     TreeToGraph(0, g, root)
486 |     g2 = graphviz.graph_from_dot_data(g.to_string())
487 |
488 |     g2.write_png(out_file)
489 |
490 |
491 | def TreeToGraph(i, g, root):
492 |     '''
493 |     build a graph from root on
494 |     @param i: node number in this tree
495 |     @param g: pydotplus.graphviz.Dot() object
496 |     @param root: the root node
497 |
498 |     @return i: node number after modification
499 |     # @return g: pydotplus.graphviz.Dot() object after modification
500 |     @return g_node: the current root node in graphviz
501 |     '''
502 |     try:
503 |         from pydotplus import graphviz
504 |     except ImportError:
505 |         print("module pydotplus.graphviz not found")
506 |
507 |     if root.attr is None:
508 |         g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
509 |     else:
510 |         g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
511 |     g_node = i
512 |     g.add_node(graphviz.Node(g_node, label=g_node_label))
513 |
514 |     for value in list(root.attr_down):
515 |         i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
516 |         g.add_edge(graphviz.Edge(g_node, g_child, label=value))
517 |
518 |     return i, g_node
519 |
--------------------------------------------------------------------------------
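A minimal usage sketch for the ch4/4.4 decision_tree module above. This is an illustration under assumptions, not a file from the repository: it assumes it is run from watermelon/ch4/4.4/ with pandas, pydotplus and Graphviz installed, and the output file name "cart_demo.png" is made up.

import pandas as pd
import decision_tree

# read the GB18030-encoded data set directly
df = pd.read_csv("data/watermelon_2.csv", encoding="gb18030")

# grow a full CART tree (splits chosen by minimal Gini index)
root = decision_tree.TreeGenerate(df)

# accuracy on the data the tree was grown from, plus a PNG rendering
print("training accuracy: %.3f" % decision_tree.PredictAccuracy(root, df))
decision_tree.DrawPNG(root, "cart_demo.png")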