├── .idea
│   ├── ML.iml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── AndrewNg
│   └── Linear Regression with One Variable
│       └── cost-function.py
├── MLFoundation
│   ├── ex1
│   │   ├── 15.py
│   │   ├── 16.py
│   │   ├── 17.py
│   │   ├── 18.py
│   │   ├── 19.py
│   │   ├── 20.py
│   │   ├── hw1_15_train.dat
│   │   ├── hw1_18_test.dat
│   │   └── hw1_18_train.dat
│   ├── ex2
│   │   ├── 17-18.py
│   │   ├── 19-20.py
│   │   ├── hw2_test.dat
│   │   └── hw2_train.dat
│   ├── ex3
│   │   ├── 13-15.py
│   │   ├── 18-20.py
│   │   ├── hw3_test.dat
│   │   └── hw3_train.dat
│   ├── ex4
│   │   ├── 13-20.py
│   │   ├── hw4_test.dat
│   │   └── hw4_train.dat
│   └── pdf
│       ├── 01_handout.pdf
│       ├── 02_handout.pdf
│       ├── 03_handout.pdf
│       ├── 04_handout.pdf
│       ├── 05_handout.pdf
│       ├── 06_handout.pdf
│       ├── 07_handout.pdf
│       ├── 08_handout.pdf
│       ├── 09_handout.pdf
│       ├── 10_handout.pdf
│       ├── 11_handout.pdf
│       ├── 12_handout.pdf
│       ├── 13_handout.pdf
│       ├── 14_handout.pdf
│       ├── 15_handout.pdf
│       └── 16_handout.pdf
├── README.md
├── StatisticalLearningMethod
│   ├── chapter2
│   │   └── Perceptron.py
│   ├── chapter3
│   │   ├── K-NN.py
│   │   ├── K-NN1.py
│   │   └── K-NN2.py
│   ├── chapter4
│   │   ├── naive_Bayes.py
│   │   └── naive_Bayes1.py
│   ├── chapter5
│   │   ├── C4.5.py
│   │   ├── CART.py
│   │   ├── ID3-1.py
│   │   └── ID3.py
│   ├── data
│   │   ├── train.csv
│   │   ├── train_binary.csv
│   │   ├── train_binary1.csv
│   │   └── train_binary2.csv
│   ├── errata.pdf
│   └── hog.xml
├── tensorflow
│   └── course
│       ├── data
│       │   └── fire_theft.xls
│       ├── feed.py
│       ├── fetch.py
│       ├── graph.py
│       ├── interactiveSession.py
│       ├── linearRegression.py
│       ├── load.py
│       ├── logisticRegression.py
│       ├── random.py
│       ├── shape.py
│       ├── test.py
│       ├── testt.py
│       └── variable.py
└── watermelon
    ├── ch3
    │   ├── 3.3
    │   │   ├── data
    │   │   │   └── watermelon_3a.csv
    │   │   ├── logistic_regression.py
    │   │   └── self_def.py
    │   ├── 3.4
    │   │   ├── cross_validation.py
    │   │   └── data
    │   │       ├── transfusion.data
    │   │       └── transfusion.names
    │   └── 3.5
    │       ├── LDA.py
    │       ├── data
    │       │   └── watermelon_3a.csv
    │       └── self_def.py
    └── ch4
        ├── 4.3
        │   ├── ID3_watermelon.py
        │   ├── data
        │   │   └── watermelon_3.csv
        │   └── decision_tree.py
        └── 4.4
            ├── CART_watermelon.py
            ├── data
            │   └── watermelon_2.csv
            └── decision_tree.py
/AndrewNg/Linear Regression with One Variable/cost-function.py:
--------------------------------------------------------------------------------
1 | # array1 = [1, 2, 2, 3, 3, 4, 5, 6, 6, 6, 8, 10]
2 | # array2 = [890, -1411, -1560, -2220, -2091, -2878, -3537, -3268, -3920, -4163, -5471, -5157]
3 |
4 | array1 = [3, 1, 0, 4]
5 | array2 = [2, 2, 1, 3]
6 |
7 |
8 | def cost_function(a0, a1):
9 | cost = 0
10 |     for i in range(len(array1)):
11 |         cost += (a0 + a1 * array1[i] - array2[i]) ** 2
12 | cost /= (2 * len(array1))
13 | print(cost)
14 |
15 |
16 | # cost_function(-596.6, -530.9)
17 | # cost_function(-1780.0, 530.9)
18 | # cost_function(-596.6, 530.9)
19 | # cost_function(-1780.0, -530.9)
20 | cost_function(0, 1)
21 |
--------------------------------------------------------------------------------
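
Note (cost-function.py): this is the standard squared-error cost for univariate linear regression,

    J(a_0, a_1) = \frac{1}{2m} \sum_{i=1}^{m} (a_0 + a_1 x_i - y_i)^2

As a sanity check of the final call: with a_0 = 0, a_1 = 1 and the points (3,2), (1,2), (0,1), (4,3), the residuals are 1, -1, -1, 1, so J(0, 1) = (1 + 1 + 1 + 1) / (2 * 4) = 0.5, which is what the script prints.
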
/MLFoundation/ex1/15.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | class NaiveCyclePLA(object):
4 | def __init__(self, dimension, count):
5 | self.__dimension = dimension
6 | self.__count = count
7 |
8 | # get data
9 | def train_matrix(self, path):
10 | training_set = open(path)
11 | x_train = numpy.zeros((self.__count, self.__dimension))
12 | y_train = numpy.zeros((self.__count, 1))
13 | x = []
14 | x_count = 0
15 | for line in training_set:
16 | # add 1 dimension manually
17 | x.append(1)
18 |         for item in line.split(' '):
19 |             if len(item.split('\t')) == 1:
20 |                 x.append(float(item))
21 |             else:
22 |                 x.append(float(item.split('\t')[0]))
23 |                 y_train[x_count, 0] = int(item.split('\t')[1].strip())
24 | x_train[x_count, :] = x
25 | x = []
26 | x_count += 1
27 | return x_train, y_train
28 |
29 | def iteration_count(self, path):
30 | count = 0
31 | x_train, y_train = self.train_matrix(path)
32 | w = numpy.zeros((self.__dimension, 1))
33 | # loop until all x are classified right
34 | while True:
35 | flag = 0
36 | for i in range(self.__count):
37 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
38 |                     w += y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
39 | count += 1
40 | flag = 1
41 | if flag == 0:
42 | break
43 | return count
44 |
45 |
46 | if __name__ == '__main__':
47 | perceptron = NaiveCyclePLA(5, 400)
48 | print(perceptron.iteration_count("hw1_15_train.dat"))
49 |
--------------------------------------------------------------------------------
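
Note (15.py): this is the perceptron learning algorithm with a naive, fixed visiting cycle, starting from w = 0. Whenever an example is misclassified, i.e. sign(w^T x_n) != y_n, the weights are corrected by

    w_{t+1} = w_t + y_{n(t)} x_{n(t)}

and iteration_count returns the number of such corrections, not the number of passes over the data. 16.py and 17.py repeat the same procedure over 2000 random permutations of the training set (17.py with a learning rate of 0.5) and report the average number of updates.
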
/MLFoundation/ex1/16.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 |
4 |
5 | class RandomPLA(object):
6 | def __init__(self, dimension, count):
7 | self.__dimension = dimension
8 | self.__count = count
9 |
10 | def random_matrix(self, path):
11 | training_set = open(path)
12 | random_list = []
13 | x = []
14 | x_count = 0
15 | for line in training_set:
16 | x.append(1)
17 |         for item in line.split(' '):
18 |             if len(item.split('\t')) == 1:
19 |                 x.append(float(item))
20 |             else:
21 |                 x.append(float(item.split('\t')[0]))
22 |                 x.append(int(item.split('\t')[1].strip()))
23 | random_list.append(x)
24 | x = []
25 | x_count += 1
26 | random.shuffle(random_list)
27 | return random_list
28 |
29 | def train_matrix(self, path):
30 | x_train = numpy.zeros((self.__count, self.__dimension))
31 | y_train = numpy.zeros((self.__count, 1))
32 | random_list = self.random_matrix(path)
33 | for i in range(self.__count):
34 | for j in range(self.__dimension):
35 | x_train[i, j] = random_list[i][j]
36 | y_train[i, 0] = random_list[i][self.__dimension]
37 | return x_train, y_train
38 |
39 | def iteration_count(self, path):
40 | count = 0
41 | x_train, y_train = self.train_matrix(path)
42 | w = numpy.zeros((self.__dimension, 1))
43 | while True:
44 | flag = 0
45 | for i in range(self.__count):
46 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
47 |                     w += y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
48 | count += 1
49 | flag = 1
50 | if flag == 0:
51 | break
52 | return count
53 |
54 |
55 | if __name__ == '__main__':
56 |     total = 0
57 |     for i in range(2000):
58 |         perceptron = RandomPLA(5, 400)
59 |         total += perceptron.iteration_count('hw1_15_train.dat')
60 |     print(total / 2000.0)
61 |
--------------------------------------------------------------------------------
/MLFoundation/ex1/17.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 |
4 |
5 | class RandomPLA(object):
6 | def __init__(self, dimension, count):
7 | self.__dimension = dimension
8 | self.__count = count
9 |
10 | def random_matrix(self, path):
11 | training_set = open(path)
12 | random_list = []
13 | x = []
14 | x_count = 0
15 | for line in training_set:
16 | x.append(1)
17 |         for item in line.split(' '):
18 |             if len(item.split('\t')) == 1:
19 |                 x.append(float(item))
20 |             else:
21 |                 x.append(float(item.split('\t')[0]))
22 |                 x.append(int(item.split('\t')[1].strip()))
23 | random_list.append(x)
24 | x = []
25 | x_count += 1
26 | random.shuffle(random_list)
27 | return random_list
28 |
29 | def train_matrix(self, path):
30 | x_train = numpy.zeros((self.__count, self.__dimension))
31 | y_train = numpy.zeros((self.__count, 1))
32 | random_list = self.random_matrix(path)
33 | for i in range(self.__count):
34 | for j in range(self.__dimension):
35 | x_train[i, j] = random_list[i][j]
36 | y_train[i, 0] = random_list[i][self.__dimension]
37 | return x_train, y_train
38 |
39 | def iteration_count(self, path):
40 | count = 0
41 | x_train, y_train = self.train_matrix(path)
42 | w = numpy.zeros((self.__dimension, 1))
43 | while True:
44 | flag = 0
45 | for i in range(self.__count):
46 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
47 |                     w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
48 | count += 1
49 | flag = 1
50 | if flag == 0:
51 | break
52 | return count
53 |
54 |
55 | total = 0
56 | for i in range(2000):
57 |     perceptron = RandomPLA(5, 400)
58 |     total += perceptron.iteration_count('hw1_15_train.dat')
59 | print(total / 2000.0)
60 |
--------------------------------------------------------------------------------
/MLFoundation/ex1/18.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 | import copy
4 |
5 |
6 | class Pocket(object):
7 | def __init__(self, dimension, train_count, test_count):
8 | self.__dimension = dimension
9 | self.__train_count = train_count
10 | self.__test_count = test_count
11 |
12 | def random_matrix(self, path):
13 | training_set = open(path)
14 | random_list = []
15 | x = []
16 | x_count = 0
17 | for line in training_set:
18 | x.append(1)
19 |         for item in line.split(' '):
20 |             if len(item.split('\t')) == 1:
21 |                 x.append(float(item))
22 |             else:
23 |                 x.append(float(item.split('\t')[0]))
24 |                 x.append(int(item.split('\t')[1].strip()))
25 | random_list.append(x)
26 | x = []
27 | x_count += 1
28 | random.shuffle(random_list)
29 | return random_list
30 |
31 | def train_matrix(self, path):
32 | x_train = numpy.zeros((self.__train_count, self.__dimension))
33 | y_train = numpy.zeros((self.__train_count, 1))
34 | random_list = self.random_matrix(path)
35 | for i in range(self.__train_count):
36 | for j in range(self.__dimension):
37 | x_train[i, j] = random_list[i][j]
38 | y_train[i, 0] = random_list[i][self.__dimension]
39 | return x_train, y_train
40 |
41 | def iteration(self, path):
42 | count = 0
43 | x_train, y_train = self.train_matrix(path)
44 | w = numpy.zeros((self.__dimension, 1))
45 | best_count = self.__train_count
46 | best_w = numpy.zeros((self.__dimension, 1))
47 |
48 |         # Pocket algorithm: update the line at most 50 times; after each update, evaluate on the training set and keep the best line seen so far
49 | for i in range(self.__train_count):
50 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
51 |                 w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
52 |                 # increment the update counter
53 | count += 1
54 | num = 0
55 |                 # re-evaluate w on the whole training set
56 | for j in range(self.__train_count):
57 | if numpy.dot(x_train[j, :], w)[0] * y_train[j, 0] <= 0:
58 | num += 1
59 | if num < best_count:
60 | best_count = num
61 | best_w = copy.deepcopy(w)
62 | if count == 50:
63 | break
64 | return best_w
65 |
66 | def test_matrix(self, test_path):
67 | x_test = numpy.zeros((self.__test_count, self.__dimension))
68 | y_test = numpy.zeros((self.__test_count, 1))
69 | test_set = open(test_path)
70 | x = []
71 | x_count = 0
72 | for line in test_set:
73 | x.append(1)
74 |         for item in line.split(' '):
75 |             if len(item.split('\t')) == 1:
76 |                 x.append(float(item))
77 |             else:
78 |                 x.append(float(item.split('\t')[0]))
79 |                 y_test[x_count, 0] = int(item.split('\t')[1].strip())
80 | x_test[x_count, :] = x
81 | x = []
82 | x_count += 1
83 | return x_test, y_test
84 |
85 |     # evaluate the pocketed weights on the test set
86 | def test_error(self, train_path, test_path):
87 | w = self.iteration(train_path)
88 | x_test, y_test = self.test_matrix(test_path)
89 | count = 0.0
90 | for i in range(self.__test_count):
91 | if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
92 | count += 1
93 | return count / self.__test_count
94 |
95 |
96 | if __name__ == '__main__':
97 | average_error_rate = 0
98 | for i in range(2000):
99 | my_Pocket = Pocket(5, 500, 500)
100 | average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
101 | print(average_error_rate / 2000.0)
102 |
--------------------------------------------------------------------------------
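
Note (18.py): unlike plain PLA, the pocket algorithm keeps the best weight vector seen so far (best_w, judged by the number of training-set mistakes) and returns it instead of the final w. The copy.deepcopy(w) is essential here: w is updated in place by +=, so pocketing a plain reference would let later updates overwrite the stored weights. 19.py differs only in returning the last w after 50 updates rather than the pocketed one, and 20.py raises the update budget from 50 to 100.
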
/MLFoundation/ex1/19.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 | import copy
4 |
5 |
6 | class Pocket(object):
7 | def __init__(self, dimension, train_count, test_count):
8 | self.__dimension = dimension
9 | self.__train_count = train_count
10 | self.__test_count = test_count
11 |
12 | def random_matrix(self, path):
13 | training_set = open(path)
14 | random_list = []
15 | x = []
16 | x_count = 0
17 | for line in training_set:
18 | x.append(1)
19 |         for item in line.split(' '):
20 |             if len(item.split('\t')) == 1:
21 |                 x.append(float(item))
22 |             else:
23 |                 x.append(float(item.split('\t')[0]))
24 |                 x.append(int(item.split('\t')[1].strip()))
25 | random_list.append(x)
26 | x = []
27 | x_count += 1
28 | random.shuffle(random_list)
29 | return random_list
30 |
31 | def train_matrix(self, path):
32 | x_train = numpy.zeros((self.__train_count, self.__dimension))
33 | y_train = numpy.zeros((self.__train_count, 1))
34 | random_list = self.random_matrix(path)
35 | for i in range(self.__train_count):
36 | for j in range(self.__dimension):
37 | x_train[i, j] = random_list[i][j]
38 | y_train[i, 0] = random_list[i][self.__dimension]
39 | return x_train, y_train
40 |
41 | def iteration(self, path):
42 | count = 0
43 | x_train, y_train = self.train_matrix(path)
44 | w = numpy.zeros((self.__dimension, 1))
45 | for i in range(self.__train_count):
46 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
47 |                 w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
48 | count += 1
49 | if count == 50:
50 | break
51 | return w
52 |
53 | def test_matrix(self, test_path):
54 | x_test = numpy.zeros((self.__test_count, self.__dimension))
55 | y_test = numpy.zeros((self.__test_count, 1))
56 | test_set = open(test_path)
57 | x = []
58 | x_count = 0
59 | for line in test_set:
60 | x.append(1)
61 |         for item in line.split(' '):
62 |             if len(item.split('\t')) == 1:
63 |                 x.append(float(item))
64 |             else:
65 |                 x.append(float(item.split('\t')[0]))
66 |                 y_test[x_count, 0] = int(item.split('\t')[1].strip())
67 | x_test[x_count, :] = x
68 | x = []
69 | x_count += 1
70 | return x_test, y_test
71 |
72 |     # evaluate w on the test set
73 | def test_error(self, train_path, test_path):
74 | w = self.iteration(train_path)
75 | x_test, y_test = self.test_matrix(test_path)
76 | count = 0.0
77 | for i in range(self.__test_count):
78 | if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
79 | count += 1
80 | return count / self.__test_count
81 |
82 |
83 | if __name__ == '__main__':
84 | average_error_rate = 0
85 | for i in range(2000):
86 | my_Pocket = Pocket(5, 500, 500)
87 | average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
88 | print(average_error_rate / 2000.0)
89 |
--------------------------------------------------------------------------------
/MLFoundation/ex1/20.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 | import copy
4 |
5 |
6 | class Pocket(object):
7 | def __init__(self, dimension, train_count, test_count):
8 | self.__dimension = dimension
9 | self.__train_count = train_count
10 | self.__test_count = test_count
11 |
12 | def random_matrix(self, path):
13 | training_set = open(path)
14 | random_list = []
15 | x = []
16 | x_count = 0
17 | for line in training_set:
18 | x.append(1)
19 |         for item in line.split(' '):
20 |             if len(item.split('\t')) == 1:
21 |                 x.append(float(item))
22 |             else:
23 |                 x.append(float(item.split('\t')[0]))
24 |                 x.append(int(item.split('\t')[1].strip()))
25 | random_list.append(x)
26 | x = []
27 | x_count += 1
28 | random.shuffle(random_list)
29 | return random_list
30 |
31 | def train_matrix(self, path):
32 | x_train = numpy.zeros((self.__train_count, self.__dimension))
33 | y_train = numpy.zeros((self.__train_count, 1))
34 | random_list = self.random_matrix(path)
35 | for i in range(self.__train_count):
36 | for j in range(self.__dimension):
37 | x_train[i, j] = random_list[i][j]
38 | y_train[i, 0] = random_list[i][self.__dimension]
39 | return x_train, y_train
40 |
41 | def iteration(self, path):
42 | count = 0
43 | x_train, y_train = self.train_matrix(path)
44 | w = numpy.zeros((self.__dimension, 1))
45 | best_count = self.__train_count
46 | best_w = numpy.zeros((self.__dimension, 1))
47 |
48 |         # Pocket algorithm: update the line at most 100 times; after each update, evaluate on the training set and keep the best line seen so far
49 | for i in range(self.__train_count):
50 | if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
51 |                 w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(self.__dimension, 1)
52 | count += 1
53 | num = 0
54 | for j in range(self.__train_count):
55 | if numpy.dot(x_train[j, :], w)[0] * y_train[j, 0] <= 0:
56 | num += 1
57 | if num < best_count:
58 | best_count = num
59 | best_w = copy.deepcopy(w)
60 | if count == 100:
61 | break
62 | return best_w
63 |
64 | def test_matrix(self, test_path):
65 | x_test = numpy.zeros((self.__test_count, self.__dimension))
66 | y_test = numpy.zeros((self.__test_count, 1))
67 | test_set = open(test_path)
68 | x = []
69 | x_count = 0
70 | for line in test_set:
71 | x.append(1)
72 |         for item in line.split(' '):
73 |             if len(item.split('\t')) == 1:
74 |                 x.append(float(item))
75 |             else:
76 |                 x.append(float(item.split('\t')[0]))
77 |                 y_test[x_count, 0] = int(item.split('\t')[1].strip())
78 | x_test[x_count, :] = x
79 | x = []
80 | x_count += 1
81 | return x_test, y_test
82 |
83 |     # evaluate the pocketed weights on the test set
84 | def test_error(self, train_path, test_path):
85 | w = self.iteration(train_path)
86 | x_test, y_test = self.test_matrix(test_path)
87 | count = 0.0
88 | for i in range(self.__test_count):
89 | if numpy.dot(x_test[i, :], w)[0] * y_test[i, 0] <= 0:
90 | count += 1
91 | return count / self.__test_count
92 |
93 |
94 | if __name__ == '__main__':
95 | average_error_rate = 0
96 | for i in range(2000):
97 | my_Pocket = Pocket(5, 500, 500)
98 | average_error_rate += my_Pocket.test_error('hw1_18_train.dat', 'hw1_18_test.dat')
99 | print(average_error_rate / 2000.0)
100 |
--------------------------------------------------------------------------------
/MLFoundation/ex1/hw1_15_train.dat:
--------------------------------------------------------------------------------
1 | 0.97681 0.10723 0.64385 0.29556 1
2 | 0.67194 0.2418 0.83075 0.42741 1
3 | 0.20619 0.23321 0.81004 0.98691 1
4 | 0.51583 0.055814 0.92274 0.75797 1
5 | 0.70893 0.10836 0.33951 0.77058 1
6 | 0.55743 0.67804 0.061044 0.72689 1
7 | 0.15654 0.75584 0.01122 0.42598 -1
8 | 0.50462 0.15137 0.33878 0.41881 1
9 | 0.22657 0.59272 0.24103 0.46221 -1
10 | 0.49174 0.65115 0.24622 0.24796 -1
11 | 0.59512 0.26994 0.74692 0.32784 1
12 | 0.32439 0.37294 0.11623 0.94499 1
13 | 0.4475 0.60183 0.41323 0.58492 1
14 | 0.41171 0.098584 0.4795 0.083842 -1
15 | 0.10059 0.37353 0.0057687 0.14313 -1
16 | 0.8182 0.70052 0.67561 0.22231 1
17 | 0.3221 0.95754 0.99328 0.50757 1
18 | 0.41469 0.48406 0.39832 0.53216 1
19 | 0.48364 0.36163 0.14351 0.3153 -1
20 | 0.5323 0.21312 0.40401 0.98252 1
21 | 0.71073 0.29015 0.15557 0.70588 1
22 | 0.68151 0.23617 0.085193 0.58718 1
23 | 0.069048 0.14742 0.92254 0.93918 1
24 | 0.19337 0.29606 0.72589 0.71993 1
25 | 0.62783 0.80021 0.69486 0.41697 1
26 | 0.94658 0.85253 0.75418 0.3027 1
27 | 0.54402 0.73303 0.29073 0.26307 -1
28 | 0.20166 0.96147 0.83956 0.76917 1
29 | 0.8416 0.22036 0.60311 0.34751 1
30 | 0.659 0.40341 0.16311 0.12612 -1
31 | 0.87845 0.46984 0.32142 0.00042772 -1
32 | 0.95971 0.7334 0.45993 0.76215 1
33 | 0.35449 0.22126 0.57224 0.4336 1
34 | 0.34263 0.81404 0.30048 0.1461 -1
35 | 0.7234 0.45707 0.44129 0.40039 1
36 | 0.39538 0.20276 0.67262 0.67505 1
37 | 0.45179 0.78087 0.4938 0.073425 -1
38 | 0.23881 0.7675 0.40806 0.074954 -1
39 | 0.91059 0.18045 0.089421 0.59719 1
40 | 0.30088 0.3124 0.30033 0.1078 -1
41 | 0.20636 0.25969 0.87208 0.075063 -1
42 | 0.84325 0.20161 0.018555 0.58518 1
43 | 0.33334 0.087671 0.078659 0.15274 -1
44 | 0.18111 0.11502 0.73474 0.65718 1
45 | 0.90105 0.69659 0.44014 0.28963 1
46 | 0.76096 0.17909 0.18557 0.86889 1
47 | 0.20359 0.77736 0.2176 0.071641 -1
48 | 0.42406 0.98081 0.99433 0.071268 -1
49 | 0.61642 0.060815 0.10835 0.85805 1
50 | 0.62755 0.47251 0.63101 0.86293 1
51 | 0.55335 0.10757 0.87192 0.8353 1
52 | 0.72356 0.088313 0.69772 0.091611 1
53 | 0.02084 0.66204 0.26704 0.93343 1
54 | 0.15623 0.39914 0.58355 0.9993 1
55 | 0.90115 0.74857 0.6048 0.54481 1
56 | 0.40522 0.34025 0.84438 0.30728 1
57 | 0.69053 0.70505 0.77211 0.50009 1
58 | 0.32972 0.36727 0.038398 0.24515 -1
59 | 0.087565 0.80164 0.10873 0.72862 -1
60 | 0.26626 0.19317 0.83732 0.96563 1
61 | 0.33161 0.23154 0.12297 0.17358 -1
62 | 0.8338 0.22029 0.62198 0.5915 1
63 | 0.38873 0.57979 0.75488 0.12437 -1
64 | 0.093349 0.084263 0.085754 0.19575 -1
65 | 0.3938 0.21727 0.59706 0.36985 1
66 | 0.14047 0.12652 0.89396 0.0056295 -1
67 | 0.34342 0.76697 0.82696 0.43354 1
68 | 0.2665 0.83265 0.28848 0.2337 -1
69 | 0.36046 0.36809 0.32623 0.25556 -1
70 | 0.99778 0.97657 0.674 0.51915 1
71 | 0.22303 0.028847 0.73739 0.41662 1
72 | 0.30179 0.44626 0.17371 0.73116 1
73 | 0.31285 0.25044 0.46658 0.12074 -1
74 | 0.24446 0.51992 0.80413 0.74044 1
75 | 0.31433 0.80511 0.6496 0.56248 1
76 | 0.10521 0.202 0.87425 0.90105 1
77 | 0.34385 0.5524 0.52835 0.833 1
78 | 0.52791 0.62401 0.56754 0.41641 1
79 | 0.77826 0.57861 0.49655 0.84074 1
80 | 0.26143 0.5512 0.38472 0.18668 -1
81 | 0.87326 0.96009 0.24922 0.65171 1
82 | 0.65069 0.96118 0.36716 0.6302 1
83 | 0.46037 0.98854 0.62971 0.62758 1
84 | 0.11105 0.93171 0.85023 0.022051 -1
85 | 0.32721 0.95939 0.9862 0.92881 1
86 | 0.54203 0.071898 0.79052 0.86281 1
87 | 0.18994 0.76582 0.21911 0.25161 -1
88 | 0.24274 0.9501 0.80862 0.68007 1
89 | 0.36659 0.57376 0.22493 0.94652 1
90 | 0.52105 0.45772 0.7153 0.91306 1
91 | 0.73745 0.045874 0.9518 0.90951 1
92 | 0.0054206 0.5803 0.92465 0.52961 1
93 | 0.61914 0.3734 0.45772 0.56601 1
94 | 0.68483 0.34833 0.6974 0.51117 1
95 | 0.31049 0.58616 0.78657 0.077121 -1
96 | 0.0077248 0.69259 0.98719 0.93702 1
97 | 0.45361 0.47903 0.1331 0.41037 -1
98 | 0.84801 0.7256 0.21409 0.88719 1
99 | 0.29968 0.17497 0.99655 0.15494 1
100 | 0.10789 0.090897 0.013157 0.45712 -1
101 | 0.72711 0.89662 0.048524 0.77902 1
102 | 0.50372 0.14179 0.8632 0.57913 1
103 | 0.22889 0.248 0.5324 0.58705 1
104 | 0.79724 0.4484 0.90201 0.19897 1
105 | 0.10663 0.49593 0.20231 0.05901 -1
106 | 0.15117 0.49039 0.8309 0.91627 1
107 | 0.95409 0.40038 0.82197 0.73251 1
108 | 0.35704 0.014972 0.47835 0.55573 1
109 | 0.4672 0.78532 0.63665 0.80891 1
110 | 0.51268 0.49317 0.37239 0.11229 -1
111 | 0.60983 0.54596 0.30924 0.45368 1
112 | 0.17321 0.67316 0.27675 0.53482 -1
113 | 0.5761 0.36533 0.44297 0.585 1
114 | 0.77885 0.92006 0.51157 0.42738 1
115 | 0.58168 0.7896 0.58292 0.11996 -1
116 | 0.7243 0.19231 0.12572 0.42981 1
117 | 0.27893 0.27538 0.82096 0.92758 1
118 | 0.79986 0.070765 0.099176 0.61674 1
119 | 0.65646 0.042222 0.039717 0.90227 1
120 | 0.2386 0.41482 0.16741 0.26592 -1
121 | 0.84494 0.53851 0.08783 0.74972 1
122 | 0.69721 0.29151 0.14566 0.092551 -1
123 | 0.085241 0.19873 0.11313 0.53704 -1
124 | 0.18871 0.093184 0.55176 0.047211 -1
125 | 0.21583 0.79506 0.30754 0.7987 1
126 | 0.050727 0.19674 0.73473 0.48999 1
127 | 0.077524 0.29589 0.012955 0.93278 1
128 | 0.87063 0.46914 0.22899 0.35294 1
129 | 0.84807 0.60812 0.42088 0.97709 1
130 | 0.045535 0.66219 0.76946 0.71987 1
131 | 0.64344 0.20442 0.20197 0.43431 1
132 | 0.33283 0.78383 0.0097152 0.13798 -1
133 | 0.091392 0.95801 0.30999 0.17345 -1
134 | 0.058002 0.42981 0.92919 0.40967 1
135 | 0.22095 0.66618 0.86801 0.61817 1
136 | 0.018695 0.21615 0.68387 0.069085 -1
137 | 0.79796 0.18841 0.12854 0.50856 1
138 | 0.67478 0.92791 0.025838 0.12608 -1
139 | 0.68964 0.92125 0.65626 0.76319 1
140 | 0.37004 0.0075887 0.99533 0.82581 1
141 | 0.4103 0.22978 0.2938 0.78125 1
142 | 0.46467 0.40583 0.26626 0.17288 -1
143 | 0.27347 0.38493 0.20575 0.80271 1
144 | 0.0037457 0.59585 0.85865 0.037211 -1
145 | 0.45059 0.83556 0.54132 0.21109 -1
146 | 0.055447 0.84199 0.62001 0.80487 1
147 | 0.016285 0.39547 0.12598 0.63249 -1
148 | 0.11982 0.90112 0.55878 0.19737 -1
149 | 0.77264 0.38371 0.61856 0.36306 1
150 | 0.68999 0.42401 0.43875 0.98001 1
151 | 0.057837 0.86126 0.84096 0.6711 1
152 | 0.23792 0.066348 0.44791 0.9972 1
153 | 0.39259 0.89268 0.54155 0.0061404 -1
154 | 0.20604 0.19453 0.31621 0.71208 1
155 | 0.18058 0.37711 0.88283 0.65659 1
156 | 0.80745 0.24562 0.82253 0.98408 1
157 | 0.41828 0.36215 0.8516 0.68281 1
158 | 0.1323 0.39434 0.84215 0.91682 1
159 | 0.61753 0.09773 0.81467 0.40281 1
160 | 0.97318 0.19905 0.26089 0.68696 1
161 | 0.76135 0.65909 0.89342 0.21845 1
162 | 0.58691 0.6069 0.43123 0.042843 -1
163 | 0.34919 0.10586 0.50059 0.082363 -1
164 | 0.37798 0.23626 0.23852 0.14685 -1
165 | 0.9042 0.98451 0.019088 0.76116 1
166 | 0.84556 0.90166 0.072432 0.079249 -1
167 | 0.84747 0.64503 0.011196 0.53983 1
168 | 0.49067 0.78682 0.15697 0.089691 -1
169 | 0.92475 0.60457 0.64656 0.93019 1
170 | 0.63634 0.80437 0.44479 0.18618 -1
171 | 0.19157 0.60461 0.40676 0.95747 1
172 | 0.5551 0.89083 0.2496 0.65735 1
173 | 0.93298 0.76517 0.25749 0.035361 -1
174 | 0.2199 0.21024 0.10609 0.33801 -1
175 | 0.81888 0.42535 0.37241 0.74882 1
176 | 0.32533 0.40846 0.037799 0.004201 -1
177 | 0.4737 0.14999 0.66915 0.8465 1
178 | 0.16804 0.44428 0.51001 0.66228 1
179 | 0.86743 0.8456 0.17056 0.95574 1
180 | 0.28583 0.93363 0.91645 0.95502 1
181 | 0.83711 0.59571 0.3367 0.97731 1
182 | 0.32174 0.85545 0.71378 0.91737 1
183 | 0.52212 0.36278 0.66123 0.75587 1
184 | 0.21409 0.1191 0.11796 0.75938 1
185 | 0.38188 0.29273 0.27347 0.23086 -1
186 | 0.72916 0.73744 0.90535 0.13761 1
187 | 0.059381 0.25354 0.22097 0.83323 1
188 | 0.36486 0.91348 0.14745 0.57585 -1
189 | 0.68553 0.062004 0.70984 0.66362 1
190 | 0.93301 0.86593 0.17125 0.77453 1
191 | 0.61463 0.4409 0.75333 0.89446 1
192 | 0.12285 0.057161 0.58692 0.49092 1
193 | 0.56427 0.42429 0.41168 0.44017 1
194 | 0.29777 0.69766 0.8302 0.061072 -1
195 | 0.53183 0.69574 0.73405 0.90509 1
196 | 0.61368 0.29695 0.35748 0.841 1
197 | 0.85256 0.0045204 0.85749 0.38761 1
198 | 0.46745 0.45305 0.44254 0.72515 1
199 | 0.71941 0.19092 0.24009 0.89824 1
200 | 0.73892 0.44994 0.78128 0.18219 1
201 | 0.31277 0.92634 0.29642 0.46112 -1
202 | 0.11872 0.89219 0.794 0.28731 -1
203 | 0.54582 0.79468 0.18279 0.048142 -1
204 | 0.83241 0.46586 0.10901 0.048364 -1
205 | 0.89567 0.69597 0.89578 0.10248 1
206 | 0.24917 0.76999 0.20536 0.56092 -1
207 | 0.83858 0.81299 0.95404 0.62472 1
208 | 0.21222 0.21892 0.84233 0.83773 1
209 | 0.31804 0.5679 0.55799 0.15455 -1
210 | 0.81836 0.32376 0.50428 0.2733 1
211 | 0.74487 0.78055 0.18939 0.25642 -1
212 | 0.14736 0.74033 0.48418 0.0015921 -1
213 | 0.80975 0.072057 0.71856 0.86265 1
214 | 0.92345 0.37355 0.34499 0.89149 1
215 | 0.38189 0.089103 0.31269 0.72856 1
216 | 0.49649 0.25659 0.65471 0.94681 1
217 | 0.10242 0.27703 0.52294 0.85126 1
218 | 0.35479 0.17024 0.79189 0.86742 1
219 | 0.70429 0.69697 0.062243 0.964 1
220 | 0.29857 0.77505 0.65087 0.28314 -1
221 | 0.68766 0.51467 0.63235 0.44751 1
222 | 0.15416 0.83044 0.69105 0.027009 -1
223 | 0.83522 0.32071 0.52787 0.10613 1
224 | 0.83811 0.3915 0.57094 0.47851 1
225 | 0.57131 0.88752 0.53706 0.55403 1
226 | 0.93257 0.64968 0.24587 0.81109 1
227 | 0.29608 0.083328 0.74109 0.35551 1
228 | 0.46203 0.18142 0.063792 0.92144 1
229 | 0.41203 0.53101 0.77315 0.62032 1
230 | 0.36268 0.29523 0.71811 0.70884 1
231 | 0.39207 0.53465 0.28893 0.93615 1
232 | 0.95333 0.40831 0.29404 0.41991 1
233 | 0.94916 0.34266 0.87255 0.43527 1
234 | 0.19017 0.47568 0.14256 0.44132 -1
235 | 0.85894 0.9006 0.23357 0.80459 1
236 | 0.67525 0.86288 0.013998 0.28517 -1
237 | 0.88734 0.64802 0.36704 0.54815 1
238 | 0.84748 0.20105 0.89731 0.59314 1
239 | 0.53217 0.98951 0.1954 0.27718 -1
240 | 0.47945 0.30232 0.45604 0.89163 1
241 | 0.99187 0.72996 0.77676 0.72478 1
242 | 0.8889 0.36558 0.82728 0.45772 1
243 | 0.27408 0.7204 0.65677 0.70424 1
244 | 0.52243 0.59938 0.6246 0.11785 -1
245 | 0.76399 0.025814 0.33736 0.20739 1
246 | 0.27187 0.74592 0.21669 0.41116 -1
247 | 0.90839 0.050892 0.67696 0.98549 1
248 | 0.60506 0.54448 0.84372 0.30577 1
249 | 0.10422 0.76155 0.83826 0.5412 1
250 | 0.78474 0.0066151 0.22536 0.50022 1
251 | 0.98582 0.68248 0.28302 0.45186 1
252 | 0.41665 0.81217 0.097022 0.32122 -1
253 | 0.90475 0.46776 0.88671 0.68763 1
254 | 0.033977 0.048415 0.60235 0.065179 -1
255 | 0.98983 0.48006 0.33899 0.29487 1
256 | 0.85168 0.59711 0.93749 0.35835 1
257 | 0.84725 0.020964 0.39386 0.88603 1
258 | 0.56072 0.91605 0.019558 0.42813 -1
259 | 0.11745 0.060389 0.021678 0.58085 -1
260 | 0.20919 0.79555 0.69939 0.78054 1
261 | 0.7171 0.28297 0.84921 0.74192 1
262 | 0.21242 0.32839 0.56807 0.53329 1
263 | 0.48941 0.0084562 0.51977 0.72383 1
264 | 0.98037 0.2035 0.32161 0.4112 1
265 | 0.35711 0.67505 0.11554 0.47356 -1
266 | 0.68983 0.09837 0.66985 0.62623 1
267 | 0.43838 0.026309 0.51285 0.86236 1
268 | 0.10529 0.68645 0.99395 0.63142 1
269 | 0.53952 0.99271 0.27649 0.9474 1
270 | 0.018782 0.74473 0.99206 0.87102 1
271 | 0.51718 0.67211 0.70828 0.31218 1
272 | 0.41189 0.56691 0.78364 0.67886 1
273 | 0.44772 0.18827 0.71978 0.36447 1
274 | 0.317 0.47494 0.54949 0.55973 1
275 | 0.21139 0.30158 0.65269 0.051723 -1
276 | 0.13736 0.51767 0.28234 0.79935 1
277 | 0.037048 0.10755 0.63398 0.76885 1
278 | 0.44087 0.89808 0.67844 0.48225 1
279 | 0.75841 0.78382 0.24322 0.72986 1
280 | 0.87597 0.89991 0.037972 0.2432 -1
281 | 0.60687 0.32885 0.54284 0.67944 1
282 | 0.43019 0.869 0.60879 0.90864 1
283 | 0.65513 0.39801 0.91845 0.53552 1
284 | 0.88689 0.65472 0.99466 0.69948 1
285 | 0.77567 0.94883 0.8498 0.18626 1
286 | 0.97233 0.1599 0.9329 0.089635 1
287 | 0.94461 0.72613 0.71317 0.46217 1
288 | 0.4605 0.97047 0.76531 0.3996 1
289 | 0.5502 0.37931 0.76456 0.80705 1
290 | 0.5828 0.16063 0.74013 0.11508 1
291 | 0.58966 0.49064 0.99596 0.25634 1
292 | 0.96575 0.2141 0.15024 0.98043 1
293 | 0.29939 0.2934 0.46088 0.74118 1
294 | 0.042301 0.51492 0.105 0.33518 -1
295 | 0.62395 0.45102 0.92252 0.77543 1
296 | 0.36607 0.35256 0.32267 0.3285 -1
297 | 0.96545 0.25132 0.064417 0.51374 1
298 | 0.63056 0.053806 0.14816 0.40033 1
299 | 0.48831 0.76017 0.61242 0.48176 1
300 | 0.5583 0.59146 0.24049 0.22209 -1
301 | 0.94304 0.96431 0.31249 0.10506 -1
302 | 0.011705 0.93889 0.25839 0.21194 -1
303 | 0.97164 0.22943 0.18083 0.88409 1
304 | 0.87546 0.6744 0.75024 0.25818 1
305 | 0.64631 0.32332 0.86857 0.40117 1
306 | 0.4276 0.81183 0.34678 0.98935 1
307 | 0.28472 0.82959 0.40054 0.87363 1
308 | 0.62037 0.31285 0.27722 0.64167 1
309 | 0.70482 0.629 0.6828 0.51672 1
310 | 0.83688 0.18413 0.37164 0.51392 1
311 | 0.19111 0.26472 0.19798 0.76058 1
312 | 0.24988 0.091229 0.19524 0.012353 -1
313 | 0.62081 0.11765 0.98492 0.019084 1
314 | 0.18157 0.22637 0.68213 0.74354 1
315 | 0.7659 0.28888 0.61728 0.1657 1
316 | 0.26463 0.45099 0.14001 0.47823 -1
317 | 0.90022 0.31697 0.73717 0.84918 1
318 | 0.85095 0.7647 0.26824 0.61702 1
319 | 0.33281 0.83714 0.21334 0.27535 -1
320 | 0.29159 0.13184 0.10133 0.33435 -1
321 | 0.46935 0.26674 0.023366 0.21269 -1
322 | 0.6042 0.23026 0.50198 0.67093 1
323 | 0.50244 0.31349 0.564 0.74072 1
324 | 0.12275 0.53116 0.37771 0.27835 -1
325 | 0.12977 0.61848 0.83557 0.087753 -1
326 | 0.60099 0.74051 0.046187 0.79207 1
327 | 0.96669 0.37691 0.014413 0.026769 -1
328 | 0.24756 0.67287 0.053795 0.053087 -1
329 | 0.31767 0.63018 0.37828 0.27766 -1
330 | 0.60216 0.17537 0.1279 0.61092 1
331 | 0.087833 0.99196 0.77303 0.98091 1
332 | 0.36564 0.23189 0.64808 0.78337 1
333 | 0.21106 0.13959 0.20768 0.72656 1
334 | 0.6089 0.20358 0.9282 0.39475 1
335 | 0.079604 0.58299 0.46986 0.69636 1
336 | 0.25485 0.35519 0.26085 0.69246 1
337 | 0.67904 0.41069 0.49872 0.69857 1
338 | 0.40779 0.8325 0.16625 0.47396 -1
339 | 0.46199 0.50523 0.33119 0.92953 1
340 | 0.89327 0.56518 0.21383 0.61029 1
341 | 0.41033 0.38488 0.12862 0.8564 1
342 | 0.058138 0.62899 0.60946 0.99762 1
343 | 0.0073587 0.54418 0.26272 0.0063957 -1
344 | 0.91431 0.96241 0.89095 0.22206 1
345 | 0.97883 0.69139 0.23555 0.56506 1
346 | 0.79162 0.25942 0.20671 0.081687 -1
347 | 0.1136 0.19133 0.20443 0.44308 -1
348 | 0.5753 0.11082 0.96049 0.44523 1
349 | 0.66688 0.32664 0.058022 0.21483 -1
350 | 0.85187 0.53112 0.29813 0.91085 1
351 | 0.5679 0.7258 0.47001 0.49278 1
352 | 0.35162 0.85285 0.45142 0.22949 -1
353 | 0.2479 0.52952 0.79521 0.44092 1
354 | 0.4693 0.60065 0.90787 0.92907 1
355 | 0.31096 0.052271 0.25236 0.82934 1
356 | 0.55096 0.79786 0.71317 0.8198 1
357 | 0.99279 0.15139 0.27982 0.45122 1
358 | 0.66404 0.096739 0.26582 0.10294 -1
359 | 0.52803 0.1423 0.46639 0.57637 1
360 | 0.99328 0.14342 0.0087678 0.84295 1
361 | 0.5299 0.17308 0.0613 0.99353 1
362 | 0.81762 0.54861 0.87142 0.55873 1
363 | 0.68483 0.65517 0.49261 0.65511 1
364 | 0.24142 0.53478 0.92219 0.53656 1
365 | 0.66164 0.97376 0.61345 0.39626 1
366 | 0.049532 0.54176 0.98792 0.89908 1
367 | 0.038881 0.38398 0.6202 0.25135 -1
368 | 0.61624 0.084068 0.02411 0.65738 1
369 | 0.17096 0.41017 0.78869 0.71301 1
370 | 0.29773 0.63452 0.9311 0.57032 1
371 | 0.041402 0.64972 0.2671 0.15491 -1
372 | 0.28259 0.44665 0.57678 0.98452 1
373 | 0.16068 0.072643 0.31165 0.29832 -1
374 | 0.97714 0.77051 0.54517 0.72295 1
375 | 0.87151 0.86679 0.20841 0.69075 1
376 | 0.34734 0.25215 0.67884 0.69012 1
377 | 0.26408 0.11281 0.021935 0.17689 -1
378 | 0.69426 0.41539 0.27711 0.78669 1
379 | 0.84044 0.29512 0.56474 0.33757 1
380 | 0.39973 0.32958 0.34539 0.66934 1
381 | 0.58272 0.40829 0.30819 0.1299 -1
382 | 0.4527 0.40875 0.045895 0.41199 -1
383 | 0.29341 0.03832 0.7905 0.33916 1
384 | 0.92222 0.51471 0.13331 0.56679 1
385 | 0.18129 0.96248 0.79131 0.58486 1
386 | 0.45696 0.20427 0.69854 0.48235 1
387 | 0.96531 0.27775 0.95255 0.56022 1
388 | 0.50468 0.99699 0.75136 0.51681 1
389 | 0.55852 0.067689 0.666 0.98482 1
390 | 0.83188 0.66817 0.23403 0.72472 1
391 | 0.97959 0.40402 0.96303 0.28133 1
392 | 0.29634 0.4012 0.40266 0.67864 1
393 | 0.34922 0.99751 0.23234 0.52115 -1
394 | 0.65637 0.7181 0.72843 0.93113 1
395 | 0.079695 0.57218 0.70591 0.33812 -1
396 | 0.71206 0.51569 0.18168 0.5557 1
397 | 0.17528 0.2625 0.8306 0.029669 -1
398 | 0.93895 0.93941 0.72496 0.95655 1
399 | 0.046136 0.94413 0.038311 0.26812 -1
400 | 0.072491 0.2242 0.62592 0.67238 1
401 |
--------------------------------------------------------------------------------
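
Note (hw1_15_train.dat): each line holds four space-separated features with the label (+1/-1) attached to the last feature by a tab, which is why the parsers above split on spaces first and then on the tab. A hypothetical simpler loader (an assumption about this whitespace layout, not code from the repo) could let numpy treat any whitespace as a delimiter:

import numpy

def load_dat(path):
    # columns 0..3 are features, the last column is the label;
    # numpy.loadtxt splits on any run of whitespace (spaces or tabs)
    data = numpy.loadtxt(path)
    x = numpy.hstack((numpy.ones((data.shape[0], 1)), data[:, :-1]))  # prepend the constant feature
    return x, data[:, -1:]
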
/MLFoundation/ex2/17-18.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # generate input data with 20% flipping noise
5 | def generate_input_data(time_seed):
6 | np.random.seed(time_seed)
7 | raw_X = np.sort(np.random.uniform(-1, 1, 20))
8 |     # flip each label with probability 0.2 (20% noise)
9 | noised_y = np.sign(raw_X) * np.where(np.random.random(raw_X.shape[0]) < 0.2, -1, 1)
10 | return raw_X, noised_y
11 |
12 |
13 | def calculate_Ein(x, y):
14 |     # candidate thresholds: midpoints of adjacent sorted points, plus negative and positive infinity
15 | thetas = np.array([float("-inf")] + [(x[i] + x[i + 1]) / 2 for i in range(0, x.shape[0] - 1)] + [float("inf")])
16 | Ein = x.shape[0]
17 | sign = 1
18 | target_theta = 0.0
19 | # positive and negative rays
20 | for theta in thetas:
21 | y_positive = np.where(x > theta, 1, -1)
22 | y_negative = np.where(x < theta, 1, -1)
23 | error_positive = sum(y_positive != y)
24 | error_negative = sum(y_negative != y)
25 | if error_positive > error_negative:
26 | if Ein > error_negative:
27 | Ein = error_negative
28 | sign = -1
29 | target_theta = theta
30 | else:
31 | if Ein > error_positive:
32 | Ein = error_positive
33 | sign = 1
34 | target_theta = theta
35 | # two corner cases
36 | if target_theta == float("inf"):
37 | target_theta = 1.0
38 | if target_theta == float("-inf"):
39 | target_theta = -1.0
40 | return Ein, target_theta, sign
41 |
42 |
43 | if __name__ == '__main__':
44 | T = 5000
45 | total_Ein = 0
46 | sum_Eout = 0
47 | for i in range(0, T):
48 | x, y = generate_input_data(i)
49 | curr_Ein, theta, sign = calculate_Ein(x, y)
50 | total_Ein = total_Ein + curr_Ein
51 | sum_Eout = sum_Eout + 0.5 + 0.3 * sign * (abs(theta) - 1)
52 | # 17
53 | print((total_Ein * 1.0) / (T * 20))
54 | # 18
55 | print((sum_Eout * 1.0) / T)
56 |
--------------------------------------------------------------------------------
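
Note (17-18.py): the closed-form E_out accumulated in the main loop follows from the decision-stump hypothesis h(x) = s * sign(x - \theta) against the noisy target. With x uniform on [-1, 1] and a 20% flip rate,

    E_{out}(s, \theta) = 0.5 + 0.3 s (|\theta| - 1)

For s = 1 this reduces to 0.2 + 0.3|\theta|: the stump disagrees with sign(x) on an interval of probability |\theta| / 2, where it errs with probability 0.8, while on the rest only the 20% noise contributes.
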
/MLFoundation/ex2/19-20.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def read_input_data(path):
5 | x = []
6 | y = []
7 | for line in open(path).readlines():
8 | items = line.strip().split(' ')
9 | tmp_x = []
10 | for i in range(0, len(items) - 1): tmp_x.append(float(items[i]))
11 | x.append(tmp_x)
12 | y.append(float(items[-1]))
13 | return np.array(x), np.array(y)
14 |
15 |
16 | def calculate_Ein(x, y):
17 |     # candidate thresholds: midpoints of adjacent sorted points, plus negative and positive infinity
18 | thetas = np.array([float("-inf")] + [(x[i] + x[i + 1]) / 2 for i in range(0, x.shape[0] - 1)] + [float("inf")])
19 | Ein = x.shape[0]
20 | sign = 1
21 | target_theta = 0.0
22 | # positive and negative rays
23 | for theta in thetas:
24 | y_positive = np.where(x > theta, 1, -1)
25 | y_negative = np.where(x < theta, 1, -1)
26 | error_positive = sum(y_positive != y)
27 | error_negative = sum(y_negative != y)
28 | if error_positive > error_negative:
29 | if Ein > error_negative:
30 | Ein = error_negative
31 | sign = -1
32 | target_theta = theta
33 | else:
34 | if Ein > error_positive:
35 | Ein = error_positive
36 | sign = 1
37 | target_theta = theta
38 | return Ein, target_theta, sign
39 |
40 |
41 | if __name__ == '__main__':
42 | # 19
43 | x, y = read_input_data("hw2_train.dat")
44 |     # record the optimal decision stump parameters
45 | Ein = x.shape[0]
46 | theta = 0
47 | sign = 1
48 | index = 0
49 |     # search for the best decision stump across all dimensions
50 | for i in range(0, x.shape[1]):
51 | input_x = x[:, i]
52 | input_data = np.transpose(np.array([input_x, y]))
53 | input_data = input_data[np.argsort(input_data[:, 0])]
54 | curr_Ein, curr_theta, curr_sign = calculate_Ein(input_data[:, 0], input_data[:, 1])
55 | if Ein > curr_Ein:
56 | Ein = curr_Ein
57 | theta = curr_theta
58 | sign = curr_sign
59 | index = i
60 | print((Ein * 1.0) / x.shape[0])
61 | # 20
62 | # test process
63 | test_x, test_y = read_input_data("hw2_test.dat")
64 | test_x = test_x[:, index]
65 | predict_y = np.array([])
66 | if sign == 1:
67 | predict_y = np.where(test_x > theta, 1.0, -1.0)
68 | else:
69 | predict_y = np.where(test_x < theta, 1.0, -1.0)
70 | Eout = sum(predict_y != test_y)
71 | print((Eout * 1.0) / test_x.shape[0])
72 |
--------------------------------------------------------------------------------
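
Note (19-20.py): the multi-dimensional stump extends the hypothesis to

    h_{s,i,\theta}(x) = s \cdot sign(x_i - \theta)

and picks the triple (i, \theta, s) with the smallest in-sample error. Each feature column is sorted together with its labels before calculate_Ein is called, since the candidate thresholds are midpoints of adjacent points; the chosen dimension, \theta, and s are then applied unchanged to the test set to estimate E_out.
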
/MLFoundation/ex2/hw2_train.dat:
--------------------------------------------------------------------------------
1 | 8.105 -3.500 4.769 4.541 -9.829 5.252 3.838 -3.408 -4.824 -1
2 | -6.273 -2.097 9.404 1.143 3.487 -5.206 0.061 5.024 -6.687 1
3 | 1.624 -1.173 4.260 -3.607 -6.632 4.431 -8.355 7.206 -8.977 1
4 | -10.000 7.758 -2.670 -8.880 -1.099 -9.183 -4.086 8.962 5.841 1
5 | 8.464 1.762 2.729 2.724 8.155 6.096 -2.844 9.800 3.302 -1
6 | -0.135 6.193 7.705 7.195 7.313 -3.395 8.012 -6.773 -4.433 1
7 | 0.934 -8.379 -2.083 -6.337 4.346 -3.928 9.759 -8.499 -4.128 1
8 | 8.923 -0.018 -6.837 6.628 -2.823 -9.524 -6.767 -4.811 -6.296 1
9 | -9.028 7.010 -9.063 -1.111 -9.328 5.282 4.960 -9.569 6.784 -1
10 | -9.706 1.392 6.562 -6.543 -1.980 -6.261 -6.067 1.254 -1.071 1
11 | -6.891 -4.157 1.057 -5.954 4.732 1.729 9.328 -0.308 2.160 1
12 | -0.845 -5.858 -0.486 -4.282 -2.401 7.534 -0.543 1.531 -1.212 -1
13 | -9.596 -3.929 9.556 1.461 0.117 4.288 -6.810 -0.555 -6.020 1
14 | 9.124 7.287 -7.506 -1.363 -6.995 0.093 -3.828 2.462 -8.376 1
15 | 7.514 7.608 -0.175 7.071 -0.931 9.942 1.359 2.259 -0.613 -1
16 | -1.805 -2.265 -9.636 0.689 6.373 -6.631 -9.218 -7.456 5.831 -1
17 | -3.048 8.819 -8.509 6.777 5.889 0.560 6.719 -2.752 -7.181 -1
18 | -5.873 -9.376 -3.226 -5.509 1.313 -6.853 -2.140 2.095 -4.309 -1
19 | 4.250 -5.350 -6.683 5.741 -8.574 9.207 -3.699 8.145 -3.545 -1
20 | 8.587 -0.571 -7.906 -4.638 3.920 3.407 -1.491 -8.220 -4.498 1
21 | -8.107 0.089 -7.650 -4.790 -4.171 -6.223 -5.583 2.130 -8.078 1
22 | -8.616 9.386 -9.095 -6.522 -5.252 4.825 6.886 3.256 6.605 -1
23 | -10.000 -3.258 -1.998 -7.559 1.952 3.832 -3.782 6.369 -4.038 1
24 | -4.212 -1.462 -2.603 -3.308 2.016 2.144 -8.483 -1.099 -4.600 1
25 | 8.112 3.770 -5.551 -3.885 6.211 6.401 9.946 -7.571 2.770 -1
26 | -8.868 0.669 5.703 -1.472 7.361 -2.282 -9.328 8.879 6.620 1
27 | 6.635 5.312 5.358 -8.916 -8.574 1.569 7.485 -8.628 3.998 1
28 | 7.432 -8.466 -9.884 3.135 0.062 7.477 -9.147 0.734 6.355 -1
29 | -3.031 2.371 -4.132 -7.674 3.454 -2.706 3.895 0.939 -1.334 1
30 | -10.000 -1.108 7.883 -7.978 -7.973 -2.055 9.498 -7.120 8.679 1
31 | 10.000 2.703 -6.408 -4.365 5.029 7.046 2.929 -1.076 -2.015 -1
32 | 3.891 1.182 -0.468 1.774 3.203 1.559 9.719 2.702 4.439 -1
33 | -4.895 7.533 3.229 -1.304 -6.832 -1.742 -4.258 6.097 7.182 1
34 | -6.454 -0.875 4.457 3.077 -9.100 -2.340 -5.364 -9.381 -10.000 -1
35 | 4.393 8.004 -5.783 -2.378 -3.299 -2.615 5.880 2.443 -6.518 1
36 | 0.337 2.622 -4.467 -5.206 -4.301 -3.567 2.454 0.335 -2.949 1
37 | -1.583 7.670 6.972 2.634 -4.708 -6.327 -9.980 -8.828 6.116 1
38 | -8.917 1.634 -6.017 -3.384 6.428 -0.318 3.049 -1.118 -10.000 1
39 | -4.864 1.848 0.375 -7.892 -5.517 5.667 -4.218 -5.498 6.839 -1
40 | 5.545 3.762 -5.996 9.528 -9.622 -9.568 -0.789 3.427 -0.686 -1
41 | 1.361 -5.169 -3.709 -8.264 -3.060 0.774 7.403 2.721 5.276 -1
42 | 7.686 4.347 -0.279 -8.310 3.875 0.099 -7.878 -6.914 -6.474 1
43 | 6.890 -7.670 -8.421 -6.819 -5.934 -1.481 3.954 -8.532 -8.760 1
44 | -1.530 8.711 -0.993 8.191 -9.599 -7.117 -1.710 -7.477 -4.031 1
45 | -4.384 3.295 1.583 -2.805 6.476 5.649 5.713 0.430 7.117 -1
46 | -2.528 -9.359 2.564 6.479 8.832 2.966 9.362 -2.878 5.489 1
47 | 2.867 3.421 9.149 -5.550 -9.384 5.625 -9.901 6.329 -3.945 1
48 | -6.103 3.564 8.529 6.461 0.044 7.361 -0.573 -0.595 -5.517 -1
49 | -10.000 1.217 -5.353 9.365 5.667 -4.737 4.989 5.765 -8.408 -1
50 | -5.352 -3.079 4.530 -6.823 -6.618 -5.426 -9.462 2.809 3.979 1
51 | 9.667 2.303 8.283 -5.686 1.668 3.949 -0.423 -3.343 -0.286 1
52 | -2.993 9.110 2.642 -8.462 -7.713 6.024 -3.888 -7.175 -1.167 1
53 | 5.873 5.954 0.947 4.155 -9.732 -7.385 -1.896 -0.155 -0.728 1
54 | -3.765 4.062 0.545 8.877 5.600 2.833 4.901 -8.289 5.658 -1
55 | -1.065 -3.518 5.746 9.882 -9.363 6.014 -7.503 -1.259 -4.141 -1
56 | -9.823 3.309 -2.012 0.723 2.186 -6.412 -6.445 -2.913 -4.701 1
57 | -7.490 0.047 -5.807 8.256 -0.070 -5.170 4.271 2.427 3.572 -1
58 | -9.071 3.115 -9.485 -1.083 -6.162 2.701 2.505 -2.607 9.788 1
59 | -7.382 1.835 -8.231 -3.189 0.091 1.698 1.642 -5.638 -5.875 1
60 | 2.551 2.422 4.373 3.066 -8.661 8.210 -4.233 3.844 -4.397 -1
61 | -2.114 9.172 3.369 -0.345 -4.017 -6.540 -8.647 7.625 -2.178 1
62 | 5.056 -9.265 6.228 -0.571 3.801 7.567 -2.361 9.569 1.411 -1
63 | -3.013 -0.825 8.785 -9.643 8.830 -5.231 -6.183 -9.817 -7.606 1
64 | -2.241 4.515 4.151 -6.012 -6.056 -2.047 -8.445 1.584 -2.479 1
65 | 5.637 7.266 -6.890 4.422 7.623 -8.061 9.191 -8.560 -7.878 -1
66 | -9.766 -5.208 -8.244 4.386 -1.221 -4.299 -7.662 0.334 7.284 -1
67 | 6.440 4.960 -0.344 9.550 -0.618 -2.722 -8.511 -1.426 -1.281 -1
68 | 8.634 7.211 -6.378 -9.609 1.597 2.401 -3.909 3.935 -7.265 1
69 | 7.875 -7.259 -9.684 -2.469 -7.710 -0.301 4.809 -6.221 8.272 -1
70 | -5.843 7.417 -7.380 -2.221 7.808 4.217 -9.820 -6.101 -1.848 1
71 | 4.305 0.635 -9.011 4.622 8.166 -6.721 -5.679 2.975 -2.941 -1
72 | 6.433 -4.014 0.649 9.053 3.765 -1.543 3.269 3.946 2.356 -1
73 | 1.617 -9.885 -6.974 2.606 4.737 -8.808 5.885 9.057 4.168 -1
74 | 0.624 -0.892 8.487 -8.727 -1.840 2.252 -0.271 -8.570 -3.802 1
75 | 4.106 -2.164 -1.017 7.132 -9.558 -6.280 8.325 6.327 -7.223 1
76 | 5.663 -2.714 -3.790 4.150 -1.441 4.370 -3.598 8.288 5.800 -1
77 | -5.474 6.195 -7.293 3.509 3.328 -6.851 7.229 1.652 9.476 -1
78 | -8.465 -7.029 -7.304 -2.255 7.120 1.255 -7.885 -6.478 -0.456 1
79 | 1.437 6.306 -1.798 4.145 -0.185 -8.470 7.294 -2.956 3.182 1
80 | 0.927 3.018 -2.395 3.623 -9.236 -5.275 -5.121 -7.121 -1.753 1
81 | 6.346 -1.202 2.456 -5.452 -7.057 -7.729 -3.923 -9.763 -0.685 1
82 | -8.780 -6.548 -9.133 -1.175 7.075 -8.370 3.550 -8.046 -5.491 1
83 | -7.684 7.061 1.463 4.771 -8.391 4.406 7.042 -2.314 4.643 -1
84 | 0.571 -5.249 -2.373 1.438 3.575 -5.297 3.069 -2.875 -3.343 1
85 | -4.453 7.404 -9.191 7.010 2.175 -7.582 1.417 -0.783 0.104 -1
86 | -8.114 -1.131 -4.669 -0.486 -9.693 8.906 4.216 3.376 -3.969 -1
87 | -2.346 9.384 -2.555 -1.536 6.394 9.620 0.882 -2.189 -1.162 -1
88 | 8.614 3.468 1.580 -6.056 -7.018 1.887 -7.150 7.198 -4.737 -1
89 | 3.875 -0.368 -0.563 -8.680 8.095 -4.169 -9.060 -1.023 3.642 1
90 | 6.901 -3.390 2.563 -1.520 0.554 5.544 -9.633 3.405 2.742 -1
91 | 1.901 9.995 -7.577 -8.662 -8.685 -9.482 -2.830 -7.745 -0.505 1
92 | -2.580 -6.876 4.063 9.982 1.604 -5.383 5.527 1.971 8.022 -1
93 | 1.874 1.349 -3.578 4.296 2.687 -2.263 4.814 9.857 -0.008 -1
94 | 1.218 6.413 1.371 -4.719 6.396 -7.025 -0.102 1.922 4.946 1
95 | 4.655 1.148 -6.657 -8.923 -4.556 6.031 -1.186 -9.741 5.888 1
96 | -0.921 9.551 -8.037 -9.549 -5.168 8.359 -6.574 4.731 0.281 1
97 | -7.088 -4.467 -9.106 -3.745 -3.390 -3.662 -7.714 5.423 -3.404 1
98 | -9.721 -5.860 9.048 -7.758 -5.410 -6.119 -9.399 -1.984 8.611 1
99 | 1.099 -9.784 7.673 1.993 -3.529 -5.718 8.331 -1.243 9.706 -1
100 | 5.588 -8.062 3.135 4.636 -5.819 7.725 8.517 -5.218 -4.259 -1
101 |
--------------------------------------------------------------------------------
/MLFoundation/ex3/13-15.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 |
5 | # target function f(x1, x2) = sign(x1^2 + x2^2 - 0.6)
6 | def target_function(x1, x2):
7 | if (x1 * x1 + x2 * x2 - 0.6) >= 0:
8 | return 1
9 | else:
10 | return -1
11 |
12 |
13 | # create train_set
14 | def training_data_with_random_error(num=1000):
15 | features = np.zeros((num, 3))
16 | labels = np.zeros((num, 1))
17 |
18 | points_x1 = np.array([round(random.uniform(-1, 1), 2) for i in range(num)])
19 | points_x2 = np.array([round(random.uniform(-1, 1), 2) for i in range(num)])
20 |
21 | for i in range(num):
22 | # create random feature
23 | features[i, 0] = 1
24 | features[i, 1] = points_x1[i]
25 | features[i, 2] = points_x2[i]
26 | labels[i] = target_function(points_x1[i], points_x2[i])
27 |         # flip 10% of the labels to simulate noise
28 |         if i < num * 0.1:
29 | if labels[i] < 0:
30 | labels[i] = 1
31 | else:
32 | labels[i] = -1
33 | return features, labels
34 |
35 |
36 | def error_rate(features, labels, w):
37 | wrong = 0
38 | for i in range(len(labels)):
39 | if np.dot(features[i], w) * labels[i, 0] < 0:
40 | wrong += 1
41 | return wrong / (len(labels) * 1.0)
42 |
43 |
44 | def linear_regression_closed_form(X, Y):
45 | """
46 | linear regression:
47 | model : g(x) = Wt * X
48 | strategy : squared error
49 |     algorithm : closed form (matrix)
50 |     result : w = (Xt.X)^-1.Xt.Y
51 |     the closed-form formula from Prof. Lin's lecture
52 | """
53 | return np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(Y)
54 |
55 |
56 | def feature_transform(features):
57 | new = np.zeros((len(features), 6))
58 | new[:, 0:3] = features[:, :] * 1
59 | new[:, 3] = features[:, 1] * features[:, 2]
60 | new[:, 4] = features[:, 1] * features[:, 1]
61 | new[:, 5] = features[:, 2] * features[:, 2]
62 | return new
63 |
64 |
65 | if __name__ == '__main__':
66 |
67 | # 13
68 |
69 | error_rate_array = []
70 | for i in range(1000):
71 | (features, labels) = training_data_with_random_error(1000)
72 | w13 = linear_regression_closed_form(features, labels)
73 | error_rate_array.append(error_rate(features, labels, w13))
74 |
75 | # error rate, approximately 0.5
76 | avr_err = sum(error_rate_array) / (len(error_rate_array) * 1.0)
77 |
78 |     print("13--Linear regression for classification without feature transform, average error:", avr_err)
79 |
80 | # 14
81 | (features, labels) = training_data_with_random_error(1000)
82 | new_features = feature_transform(features)
83 | w14 = linear_regression_closed_form(new_features, labels)
84 | min_error_in = float("inf")
85 | error_rate_array = []
86 | for i in range(1000):
87 | (features, labels) = training_data_with_random_error(1000)
88 | new_features = feature_transform(features)
89 |
90 | w = linear_regression_closed_form(new_features, labels)
91 | error_in = error_rate(new_features, labels, w)
92 | if error_in <= min_error_in:
93 | w14 = w
94 | min_error_in = error_in
95 | error_rate_array.append(error_in)
96 |
97 | print("w14", w14)
98 |
99 | # avr_err = sum(error_rate_array) / (len(error_rate_array) * 1.0)
100 | #
101 | # print("14--Linear regression for classification with feature transform:Average error--", avr_err)
102 |
103 | # 15
104 |
105 | error_out = []
106 | for i in range(1000):
107 | (features, labels) = training_data_with_random_error(1000)
108 | new_features = feature_transform(features)
109 | error_out.append(error_rate(new_features, labels, w14))
110 |
111 | # bins = np.arange(-1, 1, 0.05)
112 | # plt.hist(error_out, bins, rwidth=0.8, histtype='bar')
113 | # plt.title("Error out(with feature transform)")
114 | # plt.show()
115 |
116 | print("15--Average of E_out is: ", sum(error_out) / (len(error_out) * 1.0))
117 |
--------------------------------------------------------------------------------
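
Note (13-15.py): linear_regression_closed_form computes w = (X^T X)^{-1} X^T y directly with a matrix inverse. A minimal, numerically safer variant (a sketch, not part of this repo) uses the pseudo-inverse, which coincides with the formula above when X^T X is invertible and still yields the least-squares solution when it is singular:

import numpy as np

def linear_regression_pinv(X, Y):
    # hypothetical alternative: w = pinv(X) . Y, the least-squares solution
    return np.linalg.pinv(X).dot(Y)
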
/MLFoundation/ex3/18-20.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def data_load(file_path):
5 | # open file and read lines
6 | f = open(file_path)
7 | try:
8 | lines = f.readlines()
9 | finally:
10 | f.close()
11 |
12 | # create features and labels array
13 | example_num = len(lines)
14 | feature_dimension = len(lines[0].strip().split())
15 |
16 | features = np.zeros((example_num, feature_dimension))
17 | features[:, 0] = 1
18 | labels = np.zeros((example_num, 1))
19 |
20 | for index, line in enumerate(lines):
21 | # items[0:-1]--features items[-1]--label
22 | items = line.strip().split(' ')
23 | # get features
24 | features[index, 1:] = [float(str_num) for str_num in items[0:-1]]
25 |
26 | # get label
27 | labels[index] = float(items[-1])
28 |
29 | return features, labels
30 |
31 |
32 | # gradient descent
33 | def gradient_descent(X, y, w):
34 | # -YnWtXn
35 | tmp = -y * (np.dot(X, w))
36 |
37 |     # θ(-YnWtXn) = exp(tmp)/(1+exp(tmp))
38 | # weight_matrix = np.array([math.exp(_)/(1+math.exp(_)) for _ in tmp]).reshape(len(X), 1)
39 | weight_matrix = np.exp(tmp) / ((1 + np.exp(tmp)) * 1.0)
40 | gradient = 1 / (len(X) * 1.0) * (sum(weight_matrix * -y * X).reshape(len(w), 1))
41 |
42 | return gradient
43 |
44 |
45 | # stochastic gradient descent (gradient on a single example)
46 | def stochastic_gradient_descent(X, y, w):
47 | # -YnWtXn
48 | tmp = -y * (np.dot(X, w))
49 |
50 |     # θ(-YnWtXn) = exp(tmp)/(1+exp(tmp))
51 | # weight = math.exp(tmp[0])/((1+math.exp(tmp[0]))*1.0)
52 | weight = np.exp(tmp) / ((1 + np.exp(tmp)) * 1.0)
53 |
54 | gradient = weight * -y * X
55 | return gradient.reshape(len(gradient), 1)
56 |
57 |
58 | # LogisticRegression Class
59 | class LogisticRegression:
60 |
61 | def __init__(self):
62 | pass
63 |
64 | # fit model
65 | def fit(self, X, y, Eta=0.001, max_iteration=2000, sgd=False):
66 | # ∂E/∂w = 1/N * ∑θ(-YnWtXn)(-YnXn)
67 | self.__w = np.zeros((len(X[0]), 1))
68 |
69 | # whether use stochastic gradient descent
70 | if not sgd:
71 | for i in range(max_iteration):
72 | self.__w = self.__w - Eta * gradient_descent(X, y, self.__w)
73 | else:
74 | index = 0
75 | for i in range(max_iteration):
76 | if (index >= len(X)):
77 | index = 0
78 | self.__w = self.__w - Eta * stochastic_gradient_descent(np.array(X[index]), y[index], self.__w)
79 | index += 1
80 |
81 | # predict
82 | def predict(self, X):
83 | binary_result = np.dot(X, self.__w) >= 0
84 | return np.array([(1 if _ > 0 else -1) for _ in binary_result]).reshape(len(X), 1)
85 |
86 | # get vector w
87 | def get_w(self):
88 | return self.__w
89 |
90 | # score(error rate)
91 | def score(self, X, y):
92 | predict_y = self.predict(X)
93 |         return np.sum(predict_y != y) / (len(y) * 1.0)
94 |
95 |
96 | if __name__ == '__main__':
97 | # 18
98 | # training model
99 | (X, Y) = data_load("hw3_train.dat")
100 |     lr = LogisticRegression()
101 | lr.fit(X, Y, max_iteration=2000)
102 |
103 | # get 0/1 error in test data
104 | test_X, test_Y = data_load("hw3_test.dat")
105 | print("E_out: ", lr.score(test_X, test_Y))
106 |
107 | # 19
108 | # training model
109 | (X, Y) = data_load("hw3_train.dat")
110 |     lr_eta = LogisticRegression()
111 | lr_eta.fit(X, Y, 0.01, 2000)
112 |
113 | # get 0/1 error in test data
114 | test_X, test_Y = data_load("hw3_test.dat")
115 | print("E_out: ", lr_eta.score(test_X, test_Y))
116 |
117 | # 20
118 | (X, Y) = data_load("hw3_train.dat")
119 |     lr_sgd = LogisticRegression()
120 | lr_sgd.fit(X, Y, sgd=True, max_iteration=2000)
121 |
122 | # get 0/1 error in test data
123 | test_X, test_Y = data_load("hw3_test.dat")
124 | print("E_out: ", lr_sgd.score(test_X, test_Y))
125 |
--------------------------------------------------------------------------------
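
Note (18-20.py): the model here is logistic regression trained by (stochastic) gradient descent on the cross-entropy error. gradient_descent implements

    \nabla E_{in}(w) = \frac{1}{N} \sum_{n=1}^{N} \theta(-y_n w^T x_n)(-y_n x_n), \qquad \theta(s) = \frac{e^s}{1 + e^s}

and stochastic_gradient_descent evaluates the same expression on a single example. Questions 18-20 compare a learning rate of 0.001 (the default Eta), a learning rate of 0.01, and the SGD variant, each run for 2000 iterations.
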
/MLFoundation/ex4/13-20.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # load data
5 | def load_data(filename):
6 | code = open(filename, "r")
7 | lines = code.readlines()
8 |     xn = np.zeros((len(lines), 3)).astype(float)
9 |     yn = np.zeros((len(lines),)).astype(int)
10 |
11 | for i in range(0, len(lines)):
12 | line = lines[i]
13 | line = line.rstrip('\r\n').replace('\t', ' ').split(' ')
14 | xn[i, 0] = 1
15 | for j in range(1, len(xn[0])):
16 | xn[i, j] = float(line[j - 1])
17 | yn[i] = int(line[len(xn[0]) - 1])
18 | return xn, yn
19 |
20 |
21 | # regularized normal equation (closed-form ridge regression)
22 | def calculate_w_reg(x, y, lambda_value):
23 | return np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x) + lambda_value * np.eye(x.shape[1])), x.transpose()), y)
24 |
25 |
26 | # test result
27 | def calculate_E(w, x, y):
28 | scores = np.dot(w, x.transpose())
29 | predicts = np.where(scores >= 0, 1.0, -1.0)
30 | E_out_num = sum(predicts != y)
31 | return (E_out_num * 1.0) / predicts.shape[0]
32 |
33 |
34 | if __name__ == '__main__':
35 | # prepare train and test data
36 | train_x, train_y = load_data("hw4_train.dat")
37 | test_x, test_y = load_data("hw4_test.dat")
38 |
39 | # Q13
40 | lambda_value = 10
41 | W = calculate_w_reg(train_x, train_y, lambda_value)
42 | Ein = calculate_E(W, train_x, train_y)
43 | Eout = calculate_E(W, test_x, test_y)
44 | print('Q13: Ein = ', Ein, ', Eout= ', Eout)
45 |
46 | # Q14-Q15
47 | Ein_min = float("inf")
48 | optimal_Eout = 0
49 | optimal_lambda_Ein = 0
50 |
51 | Eout_min = float("inf")
52 | optimal_Ein = 0
53 | optimal_lambda_Eout = 0
54 | for lambda_value in range(2, -11, -1):
55 | # calculate ridge regression W
56 | w_reg = calculate_w_reg(train_x, train_y, pow(10, lambda_value))
57 | Ein = calculate_E(w_reg, train_x, train_y)
58 | Eout = calculate_E(w_reg, test_x, test_y)
59 |
60 | # update Ein,Eout,lambda
61 | if Ein_min > Ein:
62 | Ein_min = Ein
63 | optimal_lambda_Ein = lambda_value
64 | optimal_Eout = Eout
65 |
66 | if Eout_min > Eout:
67 | Eout_min = Eout
68 | optimal_lambda_Eout = lambda_value
69 | optimal_Ein = Ein
70 | # Q14
71 | print('Q14: log10lambda = ', optimal_lambda_Ein, ', Ein= ', Ein_min, ', Eout = ', optimal_Eout)
72 | # Q15
73 | print('Q15: log10lambda = ', optimal_lambda_Eout, ', Ein = ', optimal_Ein, ', Eout= ', Eout_min)
74 |
75 | # Q16-Q17
76 | Etrain_min = float("inf")
77 | Eval_min = float("inf")
78 |
79 |     # values tracked together with Etrain_min
80 | Eout_Etrain_min = 0
81 | Eval_Etrain_min = 0
82 | optimal_lambda_Etrain_min = 0
83 |
84 |     # values tracked together with Eval_min
85 | Etrain_Eval_min = 0
86 | Eout_Eval_min = 0
87 | optimal_lambda_Eval_min = 0
88 |
89 | split = 120
90 |
91 | for lambda_value in range(2, -11, -1):
92 | w_reg = calculate_w_reg(train_x[:split], train_y[:split], pow(10, lambda_value))
93 | Etrain = calculate_E(w_reg, train_x[:split], train_y[:split])
94 | Eval = calculate_E(w_reg, train_x[split:], train_y[split:])
95 | Eout = calculate_E(w_reg, test_x, test_y)
96 |
97 | if Etrain_min > Etrain:
98 | optimal_lambda_Etrain_min = lambda_value
99 | Etrain_min = Etrain
100 | Eout_Etrain_min = Eout
101 | Eval_Etrain_min = Eval
102 |
103 | if Eval_min > Eval:
104 | optimal_lambda_Eval_min = lambda_value
105 | Eout_Eval_min = Eout
106 | Eval_min = Eval
107 | Etrain_Eval_min = Etrain
108 | # Q16
109 | print('Q16: log10 = ', optimal_lambda_Etrain_min, ', Etrain= ', Etrain_min, ', Eval = ', Eval_Etrain_min,
110 | ', Eout = ', Eout_Etrain_min)
111 | # Q17
112 | print('Q17: log10 = ', optimal_lambda_Eval_min, ', Etrain= ', Etrain_Eval_min, ', Eval = ', Eval_min, ', Eout = ',
113 | Eout_Eval_min)
114 |
115 | # Q18
116 |     # optimal_lambda_Eval_min is the optimal lambda obtained in Q17
117 | w_reg = calculate_w_reg(train_x, train_y, pow(10, optimal_lambda_Eval_min))
118 | optimal_Ein = calculate_E(w_reg, train_x, train_y)
119 | optimal_Eout = calculate_E(w_reg, test_x, test_y)
120 | print('Q18: Ein = ', optimal_Ein, ', Eout = ', optimal_Eout)
121 |
122 | # Q19
123 | folder_num = 5
124 | split_folder = 40
125 |
126 | Ecv_min = float("inf")
127 | optimal_lambda = 0
128 | for lambda_value in range(2, -11, -1):
129 | total_cv = 0
130 | for i in range(folder_num):
131 | # get test_data
132 | test_data_x = train_x[i * split_folder:(i + 1) * split_folder, :]
133 | test_data_y = train_y[i * split_folder:(i + 1) * split_folder]
134 |
135 |             # train_data = raw_data minus test_data; the held-out fold may lie in the middle or at either end
136 | if 0 < i < (folder_num - 1):
137 | train_data_x = np.concatenate((train_x[0:i * split_folder, :], train_x[(i + 1) * split_folder:, :]),
138 | axis=0)
139 | train_data_y = np.concatenate((train_y[0:i * split_folder], train_y[(i + 1) * split_folder:]), axis=0)
140 | elif i == 0:
141 | train_data_x = train_x[split_folder:, :]
142 | train_data_y = train_y[split_folder:]
143 | else:
144 | train_data_x = train_x[0:i * split_folder, :]
145 | train_data_y = train_y[0:i * split_folder]
146 |
147 | w_reg = calculate_w_reg(train_data_x, train_data_y, pow(10, lambda_value))
148 | Ecv = calculate_E(w_reg, test_data_x, test_data_y)
149 | total_cv += Ecv
150 | total_cv = total_cv * 1.0 / folder_num
151 | if Ecv_min > total_cv:
152 | Ecv_min = total_cv
153 | optimal_lambda = lambda_value
154 |
155 | print('Q19: log10=', optimal_lambda, ' Ecv=', Ecv_min)
156 |
157 | # Q20
158 | w_reg = calculate_w_reg(train_x, train_y, pow(10, optimal_lambda))
159 | Ein = calculate_E(w_reg, train_x, train_y)
160 | Eout = calculate_E(w_reg, test_x, test_y)
161 | print('Q20: Ein = ', Ein, 'Eout = ', Eout)
162 |
--------------------------------------------------------------------------------
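Note on calculate_w_reg above: it forms an explicit matrix inverse. A minimal alternative sketch (assuming only NumPy; the name calculate_w_reg_solve is hypothetical) solves the same regularized normal equations (X^T X + lambda * I) w = X^T y with np.linalg.solve, which is generally more numerically stable than np.linalg.inv:

    import numpy as np

    def calculate_w_reg_solve(x, y, lambda_value):
        # solve (X^T X + lambda * I) w = X^T y without forming an inverse
        A = np.dot(x.transpose(), x) + lambda_value * np.eye(x.shape[1])
        return np.linalg.solve(A, np.dot(x.transpose(), y))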
/MLFoundation/ex4/hw4_train.dat:
--------------------------------------------------------------------------------
1 | 0.568304 0.568283 1
2 | 0.310968 0.310956 -1
3 | 0.103376 0.103373 -1
4 | 0.0531882 0.053218 -1
5 | 0.97006 0.970064 1
6 | 0.0941873 0.0941707 -1
7 | 0.655902 0.655892 1
8 | 0.370821 0.370839 -1
9 | 0.558482 0.558476 1
10 | 0.849389 0.849383 1
11 | 0.796038 0.796051 1
12 | 0.723246 0.723252 1
13 | 0.571236 0.571254 1
14 | 0.385144 0.38512 -1
15 | 0.877176 0.877168 1
16 | 0.74655 0.746552 1
17 | 0.0676164 0.0676087 -1
18 | 0.0412524 0.0412649 -1
19 | 0.851637 0.851661 1
20 | 0.586989 0.58698 1
21 | 0.661014 0.660994 1
22 | 0.587988 0.587968 1
23 | 0.257615 0.257628 -1
24 | 0.680505 0.680485 1
25 | 0.895242 0.895257 1
26 | 0.381124 0.381139 -1
27 | 0.314332 0.31433 -1
28 | 0.157744 0.157747 -1
29 | 0.670923 0.670925 1
30 | 0.531716 0.531736 1
31 | 0.810956 0.810938 1
32 | 0.514937 0.51493 1
33 | 0.188567 0.188587 -1
34 | 0.778528 0.778527 1
35 | 0.904966 0.904955 1
36 | 0.563699 0.563708 1
37 | 0.599768 0.59978 1
38 | 0.619909 0.619928 1
39 | 0.650556 0.650556 1
40 | 0.131949 0.131967 -1
41 | 0.251546 0.251546 -1
42 | 0.690874 0.690863 1
43 | 0.381249 0.381284 -1
44 | 0.559231 0.559232 1
45 | 0.197361 0.197367 -1
46 | 0.784776 0.784781 1
47 | 0.620494 0.620499 1
48 | 0.229646 0.229647 -1
49 | 0.0891466 0.0891438 -1
50 | 0.981857 0.981861 1
51 | 0.64711 0.647102 1
52 | 0.725596 0.725592 1
53 | 0.614771 0.614764 1
54 | 0.976315 0.976321 1
55 | 0.250716 0.250708 -1
56 | 0.281071 0.281096 -1
57 | 0.550196 0.550187 1
58 | 0.955756 0.955751 1
59 | 0.251821 0.251838 -1
60 | 0.538196 0.538183 1
61 | 0.58285 0.582836 1
62 | 0.48367 0.48368 -1
63 | 0.481451 0.481471 -1
64 | 0.291576 0.291561 -1
65 | 0.181592 0.181596 -1
66 | 0.232746 0.232759 -1
67 | 0.488322 0.488349 -1
68 | 0.664499 0.664487 1
69 | 0.0420094 0.0420475 -1
70 | 0.950521 0.950524 1
71 | 0.445707 0.445706 -1
72 | 0.430385 0.430396 -1
73 | 0.747574 0.747583 1
74 | 0.245047 0.245078 -1
75 | 0.742838 0.742833 1
76 | 0.284625 0.284627 -1
77 | 0.0613909 0.061374 -1
78 | 0.612767 0.612754 1
79 | 0.378545 0.378555 -1
80 | 0.818764 0.818763 1
81 | 0.0507026 0.0507136 -1
82 | 0.882725 0.882731 1
83 | 0.0810847 0.0810796 -1
84 | 0.836278 0.836279 1
85 | 0.696709 0.696695 1
86 | 0.603346 0.603334 1
87 | 0.513718 0.513712 1
88 | 0.247789 0.247802 -1
89 | 0.704221 0.704213 1
90 | 0.546723 0.546724 1
91 | 0.881583 0.881592 1
92 | 0.13456 0.134545 -1
93 | 0.86883 0.868815 1
94 | 0.980909 0.980887 1
95 | 0.369986 0.369986 -1
96 | 0.194455 0.194457 -1
97 | 0.483858 0.483875 -1
98 | 0.43807 0.43808 -1
99 | 0.159602 0.159592 -1
100 | 0.923499 0.923504 1
101 | 0.419902 0.419906 -1
102 | 0.659252 0.659271 1
103 | 0.419546 0.419546 -1
104 | 0.935494 0.935512 1
105 | 0.712397 0.71239 1
106 | 0.952567 0.952549 1
107 | 0.915359 0.915379 1
108 | 0.182693 0.182675 -1
109 | 0.668527 0.668522 1
110 | 0.0965221 0.0965266 -1
111 | 0.984174 0.984197 1
112 | 0.7437 0.743702 1
113 | 0.213357 0.213341 -1
114 | 0.617402 0.617386 1
115 | 0.335604 0.335604 -1
116 | 0.632581 0.632597 1
117 | 0.515744 0.515757 1
118 | 0.786921 0.786912 1
119 | 0.502608 0.502599 1
120 | 0.164538 0.164537 -1
121 | 0.507454 0.507469 1
122 | 0.822809 0.822806 1
123 | 0.42883 0.428821 -1
124 | 0.157678 0.157693 -1
125 | 0.674884 0.674896 1
126 | 0.276618 0.276622 -1
127 | 0.374795 0.374795 -1
128 | 0.396781 0.396815 -1
129 | 0.132116 0.132101 -1
130 | 0.966203 0.966249 1
131 | 0.961164 0.961159 1
132 | 0.0140044 0.014014 -1
133 | 0.509361 0.509379 1
134 | 0.195082 0.195097 -1
135 | 0.853012 0.853012 1
136 | 0.852883 0.852896 1
137 | 0.574279 0.574282 1
138 | 0.316965 0.316939 -1
139 | 0.386753 0.386761 -1
140 | 0.764792 0.764815 1
141 | 0.680442 0.680428 1
142 | 0.125299 0.125304 -1
143 | 0.619824 0.619818 1
144 | 0.687672 0.687662 1
145 | 0.760271 0.760289 1
146 | 0.227148 0.22713 -1
147 | 0.224288 0.224295 -1
148 | 0.0150326 0.0150352 -1
149 | 0.585322 0.585314 1
150 | 0.732755 0.732777 1
151 | 0.864553 0.864569 1
152 | 0.0788415 0.0788569 -1
153 | 0.4326 0.432602 -1
154 | 0.804816 0.804801 1
155 | 0.50957 0.509589 1
156 | 0.405003 0.404988 -1
157 | 0.465702 0.465691 -1
158 | 0.368576 0.368574 -1
159 | 0.56202 0.562033 1
160 | 0.552361 0.552356 1
161 | 0.18263 0.182606 -1
162 | 0.672912 0.672906 1
163 | 0.642397 0.642413 1
164 | 0.816308 0.816316 1
165 | 0.264986 0.264978 -1
166 | 0.799168 0.799179 1
167 | 0.311442 0.311432 -1
168 | 0.715291 0.715278 1
169 | 0.913262 0.913265 1
170 | 0.703566 0.70358 1
171 | 0.0868818 0.0868856 -1
172 | 0.507828 0.507835 1
173 | 0.77619 0.776196 1
174 | 0.503254 0.503257 1
175 | 0.0585257 0.0585251 -1
176 | 0.668003 0.667995 1
177 | 0.409675 0.409686 -1
178 | 0.00104673 0.00105247 -1
179 | 0.6743 0.674268 1
180 | 0.461383 0.461378 -1
181 | 0.957667 0.957677 1
182 | 0.386593 0.386566 -1
183 | 0.260177 0.260171 -1
184 | 0.208071 0.208076 -1
185 | 0.634661 0.634646 1
186 | 0.354351 0.354351 -1
187 | 0.135384 0.135381 -1
188 | 0.216718 0.216748 -1
189 | 0.606084 0.606096 1
190 | 0.443809 0.443801 -1
191 | 0.480428 0.480418 -1
192 | 0.886987 0.886995 1
193 | 0.0126171 0.012603 -1
194 | 0.578502 0.578495 1
195 | 0.0664441 0.0664438 -1
196 | 0.292442 0.292432 -1
197 | 0.487013 0.487008 -1
198 | 0.176237 0.176234 -1
199 | 0.496052 0.496044 -1
200 | 0.62186 0.621853 1
201 |
--------------------------------------------------------------------------------
/MLFoundation/pdf/01_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/01_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/02_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/02_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/03_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/03_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/04_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/04_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/05_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/05_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/06_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/06_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/07_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/07_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/08_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/08_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/09_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/09_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/10_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/10_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/11_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/11_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/12_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/12_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/13_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/13_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/14_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/14_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/15_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/15_handout.pdf
--------------------------------------------------------------------------------
/MLFoundation/pdf/16_handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/MLFoundation/pdf/16_handout.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LearningML
2 |
3 | Course exercises and algorithm implementations from my machine learning studies
4 |
5 | [Coursera Andrew Ng Machine Learning, MATLAB implementations](https://github.com/xjwhhh/AndrewNgMachineLearning)
6 |
7 | [Coursera, National Taiwan University, Hsuan-Tien Lin: Machine Learning Foundations](https://github.com/xjwhhh/LearningML/tree/master/MLFoundation)
8 |
9 | [Li Hang: Statistical Learning Methods](https://github.com/xjwhhh/LearningML/tree/master/StatisticalLearningMethod)
10 |
11 | [Zhou Zhihua: Machine Learning (the "watermelon book")](https://github.com/xjwhhh/LearningML/tree/master/watermelon)
12 |
13 |
--------------------------------------------------------------------------------
/StatisticalLearningMethod/chapter2/Perceptron.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import random
3 | import time
4 | import logging
5 |
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.metrics import accuracy_score
8 |
9 |
10 | def log(func):
11 | def wrapper(*args, **kwargs):
12 | start_time = time.time()
13 | logging.debug('start %s()' % func.__name__)
14 | ret = func(*args, **kwargs)
15 |
16 | end_time = time.time()
17 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
18 |
19 | return ret
20 |
21 | return wrapper
22 |
23 |
24 | class Perceptron(object):
25 |
26 | def __init__(self):
27 | self.learning_step = 0.00001
28 | self.max_iteration = 5000
29 |
30 | def predict_(self, x):
31 | wx = 0
32 | for i in range(len(self.w)):
33 | wx += self.w[i] * x[i]
34 |
35 | return int(wx > 0)
36 |
37 | @log
38 | def train(self, features, labels):
39 | # (1)
40 | self.w = [0.0] * (len(features[0]) + 1)
41 |
42 | correct_count = 0
43 |
44 | while True:
45 | # (2)
46 |             # the random index may repeat, so correct_count can count the same sample more than once, but this is harmless
47 | index = random.randint(0, len(labels) - 1)
48 | x = list(features[index])
49 | x.append(1.0)
50 | if labels[index] == 1:
51 | y = 1
52 | else:
53 | y = -1
54 | wx = 0
55 | for i in range(len(self.w)):
56 | wx += self.w[i] * x[i]
57 |
58 |             # classified correctly
59 | if wx * y > 0:
60 | correct_count += 1
61 |                 # the training set has roughly 20,000+ samples; any suitable threshold here works to break out of the while loop
62 | if correct_count > 10000:
63 | break
64 | continue
65 |
66 | # (3)
67 |             # misclassified: update w
68 | for i in range(len(self.w)):
69 | self.w[i] += self.learning_step * (y * x[i])
70 |
71 | @log
72 | def predict(self, features):
73 | predict_labels = []
74 | for feature in features:
75 | x = list(feature)
76 | x.append(1)
77 | predict_labels.append(self.predict_(x))
78 | return predict_labels
79 |
80 |
81 | if __name__ == '__main__':
82 |     # logging setup
83 | logger = logging.getLogger()
84 | logger.setLevel(logging.DEBUG)
85 |
86 | raw_data = pd.read_csv('../data/train_binary.csv', header=0)
87 | data = raw_data.values
88 |
89 | images = data[0:, 1:]
90 | labels = data[:, 0]
91 |
92 |     # use 2/3 of the data for training and 1/3 for testing
93 | train_features, test_features, train_labels, test_labels = train_test_split(
94 | images, labels, test_size=0.33, random_state=1)
95 |
96 |     # train the model
97 | p = Perceptron()
98 | p.train(train_features, train_labels)
99 |
100 |     # predict on the test set
101 | test_predict = p.predict(test_features)
102 |
103 |     # compute the accuracy
104 |     # training samples are drawn at random, so the score differs from run to run
105 | score = accuracy_score(test_labels, test_predict)
106 | print("The accuracy score is ", score)
107 |
--------------------------------------------------------------------------------
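The training loop above implements the classic perceptron update w <- w + eta * y * x on one randomly drawn sample at a time. A minimal NumPy sketch of the same single step (the name perceptron_step is hypothetical):

    import numpy as np

    def perceptron_step(w, x, y, eta=0.00001):
        # y is +1/-1; update only when the sample is misclassified (y * w.x <= 0)
        if y * np.dot(w, x) <= 0:
            w = w + eta * y * x
        return w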
/StatisticalLearningMethod/chapter3/K-NN.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import cv2
4 | import logging
5 | import time
6 |
7 | from math import sqrt
8 | from collections import namedtuple
9 |
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.metrics import accuracy_score
12 |
13 |
14 | def log(func):
15 | def wrapper(*args, **kwargs):
16 | start_time = time.time()
17 | logging.debug('start %s()' % func.__name__)
18 | ret = func(*args, **kwargs)
19 |
20 | end_time = time.time()
21 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
22 |
23 | return ret
24 |
25 | return wrapper
26 |
27 |
28 | def get_hog_features(trainset):
29 |     # extract HOG features of the images with OpenCV
30 |
31 | features = []
32 |
33 | hog = cv2.HOGDescriptor('../hog.xml')
34 |
35 | for img in trainset:
36 | img = np.reshape(img, (28, 28))
37 | cv_img = img.astype(np.uint8)
38 |
39 | hog_feature = hog.compute(cv_img)
40 | # hog_feature = np.transpose(hog_feature)
41 | features.append(hog_feature)
42 |
43 | features = np.array(features)
44 | features = np.reshape(features, (-1, 324))
45 |
46 | return features
47 |
48 |
49 | def predict(test_set, kd_tree):
50 | predict = []
51 |
52 | for i in range(len(test_set)):
53 | predict.append(find_nearest(kd_tree, test_set[i]).label)
54 |
55 | return np.array(predict)
56 |
57 |
58 | # kd-tree construction and search
59 | # the current implementation is nearest-neighbour (k = 1) only
60 | # issue 1: how to store each node's label? the approach below seems to work, but I am not certain
61 | # issue 2: it is very slow
62 |
63 | class KdNode(object):
64 | def __init__(self, dom_elt, split, left, right, label):
65 |         self.dom_elt = dom_elt  # k-dimensional vector (a sample point in k-dimensional space)
66 |         self.split = split  # integer: index of the splitting dimension
67 |         self.left = left  # kd-tree built from the left half-space of the splitting hyperplane
68 |         self.right = right  # kd-tree built from the right half-space of the splitting hyperplane
69 | self.label = label
70 |
71 |
72 | class KdTree(object):
73 |
74 | @log
75 | def __init__(self, data, labels):
76 |         k = len(data[0])  # data dimensionality
77 |
78 |         def create_node(split, data_set, labels):  # split the data set on the given dimension and create a KdNode
79 |
80 | # print(len(data_set))
81 | if (len(data_set) == 0):
82 | return None
83 |
84 | sort_index = data_set[:, split].argsort()
85 | data_set = data_set[sort_index]
86 | labels = labels[sort_index]
87 | # print(data_set)
88 |
89 | split_pos = len(data_set) // 2
90 | # print(split_pos)
91 |             median = data_set[split_pos]  # the median sample becomes the split point
92 | label = labels[split_pos]
93 | split_next = (split + 1) % k # cycle coordinates
94 |
95 |             # recursively build the kd-tree
96 |             return KdNode(median, split,
97 |                           create_node(split_next, data_set[:split_pos], labels[:split_pos]),  # build the left subtree
98 |                           create_node(split_next, data_set[split_pos + 1:], labels[split_pos + 1:]),  # build the right subtree
99 | label)
100 |
101 |         self.root = create_node(0, data, labels)  # build the kd-tree starting from dimension 0 and return the root node
102 |
103 |
104 | # namedtuple storing the nearest point, the nearest distance, the number of visited nodes, and the label
105 | result = namedtuple("Result_tuple", "nearest_point nearest_dist nodes_visited label")
106 |
107 |
108 | @log
109 | def find_nearest(tree, point):
110 |     k = len(point)  # data dimensionality
111 |
112 | def travel(kd_node, target, max_dist):
113 | if kd_node is None:
114 |             return result([0] * k, float("inf"), 0, 0)  # float("inf") / float("-inf") are positive / negative infinity in Python
115 |
116 | nodes_visited = 1
117 |
118 |         s = kd_node.split  # the splitting dimension
119 |         pivot = kd_node.dom_elt  # the splitting "axis" point
120 |
121 |         if target[s] <= pivot[s]:  # if the target's s-th coordinate is below the split value, the target is closer to the left subtree
122 |             nearer_node = kd_node.left  # visit the left child next
123 |             further_node = kd_node.right  # but remember the right child
124 |         else:  # the target is closer to the right subtree
125 |             nearer_node = kd_node.right  # visit the right child next
126 | further_node = kd_node.left
127 | if (nearer_node is None):
128 | label = 0
129 | else:
130 | label = nearer_node.label
131 |
132 |         temp1 = travel(nearer_node, target, max_dist)  # recurse into the region that contains the target
133 |
134 |         nearest = temp1.nearest_point  # take this leaf result as the "current nearest point"
135 |         dist = temp1.nearest_dist  # and its distance as the current nearest distance
136 |
137 | nodes_visited += temp1.nodes_visited
138 |
139 | if dist < max_dist:
140 |             max_dist = dist  # the nearest point must lie inside the hypersphere centred at the target with radius max_dist
141 |
142 |         temp_dist = abs(pivot[s] - target[s])  # distance from the target to the splitting hyperplane along dimension s
143 |         if max_dist < temp_dist:  # check whether the hypersphere intersects the hyperplane
144 |             return result(nearest, dist, nodes_visited, temp1.label)  # if not, return directly; the other side cannot contain a closer point
145 |
146 | # ----------------------------------------------------------------------
147 |         # Euclidean distance between the target and the split point
148 | temp_dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(pivot, target)))
149 |
150 |         if temp_dist < dist:  # if the split point is closer
151 |             nearest = pivot  # update the nearest point
152 |             dist = temp_dist  # update the nearest distance
153 |             max_dist = dist  # update the hypersphere radius
154 |             label = kd_node.label
155 |
156 |         # check whether the other child's region contains a closer point
157 | temp2 = travel(further_node, target, max_dist)
158 |
159 | nodes_visited += temp2.nodes_visited
160 |         if temp2.nearest_dist < dist:  # a closer point exists in the other subtree
161 |             nearest = temp2.nearest_point  # update the nearest point
162 |             dist = temp2.nearest_dist  # update the nearest distance
163 | label = temp2.label
164 |
165 | return result(nearest, dist, nodes_visited, label)
166 |
167 |     return travel(tree.root, point, float("inf"))  # recurse from the root node
168 |
169 |
170 | k = 10
171 |
172 | if __name__ == '__main__':
173 | logger = logging.getLogger()
174 | logger.setLevel(logging.DEBUG)
175 |
176 | raw_data = pd.read_csv('../data/train.csv', header=0)
177 | data = raw_data.values
178 |
179 | images = data[0:, 1:]
180 | labels = data[:, 0]
181 |
182 | features = get_hog_features(images)
183 |     # use 2/3 of the data for training and 1/3 for testing
184 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33,
185 | random_state=1)
186 |
187 | kd_tree = KdTree(train_features, train_labels)
188 |
189 | test_predict = predict(test_features, kd_tree)
190 |
191 | score = accuracy_score(test_labels, test_predict)
192 | print("The accuracy score is ", score)
193 |
--------------------------------------------------------------------------------
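The comments above note that this hand-rolled kd-tree search is very slow. As a sketch of a faster cross-check (assuming scikit-learn and the train_features / train_labels / test_features arrays from the __main__ block), scikit-learn's KDTree answers the same 1-nearest-neighbour queries:

    from sklearn.neighbors import KDTree

    tree = KDTree(train_features)
    dist, ind = tree.query(test_features, k=1)  # distances and indices of the nearest neighbours
    test_predict = train_labels[ind[:, 0]]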
/StatisticalLearningMethod/chapter3/K-NN1.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | import pandas as pd
4 | import numpy as np
5 | import cv2
6 | import random
7 | import time
8 |
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import accuracy_score
11 |
12 |
13 | # extract HOG features of the images with OpenCV
14 | def get_hog_features(trainset):
15 | features = []
16 |
17 | hog = cv2.HOGDescriptor('../hog.xml')
18 |
19 | for img in trainset:
20 | img = np.reshape(img, (28, 28))
21 | cv_img = img.astype(np.uint8)
22 |
23 | hog_feature = hog.compute(cv_img)
24 | # hog_feature = np.transpose(hog_feature)
25 | features.append(hog_feature)
26 |
27 | features = np.array(features)
28 | features = np.reshape(features, (-1, 324))
29 |
30 | return features
31 |
32 |
33 | def Predict(testset, trainset, train_labels):
34 | predict = []
35 | count = 0
36 |
37 |     # linear scan over the training set
38 | for test_vec in testset:
39 |         # print the index of the current test sample, for debugging
40 | print(count)
41 | count += 1
42 |
43 |         knn_list = []  # the current k nearest neighbours
44 |         max_index = -1  # index (in knn_list) of the farthest of the current k neighbours
45 |         max_dist = 0  # distance of the farthest of the current k neighbours
46 |
47 |         # first put the first k training points into knn_list to fill it
48 | for i in range(k):
49 | label = train_labels[i]
50 | train_vec = trainset[i]
51 |
52 |             dist = np.linalg.norm(train_vec - test_vec)  # Euclidean distance between the two points
53 |
54 | knn_list.append((dist, label))
55 |
56 |         # then scan the remaining training points
57 | for i in range(k, len(train_labels)):
58 | label = train_labels[i]
59 | train_vec = trainset[i]
60 |
61 |             dist = np.linalg.norm(train_vec - test_vec)  # Euclidean distance between the two points
62 |
63 |             # find the farthest point among the current k neighbours
64 | if max_index < 0:
65 | for j in range(k):
66 | if max_dist < knn_list[j][0]:
67 | max_index = j
68 | max_dist = knn_list[max_index][0]
69 |
70 |             # if the farthest of the current k neighbours is farther than this point, replace it
71 | if dist < max_dist:
72 | knn_list[max_index] = (dist, label)
73 | max_index = -1
74 | max_dist = 0
75 |
76 |         # tally the votes
77 | class_total = 10
78 | class_count = [0 for i in range(class_total)]
79 | for dist, label in knn_list:
80 | class_count[label] += 1
81 |
82 |         # find the highest vote count
83 | mmax = max(class_count)
84 |
85 |         # find the label with the highest vote count
86 | for i in range(class_total):
87 | if mmax == class_count[i]:
88 | predict.append(i)
89 | break
90 |
91 | return np.array(predict)
92 |
93 |
94 | k = 10
95 |
96 | if __name__ == '__main__':
97 | print('Start read data')
98 |
99 | time_1 = time.time()
100 |
101 | raw_data = pd.read_csv('../data/train.csv', header=0)
102 | data = raw_data.values
103 |
104 | imgs = data[0::, 1::]
105 | labels = data[::, 0]
106 |
107 | features = get_hog_features(imgs)
108 |
109 |     # use 2/3 of the data for training and 1/3 for testing
110 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33,
111 | random_state=23323)
112 | # print train_features.shape
113 | # print train_features.shape
114 |
115 | time_2 = time.time()
116 | print('read data cost ', time_2 - time_1, ' second')
117 |
118 | print('Start training')
119 | print('knn do not need to train')
120 | time_3 = time.time()
121 | print('training cost ', time_3 - time_2, ' second')
122 |
123 | print('Start predicting')
124 | test_predict = Predict(test_features, train_features, train_labels)
125 | time_4 = time.time()
126 | print('predicting cost ', time_4 - time_3, ' second')
127 |
128 | score = accuracy_score(test_labels, test_predict)
129 | print("The accuracy score is ", score)
130 |
--------------------------------------------------------------------------------
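The max_index / max_dist bookkeeping above maintains the k current nearest neighbours by hand. A sketch of the same idea with a max-heap from the standard library (the name k_nearest is hypothetical):

    import heapq
    import numpy as np

    def k_nearest(test_vec, trainset, train_labels, k=10):
        heap = []  # stores (-dist, label); heap[0] is the farthest of the k kept neighbours
        for train_vec, label in zip(trainset, train_labels):
            dist = np.linalg.norm(train_vec - test_vec)
            if len(heap) < k:
                heapq.heappush(heap, (-dist, label))
            elif -dist > heap[0][0]:  # closer than the current farthest neighbour
                heapq.heapreplace(heap, (-dist, label))
        return [label for _, label in heap]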
/StatisticalLearningMethod/chapter3/K-NN2.py:
--------------------------------------------------------------------------------
1 | # --*-- coding:utf-8 --*--
2 | import numpy as np
3 |
4 |
5 | class Node:  # a tree node
6 | def __init__(self, data, lchild=None, rchild=None):
7 | self.data = data
8 | self.lchild = lchild
9 | self.rchild = rchild
10 |
11 |
12 | class KdTree:  # kd-tree
13 | def __init__(self):
14 | self.kdTree = None
15 |
16 |     def create(self, dataSet, depth):  # build the kd-tree and return the root node
17 | if (len(dataSet) > 0):
18 |             m, n = np.shape(dataSet)  # number of samples and number of features
19 |             midIndex = int(m / 2)  # index of the median element
20 |             axis = depth % n  # choose the axis along which to split the data
21 |             sortedDataSet = self.sort(dataSet, axis)  # sort along that axis
22 |             node = Node(sortedDataSet[midIndex])  # set the node's data to the median sample (see the book for details)
23 | # print sortedDataSet[midIndex]
24 |             leftDataSet = sortedDataSet[: midIndex]  # samples to the left of the median
25 | rightDataSet = sortedDataSet[midIndex + 1:]
26 | print(leftDataSet)
27 | print(rightDataSet)
28 | print(123)
29 |             node.lchild = self.create(leftDataSet, depth + 1)  # recursively build the subtree from the samples left of the median
30 | node.rchild = self.create(rightDataSet, depth + 1)
31 | return node
32 | else:
33 | return None
34 |
35 |     def sort(self, dataSet, axis):  # bubble sort the samples on the given axis
36 |         sortDataSet = dataSet[:]  # make a copy so the original samples are not modified
37 | m, n = np.shape(sortDataSet)
38 | for i in range(m):
39 | for j in range(0, m - i - 1):
40 | if (sortDataSet[j][axis] > sortDataSet[j + 1][axis]):
41 | temp = sortDataSet[j]
42 | sortDataSet[j] = sortDataSet[j + 1]
43 | sortDataSet[j + 1] = temp
44 | print(sortDataSet)
45 | return sortDataSet
46 |
47 |     def preOrder(self, node):  # pre-order traversal
48 | if node != None:
49 | print("tttt->%s" % node.data)
50 | self.preOrder(node.lchild)
51 | self.preOrder(node.rchild)
52 |
53 |     def search(self, tree, x):  # search for the nearest neighbour of x
54 |         self.nearestPoint = None  # the nearest point found so far
55 |         self.nearestValue = 0  # the corresponding nearest distance
56 |
57 |         def travel(node, depth=0):  # recursive search
58 |             if node != None:  # the recursion stops at empty nodes
59 |                 n = len(x)  # number of features
60 |                 axis = depth % n  # the splitting axis at this depth
61 |                 if x[axis] < node.data[axis]:  # if x is below the node on this axis, descend into the left child
62 | travel(node.lchild, depth + 1)
63 | else:
64 | travel(node.rchild, depth + 1)
65 |
66 |                 # after the recursion returns, backtrack toward the parent node; this is step (3) of Algorithm 3.3
67 | print(3)
68 | print(node.data)
69 |                 distNodeAndX = self.dist(x, node.data)  # distance between the target and this node
70 |                 if (self.nearestPoint == None):  # initialise the nearest point and distance; step (3)(a) of Algorithm 3.3
71 | self.nearestPoint = node.data
72 | self.nearestValue = distNodeAndX
73 | elif (self.nearestValue > distNodeAndX):
74 | print("t")
75 | self.nearestPoint = node.data
76 | self.nearestValue = distNodeAndX
77 |
78 | print(axis)
79 | print(node.data, depth, self.nearestValue, node.data[axis], x[axis])
80 |                 if (abs(x[axis] - node.data[axis]) <= self.nearestValue):  # decide whether the other child's region must be searched (the circle test); step (3)(b) of Algorithm 3.3
81 | if x[axis] < node.data[axis]:
82 | print(1)
83 | travel(node.rchild, depth + 1)
84 | else:
85 | print(2)
86 | travel(node.lchild, depth + 1)
87 |
88 | travel(tree)
89 | return self.nearestPoint
90 |
91 |     def dist(self, x1, x2):  # Euclidean distance
92 | return ((np.array(x1) - np.array(x2)) ** 2).sum() ** 0.5
93 |
94 |
95 | if __name__ == '__main__':
96 | dataSet = [[2, 3],
97 | [5, 4],
98 | [9, 6],
99 | [4, 7],
100 | [8, 1],
101 | [7, 2]]
102 | x = [5, 3]
103 | kdtree = KdTree()
104 | tree = kdtree.create(dataSet, 0)
105 | kdtree.preOrder(tree)
106 | print(kdtree.search(tree, x))
107 |
--------------------------------------------------------------------------------
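For the six sample points above, the nearest neighbour of x = [5, 3] is [5, 4], at Euclidean distance 1, which is what the final search call should print.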
/StatisticalLearningMethod/chapter4/naive_Bayes.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import time
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import accuracy_score
9 |
10 |
11 | def log(func):
12 | def wrapper(*args, **kwargs):
13 | start_time = time.time()
14 | logging.debug('start %s()' % func.__name__)
15 | ret = func(*args, **kwargs)
16 |
17 | end_time = time.time()
18 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
19 |
20 | return ret
21 |
22 | return wrapper
23 |
24 |
25 | # binarization: its purpose is to make each feature take only two possible values, matching the length-2 last dimension of conditional_probability in train()
26 | def binaryzation(img):
27 | cv_img = img.astype(np.uint8)
28 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
29 | return cv_img
30 |
31 |
32 | @log
33 | def train(train_set, train_labels):
34 | class_num = len(set(train_labels))
35 | feature_num = len(train_set[0])
36 |     prior_probability = np.zeros(class_num)  # prior probabilities
37 |     conditional_probability = np.zeros((class_num, feature_num, 2))  # conditional probabilities
38 | print(conditional_probability.shape)
39 |
40 | for i in range(len(train_labels)):
41 |         img = binaryzation(train_set[i])  # binarize the image
42 | label = train_labels[i]
43 |
44 | prior_probability[label] += 1
45 |
46 | for j in range(feature_num):
47 | conditional_probability[label][j][img[j]] += 1
48 |
49 |     # Bayesian (Laplace-smoothed) estimation; the prior's denominator is the same for every class, so the prior is left unnormalized
50 | prior_probability += 1
51 | for label in set(train_labels):
52 | for j in range(feature_num):
53 | conditional_probability[label][j][0] += 1
54 | conditional_probability[label][j][0] /= (len(train_labels[train_labels == label]) + 2 * 1)
55 | conditional_probability[label][j][1] += 1
56 | conditional_probability[label][j][1] /= (len(train_labels[train_labels == label]) + 2 * 1)
57 |
58 | # print(prior_probability)
59 | # print(conditional_probability)
60 | return prior_probability, conditional_probability
61 |
62 |
63 | @log
64 | def predict(test_features, prior_probability, conditional_probability):
65 | result = []
66 | for test in test_features:
67 | img = binaryzation(test)
68 |
69 | max_label = 0
70 | max_probability = 0
71 |
72 | for i in range(len(prior_probability)):
73 |
74 | # print("label",i)
75 | probability = prior_probability[i]
76 |             for j in range(len(img)):  # number of features
77 | # print("j",j)
78 |                 probability *= conditional_probability[i][j][img[j]]
79 | if max_probability < probability:
80 | max_probability = probability
81 | max_label = i
82 | result.append(max_label)
83 | return np.array(result)
84 |
85 |
86 | if __name__ == '__main__':
87 | logger = logging.getLogger()
88 | logger.setLevel(logging.DEBUG)
89 |
90 | raw_data = pd.read_csv('../data/train.csv', header=0)
91 | data = raw_data.values
92 |
93 | imgs = data[0:2000, 1:]
94 | labels = data[0:2000, 0]
95 |
96 | # print(imgs.shape)
97 |
98 |     # use 2/3 of the data for training and 1/3 for testing
99 | train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33,
100 | random_state=1)
101 |
102 | prior_probability, conditional_probability = train(train_features, train_labels)
103 | test_predict = predict(test_features, prior_probability, conditional_probability)
104 | score = accuracy_score(test_labels, test_predict)
105 | print("The accuracy score is ", score)
106 |
--------------------------------------------------------------------------------
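The product in predict above multiplies hundreds of probabilities below 1 and can underflow to zero for 784-pixel images. A minimal log-space sketch of the same argmax (assuming the prior_probability / conditional_probability arrays returned by train; the name predict_log is hypothetical):

    import numpy as np

    def predict_log(img, prior_probability, conditional_probability):
        best_label, best_logp = 0, float("-inf")
        for label in range(len(prior_probability)):
            # a sum of logs replaces the product of probabilities
            logp = np.log(prior_probability[label])
            logp += np.log(conditional_probability[label, np.arange(len(img)), img]).sum()
            if logp > best_logp:
                best_label, best_logp = label, logp
        return best_label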
/StatisticalLearningMethod/chapter4/naive_Bayes1.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import numpy as np
4 | import cv2
5 | import time
6 |
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import accuracy_score
9 |
10 |
11 | # binarization
12 | def binaryzation(img):
13 | cv_img = img.astype(np.uint8)
14 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
15 | return cv_img
16 |
17 |
18 | def Train(trainset, train_labels):
19 |     prior_probability = np.zeros(class_num)  # prior probabilities
20 |     conditional_probability = np.zeros((class_num, feature_len, 2))  # conditional probabilities
21 |
22 |     # compute the prior and conditional probabilities
23 | for i in range(len(train_labels)):
24 |         img = binaryzation(trainset[i])  # binarize the image
25 | label = train_labels[i]
26 |
27 | prior_probability[label] += 1
28 |
29 | for j in range(feature_len):
30 | conditional_probability[label][j][img[j]] += 1
31 |
32 |     # scale the probabilities into the range [1, 1000001]
33 | for i in range(class_num):
34 | for j in range(feature_len):
35 |             # after binarization a pixel takes only the values 0 and 1
36 | pix_0 = conditional_probability[i][j][0]
37 | pix_1 = conditional_probability[i][j][1]
38 |
39 |             # conditional probabilities for pixel values 0 and 1
40 | probalility_0 = (float(pix_0) / float(pix_0 + pix_1)) * 1000000 + 1
41 | probalility_1 = (float(pix_1) / float(pix_0 + pix_1)) * 1000000 + 1
42 |
43 | conditional_probability[i][j][0] = probalility_0
44 | conditional_probability[i][j][1] = probalility_1
45 |
46 | print(conditional_probability)
47 |
48 | return prior_probability, conditional_probability
49 |
50 |
51 | # compute the (unnormalized) posterior probability of one image for a given label
52 | def calculate_probability(img, label):
53 | probability = int(prior_probability[label])
54 |
55 | for i in range(len(img)):
56 | probability *= int(conditional_probability[label][i][img[i]])
57 |
58 | return probability
59 |
60 |
61 | def Predict(testset, prior_probability, conditional_probability):
62 | predict = []
63 |
64 | for img in testset:
65 |
66 |         # binarize the image
67 | img = binaryzation(img)
68 |
69 | max_label = 0
70 | max_probability = calculate_probability(img, 0)
71 |
72 | for j in range(1, 10):
73 | probability = calculate_probability(img, j)
74 |
75 | if max_probability < probability:
76 | max_label = j
77 | max_probability = probability
78 |
79 | predict.append(max_label)
80 |
81 | return np.array(predict)
82 |
83 |
84 | class_num = 10
85 | feature_len = 784
86 |
87 | if __name__ == '__main__':
88 | print('Start read data')
89 |
90 | time_1 = time.time()
91 |
92 | raw_data = pd.read_csv('../data/train.csv', header=0)
93 | data = raw_data.values
94 |
95 | imgs = data[0::, 1::]
96 | labels = data[::, 0]
97 |
98 |     # use 2/3 of the data for training and 1/3 for testing
99 | train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33,
100 | random_state=23323)
101 | # print train_features.shape
102 | # print train_features.shape
103 |
104 | time_2 = time.time()
105 | print('read data cost ', time_2 - time_1, ' second', '\n')
106 |
107 | print('Start training')
108 | prior_probability, conditional_probability = Train(train_features, train_labels)
109 | time_3 = time.time()
110 | print('training cost ', time_3 - time_2, ' second', '\n')
111 |
112 | print('Start predicting')
113 | test_predict = Predict(test_features, prior_probability, conditional_probability)
114 | time_4 = time.time()
115 | print('predicting cost ', time_4 - time_3, ' second', '\n')
116 |
117 | score = accuracy_score(test_labels, test_predict)
118 | print("The accuracy score is ", score)
119 |
--------------------------------------------------------------------------------
/StatisticalLearningMethod/chapter5/C4.5.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | import time
4 | import logging
5 | import numpy as np
6 | import pandas as pd
7 | import random
8 |
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import accuracy_score
11 |
12 | # similar to the ID3 implementation; only the feature-selection criterion differs
13 | # it also shares the same issues
14 |
15 |
16 | total_class = 10
17 |
18 |
19 | def log(func):
20 | def wrapper(*args, **kwargs):
21 | start_time = time.time()
22 | logging.debug('start %s()' % func.__name__)
23 | ret = func(*args, **kwargs)
24 |
25 | end_time = time.time()
26 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
27 |
28 | return ret
29 |
30 | return wrapper
31 |
32 |
33 | class Tree(object):
34 | def __init__(self, node_type, Class=None, feature=None):
35 | self.node_type = node_type
36 | self.dict = {}
37 | self.Class = Class
38 | self.feature = feature
39 |
40 | def add_tree(self, val, tree):
41 | self.dict[val] = tree
42 |
43 | def predict(self, features):
44 | if self.node_type == 'leaf':
45 | return self.Class
46 | if (features[self.feature] in self.dict.keys()):
47 | tree = self.dict[features[self.feature]]
48 | else:
49 | if (self.Class is None):
50 | return random.randint(0, 1)
51 | else:
52 | return self.Class
53 | return tree.predict(features)
54 |
55 |
56 | def calc_ent(x):
57 | """
58 | calculate empirical entropy of x
59 | """
60 |
61 | x_value_list = set(x)
62 | ent = 0.0
63 | for x_value in x_value_list:
64 | p = float(x[x == x_value].shape[0]) / x.shape[0]
65 | logp = np.log2(p)
66 | ent -= p * logp
67 |
68 | return ent
69 |
70 |
71 | def calc_condition_ent(train_feature, train_label):
72 | """
73 | calculate empirical entropy H(y|x)
74 | """
75 |
76 | # calc ent(y|x)
77 |
78 | ent = 0
79 | train_feature_set = set(train_feature)
80 | # print("train_feature_set", train_feature_set)
81 | for train_feature_value in train_feature_set:
82 | Di = train_feature[train_feature == train_feature_value]
83 | label_i = train_label[train_feature == train_feature_value]
84 | # print("Di", Di)
85 | train_label_set = set(train_label)
86 | temp = 0
87 | # print("train_label_set", train_label_set)
88 | for train_label_value in train_label_set:
89 | Dik = Di[label_i == train_label_value]
90 | # print(Dik)
91 | if (len(Dik) != 0):
92 | p = float(len(Dik) / len(Di))
93 | logp = np.log2(p)
94 | temp -= p * logp
95 | ent += (len(Di) / len(train_feature)) * temp
96 | return ent
97 |
98 |
99 | def recurse_train(train_set, train_label, features, epsilon):
100 |
101 | LEAF = 'leaf'
102 | INTERNAL = 'internal'
103 |
104 |     # Step 1: if all instances in train_set belong to the same class Ck
105 | label_set = set(train_label)
106 | # print(label_set)
107 | if len(label_set) == 1:
108 | return Tree(LEAF, Class=label_set.pop())
109 |
110 |     # Step 2: if features is empty
111 |
112 | class_count0 = 0
113 | class_count1 = 0
114 |
115 | for i in range(len(train_label)):
116 | if (train_label[i] == 1):
117 | class_count1 += 1
118 | else:
119 | class_count0 += 1
120 |
121 |     if (class_count0 >= class_count1):
122 |         max_class = 0
123 |     else:
124 |         max_class = 1
125 |
126 | if features is None:
127 | return Tree(LEAF, Class=max_class)
128 |
129 | if len(features) == 0:
130 | return Tree(LEAF, Class=max_class)
131 |
132 |     # Step 3: compute the information gain ratio of each feature
133 | max_feature = 0
134 | max_grda = 0
135 |
136 | D = train_label
137 | HD = calc_ent(D)
138 | for feature in features:
139 | A = np.array(train_set[:, feature].flat)
140 | gda = HD - calc_condition_ent(A, D)
141 | had = calc_ent(A)
142 | grda = gda / had
143 |
144 | if grda > max_grda:
145 | max_grda, max_feature = grda, feature
146 |
147 |     # Step 4: if the best gain ratio is below the threshold epsilon
148 | if max_grda < epsilon:
149 | return Tree(LEAF, Class=max_class)
150 |
151 |     # Step 5: build the non-empty subsets
152 |     sub_features = [f for f in features if f != max_feature]  # list.remove() returns None, so build a new list instead
153 | tree = Tree(INTERNAL, feature=max_feature)
154 |
155 | feature_col = np.array(train_set[:, max_feature].flat)
156 | feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
157 | for feature_value in feature_value_list:
158 |
159 | index = []
160 | for i in range(len(train_label)):
161 | if train_set[i][max_feature] == feature_value:
162 | index.append(i)
163 |
164 | sub_train_set = train_set[index]
165 | sub_train_label = train_label[index]
166 |
167 | sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon)
168 | tree.add_tree(feature_value, sub_tree)
169 |
170 | return tree
171 |
172 |
173 | @log
174 | def train(train_set, train_label, features, epsilon):
175 | # print(features)
176 | return recurse_train(train_set, train_label, features, epsilon)
177 |
178 |
179 | @log
180 | def predict(test_set, tree):
181 | result = []
182 | for features in test_set:
183 | tmp_predict = tree.predict(features)
184 | result.append(tmp_predict)
185 | return np.array(result)
186 |
187 |
188 | if __name__ == '__main__':
189 | logger = logging.getLogger()
190 | logger.setLevel(logging.DEBUG)
191 |
192 | raw_data = pd.read_csv('../data/train_binary2.csv', header=0)
193 | data = raw_data.values
194 |
195 | images = data[0:, 1:]
196 | labels = data[:, 0]
197 |
198 |     # use 2/3 of the data for training and 1/3 for testing
199 | train_features, test_features, train_labels, test_labels = train_test_split(images, labels, test_size=0.33,
200 | random_state=1)
201 |
202 | print(train_features.shape)
203 | tree = train(train_features, train_labels, [i for i in range(99)], 0.1)
204 | test_predict = predict(test_features, tree)
205 | print(test_predict)
206 | score = accuracy_score(test_labels, test_predict)
207 |
208 | print("The accuracy score is ", score)
209 |
--------------------------------------------------------------------------------
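Step 3 above picks the feature with the largest information gain ratio, g_R(D, A) = (H(D) - H(D|A)) / H_A(D). A minimal sketch reusing calc_ent and calc_condition_ent from this file (the name gain_ratio is hypothetical; unlike the loop above, it also guards against H_A(D) = 0):

    def gain_ratio(feature_col, labels):
        gain = calc_ent(labels) - calc_condition_ent(feature_col, labels)
        split_info = calc_ent(feature_col)  # H_A(D)
        return gain / split_info if split_info > 0 else 0.0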
/StatisticalLearningMethod/chapter5/CART.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import time
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import accuracy_score
9 |
10 | total_class = 10
11 |
12 |
13 | # a fairly small data set is used here, because a larger one overflows the recursion stack
14 |
15 |
16 | def log(func):
17 | def wrapper(*args, **kwargs):
18 | start_time = time.time()
19 | logging.debug('start %s()' % func.__name__)
20 | ret = func(*args, **kwargs)
21 |
22 | end_time = time.time()
23 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
24 |
25 | return ret
26 |
27 | return wrapper
28 |
29 |
30 | # binarization
31 | def binaryzation(img):
32 | cv_img = img.astype(np.uint8)
33 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
34 | return cv_img
35 |
36 |
37 | @log
38 | def binaryzation_features(trainset):
39 | features = []
40 |
41 | for img in trainset:
42 | img = np.reshape(img, (28, 28))
43 | cv_img = img.astype(np.uint8)
44 |
45 | img_b = binaryzation(cv_img)
46 | features.append(img_b)
47 |
48 | features = np.array(features)
49 | features = np.reshape(features, (-1, 784))
50 |
51 | return features
52 |
53 |
54 | class TreeNode(object):
55 |     """decision tree node"""
56 |
57 | def __init__(self, **kwargs):
58 | '''
59 |         attr_index: feature index
60 |         attr: feature value
61 |         label: class (y)
62 |         left_child: left child node
63 |         right_child: right child node
64 | '''
65 | self.attr_index = kwargs.get('attr_index')
66 | self.attr = kwargs.get('attr')
67 | self.label = kwargs.get('label')
68 | self.left_child = kwargs.get('left_child')
69 | self.right_child = kwargs.get('right_child')
70 |
71 |
72 | # compute the Gini index of a data set
73 | def gini_train_set(train_label):
74 | train_label_value = set(train_label)
75 | gini = 0.0
76 | for i in train_label_value:
77 | train_label_temp = train_label[train_label == i]
78 | pk = float(len(train_label_temp)) / len(train_label)
79 | gini += pk * (1 - pk)
80 | return gini
81 |
82 |
83 | # compute the Gini index of every split point of one feature and return the smallest
84 | def gini_feature(train_feature, train_label):
85 | train_feature_value = set(train_feature)
86 | min_gini = float('inf')
87 | return_feature_value = 0
88 | for i in train_feature_value:
89 | train_feature_class1 = train_feature[train_feature == i]
90 | label_class1 = train_label[train_feature == i]
91 | # train_feature_class2 = train_feature[train_feature != i]
92 | label_class2 = train_label[train_feature != i]
93 | D1 = float(len(train_feature_class1)) / len(train_feature)
94 | D2 = 1 - D1
95 | if (len(label_class1) == 0):
96 | p1 = 0
97 | else:
98 | p1 = float(len(label_class1[label_class1 == label_class1[0]])) / len(label_class1)
99 | if (len(label_class2) == 0):
100 | p2 = 0
101 | else:
102 | p2 = float(len(label_class2[label_class2 == label_class2[0]])) / len(label_class2)
103 | gini = D1 * 2 * p1 * (1 - p1) + D2 * 2 * p2 * (1 - p2)
104 | if min_gini > gini:
105 | min_gini = gini
106 | return_feature_value = i
107 | return min_gini, return_feature_value
108 |
109 |
110 | def get_best_index(train_set, train_label, feature_indexes):
111 | '''
112 |     :param train_set: the given data set
113 |     :param train_label: labels of the data set
114 |     :return: the best split feature index and the best split value
115 |     find the best split point in the given candidate set and its corresponding split variable
116 | '''
117 | min_gini = float('inf')
118 | feature_index = 0
119 | return_feature_value = 0
120 | for i in range(len(train_set[0])):
121 | if i in feature_indexes:
122 | train_feature = train_set[:, i]
123 | gini, feature_value = gini_feature(train_feature, train_label)
124 | if gini < min_gini:
125 | min_gini = gini
126 | feature_index = i
127 | return_feature_value = feature_value
128 | return feature_index, return_feature_value
129 |
130 |
131 | # split the data set by the best feature and the best split value
132 | def divide_train_set(train_set, train_label, feature_index, feature_value):
133 | left = []
134 | right = []
135 | left_label = []
136 | right_label = []
137 | for i in range(len(train_set)):
138 | line = train_set[i]
139 | if line[feature_index] == feature_value:
140 | left.append(line)
141 | left_label.append(train_label[i])
142 | else:
143 | right.append(line)
144 | right_label.append(train_label[i])
145 | return np.array(left), np.array(right), np.array(left_label), np.array(right_label)
146 |
147 |
148 | @log
149 | def build_tree(train_set, train_label, feature_indexes):
150 |     # check whether a stopping condition is met
151 | train_label_value = set(train_label)
152 | if len(train_label_value) == 1:
153 | print("a")
154 | return TreeNode(label=train_label[0])
155 |
156 | if feature_indexes is None:
157 | print("b")
158 | return TreeNode(label=train_label[0])
159 |
160 | if len(feature_indexes) == 0:
161 | print("c")
162 | return TreeNode(label=train_label[0])
163 |
164 | feature_index, feature_value = get_best_index(train_set, train_label, feature_indexes)
165 | # print("feature_index",feature_index)
166 |
167 | left, right, left_label, right_label = divide_train_set(train_set, train_label, feature_index, feature_value)
168 |
169 |     sub_feature_indexes = [f for f in feature_indexes if f != feature_index]  # copy so sibling branches are not affected
170 |     # print("feature_indexes", sub_feature_indexes)
171 |
172 |     left_branch = build_tree(left, left_label, sub_feature_indexes)
173 |     right_branch = build_tree(right, right_label, sub_feature_indexes)
174 | return TreeNode(left_child=left_branch,
175 | right_child=right_branch,
176 | attr_index=feature_index,
177 | attr=feature_value)
178 |
179 | # @log
180 | # def prune(tree):
181 |
182 |
183 | def predict_one(node, test):
184 | while node.label is None:
185 | if test[node.attr_index] == node.attr:
186 | node = node.left_child
187 | else:
188 | node = node.right_child
189 | return node.label
190 |
191 |
192 | @log
193 | def predict(tree, test_set):
194 | result = []
195 | for test in test_set:
196 | label = predict_one(tree, test)
197 | result.append(label)
198 | return result
199 |
200 |
201 | if __name__ == '__main__':
202 | logger = logging.getLogger()
203 | logger.setLevel(logging.DEBUG)
204 |
205 | raw_data = pd.read_csv('../data/train_binary1.csv', header=0)
206 | data = raw_data.values
207 |
208 | imgs = data[0:, 1:]
209 | labels = data[:, 0]
210 |
211 | print(imgs.shape)
212 |
213 |     # binarize the images
214 | # features = binaryzation_features(imgs)
215 |
216 |     # use 2/3 of the data for training and 1/3 for testing
217 | train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33,
218 | random_state=23323)
219 |
220 | print(type(train_features))
221 | tree = build_tree(train_features, train_labels, [i for i in range(784)])
222 | test_predict = predict(tree, test_features)
223 | score = accuracy_score(test_labels, test_predict)
224 |
225 | print("The accuracy score is ", score)
226 |
--------------------------------------------------------------------------------
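gini_train_set above computes Gini(D) = sum_k p_k * (1 - p_k), which equals the more common form 1 - sum_k p_k^2. A vectorized sketch (assuming NumPy; the name gini is hypothetical):

    import numpy as np

    def gini(labels):
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return 1.0 - np.sum(p ** 2)  # identical to np.sum(p * (1 - p))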
/StatisticalLearningMethod/chapter5/ID3-1.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | import cv2
4 | import time
5 | import logging
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import accuracy_score
11 |
12 | total_class = 10
13 |
14 |
15 | def log(func):
16 | def wrapper(*args, **kwargs):
17 | start_time = time.time()
18 | logging.debug('start %s()' % func.__name__)
19 | ret = func(*args, **kwargs)
20 |
21 | end_time = time.time()
22 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
23 |
24 | return ret
25 |
26 | return wrapper
27 |
28 |
29 | # binarization
30 | def binaryzation(img):
31 | cv_img = img.astype(np.uint8)
32 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
33 | return cv_img
34 |
35 |
36 | @log
37 | def binaryzation_features(trainset):
38 | features = []
39 |
40 | for img in trainset:
41 | img = np.reshape(img, (28, 28))
42 | cv_img = img.astype(np.uint8)
43 |
44 | img_b = binaryzation(cv_img)
45 | # hog_feature = np.transpose(hog_feature)
46 | features.append(img_b)
47 |
48 | features = np.array(features)
49 | features = np.reshape(features, (-1, 784))
50 |
51 | return features
52 |
53 |
54 | class Tree(object):
55 | def __init__(self, node_type, Class=None, feature=None):
56 | self.node_type = node_type
57 | self.dict = {}
58 | self.Class = Class
59 | self.feature = feature
60 |
61 | def add_tree(self, val, tree):
62 | self.dict[val] = tree
63 |
64 | def predict(self, features):
65 | if self.node_type == 'leaf':
66 | return self.Class
67 |
68 | tree = self.dict[features[self.feature]]
69 | return tree.predict(features)
70 |
71 |
72 | def calc_ent(x):
73 | """
74 |     calculate the Shannon entropy of x
75 | """
76 |
77 | x_value_list = set([x[i] for i in range(x.shape[0])])
78 | ent = 0.0
79 | for x_value in x_value_list:
80 | p = float(x[x == x_value].shape[0]) / x.shape[0]
81 | logp = np.log2(p)
82 | ent -= p * logp
83 |
84 | return ent
85 |
86 |
87 | def calc_condition_ent(x, y):
88 | """
89 | calculate ent H(y|x)
90 | """
91 |
92 | # calc ent(y|x)
93 | x_value_list = set([x[i] for i in range(x.shape[0])])
94 | ent = 0.0
95 | for x_value in x_value_list:
96 | sub_y = y[x == x_value]
97 | temp_ent = calc_ent(sub_y)
98 | ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
99 |
100 | return ent
101 |
102 |
103 | def calc_ent_grap(x, y):
104 | """
105 |     calculate the information gain (entropy gap)
106 | """
107 |
108 | base_ent = calc_ent(y)
109 | condition_ent = calc_condition_ent(x, y)
110 | ent_grap = base_ent - condition_ent
111 |
112 | return ent_grap
113 |
114 |
115 | def recurse_train(train_set, train_label, features, epsilon):
116 | global total_class
117 |
118 | LEAF = 'leaf'
119 | INTERNAL = 'internal'
120 |
121 |     # Step 1: if all instances in train_set belong to the same class Ck
122 | label_set = set(train_label)
123 | if len(label_set) == 1:
124 | return Tree(LEAF, Class=label_set.pop())
125 |
126 |     # Step 2: if features is empty
127 | class_count0 = 0
128 | class_count1 = 0
129 |
130 | for i in range(len(train_label)):
131 | if (train_label[i] == 1):
132 | class_count1 += 1
133 | else:
134 | class_count0 += 1
135 |
136 |     if (class_count0 >= class_count1):
137 |         max_class = 0
138 |     else:
139 |         max_class = 1
140 |
141 | if features is None:
142 | return Tree(LEAF, Class=max_class)
143 |
144 | if len(features) == 0:
145 | return Tree(LEAF, Class=max_class)
146 |
147 |     # Step 3: compute the information gain of each feature
148 | max_feature = 0
149 | max_gda = 0
150 |
151 | D = train_label
152 | HD = calc_ent(D)
153 | for feature in features:
154 | A = np.array(train_set[:, feature].flat)
155 | gda = HD - calc_condition_ent(A, D)
156 |
157 | if gda > max_gda:
158 | max_gda, max_feature = gda, feature
159 |
160 |     # Step 4: if the best gain is below the threshold epsilon
161 | if max_gda < epsilon:
162 | return Tree(LEAF, Class=max_class)
163 |
164 |     # Step 5: build the non-empty subsets
165 |     sub_features = [f for f in features if f != max_feature]  # list.remove() returns None, so build a new list instead
166 | tree = Tree(INTERNAL, feature=max_feature)
167 |
168 | feature_col = np.array(train_set[:, max_feature].flat)
169 | feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
170 | for feature_value in feature_value_list:
171 |
172 | index = []
173 | for i in range(len(train_label)):
174 | if train_set[i][max_feature] == feature_value:
175 | index.append(i)
176 |
177 | sub_train_set = train_set[index]
178 | sub_train_label = train_label[index]
179 |
180 | sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon)
181 | tree.add_tree(feature_value, sub_tree)
182 |
183 | return tree
184 |
185 |
186 | @log
187 | def train(train_set, train_label, features, epsilon):
188 | return recurse_train(train_set, train_label, features, epsilon)
189 |
190 |
191 | @log
192 | def predict(test_set, tree):
193 | result = []
194 | for features in test_set:
195 | tmp_predict = tree.predict(features)
196 | result.append(tmp_predict)
197 | return np.array(result)
198 |
199 |
200 | if __name__ == '__main__':
201 | logger = logging.getLogger()
202 | logger.setLevel(logging.DEBUG)
203 |
204 | raw_data = pd.read_csv('../data/train.csv', header=0)
205 | data = raw_data.values
206 |
207 | imgs = data[0::, 1::]
208 | labels = data[::, 0]
209 |
210 |     # binarize the images
211 | features = binaryzation_features(imgs)
212 |
213 | # print(features)
214 |
215 | # aa=features.tolist()
216 | # print(aa)
217 |
218 |     # use 2/3 of the data for training and 1/3 for testing
219 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33,
220 | random_state=23323)
221 |
222 | tree = train(train_features, train_labels, [i for i in range(784)], 0.1)
223 | test_predict = predict(test_features, tree)
224 | score = accuracy_score(test_labels, test_predict)
225 |
226 | print("The accuracy score is ", score)
227 |
--------------------------------------------------------------------------------
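As a sanity baseline for the hand-rolled trees in this chapter (the comments in ID3.py below report unexplained low accuracy), a sketch using scikit-learn's decision tree on the same split (assuming the train_features / train_labels / test_features / test_labels arrays from the __main__ block above):

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    clf = DecisionTreeClassifier(criterion='entropy')  # entropy criterion gives ID3-style splits
    clf.fit(train_features, train_labels)
    print("baseline accuracy:", accuracy_score(test_labels, clf.predict(test_features)))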
/StatisticalLearningMethod/chapter5/ID3.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | import cv2
3 | import time
4 | import logging
5 | import numpy as np
6 | import pandas as pd
7 | import random
8 |
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import accuracy_score
11 |
12 | # adapted from someone else's implementation
13 | # issue 1: perhaps because binarization was skipped? the accuracy comes out very low, around 10%, worse than random guessing!
14 | # running the referenced code as-is (with binarization commented out) also gives very low accuracy, although the blog post reports 89%; I cannot explain it
15 | # issue 2: I think the referenced code computes the empirical conditional entropy incorrectly, so I replaced it with my own implementation
16 | # issue 3: tree.predict can raise a KeyError; as a workaround I check whether the key exists and return a random label otherwise; I have not found the cause
17 | # issue 4: as far as I can tell, issue 3 is rare and should barely affect the final result, yet the accuracy is still very low
18 | # issue 5: training on the original train_binary data set is very slow, so a smaller data set is used
19 | # despite all these issues, the key steps, such as the empirical entropy and empirical conditional entropy calculations and the steps of the ID3 algorithm, should be correct
20 |
21 | # update: it turns out binarization is not the problem! running the blogger's original code also gives very low accuracy, and I cannot explain why
22 |
23 | total_class = 10
24 |
25 |
26 | def log(func):
27 | def wrapper(*args, **kwargs):
28 | start_time = time.time()
29 | logging.debug('start %s()' % func.__name__)
30 | ret = func(*args, **kwargs)
31 |
32 | end_time = time.time()
33 | logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
34 |
35 | return ret
36 |
37 | return wrapper
38 |
39 |
40 | # binarization
41 | def binaryzation(img):
42 | cv_img = img.astype(np.uint8)
43 | cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY, cv_img)
44 | return cv_img
45 |
46 |
47 | @log
48 | def binaryzation_features(trainset):
49 | features = []
50 |
51 | for img in trainset:
52 | img = np.reshape(img, (10, 10))
53 | cv_img = img.astype(np.uint8)
54 |
55 | img_b = binaryzation(cv_img)
56 | # hog_feature = np.transpose(hog_feature)
57 | features.append(img_b)
58 |
59 | features = np.array(features)
60 | features = np.reshape(features, (-1, 100))
61 |
62 | return features
63 |
64 |
65 | class Tree(object):
66 | def __init__(self, node_type, Class=None, feature=None):
67 | self.node_type = node_type
68 | self.dict = {}
69 | self.Class = Class
70 | self.feature = feature
71 |
72 | def add_tree(self, val, tree):
73 | self.dict[val] = tree
74 |
75 | def predict(self, features):
76 | if self.node_type == 'leaf':
77 | return self.Class
78 | if (features[self.feature] in self.dict.keys()):
79 | tree = self.dict[features[self.feature]]
80 | else:
81 | if (self.Class is None):
82 | return random.randint(0, 1)
83 | else:
84 | return self.Class
85 | return tree.predict(features)
86 |
87 |
88 | def calc_ent(x):
89 | """
90 | calculate empirical entropy of x
91 | """
92 |
93 | x_value_list = set([x[i] for i in range(x.shape[0])])
94 | ent = 0.0
95 | for x_value in x_value_list:
96 | p = float(x[x == x_value].shape[0]) / x.shape[0]
97 | logp = np.log2(p)
98 | ent -= p * logp
99 |
100 | return ent
101 |
102 |
103 | def calc_condition_ent(train_feature, train_label):
104 | """
105 | calculate empirical entropy H(y|x)
106 | """
107 |
108 | # calc ent(y|x)
109 |
110 | ent = 0
111 | train_feature_set = set(train_feature)
112 | # print("train_feature_set", train_feature_set)
113 | for train_feature_value in train_feature_set:
114 | Di = train_feature[train_feature == train_feature_value]
115 | label_i = train_label[train_feature == train_feature_value]
116 | # print("Di", Di)
117 | train_label_set = set(train_label)
118 | temp = 0
119 | # print("train_label_set", train_label_set)
120 | for train_label_value in train_label_set:
121 | Dik = Di[label_i == train_label_value]
122 | # print(Dik)
123 | if (len(Dik) != 0):
124 | p = float(len(Dik)) / len(Di)
125 | logp = np.log2(p)
126 | temp -= p * logp
127 | ent += float(len(Di)) / len(train_feature) * temp
128 | return ent
129 |
130 |
131 | def recurse_train(train_set, train_label, features, epsilon):
132 | global total_class
133 |
134 | LEAF = 'leaf'
135 | INTERNAL = 'internal'
136 |
137 | # Step 1 -- if every instance in train_set belongs to the same class Ck
138 | label_set = set(train_label)
139 | # print(label_set)
140 | if len(label_set) == 1:
141 | return Tree(LEAF, Class=label_set.pop())
142 |
143 | # Step 2 -- if features is empty
144 |
145 | class_count0 = 0
146 | class_count1 = 0
147 |
148 | for i in range(len(train_label)):
149 | if (train_label[i] == 1):
150 | class_count1 += 1
151 | else:
152 | class_count0 += 1
153 |
154 | if (class_count0 >= class_count1):
155 | max_class = 0
156 | else:
157 | max_class = 1  # fixed: the else branch must pick class 1, not 0
158 |
159 | if features is None:
160 | return Tree(LEAF, Class=max_class)
161 |
162 | if len(features) == 0:
163 | return Tree(LEAF, Class=max_class)
164 |
165 | # Step 3 -- compute the information gain
166 | max_feature = 0
167 | max_gda = 0
168 |
169 | D = train_label
170 | HD = calc_ent(D)
171 | for feature in features:
172 | A = np.array(train_set[:, feature].flat)
173 | gda = HD - calc_condition_ent(A, D)
174 |
175 | if gda > max_gda:
176 | max_gda, max_feature = gda, feature
177 |
178 | # Step 4 -- maximum gain below the threshold epsilon
179 | if max_gda < epsilon:
180 | return Tree(LEAF, Class=max_class)
181 |
182 | # Step 5 -- build the non-empty subsets
183 | sub_features = [f for f in features if f != max_feature]  # list.remove() returns None; build a new list instead
184 | tree = Tree(INTERNAL, feature=max_feature)
185 |
186 | feature_col = np.array(train_set[:, max_feature].flat)
187 | feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
188 | for feature_value in feature_value_list:
189 |
190 | index = []
191 | for i in range(len(train_label)):
192 | if train_set[i][max_feature] == feature_value:
193 | index.append(i)
194 |
195 | sub_train_set = train_set[index]
196 | sub_train_label = train_label[index]
197 |
198 | sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features, epsilon)
199 | tree.add_tree(feature_value, sub_tree)
200 |
201 | return tree
202 |
203 |
204 | @log
205 | def train(train_set, train_label, features, epsilon):
206 | # print(features)
207 | return recurse_train(train_set, train_label, features, epsilon)
208 |
209 |
210 | @log
211 | def predict(test_set, tree):
212 | result = []
213 | for features in test_set:
214 | tmp_predict = tree.predict(features)
215 | result.append(tmp_predict)
216 | return np.array(result)
217 |
218 |
219 | if __name__ == '__main__':
220 | logger = logging.getLogger()
221 | logger.setLevel(logging.DEBUG)
222 |
223 | raw_data = pd.read_csv('../data/train_binary2.csv', header=0)
224 | data = raw_data.values
225 |
226 | images = data[0:, 1:]
227 | labels = data[:, 0]
228 |
229 | # binarize the images
230 | features = binaryzation_features(images)
231 |
232 | # use 2/3 of the data for training and 1/3 for testing
233 | train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33,
234 | random_state=1)
235 |
236 | # print(train_features.shape)
237 | tree = train(train_features, train_labels, [i for i in range(100)], 0.1)  # the 10x10 binarized images give 100 features; range(99) missed the last one
238 | test_predict = predict(test_features, tree)
239 | # print(test_predict)
240 | score = accuracy_score(test_labels, test_predict)
241 |
242 | print("The accuracy score is ", score)
243 |
--------------------------------------------------------------------------------
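A note on problem 2 above (the empirical conditional entropy): as a standalone cross-check, a minimal sketch of the two quantities recurse_train relies on -- the empirical entropy H(D) and the conditional entropy H(D|A), using the same log2 convention as calc_ent and calc_condition_ent -- could look like this (the toy arrays at the end are made up for illustration):

import numpy as np

def empirical_entropy(labels):
    # H(D) = -sum_k p_k * log2(p_k), where p_k is the relative frequency of class k
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def conditional_entropy(feature, labels):
    # H(D|A) = sum_i |D_i|/|D| * H(D_i), partitioning D by the values of feature A
    ent = 0.0
    for v in np.unique(feature):
        mask = (feature == v)
        ent += mask.mean() * empirical_entropy(labels[mask])
    return ent

# information gain g(D, A) = H(D) - H(D|A) on a toy example
A = np.array([0, 0, 1, 1, 1, 0])
D = np.array([1, 0, 1, 1, 0, 0])
print(empirical_entropy(D) - conditional_entropy(A, D))

Selecting, at every internal node, the feature with the largest gain g(D, A) is exactly what the loop over features in recurse_train does.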
/StatisticalLearningMethod/errata.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/StatisticalLearningMethod/errata.pdf
--------------------------------------------------------------------------------
/StatisticalLearningMethod/hog.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <opencv_storage>
3 | <hog type_id="opencv-object-detector-hog">
4 | <winSize>28 28</winSize>
5 | <blockSize>14 14</blockSize>
6 | <blockStride>7 7</blockStride>
7 | <cellSize>7 7</cellSize>
8 | <nbins>9</nbins>
9 | <derivAperture>1</derivAperture>
10 | <winSigma>4.</winSigma>
11 | <histogramNormType>0</histogramNormType>
12 | <L2HysThreshold>2.0000000000000001e-001</L2HysThreshold>
13 | <gammaCorrection>1</gammaCorrection>
14 | <nlevels>64</nlevels>
15 | </hog>
16 | </opencv_storage>
--------------------------------------------------------------------------------
/tensorflow/course/data/fire_theft.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/tensorflow/course/data/fire_theft.xls
--------------------------------------------------------------------------------
/tensorflow/course/feed.py:
--------------------------------------------------------------------------------
1 | # The examples above introduce tensors into the computation graph stored as constants or variables. TensorFlow also provides a feed mechanism,
2 | # which can temporarily substitute the tensor of any operation in the graph -- that is, patch any operation by injecting a tensor directly.
3 | # A feed temporarily replaces the output of an operation with a tensor value, supplied as an argument to the run() call.
4 | # The feed is only valid within that call; when the call returns, the feed disappears.
5 | # The most common use case is to designate certain operations as "feed" operations, marked by creating placeholders for them with tf.placeholder().
6 | import tensorflow as tf
7 |
8 | input1 = tf.placeholder(tf.float32)
9 | input2 = tf.placeholder(tf.float32)
10 | output = tf.multiply(input1, input2)
11 |
12 | with tf.Session() as sess:
13 | print(sess.run([output], feed_dict={input1: [7.], input2: [2.]}))
14 |
15 | # Output:
16 | # [array([ 14.], dtype=float32)]
17 |
--------------------------------------------------------------------------------
/tensorflow/course/fetch.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | input1 = tf.constant(3.0)
4 | input2 = tf.constant(2.0)
5 | input3 = tf.constant(5.0)
6 | intermed = tf.add(input2, input3)
7 | mul = tf.multiply(input1, intermed)
8 |
9 | with tf.Session() as sess:
10 | result = sess.run([mul, intermed])
11 | print(result)
12 |
13 | # Output:
14 | # [array([ 21.], dtype=float32), array([ 7.], dtype=float32)]
15 |
--------------------------------------------------------------------------------
/tensorflow/course/graph.py:
--------------------------------------------------------------------------------
1 | # The first step in building a graph is to create source ops. A source op needs no input -- a Constant, for example -- and its output is passed to other ops for computation.
2 | #
3 | # In the Python library, an op constructor's return value represents the output of the constructed op, and these return values can be passed to other op constructors as inputs.
4 |
5 | import tensorflow as tf
6 |
7 | # Create a constant op that produces a 1x2 matrix. The op is added as a node
8 | # to the default graph.
9 | #
10 | # The constructor's return value represents the output of the constant op.
11 | matrix1 = tf.constant([[3., 3.]])
12 |
13 | # Create another constant op that produces a 2x1 matrix.
14 | matrix2 = tf.constant([[2.], [2.]])
15 |
16 | # Create a matmul op that takes 'matrix1' and 'matrix2' as inputs.
17 | # The return value 'product' represents the result of the matrix multiplication.
18 | product = tf.matmul(matrix1, matrix2)
19 |
20 | # The default graph now has three nodes: two constant() ops and one matmul() op. To actually multiply the matrices and obtain the result, you must launch the graph in a session.
21 |
22 | # The graph can be launched only after construction is complete. The first step is to create a Session object; with no constructor arguments, the session launches the default graph.
23 |
24 | # Launch the default graph.
25 | sess = tf.Session()
26 |
27 | # Call the session's 'run()' method to execute the matmul op, passing 'product' as the argument.
28 | # As noted above, 'product' represents the output of the matmul op, and passing it tells the
29 | # method that we want the matmul op's output back.
30 | #
31 | # Execution is fully automatic: the session supplies all the inputs the ops need. Ops are usually executed in parallel.
32 | #
33 | # The call 'run(product)' triggers the execution of all three ops in the graph (two constant ops and one matmul op).
34 | #
35 | # The return value 'result' is a numpy `ndarray` object.
36 | result = sess.run(product)
37 | print(result)
38 | # ==> [[ 12.]]
39 |
40 | # Done -- close the session.
41 | sess.close()
42 |
--------------------------------------------------------------------------------
/tensorflow/course/interactiveSession.py:
--------------------------------------------------------------------------------
1 | # For ease of use in interactive Python environments such as IPython,
2 | # InteractiveSession can be used instead of the Session class, with Tensor.eval() and Operation.run() in place of Session.run().
3 | # This avoids having to keep a variable that holds the session.
4 |
5 | # Enter an interactive TensorFlow session.
6 | import tensorflow as tf
7 |
8 | sess = tf.InteractiveSession()
9 |
10 | x = tf.Variable([1.0, 2.0])
11 | a = tf.constant([3.0, 3.0])
12 |
13 | # Initialize 'x' using the run() method of its initializer op
14 | x.initializer.run()
15 |
16 | # Add a sub op that subtracts 'a' from 'x'. Run the sub op and print the result
17 | sub = tf.subtract(x, a)
18 | print(sub.eval())
19 | # ==> [-2. -1.]
20 |
--------------------------------------------------------------------------------
/tensorflow/course/linearRegression.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import tensorflow as tf
5 | import xlrd
6 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
7 |
8 |
9 | DATA_FILE = 'data/fire_theft.xls'
10 |
11 | # Phase 1: Assemble the graph
12 | # Step 1: read in data from the .xls file
13 | book = xlrd.open_workbook(DATA_FILE, encoding_override='utf-8')
14 | sheet = book.sheet_by_index(0)
15 | data = np.asarray(
16 | [sheet.row_values(i) for i in range(1, sheet.nrows)], dtype=np.float32)
17 | n_samples = sheet.nrows - 1
18 |
19 | # Step 2: create placeholders for input X (number of fire) and label Y (number of theft)
20 | # Both have the type float32
21 | X = tf.placeholder(tf.float32, shape=[], name='input')
22 | Y = tf.placeholder(tf.float32, shape=[], name='label')
23 | # Step 3: create weight and bias, initialized to 0
24 | # name your variables w and b
25 | w = tf.get_variable(
26 | 'weight', shape=[], initializer=tf.truncated_normal_initializer())
27 | b = tf.get_variable('bias', shape=[], initializer=tf.zeros_initializer())
28 | # Step 4: predict Y (number of theft) from the number of fire
29 | # name your variable Y_predicted
30 | Y_predicted = w * X + b
31 | # Step 5: use the square error as the loss function
32 | # name your variable loss
33 | loss = tf.square(Y - Y_predicted, name='loss')
34 |
35 |
36 | def huber_loss(labels, predictions, delta=1.0):
37 | residual = tf.abs(predictions - labels)
38 | condition = tf.less(residual, delta)
39 | small_res = 0.5 * residual ** 2
40 | large_res = delta * residual - 0.5 * delta ** 2
41 | return tf.where(condition, small_res, large_res)
42 |
43 |
44 | h_loss = huber_loss(Y, Y_predicted)
45 | # Step 6: use gradient descent with a learning rate of 1e-3 (as configured below) to minimize loss
46 | optimizer = tf.train.GradientDescentOptimizer(
47 | learning_rate=1e-3).minimize(loss)
48 |
49 | # Phase 2: Train our model
50 | init = tf.global_variables_initializer()
51 | with tf.Session() as sess:
52 | # Step 7: initialize the necessary variables, in this case, w and b
53 | writer = tf.summary.FileWriter('./linear_log', graph=sess.graph)
54 | sess.run(init)
55 | # Step 8: train the model
56 | for i in range(100):
57 | total_loss = 0
58 | for x, y in data:
59 | # The session runs the optimizer (which minimizes the squared loss) and fetches the Huber loss for reporting; name the received value l
60 | _, l = sess.run([optimizer, h_loss], feed_dict={X: x, Y: y})
61 | total_loss += l
62 | print("Epoch {0}: {1}".format(i, total_loss / n_samples))
63 | w, b = sess.run([w, b])
64 | writer.close()
65 | # plot the results
66 | X, Y = data.T[0], data.T[1]
67 | plt.plot(X, Y, 'bo', label='Real data')
68 | plt.plot(X, X * w + b, 'r', label='Predicted data')
69 | plt.legend()
70 | plt.show()
71 |
--------------------------------------------------------------------------------
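The huber_loss helper in linearRegression.py above implements the standard Huber loss, which is quadratic for small residuals and linear for large ones. In the code's notation, with residual r = |predictions - labels| and threshold delta, tf.where selects between the two branches of

L_\delta(r) = \begin{cases} \tfrac{1}{2} r^2 & r < \delta \\ \delta r - \tfrac{1}{2}\delta^2 & r \ge \delta \end{cases}

which matches small_res and large_res exactly; the two branches agree in value and slope at r = delta, so the loss stays continuous and once-differentiable there. Note that the optimizer above is still built on the squared loss; h_loss is only fetched for reporting during training.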
/tensorflow/course/load.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import numpy as np
3 | import tensorflow as tf
4 |
5 | v = tf.Variable(initial_value=[1, 2])
6 | init = tf.global_variables_initializer()
7 |
8 | with tf.Session() as sess:
9 | sess.run(init)
10 | # pass the session into the function explicitly
11 | v.load(value=[3, 4], session=sess)
12 | print(v.eval(session=sess))
13 |
--------------------------------------------------------------------------------
/tensorflow/course/logisticRegression.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import tensorflow as tf
4 | from tensorflow.examples.tutorials.mnist import input_data
5 | import time
6 |
7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
8 |
9 | # Define parameters for the model
10 | learning_rate = 0.01
11 | batch_size = 128
12 | n_epochs = 10
13 |
14 | # Step 1: Read in data
15 | # using TF Learn's built in function to load MNIST data to the folder data/mnist
16 | mnist = input_data.read_data_sets('./data/mnist', one_hot=True)
17 |
18 | # Step 2: create placeholders for features and labels
19 | # each image in the MNIST data is of shape 28*28 = 784
20 | # therefore, each image is represented with a 1x784 tensor
21 | # there are 10 classes, corresponding to digits 0 - 9.
22 | # Features are of the type float, and labels are of the type int
23 | x = tf.placeholder(tf.float32, shape=[None, 784], name='image')
24 | y = tf.placeholder(tf.int32, shape=[None, 10], name='label')
25 |
26 | # Step 3: create weights and bias
27 | # weights and biases are initialized to 0
28 | # shape of w depends on the dimension of X and Y so that Y = X * w + b
29 | # shape of b depends on Y
30 |
31 | w = tf.get_variable(
32 | 'weight', shape=[784, 10], initializer=tf.truncated_normal_initializer())
33 | b = tf.get_variable('bias', shape=[10], initializer=tf.zeros_initializer())
34 |
35 | # Step 4: build model
36 | # the model that returns the logits.
37 | # this logits will be later passed through softmax layer
38 | # to get the probability distribution of possible label of the image
39 | # DO NOT DO SOFTMAX HERE
40 | logits = tf.matmul(x, w) + b
41 | # Step 5: define loss function
42 | # use cross entropy loss of the real labels with the softmax of logits
43 | # use the method:
44 | entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
45 | # then use tf.reduce_mean to get the mean loss of the batch
46 | loss = tf.reduce_mean(entropy, axis=0)
47 | # test model
48 | preds = tf.nn.softmax(logits)
49 | correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(y, 1))
50 | accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32), axis=0)
51 |
52 | # Step 6: define training op
53 | # using gradient descent to minimize loss
54 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
55 |
56 | with tf.Session() as sess:
57 | writer = tf.summary.FileWriter('./logistic_log', sess.graph)
58 | start_time = time.time()
59 | sess.run(tf.global_variables_initializer())
60 | n_batches = int(mnist.train.num_examples / batch_size)
61 | for i in range(n_epochs): # train the model n_epochs times
62 | total_loss = 0
63 | for _ in range(n_batches):
64 | X_batch, Y_batch = mnist.train.next_batch(batch_size)
65 | _, loss_batch = sess.run(
66 | [optimizer, loss], feed_dict={x: X_batch,
67 | y: Y_batch})
68 | total_loss += loss_batch
69 | print('Average loss epoch {0}: {1}'.format(i, total_loss / n_batches))
70 |
71 | print('Total time: {0} seconds'.format(time.time() - start_time))
72 |
73 | print('Optimization Finished!')  # the average loss should be around 0.35 after 25 epochs
74 |
75 | # test the model
76 | n_batches = int(mnist.test.num_examples / batch_size)
77 | total_correct_preds = 0
78 |
79 | for i in range(n_batches):
80 | X_batch, Y_batch = mnist.test.next_batch(batch_size)
81 | accuracy_batch = sess.run(accuracy, feed_dict={x: X_batch, y: Y_batch})
82 | total_correct_preds += accuracy_batch
83 |
84 | print('Accuracy {0}'.format(total_correct_preds / mnist.test.num_examples))
85 |
--------------------------------------------------------------------------------
/tensorflow/course/random.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | # Create a tensor of shape [2, 3] consisting of random normal values, with mean -1 and standard deviation 4.
4 | norm = tf.random_normal([2, 3], mean=-1, stddev=4)
5 |
6 | # Shuffle the first dimension of a tensor
7 | c = tf.constant([[1, 2], [3, 4], [5, 6]])
8 | shuff = tf.random_shuffle(c)
9 |
10 | # Each time we run these ops, different results are generated
11 | sess = tf.Session()
12 | print(sess.run(norm))
13 | print(sess.run(norm))
14 |
15 | # Set an op-level seed to generate repeatable sequences across sessions.
16 | norm = tf.random_normal([2, 3], seed=1234)
17 | sess = tf.Session()
18 | print(sess.run(norm))
19 | print(sess.run(norm))
20 | sess = tf.Session()
21 | print(sess.run(norm))
22 | print(sess.run(norm))
23 |
--------------------------------------------------------------------------------
/tensorflow/course/shape.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 |
3 | import tensorflow as tf
4 |
5 | import numpy as np
6 |
7 | batch_dim_1 = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]])
8 |
9 | print("batch_dim:\n", batch_dim_1)
10 |
11 | batch_dim_2 = np.array([[3, 4, 5, 6], [9, 10, 11, 12], [13, 14, 15, 16]])
12 |
13 | print("batch_dim:\n", batch_dim_2)
14 |
15 | graph = tf.Graph()
16 |
17 | with graph.as_default():
18 | a = tf.Variable(initial_value=batch_dim_1)
19 |
20 | b = tf.Variable(initial_value=batch_dim_2)
21 |
22 | result = (a, b)
23 |
24 | print("result:", result)
25 |
26 | result = tf.concat(values=[a, b], axis=0)
27 |
28 | print(result)
29 |
30 | result2 = tf.reshape(tensor=result, shape=(2, 3, -1))
31 |
32 | print("result2:", result2)
33 |
34 | result3 = tf.transpose(a=result2, perm=(1, 0, 2))
35 |
36 | print("result3:", result3)
37 |
38 | shape = result3.get_shape().as_list()
39 |
40 | print(shape)
41 |
42 | init = tf.global_variables_initializer()
43 |
44 | with tf.Session(graph=graph) as sess:
45 | sess.run(init)
46 |
47 | print("result:\n", sess.run(result))
48 |
49 | print("result2:\n", sess.run(result2))
50 |
51 | print("result3:\n", sess.run(result3))
52 |
53 | # define graph
54 | graph = tf.Graph()
55 | with graph.as_default():
56 | c1 = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=tf.float32, name="c1")
57 | c2 = tf.constant([1, 2, 3, 4, 5, 6], dtype=tf.float32, name="c2")
58 | c3 = tf.random_normal(shape=(3, 2, 3))
59 | shape_c1 = tf.shape(c1)
60 | # shape_nc1=tf.shape_n(c1)
61 | shape_c2 = tf.shape(c2)
62 | shape_c3 = tf.shape(c3)
63 |
64 | # run graph
65 | with tf.Session(graph=graph) as sess:
66 | _shape_c1, _shape_c2, _shape_c3, c3 = sess.run([shape_c1, shape_c2, shape_c3, c3])
67 | print("shape of c1:", _shape_c1)
68 | # print ("shape of n_c1:",_shape_nc1)
69 | print("c3:", c3)
70 |
71 | # size test
72 | size = sess.run(tf.size(c3))
73 | print("size of c3:", size)
74 |
75 | # rank test
76 | rank = sess.run(tf.rank(c3))
77 | print("rank of c3:", rank)
78 |
--------------------------------------------------------------------------------
/tensorflow/course/test.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | xs = tf.placeholder(tf.float32, [None, 1], name="xs")
4 | ys = tf.placeholder(tf.float32, [None, 1], name="ys")
5 | Weights1 = tf.Variable(tf.constant([[0.0004]]), dtype=tf.float32, name="Weights1")
6 | Biases1 = tf.Variable(tf.zeros([1, 1]) + 0.1, dtype=tf.float32, name="Biases1")
7 | Wx_plus_b1 = tf.add(tf.matmul(xs, Weights1, name="matmul"), Biases1, name="add")
8 | l1 = tf.nn.sigmoid(Wx_plus_b1)
9 |
10 | Weights2 = tf.Variable(tf.constant([[10000.0]]), dtype=tf.float32, name="Weights2")
11 | Biases2 = tf.Variable(tf.constant(-4999.5), dtype=tf.float32, name="Biases2")
12 | Wx_plus_b2 = tf.add(tf.matmul(l1, Weights2, name="matmul"), Biases2, name="add")
13 |
14 | prediction = Wx_plus_b2
15 |
16 | loss = tf.reduce_mean(tf.square(tf.subtract(ys, prediction, name="Sub"), name="Square"), name="ReduceMean")
17 |
18 | # tf.train.GradientDescentOptimizer implements the gradient-descent optimization algorithm
19 | train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss,
20 | name="minimize")
21 | init_op = tf.global_variables_initializer()
22 | with tf.Session() as sess:
23 | sess.run(init_op)
24 | writer = tf.summary.FileWriter('./graphs', sess.graph)
25 | for i in range(1000):
26 | a, b, c, d, e, f = sess.run([train_step, loss, Weights1, Biases1, Weights2, Biases2],
27 | feed_dict={xs: [[i], [i]], ys: [[i], [i]]})
28 | print(b)
29 | print(c)
30 | print(d)
31 | print(e)
32 | print(f)
33 | print("22222")
34 | writer.close()
35 |
--------------------------------------------------------------------------------
/tensorflow/course/testt.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | a = tf.constant(2)
3 | b = tf.constant(3)
4 | x = tf.add(a, b)
5 | with tf.Session() as sess:
6 | writer = tf.summary.FileWriter('./graphs', sess.graph)
7 | print(sess.run(x))
8 | writer.close() # close the writer when you’re done using it
--------------------------------------------------------------------------------
/tensorflow/course/variable.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | # Create a variable initialized to the scalar 0.
4 | state = tf.Variable(0, name="counter")
5 |
6 | # Create an op whose effect is to increment 'state' by 1
7 |
8 | one = tf.constant(1)
9 | new_value = tf.add(state, one)
10 | update = tf.assign(state, new_value)
11 |
12 | # After the graph is launched, a variable must first be initialized by an `init` op,
13 | # so an `init` op must first be added to the graph.
14 | # init_op = tf.initialize_all_variables()
15 | init_op = tf.global_variables_initializer()
16 |
17 | # Launch the graph and run the ops
18 | with tf.Session() as sess:
19 | # Run the 'init' op
20 | sess.run(init_op)
21 | # Print the initial value of 'state'
22 | print(sess.run(state))
23 | # Run the op that updates 'state', then print 'state'
24 | for _ in range(3):
25 | sess.run(update)
26 | print(sess.run(state))
27 |
28 | # Output:
29 |
30 | # 0
31 | # 1
32 | # 2
33 | # 3
34 |
--------------------------------------------------------------------------------
/watermelon/ch3/3.3/data/watermelon_3a.csv:
--------------------------------------------------------------------------------
1 | 1,0.697,0.46,1
2 | 2,0.774,0.376,1
3 | 3,0.634,0.264,1
4 | 4,0.608,0.318,1
5 | 5,0.556,0.215,1
6 | 6,0.403,0.237,1
7 | 7,0.481,0.149,1
8 | 8,0.437,0.211,1
9 | 9,0.666,0.091,0
10 | 10,0.243,0.267,0
11 | 11,0.245,0.057,0
12 | 12,0.343,0.099,0
13 | 13,0.639,0.161,0
14 | 14,0.657,0.198,0
15 | 15,0.36,0.37,0
16 | 16,0.593,0.042,0
17 | 17,0.719,0.103,0
18 |
--------------------------------------------------------------------------------
/watermelon/ch3/3.3/logistic_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import model_selection
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn import metrics
6 | import matplotlib.pylab as pl
7 | import self_def
8 |
9 |
10 | # load the CSV file as a numpy matrix
11 | dataSet = np.loadtxt('data/watermelon_3a.csv', delimiter=",")
12 |
13 | # separate the data from the target attributes
14 | X = dataSet[:, 1:3]
15 | y = dataSet[:, 3]
16 |
17 | # draw scatter diagram to show the raw data
18 | f1 = plt.figure(1)
19 | plt.title('watermelon_3a')
20 | plt.xlabel('density')
21 | plt.ylabel('ratio_sugar')
22 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad')
23 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good')
24 | plt.legend(loc='upper right')
25 | plt.show()
26 |
27 | '''
28 | using sklearn lib for logistic regression
29 | '''
30 |
31 | # generate the train and test splits
32 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=0)
33 |
34 | # model training
35 | log_model = LogisticRegression()
36 | log_model.fit(X_train, y_train)
37 |
38 | # model testing
39 | y_pred = log_model.predict(X_test)
40 |
41 | # summarize the accuracy of fitting
42 | print(metrics.confusion_matrix(y_test, y_pred))
43 | print(metrics.classification_report(y_test, y_pred))
44 |
45 | precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
46 |
47 | # show decision boundary in plt
48 | # X - some data in 2dimensional np.array
49 | f2 = plt.figure(2)
50 | h = 0.001
51 | x0_min, x0_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
52 | x1_min, x1_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
53 | x0, x1 = np.meshgrid(np.arange(x0_min, x0_max, h),
54 | np.arange(x1_min, x1_max, h))
55 |
56 | # here "model" is your model's prediction (classification) function
57 | z = log_model.predict(np.c_[x0.ravel(), x1.ravel()])
58 |
59 | # Put the result into a color plot
60 | z = z.reshape(x0.shape)
61 | plt.contourf(x0, x1, z, cmap=pl.cm.Paired)
62 |
63 | # Plot also the training points
64 | plt.title('watermelon_3a')
65 | plt.xlabel('density')
66 | plt.ylabel('ratio_sugar')
67 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad')
68 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good')
69 | # plt.show()
70 |
71 | '''
72 | implementation of logistic regression from scratch
73 | '''
74 |
75 | # X_train, X_test, y_train, y_test
76 | # np.ones(n)
77 | m, n = np.shape(X)
78 | X_ex = np.c_[X, np.ones(m)] # extend the variable matrix to [x, 1]
79 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X_ex, y, test_size=0.5, random_state=0)
80 |
81 | # use gradient descent to get the optimal parameter beta = [w, b] from page 59
82 | beta = self_def.gradDscent_2(X_train, y_train)
83 |
84 | # prediction, beta mapping to the model
85 | y_pred = self_def.predict(X_test, beta)
86 |
87 | m_test = np.shape(X_test)[0]
88 | # calculation of confusion_matrix and prediction accuracy
89 | cfmat = np.zeros((2, 2))
90 | for i in range(m_test):
91 | if y_pred[i] == y_test[i] == 0:
92 | cfmat[0, 0] += 1
93 | elif y_pred[i] == y_test[i] == 1:
94 | cfmat[1, 1] += 1
95 | elif y_pred[i] == 0:
96 | cfmat[1, 0] += 1
97 | elif y_pred[i] == 1:
98 | cfmat[0, 1] += 1
99 |
100 | print(cfmat)
101 |
--------------------------------------------------------------------------------
/watermelon/ch3/3.3/self_def.py:
--------------------------------------------------------------------------------
1 | # object likelihood function
2 | import numpy as np
3 |
4 |
5 | def likelihood_sub(x, y, beta):
6 | '''
7 | @param x: one sample's feature vector
8 | @param y: one sample label
9 | @param beta: the parameter vector in 3.27
10 | @return: the sub_log-likelihood of 3.27
11 | '''
12 | return -y * np.dot(beta, x.T) + np.math.log(1 + np.math.exp(np.dot(beta, x.T)))
13 |
14 |
15 | def likelihood(X, y, beta):
16 | '''
17 | @param X: the sample variables matrix
18 | @param y: the sample label matrix
19 | @param beta: the parameter vector in 3.27
20 | @return: the log-likelihood of 3.27
21 | '''
22 | sum = 0
23 | m, n = np.shape(X)
24 |
25 | for i in range(m):
26 | sum += likelihood_sub(X[i], y[i], beta)
27 |
28 | return sum
29 |
30 |
31 | def partial_derivative(X, y, beta): # refer to 3.30 on book page 60
32 | '''
33 | @param X: the sample variables matrix
34 | @param y: the sample label matrix
35 | @param beta: the parameter vector in 3.27
36 | @return: the partial derivative of beta [j]
37 | '''
38 |
39 | m, n = np.shape(X)
40 | pd = np.zeros(n)
41 |
42 | for i in range(m):
43 | tmp = y[i] - sigmoid(X[i], beta)
44 | for j in range(n):
45 | pd[j] += X[i][j] * (tmp)
46 | return pd
47 |
48 |
49 | def gradDscent_1(X, y):  # implementation of the basic (batch) gradient-descent algorithm
50 | '''
51 | @param X: X is the variable matrix
52 | @param y: y is the label array
53 | @return: the best parameter estimate of 3.27
54 | '''
55 | import matplotlib.pyplot as plt
56 |
57 | h = 0.1 # step length of iterator
58 | max_times = 500 # give the iterative times limit
59 | m, n = np.shape(X)
60 |
61 | b = np.zeros((n, max_times))  # to record the convergence curves of the parameters
62 | beta = np.zeros(n) # parameter and initial
63 | delta_beta = np.ones(n) * h
64 | llh = 0
65 | llh_temp = 0
66 |
67 | for i in range(max_times):
68 | beta_temp = beta.copy()  # take a copy; a plain assignment aliases beta, making the restore below a no-op
69 |
70 | for j in range(n):
71 | # for partial derivative
72 | beta[j] += delta_beta[j]
73 | llh_tmp = likelihood(X, y, beta)
74 | delta_beta[j] = -h * (llh_tmp - llh) / delta_beta[j]
75 |
76 | b[j, i] = beta[j]
77 |
78 | beta[j] = beta_temp[j]
79 |
80 | beta += delta_beta
81 | llh = likelihood(X, y, beta)
82 |
83 | t = np.arange(max_times)
84 |
85 | f2 = plt.figure(3)
86 |
87 | p1 = plt.subplot(311)
88 | p1.plot(t, b[0])
89 | plt.ylabel('w1')
90 |
91 | p2 = plt.subplot(312)
92 | p2.plot(t, b[1])
93 | plt.ylabel('w2')
94 |
95 | p3 = plt.subplot(313)
96 | p3.plot(t, b[2])
97 | plt.ylabel('b')
98 |
99 | plt.show()
100 | return beta
101 |
102 |
103 | def gradDscent_2(X, y):  # implementation of the stochastic gradient-descent algorithm
104 | '''
105 | @param X: X is the variable matrix
106 | @param y: y is the label array
107 | @return: the best parameter estimate of 3.27
108 | '''
109 | import matplotlib.pyplot as plt
110 |
111 | m, n = np.shape(X)
112 | h = 0.5 # step length of iterator and initial
113 | beta = np.zeros(n) # parameter and initial
114 | delta_beta = np.ones(n) * h
115 | llh = 0
116 | llh_temp = 0
117 | b = np.zeros((n, m))  # to record the convergence curves of the parameters
118 |
119 | for i in range(m):
120 | beta_temp = beta.copy()  # take a copy; a plain assignment aliases beta, making the restore below a no-op
121 |
122 | for j in range(n):
123 | # for partial derivative
124 | h = 0.5 * 1 / (1 + i + j) # change step length of iterator
125 | beta[j] += delta_beta[j]
126 |
127 | b[j, i] = beta[j]
128 |
129 | llh_tmp = likelihood_sub(X[i], y[i], beta)
130 | delta_beta[j] = -h * (llh_tmp - llh) / delta_beta[j]
131 |
132 | beta[j] = beta_temp[j]
133 |
134 | beta += delta_beta
135 | llh = likelihood_sub(X[i], y[i], beta)
136 |
137 | t = np.arange(m)
138 |
139 | f2 = plt.figure(3)
140 |
141 | p1 = plt.subplot(311)
142 | p1.plot(t, b[0])
143 | plt.ylabel('w1')
144 |
145 | p2 = plt.subplot(312)
146 | p2.plot(t, b[1])
147 | plt.ylabel('w2')
148 |
149 | p3 = plt.subplot(313)
150 | p3.plot(t, b[2])
151 | plt.ylabel('b')
152 |
153 | plt.show()
154 |
155 | return beta
156 |
157 |
158 | def sigmoid(x, beta):
159 | '''
160 | @param x: is the predict variable
161 | @param beta: is the parameter
162 | @return: the sigmoid function value
163 | '''
164 | return 1.0 / (1 + np.math.exp(- np.dot(beta, x.T)))
165 |
166 |
167 | def predict(X, beta):
168 | '''
169 | predict the class label using sigmoid
170 | @param X: data samples of the form [x, 1]
171 | @param beta: the sigmoid parameter of the form [w, b]
172 | @return: the class label array
173 | '''
174 | m, n = np.shape(X)
175 | y = np.zeros(m)
176 |
177 | for i in range(m):
178 | if sigmoid(X[i], beta) > 0.5: y[i] = 1
179 | return y
180 |
--------------------------------------------------------------------------------
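For reference, the quantity that likelihood_sub and likelihood in self_def.py evaluate -- and that the two gradDscent_* routines try to decrease -- is the negative log-likelihood of eq. (3.27) in the book. Writing each extended sample as \hat{x}_i = (x_i; 1) and \beta = (w; b),

\ell(\beta) = \sum_{i=1}^{m} \left( -y_i \beta^{\mathrm{T}} \hat{x}_i + \ln\!\left( 1 + e^{\beta^{\mathrm{T}} \hat{x}_i} \right) \right),

and its gradient, eq. (3.30), is

\frac{\partial \ell}{\partial \beta} = -\sum_{i=1}^{m} \hat{x}_i \left( y_i - \sigma(\beta^{\mathrm{T}} \hat{x}_i) \right),

with \sigma the sigmoid function. partial_derivative accumulates \hat{x}_i ( y_i - \sigma(\beta^{\mathrm{T}} \hat{x}_i) ), i.e. the negative of this gradient, which is a descent direction for \ell.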
/watermelon/ch3/3.4/cross_validation.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import seaborn as sns
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn import metrics
6 | from sklearn.model_selection import cross_val_predict
7 |
8 | '''
9 | 1st: iris data set import and visualization using seaborn
10 | '''
11 |
12 | sns.set(style="white", color_codes=True)
13 | iris = sns.load_dataset("iris")
14 | X = iris.values[50:150, 0:4]
15 | y = iris.values[50:150, 4]
16 |
17 | # iris.plot(kind="scatter", x="sepal_length", y="sepal_width")
18 | # sns.pairplot(iris,hue='species')
19 | # sns.plt.show()
20 |
21 | '''
22 | 2nd: logistic regression using sklearn
23 | '''
24 |
25 |
26 | # log-regression lib model
27 | log_model = LogisticRegression()
28 | m = np.shape(X)[0]
29 |
30 | # 10-fold cross-validation
31 | y_pred = cross_val_predict(log_model, X, y, cv=10)
32 | print(metrics.accuracy_score(y, y_pred))
33 |
34 | # LOOCV (leave-one-out cross-validation)
35 | from sklearn.model_selection import LeaveOneOut
36 |
37 | loo = LeaveOneOut()
38 | accuracy = 0
39 | for train, test in loo.split(X):
40 | log_model.fit(X[train], y[train]) # fitting
41 | y_p = log_model.predict(X[test])
42 | if y_p == y[test]: accuracy += 1
43 | print(accuracy / np.shape(X)[0])
44 |
45 | '''
46 | transfusion-blood data set analysis
47 | '''
48 |
49 | dataset_transfusion = np.loadtxt('data/transfusion.data', delimiter=",", skiprows=1)
50 | X2 = dataset_transfusion[:, 0:4]
51 | y2 = dataset_transfusion[:, 4]
52 |
53 |
54 | # log-regression lib model
55 | log_model = LogisticRegression()
56 | m = np.shape(X2)[0]
57 |
58 | # 10-folds CV
59 | y2_pred = cross_val_predict(log_model, X2, y2, cv=10)
60 | print(metrics.accuracy_score(y2, y2_pred))
61 |
62 | # LOOCV
63 | # from sklearn.model_selection import LeaveOneOut
64 | loo = LeaveOneOut()
65 | accuracy = 0
66 | for train, test in loo.split(X2):
67 | log_model.fit(X2[train], y2[train]) # fitting
68 | y2_p = log_model.predict(X2[test])
69 | if y2_p == y2[test]: accuracy += 1
70 | print(accuracy / np.shape(X2)[0])
71 |
72 | '''
73 | For the overfitting problem of empirical-risk-minimization algorithms, cross-validation is a standard remedy, very common in classification (a minimal hand-rolled k-fold sketch follows this file):
74 | 1. Simple (hold-out) cross-validation:
75 | 1) Randomly pick s examples from the full training data S as the training set, and keep the rest as the test set.
76 | 2) Train a hypothesis or model on the training set.
77 | 3) For each test example, predict its class with the model and compute the classification accuracy.
78 | 4) Select the model or hypothesis with the highest accuracy.
79 | This is called hold-out cross-validation, or simple cross-validation. Since the test and training sets are disjoint, overfitting is avoided.
80 |
81 | 2. k-fold cross-validation:
82 | 1) Split the full training set S into k disjoint subsets; if S has m training examples, each subset has m/k of them. Call the subsets {s1, s2, ..., sk}.
83 | 2) In each round, take one subset out as the test set and use the other k-1 as the training set.
84 | 3) Train a model or hypothesis on the training set.
85 | 4) Evaluate the model on the held-out subset to obtain a classification accuracy.
86 | 5) Average the k accuracies and use the mean as the true classification accuracy of the model or hypothesis.
87 | This method makes full use of all the samples, but it is more laborious: k rounds of training and k rounds of testing.
88 |
89 | 3. Leave-one-out cross-validation:
90 | Leave one sample out as the test set each time and train on all the others; with k samples this takes k rounds of training and testing.
91 | LOOCV is the most expensive to compute, but it uses the samples most efficiently and suits small data sets.
92 |
93 | '''
94 |
95 | '''
96 | The two cross-validation schemes give similar results; however, because the classes of the Blood Transfusion Service Center data set separate less cleanly than those of iris, its results are somewhat worse.
97 | The runs also show that LOOCV takes noticeably longer, which becomes more pronounced as the data grow.
98 | So in general, k-fold cross-validation already meets the accuracy requirement at a much smaller computational cost.
99 | '''
100 |
--------------------------------------------------------------------------------
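As a complement to cross_val_predict in cross_validation.py above, here is the minimal hand-rolled k-fold sketch referred to in its docstring. It shuffles the indices once, splits them into k nearly equal folds, and averages the per-fold accuracies; in practice sklearn.model_selection.KFold would normally be preferred over these naive splits:

import numpy as np

def kfold_accuracy(model, X, y, k=10, seed=0):
    # shuffle the indices once, then split them into k nearly equal folds
    rng = np.random.RandomState(seed)
    folds = np.array_split(rng.permutation(len(X)), k)
    accs = []
    for i in range(k):
        test = folds[i]
        # train on the other k-1 folds
        train = np.hstack([folds[j] for j in range(k) if j != i])
        model.fit(X[train], y[train])
        accs.append(np.mean(model.predict(X[test]) == y[test]))
    # the mean fold accuracy estimates the model's true accuracy
    return float(np.mean(accs))

# usage with the same estimator as above:
# from sklearn.linear_model import LogisticRegression
# print(kfold_accuracy(LogisticRegression(), X, y))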
/watermelon/ch3/3.4/data/transfusion.data:
--------------------------------------------------------------------------------
1 | Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 | 2 ,50,12500,98 ,1
3 | 0 ,13,3250,28 ,1
4 | 1 ,16,4000,35 ,1
5 | 2 ,20,5000,45 ,1
6 | 1 ,24,6000,77 ,0
7 | 4 ,4,1000,4 ,0
8 | 2 ,7,1750,14 ,1
9 | 1 ,12,3000,35 ,0
10 | 2 ,9,2250,22 ,1
11 | 5 ,46,11500,98 ,1
12 | 4 ,23,5750,58 ,0
13 | 0 ,3,750,4 ,0
14 | 2 ,10,2500,28 ,1
15 | 1 ,13,3250,47 ,0
16 | 2 ,6,1500,15 ,1
17 | 2 ,5,1250,11 ,1
18 | 2 ,14,3500,48 ,1
19 | 2 ,15,3750,49 ,1
20 | 2 ,6,1500,15 ,1
21 | 2 ,3,750,4 ,1
22 | 2 ,3,750,4 ,1
23 | 4 ,11,2750,28 ,0
24 | 2 ,6,1500,16 ,1
25 | 2 ,6,1500,16 ,1
26 | 9 ,9,2250,16 ,0
27 | 4 ,14,3500,40 ,0
28 | 4 ,6,1500,14 ,0
29 | 4 ,12,3000,34 ,1
30 | 4 ,5,1250,11 ,1
31 | 4 ,8,2000,21 ,0
32 | 1 ,14,3500,58 ,0
33 | 4 ,10,2500,28 ,1
34 | 4 ,10,2500,28 ,1
35 | 4 ,9,2250,26 ,1
36 | 2 ,16,4000,64 ,0
37 | 2 ,8,2000,28 ,1
38 | 2 ,12,3000,47 ,1
39 | 4 ,6,1500,16 ,1
40 | 2 ,14,3500,57 ,1
41 | 4 ,7,1750,22 ,1
42 | 2 ,13,3250,53 ,1
43 | 2 ,5,1250,16 ,0
44 | 2 ,5,1250,16 ,1
45 | 2 ,5,1250,16 ,0
46 | 4 ,20,5000,69 ,1
47 | 4 ,9,2250,28 ,1
48 | 2 ,9,2250,36 ,0
49 | 2 ,2,500,2 ,0
50 | 2 ,2,500,2 ,0
51 | 2 ,2,500,2 ,0
52 | 2 ,11,2750,46 ,0
53 | 2 ,11,2750,46 ,1
54 | 2 ,6,1500,22 ,0
55 | 2 ,12,3000,52 ,0
56 | 4 ,5,1250,14 ,1
57 | 4 ,19,4750,69 ,1
58 | 4 ,8,2000,26 ,1
59 | 2 ,7,1750,28 ,1
60 | 2 ,16,4000,81 ,0
61 | 3 ,6,1500,21 ,0
62 | 2 ,7,1750,29 ,0
63 | 2 ,8,2000,35 ,1
64 | 2 ,10,2500,49 ,0
65 | 4 ,5,1250,16 ,1
66 | 2 ,3,750,9 ,1
67 | 3 ,16,4000,74 ,0
68 | 2 ,4,1000,14 ,1
69 | 0 ,2,500,4 ,0
70 | 4 ,7,1750,25 ,0
71 | 1 ,9,2250,51 ,0
72 | 2 ,4,1000,16 ,0
73 | 2 ,4,1000,16 ,0
74 | 4 ,17,4250,71 ,1
75 | 2 ,2,500,4 ,0
76 | 2 ,2,500,4 ,1
77 | 2 ,2,500,4 ,1
78 | 2 ,4,1000,16 ,1
79 | 2 ,2,500,4 ,0
80 | 2 ,2,500,4 ,0
81 | 2 ,2,500,4 ,0
82 | 4 ,6,1500,23 ,1
83 | 2 ,4,1000,16 ,0
84 | 2 ,4,1000,16 ,0
85 | 2 ,4,1000,16 ,0
86 | 2 ,6,1500,28 ,1
87 | 2 ,6,1500,28 ,0
88 | 4 ,2,500,4 ,0
89 | 4 ,2,500,4 ,0
90 | 4 ,2,500,4 ,0
91 | 2 ,7,1750,35 ,1
92 | 4 ,2,500,4 ,1
93 | 4 ,2,500,4 ,0
94 | 4 ,2,500,4 ,0
95 | 4 ,2,500,4 ,0
96 | 12 ,11,2750,23 ,0
97 | 4 ,7,1750,28 ,0
98 | 3 ,17,4250,86 ,0
99 | 4 ,9,2250,38 ,1
100 | 4 ,4,1000,14 ,1
101 | 5 ,7,1750,26 ,1
102 | 4 ,8,2000,34 ,1
103 | 2 ,13,3250,76 ,1
104 | 4 ,9,2250,40 ,0
105 | 2 ,5,1250,26 ,0
106 | 2 ,5,1250,26 ,0
107 | 6 ,17,4250,70 ,0
108 | 0 ,8,2000,59 ,0
109 | 3 ,5,1250,26 ,0
110 | 2 ,3,750,14 ,0
111 | 2 ,10,2500,64 ,0
112 | 4 ,5,1250,23 ,1
113 | 4 ,9,2250,46 ,0
114 | 4 ,5,1250,23 ,0
115 | 4 ,8,2000,40 ,1
116 | 2 ,12,3000,82 ,0
117 | 11 ,24,6000,64 ,0
118 | 2 ,7,1750,46 ,1
119 | 4 ,11,2750,61 ,0
120 | 1 ,7,1750,57 ,0
121 | 2 ,11,2750,79 ,1
122 | 2 ,3,750,16 ,1
123 | 4 ,5,1250,26 ,1
124 | 2 ,6,1500,41 ,1
125 | 2 ,5,1250,33 ,1
126 | 2 ,4,1000,26 ,0
127 | 2 ,5,1250,34 ,0
128 | 4 ,8,2000,46 ,1
129 | 2 ,4,1000,26 ,0
130 | 4 ,8,2000,48 ,1
131 | 2 ,2,500,10 ,1
132 | 4 ,5,1250,28 ,0
133 | 2 ,12,3000,95 ,0
134 | 2 ,2,500,10 ,0
135 | 4 ,6,1500,35 ,0
136 | 2 ,11,2750,88 ,0
137 | 2 ,3,750,19 ,0
138 | 2 ,5,1250,37 ,0
139 | 2 ,12,3000,98 ,0
140 | 9 ,5,1250,19 ,0
141 | 2 ,2,500,11 ,0
142 | 2 ,9,2250,74 ,0
143 | 5 ,14,3500,86 ,0
144 | 4 ,3,750,16 ,0
145 | 4 ,3,750,16 ,0
146 | 4 ,2,500,9 ,1
147 | 4 ,3,750,16 ,1
148 | 6 ,3,750,14 ,0
149 | 2 ,2,500,11 ,0
150 | 2 ,2,500,11 ,1
151 | 2 ,2,500,11 ,0
152 | 2 ,7,1750,58 ,1
153 | 4 ,6,1500,39 ,0
154 | 4 ,11,2750,78 ,0
155 | 2 ,1,250,2 ,1
156 | 2 ,1,250,2 ,0
157 | 2 ,1,250,2 ,0
158 | 2 ,1,250,2 ,0
159 | 2 ,1,250,2 ,0
160 | 2 ,1,250,2 ,0
161 | 2 ,1,250,2 ,0
162 | 2 ,1,250,2 ,0
163 | 2 ,1,250,2 ,0
164 | 2 ,1,250,2 ,0
165 | 2 ,1,250,2 ,1
166 | 2 ,1,250,2 ,1
167 | 2 ,1,250,2 ,1
168 | 2 ,1,250,2 ,0
169 | 2 ,1,250,2 ,0
170 | 2 ,1,250,2 ,0
171 | 2 ,1,250,2 ,0
172 | 2 ,1,250,2 ,0
173 | 2 ,1,250,2 ,0
174 | 2 ,1,250,2 ,0
175 | 2 ,1,250,2 ,0
176 | 2 ,1,250,2 ,0
177 | 11 ,10,2500,35 ,0
178 | 11 ,4,1000,16 ,1
179 | 4 ,5,1250,33 ,1
180 | 4 ,6,1500,41 ,1
181 | 2 ,3,750,22 ,0
182 | 4 ,4,1000,26 ,1
183 | 10 ,4,1000,16 ,0
184 | 2 ,4,1000,35 ,0
185 | 4 ,12,3000,88 ,0
186 | 13 ,8,2000,26 ,0
187 | 11 ,9,2250,33 ,0
188 | 4 ,5,1250,34 ,0
189 | 4 ,4,1000,26 ,0
190 | 8 ,15,3750,77 ,0
191 | 4 ,5,1250,35 ,1
192 | 4 ,7,1750,52 ,0
193 | 4 ,7,1750,52 ,0
194 | 2 ,4,1000,35 ,0
195 | 11 ,11,2750,42 ,0
196 | 2 ,2,500,14 ,0
197 | 2 ,5,1250,47 ,1
198 | 9 ,8,2000,38 ,1
199 | 4 ,6,1500,47 ,0
200 | 11 ,7,1750,29 ,0
201 | 9 ,9,2250,45 ,0
202 | 4 ,6,1500,52 ,0
203 | 4 ,7,1750,58 ,0
204 | 6 ,2,500,11 ,1
205 | 4 ,7,1750,58 ,0
206 | 11 ,9,2250,38 ,0
207 | 11 ,6,1500,26 ,0
208 | 2 ,2,500,16 ,0
209 | 2 ,7,1750,76 ,0
210 | 11 ,6,1500,27 ,0
211 | 11 ,3,750,14 ,0
212 | 4 ,1,250,4 ,0
213 | 4 ,1,250,4 ,0
214 | 4 ,1,250,4 ,0
215 | 4 ,1,250,4 ,0
216 | 4 ,1,250,4 ,0
217 | 4 ,1,250,4 ,1
218 | 4 ,1,250,4 ,0
219 | 4 ,1,250,4 ,0
220 | 4 ,1,250,4 ,0
221 | 4 ,1,250,4 ,0
222 | 4 ,1,250,4 ,0
223 | 4 ,1,250,4 ,1
224 | 4 ,1,250,4 ,1
225 | 4 ,1,250,4 ,0
226 | 4 ,1,250,4 ,1
227 | 4 ,1,250,4 ,1
228 | 4 ,1,250,4 ,0
229 | 4 ,3,750,24 ,0
230 | 4 ,1,250,4 ,0
231 | 4 ,1,250,4 ,0
232 | 4 ,1,250,4 ,0
233 | 4 ,1,250,4 ,1
234 | 4 ,1,250,4 ,0
235 | 10 ,8,2000,39 ,0
236 | 14 ,7,1750,26 ,0
237 | 8 ,10,2500,63 ,0
238 | 11 ,3,750,15 ,0
239 | 4 ,2,500,14 ,0
240 | 2 ,4,1000,43 ,0
241 | 8 ,9,2250,58 ,0
242 | 8 ,8,2000,52 ,1
243 | 11 ,22,5500,98 ,0
244 | 4 ,3,750,25 ,1
245 | 11 ,17,4250,79 ,1
246 | 9 ,2,500,11 ,0
247 | 4 ,5,1250,46 ,0
248 | 11 ,12,3000,58 ,0
249 | 7 ,12,3000,86 ,0
250 | 11 ,2,500,11 ,0
251 | 11 ,2,500,11 ,0
252 | 11 ,2,500,11 ,0
253 | 2 ,6,1500,75 ,0
254 | 11 ,8,2000,41 ,1
255 | 11 ,3,750,16 ,1
256 | 12 ,13,3250,59 ,0
257 | 2 ,3,750,35 ,0
258 | 16 ,8,2000,28 ,0
259 | 11 ,7,1750,37 ,0
260 | 4 ,3,750,28 ,0
261 | 12 ,12,3000,58 ,0
262 | 4 ,4,1000,41 ,0
263 | 11 ,14,3500,73 ,1
264 | 2 ,2,500,23 ,0
265 | 2 ,3,750,38 ,1
266 | 4 ,5,1250,58 ,0
267 | 4 ,4,1000,43 ,1
268 | 3 ,2,500,23 ,0
269 | 11 ,8,2000,46 ,0
270 | 4 ,7,1750,82 ,0
271 | 13 ,4,1000,21 ,0
272 | 16 ,11,2750,40 ,0
273 | 16 ,7,1750,28 ,0
274 | 7 ,2,500,16 ,0
275 | 4 ,5,1250,58 ,0
276 | 4 ,5,1250,58 ,0
277 | 4 ,4,1000,46 ,0
278 | 14 ,13,3250,57 ,0
279 | 4 ,3,750,34 ,0
280 | 14 ,18,4500,78 ,0
281 | 11 ,8,2000,48 ,0
282 | 14 ,16,4000,70 ,0
283 | 14 ,4,1000,22 ,1
284 | 14 ,5,1250,26 ,0
285 | 8 ,2,500,16 ,0
286 | 11 ,5,1250,33 ,0
287 | 11 ,2,500,14 ,0
288 | 4 ,2,500,23 ,0
289 | 9 ,2,500,16 ,1
290 | 14 ,5,1250,28 ,1
291 | 14 ,3,750,19 ,1
292 | 14 ,4,1000,23 ,1
293 | 16 ,12,3000,50 ,0
294 | 11 ,4,1000,28 ,0
295 | 11 ,5,1250,35 ,0
296 | 11 ,5,1250,35 ,0
297 | 2 ,4,1000,70 ,0
298 | 14 ,5,1250,28 ,0
299 | 14 ,2,500,14 ,0
300 | 14 ,2,500,14 ,0
301 | 14 ,2,500,14 ,0
302 | 14 ,2,500,14 ,0
303 | 14 ,2,500,14 ,0
304 | 14 ,2,500,14 ,0
305 | 2 ,3,750,52 ,0
306 | 14 ,6,1500,34 ,0
307 | 11 ,5,1250,37 ,1
308 | 4 ,5,1250,74 ,0
309 | 11 ,3,750,23 ,0
310 | 16 ,4,1000,23 ,0
311 | 16 ,3,750,19 ,0
312 | 11 ,5,1250,38 ,0
313 | 11 ,2,500,16 ,0
314 | 12 ,9,2250,60 ,0
315 | 9 ,1,250,9 ,0
316 | 9 ,1,250,9 ,0
317 | 4 ,2,500,29 ,0
318 | 11 ,2,500,17 ,0
319 | 14 ,4,1000,26 ,0
320 | 11 ,9,2250,72 ,1
321 | 11 ,5,1250,41 ,0
322 | 15 ,16,4000,82 ,0
323 | 9 ,5,1250,51 ,1
324 | 11 ,4,1000,34 ,0
325 | 14 ,8,2000,50 ,1
326 | 16 ,7,1750,38 ,0
327 | 14 ,2,500,16 ,0
328 | 2 ,2,500,41 ,0
329 | 14 ,16,4000,98 ,0
330 | 14 ,4,1000,28 ,1
331 | 16 ,7,1750,39 ,0
332 | 14 ,7,1750,47 ,0
333 | 16 ,6,1500,35 ,0
334 | 16 ,6,1500,35 ,1
335 | 11 ,7,1750,62 ,1
336 | 16 ,2,500,16 ,0
337 | 16 ,3,750,21 ,1
338 | 11 ,3,750,28 ,0
339 | 11 ,7,1750,64 ,0
340 | 11 ,1,250,11 ,1
341 | 9 ,3,750,34 ,0
342 | 14 ,4,1000,30 ,0
343 | 23 ,38,9500,98 ,0
344 | 11 ,6,1500,58 ,0
345 | 11 ,1,250,11 ,0
346 | 11 ,1,250,11 ,0
347 | 11 ,1,250,11 ,0
348 | 11 ,1,250,11 ,0
349 | 11 ,1,250,11 ,0
350 | 11 ,1,250,11 ,0
351 | 11 ,1,250,11 ,0
352 | 11 ,1,250,11 ,0
353 | 11 ,2,500,21 ,0
354 | 11 ,5,1250,50 ,0
355 | 11 ,2,500,21 ,0
356 | 16 ,4,1000,28 ,0
357 | 4 ,2,500,41 ,0
358 | 16 ,6,1500,40 ,0
359 | 14 ,3,750,26 ,0
360 | 9 ,2,500,26 ,0
361 | 21 ,16,4000,64 ,0
362 | 14 ,6,1500,51 ,0
363 | 11 ,2,500,24 ,0
364 | 4 ,3,750,71 ,0
365 | 21 ,13,3250,57 ,0
366 | 11 ,6,1500,71 ,0
367 | 14 ,2,500,21 ,1
368 | 23 ,15,3750,57 ,0
369 | 14 ,4,1000,38 ,0
370 | 11 ,2,500,26 ,0
371 | 16 ,5,1250,40 ,1
372 | 4 ,2,500,51 ,1
373 | 14 ,3,750,31 ,0
374 | 4 ,2,500,52 ,0
375 | 9 ,4,1000,65 ,0
376 | 14 ,4,1000,40 ,0
377 | 11 ,3,750,40 ,1
378 | 14 ,5,1250,50 ,0
379 | 14 ,1,250,14 ,0
380 | 14 ,1,250,14 ,0
381 | 14 ,1,250,14 ,0
382 | 14 ,1,250,14 ,0
383 | 14 ,1,250,14 ,0
384 | 14 ,1,250,14 ,0
385 | 14 ,1,250,14 ,0
386 | 14 ,1,250,14 ,0
387 | 14 ,7,1750,72 ,0
388 | 14 ,1,250,14 ,0
389 | 14 ,1,250,14 ,0
390 | 9 ,3,750,52 ,0
391 | 14 ,7,1750,73 ,0
392 | 11 ,4,1000,58 ,0
393 | 11 ,4,1000,59 ,0
394 | 4 ,2,500,59 ,0
395 | 11 ,4,1000,61 ,0
396 | 16 ,4,1000,40 ,0
397 | 16 ,10,2500,89 ,0
398 | 21 ,2,500,21 ,1
399 | 21 ,3,750,26 ,0
400 | 16 ,8,2000,76 ,0
401 | 21 ,3,750,26 ,1
402 | 18 ,2,500,23 ,0
403 | 23 ,5,1250,33 ,0
404 | 23 ,8,2000,46 ,0
405 | 16 ,3,750,34 ,0
406 | 14 ,5,1250,64 ,0
407 | 14 ,3,750,41 ,0
408 | 16 ,1,250,16 ,0
409 | 16 ,1,250,16 ,0
410 | 16 ,1,250,16 ,0
411 | 16 ,1,250,16 ,0
412 | 16 ,1,250,16 ,0
413 | 16 ,1,250,16 ,0
414 | 16 ,1,250,16 ,0
415 | 16 ,4,1000,45 ,0
416 | 16 ,1,250,16 ,0
417 | 16 ,1,250,16 ,0
418 | 16 ,1,250,16 ,0
419 | 16 ,1,250,16 ,0
420 | 16 ,1,250,16 ,0
421 | 16 ,2,500,26 ,0
422 | 21 ,2,500,23 ,0
423 | 16 ,2,500,27 ,0
424 | 21 ,2,500,23 ,0
425 | 21 ,2,500,23 ,0
426 | 14 ,4,1000,57 ,0
427 | 16 ,5,1250,60 ,0
428 | 23 ,2,500,23 ,0
429 | 14 ,5,1250,74 ,0
430 | 23 ,3,750,28 ,0
431 | 16 ,3,750,40 ,0
432 | 9 ,2,500,52 ,0
433 | 9 ,2,500,52 ,0
434 | 16 ,7,1750,87 ,1
435 | 14 ,4,1000,64 ,0
436 | 14 ,2,500,35 ,0
437 | 16 ,7,1750,93 ,0
438 | 21 ,2,500,25 ,0
439 | 14 ,3,750,52 ,0
440 | 23 ,14,3500,93 ,0
441 | 18 ,8,2000,95 ,0
442 | 16 ,3,750,46 ,0
443 | 11 ,3,750,76 ,0
444 | 11 ,2,500,52 ,0
445 | 11 ,3,750,76 ,0
446 | 23 ,12,3000,86 ,0
447 | 21 ,3,750,35 ,0
448 | 23 ,2,500,26 ,0
449 | 23 ,2,500,26 ,0
450 | 23 ,8,2000,64 ,0
451 | 16 ,3,750,50 ,0
452 | 23 ,3,750,33 ,0
453 | 21 ,3,750,38 ,0
454 | 23 ,2,500,28 ,0
455 | 21 ,1,250,21 ,0
456 | 21 ,1,250,21 ,0
457 | 21 ,1,250,21 ,0
458 | 21 ,1,250,21 ,0
459 | 21 ,1,250,21 ,0
460 | 21 ,1,250,21 ,0
461 | 21 ,1,250,21 ,0
462 | 21 ,1,250,21 ,0
463 | 21 ,1,250,21 ,0
464 | 21 ,1,250,21 ,1
465 | 21 ,1,250,21 ,0
466 | 21 ,1,250,21 ,0
467 | 21 ,5,1250,60 ,0
468 | 23 ,4,1000,45 ,0
469 | 21 ,4,1000,52 ,0
470 | 22 ,1,250,22 ,1
471 | 11 ,2,500,70 ,0
472 | 23 ,5,1250,58 ,0
473 | 23 ,3,750,40 ,0
474 | 23 ,3,750,41 ,0
475 | 14 ,3,750,83 ,0
476 | 21 ,2,500,35 ,0
477 | 26 ,5,1250,49 ,1
478 | 23 ,6,1500,70 ,0
479 | 23 ,1,250,23 ,0
480 | 23 ,1,250,23 ,0
481 | 23 ,1,250,23 ,0
482 | 23 ,1,250,23 ,0
483 | 23 ,1,250,23 ,0
484 | 23 ,1,250,23 ,0
485 | 23 ,1,250,23 ,0
486 | 23 ,1,250,23 ,0
487 | 23 ,4,1000,53 ,0
488 | 21 ,6,1500,86 ,0
489 | 23 ,3,750,48 ,0
490 | 21 ,2,500,41 ,0
491 | 21 ,3,750,64 ,0
492 | 16 ,2,500,70 ,0
493 | 21 ,3,750,70 ,0
494 | 23 ,4,1000,87 ,0
495 | 23 ,3,750,89 ,0
496 | 23 ,2,500,87 ,0
497 | 35 ,3,750,64 ,0
498 | 38 ,1,250,38 ,0
499 | 38 ,1,250,38 ,0
500 | 40 ,1,250,40 ,0
501 | 74 ,1,250,74 ,0
502 | 2 ,43,10750,86 ,1
503 | 6 ,22,5500,28 ,1
504 | 2 ,34,8500,77 ,1
505 | 2 ,44,11000,98 ,0
506 | 0 ,26,6500,76 ,1
507 | 2 ,41,10250,98 ,1
508 | 3 ,21,5250,42 ,1
509 | 2 ,11,2750,23 ,0
510 | 2 ,21,5250,52 ,1
511 | 2 ,13,3250,32 ,1
512 | 4 ,4,1000,4 ,1
513 | 2 ,11,2750,26 ,0
514 | 2 ,11,2750,28 ,0
515 | 3 ,14,3500,35 ,0
516 | 4 ,16,4000,38 ,1
517 | 4 ,6,1500,14 ,0
518 | 3 ,5,1250,12 ,1
519 | 4 ,33,8250,98 ,1
520 | 3 ,10,2500,33 ,1
521 | 4 ,10,2500,28 ,1
522 | 2 ,11,2750,40 ,1
523 | 2 ,11,2750,41 ,1
524 | 4 ,13,3250,39 ,1
525 | 1 ,10,2500,43 ,1
526 | 4 ,9,2250,28 ,0
527 | 2 ,4,1000,11 ,0
528 | 2 ,5,1250,16 ,1
529 | 2 ,15,3750,64 ,0
530 | 5 ,24,6000,79 ,0
531 | 2 ,6,1500,22 ,1
532 | 4 ,5,1250,16 ,1
533 | 2 ,4,1000,14 ,1
534 | 4 ,8,2000,28 ,0
535 | 2 ,4,1000,14 ,0
536 | 2 ,6,1500,26 ,0
537 | 4 ,5,1250,16 ,1
538 | 2 ,7,1750,32 ,1
539 | 2 ,6,1500,26 ,1
540 | 2 ,8,2000,38 ,1
541 | 2 ,2,500,4 ,1
542 | 2 ,6,1500,28 ,1
543 | 2 ,10,2500,52 ,0
544 | 4 ,16,4000,70 ,1
545 | 4 ,2,500,4 ,1
546 | 1 ,14,3500,95 ,0
547 | 4 ,2,500,4 ,1
548 | 7 ,14,3500,48 ,0
549 | 2 ,3,750,11 ,0
550 | 2 ,12,3000,70 ,1
551 | 4 ,7,1750,32 ,1
552 | 4 ,4,1000,16 ,0
553 | 2 ,6,1500,35 ,1
554 | 4 ,6,1500,28 ,1
555 | 2 ,3,750,14 ,0
556 | 2 ,4,1000,23 ,0
557 | 4 ,4,1000,18 ,0
558 | 5 ,6,1500,28 ,0
559 | 4 ,6,1500,30 ,0
560 | 14 ,5,1250,14 ,0
561 | 3 ,8,2000,50 ,0
562 | 4 ,11,2750,64 ,1
563 | 4 ,9,2250,52 ,0
564 | 4 ,16,4000,98 ,1
565 | 7 ,10,2500,47 ,0
566 | 4 ,14,3500,86 ,0
567 | 2 ,9,2250,75 ,0
568 | 4 ,6,1500,35 ,0
569 | 4 ,9,2250,55 ,0
570 | 4 ,6,1500,35 ,1
571 | 2 ,6,1500,45 ,0
572 | 2 ,6,1500,47 ,0
573 | 4 ,2,500,9 ,0
574 | 2 ,2,500,11 ,1
575 | 2 ,2,500,11 ,0
576 | 2 ,2,500,11 ,1
577 | 4 ,6,1500,38 ,1
578 | 3 ,4,1000,29 ,1
579 | 9 ,9,2250,38 ,0
580 | 11 ,5,1250,18 ,0
581 | 2 ,3,750,21 ,0
582 | 2 ,1,250,2 ,0
583 | 2 ,1,250,2 ,1
584 | 2 ,1,250,2 ,0
585 | 2 ,1,250,2 ,0
586 | 2 ,1,250,2 ,0
587 | 2 ,1,250,2 ,0
588 | 2 ,1,250,2 ,1
589 | 2 ,1,250,2 ,0
590 | 2 ,1,250,2 ,0
591 | 2 ,1,250,2 ,0
592 | 2 ,1,250,2 ,0
593 | 11 ,11,2750,38 ,0
594 | 2 ,3,750,22 ,0
595 | 9 ,11,2750,49 ,1
596 | 5 ,11,2750,75 ,0
597 | 3 ,5,1250,38 ,0
598 | 3 ,1,250,3 ,1
599 | 4 ,6,1500,43 ,0
600 | 2 ,3,750,24 ,0
601 | 12 ,11,2750,39 ,0
602 | 2 ,2,500,14 ,0
603 | 4 ,6,1500,46 ,0
604 | 9 ,3,750,14 ,0
605 | 14 ,8,2000,26 ,0
606 | 4 ,2,500,13 ,0
607 | 4 ,11,2750,95 ,0
608 | 2 ,7,1750,77 ,0
609 | 2 ,7,1750,77 ,0
610 | 4 ,1,250,4 ,0
611 | 4 ,1,250,4 ,0
612 | 4 ,1,250,4 ,0
613 | 4 ,1,250,4 ,0
614 | 4 ,1,250,4 ,1
615 | 4 ,1,250,4 ,0
616 | 4 ,1,250,4 ,0
617 | 4 ,1,250,4 ,0
618 | 4 ,1,250,4 ,0
619 | 4 ,1,250,4 ,0
620 | 4 ,1,250,4 ,1
621 | 4 ,1,250,4 ,0
622 | 4 ,7,1750,62 ,0
623 | 4 ,1,250,4 ,0
624 | 4 ,4,1000,34 ,1
625 | 11 ,6,1500,28 ,0
626 | 13 ,3,750,14 ,1
627 | 7 ,5,1250,35 ,0
628 | 9 ,9,2250,54 ,0
629 | 11 ,2,500,11 ,0
630 | 2 ,5,1250,63 ,0
631 | 7 ,11,2750,89 ,0
632 | 8 ,9,2250,64 ,0
633 | 2 ,2,500,22 ,0
634 | 6 ,3,750,26 ,0
635 | 12 ,15,3750,71 ,0
636 | 13 ,3,750,16 ,0
637 | 11 ,16,4000,89 ,0
638 | 4 ,5,1250,58 ,0
639 | 14 ,7,1750,35 ,0
640 | 11 ,4,1000,27 ,0
641 | 7 ,9,2250,89 ,1
642 | 11 ,8,2000,52 ,1
643 | 7 ,5,1250,52 ,0
644 | 11 ,6,1500,41 ,0
645 | 10 ,5,1250,38 ,0
646 | 14 ,2,500,14 ,1
647 | 14 ,2,500,14 ,0
648 | 14 ,2,500,14 ,0
649 | 2 ,2,500,33 ,0
650 | 11 ,3,750,23 ,0
651 | 14 ,8,2000,46 ,0
652 | 9 ,1,250,9 ,0
653 | 16 ,5,1250,27 ,0
654 | 14 ,4,1000,26 ,0
655 | 4 ,2,500,30 ,0
656 | 14 ,3,750,21 ,0
657 | 16 ,16,4000,77 ,0
658 | 4 ,2,500,31 ,0
659 | 14 ,8,2000,50 ,0
660 | 11 ,3,750,26 ,0
661 | 14 ,7,1750,45 ,0
662 | 15 ,5,1250,33 ,0
663 | 16 ,2,500,16 ,0
664 | 16 ,3,750,21 ,0
665 | 11 ,8,2000,72 ,0
666 | 11 ,1,250,11 ,0
667 | 11 ,1,250,11 ,0
668 | 11 ,1,250,11 ,0
669 | 11 ,1,250,11 ,1
670 | 11 ,1,250,11 ,0
671 | 2 ,3,750,75 ,1
672 | 2 ,3,750,77 ,0
673 | 16 ,4,1000,28 ,0
674 | 16 ,15,3750,87 ,0
675 | 16 ,14,3500,83 ,0
676 | 16 ,10,2500,62 ,0
677 | 16 ,3,750,23 ,0
678 | 14 ,3,750,26 ,0
679 | 23 ,19,4750,62 ,0
680 | 11 ,7,1750,75 ,0
681 | 14 ,3,750,28 ,0
682 | 20 ,14,3500,69 ,1
683 | 4 ,2,500,46 ,0
684 | 11 ,2,500,25 ,0
685 | 11 ,3,750,37 ,0
686 | 16 ,4,1000,33 ,0
687 | 21 ,7,1750,38 ,0
688 | 13 ,7,1750,76 ,0
689 | 16 ,6,1500,50 ,0
690 | 14 ,3,750,33 ,0
691 | 14 ,1,250,14 ,0
692 | 14 ,1,250,14 ,0
693 | 14 ,1,250,14 ,0
694 | 14 ,1,250,14 ,0
695 | 14 ,1,250,14 ,0
696 | 14 ,1,250,14 ,0
697 | 17 ,7,1750,58 ,1
698 | 14 ,3,750,35 ,0
699 | 14 ,3,750,35 ,0
700 | 16 ,7,1750,64 ,0
701 | 21 ,2,500,21 ,0
702 | 16 ,3,750,35 ,0
703 | 16 ,1,250,16 ,0
704 | 16 ,1,250,16 ,0
705 | 16 ,1,250,16 ,0
706 | 16 ,1,250,16 ,0
707 | 16 ,1,250,16 ,0
708 | 14 ,2,500,29 ,0
709 | 11 ,4,1000,74 ,0
710 | 11 ,2,500,38 ,1
711 | 21 ,6,1500,48 ,0
712 | 23 ,2,500,23 ,0
713 | 23 ,6,1500,45 ,0
714 | 14 ,2,500,35 ,1
715 | 16 ,6,1500,81 ,0
716 | 16 ,4,1000,58 ,0
717 | 16 ,5,1250,71 ,0
718 | 21 ,2,500,26 ,0
719 | 21 ,3,750,35 ,0
720 | 21 ,3,750,35 ,0
721 | 23 ,8,2000,69 ,0
722 | 21 ,3,750,38 ,0
723 | 23 ,3,750,35 ,0
724 | 21 ,3,750,40 ,0
725 | 23 ,2,500,28 ,0
726 | 21 ,1,250,21 ,0
727 | 21 ,1,250,21 ,0
728 | 25 ,6,1500,50 ,0
729 | 21 ,1,250,21 ,0
730 | 21 ,1,250,21 ,0
731 | 23 ,3,750,39 ,0
732 | 21 ,2,500,33 ,0
733 | 14 ,3,750,79 ,0
734 | 23 ,1,250,23 ,1
735 | 23 ,1,250,23 ,0
736 | 23 ,1,250,23 ,0
737 | 23 ,1,250,23 ,0
738 | 23 ,1,250,23 ,0
739 | 23 ,1,250,23 ,0
740 | 23 ,1,250,23 ,0
741 | 23 ,4,1000,52 ,0
742 | 23 ,1,250,23 ,0
743 | 23 ,7,1750,88 ,0
744 | 16 ,3,750,86 ,0
745 | 23 ,2,500,38 ,0
746 | 21 ,2,500,52 ,0
747 | 23 ,3,750,62 ,0
748 | 39 ,1,250,39 ,0
749 | 72 ,1,250,72 ,0
--------------------------------------------------------------------------------
/watermelon/ch3/3.4/data/transfusion.names:
--------------------------------------------------------------------------------
1 | Title: Blood Transfusion Service Center Data Set
2 |
3 | Abstract: Data taken from the Blood Transfusion Service Center in Hsin-Chu City
4 | in Taiwan -- this is a classification problem.
5 |
6 |
7 | -----------------------------------------------------
8 |
9 | Data Set Characteristics: Multivariate
10 | Number of Instances: 748
11 | Area: Business
12 | Attribute Characteristics: Real
13 | Number of Attributes: 5
14 | Date Donated: 2008-10-03
15 | Associated Tasks: Classification
16 | Missing Values? N/A
17 |
18 | -----------------------------------------------------
19 |
20 | Source:
21 |
22 | Original Owner and Donor
23 | Prof. I-Cheng Yeh
24 | Department of Information Management
25 | Chung-Hua University,
26 | Hsin Chu, Taiwan 30067, R.O.C.
27 | e-mail:icyeh 'at' chu.edu.tw
28 | TEL:886-3-5186511
29 |
30 | Date Donated: October 3, 2008
31 |
32 | -----------------------------------------------------
33 |
34 | Data Set Information:
35 |
36 | To demonstrate the RFMTC marketing model (a modified version of RFM), this study
37 | adopted the donor database of Blood Transfusion Service Center in Hsin-Chu City
38 | in Taiwan. The center passes their blood transfusion service bus to one
39 | university in Hsin-Chu City to gather blood donated about every three months. To
40 | build the RFMTC model, we selected 748 donors at random from the donor database.
41 | Each of these 748 donor records includes R (Recency - months since last
42 | donation), F (Frequency - total number of donations), M (Monetary - total blood
43 | donated in c.c.), T (Time - months since first donation), and a binary variable
44 | representing whether he/she donated blood in March 2007 (1 stands for donating
45 | blood; 0 stands for not donating blood).
46 |
47 | -----------------------------------------------------
48 |
49 | Attribute Information:
50 |
51 | Given is the variable name, variable type, the measurement unit and a brief
52 | description. The "Blood Transfusion Service Center" is a classification problem.
53 | The order of this listing corresponds to the order of numerals along the rows of
54 | the database.
55 |
56 | R (Recency - months since last donation),
57 | F (Frequency - total number of donations),
58 | M (Monetary - total blood donated in c.c.),
59 | T (Time - months since first donation), and
60 | a binary variable representing whether he/she donated blood in March 2007 (1
61 | stands for donating blood; 0 stands for not donating blood).
62 |
63 |
64 | Table 1 shows the descriptive statistics of the data. We selected 500 data at
65 | random as the training set, and the rest 248 as the testing set.
66 |
67 | Table 1. Descriptive statistics of the data
68 |
69 | Variable Data Type Measurement Description min max mean std
70 | Recency quantitative Months Input 0.03 74.4 9.74 8.07
71 | Frequency quantitative Times Input 1 50 5.51 5.84
72 | Monetary quantitative c.c. blood Input 250 12500 1378.68 1459.83
73 | Time quantitative Months Input 2.27 98.3 34.42 24.32
74 | Whether he/she donated blood in March 2007 binary 1=yes 0=no Output 0 1 1 (24%) 0 (76%)
75 |
76 |
77 | -----------------------------------------------------
78 |
79 | Citation Request:
80 |
81 | NOTE: Reuse of this database is unlimited with retention of copyright notice for
82 | Prof. I-Cheng Yeh and the following published paper:
83 |
84 | Yeh, I-Cheng, Yang, King-Jang, and Ting, Tao-Ming, "Knowledge discovery on RFM
85 | model using Bernoulli sequence, "Expert Systems with Applications, 2008
86 | (doi:10.1016/j.eswa.2008.07.018).
87 |
88 |
89 |
--------------------------------------------------------------------------------
/watermelon/ch3/3.5/LDA.py:
--------------------------------------------------------------------------------
1 | import numpy as np # for matrix calculation
2 | import matplotlib.pyplot as plt
3 | from self_def import GetProjectivePoint_2D
4 | from sklearn import model_selection
5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
6 | from sklearn import metrics
7 |
8 | '''
9 | data import and pre-analysis
10 | '''
11 |
12 | # load the CSV file as a numpy matrix
13 | data_file = open('data/watermelon_3a.csv')
14 | dataset = np.loadtxt(data_file, delimiter=",")
15 |
16 | # separate the data from the target attributes
17 | X = dataset[:, 1:3]
18 | y = dataset[:, 3]
19 |
20 | # draw scatter diagram to show the raw data
21 | f1 = plt.figure(1)
22 | plt.title('watermelon_3a')
23 | plt.xlabel('density')
24 | plt.ylabel('ratio_sugar')
25 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad')
26 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good')
27 | plt.legend(loc='upper right')
28 | # plt.show()
29 |
30 | '''
31 | LDA via sklearn
32 | '''
33 | # generate the train and test splits
34 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=0)
35 |
36 | # model fitting
37 | lda_model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None).fit(X_train, y_train)
38 |
39 | # model validation
40 | y_pred = lda_model.predict(X_test)
41 |
42 | # summarize the fit of the model
43 | print(metrics.confusion_matrix(y_test, y_pred))
44 | print(metrics.classification_report(y_test, y_pred))
45 |
46 | # draw the classifier decision boundary
47 | f2 = plt.figure(2)
48 | h = 0.001
49 | # x0_min, x0_max = X[:, 0].min()-0.1, X[:, 0].max()+0.1
50 | # x1_min, x1_max = X[:, 1].min()-0.1, X[:, 1].max()+0.1
51 |
52 | x0, x1 = np.meshgrid(np.arange(-1, 1, h),
53 | np.arange(-1, 1, h))
54 |
55 | # x0, x1 = np.meshgrid(np.arange(x0_min, x0_max, h),
56 | # np.arange(x1_min, x1_max, h))
57 |
58 | z = lda_model.predict(np.c_[x0.ravel(), x1.ravel()])
59 |
60 | # Put the result into a color plot
61 | z = z.reshape(x0.shape)
62 | plt.contourf(x0, x1, z)
63 |
64 | # Plot also the training points
65 | plt.title('watermelon_3a')
66 | plt.xlabel('density')
67 | plt.ylabel('ratio_sugar')
68 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=100, label='bad')
69 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=100, label='good')
70 | plt.show()
71 |
72 |
73 | '''
74 | implementation of LDA based on self-coding
75 | '''
76 | # 1st: get the mean vector of each class
77 |
78 | u = []
79 | for i in range(2):  # two classes
80 |     u.append(np.mean(X[y == i], axis=0))  # column mean
81 |
82 | # 2nd: compute the within-class scatter matrix, cf. book eq. (3.33)
83 | m, n = np.shape(X)
84 | Sw = np.zeros((n, n))
85 | for i in range(m):
86 |     x_tmp = X[i].reshape(n, 1)  # row -> column vector
87 | if y[i] == 0: u_tmp = u[0].reshape(n, 1)
88 | if y[i] == 1: u_tmp = u[1].reshape(n, 1)
89 | Sw += np.dot(x_tmp - u_tmp, (x_tmp - u_tmp).T)
90 |
91 | Sw = np.mat(Sw)
92 | U, sigma, V = np.linalg.svd(Sw)
93 |
94 | Sw_inv = V.T * np.linalg.inv(np.diag(sigma)) * U.T
95 | # 3rd: compute the parameter w, cf. book eq. (3.39)
96 | w = np.dot(Sw_inv, (u[0] - u[1]).reshape(n, 1))  # w = Sw^-1 (u0 - u1), using the SVD-based pseudo-inverse above
97 |
98 | print(w)
99 |
100 | # 4th: draw the LDA line in the scatter figure
101 |
102 | # f2 = plt.figure(2)
103 | f3 = plt.figure(3)
104 | plt.xlim(-0.2, 1)
105 | plt.ylim(-0.5, 0.7)
106 |
107 | p0_x0 = -X[:, 0].max()
108 | p0_x1 = (w[1, 0] / w[0, 0]) * p0_x0
109 | p1_x0 = X[:, 0].max()
110 | p1_x1 = (w[1, 0] / w[0, 0]) * p1_x0
111 |
112 | plt.title('watermelon_3a - LDA')
113 | plt.xlabel('density')
114 | plt.ylabel('ratio_sugar')
115 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=10, label='bad')
116 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=10, label='good')
117 | plt.legend(loc='upper right')
118 |
119 | plt.plot([p0_x0, p1_x0], [p0_x1, p1_x1])
120 |
121 | # draw projective point on the line
122 |
123 |
124 | m, n = np.shape(X)
125 | for i in range(m):
126 | x_p = GetProjectivePoint_2D([X[i, 0], X[i, 1]], [w[1, 0] / w[0, 0], 0])
127 | if y[i] == 0:
128 | plt.plot(x_p[0], x_p[1], 'ko', markersize=5)
129 | if y[i] == 1:
130 | plt.plot(x_p[0], x_p[1], 'go', markersize=5)
131 | plt.plot([x_p[0], X[i, 0]], [x_p[1], X[i, 1]], 'c--', linewidth=0.3)
132 |
133 | plt.show()
134 |
135 | '''
136 | Because the data are not linearly separable, the class clusters overlap. Inspecting the data,
137 | we delete the bad-class outlier (sample 15), which greatly improves the linear separability.
138 | implementation of LDA again after deleting the outlier (X[14])
139 | '''
140 | # computing the d-dimensional mean vectors
141 | # import numpy as np
142 |
143 | # 1st: get the mean vector of each class
144 | X = np.delete(X, 14, 0)
145 | y = np.delete(y, 14, 0)
146 |
147 | u = []
148 | for i in range(2):  # two classes
149 |     u.append(np.mean(X[y == i], axis=0))  # column mean
150 |
151 | # 2nd: compute the within-class scatter matrix, cf. book eq. (3.33)
152 | m, n = np.shape(X)
153 | Sw = np.zeros((n, n))
154 | for i in range(m):
155 |     x_tmp = X[i].reshape(n, 1)  # row -> column vector
156 | if y[i] == 0: u_tmp = u[0].reshape(n, 1)
157 | if y[i] == 1: u_tmp = u[1].reshape(n, 1)
158 | Sw += np.dot(x_tmp - u_tmp, (x_tmp - u_tmp).T)
159 |
160 | Sw = np.mat(Sw)
161 | U, sigma, V = np.linalg.svd(Sw)
162 |
163 | Sw_inv = V.T * np.linalg.inv(np.diag(sigma)) * U.T
164 | # 3rd: compute the parameter w, cf. book eq. (3.39)
165 | w = np.dot(Sw_inv, (u[0] - u[1]).reshape(n, 1))  # w = Sw^-1 (u0 - u1), using the SVD-based pseudo-inverse above
166 |
167 | print(w)
168 |
169 | # 4th: draw the LDA line in the scatter figure
170 |
171 | # f2 = plt.figure(2)
172 | f4 = plt.figure(4)
173 | plt.xlim(-0.2, 1)
174 | plt.ylim(-0.5, 0.7)
175 |
176 | p0_x0 = -X[:, 0].max()
177 | p0_x1 = (w[1, 0] / w[0, 0]) * p0_x0
178 | p1_x0 = X[:, 0].max()
179 | p1_x1 = (w[1, 0] / w[0, 0]) * p1_x0
180 |
181 | plt.title('watermelon_3a - LDA')
182 | plt.xlabel('density')
183 | plt.ylabel('ratio_sugar')
184 | plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='k', s=10, label='bad')
185 | plt.scatter(X[y == 1, 0], X[y == 1, 1], marker='o', color='g', s=10, label='good')
186 | plt.legend(loc='upper right')
187 |
188 | plt.plot([p0_x0, p1_x0], [p0_x1, p1_x1])
189 |
190 | # draw projective point on the line
191 |
192 | m, n = np.shape(X)
193 | for i in range(m):
194 | x_p = GetProjectivePoint_2D([X[i, 0], X[i, 1]], [w[1, 0] / w[0, 0], 0])
195 | if y[i] == 0:
196 | plt.plot(x_p[0], x_p[1], 'ko', markersize=5)
197 | if y[i] == 1:
198 | plt.plot(x_p[0], x_p[1], 'go', markersize=5)
199 | plt.plot([x_p[0], X[i, 0]], [x_p[1], X[i, 1]], 'c--', linewidth=0.3)
200 |
201 | plt.show()
202 |
203 | '''
204 | Because of the nonlinearity inherent in the watermelon data, the line obtained by LDA does not
205 | separate the class clusters well; the basic LDA model is ill-suited to linearly non-separable
206 | data. To extend it to the nonlinear case, one might consider the SVM kernel trick.
207 | '''
--------------------------------------------------------------------------------
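As the closing comment of LDA.py suggests, a kernel method is one way to handle the linearly non-separable case. A minimal sketch (not part of the original repo) fitting an RBF-kernel SVM on the same two features; the `C` and `gamma` values are illustrative, not tuned:

```python
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics, model_selection

data = np.loadtxt('data/watermelon_3a.csv', delimiter=",")
X, y = data[:, 1:3], data[:, 3]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.5, random_state=0)

# the RBF kernel implicitly maps the 2-D inputs into a higher-dimensional
# feature space, where the overlap that defeated LDA may become separable
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
```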
/watermelon/ch3/3.5/data/watermelon_3a.csv:
--------------------------------------------------------------------------------
1 | 1,0.697,0.46,1
2 | 2,0.774,0.376,1
3 | 3,0.634,0.264,1
4 | 4,0.608,0.318,1
5 | 5,0.556,0.215,1
6 | 6,0.403,0.237,1
7 | 7,0.481,0.149,1
8 | 8,0.437,0.211,1
9 | 9,0.666,0.091,0
10 | 10,0.243,0.0267,0
11 | 11,0.245,0.057,0
12 | 12,0.343,0.099,0
13 | 13,0.639,0.161,0
14 | 14,0.657,0.198,0
15 | 15,0.36,0.37,0
16 | 16,0.593,0.042,0
17 | 17,0.719,0.103,0
18 |
--------------------------------------------------------------------------------
/watermelon/ch3/3.5/self_def.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | '''
4 | get the projective point (2D) of a point onto a line
5 |
6 | @param point: the coordinate of the point, in the form [a, b]
7 | @param line: the line parameters, in the form [k, t], meaning y = k*x + t
8 | @return: the coordinate of the projective point
9 | '''
10 |
11 |
12 | def GetProjectivePoint_2D(point, line):
13 | a = point[0]
14 | b = point[1]
15 | k = line[0]
16 | t = line[1]
17 |
18 | if k == 0:
19 | return [a, t]
20 | elif k == np.inf:
21 | return [0, b]
22 | x = (a + k * b - k * t) / (k * k + 1)
23 | y = k * x + t
24 | return [x, y]
25 |
--------------------------------------------------------------------------------
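The formula in `GetProjectivePoint_2D` is the foot of the perpendicular from (a, b) to y = k*x + t: setting the derivative of (x - a)^2 + (k*x + t - b)^2 to zero gives x = (a + k*b - k*t) / (k^2 + 1). A quick sanity check, with points and lines chosen purely for illustration:

```python
from self_def import GetProjectivePoint_2D

# a point already on y = x projects to itself
print(GetProjectivePoint_2D([1, 1], [1, 0]))   # [1.0, 1.0]
# (0, 1) projects onto y = x at its foot (0.5, 0.5)
print(GetProjectivePoint_2D([0, 1], [1, 0]))   # [0.5, 0.5]
# horizontal line y = 2: the projection simply drops the point onto y = 2
print(GetProjectivePoint_2D([3, 5], [0, 2]))   # [3, 2]
```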
/watermelon/ch4/4.3/ID3_watermelon.py:
--------------------------------------------------------------------------------
1 | '''
2 | import data and pre-analysis through data visualization
3 | '''
4 | # use a pandas DataFrame to read the .csv, which contains Chinese characters
5 | import pandas as pd
6 | import decision_tree
7 |
8 | data_file_encode = "gb18030"  # the codec of watermelon_3.csv
9 | with open("data/watermelon_3.csv", mode='r', encoding=data_file_encode) as data_file:
10 | df = pd.read_csv(data_file)
11 |
12 | # using seaborn for data visualization.
13 | # # load chinese font
14 | # import matplotlib as mpl
15 | # import matplotlib.pyplot as plt
16 | # import seaborn as sns
17 | # # sns.set(style="whitegrid", color_codes=True)
18 | # mpl.rcParams['font.sans-serif'] = ['Droid Sans Fallback']  # for Chinese character visualization
19 | # mpl.rcParams['axes.unicode_minus'] = False
20 | # sns.set_context("poster")
21 | #
22 | # f1 = plt.figure(1)
23 | # sns.FacetGrid(df, hue="好瓜", size=5).map(plt.scatter, "密度", "含糖率").add_legend()
24 | # sns.plt.show()
25 | #
26 | # f2 = plt.figure(2)
27 | # sns.plt.subplot(221)
28 | # sns.swarmplot(x = "纹理", y = '密度', hue = "好瓜", data = df)
29 | # sns.plt.subplot(222)
30 | # sns.swarmplot(x = "敲声", y = '密度', hue = "好瓜", data = df)
31 | # sns.plt.subplot(223)
32 | # sns.swarmplot(x = "色泽", y = '含糖率', hue = "好瓜", data = df)
33 | # sns.plt.subplot(224)
34 | # sns.swarmplot(x = "敲声", y = '含糖率', hue = "好瓜", data = df)
35 | # sns.plt.show()
36 |
37 | '''
38 | implementation of ID3
39 |
40 | relies on decision_tree.py
41 | '''
42 |
43 | root = decision_tree.TreeGenerate(df)
44 |
45 | # df = df.drop(['密度','含糖率'], 1)
46 | # df = df.drop(['色泽','根蒂','敲声','纹理','脐部','触感'], 1)
47 |
48 | accuracy_scores = []
49 |
50 | '''
51 | from random import sample
52 | for i in range(10):
53 | train = sample(range(len(df.index)), int(1*len(df.index)/2))
54 |
55 | df_train = df.iloc[train]
56 | df_test = df.drop(train)
57 | # generate the tree
58 | root = decision_tree.TreeGenerate(df_train)
59 | # test the accuracy
60 | pred_true = 0
61 | for i in df_test.index:
62 | label = decision_tree.Predict(root, df[df.index == i])
63 | if label == df_test[df_test.columns[-1]][i]:
64 | pred_true += 1
65 |
66 | accuracy = pred_true / len(df_test.index)
67 | accuracy_scores.append(accuracy)
68 | '''
69 |
70 | # k-fold cross-validation
71 | # evaluate the model with k-fold cross-validation (k = 5)
72 |
73 | n = len(df.index)
74 | k = 5
75 | for i in range(k):
76 | m = int(n / k)
77 | test = []
78 | for j in range(i * m, i * m + m):
79 | test.append(j)
80 |
81 | df_train = df.drop(test)
82 | df_test = df.iloc[test]
83 | root = decision_tree.TreeGenerate(df_train) # generate the tree
84 |
85 | # test the accuracy
86 | pred_true = 0
87 |     for idx in df_test.index:
88 |         label = decision_tree.Predict(root, df[df.index == idx])
89 |         if label == df_test[df_test.columns[-1]][idx]:
90 | pred_true += 1
91 |
92 | accuracy = pred_true / len(df_test.index)
93 | accuracy_scores.append(accuracy)
94 |
95 | # print the prediction accuracy result
96 | accuracy_sum = 0
97 | print("accuracy: ", end="")
98 | for i in range(k):
99 | print("%.3f " % accuracy_scores[i], end="")
100 | accuracy_sum += accuracy_scores[i]
101 | print("\naverage accuracy: %.3f" % (accuracy_sum / k))
102 |
103 | # decision tree visualization using pydotplus.graphviz
104 | root = decision_tree.TreeGenerate(df)
105 |
106 | decision_tree.DrawPNG(root, "decision_tree_ID3.png")
107 |
--------------------------------------------------------------------------------
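The fold construction in ID3_watermelon.py builds the test-index lists by hand; an equivalent sketch using scikit-learn's `KFold` (assuming the same `df` and `decision_tree` module are in scope) would be:

```python
from sklearn.model_selection import KFold

accuracy_scores = []
for train_idx, test_idx in KFold(n_splits=5).split(df):
    df_train, df_test = df.iloc[train_idx], df.iloc[test_idx]
    root = decision_tree.TreeGenerate(df_train)
    # count correct predictions on the held-out fold
    pred_true = sum(
        decision_tree.Predict(root, df_test[df_test.index == i])
        == df_test[df_test.columns[-1]][i]
        for i in df_test.index)
    accuracy_scores.append(pred_true / len(df_test.index))

print("average accuracy: %.3f" % (sum(accuracy_scores) / len(accuracy_scores)))
```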
/watermelon/ch4/4.3/data/watermelon_3.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/watermelon/ch4/4.3/data/watermelon_3.csv
--------------------------------------------------------------------------------
/watermelon/ch4/4.3/decision_tree.py:
--------------------------------------------------------------------------------
1 | '''
2 | definition of decision node class
3 |
4 | attr: attribute used as the basis for a new branching
5 | attr_down: dict: {key, value}
6 | key: categorical: categorical attr_value
7 | continuous: '<= div_value' for small part
8 | '> div_value' for big part
9 | value: children (Node class)
10 | label: class label (the majority of current sample labels)
11 | decision-tree node class
12 | each node stores its splitting attribute, the attribute values for downward branching, and a class label (meaningful at leaf nodes; kept on all nodes for generality)
13 | '''
14 |
15 | import numpy as np
16 |
17 |
18 | class Node(object):
19 |     def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
20 |         self.attr = attr_init
21 |         self.label = label_init
22 |         self.attr_down = attr_down_init if attr_down_init is not None else {}  # avoid sharing one dict across instances (mutable default)
23 |
24 |
25 | '''
26 | Branching for decision tree using recursion
27 | recursive implementation of the tree-generation algorithm (book p. 74)
28 |
29 | @param df: the pandas dataframe of the data_set
30 | @return root: Node, the root node of decision tree
31 | '''
32 |
33 |
34 | def TreeGenerate(df):
35 | # generating a new root node
36 | new_node = Node(None, None, {})
37 | label_arr = df[df.columns[-1]]
38 |
39 | label_count = NodeLabel(label_arr)
40 | if label_count: # assert the label_count isn't empty
41 | new_node.label = max(label_count, key=label_count.get)
42 |
43 |     # end if there is only 1 class in the current node's data (all samples belong to one class)
44 |     # end if the attribute set is empty
45 | if len(label_count) == 1 or len(label_arr) == 0:
46 | return new_node
47 |
48 |     # get the optimal splitting attribute for a new branching
49 | new_node.attr, div_value = OptAttr(df)
50 |
51 | # recursion
52 | if div_value == 0: # categorical variable
53 | value_count = ValueCount(df[new_node.attr])
54 | for value in value_count:
55 | df_v = df[df[new_node.attr].isin([value])] # get sub set
56 |             # delete the current attribute
57 |             df_v = df_v.drop(new_node.attr, axis=1)
58 | new_node.attr_down[value] = TreeGenerate(df_v)
59 |
60 | else: # continuous variable # left and right child
61 | value_l = "<=%.3f" % div_value
62 | value_r = ">%.3f" % div_value
63 | df_v_l = df[df[new_node.attr] <= div_value] # get sub set
64 | df_v_r = df[df[new_node.attr] > div_value]
65 |
66 | new_node.attr_down[value_l] = TreeGenerate(df_v_l)
67 | new_node.attr_down[value_r] = TreeGenerate(df_v_r)
68 |
69 | return new_node
70 |
71 |
72 | '''
73 | make a predict based on root
74 |
75 | @param root: Node, root Node of the decision tree
76 | @param df_sample: dataframe, a sample line
77 | '''
78 |
79 |
80 | def Predict(root, df_sample):
81 | try:
82 | import re # using Regular Expression to get the number in string
83 | except ImportError:
84 | print("module re not found")
85 |
86 |     while root.attr is not None:
87 | # continuous variable
88 |         if np.issubdtype(df_sample[root.attr].dtype, np.number):  # continuous attribute: the column has a numeric dtype
89 | # get the div_value from root.attr_down
90 | for key in list(root.attr_down):
91 | num = re.findall(r"\d+\.?\d*", key)
92 | div_value = float(num[0])
93 | break
94 | if df_sample[root.attr].values[0] <= div_value:
95 | key = "<=%.3f" % div_value
96 | root = root.attr_down[key]
97 | else:
98 | key = ">%.3f" % div_value
99 | root = root.attr_down[key]
100 |
101 | # categorical variable
102 | else:
103 | key = df_sample[root.attr].values[0]
104 | # check whether the attr_value in the child branch
105 | if key in root.attr_down:
106 | root = root.attr_down[key]
107 | else:
108 | break
109 |
110 | return root.label
111 |
112 |
113 | '''
114 | counting the labels that appear and their counts
115 |
116 | @param label_arr: data array for class labels
117 | @return label_count: dict, each appearing label and its count
118 | '''
119 |
120 |
121 | def NodeLabel(label_arr):
122 | label_count = {} # store count of label
123 |
124 | for label in label_arr:
125 | if label in label_count:
126 | label_count[label] += 1
127 | else:
128 | label_count[label] = 1
129 |
130 | return label_count
131 |
132 |
133 | '''
134 | counting the values that appear for a categorical attribute and their counts
135 |
136 | @param data_arr: data array for an attribute
137 | @return value_count: dict, each appearing value and its count
138 | '''
139 |
140 |
141 | def ValueCount(data_arr):
142 | value_count = {} # store count of value
143 |
144 | for label in data_arr:
145 | if label in value_count:
146 | value_count[label] += 1
147 | else:
148 | value_count[label] = 1
149 |
150 | return value_count
151 |
152 |
153 | '''
154 | find the optimal attribute of the current data_set
155 | (the optimal splitting attribute, chosen by maximizing information gain)
156 |
157 | @param df: the pandas dataframe of the data_set
158 | @return opt_attr: the optimal attribute for branching
159 | @return div_value: for discrete variable value = 0
160 | for continuous variable value = t for bisection divide value
161 | '''
162 |
163 |
164 | def OptAttr(df):
165 |     info_gain = 0
166 |     opt_attr, div_value = df.columns[1], 0  # fallback so the return never hits an unbound name
167 |     for attr_id in df.columns[1:-1]:
168 |         info_gain_tmp, div_value_tmp = InfoGain(df, attr_id)
169 |         if info_gain_tmp > info_gain:
170 |             info_gain = info_gain_tmp
171 |             opt_attr = attr_id
172 |             div_value = div_value_tmp
173 |
174 |     return opt_attr, div_value
175 |
176 |
177 | '''
178 | calculating the information gain of an attribution
179 | The optimal splitting attribute is selected by maximizing information gain; the main
180 | challenge is treating categorical and continuous attributes separately. Categorical
181 | variables follow the book, pp. 75-77; continuous variables use the bisection method of pp. 83-85.
182 |
183 | @param df: dataframe, the pandas dataframe of the data_set
184 | @param index: the target attribute (column name) in df
185 | @return info_gain: the information gain of the current attribute
186 | @return div_value: for discrete variable, value = 0
187 | for continuous variable, value = t (the division value)
188 | '''
189 |
190 |
191 | # TODO: this previously failed to run
192 | def InfoGain(df, index):
193 | info_gain = InfoEnt(df.values[:, -1]) # info_gain for the whole label
194 | div_value = 0 # div_value for continuous attribute
195 |
196 | n = len(df[index]) # the number of sample
197 | # 1.for continuous variable using method of bisection
198 |
199 | if type(df[index][0]) == np.float64:
200 |
201 | sub_info_ent = {} # store the div_value (div) and it's subset entropy
202 | # print(df)
203 |
204 |         df = df.sort_values(by=index, ascending=True)  # sort by this column (DataFrame.sort was removed from pandas)
205 | df = df.reset_index(drop=True)
206 |
207 | data_arr = df[index]
208 | # print(data_arr)
209 | label_arr = df[df.columns[-1]]
210 |
211 |         for i in range(n - 1):
212 |             div = (data_arr[i] + data_arr[i + 1]) / 2
213 |             sub_info_ent[div] = ((i + 1) * InfoEnt(label_arr[0:i + 1]) / n) \
214 |                                 + ((n - i - 1) * InfoEnt(label_arr[i + 1:]) / n)
215 |         # our goal is the divide value with the minimum weighted subset entropy
216 |         div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
217 |         info_gain -= sub_info_ent_min
218 |
219 | # 2.for discrete variable (categorical variable)
220 | else:
221 | data_arr = df[index]
222 | label_arr = df[df.columns[-1]]
223 | value_count = ValueCount(data_arr)
224 |
225 |         # compute the information gain
226 | for key in value_count:
227 | key_label_arr = label_arr[data_arr == key]
228 | info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
229 |
230 | return info_gain, div_value
231 |
232 |
233 | '''
234 | calculating the information entropy of an attribute
235 | Ent(D) = -sum_k p_k * log2(p_k)
236 |
237 | @param label_arr: ndarray, class label array of data_arr
238 | @return ent: the information entropy of current attribution
239 | '''
240 |
241 |
242 | def InfoEnt(label_arr):
243 | try:
244 | from math import log2
245 | except ImportError:
246 | print("module math.log2 not found")
247 |
248 | ent = 0
249 | n = len(label_arr)
250 | label_count = NodeLabel(label_arr)
251 |
252 | for key in label_count:
253 | ent -= (label_count[key] / n) * log2(label_count[key] / n)
254 |
255 | return ent
256 |
257 |
258 | def DrawPNG(root, out_file):
259 | '''
260 | visualization of decision tree from root.
261 | @param root: Node, the root node for tree.
262 | @param out_file: str, name and path of output file
263 | '''
264 | try:
265 | from pydotplus import graphviz
266 | except ImportError:
267 | print("module pydotplus.graphviz not found")
268 |
269 | g = graphviz.Dot() # generation of new dot
270 |
271 | TreeToGraph(0, g, root)
272 | g2 = graphviz.graph_from_dot_data(g.to_string())
273 |
274 | g2.write_png(out_file)
275 |
276 |
277 | def TreeToGraph(i, g, root):
278 | '''
279 | build a graph from root on
280 | @param i: node number in this tree
281 | @param g: pydotplus.graphviz.Dot() object
282 | @param root: the root node
283 |
284 | @return i: node number after modified
285 | # @return g: pydotplus.graphviz.Dot() object after modified
286 | @return g_node: the current root node in graphviz
287 | '''
288 | try:
289 | from pydotplus import graphviz
290 | except ImportError:
291 | print("module pydotplus.graphviz not found")
292 |
293 |     if root.attr is None:
294 | g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
295 | else:
296 | g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
297 | g_node = i
298 | g.add_node(graphviz.Node(g_node, label=g_node_label))
299 |
300 | for value in list(root.attr_down):
301 | i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
302 | g.add_edge(graphviz.Edge(g_node, g_child, label=value))
303 |
304 | return i, g_node
305 |
--------------------------------------------------------------------------------
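For reference, the quantities computed by `InfoEnt` and `InfoGain` above are Ent(D) = -sum_k p_k * log2(p_k) and Gain(D, a) = Ent(D) - sum_v |D^v|/|D| * Ent(D^v). A standalone numeric check: the 8-good/9-bad totals match watermelon 3.0, while the candidate split below is invented purely for illustration:

```python
from math import log2

def info_ent(labels):
    # Ent(D) = -sum_k p_k * log2(p_k)
    n = len(labels)
    return -sum(labels.count(c) / n * log2(labels.count(c) / n)
                for c in set(labels))

labels = ['good'] * 8 + ['bad'] * 9       # 17 samples, as in watermelon 3.0
print("%.3f" % info_ent(labels))          # 0.998

# a hypothetical split into subsets of sizes 9 and 8; the size-weighted
# drop in entropy is exactly the information gain of that split
left = ['good'] * 6 + ['bad'] * 3
right = ['good'] * 2 + ['bad'] * 6
gain = info_ent(labels) - (9 / 17) * info_ent(left) - (8 / 17) * info_ent(right)
print("%.3f" % gain)                      # ~0.130
```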
/watermelon/ch4/4.4/CART_watermelon.py:
--------------------------------------------------------------------------------
1 | '''
2 | import data and pre-analysis through data visualization
3 | '''
4 | # use a pandas DataFrame to read the .csv, which contains Chinese characters
5 | import pandas as pd
6 | import decision_tree
7 |
8 | data_file_encode = "gb18030"
9 | with open("data/watermelon_2.csv", mode='r', encoding=data_file_encode) as data_file:
10 | df = pd.read_csv(data_file)
11 |
12 | '''
13 | implementation of CART; relies on decision_tree.py
14 | '''
15 |
16 | # decision tree visualization using pydotplus.graphviz
17 | index_train = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
18 |
19 | df_train = df.iloc[index_train]
20 | df_test = df.drop(index_train)
21 |
22 | # generate a full tree
23 | root = decision_tree.TreeGenerate(df_train)
24 | decision_tree.DrawPNG(root, "decision_tree_full.png")
25 | print("accuracy of full tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
26 |
27 | # pre-pruning
28 | root = decision_tree.PrePrune(df_train, df_test)
29 | decision_tree.DrawPNG(root, "decision_tree_pre.png")
30 | print("accuracy of pre-pruning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
31 |
32 | # post-pruning
33 | root = decision_tree.TreeGenerate(df_train)
34 | decision_tree.PostPrune(root, df_test)
35 | decision_tree.DrawPNG(root, "decision_tree_post.png")
36 | print("accuracy of post-pruning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))
37 |
38 | # estimate the accuracy of post-pruned trees
39 | # via k-fold cross-validation (k = 5)
40 | accuracy_scores = []
41 | n = len(df.index)
42 | k = 5
43 | for i in range(k):
44 | m = int(n / k)
45 | test = []
46 | for j in range(i * m, i * m + m):
47 | test.append(j)
48 |
49 | df_train = df.drop(test)
50 | df_test = df.iloc[test]
51 | root = decision_tree.TreeGenerate(df_train) # generate the tree
52 |     decision_tree.PostPrune(root, df_test)  # post-pruning
53 |
54 | # test the accuracy
55 | pred_true = 0
56 |     for idx in df_test.index:
57 |         label = decision_tree.Predict(root, df[df.index == idx])
58 |         if label == df_test[df_test.columns[-1]][idx]:
59 | pred_true += 1
60 |
61 | accuracy = pred_true / len(df_test.index)
62 | accuracy_scores.append(accuracy)
63 |
64 | # print the prediction accuracy result
65 | accuracy_sum = 0
66 | print("accuracy: ", end="")
67 | for i in range(k):
68 | print("%.3f " % accuracy_scores[i], end="")
69 | accuracy_sum += accuracy_scores[i]
70 | print("\naverage accuracy: %.3f" % (accuracy_sum / k))
71 |
--------------------------------------------------------------------------------
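As a rough cross-check on the self-coded CART, a sketch (not part of the original script) using scikit-learn's `DecisionTreeClassifier` with the Gini criterion, under the assumption that watermelon_2.csv keeps the id / categorical attributes / label column layout used above; all watermelon 2.0 attributes are categorical, so they are one-hot encoded first:

```python
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

with open("data/watermelon_2.csv", mode='r', encoding="gb18030") as f:
    df = pd.read_csv(f)

# one-hot encode the categorical attribute columns; the first column is the
# sample id and the last column is the class label
X = pd.get_dummies(df.iloc[:, 1:-1])
y = df.iloc[:, -1]

index_train = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]  # same split as above
clf = DecisionTreeClassifier(criterion='gini').fit(X.iloc[index_train], y.iloc[index_train])
print("test accuracy: %.3f" % clf.score(X.drop(index_train), y.drop(index_train)))
```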
/watermelon/ch4/4.4/data/watermelon_2.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xjwhhh/LearningML/bb6ca914df5d8ba3a033f549f26c0e97eb9d54ca/watermelon/ch4/4.4/data/watermelon_2.csv
--------------------------------------------------------------------------------
/watermelon/ch4/4.4/decision_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | '''
4 | Because this dataset is rather poor, the tree's overall performance is mediocre and cross-validation fluctuates a lot.
5 | Pruning is an important way to improve generalization; ignoring modeling cost, post-pruning generally outperforms pre-pruning.
6 | Besides pruning, constraints such as a maximum leaf depth are also commonly used to preserve generalization.
7 | '''
8 |
9 |
10 | class Node(object):
11 |
12 | '''
13 | definition of decision node class
14 |
15 | attr: attribution as parent for a new branching
16 | attr_down: dict: {key, value}
17 | key: categorical: categorical attr_value
18 | continuous: '<= div_value' for small part
19 | '> div_value' for big part
20 | value: children (Node class)
21 | label: class label (the majority of current sample labels)
22 | '''
23 |
24 |     def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
25 |         self.attr = attr_init
26 |         self.label = label_init
27 |         self.attr_down = attr_down_init if attr_down_init is not None else {}  # avoid sharing one dict across instances (mutable default)
28 |
29 |
30 | def TreeGenerate(df):
31 | '''
32 | Branching for decision tree using recursion
33 |
34 | @param df: the pandas dataframe of the data_set
35 | @return root: Node, the root node of decision tree
36 | '''
37 | # generating a new root node
38 | new_node = Node(None, None, {})
39 | label_arr = df[df.columns[-1]]
40 |
41 | label_count = NodeLabel(label_arr)
42 | if label_count: # assert the label_count isn't empty
43 | new_node.label = max(label_count, key=label_count.get)
44 |
45 | # end if there is only 1 class in current node data
46 | # end if attribution array is empty
47 | if len(label_count) == 1 or len(label_arr) == 0:
48 | return new_node
49 |
50 | # get the optimal attribution for a new branching
51 | new_node.attr, div_value = OptAttr_Gini(df) # via Gini index
52 |
53 | # recursion
54 | if div_value == 0: # categorical variable
55 | value_count = ValueCount(df[new_node.attr])
56 | for value in value_count:
57 | df_v = df[df[new_node.attr].isin([value])] # get sub set
58 |             # delete the current attribute
59 |             df_v = df_v.drop(new_node.attr, axis=1)
60 | new_node.attr_down[value] = TreeGenerate(df_v)
61 |
62 | else: # continuous variable # left and right child
63 | value_l = "<=%.3f" % div_value
64 | value_r = ">%.3f" % div_value
65 | df_v_l = df[df[new_node.attr] <= div_value] # get sub set
66 | df_v_r = df[df[new_node.attr] > div_value]
67 |
68 | new_node.attr_down[value_l] = TreeGenerate(df_v_l)
69 | new_node.attr_down[value_r] = TreeGenerate(df_v_r)
70 |
71 | return new_node
72 |
73 |
74 | def Predict(root, df_sample):
75 | '''
76 | make a predict based on root
77 |
78 | @param root: Node, root Node of the decision tree
79 | @param df_sample: dataframe, a sample line
80 | '''
81 | try:
82 | import re # using Regular Expression to get the number in string
83 | except ImportError:
84 | print("module re not found")
85 |
86 |     while root.attr is not None:
87 | # continuous variable
88 |         if np.issubdtype(df_sample[root.attr].dtype, np.number):  # continuous attribute: the column has a numeric dtype
89 | # get the div_value from root.attr_down
90 | for key in list(root.attr_down):
91 | num = re.findall(r"\d+\.?\d*", key)
92 | div_value = float(num[0])
93 | break
94 | if df_sample[root.attr].values[0] <= div_value:
95 | key = "<=%.3f" % div_value
96 | root = root.attr_down[key]
97 | else:
98 | key = ">%.3f" % div_value
99 | root = root.attr_down[key]
100 |
101 | # categorical variable
102 | else:
103 | key = df_sample[root.attr].values[0]
104 | # check whether the attr_value in the child branch
105 | if key in root.attr_down:
106 | root = root.attr_down[key]
107 | else:
108 | break
109 |
110 | return root.label
111 |
112 |
113 | def PredictAccuracy(root, df_test):
114 | '''
115 | calculating accuracy of prediction on test set
116 |
117 | @param root: Node, root Node of the decision tree
118 | @param df_test: dataframe, test data set
119 | @return accuracy, float,
120 | '''
121 | if len(df_test.index) == 0: return 0
122 | pred_true = 0
123 | for i in df_test.index:
124 | label = Predict(root, df_test[df_test.index == i])
125 | if label == df_test[df_test.columns[-1]][i]:
126 | pred_true += 1
127 | return pred_true / len(df_test.index)
128 |
129 |
130 | def PrePrune(df_train, df_test):
131 | '''
132 |     generating a decision tree with pre-pruning
133 |     By Occam's razor, this pruned tree should be preferable to the unpruned one;
134 |     because the dataset is small, the advantage of pre-pruning is not obvious here,
135 |     though in practice pre-pruning does improve models. The resulting model here
136 |     is too simple and carries a serious risk of underfitting.
137 |
138 | @param df_train: dataframe, the training set to generating a tree
139 | @param df_test: dataframe, the testing set for pruning decision
140 | @return root: Node, root of the tree using pruning
141 | '''
142 | # generating a new root node
143 | new_node = Node(None, None, {})
144 | label_arr = df_train[df_train.columns[-1]]
145 |
146 | label_count = NodeLabel(label_arr)
147 | if label_count: # assert the label_count isn't empty
148 | new_node.label = max(label_count, key=label_count.get)
149 |
150 | # end if there is only 1 class in current node data
151 | # end if attribution array is empty
152 | if len(label_count) == 1 or len(label_arr) == 0:
153 | return new_node
154 |
155 | # calculating the test accuracy up to current node
156 | a0 = PredictAccuracy(new_node, df_test)
157 |
158 | # get the optimal attribution for a new branching
159 | new_node.attr, div_value = OptAttr_Gini(df_train) # via Gini index
160 |
161 | # get the new branch
162 | if div_value == 0: # categorical variable
163 | value_count = ValueCount(df_train[new_node.attr])
164 | for value in value_count:
165 | df_v = df_train[df_train[new_node.attr].isin([value])] # get sub set
166 |             df_v = df_v.drop(new_node.attr, axis=1)
167 | # for child node
168 | new_node_child = Node(None, None, {})
169 |             label_arr_child = df_v[df_v.columns[-1]]  # majority label within this subset
170 | label_count_child = NodeLabel(label_arr_child)
171 | new_node_child.label = max(label_count_child, key=label_count_child.get)
172 | new_node.attr_down[value] = new_node_child
173 |
174 |         # check whether further branching is needed
175 | a1 = PredictAccuracy(new_node, df_test)
176 | if a1 > a0: # need branching
177 | for value in value_count:
178 | df_v = df_train[df_train[new_node.attr].isin([value])] # get sub set
179 |                 df_v = df_v.drop(new_node.attr, axis=1)
180 | new_node.attr_down[value] = TreeGenerate(df_v)
181 | else:
182 | new_node.attr = None
183 | new_node.attr_down = {}
184 |
185 | else: # continuous variable # left and right child
186 | value_l = "<=%.3f" % div_value
187 | value_r = ">%.3f" % div_value
188 | df_v_l = df_train[df_train[new_node.attr] <= div_value] # get sub set
189 | df_v_r = df_train[df_train[new_node.attr] > div_value]
190 |
191 | # for child node
192 | new_node_l = Node(None, None, {})
193 | new_node_r = Node(None, None, {})
194 |         label_count_l = NodeLabel(df_v_l[df_v_l.columns[-1]])
195 | label_count_r = NodeLabel(df_v_r[df_v_r.columns[-1]])
196 | new_node_l.label = max(label_count_l, key=label_count_l.get)
197 | new_node_r.label = max(label_count_r, key=label_count_r.get)
198 | new_node.attr_down[value_l] = new_node_l
199 | new_node.attr_down[value_r] = new_node_r
200 |
201 |         # check whether further branching is needed
202 | a1 = PredictAccuracy(new_node, df_test)
203 | if a1 > a0: # need branching
204 | new_node.attr_down[value_l] = TreeGenerate(df_v_l)
205 | new_node.attr_down[value_r] = TreeGenerate(df_v_r)
206 | else:
207 | new_node.attr = None
208 | new_node.attr_down = {}
209 |
210 | return new_node
211 |
212 |
213 | def PostPrune(root, df_test):
214 | '''
215 |     post-pruning of a generated decision tree
216 |     The accuracy improves markedly over the previous trees, showing that post-pruning
217 |     strengthens generalization while keeping a reasonable tree size, giving a better fit.
218 |
219 | @param root: Node, root of the tree
220 | @param df_test: dataframe, the testing set for pruning decision
221 | @return accuracy score through traversal the tree
222 | '''
223 | # leaf node
224 |     if root.attr is None:
225 | return PredictAccuracy(root, df_test)
226 |
227 | # calculating the test accuracy on children node
228 | a1 = 0
229 | value_count = ValueCount(df_test[root.attr])
230 | for value in list(value_count):
231 | df_test_v = df_test[df_test[root.attr].isin([value])] # get sub set
232 | if value in root.attr_down: # root has the value
233 | a1_v = PostPrune(root.attr_down[value], df_test_v)
234 | else: # root doesn't have value
235 | a1_v = PredictAccuracy(root, df_test_v)
236 | if a1_v == -1: # -1 means no pruning back from this child
237 | return -1
238 | else:
239 | a1 += a1_v * len(df_test_v.index) / len(df_test.index)
240 |
241 | # calculating the test accuracy on this node
242 | node = Node(None, root.label, {})
243 | a0 = PredictAccuracy(node, df_test)
244 |
245 | # check if need pruning
246 | if a0 >= a1:
247 | root.attr = None
248 | root.attr_down = {}
249 | return a0
250 | else:
251 | return -1
252 |
253 |
254 | def NodeLabel(label_arr):
255 | '''
256 |     counting the labels that appear and their counts
257 |
258 |     @param label_arr: data array for class labels
259 |     @return label_count: dict, each appearing label and its count
260 | '''
261 | label_count = {} # store count of label
262 |
263 | for label in label_arr:
264 | if label in label_count:
265 | label_count[label] += 1
266 | else:
267 | label_count[label] = 1
268 |
269 | return label_count
270 |
271 |
272 | def ValueCount(data_arr):
273 | '''
274 |     counting the values that appear for a categorical attribute and their counts
275 |
276 |     @param data_arr: data array for an attribute
277 |     @return value_count: dict, each appearing value and its count
278 | '''
279 | value_count = {} # store count of value
280 |
281 | for label in data_arr:
282 | if label in value_count:
283 | value_count[label] += 1
284 | else:
285 | value_count[label] = 1
286 |
287 | return value_count
288 |
289 |
290 | '''
291 | optimal attribute selection in the CART algorithm, based on the Gini index
292 | '''
293 |
294 |
295 | def OptAttr_Gini(df):
296 | '''
297 | find the optimal attributes of current data_set based on gini index
298 |
299 | @param df: the pandas dataframe of the data_set
300 | @return opt_attr: the optimal attribution for branch
301 | @return div_value: for discrete variable value = 0
302 | for continuous variable value = t for bisection divide value
303 | '''
304 |     gini_index = float('Inf')
305 |     for attr_id in df.columns[1:-1]:
306 |         gini_index_tmp, div_value_tmp = GiniIndex(df, attr_id)  # CART selects the attribute with the smallest Gini index
307 | if gini_index_tmp < gini_index:
308 | gini_index = gini_index_tmp
309 | opt_attr = attr_id
310 | div_value = div_value_tmp
311 |
312 | return opt_attr, div_value
313 |
314 |
315 | def GiniIndex(df, attr_id):
316 | '''
317 | calculating the gini index of an attribution
318 |     compute the Gini index: Gini_index(D, a) = sum_v |D^v|/|D| * Gini(D^v)
319 |
320 |
321 | @param df: dataframe, the pandas dataframe of the data_set
322 | @param attr_id: the target attribution in df
323 | @return gini_index: the gini index of current attribution
324 | @return div_value: for discrete variable, value = 0
325 | for continuous variable, value = t (the division value)
326 | '''
327 |     gini_index = 0  # accumulator for the weighted Gini index
328 | div_value = 0 # div_value for continuous attribute
329 |
330 | n = len(df[attr_id]) # the number of sample
331 |
332 | # 1.for continuous variable using method of bisection
333 | if type(df[attr_id][0]) == np.float64:
334 | sub_gini = {} # store the div_value (div) and it's subset gini value
335 |
336 |         df = df.sort_values(by=attr_id, ascending=True)  # sort by this column (DataFrame.sort was removed from pandas)
337 | df = df.reset_index(drop=True)
338 |
339 | data_arr = df[attr_id]
340 | label_arr = df[df.columns[-1]]
341 |
342 |         for i in range(n - 1):
343 |             div = (data_arr[i] + data_arr[i + 1]) / 2
344 |             sub_gini[div] = ((i + 1) * Gini(label_arr[0:i + 1]) / n) \
345 |                             + ((n - i - 1) * Gini(label_arr[i + 1:]) / n)
346 |         # our goal is the divide value with the minimum weighted Gini sum
347 |         div_value, gini_index = min(sub_gini.items(), key=lambda x: x[1])
348 |
349 | # 2.for discrete variable (categorical variable)
350 | else:
351 | data_arr = df[attr_id]
352 | label_arr = df[df.columns[-1]]
353 | value_count = ValueCount(data_arr)
354 |
355 | for key in value_count:
356 | key_label_arr = label_arr[data_arr == key]
357 | gini_index += value_count[key] * Gini(key_label_arr) / n
358 |
359 | return gini_index, div_value
360 |
361 |
362 | def Gini(label_arr):
363 | '''
364 |     calculating the Gini value of a label array: Gini(D) = 1 - sum_k p_k^2
365 |
366 |     @param label_arr: ndarray, class label array of data_arr
367 |     @return gini: the Gini value of the current label array
368 | '''
369 | gini = 1
370 |
371 | n = len(label_arr)
372 | label_count = NodeLabel(label_arr)
373 | for key in label_count:
374 | gini -= (label_count[key] / n) * (label_count[key] / n)
375 |
376 | return gini
377 |
378 |
379 | '''
380 | optimal attribute selection in the ID3 algorithm, based on information entropy
381 | '''
382 |
383 |
384 | def OptAttr_Ent(df):
385 | '''
386 |     find the optimal attribute of the current data_set based on info entropy
387 |
388 |     @param df: the pandas dataframe of the data_set
389 |     @return opt_attr: the optimal attribute for branching
390 | @return div_value: for discrete variable value = 0
391 | for continuous variable value = t for bisection divide value
392 | '''
393 |     info_gain = 0
394 |     opt_attr, div_value = df.columns[1], 0  # fallback so the return never hits an unbound name
395 |     for attr_id in df.columns[1:-1]:
396 |         info_gain_tmp, div_value_tmp = InfoGain(df, attr_id)
397 |         if info_gain_tmp > info_gain:
398 |             info_gain = info_gain_tmp
399 |             opt_attr = attr_id
400 |             div_value = div_value_tmp
401 |
402 | return opt_attr, div_value
403 |
404 |
405 | def InfoGain(df, attr_id):
406 | '''
407 | calculating the information gain of an attribution
408 |
409 | @param df: dataframe, the pandas dataframe of the data_set
410 |     @param attr_id: the target attribute in df
411 |     @return info_gain: the information gain of the current attribute
412 | @return div_value: for discrete variable, value = 0
413 | for continuous variable, value = t (the division value)
414 | '''
415 | info_gain = InfoEnt(df.values[:, -1]) # info_gain for the whole label
416 | div_value = 0 # div_value for continuous attribute
417 |
418 | n = len(df[attr_id]) # the number of sample
419 | # 1.for continuous variable using method of bisection
420 | if type(df[attr_id][0]) == np.float64:
421 | sub_info_ent = {} # store the div_value (div) and it's subset entropy
422 |
423 |         df = df.sort_values(by=attr_id, ascending=True)  # sort by this column (DataFrame.sort was removed from pandas)
424 | df = df.reset_index(drop=True)
425 |
426 | data_arr = df[attr_id]
427 | label_arr = df[df.columns[-1]]
428 |
429 |         for i in range(n - 1):
430 |             div = (data_arr[i] + data_arr[i + 1]) / 2
431 |             sub_info_ent[div] = ((i + 1) * InfoEnt(label_arr[0:i + 1]) / n) \
432 |                                 + ((n - i - 1) * InfoEnt(label_arr[i + 1:]) / n)
433 |         # our goal is the divide value with the minimum weighted subset entropy
434 |         div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
435 |         info_gain -= sub_info_ent_min
436 |
437 | # 2.for discrete variable (categorical variable)
438 | else:
439 | data_arr = df[attr_id]
440 | label_arr = df[df.columns[-1]]
441 | value_count = ValueCount(data_arr)
442 |
443 | for key in value_count:
444 | key_label_arr = label_arr[data_arr == key]
445 | info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
446 |
447 | return info_gain, div_value
448 |
449 |
450 | def InfoEnt(label_arr):
451 | '''
452 |     calculating the information entropy of an attribute: Ent(D) = -sum_k p_k * log2(p_k)
453 |
454 | @param label_arr: ndarray, class label array of data_arr
455 | @return ent: the information entropy of current attribution
456 | '''
457 | try:
458 | from math import log2
459 | except ImportError:
460 | print("module math.log2 not found")
461 |
462 | ent = 0
463 | n = len(label_arr)
464 | label_count = NodeLabel(label_arr)
465 |
466 | for key in label_count:
467 | ent -= (label_count[key] / n) * log2(label_count[key] / n)
468 |
469 | return ent
470 |
471 |
472 | def DrawPNG(root, out_file):
473 | '''
474 | visualization of decision tree from root.
475 | @param root: Node, the root node for tree.
476 | @param out_file: str, name and path of output file
477 | '''
478 | try:
479 | from pydotplus import graphviz
480 | except ImportError:
481 | print("module pydotplus.graphviz not found")
482 |
483 | g = graphviz.Dot() # generation of new dot
484 |
485 | TreeToGraph(0, g, root)
486 | g2 = graphviz.graph_from_dot_data(g.to_string())
487 |
488 | g2.write_png(out_file)
489 |
490 |
491 | def TreeToGraph(i, g, root):
492 | '''
493 | build a graph from root on
494 | @param i: node number in this tree
495 | @param g: pydotplus.graphviz.Dot() object
496 | @param root: the root node
497 |
498 | @return i: node number after modified
499 | # @return g: pydotplus.graphviz.Dot() object after modified
500 | @return g_node: the current root node in graphviz
501 | '''
502 | try:
503 | from pydotplus import graphviz
504 | except ImportError:
505 | print("module pydotplus.graphviz not found")
506 |
507 |     if root.attr is None:
508 | g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
509 | else:
510 | g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
511 | g_node = i
512 | g.add_node(graphviz.Node(g_node, label=g_node_label))
513 |
514 | for value in list(root.attr_down):
515 | i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
516 | g.add_edge(graphviz.Edge(g_node, g_child, label=value))
517 |
518 | return i, g_node
519 |
--------------------------------------------------------------------------------
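For reference, `Gini` and `GiniIndex` above implement Gini(D) = 1 - sum_k p_k^2 (the probability that two samples drawn from D disagree in class) and the size-weighted average of subset Gini values, which `OptAttr_Gini` minimizes. A small standalone check with made-up label arrays:

```python
def gini(labels):
    # Gini(D) = 1 - sum_k p_k^2
    n = len(labels)
    return 1 - sum((labels.count(c) / n) ** 2 for c in set(labels))

print(gini(['good'] * 5 + ['bad'] * 5))   # 0.5, the maximum impurity for two classes
print(gini(['good'] * 10))                # 0.0, a pure node

# size-weighted Gini index of a candidate split, as accumulated in GiniIndex()
left = ['good'] * 4 + ['bad'] * 1
right = ['good'] * 1 + ['bad'] * 4
print(5 / 10 * gini(left) + 5 / 10 * gini(right))  # ~0.32, below the parent's 0.5
```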