├── README.md
├── als.py
├── dataset
│   ├── boston_house_prices.csv
│   ├── breast_cancer.csv
│   ├── movie_ratings.csv
│   ├── observations.csv
│   └── states.csv
├── decision_tree.py
├── gauss_nb.py
├── gbdt_classify.py
├── gbdt_classify_regression.py
├── gbdt_regression.py
├── image
│   ├── gbdt_regression.png
│   ├── kmeans.png
│   ├── knn.png
│   ├── regression_tree.png
│   ├── ridge.png
│   └── weixin.jpg
├── kd_tree.py
├── kmeans.py
├── knn.py
├── lda-multi-classify.py
├── lda-two-classify.py
├── linear_regression.py
├── logistic_regression.py
├── max_heap.py
├── pca.py
├── perceptron.py
├── random_forest.py
├── regression_tree.py
└── ridge.py
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Machine Learning Algorithms
2 | 
3 | - regression_tree.py Regression tree. For the theory, see: https://blog.csdn.net/just_sort/article/details/100574688
4 | - gbdt_regression.py Gradient boosting decision tree (GBDT) for regression. For the theory, see: https://blog.csdn.net/just_sort/article/details/100604262
5 | - gbdt_classify.py Gradient boosting decision tree (GBDT) for classification. To run this script, replace the contents of gbdt_regression.py with those of gbdt_classify_regression.py: the GBDT classifier inherits from the GBDT regression class, so some methods of that class are overridden. For the theory, see: https://blog.csdn.net/just_sort/article/details/100658881
6 | - max_heap.py Max heap. For the theory, see: https://blog.csdn.net/just_sort/article/details/100740794
7 | - linear_regression.py Linear regression. For the theory, see: https://blog.csdn.net/just_sort/article/details/101216607
8 | - logistic_regression.py Logistic regression. For the theory, see: https://blog.csdn.net/just_sort/article/details/101272026
9 | - kd_tree.py KD-tree, the foundation of KNN. For the theory, see: https://blog.csdn.net/just_sort/article/details/101296358
10 | - knn.py k-nearest neighbors. For the theory, see: https://blog.csdn.net/just_sort/article/details/101530830
11 | - gauss_nb.py Gaussian naive Bayes. For the theory, see: https://blog.csdn.net/just_sort/article/details/101704567
12 | - als.py Alternating least squares (ALS). For the theory, see: https://blog.csdn.net/just_sort/article/details/101762055
13 | - decision_tree.py Decision tree. For the theory, see: https://blog.csdn.net/just_sort/article/details/102400047
14 | - random_forest.py Random forest. For the theory, see: https://blog.csdn.net/just_sort/article/details/102415869
15 | - kmeans.py K-means clustering. For the theory, see: https://blog.csdn.net/just_sort/article/details/102499619
16 | - ridge.py Ridge regression. For the theory, see: https://blog.csdn.net/just_sort/article/details/102511645
17 | - lda-two-classify.py LDA for binary classification. For the theory, see: https://blog.csdn.net/just_sort/article/details/102690086
18 | - lda-multi-classify.py LDA for multi-class classification. For the theory, see: https://blog.csdn.net/just_sort/article/details/102701240
19 | - perceptron.py Perceptron. For the theory, see: https://blog.csdn.net/just_sort/article/details/102792788
20 | - pca.py PCA. For the theory, see: https://blog.csdn.net/just_sort/article/details/102943659
21 | 
22 | 
23 | 
24 | 
25 | 
26 | # I maintain a WeChat official account that shares papers, algorithms, competitions, and everyday life. You are welcome to join.
27 | 
28 | - If the image below does not load, just search for GiantPandaCV.
29 | 
30 | ![](image/weixin.jpg)
31 | 
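32 | ## Quick start
33 | 
34 | The scripts are self-contained: each one defines a `main()` that runs as soon as the file is executed or imported, because the `@run_time` decorator calls the decorated function immediately. A minimal sketch of trying one out (assuming Python 3 with NumPy installed, run from the repository root so the `dataset/` paths resolve):
35 | 
36 | ```python
37 | # Importing the module already triggers training and evaluation:
38 | # decision_tree.py loads dataset/breast_cancer.csv, prints the learned
39 | # rules, the test accuracy, and the total run time.
40 | import decision_tree
41 | ```
42 | 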
--------------------------------------------------------------------------------
/als.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | 
3 | from collections import defaultdict
4 | from random import random
5 | from itertools import product, chain
6 | from time import time
7 | 
8 | def load_movie_ratings():
9 | 
10 |     f = open("dataset/movie_ratings.csv")
11 |     lines = iter(f)
12 |     col_names = ", ".join(next(lines)[:-1].split(",")[:-1])
13 |     print("The column names are: %s." % col_names)
14 |     data = [[float(x) if i == 2 else int(x)
15 |              for i, x in enumerate(line[:-1].split(",")[:-1])]
16 |             for line in lines]
17 |     f.close()
18 |     return data
19 | 
20 | class Matrix(object):
21 |     def __init__(self, data):
22 |         self.data = data
23 |         self.shape = (len(data), len(data[0]))
24 | 
25 |     def row(self, row_no):
26 |         return Matrix([self.data[row_no]])
27 | 
28 |     def col(self, col_no):
29 |         m = self.shape[0]
30 |         return Matrix([[self.data[i][col_no]] for i in range(m)])
31 | 
32 |     @property
33 |     def is_square(self):
34 |         return self.shape[0] == self.shape[1]
35 | 
36 |     @property
37 |     def transpose(self):
38 |         data = list(map(list, zip(*self.data)))
39 |         return Matrix(data)
40 | 
41 |     # Build an n x n identity matrix.
42 |     def _eye(self, n):
43 |         return [[0 if i != j else 1 for j in range(n)] for i in range(n)]
44 | 
45 |     @property
46 |     def eye(self):
47 |         assert self.is_square, "The matrix has to be square!"
48 |         data = self._eye(self.shape[0])
49 |         return Matrix(data)
50 | 
51 |     # Gaussian elimination on an augmented matrix, used for inversion below.
52 |     def gaussian_elimination(self, aug_matrix):
53 |         n = len(aug_matrix)
54 |         m = len(aug_matrix[0])
55 | 
56 |         # From top to bottom.
57 |         for col_idx in range(n):
58 |             # Check whether the element on the diagonal is zero.
59 |             if aug_matrix[col_idx][col_idx] == 0:
60 |                 row_idx = col_idx
61 |                 # Find a row whose element in the same column as the
62 |                 # diagonal element is not zero.
63 |                 while row_idx < n and aug_matrix[row_idx][col_idx] == 0:
64 |                     row_idx += 1
65 |                 # Add that row to the row of the diagonal element.
66 |                 for i in range(col_idx, m):
67 |                     aug_matrix[col_idx][i] += aug_matrix[row_idx][i]
68 | 
69 |             # Eliminate the non-zero elements below the diagonal.
70 |             for i in range(col_idx + 1, n):
71 |                 # Skip zero elements.
72 |                 if aug_matrix[i][col_idx] == 0:
73 |                     continue
74 |                 # Eliminate the non-zero element.
75 |                 k = aug_matrix[i][col_idx] / aug_matrix[col_idx][col_idx]
76 |                 for j in range(col_idx, m):
77 |                     aug_matrix[i][j] -= k * aug_matrix[col_idx][j]
78 | 
79 |         # From bottom to top.
80 |         for col_idx in range(n - 1, -1, -1):
81 |             # Eliminate the non-zero elements above the diagonal.
82 |             for i in range(col_idx):
83 |                 # Skip zero elements.
84 |                 if aug_matrix[i][col_idx] == 0:
85 |                     continue
86 |                 # Eliminate the non-zero element.
87 |                 k = aug_matrix[i][col_idx] / aug_matrix[col_idx][col_idx]
88 |                 for j in chain(range(i, col_idx + 1), range(n, m)):
89 |                     aug_matrix[i][j] -= k * aug_matrix[col_idx][j]
90 | 
91 |         # Normalize the elements on the diagonal to 1.
92 |         for i in range(n):
93 |             k = 1 / aug_matrix[i][i]
94 |             aug_matrix[i][i] *= k
95 |             for j in range(n, m):
96 |                 aug_matrix[i][j] *= k
97 | 
98 |         return aug_matrix
99 | 
100 |     # Invert the matrix by running Gauss-Jordan elimination on [A | I].
101 |     def _inverse(self, data):
102 |         n = len(data)
103 |         unit_matrix = self._eye(n)
104 |         aug_matrix = [a + b for a, b in zip(self.data, unit_matrix)]
105 |         ret = self.gaussian_elimination(aug_matrix)
106 | 
107 |         return list(map(lambda x: x[n:], ret))
108 | 
109 |     # Matrix inversion; for the theory, see: https://baike.baidu.com/item/%E9%AB%98%E6%96%AF%E6%B6%88%E5%85%83%E6%B3%95/619561?fr=aladdin
110 |     @property
111 |     def inverse(self):
112 |         assert self.is_square, "The matrix has to be square!"
113 |         data = self._inverse(self.data)
114 | 
115 |         return Matrix(data)
116 | 
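    # A quick sanity check of the Gauss-Jordan inverse above, with assumed
    # toy values (not part of the original file):
    #
    #   m = Matrix([[4.0, 7.0], [2.0, 6.0]])
    #   m.inverse.data              # -> [[0.6, -0.7], [-0.2, 0.4]] (up to rounding)
    #   m.mat_mul(m.inverse).data   # -> approximately the 2 x 2 identity
    #
    # mat_mul is defined just below.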
117 |     def row_mul(self, row_A, row_B):
118 |         return sum(x[0] * x[1] for x in zip(row_A, row_B))
119 | 
120 |     def _mat_mul(self, row_A, B):
121 |         row_pairs = product([row_A], B.transpose.data)
122 |         return [self.row_mul(*row_pair) for row_pair in row_pairs]
123 | 
124 |     def mat_mul(self, B):
125 |         assert self.shape[1] == B.shape[0], "A's column count does not match B's row count!"
126 |         return Matrix([self._mat_mul(row_A, B) for row_A in self.data])
127 | 
128 |     def _mean(self, data):
129 |         m = len(data)
130 |         n = len(data[0])
131 |         ret = [0 for _ in range(n)]
132 |         for row in data:
133 |             for j in range(n):
134 |                 ret[j] += row[j] / m
135 |         return ret
136 |     def mean(self):
137 |         return Matrix([self._mean(self.data)])
138 | 
139 | # Timing helper: measures and prints how long fn takes, with a readable unit.
140 | # Note that it calls fn immediately, so @run_time runs the function right away.
141 | def run_time(fn):
142 |     def fun():
143 |         start = time()
144 |         fn()
145 |         ret = time() - start
146 |         if ret < 1e-6:
147 |             unit = "ns"
148 |             ret *= 1e9
149 |         elif ret < 1e-3:
150 |             unit = "us"
151 |             ret *= 1e6
152 |         elif ret < 1:
153 |             unit = "ms"
154 |             ret *= 1e3
155 |         else:
156 |             unit = "s"
157 |         print("Total run time is %.1f %s\n" % (ret, unit))
158 |     return fun()
159 | 
160 | 
161 | class ALS(object):
162 |     # Initialization. Stores the user ids, the item ids, the id-to-column mappings
163 |     # for the user and item matrices, the items each user has rated, the shape of the rating matrix, and the RMSE.
164 |     def __init__(self):
165 |         self.user_ids = None
166 |         self.item_ids = None
167 |         self.user_ids_dict = None
168 |         self.item_ids_dict = None
169 |         self.user_matrix = None
170 |         self.item_matrix = None
171 |         self.user_items = None
172 |         self.shape = None
173 |         self.rmse = None
174 | 
175 |     # Preprocess the training data to get the user ids, item ids, the id-to-column
176 |     # mappings, the shape of the rating matrix, and the rating matrix and its transpose.
177 |     def process_data(self, X):
178 |         self.user_ids = tuple(set(map(lambda x: x[0], X)))
179 |         self.user_ids_dict = dict(map(lambda x: x[::-1], enumerate(self.user_ids)))
180 |         self.item_ids = tuple(set(map(lambda x: x[1], X)))
181 |         self.item_ids_dict = dict(map(lambda x: x[::-1], enumerate(self.item_ids)))
182 |         self.shape = (len(self.user_ids), len(self.item_ids))
183 |         ratings = defaultdict(lambda: defaultdict(int))
184 |         ratings_T = defaultdict(lambda: defaultdict(int))
185 |         for row in X:
186 |             user_id, item_id, rating = row
187 |             ratings[user_id][item_id] = rating
188 |             ratings_T[item_id][user_id] = rating
189 |         err_msg = "Length of user_ids %d and ratings %d do not match!" % (
190 |             len(self.user_ids), len(ratings))
191 |         assert len(self.user_ids) == len(ratings), err_msg
192 |         err_msg = "Length of item_ids %d and ratings_T %d do not match!" % (
193 |             len(self.item_ids), len(ratings_T))
194 |         assert len(self.item_ids) == len(ratings_T), err_msg
195 |         return ratings, ratings_T
196 | 
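    # To illustrate the layout that process_data produces, with assumed toy
    # data (not from the real movie_ratings.csv):
    #   X = [[1, 10, 4.0], [1, 20, 3.5], [2, 10, 5.0]]   # (user, item, rating)
    # gives
    #   ratings   == {1: {10: 4.0, 20: 3.5}, 2: {10: 5.0}}
    #   ratings_T == {10: {1: 4.0, 2: 5.0}, 20: {1: 3.5}}
    # and any unrated (user, item) pair simply defaults to 0.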
197 |     # Dense-by-sparse product of the user matrix and the transposed rating matrix.
198 |     def users_mul_ratings(self, users, ratings_T):
199 |         def f(users_row, item_id):
200 |             user_ids = iter(ratings_T[item_id].keys())
201 |             scores = iter(ratings_T[item_id].values())
202 |             col_nos = map(lambda x: self.user_ids_dict[x], user_ids)
203 |             _users_row = map(lambda x: users_row[x], col_nos)
204 |             return sum(a * b for a, b in zip(_users_row, scores))
205 | 
206 |         ret = [[f(users_row, item_id) for item_id in self.item_ids]
207 |                for users_row in users.data]
208 |         return Matrix(ret)
209 | 
210 |     # Dense-by-sparse product of the item matrix and the rating matrix.
211 |     def items_mul_ratings(self, items, ratings):
212 |         def f(items_row, user_id):
213 |             item_ids = iter(ratings[user_id].keys())
214 |             scores = iter(ratings[user_id].values())
215 |             col_nos = map(lambda x: self.item_ids_dict[x], item_ids)
216 |             _items_row = map(lambda x: items_row[x], col_nos)
217 |             return sum(a * b for a, b in zip(_items_row, scores))
218 | 
219 |         ret = [[f(items_row, user_id) for user_id in self.user_ids]
220 |                for items_row in items.data]
221 |         return Matrix(ret)
222 | 
223 |     # Generate a random matrix.
224 |     def gen_random_matrix(self, n_rows, n_columns):
225 |         data = [[random() for _ in range(n_columns)] for _ in range(n_rows)]
226 |         return Matrix(data)
227 | 
228 |     # Compute the RMSE over the observed ratings.
229 |     def get_rmse(self, ratings):
230 |         m, n = self.shape
231 |         mse = 0.0
232 |         n_elements = sum(map(len, ratings.values()))
233 |         for i in range(m):
234 |             for j in range(n):
235 |                 user_id = self.user_ids[i]
236 |                 item_id = self.item_ids[j]
237 |                 rating = ratings[user_id][item_id]
238 |                 if rating > 0:
239 |                     user_row = self.user_matrix.col(i).transpose
240 |                     item_col = self.item_matrix.col(j)
241 |                     rating_hat = user_row.mat_mul(item_col).data[0][0]
242 |                     square_error = (rating - rating_hat) ** 2
243 |                     mse += square_error / n_elements
244 |         return mse ** 0.5
245 | 
246 |     # Train the model:
247 |     # 1. Preprocess the data.
248 |     # 2. Validate the parameter k.
249 |     # 3. Generate a random user matrix U.
250 |     # 4. Alternately update the user matrix U and the item matrix I, printing the RMSE, until max_iter iterations are reached.
251 |     # 5. Save the final RMSE.
252 |     def fit(self, X, k, max_iter=10):
253 |         ratings, ratings_T = self.process_data(X)
254 |         self.user_items = {k: set(v.keys()) for k, v in ratings.items()}
255 |         m, n = self.shape
256 |         error_msg = "Parameter k must be less than the rank of the original matrix!"
257 |         assert k < min(m, n), error_msg
258 |         self.user_matrix = self.gen_random_matrix(k, m)
259 |         for i in range(max_iter):
260 |             if i % 2:
261 |                 items = self.item_matrix
262 |                 self.user_matrix = self.items_mul_ratings(
263 |                     items.mat_mul(items.transpose).inverse.mat_mul(items),
264 |                     ratings
265 |                 )
266 |             else:
267 |                 users = self.user_matrix
268 |                 self.item_matrix = self.users_mul_ratings(
269 |                     users.mat_mul(users.transpose).inverse.mat_mul(users),
270 |                     ratings_T
271 |                 )
272 |             rmse = self.get_rmse(ratings)
273 |             print("Iterations: %d, RMSE: %.6f" % (i + 1, rmse))
274 |         self.rmse = rmse
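    # A sketch of the algebra behind fit(), assuming the shapes used above:
    # each half-step is an ordinary least-squares solve. With a k x m user
    # matrix U, a k x n item matrix I and an m x n rating matrix R, the two
    # branches implement
    #   I = (U U^T)^-1 U R      (users fixed)
    #   U = (I I^T)^-1 I R^T    (items fixed)
    # where multiplying by R / R^T is done sparsely by users_mul_ratings /
    # items_mul_ratings.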
275 |     # Predict for a single user: score unseen items and return the top n_items.
276 |     def _predict(self, user_id, n_items):
277 |         users_col = self.user_matrix.col(self.user_ids_dict[user_id])
278 |         users_col = users_col.transpose
279 | 
280 |         items_col = enumerate(users_col.mat_mul(self.item_matrix).data[0])
281 |         items_scores = map(lambda x: (self.item_ids[x[0]], x[1]), items_col)
282 |         viewed_items = self.user_items[user_id]
283 |         items_scores = filter(lambda x: x[0] not in viewed_items, items_scores)
284 | 
285 |         return sorted(items_scores, key=lambda x: x[1], reverse=True)[:n_items]
286 | 
287 |     # Predict for multiple users.
288 |     def predict(self, user_ids, n_items=10):
289 |         return [self._predict(user_id, n_items) for user_id in user_ids]
290 | 
291 | def format_prediction(item_id, score):
292 |     return "item_id:%d score:%.2f" % (item_id, score)
293 | 
294 | @run_time
295 | def main():
296 |     print("Testing the accuracy of ALS...")
297 | 
298 |     X = load_movie_ratings()
299 | 
300 |     model = ALS()
301 |     model.fit(X, k=3, max_iter=5)
302 |     print("Showing the predictions of users...")
303 | 
304 |     user_ids = range(1, 5)
305 |     predictions = model.predict(user_ids, n_items=2)
306 |     for user_id, prediction in zip(user_ids, predictions):
307 |         _prediction = [format_prediction(item_id, score)
308 |                        for item_id, score in prediction]
309 |         print("User id:%d recommendation: %s" % (user_id, _prediction))
310 | 
311 | 
--------------------------------------------------------------------------------
/dataset/boston_house_prices.csv:
--------------------------------------------------------------------------------
1 | 0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2 | 0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3 | 0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4 | 0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5 | 0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
6 | 0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
7 | 0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
8 | 0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
9 | 0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93,16.5
10 | 0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9
11 | 0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15
12 | 0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27,18.9
13 | 0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71,21.7
14 | 0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26,20.4
15 | 0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26,18.2
16 | 0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47,19.9
17 | 1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58,23.1
18 | 0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67,17.5
19 | 0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69,20.2
20 | 0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28,18.2
21 | 1.25179,0,8.14,0,0.538,5.57,98.1,3.7979,4,307,21,376.57,21.02,13.6
22 | 0.85204,0,8.14,0,0.538,5.965,89.2,4.0123,4,307,21,392.53,13.83,19.6
23 | 1.23247,0,8.14,0,0.538,6.142,91.7,3.9769,4,307,21,396.9,18.72,15.2
24 | 0.98843,0,8.14,0,0.538,5.813,100,4.0952,4,307,21,394.54,19.88,14.5
25 | 0.75026,0,8.14,0,0.538,5.924,94.1,4.3996,4,307,21,394.33,16.3,15.6
26 | 0.84054,0,8.14,0,0.538,5.599,85.7,4.4546,4,307,21,303.42,16.51,13.9
27 | 0.67191,0,8.14,0,0.538,5.813,90.3,4.682,4,307,21,376.88,14.81,16.6
28 | 0.95577,0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21,306.38,17.28,14.8
29 | 0.77299,0,8.14,0,0.538,6.495,94.4,4.4547,4,307,21,387.94,12.8,18.4
30 | 1.00245,0,8.14,0,0.538,6.674,87.3,4.239,4,307,21,380.23,11.98,21
31 | 1.13081,0,8.14,0,0.538,5.713,94.1,4.233,4,307,21,360.17,22.6,12.7
32 | 1.35472,0,8.14,0,0.538,6.072,100,4.175,4,307,21,376.73,13.04,14.5
33 | 1.38799,0,8.14,0,0.538,5.95,82,3.99,4,307,21,232.6,27.71,13.2
34 | 1.15172,0,8.14,0,0.538,5.701,95,3.7872,4,307,21,358.77,18.35,13.1
35 | 1.61282,0,8.14,0,0.538,6.096,96.9,3.7598,4,307,21,248.31,20.34,13.5
36 |
0.06417,0,5.96,0,0.499,5.933,68.2,3.3603,5,279,19.2,396.9,9.68,18.9 37 | 0.09744,0,5.96,0,0.499,5.841,61.4,3.3779,5,279,19.2,377.56,11.41,20 38 | 0.08014,0,5.96,0,0.499,5.85,41.5,3.9342,5,279,19.2,396.9,8.77,21 39 | 0.17505,0,5.96,0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13,24.7 40 | 0.02763,75,2.95,0,0.428,6.595,21.8,5.4011,3,252,18.3,395.63,4.32,30.8 41 | 0.03359,75,2.95,0,0.428,7.024,15.8,5.4011,3,252,18.3,395.62,1.98,34.9 42 | 0.12744,0,6.91,0,0.448,6.77,2.9,5.7209,3,233,17.9,385.41,4.84,26.6 43 | 0.1415,0,6.91,0,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81,25.3 44 | 0.15936,0,6.91,0,0.448,6.211,6.5,5.7209,3,233,17.9,394.46,7.44,24.7 45 | 0.12269,0,6.91,0,0.448,6.069,40,5.7209,3,233,17.9,389.39,9.55,21.2 46 | 0.17142,0,6.91,0,0.448,5.682,33.8,5.1004,3,233,17.9,396.9,10.21,19.3 47 | 0.18836,0,6.91,0,0.448,5.786,33.3,5.1004,3,233,17.9,396.9,14.15,20 48 | 0.22927,0,6.91,0,0.448,6.03,85.5,5.6894,3,233,17.9,392.74,18.8,16.6 49 | 0.25387,0,6.91,0,0.448,5.399,95.3,5.87,3,233,17.9,396.9,30.81,14.4 50 | 0.21977,0,6.91,0,0.448,5.602,62,6.0877,3,233,17.9,396.9,16.2,19.4 51 | 0.08873,21,5.64,0,0.439,5.963,45.7,6.8147,4,243,16.8,395.56,13.45,19.7 52 | 0.04337,21,5.64,0,0.439,6.115,63,6.8147,4,243,16.8,393.97,9.43,20.5 53 | 0.0536,21,5.64,0,0.439,6.511,21.1,6.8147,4,243,16.8,396.9,5.28,25 54 | 0.04981,21,5.64,0,0.439,5.998,21.4,6.8147,4,243,16.8,396.9,8.43,23.4 55 | 0.0136,75,4,0,0.41,5.888,47.6,7.3197,3,469,21.1,396.9,14.8,18.9 56 | 0.01311,90,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81,35.4 57 | 0.02055,85,0.74,0,0.41,6.383,35.7,9.1876,2,313,17.3,396.9,5.77,24.7 58 | 0.01432,100,1.32,0,0.411,6.816,40.5,8.3248,5,256,15.1,392.9,3.95,31.6 59 | 0.15445,25,5.13,0,0.453,6.145,29.2,7.8148,8,284,19.7,390.68,6.86,23.3 60 | 0.10328,25,5.13,0,0.453,5.927,47.2,6.932,8,284,19.7,396.9,9.22,19.6 61 | 0.14932,25,5.13,0,0.453,5.741,66.2,7.2254,8,284,19.7,395.11,13.15,18.7 62 | 0.17171,25,5.13,0,0.453,5.966,93.4,6.8185,8,284,19.7,378.08,14.44,16 63 | 0.11027,25,5.13,0,0.453,6.456,67.8,7.2255,8,284,19.7,396.9,6.73,22.2 64 | 0.1265,25,5.13,0,0.453,6.762,43.4,7.9809,8,284,19.7,395.58,9.5,25 65 | 0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05,33 66 | 0.03584,80,3.37,0,0.398,6.29,17.8,6.6115,4,337,16.1,396.9,4.67,23.5 67 | 0.04379,80,3.37,0,0.398,5.787,31.1,6.6115,4,337,16.1,396.9,10.24,19.4 68 | 0.05789,12.5,6.07,0,0.409,5.878,21.4,6.498,4,345,18.9,396.21,8.1,22 69 | 0.13554,12.5,6.07,0,0.409,5.594,36.8,6.498,4,345,18.9,396.9,13.09,17.4 70 | 0.12816,12.5,6.07,0,0.409,5.885,33,6.498,4,345,18.9,396.9,8.79,20.9 71 | 0.08826,0,10.81,0,0.413,6.417,6.6,5.2873,4,305,19.2,383.73,6.72,24.2 72 | 0.15876,0,10.81,0,0.413,5.961,17.5,5.2873,4,305,19.2,376.94,9.88,21.7 73 | 0.09164,0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52,22.8 74 | 0.19539,0,10.81,0,0.413,6.245,6.2,5.2873,4,305,19.2,377.17,7.54,23.4 75 | 0.07896,0,12.83,0,0.437,6.273,6,4.2515,5,398,18.7,394.92,6.78,24.1 76 | 0.09512,0,12.83,0,0.437,6.286,45,4.5026,5,398,18.7,383.23,8.94,21.4 77 | 0.10153,0,12.83,0,0.437,6.279,74.5,4.0522,5,398,18.7,373.66,11.97,20 78 | 0.08707,0,12.83,0,0.437,6.14,45.8,4.0905,5,398,18.7,386.96,10.27,20.8 79 | 0.05646,0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.4,12.34,21.2 80 | 0.08387,0,12.83,0,0.437,5.874,36.6,4.5026,5,398,18.7,396.06,9.1,20.3 81 | 0.04113,25,4.86,0,0.426,6.727,33.5,5.4007,4,281,19,396.9,5.29,28 82 | 0.04462,25,4.86,0,0.426,6.619,70.4,5.4007,4,281,19,395.63,7.22,23.9 83 | 0.03659,25,4.86,0,0.426,6.302,32.2,5.4007,4,281,19,396.9,6.72,24.8 84 | 
0.03551,25,4.86,0,0.426,6.167,46.7,5.4007,4,281,19,390.64,7.51,22.9 85 | 0.05059,0,4.49,0,0.449,6.389,48,4.7794,3,247,18.5,396.9,9.62,23.9 86 | 0.05735,0,4.49,0,0.449,6.63,56.1,4.4377,3,247,18.5,392.3,6.53,26.6 87 | 0.05188,0,4.49,0,0.449,6.015,45.1,4.4272,3,247,18.5,395.99,12.86,22.5 88 | 0.07151,0,4.49,0,0.449,6.121,56.8,3.7476,3,247,18.5,395.15,8.44,22.2 89 | 0.0566,0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.9,5.5,23.6 90 | 0.05302,0,3.41,0,0.489,7.079,63.1,3.4145,2,270,17.8,396.06,5.7,28.7 91 | 0.04684,0,3.41,0,0.489,6.417,66.1,3.0923,2,270,17.8,392.18,8.81,22.6 92 | 0.03932,0,3.41,0,0.489,6.405,73.9,3.0921,2,270,17.8,393.55,8.2,22 93 | 0.04203,28,15.04,0,0.464,6.442,53.6,3.6659,4,270,18.2,395.01,8.16,22.9 94 | 0.02875,28,15.04,0,0.464,6.211,28.9,3.6659,4,270,18.2,396.33,6.21,25 95 | 0.04294,28,15.04,0,0.464,6.249,77.3,3.615,4,270,18.2,396.9,10.59,20.6 96 | 0.12204,0,2.89,0,0.445,6.625,57.8,3.4952,2,276,18,357.98,6.65,28.4 97 | 0.11504,0,2.89,0,0.445,6.163,69.6,3.4952,2,276,18,391.83,11.34,21.4 98 | 0.12083,0,2.89,0,0.445,8.069,76,3.4952,2,276,18,396.9,4.21,38.7 99 | 0.08187,0,2.89,0,0.445,7.82,36.9,3.4952,2,276,18,393.53,3.57,43.8 100 | 0.0686,0,2.89,0,0.445,7.416,62.5,3.4952,2,276,18,396.9,6.19,33.2 101 | 0.14866,0,8.56,0,0.52,6.727,79.9,2.7778,5,384,20.9,394.76,9.42,27.5 102 | 0.11432,0,8.56,0,0.52,6.781,71.3,2.8561,5,384,20.9,395.58,7.67,26.5 103 | 0.22876,0,8.56,0,0.52,6.405,85.4,2.7147,5,384,20.9,70.8,10.63,18.6 104 | 0.21161,0,8.56,0,0.52,6.137,87.4,2.7147,5,384,20.9,394.47,13.44,19.3 105 | 0.1396,0,8.56,0,0.52,6.167,90,2.421,5,384,20.9,392.69,12.33,20.1 106 | 0.13262,0,8.56,0,0.52,5.851,96.7,2.1069,5,384,20.9,394.05,16.47,19.5 107 | 0.1712,0,8.56,0,0.52,5.836,91.9,2.211,5,384,20.9,395.67,18.66,19.5 108 | 0.13117,0,8.56,0,0.52,6.127,85.2,2.1224,5,384,20.9,387.69,14.09,20.4 109 | 0.12802,0,8.56,0,0.52,6.474,97.1,2.4329,5,384,20.9,395.24,12.27,19.8 110 | 0.26363,0,8.56,0,0.52,6.229,91.2,2.5451,5,384,20.9,391.23,15.55,19.4 111 | 0.10793,0,8.56,0,0.52,6.195,54.4,2.7778,5,384,20.9,393.49,13,21.7 112 | 0.10084,0,10.01,0,0.547,6.715,81.6,2.6775,6,432,17.8,395.59,10.16,22.8 113 | 0.12329,0,10.01,0,0.547,5.913,92.9,2.3534,6,432,17.8,394.95,16.21,18.8 114 | 0.22212,0,10.01,0,0.547,6.092,95.4,2.548,6,432,17.8,396.9,17.09,18.7 115 | 0.14231,0,10.01,0,0.547,6.254,84.2,2.2565,6,432,17.8,388.74,10.45,18.5 116 | 0.17134,0,10.01,0,0.547,5.928,88.2,2.4631,6,432,17.8,344.91,15.76,18.3 117 | 0.13158,0,10.01,0,0.547,6.176,72.5,2.7301,6,432,17.8,393.3,12.04,21.2 118 | 0.15098,0,10.01,0,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.3,19.2 119 | 0.13058,0,10.01,0,0.547,5.872,73.1,2.4775,6,432,17.8,338.63,15.37,20.4 120 | 0.14476,0,10.01,0,0.547,5.731,65.2,2.7592,6,432,17.8,391.5,13.61,19.3 121 | 0.06899,0,25.65,0,0.581,5.87,69.7,2.2577,2,188,19.1,389.15,14.37,22 122 | 0.07165,0,25.65,0,0.581,6.004,84.1,2.1974,2,188,19.1,377.67,14.27,20.3 123 | 0.09299,0,25.65,0,0.581,5.961,92.9,2.0869,2,188,19.1,378.09,17.93,20.5 124 | 0.15038,0,25.65,0,0.581,5.856,97,1.9444,2,188,19.1,370.31,25.41,17.3 125 | 0.09849,0,25.65,0,0.581,5.879,95.8,2.0063,2,188,19.1,379.38,17.58,18.8 126 | 0.16902,0,25.65,0,0.581,5.986,88.4,1.9929,2,188,19.1,385.02,14.81,21.4 127 | 0.38735,0,25.65,0,0.581,5.613,95.6,1.7572,2,188,19.1,359.29,27.26,15.7 128 | 0.25915,0,21.89,0,0.624,5.693,96,1.7883,4,437,21.2,392.11,17.19,16.2 129 | 0.32543,0,21.89,0,0.624,6.431,98.8,1.8125,4,437,21.2,396.9,15.39,18 130 | 0.88125,0,21.89,0,0.624,5.637,94.7,1.9799,4,437,21.2,396.9,18.34,14.3 131 | 
0.34006,0,21.89,0,0.624,6.458,98.9,2.1185,4,437,21.2,395.04,12.6,19.2 132 | 1.19294,0,21.89,0,0.624,6.326,97.7,2.271,4,437,21.2,396.9,12.26,19.6 133 | 0.59005,0,21.89,0,0.624,6.372,97.9,2.3274,4,437,21.2,385.76,11.12,23 134 | 0.32982,0,21.89,0,0.624,5.822,95.4,2.4699,4,437,21.2,388.69,15.03,18.4 135 | 0.97617,0,21.89,0,0.624,5.757,98.4,2.346,4,437,21.2,262.76,17.31,15.6 136 | 0.55778,0,21.89,0,0.624,6.335,98.2,2.1107,4,437,21.2,394.67,16.96,18.1 137 | 0.32264,0,21.89,0,0.624,5.942,93.5,1.9669,4,437,21.2,378.25,16.9,17.4 138 | 0.35233,0,21.89,0,0.624,6.454,98.4,1.8498,4,437,21.2,394.08,14.59,17.1 139 | 0.2498,0,21.89,0,0.624,5.857,98.2,1.6686,4,437,21.2,392.04,21.32,13.3 140 | 0.54452,0,21.89,0,0.624,6.151,97.9,1.6687,4,437,21.2,396.9,18.46,17.8 141 | 0.2909,0,21.89,0,0.624,6.174,93.6,1.6119,4,437,21.2,388.08,24.16,14 142 | 1.62864,0,21.89,0,0.624,5.019,100,1.4394,4,437,21.2,396.9,34.41,14.4 143 | 3.32105,0,19.58,1,0.871,5.403,100,1.3216,5,403,14.7,396.9,26.82,13.4 144 | 4.0974,0,19.58,0,0.871,5.468,100,1.4118,5,403,14.7,396.9,26.42,15.6 145 | 2.77974,0,19.58,0,0.871,4.903,97.8,1.3459,5,403,14.7,396.9,29.29,11.8 146 | 2.37934,0,19.58,0,0.871,6.13,100,1.4191,5,403,14.7,172.91,27.8,13.8 147 | 2.15505,0,19.58,0,0.871,5.628,100,1.5166,5,403,14.7,169.27,16.65,15.6 148 | 2.36862,0,19.58,0,0.871,4.926,95.7,1.4608,5,403,14.7,391.71,29.53,14.6 149 | 2.33099,0,19.58,0,0.871,5.186,93.8,1.5296,5,403,14.7,356.99,28.32,17.8 150 | 2.73397,0,19.58,0,0.871,5.597,94.9,1.5257,5,403,14.7,351.85,21.45,15.4 151 | 1.6566,0,19.58,0,0.871,6.122,97.3,1.618,5,403,14.7,372.8,14.1,21.5 152 | 1.49632,0,19.58,0,0.871,5.404,100,1.5916,5,403,14.7,341.6,13.28,19.6 153 | 1.12658,0,19.58,1,0.871,5.012,88,1.6102,5,403,14.7,343.28,12.12,15.3 154 | 2.14918,0,19.58,0,0.871,5.709,98.5,1.6232,5,403,14.7,261.95,15.79,19.4 155 | 1.41385,0,19.58,1,0.871,6.129,96,1.7494,5,403,14.7,321.02,15.12,17 156 | 3.53501,0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6 157 | 2.44668,0,19.58,0,0.871,5.272,94,1.7364,5,403,14.7,88.63,16.14,13.1 158 | 1.22358,0,19.58,0,0.605,6.943,97.4,1.8773,5,403,14.7,363.43,4.59,41.3 159 | 1.34284,0,19.58,0,0.605,6.066,100,1.7573,5,403,14.7,353.89,6.43,24.3 160 | 1.42502,0,19.58,0,0.871,6.51,100,1.7659,5,403,14.7,364.31,7.39,23.3 161 | 1.27346,0,19.58,1,0.605,6.25,92.6,1.7984,5,403,14.7,338.92,5.5,27 162 | 1.46336,0,19.58,0,0.605,7.489,90.8,1.9709,5,403,14.7,374.43,1.73,50 163 | 1.83377,0,19.58,1,0.605,7.802,98.2,2.0407,5,403,14.7,389.61,1.92,50 164 | 1.51902,0,19.58,1,0.605,8.375,93.9,2.162,5,403,14.7,388.45,3.32,50 165 | 2.24236,0,19.58,0,0.605,5.854,91.8,2.422,5,403,14.7,395.11,11.64,22.7 166 | 2.924,0,19.58,0,0.605,6.101,93,2.2834,5,403,14.7,240.16,9.81,25 167 | 2.01019,0,19.58,0,0.605,7.929,96.2,2.0459,5,403,14.7,369.3,3.7,50 168 | 1.80028,0,19.58,0,0.605,5.877,79.2,2.4259,5,403,14.7,227.61,12.14,23.8 169 | 2.3004,0,19.58,0,0.605,6.319,96.1,2.1,5,403,14.7,297.09,11.1,23.8 170 | 2.44953,0,19.58,0,0.605,6.402,95.2,2.2625,5,403,14.7,330.04,11.32,22.3 171 | 1.20742,0,19.58,0,0.605,5.875,94.6,2.4259,5,403,14.7,292.29,14.43,17.4 172 | 2.3139,0,19.58,0,0.605,5.88,97.3,2.3887,5,403,14.7,348.13,12.03,19.1 173 | 0.13914,0,4.05,0,0.51,5.572,88.5,2.5961,5,296,16.6,396.9,14.69,23.1 174 | 0.09178,0,4.05,0,0.51,6.416,84.1,2.6463,5,296,16.6,395.5,9.04,23.6 175 | 0.08447,0,4.05,0,0.51,5.859,68.7,2.7019,5,296,16.6,393.23,9.64,22.6 176 | 0.06664,0,4.05,0,0.51,6.546,33.1,3.1323,5,296,16.6,390.96,5.33,29.4 177 | 0.07022,0,4.05,0,0.51,6.02,47.2,3.5549,5,296,16.6,393.23,10.11,23.2 178 | 
0.05425,0,4.05,0,0.51,6.315,73.4,3.3175,5,296,16.6,395.6,6.29,24.6 179 | 0.06642,0,4.05,0,0.51,6.86,74.4,2.9153,5,296,16.6,391.27,6.92,29.9 180 | 0.0578,0,2.46,0,0.488,6.98,58.4,2.829,3,193,17.8,396.9,5.04,37.2 181 | 0.06588,0,2.46,0,0.488,7.765,83.3,2.741,3,193,17.8,395.56,7.56,39.8 182 | 0.06888,0,2.46,0,0.488,6.144,62.2,2.5979,3,193,17.8,396.9,9.45,36.2 183 | 0.09103,0,2.46,0,0.488,7.155,92.2,2.7006,3,193,17.8,394.12,4.82,37.9 184 | 0.10008,0,2.46,0,0.488,6.563,95.6,2.847,3,193,17.8,396.9,5.68,32.5 185 | 0.08308,0,2.46,0,0.488,5.604,89.8,2.9879,3,193,17.8,391,13.98,26.4 186 | 0.06047,0,2.46,0,0.488,6.153,68.8,3.2797,3,193,17.8,387.11,13.15,29.6 187 | 0.05602,0,2.46,0,0.488,7.831,53.6,3.1992,3,193,17.8,392.63,4.45,50 188 | 0.07875,45,3.44,0,0.437,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32 189 | 0.12579,45,3.44,0,0.437,6.556,29.1,4.5667,5,398,15.2,382.84,4.56,29.8 190 | 0.0837,45,3.44,0,0.437,7.185,38.9,4.5667,5,398,15.2,396.9,5.39,34.9 191 | 0.09068,45,3.44,0,0.437,6.951,21.5,6.4798,5,398,15.2,377.68,5.1,37 192 | 0.06911,45,3.44,0,0.437,6.739,30.8,6.4798,5,398,15.2,389.71,4.69,30.5 193 | 0.08664,45,3.44,0,0.437,7.178,26.3,6.4798,5,398,15.2,390.49,2.87,36.4 194 | 0.02187,60,2.93,0,0.401,6.8,9.9,6.2196,1,265,15.6,393.37,5.03,31.1 195 | 0.01439,60,2.93,0,0.401,6.604,18.8,6.2196,1,265,15.6,376.7,4.38,29.1 196 | 0.01381,80,0.46,0,0.422,7.875,32,5.6484,4,255,14.4,394.23,2.97,50 197 | 0.04011,80,1.52,0,0.404,7.287,34.1,7.309,2,329,12.6,396.9,4.08,33.3 198 | 0.04666,80,1.52,0,0.404,7.107,36.6,7.309,2,329,12.6,354.31,8.61,30.3 199 | 0.03768,80,1.52,0,0.404,7.274,38.3,7.309,2,329,12.6,392.2,6.62,34.6 200 | 0.0315,95,1.47,0,0.403,6.975,15.3,7.6534,3,402,17,396.9,4.56,34.9 201 | 0.01778,95,1.47,0,0.403,7.135,13.9,7.6534,3,402,17,384.3,4.45,32.9 202 | 0.03445,82.5,2.03,0,0.415,6.162,38.4,6.27,2,348,14.7,393.77,7.43,24.1 203 | 0.02177,82.5,2.03,0,0.415,7.61,15.7,6.27,2,348,14.7,395.38,3.11,42.3 204 | 0.0351,95,2.68,0,0.4161,7.853,33.2,5.118,4,224,14.7,392.78,3.81,48.5 205 | 0.02009,95,2.68,0,0.4161,8.034,31.9,5.118,4,224,14.7,390.55,2.88,50 206 | 0.13642,0,10.59,0,0.489,5.891,22.3,3.9454,4,277,18.6,396.9,10.87,22.6 207 | 0.22969,0,10.59,0,0.489,6.326,52.5,4.3549,4,277,18.6,394.87,10.97,24.4 208 | 0.25199,0,10.59,0,0.489,5.783,72.7,4.3549,4,277,18.6,389.43,18.06,22.5 209 | 0.13587,0,10.59,1,0.489,6.064,59.1,4.2392,4,277,18.6,381.32,14.66,24.4 210 | 0.43571,0,10.59,1,0.489,5.344,100,3.875,4,277,18.6,396.9,23.09,20 211 | 0.17446,0,10.59,1,0.489,5.96,92.1,3.8771,4,277,18.6,393.25,17.27,21.7 212 | 0.37578,0,10.59,1,0.489,5.404,88.6,3.665,4,277,18.6,395.24,23.98,19.3 213 | 0.21719,0,10.59,1,0.489,5.807,53.8,3.6526,4,277,18.6,390.94,16.03,22.4 214 | 0.14052,0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1 215 | 0.28955,0,10.59,0,0.489,5.412,9.8,3.5875,4,277,18.6,348.93,29.55,23.7 216 | 0.19802,0,10.59,0,0.489,6.182,42.4,3.9454,4,277,18.6,393.63,9.47,25 217 | 0.0456,0,13.89,1,0.55,5.888,56,3.1121,5,276,16.4,392.8,13.51,23.3 218 | 0.07013,0,13.89,0,0.55,6.642,85.1,3.4211,5,276,16.4,392.78,9.69,28.7 219 | 0.11069,0,13.89,1,0.55,5.951,93.8,2.8893,5,276,16.4,396.9,17.92,21.5 220 | 0.11425,0,13.89,1,0.55,6.373,92.4,3.3633,5,276,16.4,393.74,10.5,23 221 | 0.35809,0,6.2,1,0.507,6.951,88.5,2.8617,8,307,17.4,391.7,9.71,26.7 222 | 0.40771,0,6.2,1,0.507,6.164,91.3,3.048,8,307,17.4,395.24,21.46,21.7 223 | 0.62356,0,6.2,1,0.507,6.879,77.7,3.2721,8,307,17.4,390.39,9.93,27.5 224 | 0.6147,0,6.2,0,0.507,6.618,80.8,3.2721,8,307,17.4,396.9,7.6,30.1 225 | 
0.31533,0,6.2,0,0.504,8.266,78.3,2.8944,8,307,17.4,385.05,4.14,44.8 226 | 0.52693,0,6.2,0,0.504,8.725,83,2.8944,8,307,17.4,382,4.63,50 227 | 0.38214,0,6.2,0,0.504,8.04,86.5,3.2157,8,307,17.4,387.38,3.13,37.6 228 | 0.41238,0,6.2,0,0.504,7.163,79.9,3.2157,8,307,17.4,372.08,6.36,31.6 229 | 0.29819,0,6.2,0,0.504,7.686,17,3.3751,8,307,17.4,377.51,3.92,46.7 230 | 0.44178,0,6.2,0,0.504,6.552,21.4,3.3751,8,307,17.4,380.34,3.76,31.5 231 | 0.537,0,6.2,0,0.504,5.981,68.1,3.6715,8,307,17.4,378.35,11.65,24.3 232 | 0.46296,0,6.2,0,0.504,7.412,76.9,3.6715,8,307,17.4,376.14,5.25,31.7 233 | 0.57529,0,6.2,0,0.507,8.337,73.3,3.8384,8,307,17.4,385.91,2.47,41.7 234 | 0.33147,0,6.2,0,0.507,8.247,70.4,3.6519,8,307,17.4,378.95,3.95,48.3 235 | 0.44791,0,6.2,1,0.507,6.726,66.5,3.6519,8,307,17.4,360.2,8.05,29 236 | 0.33045,0,6.2,0,0.507,6.086,61.5,3.6519,8,307,17.4,376.75,10.88,24 237 | 0.52058,0,6.2,1,0.507,6.631,76.5,4.148,8,307,17.4,388.45,9.54,25.1 238 | 0.51183,0,6.2,0,0.507,7.358,71.6,4.148,8,307,17.4,390.07,4.73,31.5 239 | 0.08244,30,4.93,0,0.428,6.481,18.5,6.1899,6,300,16.6,379.41,6.36,23.7 240 | 0.09252,30,4.93,0,0.428,6.606,42.2,6.1899,6,300,16.6,383.78,7.37,23.3 241 | 0.11329,30,4.93,0,0.428,6.897,54.3,6.3361,6,300,16.6,391.25,11.38,22 242 | 0.10612,30,4.93,0,0.428,6.095,65.1,6.3361,6,300,16.6,394.62,12.4,20.1 243 | 0.1029,30,4.93,0,0.428,6.358,52.9,7.0355,6,300,16.6,372.75,11.22,22.2 244 | 0.12757,30,4.93,0,0.428,6.393,7.8,7.0355,6,300,16.6,374.71,5.19,23.7 245 | 0.20608,22,5.86,0,0.431,5.593,76.5,7.9549,7,330,19.1,372.49,12.5,17.6 246 | 0.19133,22,5.86,0,0.431,5.605,70.2,7.9549,7,330,19.1,389.13,18.46,18.5 247 | 0.33983,22,5.86,0,0.431,6.108,34.9,8.0555,7,330,19.1,390.18,9.16,24.3 248 | 0.19657,22,5.86,0,0.431,6.226,79.2,8.0555,7,330,19.1,376.14,10.15,20.5 249 | 0.16439,22,5.86,0,0.431,6.433,49.1,7.8265,7,330,19.1,374.71,9.52,24.5 250 | 0.19073,22,5.86,0,0.431,6.718,17.5,7.8265,7,330,19.1,393.74,6.56,26.2 251 | 0.1403,22,5.86,0,0.431,6.487,13,7.3967,7,330,19.1,396.28,5.9,24.4 252 | 0.21409,22,5.86,0,0.431,6.438,8.9,7.3967,7,330,19.1,377.07,3.59,24.8 253 | 0.08221,22,5.86,0,0.431,6.957,6.8,8.9067,7,330,19.1,386.09,3.53,29.6 254 | 0.36894,22,5.86,0,0.431,8.259,8.4,8.9067,7,330,19.1,396.9,3.54,42.8 255 | 0.04819,80,3.64,0,0.392,6.108,32,9.2203,1,315,16.4,392.89,6.57,21.9 256 | 0.03548,80,3.64,0,0.392,5.876,19.1,9.2203,1,315,16.4,395.18,9.25,20.9 257 | 0.01538,90,3.75,0,0.394,7.454,34.2,6.3361,3,244,15.9,386.34,3.11,44 258 | 0.61154,20,3.97,0,0.647,8.704,86.9,1.801,5,264,13,389.7,5.12,50 259 | 0.66351,20,3.97,0,0.647,7.333,100,1.8946,5,264,13,383.29,7.79,36 260 | 0.65665,20,3.97,0,0.647,6.842,100,2.0107,5,264,13,391.93,6.9,30.1 261 | 0.54011,20,3.97,0,0.647,7.203,81.8,2.1121,5,264,13,392.8,9.59,33.8 262 | 0.53412,20,3.97,0,0.647,7.52,89.4,2.1398,5,264,13,388.37,7.26,43.1 263 | 0.52014,20,3.97,0,0.647,8.398,91.5,2.2885,5,264,13,386.86,5.91,48.8 264 | 0.82526,20,3.97,0,0.647,7.327,94.5,2.0788,5,264,13,393.42,11.25,31 265 | 0.55007,20,3.97,0,0.647,7.206,91.6,1.9301,5,264,13,387.89,8.1,36.5 266 | 0.76162,20,3.97,0,0.647,5.56,62.8,1.9865,5,264,13,392.4,10.45,22.8 267 | 0.7857,20,3.97,0,0.647,7.014,84.6,2.1329,5,264,13,384.07,14.79,30.7 268 | 0.57834,20,3.97,0,0.575,8.297,67,2.4216,5,264,13,384.54,7.44,50 269 | 0.5405,20,3.97,0,0.575,7.47,52.6,2.872,5,264,13,390.3,3.16,43.5 270 | 0.09065,20,6.96,1,0.464,5.92,61.5,3.9175,3,223,18.6,391.34,13.65,20.7 271 | 0.29916,20,6.96,0,0.464,5.856,42.1,4.429,3,223,18.6,388.65,13,21.1 272 | 0.16211,20,6.96,0,0.464,6.24,16.3,4.429,3,223,18.6,396.9,6.59,25.2 273 | 
0.1146,20,6.96,0,0.464,6.538,58.7,3.9175,3,223,18.6,394.96,7.73,24.4 274 | 0.22188,20,6.96,1,0.464,7.691,51.8,4.3665,3,223,18.6,390.77,6.58,35.2 275 | 0.05644,40,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,32.4 276 | 0.09604,40,6.41,0,0.447,6.854,42.8,4.2673,4,254,17.6,396.9,2.98,32 277 | 0.10469,40,6.41,1,0.447,7.267,49,4.7872,4,254,17.6,389.25,6.05,33.2 278 | 0.06127,40,6.41,1,0.447,6.826,27.6,4.8628,4,254,17.6,393.45,4.16,33.1 279 | 0.07978,40,6.41,0,0.447,6.482,32.1,4.1403,4,254,17.6,396.9,7.19,29.1 280 | 0.21038,20,3.33,0,0.4429,6.812,32.2,4.1007,5,216,14.9,396.9,4.85,35.1 281 | 0.03578,20,3.33,0,0.4429,7.82,64.5,4.6947,5,216,14.9,387.31,3.76,45.4 282 | 0.03705,20,3.33,0,0.4429,6.968,37.2,5.2447,5,216,14.9,392.23,4.59,35.4 283 | 0.06129,20,3.33,1,0.4429,7.645,49.7,5.2119,5,216,14.9,377.07,3.01,46 284 | 0.01501,90,1.21,1,0.401,7.923,24.8,5.885,1,198,13.6,395.52,3.16,50 285 | 0.00906,90,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85,32.2 286 | 0.01096,55,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23,22 287 | 0.01965,80,1.76,0,0.385,6.23,31.5,9.0892,1,241,18.2,341.6,12.93,20.1 288 | 0.03871,52.5,5.32,0,0.405,6.209,31.3,7.3172,6,293,16.6,396.9,7.14,23.2 289 | 0.0459,52.5,5.32,0,0.405,6.315,45.6,7.3172,6,293,16.6,396.9,7.6,22.3 290 | 0.04297,52.5,5.32,0,0.405,6.565,22.9,7.3172,6,293,16.6,371.72,9.51,24.8 291 | 0.03502,80,4.95,0,0.411,6.861,27.9,5.1167,4,245,19.2,396.9,3.33,28.5 292 | 0.07886,80,4.95,0,0.411,7.148,27.7,5.1167,4,245,19.2,396.9,3.56,37.3 293 | 0.03615,80,4.95,0,0.411,6.63,23.4,5.1167,4,245,19.2,396.9,4.7,27.9 294 | 0.08265,0,13.92,0,0.437,6.127,18.4,5.5027,4,289,16,396.9,8.58,23.9 295 | 0.08199,0,13.92,0,0.437,6.009,42.3,5.5027,4,289,16,396.9,10.4,21.7 296 | 0.12932,0,13.92,0,0.437,6.678,31.1,5.9604,4,289,16,396.9,6.27,28.6 297 | 0.05372,0,13.92,0,0.437,6.549,51,5.9604,4,289,16,392.85,7.39,27.1 298 | 0.14103,0,13.92,0,0.437,5.79,58,6.32,4,289,16,396.9,15.84,20.3 299 | 0.06466,70,2.24,0,0.4,6.345,20.1,7.8278,5,358,14.8,368.24,4.97,22.5 300 | 0.05561,70,2.24,0,0.4,7.041,10,7.8278,5,358,14.8,371.58,4.74,29 301 | 0.04417,70,2.24,0,0.4,6.871,47.4,7.8278,5,358,14.8,390.86,6.07,24.8 302 | 0.03537,34,6.09,0,0.433,6.59,40.4,5.4917,7,329,16.1,395.75,9.5,22 303 | 0.09266,34,6.09,0,0.433,6.495,18.4,5.4917,7,329,16.1,383.61,8.67,26.4 304 | 0.1,34,6.09,0,0.433,6.982,17.7,5.4917,7,329,16.1,390.43,4.86,33.1 305 | 0.05515,33,2.18,0,0.472,7.236,41.1,4.022,7,222,18.4,393.68,6.93,36.1 306 | 0.05479,33,2.18,0,0.472,6.616,58.1,3.37,7,222,18.4,393.36,8.93,28.4 307 | 0.07503,33,2.18,0,0.472,7.42,71.9,3.0992,7,222,18.4,396.9,6.47,33.4 308 | 0.04932,33,2.18,0,0.472,6.849,70.3,3.1827,7,222,18.4,396.9,7.53,28.2 309 | 0.49298,0,9.9,0,0.544,6.635,82.5,3.3175,4,304,18.4,396.9,4.54,22.8 310 | 0.3494,0,9.9,0,0.544,5.972,76.7,3.1025,4,304,18.4,396.24,9.97,20.3 311 | 2.63548,0,9.9,0,0.544,4.973,37.8,2.5194,4,304,18.4,350.45,12.64,16.1 312 | 0.79041,0,9.9,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.9,5.98,22.1 313 | 0.26169,0,9.9,0,0.544,6.023,90.4,2.834,4,304,18.4,396.3,11.72,19.4 314 | 0.26938,0,9.9,0,0.544,6.266,82.8,3.2628,4,304,18.4,393.39,7.9,21.6 315 | 0.3692,0,9.9,0,0.544,6.567,87.3,3.6023,4,304,18.4,395.69,9.28,23.8 316 | 0.25356,0,9.9,0,0.544,5.705,77.7,3.945,4,304,18.4,396.42,11.5,16.2 317 | 0.31827,0,9.9,0,0.544,5.914,83.2,3.9986,4,304,18.4,390.7,18.33,17.8 318 | 0.24522,0,9.9,0,0.544,5.782,71.7,4.0317,4,304,18.4,396.9,15.94,19.8 319 | 0.40202,0,9.9,0,0.544,6.382,67.2,3.5325,4,304,18.4,395.21,10.36,23.1 320 | 0.47547,0,9.9,0,0.544,6.113,58.8,4.0019,4,304,18.4,396.23,12.73,21 321 | 
0.1676,0,7.38,0,0.493,6.426,52.3,4.5404,5,287,19.6,396.9,7.2,23.8 322 | 0.18159,0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1 323 | 0.35114,0,7.38,0,0.493,6.041,49.9,4.7211,5,287,19.6,396.9,7.7,20.4 324 | 0.28392,0,7.38,0,0.493,5.708,74.3,4.7211,5,287,19.6,391.13,11.74,18.5 325 | 0.34109,0,7.38,0,0.493,6.415,40.1,4.7211,5,287,19.6,396.9,6.12,25 326 | 0.19186,0,7.38,0,0.493,6.431,14.7,5.4159,5,287,19.6,393.68,5.08,24.6 327 | 0.30347,0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23 328 | 0.24103,0,7.38,0,0.493,6.083,43.7,5.4159,5,287,19.6,396.9,12.79,22.2 329 | 0.06617,0,3.24,0,0.46,5.868,25.8,5.2146,4,430,16.9,382.44,9.97,19.3 330 | 0.06724,0,3.24,0,0.46,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6 331 | 0.04544,0,3.24,0,0.46,6.144,32.2,5.8736,4,430,16.9,368.57,9.09,19.8 332 | 0.05023,35,6.06,0,0.4379,5.706,28.4,6.6407,1,304,16.9,394.02,12.43,17.1 333 | 0.03466,35,6.06,0,0.4379,6.031,23.3,6.6407,1,304,16.9,362.25,7.83,19.4 334 | 0.05083,0,5.19,0,0.515,6.316,38.1,6.4584,5,224,20.2,389.71,5.68,22.2 335 | 0.03738,0,5.19,0,0.515,6.31,38.5,6.4584,5,224,20.2,389.4,6.75,20.7 336 | 0.03961,0,5.19,0,0.515,6.037,34.5,5.9853,5,224,20.2,396.9,8.01,21.1 337 | 0.03427,0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5 338 | 0.03041,0,5.19,0,0.515,5.895,59.6,5.615,5,224,20.2,394.81,10.56,18.5 339 | 0.03306,0,5.19,0,0.515,6.059,37.3,4.8122,5,224,20.2,396.14,8.51,20.6 340 | 0.05497,0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.9,9.74,19 341 | 0.06151,0,5.19,0,0.515,5.968,58.5,4.8122,5,224,20.2,396.9,9.29,18.7 342 | 0.01301,35,1.52,0,0.442,7.241,49.3,7.0379,1,284,15.5,394.74,5.49,32.7 343 | 0.02498,0,1.89,0,0.518,6.54,59.7,6.2669,1,422,15.9,389.96,8.65,16.5 344 | 0.02543,55,3.78,0,0.484,6.696,56.4,5.7321,5,370,17.6,396.9,7.18,23.9 345 | 0.03049,55,3.78,0,0.484,6.874,28.1,6.4654,5,370,17.6,387.97,4.61,31.2 346 | 0.03113,0,4.39,0,0.442,6.014,48.5,8.0136,3,352,18.8,385.64,10.53,17.5 347 | 0.06162,0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67,17.2 348 | 0.0187,85,4.15,0,0.429,6.516,27.7,8.5353,4,351,17.9,392.43,6.36,23.1 349 | 0.01501,80,2.01,0,0.435,6.635,29.7,8.344,4,280,17,390.94,5.99,24.5 350 | 0.02899,40,1.25,0,0.429,6.939,34.5,8.7921,1,335,19.7,389.85,5.89,26.6 351 | 0.06211,40,1.25,0,0.429,6.49,44.4,8.7921,1,335,19.7,396.9,5.98,22.9 352 | 0.0795,60,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,24.1 353 | 0.07244,60,1.69,0,0.411,5.884,18.5,10.7103,4,411,18.3,392.33,7.79,18.6 354 | 0.01709,90,2.02,0,0.41,6.728,36.1,12.1265,5,187,17,384.46,4.5,30.1 355 | 0.04301,80,1.91,0,0.413,5.663,21.9,10.5857,4,334,22,382.8,8.05,18.2 356 | 0.10659,80,1.91,0,0.413,5.936,19.5,10.5857,4,334,22,376.04,5.57,20.6 357 | 8.98296,0,18.1,1,0.77,6.212,97.4,2.1222,24,666,20.2,377.73,17.6,17.8 358 | 3.8497,0,18.1,1,0.77,6.395,91,2.5052,24,666,20.2,391.34,13.27,21.7 359 | 5.20177,0,18.1,1,0.77,6.127,83.4,2.7227,24,666,20.2,395.43,11.48,22.7 360 | 4.26131,0,18.1,0,0.77,6.112,81.3,2.5091,24,666,20.2,390.74,12.67,22.6 361 | 4.54192,0,18.1,0,0.77,6.398,88,2.5182,24,666,20.2,374.56,7.79,25 362 | 3.83684,0,18.1,0,0.77,6.251,91.1,2.2955,24,666,20.2,350.65,14.19,19.9 363 | 3.67822,0,18.1,0,0.77,5.362,96.2,2.1036,24,666,20.2,380.79,10.19,20.8 364 | 4.22239,0,18.1,1,0.77,5.803,89,1.9047,24,666,20.2,353.04,14.64,16.8 365 | 3.47428,0,18.1,1,0.718,8.78,82.9,1.9047,24,666,20.2,354.55,5.29,21.9 366 | 4.55587,0,18.1,0,0.718,3.561,87.9,1.6132,24,666,20.2,354.7,7.12,27.5 367 | 3.69695,0,18.1,0,0.718,4.963,91.4,1.7523,24,666,20.2,316.03,14,21.9 368 | 
13.5222,0,18.1,0,0.631,3.863,100,1.5106,24,666,20.2,131.42,13.33,23.1 369 | 4.89822,0,18.1,0,0.631,4.97,100,1.3325,24,666,20.2,375.52,3.26,50 370 | 5.66998,0,18.1,1,0.631,6.683,96.8,1.3567,24,666,20.2,375.33,3.73,50 371 | 6.53876,0,18.1,1,0.631,7.016,97.5,1.2024,24,666,20.2,392.05,2.96,50 372 | 9.2323,0,18.1,0,0.631,6.216,100,1.1691,24,666,20.2,366.15,9.53,50 373 | 8.26725,0,18.1,1,0.668,5.875,89.6,1.1296,24,666,20.2,347.88,8.88,50 374 | 11.1081,0,18.1,0,0.668,4.906,100,1.1742,24,666,20.2,396.9,34.77,13.8 375 | 18.4982,0,18.1,0,0.668,4.138,100,1.137,24,666,20.2,396.9,37.97,13.8 376 | 19.6091,0,18.1,0,0.671,7.313,97.9,1.3163,24,666,20.2,396.9,13.44,15 377 | 15.288,0,18.1,0,0.671,6.649,93.3,1.3449,24,666,20.2,363.02,23.24,13.9 378 | 9.82349,0,18.1,0,0.671,6.794,98.8,1.358,24,666,20.2,396.9,21.24,13.3 379 | 23.6482,0,18.1,0,0.671,6.38,96.2,1.3861,24,666,20.2,396.9,23.69,13.1 380 | 17.8667,0,18.1,0,0.671,6.223,100,1.3861,24,666,20.2,393.74,21.78,10.2 381 | 88.9762,0,18.1,0,0.671,6.968,91.9,1.4165,24,666,20.2,396.9,17.21,10.4 382 | 15.8744,0,18.1,0,0.671,6.545,99.1,1.5192,24,666,20.2,396.9,21.08,10.9 383 | 9.18702,0,18.1,0,0.7,5.536,100,1.5804,24,666,20.2,396.9,23.6,11.3 384 | 7.99248,0,18.1,0,0.7,5.52,100,1.5331,24,666,20.2,396.9,24.56,12.3 385 | 20.0849,0,18.1,0,0.7,4.368,91.2,1.4395,24,666,20.2,285.83,30.63,8.8 386 | 16.8118,0,18.1,0,0.7,5.277,98.1,1.4261,24,666,20.2,396.9,30.81,7.2 387 | 24.3938,0,18.1,0,0.7,4.652,100,1.4672,24,666,20.2,396.9,28.28,10.5 388 | 22.5971,0,18.1,0,0.7,5,89.5,1.5184,24,666,20.2,396.9,31.99,7.4 389 | 14.3337,0,18.1,0,0.7,4.88,100,1.5895,24,666,20.2,372.92,30.62,10.2 390 | 8.15174,0,18.1,0,0.7,5.39,98.9,1.7281,24,666,20.2,396.9,20.85,11.5 391 | 6.96215,0,18.1,0,0.7,5.713,97,1.9265,24,666,20.2,394.43,17.11,15.1 392 | 5.29305,0,18.1,0,0.7,6.051,82.5,2.1678,24,666,20.2,378.38,18.76,23.2 393 | 11.5779,0,18.1,0,0.7,5.036,97,1.77,24,666,20.2,396.9,25.68,9.7 394 | 8.64476,0,18.1,0,0.693,6.193,92.6,1.7912,24,666,20.2,396.9,15.17,13.8 395 | 13.3598,0,18.1,0,0.693,5.887,94.7,1.7821,24,666,20.2,396.9,16.35,12.7 396 | 8.71675,0,18.1,0,0.693,6.471,98.8,1.7257,24,666,20.2,391.98,17.12,13.1 397 | 5.87205,0,18.1,0,0.693,6.405,96,1.6768,24,666,20.2,396.9,19.37,12.5 398 | 7.67202,0,18.1,0,0.693,5.747,98.9,1.6334,24,666,20.2,393.1,19.92,8.5 399 | 38.3518,0,18.1,0,0.693,5.453,100,1.4896,24,666,20.2,396.9,30.59,5 400 | 9.91655,0,18.1,0,0.693,5.852,77.8,1.5004,24,666,20.2,338.16,29.97,6.3 401 | 25.0461,0,18.1,0,0.693,5.987,100,1.5888,24,666,20.2,396.9,26.77,5.6 402 | 14.2362,0,18.1,0,0.693,6.343,100,1.5741,24,666,20.2,396.9,20.32,7.2 403 | 9.59571,0,18.1,0,0.693,6.404,100,1.639,24,666,20.2,376.11,20.31,12.1 404 | 24.8017,0,18.1,0,0.693,5.349,96,1.7028,24,666,20.2,396.9,19.77,8.3 405 | 41.5292,0,18.1,0,0.693,5.531,85.4,1.6074,24,666,20.2,329.46,27.38,8.5 406 | 67.9208,0,18.1,0,0.693,5.683,100,1.4254,24,666,20.2,384.97,22.98,5 407 | 20.7162,0,18.1,0,0.659,4.138,100,1.1781,24,666,20.2,370.22,23.34,11.9 408 | 11.9511,0,18.1,0,0.659,5.608,100,1.2852,24,666,20.2,332.09,12.13,27.9 409 | 7.40389,0,18.1,0,0.597,5.617,97.9,1.4547,24,666,20.2,314.64,26.4,17.2 410 | 14.4383,0,18.1,0,0.597,6.852,100,1.4655,24,666,20.2,179.36,19.78,27.5 411 | 51.1358,0,18.1,0,0.597,5.757,100,1.413,24,666,20.2,2.6,10.11,15 412 | 14.0507,0,18.1,0,0.597,6.657,100,1.5275,24,666,20.2,35.05,21.22,17.2 413 | 18.811,0,18.1,0,0.597,4.628,100,1.5539,24,666,20.2,28.79,34.37,17.9 414 | 28.6558,0,18.1,0,0.597,5.155,100,1.5894,24,666,20.2,210.97,20.08,16.3 415 | 45.7461,0,18.1,0,0.693,4.519,100,1.6582,24,666,20.2,88.27,36.98,7 416 
| 18.0846,0,18.1,0,0.679,6.434,100,1.8347,24,666,20.2,27.25,29.05,7.2 417 | 10.8342,0,18.1,0,0.679,6.782,90.8,1.8195,24,666,20.2,21.57,25.79,7.5 418 | 25.9406,0,18.1,0,0.679,5.304,89.1,1.6475,24,666,20.2,127.36,26.64,10.4 419 | 73.5341,0,18.1,0,0.679,5.957,100,1.8026,24,666,20.2,16.45,20.62,8.8 420 | 11.8123,0,18.1,0,0.718,6.824,76.5,1.794,24,666,20.2,48.45,22.74,8.4 421 | 11.0874,0,18.1,0,0.718,6.411,100,1.8589,24,666,20.2,318.75,15.02,16.7 422 | 7.02259,0,18.1,0,0.718,6.006,95.3,1.8746,24,666,20.2,319.98,15.7,14.2 423 | 12.0482,0,18.1,0,0.614,5.648,87.6,1.9512,24,666,20.2,291.55,14.1,20.8 424 | 7.05042,0,18.1,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4 425 | 8.79212,0,18.1,0,0.584,5.565,70.6,2.0635,24,666,20.2,3.65,17.16,11.7 426 | 15.8603,0,18.1,0,0.679,5.896,95.4,1.9096,24,666,20.2,7.68,24.39,8.3 427 | 12.2472,0,18.1,0,0.584,5.837,59.7,1.9976,24,666,20.2,24.65,15.69,10.2 428 | 37.6619,0,18.1,0,0.679,6.202,78.7,1.8629,24,666,20.2,18.82,14.52,10.9 429 | 7.36711,0,18.1,0,0.679,6.193,78.1,1.9356,24,666,20.2,96.73,21.52,11 430 | 9.33889,0,18.1,0,0.679,6.38,95.6,1.9682,24,666,20.2,60.72,24.08,9.5 431 | 8.49213,0,18.1,0,0.584,6.348,86.1,2.0527,24,666,20.2,83.45,17.64,14.5 432 | 10.0623,0,18.1,0,0.584,6.833,94.3,2.0882,24,666,20.2,81.33,19.69,14.1 433 | 6.44405,0,18.1,0,0.584,6.425,74.8,2.2004,24,666,20.2,97.95,12.03,16.1 434 | 5.58107,0,18.1,0,0.713,6.436,87.9,2.3158,24,666,20.2,100.19,16.22,14.3 435 | 13.9134,0,18.1,0,0.713,6.208,95,2.2222,24,666,20.2,100.63,15.17,11.7 436 | 11.1604,0,18.1,0,0.74,6.629,94.6,2.1247,24,666,20.2,109.85,23.27,13.4 437 | 14.4208,0,18.1,0,0.74,6.461,93.3,2.0026,24,666,20.2,27.49,18.05,9.6 438 | 15.1772,0,18.1,0,0.74,6.152,100,1.9142,24,666,20.2,9.32,26.45,8.7 439 | 13.6781,0,18.1,0,0.74,5.935,87.9,1.8206,24,666,20.2,68.95,34.02,8.4 440 | 9.39063,0,18.1,0,0.74,5.627,93.9,1.8172,24,666,20.2,396.9,22.88,12.8 441 | 22.0511,0,18.1,0,0.74,5.818,92.4,1.8662,24,666,20.2,391.45,22.11,10.5 442 | 9.72418,0,18.1,0,0.74,6.406,97.2,2.0651,24,666,20.2,385.96,19.52,17.1 443 | 5.66637,0,18.1,0,0.74,6.219,100,2.0048,24,666,20.2,395.69,16.59,18.4 444 | 9.96654,0,18.1,0,0.74,6.485,100,1.9784,24,666,20.2,386.73,18.85,15.4 445 | 12.8023,0,18.1,0,0.74,5.854,96.6,1.8956,24,666,20.2,240.52,23.79,10.8 446 | 0.6718,0,18.1,0,0.74,6.459,94.8,1.9879,24,666,20.2,43.06,23.98,11.8 447 | 6.28807,0,18.1,0,0.74,6.341,96.4,2.072,24,666,20.2,318.01,17.79,14.9 448 | 9.92485,0,18.1,0,0.74,6.251,96.6,2.198,24,666,20.2,388.52,16.44,12.6 449 | 9.32909,0,18.1,0,0.713,6.185,98.7,2.2616,24,666,20.2,396.9,18.13,14.1 450 | 7.52601,0,18.1,0,0.713,6.417,98.3,2.185,24,666,20.2,304.21,19.31,13 451 | 6.71772,0,18.1,0,0.713,6.749,92.6,2.3236,24,666,20.2,0.32,17.44,13.4 452 | 5.44114,0,18.1,0,0.713,6.655,98.2,2.3552,24,666,20.2,355.29,17.73,15.2 453 | 5.09017,0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27,16.1 454 | 8.24809,0,18.1,0,0.713,7.393,99.3,2.4527,24,666,20.2,375.87,16.74,17.8 455 | 9.51363,0,18.1,0,0.713,6.728,94.1,2.4961,24,666,20.2,6.68,18.71,14.9 456 | 4.75237,0,18.1,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1 457 | 4.66883,0,18.1,0,0.713,5.976,87.9,2.5806,24,666,20.2,10.48,19.01,12.7 458 | 8.20058,0,18.1,0,0.713,5.936,80.3,2.7792,24,666,20.2,3.5,16.94,13.5 459 | 7.75223,0,18.1,0,0.713,6.301,83.7,2.7831,24,666,20.2,272.21,16.23,14.9 460 | 6.80117,0,18.1,0,0.713,6.081,84.4,2.7175,24,666,20.2,396.9,14.7,20 461 | 4.81213,0,18.1,0,0.713,6.701,90,2.5975,24,666,20.2,255.23,16.42,16.4 462 | 3.69311,0,18.1,0,0.713,6.376,88.4,2.5671,24,666,20.2,391.43,14.65,17.7 463 | 
6.65492,0,18.1,0,0.713,6.317,83,2.7344,24,666,20.2,396.9,13.99,19.5 464 | 5.82115,0,18.1,0,0.713,6.513,89.9,2.8016,24,666,20.2,393.82,10.29,20.2 465 | 7.83932,0,18.1,0,0.655,6.209,65.4,2.9634,24,666,20.2,396.9,13.22,21.4 466 | 3.1636,0,18.1,0,0.655,5.759,48.2,3.0665,24,666,20.2,334.4,14.13,19.9 467 | 3.77498,0,18.1,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19 468 | 4.42228,0,18.1,0,0.584,6.003,94.5,2.5403,24,666,20.2,331.29,21.32,19.1 469 | 15.5757,0,18.1,0,0.58,5.926,71,2.9084,24,666,20.2,368.74,18.13,19.1 470 | 13.0751,0,18.1,0,0.58,5.713,56.7,2.8237,24,666,20.2,396.9,14.76,20.1 471 | 4.34879,0,18.1,0,0.58,6.167,84,3.0334,24,666,20.2,396.9,16.29,19.9 472 | 4.03841,0,18.1,0,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,19.6 473 | 3.56868,0,18.1,0,0.58,6.437,75,2.8965,24,666,20.2,393.37,14.36,23.2 474 | 4.64689,0,18.1,0,0.614,6.98,67.6,2.5329,24,666,20.2,374.68,11.66,29.8 475 | 8.05579,0,18.1,0,0.584,5.427,95.4,2.4298,24,666,20.2,352.58,18.14,13.8 476 | 6.39312,0,18.1,0,0.584,6.162,97.4,2.206,24,666,20.2,302.76,24.1,13.3 477 | 4.87141,0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7 478 | 15.0234,0,18.1,0,0.614,5.304,97.3,2.1007,24,666,20.2,349.48,24.91,12 479 | 10.233,0,18.1,0,0.614,6.185,96.7,2.1705,24,666,20.2,379.7,18.03,14.6 480 | 14.3337,0,18.1,0,0.614,6.229,88,1.9512,24,666,20.2,383.32,13.11,21.4 481 | 5.82401,0,18.1,0,0.532,6.242,64.7,3.4242,24,666,20.2,396.9,10.74,23 482 | 5.70818,0,18.1,0,0.532,6.75,74.9,3.3317,24,666,20.2,393.07,7.74,23.7 483 | 5.73116,0,18.1,0,0.532,7.061,77,3.4106,24,666,20.2,395.28,7.01,25 484 | 2.81838,0,18.1,0,0.532,5.762,40.3,4.0983,24,666,20.2,392.92,10.42,21.8 485 | 2.37857,0,18.1,0,0.583,5.871,41.9,3.724,24,666,20.2,370.73,13.34,20.6 486 | 3.67367,0,18.1,0,0.583,6.312,51.9,3.9917,24,666,20.2,388.62,10.58,21.2 487 | 5.69175,0,18.1,0,0.583,6.114,79.8,3.5459,24,666,20.2,392.68,14.98,19.1 488 | 4.83567,0,18.1,0,0.583,5.905,53.2,3.1523,24,666,20.2,388.22,11.45,20.6 489 | 0.15086,0,27.74,0,0.609,5.454,92.7,1.8209,4,711,20.1,395.09,18.06,15.2 490 | 0.18337,0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7 491 | 0.20746,0,27.74,0,0.609,5.093,98,1.8226,4,711,20.1,318.43,29.68,8.1 492 | 0.10574,0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6 493 | 0.11132,0,27.74,0,0.609,5.983,83.5,2.1099,4,711,20.1,396.9,13.35,20.1 494 | 0.17331,0,9.69,0,0.585,5.707,54,2.3817,6,391,19.2,396.9,12.01,21.8 495 | 0.27957,0,9.69,0,0.585,5.926,42.6,2.3817,6,391,19.2,396.9,13.59,24.5 496 | 0.17899,0,9.69,0,0.585,5.67,28.8,2.7986,6,391,19.2,393.29,17.6,23.1 497 | 0.2896,0,9.69,0,0.585,5.39,72.9,2.7986,6,391,19.2,396.9,21.14,19.7 498 | 0.26838,0,9.69,0,0.585,5.794,70.6,2.8927,6,391,19.2,396.9,14.1,18.3 499 | 0.23912,0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.9,12.92,21.2 500 | 0.17783,0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.1,17.5 501 | 0.22438,0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8 502 | 0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,9.67,22.4 503 | 0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6 504 | 0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9 505 | 0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22 506 | 0.04741,0,11.93,0,0.573,6.03,80.8,2.505,1,273,21,396.9,7.88,11.9 507 | -------------------------------------------------------------------------------- /decision_tree.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from math import log2 3 | from copy import copy 4 | from 
time import time
5 | from random import randint, seed, random
6 | import numpy as np
7 | 
8 | 
9 | def list_split(X, idxs, feature, split):
10 |     ret = [[], []]
11 |     while idxs:
12 |         if X[idxs[0]][feature] < split:
13 |             ret[0].append(idxs.pop(0))
14 |         else:
15 |             ret[1].append(idxs.pop(0))
16 |     return ret
17 | 
18 | # Timing helper: measures and prints how long fn takes, with a readable unit.
19 | # Note that it calls fn immediately, so @run_time runs the function right away.
20 | def run_time(fn):
21 |     def fun():
22 |         start = time()
23 |         fn()
24 |         ret = time() - start
25 |         if ret < 1e-6:
26 |             unit = "ns"
27 |             ret *= 1e9
28 |         elif ret < 1e-3:
29 |             unit = "us"
30 |             ret *= 1e6
31 |         elif ret < 1:
32 |             unit = "ms"
33 |             ret *= 1e3
34 |         else:
35 |             unit = "s"
36 |         print("Total run time is %.1f %s\n" % (ret, unit))
37 |     return fun()
38 | 
39 | # Load the breast cancer dataset.
40 | def load_data():
41 |     f = open("dataset/breast_cancer.csv")
42 |     X = []
43 |     y = []
44 |     for line in f:
45 |         line = line[:-1].split(',')
46 |         xi = [float(s) for s in line[:-1]]
47 |         yi = line[-1]
48 |         if '.' in yi:
49 |             yi = float(yi)
50 |         else:
51 |             yi = int(yi)
52 |         X.append(xi)
53 |         y.append(yi)
54 |     f.close()
55 |     return X, y
56 | 
57 | # Split the data into training and test sets.
58 | def train_test_split(X, y, prob=0.7, random_state=None):
59 |     if random_state is not None:
60 |         seed(random_state)
61 |     X_train = []
62 |     X_test = []
63 |     y_train = []
64 |     y_test = []
65 |     for i in range(len(X)):
66 |         if random() < prob:
67 |             X_train.append(X[i])
68 |             y_train.append(y[i])
69 |         else:
70 |             X_test.append(X[i])
71 |             y_test.append(y[i])
72 |     seed()
73 |     return np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)
74 | 
75 | # Accuracy.
76 | def get_acc(y, y_hat):
77 |     return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
78 | 
79 | # Precision.
80 | def get_precision(y, y_hat):
81 |     true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
82 |     predicted_positive = sum(y_hat)
83 |     return true_positive / predicted_positive
84 | 
85 | # Recall.
86 | def get_recall(y, y_hat):
87 |     true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
88 |     actual_positive = sum(y)
89 |     return true_positive / actual_positive
90 | 
91 | # True positive rate.
92 | def get_tpr(y, y_hat):
93 |     true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
94 |     actual_positive = sum(y)
95 |     return true_positive / actual_positive
96 | 
97 | # True negative rate.
98 | def get_tnr(y, y_hat):
99 |     true_negative = sum(1 - (yi or yi_hat) for yi, yi_hat in zip(y, y_hat))
100 |     actual_negative = len(y) - sum(y)
101 |     return true_negative / actual_negative
102 | 
103 | # Compute the points of the ROC curve.
104 | def get_roc(y, y_hat_prob):
105 |     thresholds = sorted(set(y_hat_prob), reverse=True)
106 |     ret = [[0, 0]]
107 |     for threshold in thresholds:
108 |         y_hat = [int(yi_hat_prob >= threshold) for yi_hat_prob in y_hat_prob]
109 |         ret.append([get_tpr(y, y_hat), 1 - get_tnr(y, y_hat)])
110 |     return ret
111 | # Compute the AUC (the area under the ROC curve).
112 | def get_auc(y, y_hat_prob):
113 |     roc = iter(get_roc(y, y_hat_prob))
114 |     tpr_pre, fpr_pre = next(roc)
115 |     auc = 0
116 |     for tpr, fpr in roc:
117 |         auc += (tpr + tpr_pre) * (fpr - fpr_pre) / 2
118 |         tpr_pre = tpr
119 |         fpr_pre = fpr
120 |     return auc
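# A worked example of get_roc / get_auc above, with assumed toy inputs: for
# y = [1, 0] and y_hat_prob = [0.9, 0.4], the thresholds 0.9 and 0.4 give the
# [tpr, fpr] points [0, 0], [1, 0], [1, 1], and the trapezoid rule then
# yields AUC = 1.0, as expected for a perfect ranking.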
121 | 
122 | # A node of the decision tree.
123 | class Node(object):
124 |     def __init__(self, prob=None):
125 |         self.prob = prob
126 |         self.left = None
127 |         self.right = None
128 |         self.feature = None
129 |         self.split = None
130 | 
131 | 
132 | class DecisionTree(object):
133 |     # This decision tree only supports binary classification with ID3.
134 |     # root is the root node and depth is the depth of the tree.
135 |     def __init__(self):
136 |         self.root = Node()
137 |         self.depth = 1
138 |         self._rules = None
139 | 
140 |     # Measure the effect of splitting on feature at the given split point.
141 |     def get_split_effect(self, X, y, idx, feature, split):
142 |         n = len(idx)
143 |         pos_cnt = [0, 0]
144 |         cnt = [0, 0]
145 |         for i in idx:
146 |             xi, yi = X[i][feature], y[i]
147 |             if xi < split:
148 |                 cnt[0] += 1
149 |                 pos_cnt[0] += yi
150 |             else:
151 |                 cnt[1] += 1
152 |                 pos_cnt[1] += y[i]
153 |         # Positive-class probability and sample proportion on each side.
154 |         prob = [pos_cnt[0] / cnt[0], pos_cnt[1] / cnt[1]]
155 |         rate = [cnt[0] / n, cnt[1] / n]
156 |         return prob, rate
157 | 
158 |     # Entropy of a Bernoulli distribution with parameter p.
159 |     def get_entropy(self, p):
160 |         if p == 1 or p == 0:
161 |             return 0
162 |         else:
163 |             q = 1 - p
164 |             return -(p * log2(p) + q * log2(q))
165 | 
166 |     # Entropy of the labels indexed by idx.
167 |     def get_info(self, y, idx):
168 |         p = sum(y[i] for i in idx) / len(idx)
169 |         return self.get_entropy(p)
170 | 
171 |     # Conditional entropy after the split.
172 |     def get_cond_info(self, prob, rate):
173 |         info_left = self.get_entropy(prob[0])
174 |         info_right = self.get_entropy(prob[1])
175 |         return rate[0] * info_left + rate[1] * info_right
176 |     # Find the best split point for a given feature.
177 |     def choose_split(self, X, y, idxs, feature):
178 |         unique = set([X[i][feature] for i in idxs])
179 |         if len(unique) == 1:
180 |             return None
181 |         unique.remove(min(unique))
182 |         def f(split):
183 |             info = self.get_info(y, idxs)
184 |             prob, rate = self.get_split_effect(X, y, idxs, feature, split)
185 |             cond_info = self.get_cond_info(prob, rate)
186 |             # Information gain.
187 |             gain = info - cond_info
188 |             return gain, split, prob
189 |         # Pick the split point with the largest information gain.
190 |         gain, split, prob = max((f(split) for split in unique), key=lambda x: x[0])
191 | 
192 |         return gain, feature, split, prob
193 | 
194 |     # Find the feature with the largest information gain.
195 |     def choose_feature(self, X, y, idxs):
196 |         m = len(X[0])
197 |         split_rets = map(lambda j: self.choose_split(X, y, idxs, j), range(m))
198 |         split_rets = filter(lambda x: x is not None, split_rets)
199 |         return max(split_rets, default=None, key=lambda x: x[0])
200 | 
201 |     def expr2literal(self, expr):
202 |         feature, op, split = expr
203 |         op = ">=" if op == 1 else "<"
204 |         return "Feature%d %s %.4f" % (feature, op, split)
205 | 
206 |     # Collect the rules of all the leaf nodes.
207 |     def get_rules(self):
208 |         que = [[self.root, []]]
209 |         self._rules = []
210 |         while que:
211 |             nd, exprs = que.pop(0)
212 |             if not (nd.left or nd.right):
213 |                 literals = list(map(self.expr2literal, exprs))
214 |                 self._rules.append([literals, nd.prob])
215 |             if nd.left:
216 |                 rule_left = copy(exprs)
217 |                 rule_left.append([nd.feature, -1, nd.split])
218 |                 que.append([nd.left, rule_left])
219 |             if nd.right:
220 |                 rule_right = copy(exprs)
221 |                 rule_right.append([nd.feature, 1, nd.split])
222 |                 que.append([nd.right, rule_right])
223 | 
224 |     # Fit the tree with breadth-first node expansion.
225 |     def fit(self, X, y, max_depth=4, min_samples_split=2):
226 |         idxs = list(range(len(y)))
227 |         que = [(self.depth + 1, self.root, idxs)]
228 |         while que:
229 |             depth, nd, idxs = que.pop(0)
230 |             if depth > max_depth:
231 |                 depth -= 1
232 |                 break
233 |             if len(idxs) < min_samples_split or nd.prob == 1 or nd.prob == 0:
234 |                 continue
235 |             split_ret = self.choose_feature(X, y, idxs)
236 |             if split_ret is None:
237 |                 continue
238 |             _, feature, split, prob = split_ret
239 |             nd.feature = feature
240 |             nd.split = split
241 |             nd.left = Node(prob[0])
242 |             nd.right = Node(prob[1])
243 |             idxs_split = list_split(X, idxs, feature, split)
244 |             que.append((depth + 1, nd.left, idxs_split[0]))
245 |             que.append((depth + 1, nd.right, idxs_split[1]))
246 |         self.depth = depth
247 |         self.get_rules()
248 | 
249 |     def print_rules(self):
250 |         for i, rule in enumerate(self._rules):
251 |             literals, prob = rule
252 |             print("Rule %d: " % i, ' | '.join(literals) + ' => y_hat %.4f' % prob)
253 |         print()
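    # For illustration (assumed output, not from a real run): print_rules
    # emits one line per leaf, e.g.
    #   Rule 0: Feature20 < 16.7950 | Feature27 < 0.1423 => y_hat 0.9890
    # meaning a sample that satisfies both comparisons is predicted positive
    # with probability 0.989; _predict below walks the same comparisons from
    # the root down to a leaf.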
_predict(self, Xi): 255 | nd = self.root 256 | while nd.left and nd.right: 257 | if Xi[nd.feature] < nd.split: 258 | nd = nd.left 259 | else: 260 | nd = nd.right 261 | return nd.prob 262 | 263 | def predict(self, X, threshold=0.5): 264 | return [int(self._predict(Xi) >= threshold) for Xi in X] 265 | 266 | 267 | @run_time 268 | # 效果评估 269 | def main(): 270 | print("Tesing the performance of DecisionTree...") 271 | data, label = load_data() 272 | data_train, data_test, label_train, label_test = train_test_split(data, label, random_state=100) 273 | clf = DecisionTree() 274 | clf.fit(data_train, label_train) 275 | clf.print_rules() 276 | y_hat = clf.predict(data_test) 277 | acc = get_acc(label_test, y_hat) 278 | print("Accuracy is %.3f" % acc) 279 | -------------------------------------------------------------------------------- /gauss_nb.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import numpy as np 4 | from numpy import ndarray, exp, pi, sqrt 5 | from random import randint, seed, random 6 | from numpy.random import choice, seed 7 | from collections import Counter 8 | 9 | # 加载肺癌数据集 10 | def load_data(): 11 | f = open("boston/breast_cancer.csv") 12 | X = [] 13 | y = [] 14 | for line in f: 15 | line = line[:-1].split(',') 16 | xi = [float(s) for s in line[:-1]] 17 | yi = line[-1] 18 | if '.' in yi: 19 | yi = float(yi) 20 | else: 21 | yi = int(yi) 22 | X.append(xi) 23 | y.append(yi) 24 | f.close() 25 | return X, y 26 | 27 | # 划分训练集和测试集 28 | def train_test_split(X, y, prob=0.7, random_state=None): 29 | if random_state is not None: 30 | seed(random_state) 31 | X_train = [] 32 | X_test = [] 33 | y_train = [] 34 | y_test = [] 35 | for i in range(len(X)): 36 | if random() < prob: 37 | X_train.append(X[i]) 38 | y_train.append(y[i]) 39 | else: 40 | X_test.append(X[i]) 41 | y_test.append(y[i]) 42 | seed() 43 | return np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test) 44 | 45 | # 准确率 46 | def get_acc(y, y_hat): 47 | return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y) 48 | 49 | # 查准率 50 | def get_precision(y, y_hat): 51 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 52 | predicted_positive = sum(y_hat) 53 | return true_postive / predicted_positive 54 | 55 | # 查全率 56 | def get_recall(y, y_hat): 57 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 58 | actual_positive = sum(y) 59 | return true_postive / actual_positive 60 | 61 | # 计算真正率 62 | def get_tpr(y, y_hat): 63 | true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 64 | actual_positive = sum(y) 65 | return true_positive / actual_positive 66 | 67 | # 计算真负率 68 | def get_tnr(y, y_hat): 69 | true_negative = sum(1 - (yi or yi_hat) for yi, yi_hat in zip(y, y_hat)) 70 | actual_negative = len(y) - sum(y) 71 | return true_negative / actual_negative 72 | 73 | # 画ROC曲线 74 | def get_roc(y, y_hat_prob): 75 | thresholds = sorted(set(y_hat_prob), reverse=True) 76 | ret = [[0, 0]] 77 | for threshold in thresholds: 78 | y_hat = [int(yi_hat_prob >= threshold) for yi_hat_prob in y_hat_prob] 79 | ret.append([get_tpr(y, y_hat), 1 - get_tnr(y, y_hat)]) 80 | return ret 81 | # 计算AUC(ROC曲线下方的面积) 82 | def get_auc(y, y_hat_prob): 83 | roc = iter(get_roc(y, y_hat_prob)) 84 | tpr_pre, fpr_pre = next(roc) 85 | auc = 0 86 | for tpr, fpr in roc: 87 | auc += (tpr + tpr_pre) * (fpr - fpr_pre) / 2 88 | tpr_pre = tpr 89 | fpr_pre = fpr 90 | return auc 91 | 92 | class GaussianNB(object): 93 | # 初始化、存储先验概率、训练集的均值、方差及label的类别数量 94 | def 
__init__(self): 95 | self.prior = None 96 | self.avgs = None 97 | self.vars = None 98 | self.n_class = None 99 | 100 | # 计算先验概率 101 | # 通过Python自带的Counter计算每个类别的占比,再将结果存储到numpy数组中 102 | def get_prior(self, label): 103 | cnt = Counter(label) 104 | prior = np.array([cnt[i] / len(label) for i in range(len(cnt))]) 105 | return prior 106 | 107 | # 计算训练集均值 108 | # 每个label类别分别计算均值 109 | def get_avgs(self, data, label): 110 | return np.array([data[label == i].mean(axis=0) for i in range(self.n_class)]) 111 | 112 | # 计算训练集方差 113 | def get_vasrs(self, data, label): 114 | return np.array([data[label == i].var(axis=0) for i in range(self.n_class)]) 115 | 116 | # 计算似然度 117 | # 通过高斯分布的概率密度函数计算出似然再连乘得到似然度 118 | # .prod代表连乘操作 119 | def get_likehood(self, row): 120 | return (1 / sqrt(2 * pi * self.vars) * exp( 121 | -(row - self.avgs) ** 2 / (2 * self.vars))).prod(axis=1) 122 | 123 | # 训练模型 124 | def fit(self, data, label): 125 | self.prior = self.get_prior(label) 126 | self.n_class = len(self.prior) 127 | self.avgs = self.get_avgs(data, label) 128 | self.vars = self.get_vasrs(data, label) 129 | 130 | # 预测概率prob 131 | # 用先验概率乘以似然度再归一化得到每个label的prob 132 | def predict_prob(self, data): 133 | likehood = np.apply_along_axis(self.get_likehood, axis=1, arr=data) 134 | probs = self.prior * likehood 135 | probs_sum = probs.sum(axis=1) 136 | return probs / probs_sum[:, None] 137 | 138 | # 预测label 139 | # 对于单个样本,取prob最大值所对应的类别,就是label的预测值。 140 | def predict(self, data): 141 | return self.predict_prob(data).argmax(axis=1) 142 | 143 | 144 | # 效果评估 145 | def main(): 146 | print("Tesing the performance of Gaussian NaiveBayes...") 147 | data, label = load_data() 148 | data_train, data_test, label_train, label_test = train_test_split(data, label, random_state=100) 149 | clf = GaussianNB() 150 | clf.fit(data_train, label_train) 151 | y_hat = clf.predict(data_test) 152 | acc = get_acc(label_test, y_hat) 153 | print("Accuracy is %.3f" % acc) 154 | 155 | 156 | main() 157 | -------------------------------------------------------------------------------- /gbdt_classify.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from copy import copy 3 | from random import randint, seed, random 4 | from time import time 5 | from regression import RegressionTree 6 | from gbdt_regressor import GradientBoostingBase 7 | from random import choice 8 | from math import exp, log 9 | 10 | # 统计程序运行时间函数 11 | # fn代表运行的函数 12 | def run_time(fn): 13 | def fun(): 14 | start = time() 15 | fn() 16 | ret = time() - start 17 | if ret < 1e-6: 18 | unit = "ns" 19 | ret *= 1e9 20 | elif ret < 1e-3: 21 | unit = "us" 22 | ret *= 1e6 23 | elif ret < 1: 24 | unit = "ms" 25 | ret *= 1e3 26 | else: 27 | unit = "s" 28 | print("Total run time is %.1f %s\n" % (ret, unit)) 29 | return fun() 30 | 31 | def load_cancer(): 32 | f = open("boston/breast_cancer.csv") 33 | X = [] 34 | y = [] 35 | for line in f: 36 | line = line[:-1].split(',') 37 | xi = [float(s) for s in line[:-1]] 38 | yi = line[-1] 39 | if '.' 
in yi: 40 | yi = float(yi) 41 | else: 42 | yi = int(yi) 43 | X.append(xi) 44 | y.append(yi) 45 | f.close() 46 | return X, y 47 | 48 | # 划分训练集和测试集 49 | def train_test_split(X, y, prob=0.7, random_state=None): 50 | if random_state is not None: 51 | seed(random_state) 52 | X_train = [] 53 | X_test = [] 54 | y_train = [] 55 | y_test = [] 56 | for i in range(len(X)): 57 | if random() < prob: 58 | X_train.append(X[i]) 59 | y_train.append(y[i]) 60 | else: 61 | X_test.append(X[i]) 62 | y_test.append(y[i]) 63 | seed() 64 | return X_train, X_test, y_train, y_test 65 | 66 | # 准确率 67 | def get_acc(y, y_hat): 68 | return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y) 69 | 70 | # 查准率 71 | def get_precision(y, y_hat): 72 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 73 | predicted_positive = sum(y_hat) 74 | return true_postive / predicted_positive 75 | 76 | # 查全率 77 | def get_recall(y, y_hat): 78 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 79 | actual_positive = sum(y) 80 | return true_postive / actual_positive 81 | 82 | # 计算真正率 83 | def get_tpr(y, y_hat): 84 | true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 85 | actual_positive = sum(y) 86 | return true_positive / actual_positive 87 | 88 | # 计算真负率 89 | def get_tnr(y, y_hat): 90 | true_negative = sum(1 - (yi or yi_hat) for yi, yi_hat in zip(y, y_hat)) 91 | actual_negative = len(y) - sum(y) 92 | return true_negative / actual_negative 93 | 94 | # 画ROC曲线 95 | def get_roc(y, y_hat_prob): 96 | thresholds = sorted(set(y_hat_prob), reverse=True) 97 | ret = [[0, 0]] 98 | for threshold in thresholds: 99 | y_hat = [int(yi_hat_prob >= threshold) for yi_hat_prob in y_hat_prob] 100 | ret.append([get_tpr(y, y_hat), 1 - get_tnr(y, y_hat)]) 101 | return ret 102 | # 计算AUC(ROC曲线下方的面积) 103 | def get_auc(y, y_hat_prob): 104 | roc = iter(get_roc(y, y_hat_prob)) 105 | tpr_pre, fpr_pre = next(roc) 106 | auc = 0 107 | for tpr, fpr in roc: 108 | auc += (tpr + tpr_pre) * (fpr - fpr_pre) / 2 109 | tpr_pre = tpr 110 | fpr_pre = fpr 111 | return auc 112 | 113 | def model_evaluation(clf, X, y): 114 | y_hat = clf.predict(X) 115 | y_hat_prob = [clf._predict(Xi) for Xi in X] 116 | ret = dict() 117 | ret["Accuracy"] = get_acc(y, y_hat) 118 | ret["Recall"] = get_recall(y, y_hat) 119 | ret['Precision'] = get_precision(y, y_hat) 120 | ret['AUC'] = get_auc(y, y_hat_prob) 121 | for k, v in ret.items(): 122 | print("%s: %.3f" % (k, v)) 123 | print() 124 | return ret 125 | 126 | def sigmoid(x, x_min=-100): 127 | return 1 / (1 + exp(-x)) if x > x_min else 0 128 | 129 | class GradientBoostingClassifier(GradientBoostingBase): 130 | def __init__(self): 131 | GradientBoostingBase.__init__(self) 132 | self.fn = sigmoid 133 | def get_init_val(self, y): 134 | n = len(y) 135 | y_sum = sum(y) 136 | return log((y_sum) / (n - y_sum)) 137 | def get_score(self, idxs, y_hat, residuals): 138 | numerator = denominator = 0 139 | for idx in idxs: 140 | numerator += residuals[idx] 141 | denominator += y_hat[idx] * (1 - y_hat[idx]) 142 | 143 | return numerator / denominator 144 | def predict(self, X, threshold=0.5): 145 | return [int(self._predict(Xi) >= threshold) for Xi in X] 146 | 147 | 148 | @run_time 149 | def main(): 150 | print('Testing the accuracy of GBDT ClassifyTree...') 151 | X, y = load_cancer() 152 | X_train, X_test, y_train, y_test = train_test_split( 153 | X, y, random_state=20) 154 | clf = GradientBoostingClassifier() 155 | clf.fit(X_train, y_train, n_estimators=2, 156 | lr=0.8, max_depth=3, min_samples_split=2) 157 | 
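# (Editor's sketch, not part of the original file; the numbers below are
# hypothetical.) The leaf value returned by get_score above is the one-step
# Newton estimate for the log-loss objective:
#     gamma = sum_i(residual_i) / sum_i(p_i * (1 - p_i))
# where p_i is the predicted probability for sample i (note the code reads
# p_i straight out of y_hat, so y_hat is assumed to already hold
# probabilities rather than raw scores). For a leaf holding two samples with
# residuals [0.4, -0.1] and probabilities [0.6, 0.1]:
#     numerator   = 0.4 - 0.1              = 0.3
#     denominator = 0.6 * 0.4 + 0.1 * 0.9  = 0.33
#     gamma       = 0.3 / 0.33            ~= 0.909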
model_evaluation(clf, X_test, y_test) 158 | 159 | -------------------------------------------------------------------------------- /gbdt_classify_regression.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from copy import copy 3 | from random import randint, seed, random 4 | from time import time 5 | from regression import RegressionTree 6 | from random import choice 7 | 8 | # 统计程序运行时间函数 9 | # fn代表运行的函数 10 | def run_time(fn): 11 | def fun(): 12 | start = time() 13 | fn() 14 | ret = time() - start 15 | if ret < 1e-6: 16 | unit = "ns" 17 | ret *= 1e9 18 | elif ret < 1e-3: 19 | unit = "us" 20 | ret *= 1e6 21 | elif ret < 1: 22 | unit = "ms" 23 | ret *= 1e3 24 | else: 25 | unit = "s" 26 | print("Total run time is %.1f %s\n" % (ret, unit)) 27 | return fun() 28 | 29 | def load_data(): 30 | f = open("boston/housing.csv") 31 | X = [] 32 | y = [] 33 | for line in f: 34 | line = line[:-1].split(',') 35 | xi = [float(s) for s in line[:-1]] 36 | yi = line[-1] 37 | if '.' in yi: 38 | yi = float(yi) 39 | else: 40 | yi = int(yi) 41 | X.append(xi) 42 | y.append(yi) 43 | f.close() 44 | return X, y 45 | 46 | # 划分训练集和测试集 47 | def train_test_split(X, y, prob=0.7, random_state=None): 48 | if random_state is not None: 49 | seed(random_state) 50 | X_train = [] 51 | X_test = [] 52 | y_train = [] 53 | y_test = [] 54 | for i in range(len(X)): 55 | if random() < prob: 56 | X_train.append(X[i]) 57 | y_train.append(y[i]) 58 | else: 59 | X_test.append(X[i]) 60 | y_test.append(y[i]) 61 | seed() 62 | return X_train, X_test, y_train, y_test 63 | 64 | 65 | class GradientBoostingBase(object): 66 | # 初始化,存储回归树、学习率、初始预测值和变换函数。 67 | # (注:回归不需要做变换,因此函数的返回值等于参数) 68 | def __init__(self): 69 | self.trees = None 70 | self.lr = None 71 | self.init_val = None 72 | self.fn = lambda x: x 73 | 74 | # 计算初始预测值,初始预测值即y的平均值。 75 | def get_init_val(self, y): 76 | return sum(y) / len(y) 77 | 78 | # 计算残差 79 | def get_residuals(self, y, y_hat): 80 | return [yi - self.fn(y_hat_i) for yi, y_hat_i in zip(y, y_hat)] 81 | 82 | # 找到例子属于哪个分类的叶子节点 83 | def match_node(self, row, tree): 84 | nd = tree.root 85 | while nd.left and nd.right: 86 | if row[nd.feature] < nd.split: 87 | nd = nd.left 88 | else: 89 | nd = nd.right 90 | return nd 91 | 92 | # 得到回归树的所有的叶子节点 93 | def get_leaves(self, tree): 94 | nodes = [] 95 | que = [tree.root] 96 | while que: 97 | node = que.pop(0) 98 | if node.left is None or node.right is None: 99 | nodes.append(node) 100 | continue 101 | left_node = node.left 102 | right_node = node.right 103 | que.append(left_node) 104 | que.append(right_node) 105 | return nodes 106 | 107 | # 将样本的索引划分为回归树的相应叶节点。 108 | # 返回一个字典,类似于:{node1: [1, 3, 5], node2: [2, 4, 6]...},代表哪个节点对哪些样本进行了决策(分类) 109 | def divide_regions(self, tree, nodes, X): 110 | regions = {node: [] for node in nodes} 111 | for i, row in enumerate(X): 112 | node = self.match_node(row, tree) 113 | regions[node].append(i) 114 | return regions 115 | 116 | # 计算回归树的叶子节点值 117 | def get_score(self, idxs, y_hat, residuals): 118 | 119 | return None 120 | 121 | 122 | # 更新回归树的叶子节点值 123 | def update_score(self, tree, X, y_hat, residuals): 124 | nodes = self.get_leaves(tree) 125 | regions = self.divide_regions(tree, nodes, X) 126 | for node, idxs in regions.items(): 127 | node.score = self.get_score(idxs, y_hat, residuals) 128 | tree.get_rules() 129 | 130 | # 训练模型的时候需要注意以下几点: 131 | # 1.控制树的最大深度max_depth; 132 | # 2.控制分裂时最少的样本量min_samples_split; 133 | # 3.训练每一棵回归树的时候要乘以一个学习率lr,防止模型过拟合; 134 | # 4.对样本进行抽样的时候要采用有放回的抽样方式。 135 | def fit(self, 
X, y, n_estimators, lr, max_depth, min_samples_split, subsample=None): 136 | self.init_val = self.get_init_val(y) 137 | n = len(y) 138 | y_hat = [self.init_val] * n 139 | residuals = self.get_residuals(y, y_hat) 140 | 141 | self.trees = [] 142 | self.lr = lr 143 | for _ in range(n_estimators): 144 | idx = range(n) 145 | if subsample is not None: 146 | k = int(subsample * n) 147 | idx = choices(population=idx, k=k) 148 | X_sub = [X[i] for i in idx] 149 | residuals_sub = [residuals[i] for i in idx] 150 | y_hat_sub = [y_hat[i] for i in idx] 151 | tree = RegressionTree() 152 | tree.fit(X_sub, residuals_sub, max_depth, min_samples_split) 153 | # Update scores of tree leaf nodes 154 | self.update_score(tree, X_sub, y_hat_sub, residuals_sub) 155 | # Update y_hat 156 | # y_hat = [y_hat_i + lr * res_hat_i for y_hat_i, res_hat_i in zip(y_hat, tree.predict(X))] 157 | # Update residuals 158 | residuals = self.get_residuals(y, y_hat) 159 | self.trees.append(tree) 160 | 161 | # 对单个样本进行预测 162 | def _predict(self, Xi): 163 | ret = self.init_val + sum(self.lr * tree._predict(Xi) for tree in self.trees) 164 | return self.fn(ret) 165 | 166 | # 对多个样本进行预测 167 | def predict(self, X): 168 | #return [self._predict(Xi) for Xi in X] 169 | return None 170 | -------------------------------------------------------------------------------- /gbdt_regression.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from copy import copy 3 | from random import randint, seed, random 4 | from time import time 5 | from regression import RegressionTree 6 | from random import choice 7 | 8 | # 统计程序运行时间函数 9 | # fn代表运行的函数 10 | def run_time(fn): 11 | def fun(): 12 | start = time() 13 | fn() 14 | ret = time() - start 15 | if ret < 1e-6: 16 | unit = "ns" 17 | ret *= 1e9 18 | elif ret < 1e-3: 19 | unit = "us" 20 | ret *= 1e6 21 | elif ret < 1: 22 | unit = "ms" 23 | ret *= 1e3 24 | else: 25 | unit = "s" 26 | print("Total run time is %.1f %s\n" % (ret, unit)) 27 | return fun() 28 | 29 | def load_data(): 30 | f = open("boston/housing.csv") 31 | X = [] 32 | y = [] 33 | for line in f: 34 | line = line[:-1].split(',') 35 | xi = [float(s) for s in line[:-1]] 36 | yi = line[-1] 37 | if '.' in yi: 38 | yi = float(yi) 39 | else: 40 | yi = int(yi) 41 | X.append(xi) 42 | y.append(yi) 43 | f.close() 44 | return X, y 45 | 46 | # 划分训练集和测试集 47 | def train_test_split(X, y, prob=0.7, random_state=None): 48 | if random_state is not None: 49 | seed(random_state) 50 | X_train = [] 51 | X_test = [] 52 | y_train = [] 53 | y_test = [] 54 | for i in range(len(X)): 55 | if random() < prob: 56 | X_train.append(X[i]) 57 | y_train.append(y[i]) 58 | else: 59 | X_test.append(X[i]) 60 | y_test.append(y[i]) 61 | seed() 62 | return X_train, X_test, y_train, y_test 63 | 64 | # 计算回归模型的拟合优度 65 | def get_r2(reg, X, y): 66 | y_hat = reg.predict(X) 67 | m = len(y) 68 | n = len(y_hat) 69 | sse = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y, y_hat)) 70 | y_avg = sum(y) / len(y) 71 | sst = sum((yi - y_avg) ** 2 for yi in y) 72 | r2 = 1 - sse / sst 73 | print("Test r2 is %.3f!" 
% r2) 74 | return r2 75 | 76 | class GradientBoostingBase(object): 77 | # 初始化,存储回归树、学习率、初始预测值和变换函数。 78 | # (注:回归不需要做变换,因此函数的返回值等于参数) 79 | def __init__(self): 80 | self.trees = None 81 | self.lr = None 82 | self.init_val = None 83 | self.fn = lambda x: x 84 | 85 | # 计算初始预测值,初始预测值即y的平均值。 86 | def get_init_val(self, y): 87 | return sum(y) / len(y) 88 | 89 | # 计算残差 90 | def get_residuals(self, y, y_hat): 91 | return [yi - self.fn(y_hat_i) for yi, y_hat_i in zip(y, y_hat)] 92 | 93 | # 找到例子属于哪个分类的叶子节点 94 | def match_node(self, row, tree): 95 | nd = tree.root 96 | while nd.left and nd.right: 97 | if row[nd.feature] < nd.split: 98 | nd = nd.left 99 | else: 100 | nd = nd.right 101 | return nd 102 | 103 | # 得到回归树的所有的叶子节点 104 | def get_leaves(self, tree): 105 | nodes = [] 106 | que = [tree.root] 107 | while que: 108 | node = que.pop(0) 109 | if node.left is None or node.right is None: 110 | nodes.append(node) 111 | continue 112 | left_node = node.left 113 | right_node = node.right 114 | que.append(left_node) 115 | que.append(right_node) 116 | return nodes 117 | 118 | # 将样本的索引划分为回归树的相应叶节点。 119 | # 返回一个字典,类似于:{node1: [1, 3, 5], node2: [2, 4, 6]...},代表哪个节点对哪些样本进行了决策(分类) 120 | def divide_regions(self, tree, nodes, X): 121 | regions = {node: [] for node in nodes} 122 | for i, row in enumerate(X): 123 | node = self.match_node(row, tree) 124 | regions[node].append(i) 125 | return regions 126 | 127 | # 计算回归树的叶子节点值 128 | def get_score(self, idxs, y_hat, residuals): 129 | return None 130 | 131 | # 更新回归树的叶子节点值 132 | def update_score(self, tree, X, y_hat, residuals): 133 | nodes = self.get_leaves(tree) 134 | regions = self.divide_regions(tree, nodes, X) 135 | for node, idxs in regions.items(): 136 | node.score = self.get_score(idxs, y_hat, residuals) 137 | tree.get_rules() 138 | 139 | # 训练模型的时候需要注意以下几点: 140 | # 1.控制树的最大深度max_depth; 141 | # 2.控制分裂时最少的样本量min_samples_split; 142 | # 3.训练每一棵回归树的时候要乘以一个学习率lr,防止模型过拟合; 143 | # 4.对样本进行抽样的时候要采用有放回的抽样方式。 144 | def fit(self, X, y, n_estimators, lr, max_depth, min_samples_split, subsample=None): 145 | self.init_val = self.get_init_val(y) 146 | n = len(y) 147 | y_hat = [self.init_val] * n 148 | residuals = self.get_residuals(y, y_hat) 149 | 150 | self.trees = [] 151 | self.lr = lr 152 | for _ in range(n_estimators): 153 | idx = range(n) 154 | if subsample is not None: 155 | k = int(subsample * n) 156 | idx = choices(population=idx, k=k) 157 | X_sub = [X[i] for i in idx] 158 | residuals_sub = [residuals[i] for i in idx] 159 | y_hat_sub = [y_hat[i] for i in idx] 160 | tree = RegressionTree() 161 | tree.fit(X_sub, residuals_sub, max_depth, min_samples_split) 162 | # Update scores of tree leaf nodes 163 | # self.update_score(tree, X_sub, y_hat_sub, residuals_sub) 164 | # Update y_hat 165 | y_hat = [y_hat_i + lr * res_hat_i for y_hat_i, res_hat_i in zip(y_hat, tree.predict(X))] 166 | # Update residuals 167 | residuals = self.get_residuals(y, y_hat) 168 | self.trees.append(tree) 169 | 170 | # 对单个样本进行预测 171 | def _predict(self, Xi): 172 | ret = self.init_val + sum(self.lr * tree._predict(Xi) for tree in self.trees) 173 | return self.fn(ret) 174 | 175 | # 对多个样本进行预测 176 | def predict(self, X): 177 | return [self._predict(Xi) for Xi in X] 178 | @run_time 179 | def main(): 180 | print('Testing the accuracy of GBDT RegressionTree...') 181 | X, y = load_data() 182 | X_train, X_test, y_train, y_test = train_test_split( 183 | X, y, random_state=10) 184 | reg = GradientBoostingBase() 185 | reg.fit(X=X_train, y=y_train, n_estimators=4, 186 | lr=0.5, max_depth=3, min_samples_split=2) 187 | 
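# (Editor's note, an illustrative sketch rather than original code.) For
# squared error the training loop above reduces to fitting each tree on the
# current residuals and accumulating its learning-rate-scaled predictions:
#     F_0(x) = mean(y)
#     F_m(x) = F_{m-1}(x) + lr * h_m(x),  where h_m is fit on y - F_{m-1}(x)
# With hypothetical values mean(y) = 22.5, lr = 0.5 and a first tree that
# predicts 4.0 for some sample, the updated prediction after one round is
# 22.5 + 0.5 * 4.0 = 24.5.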
get_r2(reg, X_test, y_test) 188 | 189 | -------------------------------------------------------------------------------- /image/gbdt_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/gbdt_regression.png -------------------------------------------------------------------------------- /image/kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/kmeans.png -------------------------------------------------------------------------------- /image/knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/knn.png -------------------------------------------------------------------------------- /image/regression_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/regression_tree.png -------------------------------------------------------------------------------- /image/ridge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/ridge.png -------------------------------------------------------------------------------- /image/weixin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/machine-learning/8cfae1e944faa67244f18d03d10ca7dd7323be19/image/weixin.jpg -------------------------------------------------------------------------------- /kd_tree.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from time import time 3 | from copy import copy 4 | from random import randint 5 | 6 | # 产生一个[low,high)区间的随机数组 7 | def gen_data(low, high, n_rows, n_cols=None): 8 | if n_cols is None: 9 | ret = [randint(low, high) for _ in range(n_rows)] 10 | else: 11 | ret = [[randint(low, high) for _ in range(n_cols)] 12 | for _ in range(n_rows)] 13 | return ret 14 | 15 | def get_euclidean_distance(arr1, arr2): 16 | return sum((x1 - x2) ** 2 for x1, x2 in zip(arr1, arr2)) 17 | 18 | class Node(object): 19 | 20 | def __init__(self): 21 | self.father = None 22 | self.left = None 23 | self.right = None 24 | self.feature = None 25 | self.split = None 26 | 27 | def __str__(self): 28 | return ("feature: %s, split: %s" % (str(self.feature), str(self.split))) 29 | 30 | @property 31 | def brother(self): 32 | if not self.father: 33 | ret = None 34 | else: 35 | if self.father.left is self: 36 | ret = self.father.right 37 | else: 38 | ret = self.father.left 39 | return ret 40 | 41 | class KDTree(object): 42 | 43 | def __init__(self): 44 | # root代表KD-Tree的根节点 45 | self.root = Node() 46 | 47 | def __str__(self): 48 | # 展示KD-Tree每个节点的关系 49 | ret = [] 50 | i = 0 51 | que = [(self.root, -1)] 52 | while que: 53 | nd, idx_father = que.pop(0) 54 | ret.append("%d -> %d: %s" % (idx_father, i, str(nd))) 55 | if nd.left: 56 | que.append((nd.left, i)) 57 | if nd.right: 58 | que.append((nd.right, i)) 59 | i += 1 60 | return "\n".join(ret) 61 | 62 | def get_median_idx(self, X, idxs, feature): 63 | # 计算一列数据的中位数 64 | n = len(idxs) 65 | # 
忽略n为奇数或者偶数 66 | k = n // 2 67 | # 以元组形式获取列j的所有索引和元素 68 | col = map(lambda i: (i, X[i][feature]), idxs) 69 | # 根据元素的值对元组进行排序,并获取相应的索引 70 | sorted_idxs = map(lambda x: x[0], sorted(col, key=lambda x:x[1])) 71 | # 搜索中值 72 | median_idx = list(sorted_idxs)[k] 73 | return median_idx 74 | 75 | def get_variance(self, X, idxs, feature): 76 | # 计算一列数据的方差 77 | n = len(idxs) 78 | col_sum = col_sum_sqr = 0 79 | for idx in idxs: 80 | xi = X[idx][feature] 81 | col_sum += xi 82 | col_sum_sqr += xi ** 2 83 | # D(X) = E{[X-E(X)]^2} = E(X^2)-[E(X)]^2 84 | return col_sum_sqr / n - (col_sum / n) ** 2 85 | # 取方差最大的特征作为分割点特征 86 | def choose_feature(self, X, idxs): 87 | m = len(X[0]) 88 | variances = map(lambda j: (j, self.get_variance(X, idxs, j)), range(m)) 89 | return max(variances, key=lambda x: x[1])[0] 90 | 91 | def split_feature(self, X, idxs, feature, median_idx): 92 | idxs_split = [[], []] 93 | split_val = X[median_idx][feature] 94 | for idx in idxs: 95 | if(idx == median_idx): 96 | continue 97 | xi = X[idx][feature] 98 | if xi < split_val: 99 | idxs_split[0].append(idx) 100 | else: 101 | idxs_split[1].append(idx) 102 | return idxs_split 103 | 104 | # 使用广度优先搜索的方式建立KD Tree,注意要对X进行归一化。 105 | def build_tree(self, X, y): 106 | nd = self.root 107 | idxs = range(len(X)) 108 | que = [(nd, idxs)] 109 | while que: 110 | nd, idxs = que.pop(0) 111 | n = len(idxs) 112 | if n == 1: 113 | nd.split = (X[idxs[0]], y[idxs[0]]) 114 | continue 115 | feature = self.choose_feature(X, idxs) 116 | median_idx = self.get_median_idx(X, idxs, feature) 117 | idxs_left, idxs_right = self.split_feature(X, idxs, feature, median_idx) 118 | nd.feature = feature 119 | nd.split = (X[median_idx], y[median_idx]) 120 | if idxs_left != []: 121 | nd.left = Node() 122 | nd.left.father = nd 123 | que.append((nd.left, idxs_left)) 124 | if idxs_right != []: 125 | nd.right = Node() 126 | nd.right.father = nd 127 | que.append((nd.right, idxs_right)) 128 | 129 | # 比较目标元素与当前结点的当前feature,访问对应的子节点。 130 | # 反复执行上述过程,直到到达叶子节点。 131 | def search(self, Xi, nd): 132 | while nd.left or nd.right: 133 | if nd.left is None: 134 | nd = nd.right 135 | elif nd.right is None: 136 | nd = nd.left 137 | else: 138 | if Xi[nd.feature] < nd.split[0][nd.feature]: 139 | nd = nd.left 140 | else: 141 | nd = nd.right 142 | return nd 143 | 144 | #计算目标元素与某个节点的欧氏距离,注意get_euclidean_distance 145 | # 这个函数没有进行开根号的操作,所以求出来的是欧氏距离的平方。 146 | def get_eu_dist(self, Xi, nd): 147 | X0 = nd.split[0] 148 | return get_euclidean_distance(Xi, X0) 149 | 150 | # 计算目标元素与某个节点所在超平面的欧氏距离,为了跟上面函数保持一致,要加上平方。 151 | def get_hyper_plane_dist(self, Xi, nd): 152 | j = nd.feature 153 | X0 = nd.split[0] 154 | return (Xi[j] - X0[j]) ** 2 155 | 156 | # 搜索KD-Tree中与目标元素距离最近的节点,使用广度优先搜索来实现。 157 | def nearest_neighbour_search(self, Xi): 158 | dist_best = float("inf") 159 | nd_best = self.search(Xi, self.root) 160 | que = [(self.root, nd_best)] 161 | while que: 162 | nd_root, nd_cur = que.pop(0) 163 | while 1: 164 | dist = self.get_eu_dist(Xi, nd_cur) 165 | if dist < dist_best: 166 | dist_best = dist 167 | nd_best = nd_cur 168 | if nd_cur is not nd_root: 169 | nd_bro = nd_cur.brother 170 | if nd_bro is not None: 171 | dist_hyper = self.get_hyper_plane_dist(Xi, nd_cur.father) 172 | if dist > dist_hyper: 173 | _nd_best = self.search(Xi, nd_bro) 174 | que.append((nd_bro, _nd_best)) 175 | nd_cur = nd_cur.father 176 | else: 177 | break 178 | return nd_best 179 | 180 | 181 | # 暴力搜索 182 | def exhausted_search(X, Xi): 183 | dist_best = float('inf') 184 | row_best = None 185 | for row in X: 186 | dist = 
get_euclidean_distance(Xi, row) 187 | if dist < dist_best: 188 | dist_best = dist 189 | row_best = row 190 | return row_best 191 | 192 | 193 | def main(): 194 | print("Testing KD Tree...") 195 | test_times = 100 196 | run_time_1 = run_time_2 = 0 197 | for _ in range(test_times): 198 | low = 0 199 | high = 100 200 | n_rows = 1000 201 | n_cols = 2 202 | X = gen_data(low, high, n_rows, n_cols) 203 | y = gen_data(low, high, n_rows) 204 | Xi = gen_data(low, high, n_cols) 205 | 206 | tree = KDTree() 207 | tree.build_tree(X, y) 208 | 209 | start = time() 210 | nd = tree.nearest_neighbour_search(Xi) 211 | run_time_1 += time() - start 212 | ret1 = get_euclidean_distance(Xi, nd.split[0]) 213 | 214 | start = time() 215 | row = exhausted_search(X, Xi) 216 | run_time_2 += time() - start 217 | ret2 = get_euclidean_distance(Xi, row) 218 | 219 | assert ret1 == ret2, "target:%s\nrestult1:%s\nrestult2:%s\ntree:\n%s" \ 220 | % (str(Xi), str(nd), str(row), str(tree)) 221 | print("%d tests passed!" % test_times) 222 | print("KD Tree Search %.2f s" % run_time_1) 223 | print("Exhausted search %.2f s" % run_time_2) 224 | 225 | 226 | main() -------------------------------------------------------------------------------- /kmeans.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from collections import Counter 3 | from copy import deepcopy 4 | from time import time 5 | from random import randint, seed, random 6 | 7 | # 统计程序运行时间函数 8 | # fn代表运行的函数 9 | def run_time(fn): 10 | def fun(): 11 | start = time() 12 | fn() 13 | ret = time() - start 14 | if ret < 1e-6: 15 | unit = "ns" 16 | ret *= 1e9 17 | elif ret < 1e-3: 18 | unit = "us" 19 | ret *= 1e6 20 | elif ret < 1: 21 | unit = "ms" 22 | ret *= 1e3 23 | else: 24 | unit = "s" 25 | print("Total run time is %.1f %s\n" % (ret, unit)) 26 | return fun() 27 | 28 | def load_data(): 29 | f = open("boston/breast_cancer.csv") 30 | X = [] 31 | y = [] 32 | for line in f: 33 | line = line[:-1].split(',') 34 | xi = [float(s) for s in line[:-1]] 35 | yi = line[-1] 36 | if '.' in yi: 37 | yi = float(yi) 38 | else: 39 | yi = int(yi) 40 | X.append(xi) 41 | y.append(yi) 42 | f.close() 43 | return X, y 44 | 45 | # 将数据归一化到[0, 1]范围 46 | def min_max_scale(X): 47 | m = len(X[0]) 48 | x_max = [-float('inf') for _ in range(m)] 49 | x_min = [float('inf') for _ in range(m)] 50 | for row in X: 51 | x_max = [max(a, b) for a, b in zip(x_max, row)] 52 | x_min = [min(a, b) for a, b in zip(x_min, row)] 53 | 54 | ret = [] 55 | for row in X: 56 | tmp = [(x - b) / (a - b) for a, b, x in zip(x_max, x_min, row)] 57 | ret.append(tmp) 58 | return ret 59 | 60 | def get_euclidean_distance(arr1, arr2): 61 | return sum((x1 - x2) ** 2 for x1, x2 in zip(arr1, arr2)) ** 0.5 62 | 63 | def get_cosine_distance(arr1, arr2): 64 | numerator = sum(x1 * x2 for x1, x2 in zip(arr1, arr2)) 65 | denominator = (sum(x1 ** 2 for x1 in arr1) * 66 | sum(x2 ** 2 for x2 in arr2)) ** 0.5 67 | return numerator / denominator 68 | 69 | 70 | class KMeans(object): 71 | # k 簇的个数 72 | # n_features 特征的个数 73 | # clister_centers 聚类中心 74 | # distance_fn 距离计算函数 75 | # cluster_samples_cnt 每个簇里面的样本数 76 | def __init__(self): 77 | self.k = None 78 | self.n_features = None 79 | self.cluster_centers = None 80 | self.distance_fn = None 81 | self.cluster_samples_cnt = None 82 | 83 | # 二分,查找有序列表里面大于目标值的第一个值 84 | def bin_search(self, target, nums): 85 | low = 0 86 | high = len(nums) - 1 87 | assert nums[low] <= target < nums[high], "Cannot find target!" 
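# (Editor's trace, hypothetical values.) bin_search treats nums as a
# cumulative distribution and returns the first index whose value is
# strictly greater than target; init_cluster_centers below uses it to turn
# a uniform random number into a distance-weighted pick, k-means++ style.
# For nums = [0.1, 0.4, 0.8, 1.0] and target = 0.5 the loop settles on
# mid = 2, since nums[1] <= 0.5 < nums[2].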
88 | while 1:
89 | mid = (low + high) // 2
90 | if mid == 0 or target >= nums[mid]:
91 | low = mid + 1
92 | elif target < nums[mid - 1]:
93 | high = mid - 1
94 | else:
95 | break
96 | return mid
97 | 
98 | # Check whether two vectors are the same vector
99 | def cmp_arr(self, arr1, arr2, eps=1e-8):
100 | return len(arr1) == len(arr2) and \
101 | all(abs(a - b) < eps for a, b in zip(arr1, arr2))
102 | 
103 | # Initialize the cluster centers
104 | def init_cluster_centers(self, X, k, n_features, distance_fn):
105 | n = len(X)
106 | centers = [X[randint(0, n-1)]]
107 | for _ in range(k-1):
108 | center_pre = centers[-1]
109 | idxs_dists = ([i, distance_fn(Xi, center_pre)] for i, Xi in enumerate(X))
110 | # Sort by distance
111 | idxs_dists = sorted(idxs_dists, key=lambda x: x[1])
112 | dists = [x[1] for x in idxs_dists]
113 | tot = sum(dists)
114 | for i in range(n):
115 | dists[i] /= tot
116 | for i in range(1, n):
117 | dists[i] += dists[i-1]
118 | # Randomly choose a cluster center, weighted by distance
119 | while 1:
120 | num = random()
121 | # Find the first cumulative distance >= num
122 | dist_idx = self.bin_search(num, dists)
123 | row_idx = idxs_dists[dist_idx][0]
124 | center_cur = X[row_idx]
125 | if not any(self.cmp_arr(center_cur, center) for center in centers):
126 | break
127 | centers.append(center_cur)
128 | return centers
129 | 
130 | 
131 | # Find the cluster center nearest to Xi
132 | def get_nearest_center(self, Xi, centers, distance_fn):
133 | return min(((i, distance_fn(Xi, center)) for
134 | i, center in enumerate(centers)), key=lambda x: x[1])[0]
135 | 
136 | # Find the nearest cluster center for each row of X
137 | def get_nearest_centers(self, X, distance_fn, centers):
138 | return [self.get_nearest_center(Xi, centers, distance_fn) for Xi in X]
139 | 
140 | # Get the indexes of the empty clusters
141 | def get_empty_cluster_idxs(self, cluster_samples_cnt, k):
142 | clusters = ((i, cluster_samples_cnt[i]) for i in range(k))
143 | empty_clusters = filter(lambda x: x[1] == 0, clusters)
144 | return [empty_cluster[0] for empty_cluster in empty_clusters]
145 | # Find the sample in X furthest from all non-empty cluster centers
146 | def get_furthest_row(self, X, distance_fn, centers, empty_cluster_idxs):
147 | def f(Xi, centers):
148 | return sum(distance_fn(Xi, center) for center in centers)
149 | 
150 | non_empty_centers = list(map(lambda x: x[1], filter(
151 | lambda x: x[0] not in empty_cluster_idxs, enumerate(centers))))
152 | return max(map(lambda x: [x, f(x, non_empty_centers)], X), key=lambda x: x[1])[0]
153 | 
154 | # Handle the empty clusters (expects X, distance_fn, n_features, centers, empty_cluster_idxs)
155 | def process_empty_clusters(self, X, distance_fn, n_features, centers, empty_cluster_idxs):
156 | for i in empty_cluster_idxs:
157 | center_cur = self.get_furthest_row(X, distance_fn, centers, empty_cluster_idxs)
158 | while any(self.cmp_arr(center_cur, center) for center in centers):
159 | center_cur = self.get_furthest_row(X, distance_fn, centers,
160 | empty_cluster_idxs)
161 | centers[i] = center_cur
162 | return centers
163 | 
164 | # Recompute the cluster centers
165 | def get_cluster_centers(self, X, k, n_features, y, cluster_samples_cnt):
166 | ret = [[0 for _ in range(n_features)] for _ in range(k)]
167 | for Xi, center_num in zip(X, y):
168 | for j in range(n_features):
169 | ret[center_num][j] += Xi[j] / cluster_samples_cnt[center_num]
170 | return ret
171 | 
172 | # Fit the model
173 | def fit(self, X, k, fn=None, n_iter=100):
174 | n_features = len(X[0])
175 | if fn is None:
176 | distance_fn = get_euclidean_distance
177 | else:
178 | error_msg = "Parameter distance_fn must be eu or cos!"
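# (Editor's note on the two supported metrics.) They differ in orientation:
#     eu:  sum((a_i - b_i) ** 2) ** 0.5    -- smaller means closer
#     cos: dot(a, b) / (|a| * |b|)         -- larger means closer
# get_cosine_distance actually returns a cosine *similarity*, while
# get_nearest_center always takes the minimum, so with fn="cos" one would
# typically want 1 - similarity (or a max) for the intended behaviour.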
179 | assert fn in ("eu", "cos"), error_msg 180 | if fn == "eu": 181 | distance_fn = get_euclidean_distance 182 | if fn == "cos": 183 | distance_fn = get_cosine_distance 184 | 185 | centers = self.init_cluster_centers(X, k, n_features, distance_fn) 186 | for i in range(n_iter): 187 | while 1: 188 | # 寻找X的最近聚类中心 189 | y = self.get_nearest_centers(X, distance_fn, centers) 190 | # 统计每个簇的样本个数 191 | cluster_samples_cnt = Counter(y) 192 | # 获取空的簇 193 | empty_cluster_idxs = self.get_empty_cluster_idxs(cluster_samples_cnt, k) 194 | # 如果有空的簇 195 | if empty_cluster_idxs: 196 | centers = self.process_empty_clusters(centers, empty_cluster_idxs, n_features) 197 | else: 198 | break 199 | centers_new = self.get_cluster_centers(X, k, n_features, y, cluster_samples_cnt) 200 | centers = deepcopy(centers_new) 201 | print("Iteration: %d" % i) 202 | self.k = k 203 | self.n_features = n_features 204 | self.distance_fn = distance_fn 205 | self.cluster_centers = centers 206 | self.cluster_samples_cnt = cluster_samples_cnt 207 | 208 | def _predict(self, Xi): 209 | return self.get_nearest_center(Xi, self.cluster_centers, self.distance_fn) 210 | 211 | def predict(self, X): 212 | return [self._predict(Xi) for Xi in X] 213 | 214 | 215 | @run_time 216 | def main(): 217 | print("Tesing the performance of Kmeans...") 218 | # Load data 219 | X, y = load_data() 220 | X = min_max_scale(X) 221 | # Train model 222 | est = KMeans() 223 | k = 2 224 | est.fit(X, k) 225 | print() 226 | # Model performance 227 | prob_pos = sum(y) / len(y) 228 | print("Positive probability of X is:%.1f%%.\n" % (prob_pos * 100)) 229 | y_hat = est.predict(X) 230 | cluster_pos_tot_cnt = {i: [0, 0] for i in range(k)} 231 | for yi_hat, yi in zip(y_hat, y): 232 | cluster_pos_tot_cnt[yi_hat][0] += yi 233 | cluster_pos_tot_cnt[yi_hat][1] += 1 234 | cluster_prob_pos = {k: v[0] / v[1] for k, v in cluster_pos_tot_cnt.items()} 235 | for i in range(k): 236 | tot_cnt = cluster_pos_tot_cnt[i][1] 237 | prob_pos = cluster_prob_pos[i] 238 | print("Count of elements in cluster %d is:%d." % 239 | (i, tot_cnt)) 240 | print("Positive probability of cluster %d is:%.1f%%.\n" % (i, prob_pos * 100)) -------------------------------------------------------------------------------- /knn.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from kdtree import KDTree 3 | from max_heap import MaxHeap 4 | 5 | from copy import copy 6 | from random import randint, seed, random 7 | from time import time 8 | from random import choice 9 | from math import exp, log 10 | 11 | # 统计程序运行时间函数 12 | # fn代表运行的函数 13 | def run_time(fn): 14 | def fun(): 15 | start = time() 16 | fn() 17 | ret = time() - start 18 | if ret < 1e-6: 19 | unit = "ns" 20 | ret *= 1e9 21 | elif ret < 1e-3: 22 | unit = "us" 23 | ret *= 1e6 24 | elif ret < 1: 25 | unit = "ms" 26 | ret *= 1e3 27 | else: 28 | unit = "s" 29 | print("Total run time is %.1f %s\n" % (ret, unit)) 30 | return fun() 31 | 32 | def load_cancer(): 33 | f = open("boston/breast_cancer.csv") 34 | X = [] 35 | y = [] 36 | for line in f: 37 | line = line[:-1].split(',') 38 | xi = [float(s) for s in line[:-1]] 39 | yi = line[-1] 40 | if '.' in yi: 41 | yi = float(yi) 42 | else: 43 | yi = int(yi) 44 | X.append(xi) 45 | y.append(yi) 46 | f.close() 47 | return X, y 48 | 49 | def load_house_data(): 50 | f = open("boston/housing.csv") 51 | X = [] 52 | y = [] 53 | for line in f: 54 | line = line[:-1].split(',') 55 | xi = [float(s) for s in line[:-1]] 56 | yi = line[-1] 57 | if '.' 
in yi: 58 | yi = float(yi) 59 | else: 60 | yi = int(yi) 61 | X.append(xi) 62 | y.append(yi) 63 | f.close() 64 | return X, y 65 | 66 | # 划分训练集和测试集 67 | def train_test_split(X, y, prob=0.7, random_state=None): 68 | if random_state is not None: 69 | seed(random_state) 70 | X_train = [] 71 | X_test = [] 72 | y_train = [] 73 | y_test = [] 74 | for i in range(len(X)): 75 | if random() < prob: 76 | X_train.append(X[i]) 77 | y_train.append(y[i]) 78 | else: 79 | X_test.append(X[i]) 80 | y_test.append(y[i]) 81 | seed() 82 | return X_train, X_test, y_train, y_test 83 | 84 | # 准确率 85 | def get_acc(y, y_hat): 86 | return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y) 87 | 88 | # 查准率 89 | def get_precision(y, y_hat): 90 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 91 | predicted_positive = sum(y_hat) 92 | return true_postive / predicted_positive 93 | 94 | # 查全率 95 | def get_recall(y, y_hat): 96 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 97 | actual_positive = sum(y) 98 | return true_postive / actual_positive 99 | 100 | # 计算真正率 101 | def get_tpr(y, y_hat): 102 | true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 103 | actual_positive = sum(y) 104 | return true_positive / actual_positive 105 | 106 | # 计算真负率 107 | def get_tnr(y, y_hat): 108 | true_negative = sum(1 - (yi or yi_hat) for yi, yi_hat in zip(y, y_hat)) 109 | actual_negative = len(y) - sum(y) 110 | return true_negative / actual_negative 111 | 112 | # 画ROC曲线 113 | def get_roc(y, y_hat_prob): 114 | thresholds = sorted(set(y_hat_prob), reverse=True) 115 | ret = [[0, 0]] 116 | for threshold in thresholds: 117 | y_hat = [int(yi_hat_prob >= threshold) for yi_hat_prob in y_hat_prob] 118 | ret.append([get_tpr(y, y_hat), 1 - get_tnr(y, y_hat)]) 119 | return ret 120 | # 计算AUC(ROC曲线下方的面积) 121 | def get_auc(y, y_hat_prob): 122 | roc = iter(get_roc(y, y_hat_prob)) 123 | tpr_pre, fpr_pre = next(roc) 124 | auc = 0 125 | for tpr, fpr in roc: 126 | auc += (tpr + tpr_pre) * (fpr - fpr_pre) / 2 127 | tpr_pre = tpr 128 | fpr_pre = fpr 129 | return auc 130 | 131 | def model_evaluation(clf, X, y): 132 | y_hat = clf.predict(X) 133 | y_hat_prob = [clf._predict(Xi) for Xi in X] 134 | ret = dict() 135 | ret["Accuracy"] = get_acc(y, y_hat) 136 | ret["Recall"] = get_recall(y, y_hat) 137 | ret['Precision'] = get_precision(y, y_hat) 138 | ret['AUC'] = get_auc(y, y_hat_prob) 139 | for k, v in ret.items(): 140 | print("%s: %.3f" % (k, v)) 141 | print() 142 | return ret 143 | 144 | # 计算回归模型的拟合优度 145 | def get_r2(reg, X, y): 146 | y_hat = reg.predict(X) 147 | m = len(y) 148 | n = len(y_hat) 149 | sse = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y, y_hat)) 150 | y_avg = sum(y) / len(y) 151 | sst = sum((yi - y_avg) ** 2 for yi in y) 152 | r2 = 1 - sse / sst 153 | print("Test r2 is %.3f!" 
% r2) 154 | return r2 155 | 156 | # 将数据归一化到[0, 1]范围 157 | def min_max_scale(X): 158 | m = len(X[0]) 159 | x_max = [-float('inf') for _ in range(m)] 160 | x_min = [float('inf') for _ in range(m)] 161 | for row in X: 162 | x_max = [max(a, b) for a, b in zip(x_max, row)] 163 | x_min = [min(a, b) for a, b in zip(x_min, row)] 164 | 165 | ret = [] 166 | for row in X: 167 | tmp = [(x - b) / (a - b) for a, b, x in zip(x_max, x_min, row)] 168 | ret.append(tmp) 169 | return ret 170 | 171 | class KNeighborsBase(object): 172 | 173 | def __init__(self): 174 | self.k_neighbors = None 175 | self.tree = None 176 | 177 | def fit(self, X, y, k_neighbors=3): 178 | self.k_neighbors = k_neighbors 179 | self.tree = KDTree() 180 | self.tree.build_tree(X, y) 181 | 182 | # 1.获取kd_Tree 183 | # 2.建立大顶堆 184 | # 3.建立队列 185 | # 4.外层循环更新大顶堆 186 | # 5.内层循环遍历kd_Tree 187 | # 6.满足堆顶是第k近邻时退出循环 188 | 189 | def knn_search(self, Xi): 190 | tree = self.tree 191 | heap = MaxHeap(self.k_neighbors, lambda x: x.dist) 192 | # 搜索Xi时,从根节点到叶节点的路径 193 | nd = tree.search(Xi, tree.root) 194 | # 初始化队列 195 | que = [(tree.root, nd)] 196 | while que: 197 | # 计算Xi和根节点的距离 198 | nd_root, nd_cur = que.pop(0) 199 | nd_root.dist = tree.get_eu_dist(Xi, nd_root) 200 | heap.add(nd_root) 201 | while nd_cur is not nd_root: 202 | # 计算Xi和当前节点的距离 203 | nd_cur.dist = tree.get_eu_dist(Xi, nd_cur) 204 | # 更新最好的节点和距离 205 | heap.add(nd_cur) 206 | if nd_cur.brother and (not heap or heap.items[0].dist > tree.get_hyper_plane_dist(Xi, nd_cur.father)): 207 | _nd = tree.search(Xi, nd_cur.brother) 208 | que.append((nd_cur.brother, _nd)) 209 | nd_cur = nd_cur.father 210 | 211 | return heap 212 | 213 | def _predict(self, Xi): 214 | return NotImplemented 215 | 216 | def predict(self, X): 217 | return [self._predict(Xi) for Xi in X] 218 | 219 | 220 | class KNeighborsClassifier(KNeighborsBase): 221 | def __init__(self): 222 | KNeighborsBase.__init__(self) 223 | 224 | def _predict(self, Xi): 225 | heap = self.knn_search(Xi) 226 | n_pos = sum(nd.split[1] for nd in heap._items) 227 | return int(n_pos * 2 > self.k_neighbors) 228 | 229 | class KNeighborsRegressor(KNeighborsBase): 230 | def __init__(self): 231 | KNeighborsBase.__init__(self) 232 | 233 | def _predict(self, Xi): 234 | heap = self.knn_search(Xi) 235 | return sum(nd.split[1] for nd in heap._items) / self.k_neighbors 236 | 237 | 238 | @run_time 239 | def main1(): 240 | print("Tesing the performance of KNN classifier...") 241 | X, y = load_cancer() 242 | X = min_max_scale(X) 243 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20) 244 | clf = KNeighborsClassifier() 245 | clf.fit(X_train, y_train, k_neighbors=21) 246 | model_evaluation(clf, X_test, y_test) 247 | 248 | @run_time 249 | def main2(): 250 | print("Tesing the performance of KNN regressor...") 251 | X, y = load_house_data() 252 | X = min_max_scale(X) 253 | X_train, X_test, y_train, y_test = train_test_split( 254 | X, y, random_state=10) 255 | reg = KNeighborsRegressor() 256 | reg.fit(X=X_train, y=y_train, k_neighbors=3) 257 | get_r2(reg, X_test, y_test) -------------------------------------------------------------------------------- /lda-multi-classify.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | # 这是sklearn中实现的LDA,待会我们会比较自己实现的LDA和它的区别 6 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 7 | 8 | # k为目标 9 | def LDA(X, y, k): 10 | label_ = list(set(y)) 11 | 
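# (Editor's sketch of the math implemented below.) Multi-class LDA looks
# for a projection W that maximizes between-class scatter relative to
# within-class scatter:
#     S_w = sum_c sum_{x in class c} (x - mu_c)(x - mu_c)^T
#     S_b = sum_c n_c (mu_c - mu)(mu_c - mu)^T
# The top-k eigenvectors of S_w^{-1} S_b (ranked by eigenvalue) form the
# columns of W, so np.dot(X, W) is the k-dimensional projection.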
X_classify = {} 12 | for label in label_: 13 | X1 = np.array([X[i] for i in range(len(X)) if y[i] == label]) 14 | X_classify[label] = X1 15 | 16 | miu = np.mean(X, axis=0) 17 | miu_classify = {} 18 | for label in label_: 19 | miu1 = np.mean(X_classify[label], axis=0) 20 | miu_classify[label] = miu1 21 | 22 | # St = np.dot((X - mju).T, X - mju) 23 | # 计算类内散度矩阵Sw 24 | Sw = np.zeros((len(miu), len(miu))) 25 | for i in label_: 26 | Sw += np.dot((X_classify[i] - miu_classify[i]).T, X_classify[i] - miu_classify[i]) 27 | 28 | #Sb = St-Sw 29 | # 计算类内散度矩阵Sb 30 | Sb = np.zeros((len(miu), len(miu))) 31 | for i in label_: 32 | Sb += len(X_classify[i]) * np.dot((miu_classify[i] - miu).reshape( 33 | (len(miu), 1)), (miu_classify[i] - miu).reshape((1, len(miu)))) 34 | 35 | # 计算S_w^{-1}S_b的特征值和特征矩阵 36 | eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(Sw).dot(Sb)) 37 | sorted_indices = np.argsort(eig_vals) 38 | # 提取前k个特征向量 39 | topk_eig_vecs = eig_vecs[:, sorted_indices[:-k - 1:-1]] 40 | return topk_eig_vecs 41 | 42 | def main(): 43 | iris = load_iris() 44 | X = iris.data 45 | y = iris.target 46 | 47 | W = LDA(X, y, 2) 48 | X_new = np.dot(X, W) 49 | plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y) 50 | plt.show() 51 | 52 | # 和sklearn的函数对比 53 | lda = LinearDiscriminantAnalysis(n_components=2) 54 | lda.fit(X, y) 55 | X_new = lda.transform(X) 56 | plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y) 57 | plt.show() 58 | 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /lda-two-classify.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets.samples_generator import make_classification 5 | 6 | def LDA(X, y): 7 | X1 = np.array([X[i] for i in range(len(X)) if y[i] == 0]) 8 | X2 = np.array([X[i] for i in range(len(X)) if y[i] == 1]) 9 | 10 | len1 = len(X1) 11 | len2 = len(X2) 12 | 13 | # 求均值向量u1,u2 14 | miu1 = np.mean(X1, axis=0) 15 | miu2 = np.mean(X2, axis=0) 16 | 17 | # 求S_w 18 | # \sum_0 19 | conv1 = np.dot((X1 - miu1).T, (X1 - miu1)) 20 | # \sum_1 21 | conv2 = np.dot((X2 - miu2).T, (X2 - miu2)) 22 | Sw = conv1 + conv2 23 | 24 | # 计算w 25 | w = np.dot(np.mat(Sw).I, (miu1 - miu2).reshape((len(miu1), 1))) 26 | X1_new = np.dot(X1, w) 27 | X2_new = np.dot(X2, w) 28 | y1_new = [0 for i in range(len1)] 29 | y2_new = [1 for i in range(len2)] 30 | return X1_new, X2_new, y1_new, y2_new 31 | 32 | def main(): 33 | X, y = make_classification(n_samples=500, n_features=2, n_redundant=0, n_classes=2, 34 | n_informative=1, n_clusters_per_class=1, class_sep=0.5, random_state=10) 35 | 36 | X1_new, X2_new, y1_new, y2_new = LDA(X, y) 37 | 38 | # 可视化原始数据 39 | plt.scatter(X[:, 0], X[:, 1], marker='o', c=y) 40 | plt.show() 41 | # 可视化LDA降维后的数据 42 | plt.plot(X1_new, y1_new, "bo") 43 | plt.plot(X2_new, y2_new, "ro") 44 | plt.show() 45 | 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /linear_regression.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | from numpy.random import choice, seed 4 | from random import sample, normalvariate 5 | from numpy import ndarray 6 | from time import time 7 | from random import randint, seed, random 8 | 9 | # 统计程序运行时间函数 10 | # fn代表运行的函数 11 | def run_time(fn): 12 | def fun(): 13 | start = time() 14 | fn() 15 | ret = time() - start 16 | if ret < 1e-6: 17 | unit = "ns" 18 | ret *= 
1e9 19 | elif ret < 1e-3: 20 | unit = "us" 21 | ret *= 1e6 22 | elif ret < 1: 23 | unit = "ms" 24 | ret *= 1e3 25 | else: 26 | unit = "s" 27 | print("Total run time is %.1f %s\n" % (ret, unit)) 28 | return fun() 29 | 30 | def load_data(): 31 | f = open("boston/housing.csv") 32 | X = [] 33 | y = [] 34 | for line in f: 35 | line = line[:-1].split(',') 36 | xi = [float(s) for s in line[:-1]] 37 | yi = line[-1] 38 | if '.' in yi: 39 | yi = float(yi) 40 | else: 41 | yi = int(yi) 42 | X.append(xi) 43 | y.append(yi) 44 | f.close() 45 | return X, y 46 | 47 | # 划分训练集和测试集 48 | def train_test_split(X, y, prob=0.7, random_state=None): 49 | if random_state is not None: 50 | seed(random_state) 51 | X_train = [] 52 | X_test = [] 53 | y_train = [] 54 | y_test = [] 55 | for i in range(len(X)): 56 | if random() < prob: 57 | X_train.append(X[i]) 58 | y_train.append(y[i]) 59 | else: 60 | X_test.append(X[i]) 61 | y_test.append(y[i]) 62 | seed() 63 | return X_train, X_test, y_train, y_test 64 | 65 | # 计算回归模型的拟合优度 66 | def get_r2(reg, X, y): 67 | y_hat = reg.predict(X) 68 | m = len(y) 69 | n = len(y_hat) 70 | sse = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y, y_hat)) 71 | y_avg = sum(y) / len(y) 72 | sst = sum((yi - y_avg) ** 2 for yi in y) 73 | r2 = 1 - sse / sst 74 | print("Test r2 is %.3f!" % r2) 75 | return r2 76 | 77 | # 将数据归一化到[0, 1]范围 78 | def min_max_scale(X): 79 | m = len(X[0]) 80 | x_max = [-float('inf') for _ in range(m)] 81 | x_min = [float('inf') for _ in range(m)] 82 | for row in X: 83 | x_max = [max(a, b) for a, b in zip(x_max, row)] 84 | x_min = [min(a, b) for a, b in zip(x_min, row)] 85 | 86 | ret = [] 87 | for row in X: 88 | tmp = [(x - b) / (a - b) for a, b, x in zip(x_max, x_min, row)] 89 | ret.append(tmp) 90 | return ret 91 | 92 | class RegressionBase(object): 93 | def __init__(self): 94 | self.bias = None 95 | self.weights = None 96 | 97 | def _predict(self, Xi): 98 | return NotImplemented 99 | 100 | def get_gradient_delta(self, Xi, yi): 101 | y_hat = self._predict(Xi) 102 | bias_grad_delta = yi - y_hat 103 | weights_grad_delta = [bias_grad_delta * Xij for Xij in Xi] 104 | return bias_grad_delta, weights_grad_delta 105 | 106 | # 全梯度下降 107 | def batch_gradient_descent(self, X, y, lr, epochs): 108 | #b = b - learning_rate * 1 / m * b_grad_i, b_grad_i < - grad 109 | #W = W - learning_rate * 1 / m * w_grad_i, w_grad_i < - grad 110 | m, n = len(X), len(X[0]) 111 | self.bias = 0 112 | # 正太分布 113 | self.weights = [normalvariate(0, 0.01) for _ in range(n)] 114 | for _ in range(epochs): 115 | bias_grad = 0 116 | weights_grad = [0 for _ in range(n)] 117 | for i in range(m): 118 | bias_grad_delta, weights_grad_delta = self.get_gradient_delta(X[i], y[i]) 119 | bias_grad += bias_grad_delta 120 | weights_grad = [w_grad + w_grad_d for w_grad, w_grad_d 121 | in zip(weights_grad, weights_grad_delta)] 122 | self.bias = self.bias + lr * bias_grad * 2 / m 123 | self.weights = [w + lr * w_grad * 2 / m for w, 124 | w_grad in zip(self.weights, weights_grad)] 125 | 126 | # 随机梯度下降 127 | def stochastic_gradient_descent(self, X, y, lr, epochs, sample_rate): 128 | m, n = len(X), len(X[0]) 129 | k = int(m * sample_rate) 130 | self.bias = 0 131 | self.weights = [normalvariate(0, 0.01) for _ in range(n)] 132 | for _ in range(epochs): 133 | for i in sample(range(m), k): 134 | bias_grad, weights_grad = self.get_gradient_delta(X[i], y[i]) 135 | self.bias += lr * bias_grad 136 | self.weights = [w + lr * w_grad for w, 137 | w_grad in zip(self.weights, weights_grad)] 138 | 139 | def fit(self, X, y, lr, epochs, 
method="batch", sample_rate=1.0): 140 | assert method in ("batch", "stochastic") 141 | if method == "batch": 142 | self.batch_gradient_descent(X, y, lr, epochs) 143 | if method == "stochastic": 144 | self.stochastic_gradient_descent(X, y, lr, epochs, sample_rate) 145 | 146 | def predict(self, X): 147 | return NotImplemented 148 | 149 | 150 | class LinearRegreession(RegressionBase): 151 | def __init__(self): 152 | RegressionBase.__init__(self) 153 | 154 | def _predict(self, Xi): 155 | return sum(wi * xij for wi, xij in zip(self.weights, Xi)) + self.bias 156 | 157 | def predict(self, X): 158 | return [self._predict(xi) for xi in X] 159 | 160 | 161 | 162 | def main(): 163 | X, y = load_data() 164 | X = min_max_scale(X) 165 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10) 166 | @run_time 167 | def batch(): 168 | print("Tesing the performance of LinearRegression(batch)...") 169 | reg = LinearRegreession() 170 | reg.fit(X=X_train, y=y_train, lr=0.02, epochs=5000) 171 | get_r2(reg, X_test, y_test) 172 | 173 | @run_time 174 | def stochastic(): 175 | print("Tesing the performance of LinearRegression(stochastic)...") 176 | reg = LinearRegreession() 177 | reg.fit(X=X_train, y=y_train, lr=0.001, epochs=1000, 178 | method="stochastic", sample_rate=0.5) 179 | get_r2(reg, X_test, y_test) 180 | 181 | 182 | main() 183 | -------------------------------------------------------------------------------- /logistic_regression.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | from numpy.random import choice, seed 4 | from random import sample, normalvariate 5 | from numpy import ndarray 6 | from time import time 7 | from random import randint, seed, random 8 | from math import exp 9 | 10 | # 统计程序运行时间函数 11 | # fn代表运行的函数 12 | def run_time(fn): 13 | def fun(): 14 | start = time() 15 | fn() 16 | ret = time() - start 17 | if ret < 1e-6: 18 | unit = "ns" 19 | ret *= 1e9 20 | elif ret < 1e-3: 21 | unit = "us" 22 | ret *= 1e6 23 | elif ret < 1: 24 | unit = "ms" 25 | ret *= 1e3 26 | else: 27 | unit = "s" 28 | print("Total run time is %.1f %s\n" % (ret, unit)) 29 | return fun() 30 | 31 | def load_data(): 32 | f = open("boston/breast_cancer.csv") 33 | X = [] 34 | y = [] 35 | for line in f: 36 | line = line[:-1].split(',') 37 | xi = [float(s) for s in line[:-1]] 38 | yi = line[-1] 39 | if '.' 
in yi: 40 | yi = float(yi) 41 | else: 42 | yi = int(yi) 43 | X.append(xi) 44 | y.append(yi) 45 | f.close() 46 | return X, y 47 | 48 | # 划分训练集和测试集 49 | def train_test_split(X, y, prob=0.7, random_state=None): 50 | if random_state is not None: 51 | seed(random_state) 52 | X_train = [] 53 | X_test = [] 54 | y_train = [] 55 | y_test = [] 56 | for i in range(len(X)): 57 | if random() < prob: 58 | X_train.append(X[i]) 59 | y_train.append(y[i]) 60 | else: 61 | X_test.append(X[i]) 62 | y_test.append(y[i]) 63 | seed() 64 | return X_train, X_test, y_train, y_test 65 | 66 | 67 | # 将数据归一化到[0, 1]范围 68 | def min_max_scale(X): 69 | m = len(X[0]) 70 | x_max = [-float('inf') for _ in range(m)] 71 | x_min = [float('inf') for _ in range(m)] 72 | for row in X: 73 | x_max = [max(a, b) for a, b in zip(x_max, row)] 74 | x_min = [min(a, b) for a, b in zip(x_min, row)] 75 | 76 | ret = [] 77 | for row in X: 78 | tmp = [(x - b) / (a - b) for a, b, x in zip(x_max, x_min, row)] 79 | ret.append(tmp) 80 | return ret 81 | 82 | # 准确率 83 | def get_acc(y, y_hat): 84 | return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y) 85 | 86 | # 查准率 87 | def get_precision(y, y_hat): 88 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 89 | predicted_positive = sum(y_hat) 90 | return true_postive / predicted_positive 91 | 92 | # 查全率 93 | def get_recall(y, y_hat): 94 | true_postive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 95 | actual_positive = sum(y) 96 | return true_postive / actual_positive 97 | 98 | # 计算真正率 99 | def get_tpr(y, y_hat): 100 | true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat)) 101 | actual_positive = sum(y) 102 | return true_positive / actual_positive 103 | 104 | # 计算真负率 105 | def get_tnr(y, y_hat): 106 | true_negative = sum(1 - (yi or yi_hat) for yi, yi_hat in zip(y, y_hat)) 107 | actual_negative = len(y) - sum(y) 108 | return true_negative / actual_negative 109 | 110 | # 画ROC曲线 111 | def get_roc(y, y_hat_prob): 112 | thresholds = sorted(set(y_hat_prob), reverse=True) 113 | ret = [[0, 0]] 114 | for threshold in thresholds: 115 | y_hat = [int(yi_hat_prob >= threshold) for yi_hat_prob in y_hat_prob] 116 | ret.append([get_tpr(y, y_hat), 1 - get_tnr(y, y_hat)]) 117 | return ret 118 | # 计算AUC(ROC曲线下方的面积) 119 | def get_auc(y, y_hat_prob): 120 | roc = iter(get_roc(y, y_hat_prob)) 121 | tpr_pre, fpr_pre = next(roc) 122 | auc = 0 123 | for tpr, fpr in roc: 124 | auc += (tpr + tpr_pre) * (fpr - fpr_pre) / 2 125 | tpr_pre = tpr 126 | fpr_pre = fpr 127 | return auc 128 | 129 | def model_evaluation(clf, X, y): 130 | y_hat = clf.predict(X) 131 | y_hat_prob = [clf._predict(Xi) for Xi in X] 132 | ret = dict() 133 | ret["Accuracy"] = get_acc(y, y_hat) 134 | ret["Recall"] = get_recall(y, y_hat) 135 | ret['Precision'] = get_precision(y, y_hat) 136 | ret['AUC'] = get_auc(y, y_hat_prob) 137 | for k, v in ret.items(): 138 | print("%s: %.3f" % (k, v)) 139 | print() 140 | return ret 141 | 142 | def sigmoid(x, x_min=-100): 143 | return 1 / (1 + exp(-x)) if x > x_min else 0 144 | 145 | class RegressionBase(object): 146 | def __init__(self): 147 | self.bias = None 148 | self.weights = None 149 | 150 | def _predict(self, Xi): 151 | return NotImplemented 152 | 153 | def get_gradient_delta(self, Xi, yi): 154 | y_hat = self._predict(Xi) 155 | bias_grad_delta = yi - y_hat 156 | weights_grad_delta = [bias_grad_delta * Xij for Xij in Xi] 157 | return bias_grad_delta, weights_grad_delta 158 | 159 | # 全梯度下降 160 | def batch_gradient_descent(self, X, y, lr, epochs): 161 | #b = b - learning_rate * 1 / 
129 | def model_evaluation(clf, X, y):
130 |     y_hat = clf.predict(X)
131 |     y_hat_prob = [clf._predict(Xi) for Xi in X]
132 |     ret = dict()
133 |     ret["Accuracy"] = get_acc(y, y_hat)
134 |     ret["Recall"] = get_recall(y, y_hat)
135 |     ret['Precision'] = get_precision(y, y_hat)
136 |     ret['AUC'] = get_auc(y, y_hat_prob)
137 |     for k, v in ret.items():
138 |         print("%s: %.3f" % (k, v))
139 |     print()
140 |     return ret
141 | 
142 | def sigmoid(x, x_min=-100):  # clamp very negative inputs to avoid overflow in exp
143 |     return 1 / (1 + exp(-x)) if x > x_min else 0
144 | 
145 | class RegressionBase(object):
146 |     def __init__(self):
147 |         self.bias = None
148 |         self.weights = None
149 | 
150 |     def _predict(self, Xi):
151 |         return NotImplemented
152 | 
153 |     def get_gradient_delta(self, Xi, yi):  # per-sample gradient: (yi - y_hat) and (yi - y_hat) * Xi
154 |         y_hat = self._predict(Xi)
155 |         bias_grad_delta = yi - y_hat
156 |         weights_grad_delta = [bias_grad_delta * Xij for Xij in Xi]
157 |         return bias_grad_delta, weights_grad_delta
158 | 
159 |     # Batch gradient descent
160 |     def batch_gradient_descent(self, X, y, lr, epochs):
161 |         # b <- b + lr * (2/m) * sum_i b_grad_i
162 |         # W <- W + lr * (2/m) * sum_i w_grad_i
163 |         m, n = len(X), len(X[0])
164 |         self.bias = 0
165 |         # initialize the weights from a normal distribution N(0, 0.01)
166 |         self.weights = [normalvariate(0, 0.01) for _ in range(n)]
167 |         for _ in range(epochs):
168 |             bias_grad = 0
169 |             weights_grad = [0 for _ in range(n)]
170 |             for i in range(m):
171 |                 bias_grad_delta, weights_grad_delta = self.get_gradient_delta(X[i], y[i])
172 |                 bias_grad += bias_grad_delta
173 |                 weights_grad = [w_grad + w_grad_d for w_grad, w_grad_d
174 |                                 in zip(weights_grad, weights_grad_delta)]
175 |             self.bias = self.bias + lr * bias_grad * 2 / m
176 |             self.weights = [w + lr * w_grad * 2 / m for w,
177 |                             w_grad in zip(self.weights, weights_grad)]
178 | 
179 |     # Stochastic gradient descent over a random subsample each epoch
180 |     def stochastic_gradient_descent(self, X, y, lr, epochs, sample_rate):
181 |         m, n = len(X), len(X[0])
182 |         k = int(m * sample_rate)
183 |         self.bias = 0
184 |         self.weights = [normalvariate(0, 0.01) for _ in range(n)]
185 |         for _ in range(epochs):
186 |             for i in sample(range(m), k):
187 |                 bias_grad, weights_grad = self.get_gradient_delta(X[i], y[i])
188 |                 self.bias += lr * bias_grad
189 |                 self.weights = [w + lr * w_grad for w,
190 |                                 w_grad in zip(self.weights, weights_grad)]
191 | 
192 |     def fit(self, X, y, lr, epochs, method="batch", sample_rate=1.0):
193 |         assert method in ("batch", "stochastic")
194 |         if method == "batch":
195 |             self.batch_gradient_descent(X, y, lr, epochs)
196 |         if method == "stochastic":
197 |             self.stochastic_gradient_descent(X, y, lr, epochs, sample_rate)
198 | 
199 |     def predict(self, X):
200 |         return NotImplemented
201 | 
202 | class LogisticRegression(RegressionBase):
203 |     def __init__(self):
204 |         RegressionBase.__init__(self)
205 | 
206 |     def _predict(self, Xi):
207 |         z = sum(wi * xij for wi, xij in zip(self.weights, Xi)) + self.bias
208 |         return sigmoid(z)
209 | 
210 |     def predict(self, X, threshold=0.5):
211 |         return [int(self._predict(Xi) >= threshold) for Xi in X]
212 | 
213 | 
214 | def main():
215 |     # Load data
216 |     X, y = load_data()
217 |     X = min_max_scale(X)
218 |     # Split data randomly, 70% train
219 |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
220 | 
221 |     @run_time
222 |     def batch():
223 |         print("Testing the performance of LogisticRegression(batch)...")
224 |         # Train model
225 |         clf = LogisticRegression()
226 |         clf.fit(X=X_train, y=y_train, lr=0.05, epochs=200)
227 |         # Model evaluation
228 |         model_evaluation(clf, X_test, y_test)
229 | 
230 |     @run_time
231 |     def stochastic():
232 |         print("Testing the performance of LogisticRegression(stochastic)...")
233 |         # Train model
234 |         clf = LogisticRegression()
235 |         clf.fit(X=X_train, y=y_train, lr=0.01, epochs=200,
236 |                 method="stochastic", sample_rate=0.5)
237 |         # Model evaluation
238 |         model_evaluation(clf, X_test, y_test)
239 | 
240 | main()
241 | 
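A minimal usage sketch for the LogisticRegression class above. The toy AND-gate data, learning rate and epoch count below are made up for illustration, not taken from the repository:

    X_toy = [[0, 0], [0, 1], [1, 0], [1, 1]]
    y_toy = [0, 0, 0, 1]  # AND gate: linearly separable
    clf = LogisticRegression()
    clf.fit(X=X_toy, y=y_toy, lr=0.5, epochs=2000)
    print(clf.predict(X_toy))  # should converge to [0, 0, 0, 1]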
-------------------------------------------------------------------------------- /max_heap.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from time import time
3 | from copy import copy
4 | from random import randint
5 | 
6 | # Generate a random array/matrix with values drawn from [low, high] (randint is inclusive)
7 | def gen_data(low, high, n_rows, n_cols=None):
8 |     if n_cols is None:
9 |         ret = [randint(low, high) for _ in range(n_rows)]
10 |     else:
11 |         ret = [[randint(low, high) for _ in range(n_cols)]
12 |                for _ in range(n_rows)]
13 |     return ret
14 | 
15 | class MaxHeap(object):
16 |     # A max heap; fn extracts the comparison key from each item
17 |     def __init__(self, max_size, fn):
18 |         self.max_size = max_size
19 |         self.fn = fn
20 |         self._items = [None] * max_size
21 |         self.size = 0
22 |     # Human-readable summary of the heap's contents
23 |     def __str__(self):
24 |         item_values = str([self.fn(self.items[i]) for i in range(self.size)])
25 |         return ("Size: %d\nMax size: %d\nItem_values: %s\n" % (self.size, self.max_size, item_values))
26 |     # All items currently stored in the heap
27 |     @property
28 |     def items(self):
29 |         return self._items[:self.size]
30 |     # Whether the heap has reached max_size
31 |     @property
32 |     def full(self):
33 |         return self.size == self.max_size
34 |     # Key of the item at position idx; -inf if the slot is empty
35 |     def value(self, idx):
36 |         item = self._items[idx]
37 |         if item is None:
38 |             ret = -float('inf')
39 |         else:
40 |             ret = self.fn(item)
41 |         return ret
42 |     # Add an item; when the heap is full, replace the root if the new key is smaller
43 |     def add(self, item):
44 |         if self.full:
45 |             if self.fn(item) < self.value(0):
46 |                 self._items[0] = item
47 |                 self.shift_down(0)
48 |         else:
49 |             self._items[self.size] = item
50 |             self.size += 1
51 |             self.shift_up(self.size - 1)
52 |     # Pop the top (largest-key) element
53 |     def pop(self):
54 |         assert self.size > 0, "Cannot pop item! The MaxHeap is empty!"
55 |         ret = self.items[0]
56 |         self._items[0] = self._items[self.size - 1]
57 |         self._items[self.size - 1] = None
58 |         self.size -= 1
59 |         self.shift_down(0)
60 |         return ret
61 |     # Sift the element at idx up until the heap property is restored
62 |     def shift_up(self, idx):
63 |         assert idx < self.size, "The parameter idx must be less than heap's size!"
64 |         parent = (idx - 1) // 2
65 |         while parent >= 0 and self.value(parent) < self.value(idx):
66 |             self._items[parent], self._items[idx] = self._items[idx], self._items[parent]
67 |             idx = parent
68 |             parent = (idx - 1) // 2
69 | 
70 |     # Sift the element at idx down until the heap property is restored
71 |     def shift_down(self, idx):
72 |         child = (idx + 1) * 2 - 1
73 |         while child < self.size:
74 |             if child + 1 < self.size and self.value(child + 1) > self.value(child):
75 |                 child += 1
76 |             if self.value(idx) < self.value(child):
77 |                 self._items[idx], self._items[child] = self._items[child], self._items[idx]
78 |                 idx = child
79 |                 child = (idx + 1) * 2 - 1
80 |             else:
81 |                 break
82 |     # Check that the heap property holds everywhere
83 |     def is_valid(self):
84 |         ret = []
85 |         for i in range(1, self.size):
86 |             parent = (i - 1) // 2
87 |             ret.append(self.value(parent) >= self.value(i))
88 |         # all() returns True iff every element of the iterable is truthy
89 |         return all(ret)
90 | 
91 | # Brute-force search for the k smallest elements of nums
92 | def exhausted_search(nums, k):
93 |     rets = []
94 |     idxs = []
95 |     key = None
96 |     for _ in range(k):
97 |         val = float("inf")
98 |         for i, num in enumerate(nums):
99 |             if num < val and i not in idxs:
100 |                 key = i
101 |                 val = num
102 |         idxs.append(key)
103 |         rets.append(val)
104 |     return rets
105 | 
106 | # The main function does the following:
107 | # 1. randomly generate the test data
108 | # 2. build a max heap of the k smallest elements
109 | # 3. find the same elements with exhausted_search
110 | # 4. compare the results and the running times
111 | def main():
112 |     # Test
113 |     print("Testing MaxHeap...")
114 |     test_times = 100
115 |     run_time_1 = run_time_2 = 0
116 |     for _ in range(test_times):
117 |         # Generate dataset randomly
118 |         low = 0
119 |         high = 1000
120 |         n_rows = 10000
121 |         k = 100
122 |         nums = gen_data(low, high, n_rows)
123 | 
124 |         # Build the max heap
125 |         heap = MaxHeap(k, lambda x: x)
126 |         start = time()
127 |         for num in nums:
128 |             heap.add(num)
129 |         ret1 = copy(heap.items)
130 |         run_time_1 += time() - start
131 | 
132 |         # Exhausted search
133 |         start = time()
134 |         ret2 = exhausted_search(nums, k)
135 |         run_time_2 += time() - start
136 | 
137 |         # Compare results
138 |         ret1.sort()
139 |         assert ret1 == ret2, "target:%s\nk:%d\nresult1:%s\nresult2:%s\n" % (
140 |             str(nums), k, str(ret1), str(ret2))
141 |     print("%d tests passed!" % test_times)
142 |     print("Max Heap Search %.2f s" % run_time_1)
143 |     print("Exhausted search %.2f s" % run_time_2)
144 | 
145 | main()
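A small usage sketch for the MaxHeap above. Because a full heap evicts through its largest key, a heap of size k ends up holding the k smallest values seen; the stream below is arbitrary:

    heap = MaxHeap(3, lambda x: x)
    for v in [9, 4, 7, 1, 8, 2]:
        heap.add(v)
    print(sorted(heap.items))  # [1, 2, 4]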
-------------------------------------------------------------------------------- /pca.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | # Zero-centering (mean removal), a standard preprocessing step
3 | def zero_centered(data):
4 |     matrix_mean = np.mean(data, axis=0)
5 |     return data - matrix_mean
6 | 
7 | def pca_eig(data, n):
8 |     new_data = zero_centered(data)
9 |     cov_mat = np.dot(new_data.T, new_data)  # np.cov() would also work, up to a constant factor
10 |     eig_values, eig_vectors = np.linalg.eig(cov_mat)
11 |     # eigendecomposition; the eigenvectors are the columns
12 |     value_indices = np.argsort(eig_values)  # sort the eigenvalues in ascending order
13 |     n_vectors = eig_vectors[:, value_indices[-1: -(n + 1): -1]]
14 |     # eigenvectors belonging to the n largest eigenvalues
15 |     return np.dot(new_data, n_vectors)  # project the data into the low-dimensional space
16 | 
17 | def pca_svd(data, n):
18 |     new_data = zero_centered(data)
19 |     cov_mat = np.dot(new_data.T, new_data)
20 |     U, s, V = np.linalg.svd(cov_mat)  # SVD of the (scaled) covariance matrix
21 |     pc = np.dot(new_data, U)  # columns of U are sorted by singular value, largest first
22 |     return pc[:, :n]  # keep the projections onto the top-n components
23 | 
24 | def unit_test():
25 |     data = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],
26 |                      [2.3, 2.7], [2, 1.6], [1, 1.1], [1.5, 1.6], [1.1, 0.9]])
27 |     result_eig = pca_eig(data, 1)
28 |     # eigendecomposition route: reduce the 2-D data to 1-D
29 |     print(result_eig)
30 |     result_svd = pca_svd(data, 1)
31 |     # SVD route: decompose the covariance matrix instead
32 |     print(result_svd)
33 | unit_test()
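A quick consistency sketch for the two PCA routes above: since the covariance matrix is symmetric, its eigenvectors and left singular vectors agree, so both projections should match up to sign. The toy matrix is arbitrary:

    import numpy as np
    toy = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2]])
    p_eig = np.asarray(pca_eig(toy, 1)).ravel()
    p_svd = np.asarray(pca_svd(toy, 1)).ravel()
    assert np.allclose(np.abs(p_eig), np.abs(p_svd))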
-------------------------------------------------------------------------------- /perceptron.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from sklearn.datasets import load_iris
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | 
7 | # Load the iris dataset
8 | iris = load_iris()
9 | df = pd.DataFrame(iris.data, columns=iris.feature_names)
10 | df['label'] = iris.target
11 | # Rename the columns
12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
13 | # print(df.label.value_counts())
14 | print(df.head(10))
15 | 
16 | # Visualize the first two classes
17 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], c='red', label='0')
18 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], c='blue', label='1')
19 | plt.xlabel('sepal length')
20 | plt.ylabel('sepal width')
21 | #plt.show()
22 | 
23 | # Select two features and the label
24 | data = np.array(df.iloc[:100, [0, 1, -1]])
25 | X, y = data[:, :-1], data[:, -1]
26 | y = np.array([1 if i == 1 else -1 for i in y])  # map label 0 to -1 so labels lie in {-1, +1}
27 | 
28 | # Perceptron implementation
29 | 
30 | class Model:
31 |     # Initialization
32 |     def __init__(self):
33 |         # Initialize the weights (note: relies on the global `data` defined above)
34 |         self.w = np.ones(len(data[0]) - 1, dtype=np.float32)
35 |         # Initialize the bias
36 |         self.b = 0
37 |         # Learning rate
38 |         self.l_rate = 0.1
39 | 
40 |     # Linear score x.w + b; the predicted class is its sign
41 |     def sign(self, x, w, b):
42 |         y = np.dot(x, w) + b
43 |         return y
44 | 
45 |     # Mistake-driven updates, cycling through the data until nothing is misclassified
46 |     def fit(self, X_train, y_train):
47 |         is_wrong = False
48 |         while not is_wrong:
49 |             wrong_cnt = 0
50 |             for i in range(len(X_train)):
51 |                 X = X_train[i]
52 |                 y = y_train[i]
53 |                 if y * self.sign(X, self.w, self.b) <= 0:
54 |                     # Update the weights
55 |                     self.w = self.w + self.l_rate * np.dot(y, X)
56 |                     # Update the bias
57 |                     self.b = self.b + self.l_rate * y
58 |                     wrong_cnt += 1
59 |             if wrong_cnt == 0:
60 |                 is_wrong = True
61 | 
62 |         return 'Perceptron Model!'
63 | 
64 |     def score(self):
65 |         pass
66 | 
67 | 
68 | # Fit the perceptron
69 | perceptron = Model()
70 | perceptron.fit(X, y)
71 | # Visualize the learned decision boundary
72 | x_points = np.linspace(4, 7, 10)
73 | # w0*x + w1*y + b = 0  =>  y = -(w0*x + b) / w1
74 | y_ = -(perceptron.w[0] * x_points + perceptron.b) / perceptron.w[1]
75 | plt.plot(x_points, y_)
76 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], c='red', label='0')
77 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], c='blue', label='1')
78 | plt.xlabel('sepal length')
79 | plt.ylabel('sepal width')
80 | plt.show()
-------------------------------------------------------------------------------- /random_forest.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import decision_tree
3 | from decision_tree import DecisionTree
4 | from random import sample, choices, choice
5 | 
6 | class RandomForest(object):
7 |     def __init__(self):
8 |         self.trees = None
9 |         self.tree_features = None
10 | 
11 |     def fit(self, X, y, n_estimators=10, max_depth=3, min_samples_split=2, max_features=None, n_samples=None):
12 |         self.trees = []
13 |         self.tree_features = []
14 |         for _ in range(n_estimators):
15 |             m = len(X[0])
16 |             n = len(y)
17 |             if n_samples:
18 |                 idx = choices(population=range(n), k=min(n, n_samples))  # bootstrap rows (with replacement)
19 |             else:
20 |                 idx = range(n)
21 |             if max_features:
22 |                 n_features = min(m, max_features)
23 |             else:
24 |                 n_features = int(m ** 0.5)
25 |             features = sample(range(m), choice(range(1, n_features + 1)))  # random feature subset of random size
26 |             X_sub = [[X[i][j] for j in features] for i in idx]
27 |             y_sub = [y[i] for i in idx]
28 |             clf = DecisionTree()
29 |             clf.fit(X_sub, y_sub, max_depth, min_samples_split)
30 |             self.trees.append(clf)
31 |             self.tree_features.append(features)
32 | 
33 |     def _predict(self, Xi):
34 |         pos_vote = 0
35 |         for tree, features in zip(self.trees, self.tree_features):
36 |             score = tree._predict([Xi[j] for j in features])
37 |             if score >= 0.5:
38 |                 pos_vote += 1
39 |         neg_vote = len(self.trees) - pos_vote
40 |         if pos_vote > neg_vote:
41 |             return 1
42 |         elif pos_vote < neg_vote:
43 |             return 0
44 |         else:
45 |             return choice([0, 1])  # break ties at random
46 | 
47 |     def predict(self, X):
48 |         return [self._predict(Xi) for Xi in X]
49 | 
50 | 
51 | @decision_tree.run_time
52 | def main():
53 |     print("Testing the performance of RandomForest...")
54 |     # Load data
55 |     X, y = decision_tree.load_data()
56 |     # Split data randomly, 70% train
57 |     X_train, X_test, y_train, y_test = decision_tree.train_test_split(X, y, random_state=40)
58 | 
59 |     # Train model
60 |     rf = RandomForest()
61 |     rf.fit(X_train, y_train, n_samples=300, max_depth=3, n_estimators=20)
62 |     # Model evaluation
63 |     y_hat = rf.predict(X_test)
64 |     acc = decision_tree.get_acc(y_test, y_hat)
65 |     print("Accuracy is %.3f" % acc)
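A sketch of the two sources of randomness RandomForest.fit combines (row bagging with replacement, feature subsetting without); the sizes below are invented for illustration:

    from random import choices, sample, seed
    seed(0)
    rows = choices(population=range(6), k=6)  # bootstrap rows, duplicates allowed
    cols = sample(range(4), 2)                # feature subset, no duplicates
    print(rows, cols)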
-------------------------------------------------------------------------------- /regression_tree.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from copy import copy
3 | from random import randint, seed, random
4 | from time import time
5 | 
6 | # Timing helper: runs fn once and reports the elapsed time
7 | # (note: it returns fun(), so a decorated function executes immediately)
8 | def run_time(fn):
9 |     def fun():
10 |         start = time()
11 |         fn()
12 |         ret = time() - start
13 |         if ret < 1e-6:
14 |             unit = "ns"
15 |             ret *= 1e9
16 |         elif ret < 1e-3:
17 |             unit = "us"
18 |             ret *= 1e6
19 |         elif ret < 1:
20 |             unit = "ms"
21 |             ret *= 1e3
22 |         else:
23 |             unit = "s"
24 |         print("Total run time is %.1f %s\n" % (ret, unit))
25 |     return fun()
26 | 
27 | def load_data():
28 |     f = open("dataset/boston_house_prices.csv")  # the Boston housing data shipped in the repo's dataset/ directory
29 |     X = []
30 |     y = []
31 |     for line in f:
32 |         line = line[:-1].split(',')
33 |         xi = [float(s) for s in line[:-1]]
34 |         yi = line[-1]
35 |         if '.' in yi:
36 |             yi = float(yi)
37 |         else:
38 |             yi = int(yi)
39 |         X.append(xi)
40 |         y.append(yi)
41 |     f.close()
42 |     return X, y
43 | 
44 | # Split the dataset into training and test sets
45 | def train_test_split(X, y, prob=0.7, random_state=None):
46 |     if random_state is not None:
47 |         seed(random_state)
48 |     X_train = []
49 |     X_test = []
50 |     y_train = []
51 |     y_test = []
52 |     for i in range(len(X)):
53 |         if random() < prob:
54 |             X_train.append(X[i])
55 |             y_train.append(y[i])
56 |         else:
57 |             X_test.append(X[i])
58 |             y_test.append(y[i])
59 |     seed()
60 |     return X_train, X_test, y_train, y_test
61 | 
62 | # R^2, the goodness of fit of the regression model
63 | def get_r2(reg, X, y):
64 |     y_hat = reg.predict(X)
65 |     m = len(y)
66 |     assert m == len(y_hat)  # sanity check: one prediction per sample
67 |     sse = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y, y_hat))
68 |     y_avg = sum(y) / len(y)
69 |     sst = sum((yi - y_avg) ** 2 for yi in y)
70 |     r2 = 1 - sse / sst
71 |     print("Test r2 is %.3f!" % r2)
72 |     return r2
73 | 
74 | # Node of the regression tree
75 | class Node(object):
76 |     # Stores the predicted score, left/right children, and the split feature/point
77 | 
78 |     def __init__(self, score=None):
79 |         self.score = score
80 |         self.left = None
81 |         self.right = None
82 |         self.feature = None
83 |         self.split = None
84 | 
85 | 
86 | # Regression tree
87 | class RegressionTree(object):
88 | 
89 |     # Stores the root node and the height of the tree
90 |     def __init__(self):
91 |         self.root = Node()
92 |         self.height = 0
93 | 
94 |     # Evaluate one candidate split: given the rows idx, the feature column
95 |     # `feature` and the split point `split`, compute the post-split MSE. To
96 |     # reduce computation we use the identity D(X) = E[X^2] - (E[X])^2.
97 |     def get_split_mse(self, X, y, idx, feature, split):
98 |         split_sum = [0, 0]
99 |         split_cnt = [0, 0]
100 |         split_sqr_sum = [0, 0]
101 |         for i in idx:
102 |             xi, yi = X[i][feature], y[i]
103 |             if xi < split:
104 |                 split_cnt[0] += 1
105 |                 split_sum[0] += yi
106 |                 split_sqr_sum[0] += yi ** 2
107 |             else:
108 |                 split_cnt[1] += 1
109 |                 split_sum[1] += yi
110 |                 split_sqr_sum[1] += yi ** 2
111 |         split_avg = [split_sum[0] / split_cnt[0], split_sum[1] / split_cnt[1]]
112 |         split_mse = [split_sqr_sum[0] - split_sum[0] * split_avg[0],
113 |                      split_sqr_sum[1] - split_sum[1] * split_avg[1]]
114 |         return sum(split_mse), split, split_avg
115 | 
116 |     # Find the best split point of one feature: try every distinct value of the
117 |     # column and keep the split with the smallest MSE; return None if the column has only one distinct value.
118 |     def choose_split_point(self, X, y, idx, feature):
119 |         unique = set([X[i][feature] for i in idx])
120 |         if len(unique) == 1:
121 |             return None
122 |         unique.remove(min(unique))  # splits test xi < split, so the minimum cannot serve as a split point
123 |         mse, split, split_avg = min((self.get_split_mse(X, y, idx, feature, split)
124 |                                      for split in unique), key=lambda x: x[0])
125 |         return mse, feature, split, split_avg
126 | 
127 |     # Choose the best feature: over all features, take the split with the smallest MSE,
128 |     # plus the child averages and the row partition; return None if no feature has two distinct values.
129 |     def choose_feature(self, X, y, idx):
130 |         m = len(X[0])
131 |         split_rets = [x for x in map(lambda x: self.choose_split_point(X, y, idx, x),
132 |                                      range(m)) if x is not None]
133 |         if not split_rets:
134 |             return None
135 |         _, feature, split, split_avg = min(split_rets, key=lambda x: x[0])
136 | 
137 |         idx_split = [[], []]
138 |         while idx:
139 |             i = idx.pop()
140 |             xi = X[i][feature]
141 |             if xi < split:
142 |                 idx_split[0].append(i)
143 |             else:
144 |                 idx_split[1].append(i)
145 |         return feature, split, split_avg, idx_split
146 | 
147 |     # Turn one rule expression into readable text
148 |     def expr2literal(self, expr):
149 |         feature, op, split = expr
150 |         op = ">=" if op == 1 else "<"
151 |         return ("Feature%d %s %.4f" % (feature, op, split))
152 | 
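    # Added note: each rule expression handled here is a triple
    # [feature_index, op, split], where op == -1 encodes "<" and op == 1
    # encodes ">=". expr2literal above renders a single triple, e.g.
    # [0, 1, 2.5] -> "Feature0 >= 2.5000", and get_rules below walks the tree
    # breadth-first, appending one triple per edge so that every leaf ends up
    # paired with the full path of conditions that reaches it.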
153 |     # Collect every rule of the tree as readable text, using BFS, so the whole tree can be inspected
154 |     def get_rules(self):
155 |         que = [[self.root, []]]
156 |         self.rules = []
157 |         while que:
158 |             nd, exprs = que.pop(0)
159 |             if not (nd.left or nd.right):
160 |                 literals = list(map(self.expr2literal, exprs))
161 |                 self.rules.append([literals, nd.score])
162 |             if nd.left:
163 |                 rule_left = copy(exprs)
164 |                 rule_left.append([nd.feature, -1, nd.split])
165 |                 que.append([nd.left, rule_left])
166 | 
167 |             if nd.right:
168 |                 rule_right = copy(exprs)
169 |                 rule_right.append([nd.feature, 1, nd.split])
170 |                 que.append([nd.right, rule_right])
171 | 
172 |     # Train the model with a queue (BFS). While training we must:
173 |     # 1. cap the tree depth at max_depth
174 |     # 2. only split nodes with at least min_samples_split samples
175 |     # 3. stop when a node's y values are all identical
176 |     # 4. stop when no feature has two distinct values
177 |     def fit(self, X, y, max_depth=5, min_samples_split=2):
178 |         self.root = Node()
179 |         que = [[0, self.root, list(range(len(y)))]]
180 |         while que:
181 |             depth, nd, idx = que.pop(0)
182 |             if depth > max_depth:
183 |                 depth -= 1  # record the depth of the last fully processed level
184 |                 break
185 |             if len(idx) < min_samples_split or len(set(map(lambda i: y[i], idx))) == 1:
186 |                 continue
187 |             feature_rets = self.choose_feature(X, y, idx)
188 |             if feature_rets is None:
189 |                 continue
190 |             nd.feature, nd.split, split_avg, idx_split = feature_rets
191 |             nd.left = Node(split_avg[0])
192 |             nd.right = Node(split_avg[1])
193 |             que.append([depth + 1, nd.left, idx_split[0]])
194 |             que.append([depth + 1, nd.right, idx_split[1]])
195 | 
196 |         self.height = depth
197 |         self.get_rules()
198 | 
199 |     # Print the rules
200 |     def print_rules(self):
201 |         for i, rule in enumerate(self.rules):
202 |             literals, score = rule
203 |             print("Rule %d: " % i, ' | '.join(
204 |                 literals) + ' => split_hat %.4f' % score)
205 |     # Predict one sample
206 |     def _predict(self, row):
207 |         nd = self.root
208 |         while nd.left and nd.right:
209 |             if row[nd.feature] < nd.split:
210 |                 nd = nd.left
211 |             else:
212 |                 nd = nd.right
213 |         return nd.score
214 | 
215 |     # Predict multiple samples
216 |     def predict(self, X):
217 |         return [self._predict(Xi) for Xi in X]
218 | 
219 | 
220 | # Evaluate the model
221 | if __name__ == '__main__':
222 |     print('Testing the accuracy of RegressionTree...')
223 |     X, y = load_data()
224 |     X_train, X_test, y_train, y_test = train_test_split(
225 |         X, y, random_state=10)
226 |     reg = RegressionTree()
227 |     reg.fit(X=X_train, y=y_train, max_depth=4)
228 |     reg.print_rules()
229 |     get_r2(reg, X_test, y_test)
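A minimal usage sketch for the RegressionTree above, on a hand-made one-feature dataset (values chosen so the splits are easy to verify by hand):

    X_toy = [[1.0], [2.0], [3.0], [4.0]]
    y_toy = [1.0, 1.1, 3.0, 3.1]
    tree = RegressionTree()
    tree.fit(X_toy, y_toy, max_depth=2)
    tree.print_rules()
    print(tree.predict([[1.5], [3.5]]))  # [1.0, 3.0]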
-------------------------------------------------------------------------------- /ridge.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from linear_regression import LinearRegreession  # sic: the class name is spelled this way in linear_regression.py
3 | from linear_regression import run_time
4 | from linear_regression import load_data
5 | from linear_regression import min_max_scale
6 | from linear_regression import train_test_split
7 | from linear_regression import get_r2
8 | 
9 | 
10 | 
11 | class Ridge(LinearRegreession):
12 |     """Ridge regression.
13 |     Loss function (squared error plus an L2 penalty):
14 |     L = (y - y_hat) ^ 2 + alpha * (||W||^2 + b^2)
15 |     L = (y - W * X - b) ^ 2 + alpha * (||W||^2 + b^2)
16 |     Partial derivative with respect to W:
17 |     dL/dW = -2 * (y - W * X - b) * X + 2 * alpha * W
18 |     dL/dW = -2 * (y - y_hat) * X + 2 * alpha * W
19 |     Partial derivative with respect to b:
20 |     dL/db = -2 * (y - W * X - b) + 2 * alpha * b
21 |     dL/db = -2 * (y - y_hat) + 2 * alpha * b
22 |     ----------------------------------------------------------------
23 |     Attributes:
24 |     bias: b
25 |     weights: W
26 |     alpha: regularization strength (the hyperparameter)
27 |     """
28 |     def __init__(self):
29 |         super(Ridge, self).__init__()
30 |         self.alpha = None
31 | 
32 |     def get_gradient_delta(self, Xi, yi):
33 |         y_hat = self._predict(Xi)
34 |         bias_grad_delta = yi - y_hat - self.alpha * self.bias
35 |         weights_grad_delta = [(yi - y_hat) * Xij - self.alpha * wj
36 |                               for Xij, wj in zip(Xi, self.weights)]
37 |         return bias_grad_delta, weights_grad_delta
38 | 
39 |     def fit(self, X, y, lr, epochs, alpha, method="batch", sample_rate=1.0):
40 |         self.alpha = alpha
41 |         assert method in ("batch", "stochastic")
42 |         if method == "batch":
43 |             self.batch_gradient_descent(X, y, lr, epochs)
44 |         if method == "stochastic":
45 |             self.stochastic_gradient_descent(X, y, lr, epochs, sample_rate)
46 | 
47 | @run_time
48 | def main():
49 |     print("Testing the performance of Ridge Regressor(stochastic)...")
50 |     # Load data
51 |     data, label = load_data()
52 |     data = min_max_scale(data)
53 |     # Split data randomly, 70% train
54 |     data_train, data_test, label_train, label_test = train_test_split(data, label, random_state=10)
55 |     # Train model
56 |     reg = Ridge()
57 |     reg.fit(X=data_train, y=label_train, lr=0.001, epochs=1000, method="stochastic", sample_rate=0.5, alpha=1e-4)
58 |     # Model evaluation
59 |     get_r2(reg, data_test, label_test)
60 | 
--------------------------------------------------------------------------------
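A closing sketch of the regularization effect in Ridge above. It is illustrative only: it assumes LinearRegreession exposes the same _predict/weights interface that the gradient code relies on, and the toy data and hyperparameters are invented:

    X_toy = [[0.0], [1.0], [2.0], [3.0]]
    y_toy = [0.0, 1.0, 2.0, 3.0]
    for a in (0.0, 1.0):
        reg = Ridge()
        reg.fit(X=X_toy, y=y_toy, lr=0.05, epochs=500, alpha=a)
        print(a, reg.bias, reg.weights)  # the weights shrink as alpha grows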