├── cluster
│   ├── __pycache__
│   │   └── kmeans.cpython-36.pyc
│   ├── kmeans.py
│   ├── label_propagation.py
│   ├── dbscan.py
│   ├── meanshift.py
│   ├── spectural_clustering.py
│   └── gmm_em.py
├── README.md
├── classification_regression
│   ├── __pycache__
│   │   └── cart_clf.cpython-36.pyc
│   ├── linear_regression.py
│   ├── logistic_regression.py
│   ├── naive_bayes.py
│   ├── perceptron.py
│   ├── softmax.py
│   ├── random_forest.py
│   ├── adaboost.py
│   ├── cart_regression.py
│   ├── boosting_tree.py
│   ├── cart_clf.py
│   ├── decision_tree.py
│   ├── maximum_entropy.py
│   ├── xgboost.py
│   ├── svm_wss3.py
│   ├── knearest.py
│   └── svm.py
├── load_data.py
├── cross_validation.py
├── outlier_detection
│   ├── lof.py
│   └── isolation_forest.py
├── association
│   ├── apriori.py
│   └── fp_growth.py
├── recommendation
│   └── collaborative_filter.py
├── probability_algorithm
│   └── hmm.py
└── optimization
    └── optimization_algorithm.py

/cluster/__pycache__/kmeans.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shi-Lixin/Machine-Learning-Algorithms/HEAD/cluster/__pycache__/kmeans.cpython-36.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Algorithms
2 |    Python implementations of classic classification and regression, association analysis, clustering and recommendation algorithms, together with the optimization algorithms commonly used in machine learning.
3 | 
4 |    [My blog](https://blog.csdn.net/slx_share) describes the algorithms implemented in this project in detail.
--------------------------------------------------------------------------------
/classification_regression/__pycache__/cart_clf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shi-Lixin/Machine-Learning-Algorithms/HEAD/classification_regression/__pycache__/cart_clf.cpython-36.pyc
--------------------------------------------------------------------------------
/load_data.py:
--------------------------------------------------------------------------------
1 | def load(path, sep='\t'):
2 |     data = []
3 |     with open(path) as fp:
4 |         for line in fp:
5 |             sample = []
6 |             for item in line.strip().split(sep):
7 |                 sample.append(float(item))
8 |             data.append(sample)
9 |     return data
--------------------------------------------------------------------------------
/cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def validate(X_data, y_data, ratio=0.15):
5 |     N = X_data.shape[0]
6 |     size = int(N * ratio)
7 |     inds = np.random.permutation(range(N))
8 |     for i in range(int(N / size)):
9 |         test_ind = inds[i * size:(i + 1) * size]
10 |         train_ind = list(set(range(N))-set(test_ind))
11 |         yield X_data[train_ind], y_data[train_ind], X_data[test_ind], y_data[test_ind]
--------------------------------------------------------------------------------
/classification_regression/linear_regression.py:
--------------------------------------------------------------------------------
1 | """
2 | One-dimensional linear regression via ordinary least squares.
3 | Setting the derivative to zero gives the coefficients in closed form.
4 | Without regularization:  w = inv(X.T@X)@X.T@y
5 | With L2 regularization:  w = inv(X.T@X + alpha*E)@X.T@y
6 | """
7 | import numpy as np
8 | from numpy.linalg import inv
9 | 
10 | 
11 | def LR_regression(data, alpha=0.0):
12 |     X = data[:, :-1]  # n*d
13 |     X = np.column_stack((X, np.ones(X.shape[0])))  # append a column of ones (bias term)
14 |     y = data[:, -1]  # n*1
15 |     w = (inv(X.T@X+alpha*np.eye(X.shape[1]))@X.T@y).T  # 1*d, the coefficient (weight) of each feature
16 |     return w
17 | 
18 | def predict(X, w):
19 |     X = np.column_stack((X, np.ones(X.shape[0])))
20 |     return w@X.T
21 | 
22 | if __name__ == '__main__':
23 |     import
matplotlib.pyplot as plt 24 | data = [] 25 | with open('./data/data.txt') as fp: 26 | for line in fp: 27 | tmp =[] 28 | for item in line.strip().split('\t'): 29 | tmp.append(float(item)) 30 | data.append(tmp) 31 | data = np.array(data) 32 | w = LR_regression(data) 33 | res = predict(data[:,0], w) 34 | plt.scatter(data[:,0], data[:,1], marker='o') 35 | plt.plot(data[:,0],res, color='r') 36 | plt.show() -------------------------------------------------------------------------------- /classification_regression/logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | 逻辑斯谛回归 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | class LR: 9 | def __init__(self, alpha=0.01, maxstep=1000): 10 | self.w = None 11 | self.maxstep = maxstep 12 | self.alpha = alpha 13 | 14 | def sig(self, z): 15 | # Logistic函数, 正类的概率 16 | return 1.0 / (1 + np.exp(-z)) 17 | 18 | def bgd(self, X_data, y_data): # 损失函数采用对数损失函数,其数学形式与似然函数一致 19 | # 批量梯度下降法 20 | b = np.ones((X_data.shape[0], 1)) 21 | X = np.hstack((X_data, b)) # 考虑阈值,堆输入向量进行扩充 22 | w = np.ones(X.shape[1]) # 初始化各特征的权重 23 | i = 0 24 | while i <= self.maxstep: 25 | i += 1 26 | err = y_data - self.sig(w @ X.T) 27 | w += self.alpha * err @ X # 注意,其表达式与平方误差损失函数的非常相似,但这是由对数损失函数推导而来的 28 | self.w = w 29 | return 30 | 31 | def fit(self, X_data, y_data): 32 | self.bgd(X_data, y_data) 33 | return 34 | 35 | def predict(self, x): 36 | x = np.append(x, 1) 37 | PT = self.sig(self.w @ x.T) 38 | if PT > 1 - PT: 39 | return 1 40 | else: 41 | return 0 42 | 43 | 44 | if __name__ == '__main__': 45 | from sklearn import datasets 46 | 47 | data = datasets.load_digits(n_class=2) 48 | X_data = data['data'] 49 | y_data = data['target'] 50 | from machine_learning_algorithm.cross_validation import validate 51 | g = validate(X_data, y_data, ratio=0.2) 52 | for item in g: 53 | X_train, y_train, X_test, y_test = item 54 | clf = LR() 55 | clf.fit(X_train, y_train) 56 | score = 0 57 | for x, y in zip(X_test, y_test): 58 | if clf.predict(x)==y: 59 | score += 1 60 | print(score/len(y_test)) -------------------------------------------------------------------------------- /classification_regression/naive_bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | 朴素贝叶斯分类算法 3 | 采用后验期望计参数,先验概率分布取均匀分布 4 | """ 5 | 6 | from collections import Counter, defaultdict 7 | import numpy as np 8 | 9 | 10 | class NBayes: 11 | def __init__(self, lambda_=1): 12 | self.lambda_ = lambda_ # 贝叶斯估计参数lambda 13 | self.p_prior = {} # 模型的先验概率, 注意这里的先验概率不是指预先人为设定的先验概率,而是需要估计的P(y=Ck) 14 | self.p_condition = {} # 模型的条件概率 15 | 16 | def fit(self, X_data, y_data): 17 | N = y_data.shape[0] 18 | # 后验期望估计P(y=Ck)的后验概率,设定先验概率为均匀分布 19 | c_y = Counter(y_data) 20 | K = len(c_y) 21 | for key, val in c_y.items(): 22 | self.p_prior[key] = (val + self.lambda_) / (N + K * self.lambda_) 23 | # 后验期望估计P(Xd=a|y=Ck)的后验概率,同样先验概率为均匀分布 24 | for d in range(X_data.shape[1]): # 对各个维度分别进行处理 25 | Xd_y = defaultdict(int) 26 | vector = X_data[:, d] 27 | Sd = len(np.unique(vector)) 28 | for xd, y in zip(vector, y_data): # 这里Xd仅考虑出现在数据集D中的情况,故即使用极大似然估计叶没有概率为0的情况 29 | Xd_y[(xd, y)] += 1 30 | for key, val in Xd_y.items(): 31 | self.p_condition[(d, key[0], key[1])] = (val + self.lambda_) / (c_y[key[1]] + Sd * self.lambda_) 32 | return 33 | 34 | def predict(self, X): 35 | p_post = defaultdict() 36 | for y, py in self.p_prior.items(): 37 | p_joint = py # 联合概率分布 38 | for d, Xd in enumerate(X): 39 | p_joint *= self.p_condition[(d, Xd, y)] # 条件独立性假设 40 | p_post[y] = 
p_joint # 分母P(X)相同,故直接存储联合概率分布即可 41 | return max(p_post, key=p_post.get) 42 | 43 | 44 | if __name__ == '__main__': 45 | data = np.array([[1, 0, -1], [1, 1, -1], [1, 1, 1], [1, 0, 1], 46 | [1, 0, -1], [2, 0, -1], [2, 1, -1], [2, 1, 1], 47 | [2, 2, 1], [2, 2, 1], [3, 2, 1], [3, 1, 1], 48 | [3, 1, 1], [3, 2, 1], [3, 2, -1]]) 49 | X_data = data[:, :-1] 50 | y_data = data[:, -1] 51 | clf = NBayes(lambda_=1) 52 | clf.fit(X_data, y_data) 53 | print(clf.p_prior, '\n', clf.p_condition) 54 | print(clf.predict(np.array([2, 0]))) 55 | -------------------------------------------------------------------------------- /outlier_detection/lof.py: -------------------------------------------------------------------------------- 1 | # LOF异常值检测算法 2 | from scipy.spatial.distance import cdist 3 | import numpy as np 4 | 5 | 6 | class LOF: 7 | def __init__(self, data, k, epsilon=1.0): 8 | self.data = data 9 | self.k = k 10 | self.epsilon = epsilon 11 | self.N = self.data.shape[0] 12 | 13 | def get_dist(self): 14 | # 计算欧式距离矩阵 15 | return cdist(self.data, self.data) 16 | 17 | def _kdist(self, arr): 18 | # 计算k距离 19 | inds_sort = np.argsort(arr) 20 | neighbor_ind = inds_sort[1:self.k + 1] # 邻域内点索引 21 | return neighbor_ind, arr[neighbor_ind[-1]] 22 | 23 | def get_rdist(self): 24 | # 计算可达距离 25 | dist = self.get_dist() 26 | nei_kdist = np.apply_along_axis(self._kdist, 1, dist) 27 | nei_inds, kdist = zip(*nei_kdist) 28 | for i, k in enumerate(kdist): 29 | ind = np.where(dist[i] < k) # 实际距离小于k距离,则可达距离为k距离 30 | dist[i][ind] = k 31 | return nei_inds, dist 32 | 33 | def get_lrd(self, nei_inds, rdist): 34 | # 计算局部可达密度 35 | lrd = np.zeros(self.N) 36 | for i, inds in enumerate(nei_inds): 37 | s = 0 38 | for j in inds: 39 | s += rdist[j, i] 40 | lrd[i] = self.k / s 41 | return lrd 42 | 43 | def run(self): 44 | # 计算局部离群因子 45 | nei_inds, rdist = self.get_rdist() 46 | lrd = self.get_lrd(nei_inds, rdist) 47 | score = np.zeros(self.N) 48 | for i, inds in enumerate(nei_inds): 49 | lrd_nei = sum(lrd[inds]) 50 | score[i] = lrd_nei / self.k / lrd[i] 51 | 52 | return score, np.where(score > self.epsilon)[0] 53 | 54 | 55 | if __name__ == '__main__': 56 | np.random.seed(42) 57 | X_inliers = 0.3 * np.random.randn(100, 2) 58 | X_inliers = np.r_[X_inliers + 2, X_inliers - 2] 59 | X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) 60 | data = np.r_[X_inliers, X_outliers] 61 | 62 | lof = LOF(data, 5, epsilon=1.2) 63 | score, out_ind = lof.run() 64 | outliers = data[out_ind] 65 | 66 | import matplotlib.pyplot as plt 67 | 68 | plt.scatter(data[:, 0], data[:, 1], color='b') 69 | plt.scatter(outliers[:, 0], outliers[:, 1], color='r') 70 | plt.show() 71 | -------------------------------------------------------------------------------- /classification_regression/perceptron.py: -------------------------------------------------------------------------------- 1 | """ 2 | 感知机(perceptron):原始形式以及对偶形式 3 | """ 4 | import numpy as np 5 | 6 | 7 | class Perceptron: 8 | def __init__(self, eta=1): 9 | self.eta = eta # 学习率 10 | self.w = None # 权值 11 | self.b = None # 阈值 12 | 13 | def fit(self, X_data, y_data): 14 | self.w = np.zeros(X_data.shape[1]) # 初始化 15 | self.b = 0 16 | change = True 17 | while change: # w, b 不发生改变则结束训练 18 | for X, y in zip(X_data, y_data): # 依次输入每个数据点进行训练 19 | change = False 20 | while y * (self.w @ X + self.b) <= 0: 21 | self.w += self.eta * X * y 22 | self.b += self.eta * y 23 | change = True 24 | return 25 | 26 | def predict(self, X): 27 | return np.sign(self.w @ X + self.b) 28 | 29 | 30 | class Perceptron_dual: 31 | # 对偶形式的感知机 32 
| def __init__(self, eta=1): 33 | self.eta = eta 34 | self.alpha = None # alpha相当于样本的权值,当eta为1时就是每个样本参与训练的次数 35 | self.b = None 36 | 37 | self.N = None 38 | self.gram = None 39 | 40 | def init_param(self, X_data): 41 | self.N = X_data.shape[0] 42 | self.alpha = np.zeros(self.N) 43 | self.b = 0 44 | self.gram = self.getGram(X_data) 45 | 46 | def getGram(self, X_data): 47 | # 计算Gram矩阵 48 | gram = np.diag(np.linalg.norm(X_data, axis=1) ** 2) 49 | 50 | for i in range(self.N): 51 | for j in range(i + 1, self.N): 52 | gram[i, j] = X_data[i] @ X_data[j] 53 | gram[j, i] = gram[i, j] 54 | 55 | return gram 56 | 57 | def sum_dual(self, y_data, i): 58 | s = 0 59 | for j in range(self.N): 60 | s += self.alpha[j] * y_data[j] * self.gram[j][i] 61 | return y_data[i] * (s + self.b) 62 | 63 | def fit(self, X_data, y_data): 64 | self.init_param(X_data) 65 | changed = True 66 | while changed: 67 | changed = False 68 | for i in range(self.N): # 依次输入每个数据点进行训练 69 | while self.sum_dual(y_data, i) <= 0: 70 | self.alpha[i] += self.eta 71 | self.b += self.eta * y_data[i] 72 | changed = True 73 | return 74 | 75 | 76 | if __name__ == '__main__': 77 | X_data = np.array([[3, 3], [4, 3], [1, 1]]) 78 | y_data = np.array([1, 1, -1]) 79 | p = Perceptron() 80 | p.fit(X_data, y_data) 81 | print(p.w, p.b) 82 | -------------------------------------------------------------------------------- /classification_regression/softmax.py: -------------------------------------------------------------------------------- 1 | """ 2 | SoftMax回归,逻辑斯蒂回归的多分类推广。所以,本质还是一种分类算法 3 | """ 4 | import numpy as np 5 | 6 | 7 | class SoftMax: 8 | def __init__(self, maxstep=10000, C=1e-4, alpha=0.4): 9 | self.maxstep = maxstep 10 | self.C = C # 权值衰减项系数lambda, 类似于惩罚系数 11 | self.alpha = alpha # 学习率 12 | 13 | self.w = None # 权值 14 | 15 | self.L = None # 类的数量 16 | self.D = None # 输入数据维度 17 | self.N = None # 样本总量 18 | 19 | def init_param(self, X_data, y_data): 20 | # 初始化,暂定输入数据全部为数值形式 21 | b = np.ones((X_data.shape[0], 1)) 22 | X_data = np.hstack((X_data, b)) # 附加偏置项 23 | self.L = len(np.unique(y_data)) 24 | self.D = X_data.shape[1] 25 | self.N = X_data.shape[0] 26 | self.w = np.ones((self.L, self.D)) # l*d, 针对每个类,都有一组权值参数w 27 | return X_data 28 | 29 | def bgd(self, X_data, y_data): 30 | # 梯度下降训练 31 | step = 0 32 | while step < self.maxstep: 33 | step += 1 34 | prob = np.exp(X_data @ self.w.T) # n*l, 行向量存储该样本属于每个类的概率 35 | nf = np.transpose([prob.sum(axis=1)]) # n*1 36 | nf = np.repeat(nf, self.L, axis=1) # n*l 37 | prob = -prob / nf # 归一化, 此处条件符号仅方便后续计算梯度 38 | for i in range(self.N): 39 | prob[i, int(y_data[i])] += 1 40 | grad = -1.0 / self.N * prob.T @ X_data + self.C * self.w # 梯度, 第二项为衰减项 41 | self.w -= self.alpha * grad 42 | return 43 | 44 | def fit(self, X_data, y_data): 45 | X_data = self.init_param(X_data, y_data) 46 | self.bgd(X_data, y_data) 47 | return 48 | 49 | def predict(self, X): 50 | b = np.ones((X.shape[0], 1)) 51 | X = np.hstack((X, b)) # 附加偏置项 52 | prob = np.exp(X @ self.w.T) 53 | return np.argmax(prob, axis=1) 54 | 55 | 56 | if __name__ == '__main__': 57 | from sklearn.datasets import load_digits 58 | 59 | data = load_digits() 60 | X_data = data['data'] 61 | y_data = data['target'] 62 | 63 | from machine_learning_algorithm.cross_validation import validate 64 | 65 | g = validate(X_data, y_data, ratio=0.2) 66 | for item in g: 67 | X_train, y_train, X_test, y_test = item 68 | clf = SoftMax(maxstep=10000, alpha=0.1, C=1e-4) 69 | clf.fit(X_train, y_train) 70 | y_pred = clf.predict(X_test) 71 | score = 0 72 | for y, y_pred in zip(y_test, y_pred): 73 
| score += 1 if y == y_pred else 0 74 | print(score / len(y_test)) 75 | -------------------------------------------------------------------------------- /cluster/kmeans.py: -------------------------------------------------------------------------------- 1 | """ 2 | K均值聚类算法 3 | 给定初始簇的个数,迭代更改样本与簇的隶属关系,更新簇的中心为样本的均值 4 | """ 5 | from collections import defaultdict 6 | import numpy as np 7 | import copy 8 | 9 | 10 | class KMEANS: 11 | def __init__(self, n_cluster, epsilon=1e-3, maxstep=2000): 12 | self.n_cluster = n_cluster 13 | self.epsilon = epsilon 14 | self.maxstep = maxstep 15 | self.N = None 16 | self.centers = None 17 | self.cluster = defaultdict(list) 18 | 19 | def init_param(self, data): 20 | # 初始化参数, 包括初始化簇中心 21 | self.N = data.shape[0] 22 | random_ind = np.random.choice(self.N, size=self.n_cluster) 23 | self.centers = [data[i] for i in random_ind] # list存储中心点坐标数组 24 | for ind, p in enumerate(data): 25 | self.cluster[self.mark(p)].append(ind) 26 | return 27 | 28 | def _cal_dist(self, center, p): 29 | # 计算点到簇中心的距离平方 30 | return sum([(i - j) ** 2 for i, j in zip(center, p)]) 31 | 32 | def mark(self, p): 33 | # 计算样本点到每个簇中心的距离,选取最小的簇 34 | dists = [] 35 | for center in self.centers: 36 | dists.append(self._cal_dist(center, p)) 37 | return dists.index(min(dists)) 38 | 39 | def update_center(self, data): 40 | # 更新簇的中心坐标 41 | for label, inds in self.cluster.items(): 42 | self.centers[label] = np.mean(data[inds], axis=0) 43 | return 44 | 45 | def divide(self, data): 46 | # 重新对样本聚类 47 | tmp_cluster = copy.deepcopy(self.cluster) # 迭代过程中,字典长度不能发生改变,故deepcopy 48 | for label, inds in tmp_cluster.items(): 49 | for i in inds: 50 | new_label = self.mark(data[i]) 51 | if new_label == label: # 若类标记不变,跳过 52 | continue 53 | else: 54 | self.cluster[label].remove(i) 55 | self.cluster[new_label].append(i) 56 | return 57 | 58 | def cal_err(self, data): 59 | # 计算MSE 60 | mse = 0 61 | for label, inds in self.cluster.items(): 62 | partial_data = data[inds] 63 | for p in partial_data: 64 | mse += self._cal_dist(self.centers[label], p) 65 | return mse / self.N 66 | 67 | def fit(self, data): 68 | self.init_param(data) 69 | step = 0 70 | while step < self.maxstep: 71 | step += 1 72 | self.update_center(data) 73 | self.divide(data) 74 | err = self.cal_err(data) 75 | if err < self.epsilon: 76 | break 77 | return 78 | 79 | 80 | if __name__ == '__main__': 81 | from sklearn.datasets import make_blobs 82 | from itertools import cycle 83 | import matplotlib.pyplot as plt 84 | 85 | data, label = make_blobs(centers=4, cluster_std=0.5) 86 | km = KMEANS(4) 87 | km.fit(data) 88 | cluster = km.cluster 89 | centers = np.array(km.centers) 90 | 91 | def visualize(data, cluster, centers): 92 | color = 'bgrym' 93 | for col, inds in zip(cycle(color), cluster.values()): 94 | partial_data = data[inds] 95 | plt.scatter(partial_data[:, 0], partial_data[:, 1], color=col) 96 | plt.scatter(centers[:, 0], centers[:, 1], color='k', marker='*', s=100) 97 | plt.show() 98 | return 99 | 100 | 101 | visualize(data, cluster, centers) 102 | -------------------------------------------------------------------------------- /outlier_detection/isolation_forest.py: -------------------------------------------------------------------------------- 1 | # 实现IsolationForest高维数据的异常值检测算法 2 | import numpy as np 3 | import math 4 | from collections import Counter 5 | 6 | 7 | class Node: 8 | def __init__(self, val=None, right=None, left=None): 9 | self.val = val # 存储样本索引,仅叶节点 10 | self.right = right 11 | self.left = left 12 | 13 | 14 | class RandomTree: 15 | def 
__init__(self): 16 | self.tree = None 17 | self.n_feas = None 18 | 19 | def get_split(self, data, inds): 20 | # 随机构建切分点 21 | f = np.random.choice(self.n_feas) # 随机选择一个特征 22 | up = max(data[inds, f]) 23 | down = min(data[inds, f]) 24 | v = (up - down) * np.random.sample() + down # 在该特征的最大与最小值间随机选择一个数 25 | return f, v 26 | 27 | def split(self, data, inds): 28 | # 切分数据集 29 | f, v = self.get_split(data, inds) 30 | left_ind = [] 31 | right_ind = [] 32 | for i in inds: 33 | if data[i, f] <= v: 34 | left_ind.append(i) 35 | else: 36 | right_ind.append(i) 37 | return left_ind, right_ind 38 | 39 | def buildTree(self, data, inds): 40 | if len(inds) < 3: # 叶节点 41 | return Node(val=inds) 42 | left_ind, right_ind = self.split(data, inds) 43 | left = self.buildTree(data, left_ind) 44 | right = self.buildTree(data, right_ind) 45 | return Node(left=left, right=right) 46 | 47 | def fit(self, data): 48 | self.n_feas = data.shape[1] 49 | inds = np.arange(data.shape[0]) 50 | self.tree = self.buildTree(data, inds) 51 | return 52 | 53 | def traverse(self): 54 | # 遍历树,统计每个样本的路径长 55 | path_len = Counter() 56 | i = -1 57 | 58 | def helper(currentNode): 59 | nonlocal i 60 | i += 1 61 | if currentNode.val is not None: 62 | for ind in currentNode.val: 63 | path_len[ind] = i 64 | return 65 | for child in [currentNode.left, currentNode.right]: 66 | helper(child) 67 | i -= 1 68 | return 69 | 70 | helper(self.tree) 71 | return path_len 72 | 73 | 74 | class IsolationForest: 75 | def __init__(self, n_tree, epsilon): 76 | self.n_tree = n_tree 77 | self.epsilon = epsilon # 异常点比例 78 | self.scores = Counter() 79 | 80 | def fit_predict(self, data): 81 | for _ in range(self.n_tree): 82 | RT = RandomTree() 83 | RT.fit(data) 84 | path_len = RT.traverse() 85 | self.scores = self.scores + path_len 86 | 87 | n_sample = data.shape[0] 88 | phi = 2 * math.log(n_sample - 1) - 2 * (n_sample - 1) / n_sample 89 | for key, val in self.scores.items(): 90 | self.scores[key] = 2 ** -(val / self.n_tree / phi) # 归一化 91 | q = np.quantile(list(self.scores.values()), 1 - self.epsilon) 92 | outliers = [key for key, val in self.scores.items() if val > q] 93 | return outliers 94 | 95 | 96 | if __name__ == '__main__': 97 | np.random.seed(42) 98 | X_inliers = 0.3 * np.random.randn(100, 2) 99 | X_inliers = np.r_[X_inliers + 2, X_inliers - 2] 100 | X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) 101 | data = np.r_[X_inliers, X_outliers] 102 | 103 | IF = IsolationForest(100, 0.1) 104 | out_ind = IF.fit_predict(data) 105 | outliers = data[out_ind] 106 | 107 | import matplotlib.pyplot as plt 108 | 109 | plt.scatter(data[:, 0], data[:, 1], color='b') 110 | plt.scatter(outliers[:, 0], outliers[:, 1], color='r') 111 | plt.show() 112 | -------------------------------------------------------------------------------- /classification_regression/random_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | 随机森林算法,组合算法bagging(装袋)的一种 3 | """ 4 | from collections import defaultdict 5 | import numpy as np 6 | import math 7 | from cart_clf import CART_CLF 8 | 9 | 10 | class RandomForest: 11 | def __init__(self, n_tree=6, n_fea=None, ri_rc=True, L=None, epsilon=1e-3, min_sample=1): 12 | self.n_tree = n_tree 13 | self.n_fea = n_fea # 每棵树中特征的数量 14 | self.ri_rc = ri_rc # 判定特征的选择选用RI还是RC, 特征比较少时使用RC 15 | self.L = L # 选择RC时,进行线性组合的特征个数 16 | self.tree_list = [] # 随机森林中子树的list 17 | 18 | self.epsilon = epsilon 19 | self.min_sample = min_sample # 叶节点含有的最少样本数 20 | 21 | self.D = None # 输入数据维度 22 | self.N = None 23 | 24 | 
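    # Note on the two feature-sampling modes used below (Breiman's terminology):
    #   Forest-RI (ri_rc=True):  each tree is grown on a random subset of n_fea of the
    #                            D original input features.
    #   Forest-RC (ri_rc=False): each tree is grown on n_fea new features, each a random
    #                            linear combination of L original features; useful when D
    #                            is small and plain feature subsetting gives little diversity.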
def init_param(self, X_data): 25 | # 初始化参数 26 | self.D = X_data.shape[1] 27 | self.N = X_data.shape[0] 28 | if self.n_fea is None: 29 | self.n_fea = int(math.log2(self.D) + 1) # 默认选择特征的个数 30 | return 31 | 32 | def extract_fea(self): 33 | # 从原数据中抽取特征(RI)或线性组合构建新特征(RC) 34 | if self.ri_rc: 35 | if self.n_fea > self.D: 36 | raise ValueError('the number of features should be lower than dimention of data while RI is chosen') 37 | fea_arr = np.random.choice(self.D, self.n_fea, replace=False) 38 | else: 39 | fea_arr = np.zeros((self.n_fea, self.D)) 40 | for i in range(self.n_fea): 41 | out_fea = np.random.choice(self.D, self.L, replace=False) 42 | coeff = np.random.uniform(-1, 1, self.D) # [-1,1]上的均匀分布来产生每个特征前的系数 43 | coeff[out_fea] = 0 44 | fea_arr[i] = coeff 45 | return fea_arr 46 | 47 | def extract_data(self, X_data, y_data): 48 | # 从原数据中有放回的抽取样本,构成每个决策树的自助样本集 49 | fea_arr = self.extract_fea() # col_index or coeffs 50 | inds = np.unique(np.random.choice(self.N, self.N)) # row_index, 有放回抽取样本 51 | sub_X = X_data[inds] 52 | sub_y = y_data[inds] 53 | if self.ri_rc: 54 | sub_X = sub_X[:, fea_arr] 55 | else: 56 | sub_X = sub_X @ fea_arr.T 57 | return sub_X, sub_y, fea_arr 58 | 59 | def fit(self, X_data, y_data): 60 | # 训练主函数 61 | self.init_param(X_data) 62 | for i in range(self.n_tree): 63 | sub_X, sub_y, fea_arr = self.extract_data(X_data, y_data) 64 | subtree = CART_CLF(epsilon=self.epsilon, min_sample=self.min_sample) 65 | subtree.fit(sub_X, sub_y) 66 | self.tree_list.append((subtree, fea_arr)) # 保存训练后的树及其选用的特征,以便后续预测时使用 67 | return 68 | 69 | def predict(self, X): 70 | # 预测,多数表决 71 | res = defaultdict(int) # 存储每个类得到的票数 72 | for item in self.tree_list: 73 | subtree, fea_arr = item 74 | if self.ri_rc: 75 | X_modify = X[fea_arr] 76 | else: 77 | X_modify = (np.array([X]) @ fea_arr.T)[0] 78 | label = subtree.predict(X_modify) 79 | res[label] += 1 80 | return max(res, key=res.get) 81 | 82 | 83 | if __name__ == '__main__': 84 | from sklearn.datasets import load_iris 85 | 86 | data = load_iris() 87 | X_data = data['data'] 88 | y_data = data['target'] 89 | from machine_learning_algorithm.cross_validation import validate 90 | 91 | g = validate(X_data, y_data, ratio=0.2) 92 | for item in g: 93 | X_train, y_train, X_test, y_test = item 94 | RF = RandomForest(n_tree=50, n_fea=2, ri_rc=True) 95 | RF.fit(X_train, y_train) 96 | score = 0 97 | for X, y in zip(X_test, y_test): 98 | if RF.predict(X) == y: 99 | score += 1 100 | print(score / len(y_test)) 101 | -------------------------------------------------------------------------------- /classification_regression/adaboost.py: -------------------------------------------------------------------------------- 1 | """ 2 | boosting(提升)算法的一种 3 | """ 4 | 5 | import math 6 | from collections import defaultdict 7 | 8 | import numpy as np 9 | 10 | 11 | class AdaBoost: 12 | def __init__(self, epsilon=0.0): 13 | self.epsilon = epsilon # 分类误差率阈值 14 | self.w = None # 样本的权值,每加入一个基本分类器都要重新计算 15 | self.N = None 16 | self.g_list = [] # 弱分类器, 本程序暂设弱分类器由xnu产生,故用(0,nu)或(1,nu)表示 17 | self.alpha = [] # 基本分类器前面的系数 18 | self.base_list = [] # 基本分类器 19 | 20 | def init_param(self, X_data): 21 | # 初始化参数,包括权值和所有可能的弱分类器 22 | self.N = X_data.shape[0] 23 | self.w = np.ones(self.N) / self.N # 初始化权值 24 | for i in range(1, self.N): # 构建可能的弱分类器集合 25 | nu = (X_data[i][0] + X_data[i - 1][0]) / 2 26 | self.g_list.append((0, nu)) # 对应x nu[1]) or (nu[0] == 1 and X[0] <= nu[1]): 34 | val = -1 35 | return val 36 | 37 | def get_base(self, X_data, y_data): 38 | # 挑选出最佳的弱分类器作为基本分类器, 
即获取使分类误差率最小的数据集切分点(基于上一轮更新的权重) 39 | g_err = defaultdict(float) # 每个弱分类器对应的分类误差率 40 | 41 | for g in self.g_list: 42 | for i in range(self.N): 43 | if self.cal_weak_val(g, X_data[i]) != y_data[i]: 44 | g_err[g] += self.w[i] # 误差等于错误分类样本的权值之和,即sum{1*w} 45 | 46 | best_g = min(g_err, key=g_err.get) 47 | return best_g, g_err[best_g] 48 | 49 | def cal_alpha(self, err): 50 | # 计算基本分类器前的系数 51 | return 1.0 / 2 * math.log((1 - err) / err) 52 | 53 | def cal_weight(self, X_data, y_data, base, alpha): 54 | # 基于新加入的基本分类器,迭代更新每个样本权重 55 | for i in range(self.N): 56 | self.w[i] *= math.exp(-alpha * y_data[i] * self.cal_weak_val(base, X_data[i])) 57 | self.w = self.w / np.sum(self.w) 58 | return 59 | 60 | def _fx(self, X): 61 | # 基于当前的组合分类器,计算预测值 62 | s = 0 63 | for alpha, base in zip(self.alpha, self.base_list): 64 | s += alpha * self.cal_weak_val(base, X) 65 | return np.sign(s) 66 | 67 | def fit(self, X_data, y_data): 68 | # 构建最终的强分类器, 暂设输入维度为1 69 | self.init_param(X_data) 70 | 71 | while True: # 逐步添加基本分类器 72 | base, err = self.get_base(X_data, y_data) 73 | alpha = self.cal_alpha(err) 74 | self.cal_weight(X_data, y_data, base, alpha) # 更新样本权值 75 | self.alpha.append(alpha) 76 | self.base_list.append(base) 77 | 78 | s = 0 79 | for X, y in zip(X_data, y_data): 80 | if self._fx(X) != y: 81 | s += 1 82 | if s / self.N <= self.epsilon: # 分类错误数目占比小于等于epsilon, 停止训练 83 | print('the err ratio is {0}'.format(s / self.N)) 84 | break 85 | return 86 | 87 | def predict(self, X): 88 | # 预测 89 | return self._fx(X) 90 | 91 | 92 | if __name__ == '__main__': 93 | X_data_raw = np.linspace(-50, 50, 100) 94 | np.random.shuffle(X_data_raw) 95 | y_data = np.sign(X_data_raw) 96 | X_data = np.transpose([X_data_raw]) 97 | from machine_learning_algorithm.cross_validation import validate 98 | 99 | g = validate(X_data, y_data) 100 | for item in g: 101 | X_train, y_train, X_test, y_test = item 102 | AB = AdaBoost(epsilon=0.02) 103 | AB.fit(X_train, y_train) 104 | score = 0 105 | for X, y in zip(X_test, y_test): 106 | if AB.predict(X) == y: 107 | score += 1 108 | print(score / len(y_test)) -------------------------------------------------------------------------------- /classification_regression/cart_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | CART+最小二乘法构建CART回归树 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | class node: 9 | def __init__(self, fea=-1, val=None, res=None, right=None, left=None): 10 | self.fea = fea 11 | self.val = val 12 | self.res = res 13 | self.right = right 14 | self.left = left 15 | 16 | 17 | class CART_REG: 18 | def __init__(self, epsilon=0.1, min_sample=10): 19 | self.epsilon = epsilon 20 | self.min_sample = min_sample 21 | self.tree = None 22 | 23 | def err(self, y_data): 24 | # 子数据集的输出变量y与均值的差的平方和 25 | return y_data.var() * y_data.shape[0] 26 | 27 | def leaf(self, y_data): 28 | # 叶节点取值,为子数据集输出y的均值 29 | return y_data.mean() 30 | 31 | def split(self, fea, val, X_data): 32 | # 根据某个特征,以及特征下的某个取值,将数据集进行切分 33 | set1_inds = np.where(X_data[:, fea] <= val)[0] 34 | set2_inds = list(set(range(X_data.shape[0]))-set(set1_inds)) 35 | return set1_inds, set2_inds 36 | 37 | def getBestSplit(self, X_data, y_data): 38 | # 求最优切分点 39 | best_err = self.err(y_data) 40 | best_split = None 41 | subsets_inds = None 42 | for fea in range(X_data.shape[1]): 43 | for val in X_data[:, fea]: 44 | set1_inds, set2_inds = self.split(fea, val, X_data) 45 | if len(set1_inds) < 2 or len(set2_inds) < 2: # 若切分后某个子集大小不足2,则不切分 46 | continue 47 | now_err = self.err(y_data[set1_inds]) + 
self.err(y_data[set2_inds]) 48 | if now_err < best_err: 49 | best_err = now_err 50 | best_split = (fea, val) 51 | subsets_inds = (set1_inds, set2_inds) 52 | return best_err, best_split, subsets_inds 53 | 54 | def buildTree(self, X_data, y_data): 55 | # 递归构建二叉树 56 | if y_data.shape[0] < self.min_sample: 57 | return node(res=self.leaf(y_data)) 58 | best_err, best_split, subsets_inds = self.getBestSplit(X_data, y_data) 59 | if subsets_inds is None: 60 | return node(res=self.leaf(y_data)) 61 | if best_err < self.epsilon: 62 | return node(res=self.leaf(y_data)) 63 | else: 64 | left = self.buildTree(X_data[subsets_inds[0]], y_data[subsets_inds[0]]) 65 | right = self.buildTree(X_data[subsets_inds[1]], y_data[subsets_inds[1]]) 66 | return node(fea=best_split[0], val=best_split[1], right=right, left=left) 67 | 68 | def fit(self, X_data, y_data): 69 | self.tree = self.buildTree(X_data, y_data) 70 | return 71 | 72 | def predict(self, x): 73 | # 对输入变量进行预测 74 | def helper(x, tree): 75 | if tree.res is not None: 76 | return tree.res 77 | else: 78 | if x[tree.fea] <= tree.val: 79 | branch = tree.left 80 | else: 81 | branch = tree.right 82 | return helper(x, branch) 83 | 84 | return helper(x, self.tree) 85 | 86 | 87 | if __name__ == '__main__': 88 | import matplotlib.pyplot as plt 89 | 90 | X_data_raw = np.linspace(-3, 3, 50) 91 | np.random.shuffle(X_data_raw) 92 | y_data = np.sin(X_data_raw) 93 | X_data = np.transpose([X_data_raw]) 94 | y_data = y_data + 0.1 * np.random.randn(y_data.shape[0]) 95 | clf = CART_REG(epsilon=1e-4, min_sample=1) 96 | clf.fit(X_data, y_data) 97 | res = [] 98 | for i in range(X_data.shape[0]): 99 | res.append(clf.predict(X_data[i])) 100 | p1 = plt.scatter(X_data_raw, y_data) 101 | p2 = plt.scatter(X_data_raw, res, marker='*') 102 | plt.legend([p1,p2],['real','pred'],loc='upper left') 103 | plt.show() 104 | -------------------------------------------------------------------------------- /cluster/label_propagation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 标签传播聚类算法, 典型的半监督学习算法 3 | 核心思想:相似的数据应该具有相同的标签,构建节点间的相似性矩阵(边的权重) 4 | """ 5 | import numpy as np 6 | 7 | 8 | class LablePropagation: 9 | def __init__(self, epsilon=1e-3, maxstep=500, kernel_option='rbf', sigma=1.0, k=10): 10 | self.epsilon = epsilon 11 | self.maxstep = maxstep 12 | self.kernel_option = kernel_option 13 | self.sigma = sigma # rbf 核函数的参数 14 | self.k = k # knn 核函数参数 15 | 16 | self.T = None # 未标记点间的转换矩阵 17 | self.Y = None # 标签数矩阵 18 | self.Y_clamp = None # 已知标签数据点的标签矩阵 19 | self.N = None 20 | self.labeled_inds = None # 已知标签样本的索引 21 | self.labels = None 22 | 23 | def init_param(self, X_data, y_data): 24 | # 初始化参数 25 | self.N = X_data.shape[0] 26 | self.labeled_inds = np.where(y_data >= 0)[0] # 未知标签设为-1 27 | n_class = len(np.unique(y_data[self.labeled_inds])) 28 | 29 | self.Y = np.zeros((self.N, n_class)) 30 | for i in self.labeled_inds: 31 | self.Y[i][int(y_data[i])] = 1.0 # 哑编码,对应标签设为1 32 | 33 | self.Y_clamp = self.Y[self.labeled_inds] # n*l 34 | self.T = self.cal_tran_mat(X_data) # n*n 35 | return 36 | 37 | def cal_dis2(self, node1, node2): 38 | # 计算节点间的欧式距离平方 39 | return (node1 - node2) @ (node1 - node2) 40 | 41 | def cal_tran_mat(self, data): 42 | # 计算转换矩阵, 即构建图 43 | dis_mat = np.zeros((self.N, self.N)) 44 | for i in range(self.N): 45 | for j in range(i + 1, self.N): 46 | dis_mat[i, j] = self.cal_dis2(data[i], data[j]) 47 | dis_mat[j, i] = dis_mat[i, j] 48 | 49 | if self.kernel_option == 'rbf': 50 | assert (self.sigma is not None) 51 | T = np.exp(-dis_mat / 
self.sigma ** 2) 52 | normalizer = T.sum(axis=0) 53 | T = T / normalizer 54 | elif self.kernel_option == 'knn': 55 | assert (self.k is not None) 56 | T = np.zeros((self.N, self.N)) 57 | for i in range(self.N): 58 | inds = np.argpartition(dis_mat[i], self.k + 1)[:self.k + 1] 59 | T[i][inds] = 1.0 / self.k # 最近的k个拥有相同的权重 60 | T[i][i] = 0 61 | else: 62 | raise ValueError('kernel is not supported') 63 | return T 64 | 65 | def fit(self, X_data, y_data): 66 | # 训练主函数 67 | self.init_param(X_data, y_data) 68 | step = 0 69 | while step < self.maxstep: 70 | step += 1 71 | new_Y = self.T @ self.Y # 更新标签矩阵 72 | new_Y[self.labeled_inds] = self.Y_clamp # clamp 73 | if np.abs(new_Y - self.Y).sum() < self.epsilon: 74 | break 75 | self.Y = new_Y 76 | self.labels = np.argmax(self.Y, axis=1) 77 | return 78 | 79 | 80 | if __name__ == '__main__': 81 | from sklearn.datasets import make_circles 82 | 83 | n_samples = 100 84 | X, y = make_circles(n_samples=n_samples, shuffle=False) 85 | outer, inner = 0, 1 86 | labels = -np.ones(n_samples) 87 | labels[0] = outer 88 | labels[-1] = inner 89 | LPA = LablePropagation(maxstep=1000, kernel_option='knn', k=2, sigma=0.07) 90 | LPA.fit(X, labels) 91 | labels = LPA.labels 92 | 93 | import matplotlib.pyplot as plt 94 | 95 | 96 | def visualize(data, labels): 97 | color = 'bg' 98 | unique_label = np.unique(labels) 99 | for col, label in zip(color, unique_label): 100 | partial_data = data[np.where(labels == label)] 101 | plt.scatter(partial_data[:, 0], partial_data[:, 1], color=col, alpha=1) 102 | plt.scatter(data[0, 0], data[0, 1], color='b', marker='*', s=200, alpha=0.5) # outer 103 | plt.scatter(data[-1, 0], data[-1, 1], color='g', marker='*', s=200, alpha=0.5) # inner 104 | plt.show() 105 | return 106 | 107 | 108 | visualize(X, labels) 109 | -------------------------------------------------------------------------------- /classification_regression/boosting_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | 提升树:基于二叉回归树的提升算法 3 | 程序暂考虑输入为一维的情况 4 | """ 5 | from collections import defaultdict 6 | import numpy as np 7 | 8 | 9 | class BoostingTree: 10 | def __init__(self, epsilon=1e-2): 11 | self.epsilon = epsilon 12 | self.cand_splits = [] # 候选切分点 13 | self.split_index = defaultdict(tuple) # 由于要多次切分数据集,故预先存储,切分后数据点的索引 14 | self.split_list = [] # 最终各个基本回归树的切分点 15 | self.c1_list = [] # 切分点左区域取值 16 | self.c2_list = [] # 切分点右区域取值 17 | self.N = None 18 | self.n_split = None 19 | 20 | def init_param(self, X_data): 21 | # 初始化参数 22 | self.N = X_data.shape[0] 23 | for i in range(1, self.N): 24 | self.cand_splits.append((X_data[i][0] + X_data[i - 1][0]) / 2) 25 | self.n_split = len(self.cand_splits) 26 | for split in self.cand_splits: 27 | left_index = np.where(X_data[:, 0]<= split)[0] 28 | right_index = list(set(range(self.N))-set(left_index)) 29 | self.split_index[split] = (left_index, right_index) 30 | return 31 | 32 | def _cal_err(self, split, y_res): 33 | # 计算每个切分点的误差 34 | inds = self.split_index[split] 35 | left = y_res[inds[0]] 36 | right = y_res[inds[1]] 37 | 38 | c1 = np.sum(left) / len(left) 39 | c2 = np.sum(right) / len(right) 40 | y_res_left = left - c1 41 | y_res_right = right - c2 42 | res = np.hstack([y_res_left, y_res_right]) 43 | res_square = np.apply_along_axis(lambda x: x ** 2, 0, res).sum() 44 | return res_square, c1, c2 45 | 46 | def best_split(self,y_res): 47 | # 获取最佳切分点,并返回对应的残差 48 | best_split = self.cand_splits[0] 49 | min_res_square, best_c1, best_c2 = self._cal_err(best_split, y_res) 50 | 51 | for i in range(1, 
self.n_split): 52 | res_square, c1, c2 = self._cal_err(self.cand_splits[i], y_res) 53 | if res_square < min_res_square: 54 | best_split = self.cand_splits[i] 55 | min_res_square = res_square 56 | best_c1 = c1 57 | best_c2 = c2 58 | 59 | self.split_list.append(best_split) 60 | self.c1_list.append(best_c1) 61 | self.c2_list.append(best_c2) 62 | return 63 | 64 | def _fx(self, X): 65 | # 基于当前组合树,预测X的输出值 66 | s = 0 67 | for split, c1, c2 in zip(self.split_list, self.c1_list, self.c2_list): 68 | if X < split: 69 | s += c1 70 | else: 71 | s += c2 72 | return s 73 | 74 | def update_y(self, X_data, y_data): 75 | # 每添加一颗回归树,就要更新y,即基于当前组合回归树的预测残差 76 | y_res = [] 77 | for X, y in zip(X_data, y_data): 78 | y_res.append(y - self._fx(X[0])) 79 | y_res = np.array(y_res) 80 | res_square = np.apply_along_axis(lambda x: x ** 2, 0, y_res).sum() 81 | return y_res, res_square 82 | 83 | def fit(self, X_data, y_data): 84 | self.init_param(X_data) 85 | y_res = y_data 86 | while True: 87 | self.best_split(y_res) 88 | y_res, res_square = self.update_y(X_data, y_data) 89 | if res_square < self.epsilon: 90 | break 91 | return 92 | 93 | def predict(self, X): 94 | return self._fx(X) 95 | 96 | 97 | if __name__ == '__main__': 98 | # data = np.array( 99 | # [[1, 5.56], [2, 5.70], [3, 5.91], [4, 6.40], [5, 6.80], [6, 7.05], [7, 8.90], [8, 8.70], [9, 9.00], [10, 9.05]]) 100 | # X_data = data[:, :-1] 101 | # y_data = data[:, -1] 102 | # BT = BoostingTree(epsilon=0.18) 103 | # BT.fit(X_data, y_data) 104 | # print(BT.split_list, BT.c1_list, BT.c2_list) 105 | X_data_raw = np.linspace(-5, 5, 100) 106 | X_data = np.transpose([X_data_raw]) 107 | y_data = np.sin(X_data_raw) 108 | BT = BoostingTree(epsilon=0.1) 109 | BT.fit(X_data, y_data) 110 | y_pred = [BT.predict(X) for X in X_data] 111 | 112 | import matplotlib.pyplot as plt 113 | 114 | p1 = plt.scatter(X_data_raw, y_data, color='r') 115 | p2 = plt.scatter(X_data_raw, y_pred, color='b') 116 | plt.legend([p1, p2], ['real', 'pred']) 117 | plt.show() 118 | -------------------------------------------------------------------------------- /cluster/dbscan.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict, deque 3 | import math 4 | 5 | 6 | class DBSCAN: 7 | def __init__(self): 8 | self.eps = None 9 | self.minpts = None 10 | self.N = None 11 | self.data = None 12 | self.pair_dis = None 13 | self.labels = None 14 | self.cluster_labels = None 15 | 16 | def init_param(self, data): 17 | # 初始化参数 18 | self.data = data 19 | self.N = data.shape[0] 20 | self.pair_dis = self.cal_pair_dis() 21 | self.cluster_labels = np.zeros(self.data.shape[0]) # 将所有点类标记设为 0 22 | self.cal_pair_dis() 23 | return 24 | 25 | def _cal_dis(self, p1, p2): 26 | dis = 0 27 | for i, j in zip(p1, p2): 28 | dis += (i - j) ** 2 29 | return math.sqrt(dis) 30 | 31 | def cal_pair_dis(self): 32 | # 获取每对点之间的距离 33 | pair_dis = np.zeros((self.N, self.N)) 34 | for i in range(self.N): 35 | for j in range(i + 1, self.N): 36 | pair_dis[i, j] = self._cal_dis(self.data[i], self.data[j]) 37 | pair_dis[j, i] = pair_dis[i, j] 38 | return pair_dis 39 | 40 | def _cal_k_dis(self, k): 41 | # 计算每个点的k距离 42 | kdis = [] 43 | for i in range(self.N): 44 | dis_arr = self.pair_dis[i, :] 45 | inds_sort = np.argsort(dis_arr) 46 | kdis.append(dis_arr[inds_sort[k + 1]]) 47 | return kdis 48 | 49 | def graph2param(self, minpts): 50 | # 画出k距离图,决定参数eps, minpts 51 | kdis = self._cal_k_dis(minpts) 52 | kdis = sorted(kdis) 53 | plt.plot(kdis) 54 | plt.show() 55 | return 
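    # Reading the k-distance plot above: sort the k-distances and look for the "elbow"
    # where the curve turns sharply upward; a reasonable eps is the k-distance at that
    # elbow, with minpts set to the k used for the plot. A rough programmatic stand-in
    # for eyeballing the plot (illustrative only, assumes a single clear elbow):
    #   kdis = np.sort(self._cal_k_dis(minpts))
    #   eps = kdis[np.argmax(np.diff(kdis, 2)) + 1]  # point of largest second difference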
56 | 57 | def cal_eps_neighbors(self): 58 | # 计算某个点eps内距离点的集合 59 | if self.eps is None: 60 | raise ValueError('the eps is not set') 61 | neighs = defaultdict(list) 62 | for i in range(self.N): 63 | for ind, dis in enumerate(self.pair_dis[i]): 64 | if dis <= self.eps and i != ind: 65 | neighs[i].append(ind) 66 | return neighs 67 | 68 | def mark_core(self, neighs): 69 | # 标记核心点 70 | if self.minpts is None: 71 | raise ValueError('the minpts is not set') 72 | core_points = [] 73 | for key, val in neighs.items(): 74 | if len(val) >= self.minpts: # 近邻点数大于minpts则为核心点 75 | core_points.append(key) 76 | return core_points 77 | 78 | def fit(self): 79 | # 训练,对每个样本进行判别 80 | neighs = self.cal_eps_neighbors() 81 | core_points = self.mark_core(neighs) 82 | cluster_label = 0 83 | q = deque() 84 | for p in core_points: 85 | if not self.cluster_labels[p]: # 若该核心点未被标记,则建立新簇, 簇标记加 1 86 | q.append(p) 87 | cluster_label += 1 88 | # 以当前核心点为出发点, 采用广度优先算法进行簇的扩展, 直到队列为空,则停止此次扩展 89 | while len(q) > 0: 90 | p = q.pop() 91 | self.cluster_labels[p] = cluster_label 92 | for n in neighs[p]: 93 | if not self.cluster_labels[n]: # 邻域内的点未归类,则加入该簇 94 | self.cluster_labels[n] = cluster_label 95 | if n in core_points: # 若邻域内存在未标记的核心点,则依据该核心点继续扩展簇(一定要未标记,否则造成死循环) 96 | q.appendleft(n) 97 | 98 | return 99 | 100 | 101 | if __name__ == '__main__': 102 | import matplotlib.pyplot as plt 103 | from itertools import cycle 104 | from sklearn.datasets import make_blobs 105 | 106 | data, label = make_blobs(centers=5, cluster_std=1.5, random_state=5) 107 | # plt.scatter(data[:, 0], data[:, 1]) 108 | # plt.show() 109 | db = DBSCAN() 110 | db.init_param(data) 111 | # db.graph2param(5) 112 | db.eps = 2.0 113 | db.minpts = 5 114 | db.fit() 115 | 116 | 117 | def visualize(data, cluster_labels): 118 | cluster = defaultdict(list) 119 | for ind, label in enumerate(cluster_labels): 120 | cluster[label].append(ind) 121 | color = 'bgrym' 122 | for col, label in zip(cycle(color), cluster.keys()): 123 | if label == 0: 124 | col = 'k' 125 | partial_data = data[cluster[label]] 126 | plt.scatter(partial_data[:, 0], partial_data[:, 1], color=col) 127 | plt.show() 128 | return 129 | 130 | 131 | visualize(data, db.cluster_labels) 132 | -------------------------------------------------------------------------------- /cluster/meanshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | meanshift聚类算法 3 | 核心思想: 4 | 寻找核密度极值点并作为簇的质心,然后根据最近邻原则将样本点赋予质心 5 | """ 6 | from collections import defaultdict 7 | import numpy as np 8 | import math 9 | 10 | 11 | class MeanShift: 12 | def __init__(self, epsilon=1e-5, band_width=2, min_fre=3, bin_seeding=False): 13 | self.epsilon = epsilon 14 | self.band_width = band_width 15 | self.min_fre = min_fre # 可以作为起始质心的球体内最少的样本数目 16 | self.bin_seeding = bin_seeding 17 | self.radius2 = self.band_width ** 2 # 高维球体半径的平方 18 | 19 | self.N = None 20 | self.labels = None 21 | self.centers = [] 22 | self.center_score = [] 23 | 24 | def init_param(self, data): 25 | # 初始化参数 26 | self.N = data.shape[0] 27 | self.labels = -1 * np.ones(self.N) 28 | return 29 | 30 | def get_seeds(self, data): 31 | # 获取可以作为起始质心的点(seed) 32 | if self.bin_seeding: 33 | binsize = self.band_width 34 | else: 35 | binsize = 1 36 | seed_list = [] 37 | seeds_fre = defaultdict(int) 38 | for sample in data: 39 | seed = tuple(np.round(sample / binsize)) # 将数据粗粒化,以防止非常近的样本点都作为起始质心 40 | seeds_fre[seed] += 1 41 | for seed, fre in seeds_fre.items(): 42 | if fre >= self.min_fre: 43 | seed_list.append(np.array(seed)) 44 | if not seed_list: 45 
| raise ValueError('the bin size and min_fre are not proper') 46 | return seed_list 47 | 48 | def euclidean_dis2(self, center, sample): 49 | # 计算均值点到每个样本点的欧式距离(平方) 50 | delta = center - sample 51 | return delta @ delta 52 | 53 | def gaussian_kel(self, dis2): 54 | # 计算高斯核 55 | return 1.0 / self.band_width * (2 * math.pi) ** (-1.0 / 2) * math.exp(-dis2 / (2 * self.band_width ** 2)) 56 | 57 | def shift_center(self, current_center, data, tmp_center_score): 58 | # 计算下一个漂移的坐标 59 | denominator = 0 # 分母 60 | numerator = np.zeros_like(current_center) # 分子, 一维数组形式 61 | for ind, sample in enumerate(data): 62 | dis2 = self.euclidean_dis2(current_center, sample) 63 | if dis2 <= self.radius2: 64 | tmp_center_score += 1 65 | d = self.gaussian_kel(dis2) 66 | denominator += d 67 | numerator += d * sample 68 | return numerator / denominator 69 | 70 | def classify(self, data): 71 | # 根据最近邻将数据分类到最近的簇中 72 | center_arr = np.array(self.centers) 73 | for i in range(self.N): 74 | delta = center_arr - data[i] 75 | dis2 = np.sum(delta * delta, axis=1) 76 | self.labels[i] = np.argmin(dis2) 77 | return 78 | 79 | def fit(self, data): 80 | # 训练主函数 81 | self.init_param(data) 82 | seed_list = self.get_seeds(data) 83 | for seed in seed_list: 84 | current_center = seed 85 | tmp_center_score = 0 86 | # 进行一次独立的均值漂移 87 | while True: 88 | next_center = self.shift_center(current_center, data, tmp_center_score) 89 | delta_dis = np.linalg.norm(next_center - current_center, 2) 90 | if delta_dis < self.epsilon: 91 | break 92 | current_center = next_center 93 | # 若该次漂移结束后,最终的质心与已存在的质心距离小于带宽,则合并 94 | for i in range(len(self.centers)): 95 | if np.linalg.norm(current_center - self.centers[i], 2) < self.band_width: 96 | if tmp_center_score > self.center_score[i]: 97 | self.centers[i] = current_center 98 | self.center_score[i] = tmp_center_score 99 | break 100 | else: 101 | self.centers.append(current_center) 102 | self.center_score.append(tmp_center_score) 103 | self.classify(data) 104 | return 105 | 106 | 107 | if __name__ == '__main__': 108 | from sklearn.datasets import make_blobs 109 | 110 | data, label = make_blobs(n_samples=500, centers=5, cluster_std=1.2, random_state=5) 111 | MS = MeanShift(bin_seeding=True) 112 | MS.fit(data) 113 | labels = MS.labels 114 | import matplotlib.pyplot as plt 115 | from itertools import cycle 116 | 117 | 118 | def visualize(data, labels): 119 | color = 'bgrym' 120 | unique_label = np.unique(labels) 121 | for col, label in zip(cycle(color), unique_label): 122 | partial_data = data[np.where(labels == label)] 123 | plt.scatter(partial_data[:, 0], partial_data[:, 1], color=col) 124 | plt.show() 125 | return 126 | 127 | 128 | visualize(data, labels) 129 | -------------------------------------------------------------------------------- /cluster/spectural_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 谱聚类算法 3 | 核心思想:构建样本点的图,切分图,使得子图内权重最大,子图间权重最小 4 | """ 5 | import numpy as np 6 | from kmeans import KMEANS 7 | 8 | 9 | class Spectrum: 10 | def __init__(self, n_cluster, epsilon=1e-3, maxstep=1000, method='unnormalized', 11 | criterion='gaussian', gamma=2.0, dis_epsilon=70, k=5): 12 | self.n_cluster = n_cluster 13 | self.epsilon = epsilon 14 | self.maxstep = maxstep 15 | self.method = method # 本程序提供规范化以及非规范化的谱聚类算法 16 | self.criterion = criterion # 相似性矩阵的构建方法 17 | self.gamma = gamma # 高斯方法中的sigma参数 18 | self.dis_epsilon = dis_epsilon # epsilon-近邻方法的参数 19 | self.k = k # k近邻方法的参数 20 | 21 | self.W = None # 图的相似性矩阵 22 | self.L = None # 图的拉普拉斯矩阵 23 | 
self.L_norm = None # 规范化后的拉普拉斯矩阵 24 | self.D = None # 图的度矩阵 25 | self.cluster = None 26 | 27 | self.N = None 28 | 29 | def init_param(self, data): 30 | # 初始化参数 31 | self.N = data.shape[0] 32 | dis_mat = self.cal_dis_mat(data) 33 | self.cal_weight_mat(dis_mat) 34 | self.D = np.diag(self.W.sum(axis=1)) 35 | self.L = self.D - self.W 36 | return 37 | 38 | def cal_dis_mat(self, data): 39 | # 计算距离平方的矩阵 40 | dis_mat = np.zeros((self.N, self.N)) 41 | for i in range(self.N): 42 | for j in range(i + 1, self.N): 43 | dis_mat[i, j] = (data[i] - data[j]) @ (data[i] - data[j]) 44 | dis_mat[j, i] = dis_mat[i, j] 45 | return dis_mat 46 | 47 | def cal_weight_mat(self, dis_mat): 48 | # 计算相似性矩阵 49 | if self.criterion == 'gaussian': # 适合于较小样本集 50 | if self.gamma is None: 51 | raise ValueError('gamma is not set') 52 | self.W = np.exp(-self.gamma * dis_mat) 53 | elif self.criterion == 'k_nearest': # 适合于较大样本集 54 | if self.k is None or self.gamma is None: 55 | raise ValueError('k or gamma is not set') 56 | self.W = np.zeros((self.N, self.N)) 57 | for i in range(self.N): 58 | inds = np.argpartition(dis_mat[i], self.k + 1)[:self.k + 1] # 由于包括自身,所以+1 59 | tmp_w = np.exp(-self.gamma * dis_mat[i][inds]) 60 | self.W[i][inds] = tmp_w 61 | elif self.criterion == 'eps_nearest': # 适合于较大样本集 62 | if self.dis_epsilon is None: 63 | raise ValueError('epsilon is not set') 64 | self.W = np.zeros((self.N, self.N)) 65 | for i in range(self.N): 66 | inds = np.where(dis_mat[i] < self.dis_epsilon) 67 | self.W[i][inds] = 1.0 / len(inds) 68 | else: 69 | raise ValueError('the criterion is not supported') 70 | return 71 | 72 | def fit(self, data): 73 | # 训练主函数 74 | self.init_param(data) 75 | if self.method == 'unnormalized': 76 | w, v = np.linalg.eig(self.L) 77 | inds = np.argsort(w)[:self.n_cluster] 78 | Vectors = v[:, inds] 79 | elif self.method == 'normalized': 80 | D = np.linalg.inv(np.sqrt(self.D)) 81 | L = D @ self.L @ D 82 | w, v = np.linalg.eig(L) 83 | inds = np.argsort(w)[:self.n_cluster] 84 | Vectors = v[:, inds] 85 | normalizer = np.linalg.norm(Vectors, axis=1) 86 | normalizer = np.repeat(np.transpose([normalizer]), self.n_cluster, axis=1) 87 | Vectors = Vectors / normalizer 88 | else: 89 | raise ValueError('the method is not supported') 90 | km = KMEANS(self.n_cluster, self.epsilon, self.maxstep) 91 | km.fit(Vectors) 92 | self.cluster = km.cluster 93 | return 94 | 95 | 96 | if __name__ == '__main__': 97 | from sklearn.datasets import make_blobs 98 | from itertools import cycle 99 | import matplotlib.pyplot as plt 100 | 101 | data, label = make_blobs(centers=3, n_features=10, cluster_std=1.2, n_samples=500, random_state=1) 102 | sp = Spectrum(n_cluster=3, method='unnormalized', criterion='gaussian', gamma=0.1) 103 | sp.fit(data) 104 | cluster = sp.cluster 105 | 106 | # km = KMEANS(4) 107 | # km.fit(data) 108 | # cluster_km = km.cluster 109 | 110 | # def visualize(data, cluster): 111 | # color = 'bgrym' 112 | # for col, inds in zip(cycle(color), cluster.values()): 113 | # partial_data = data[inds] 114 | # plt.scatter(partial_data[:, 0], partial_data[:, 1], color=col) 115 | # plt.show() 116 | # return 117 | 118 | # visualize(data, cluster) 119 | 120 | def cal_err(data, cluster): 121 | # 计算MSE 122 | mse = 0 123 | for label, inds in cluster.items(): 124 | partial_data = data[inds] 125 | center = partial_data.mean(axis=0) 126 | for p in partial_data: 127 | mse += (center - p) @ (center - p) 128 | return mse / data.shape[0] 129 | 130 | print(cal_err(data, cluster)) 131 | # print(cal_err(data, cluster_km)) 
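A standalone check of the property the spectral method relies on: if the similarity graph W splits into c connected components, the unnormalized Laplacian L = D - W has eigenvalue 0 with multiplicity c, and the eigenvectors of its smallest eigenvalues separate the components, which is the embedding that Spectrum.fit hands to KMEANS. A minimal sketch, not part of the repository, assuming only numpy:

import numpy as np

# two disconnected cliques: {0, 1, 2} and {3, 4}
W = np.zeros((5, 5))
for group in [(0, 1, 2), (3, 4)]:
    for i in group:
        for j in group:
            if i != j:
                W[i, j] = 1.0
D = np.diag(W.sum(axis=1))
L = D - W
eigvals, eigvecs = np.linalg.eigh(L)   # L is symmetric, so eigh is appropriate
print(np.round(eigvals, 6))            # two (near-)zero eigenvalues -> two components
print(np.round(eigvecs[:, :2], 3))     # rows of this n*2 embedding cluster by component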
-------------------------------------------------------------------------------- /cluster/gmm_em.py: -------------------------------------------------------------------------------- 1 | """ 2 | 高斯混合模型+EM算法 3 | 以alpha(k)的概率选择第k个高斯模型,再以该高斯模型概率分布产生数据。其中alpha(k)就是隐变量 4 | """ 5 | import numpy as np 6 | import math 7 | import copy 8 | from collections import defaultdict 9 | from sklearn.cluster import KMeans 10 | 11 | 12 | class GEM: 13 | def __init__(self, maxstep=1000, epsilon=1e-3, K=4): 14 | self.maxstep = maxstep 15 | self.epsilon = epsilon 16 | self.K = K # 混合模型中的分模型的个数 17 | 18 | self.alpha = None # 每个分模型前系数 19 | self.mu = None # 每个分模型的均值向量 20 | self.sigma = None # 每个分模型的协方差 21 | self.gamma_all_final = None # 存储最终的每个样本对分模型的响应度,用于最终的聚类 22 | 23 | self.D = None # 输入数据的维度 24 | self.N = None # 输入数据总量 25 | 26 | def inin_param(self, data): 27 | # 初始化参数 28 | self.D = data.shape[1] 29 | self.N = data.shape[0] 30 | self.init_param_helper(data) 31 | return 32 | 33 | def init_param_helper(self, data): 34 | # KMeans初始化模型参数 35 | KMEANS = KMeans(n_clusters=self.K).fit(data) 36 | clusters = defaultdict(list) 37 | for ind, label in enumerate(KMEANS.labels_): 38 | clusters[label].append(ind) 39 | mu = [] 40 | alpha = [] 41 | sigma = [] 42 | for inds in clusters.values(): 43 | partial_data = data[inds] 44 | mu.append(partial_data.mean(axis=0)) # 分模型的均值向量 45 | alpha.append(len(inds) / self.N) # 权重 46 | sigma.append(np.cov(partial_data.T)) # 协方差,D个维度间的协方差 47 | self.mu = np.array(mu) 48 | self.alpha = np.array(alpha) 49 | self.sigma = np.array(sigma) 50 | return 51 | 52 | def _phi(self, y, mu, sigma): 53 | # 获取分模型的概率 54 | s1 = 1.0 / math.sqrt(np.linalg.det(sigma)) 55 | s2 = np.linalg.inv(sigma) # d*d 56 | delta = np.array([y - mu]) # 1*d 57 | return s1 * math.exp(-1.0 / 2 * delta @ s2 @ delta.T) 58 | 59 | def fit(self, data): 60 | # 迭代训练 61 | self.inin_param(data) 62 | step = 0 63 | gamma_all_arr = None 64 | while step < self.maxstep: 65 | step += 1 66 | old_alpha = copy.copy(self.alpha) 67 | # E步 68 | gamma_all = [] 69 | for j in range(self.N): 70 | gamma_j = [] # 依次求每个样本对K个分模型的响应度 71 | 72 | for k in range(self.K): 73 | gamma_j.append(self.alpha[k] * self._phi(data[j], self.mu[k], self.sigma[k])) 74 | 75 | s = sum(gamma_j) 76 | gamma_j = [item/s for item in gamma_j] 77 | gamma_all.append(gamma_j) 78 | 79 | gamma_all_arr = np.array(gamma_all) 80 | # M步 81 | for k in range(self.K): 82 | gamma_k = gamma_all_arr[:, k] 83 | SUM = np.sum(gamma_k) 84 | # 更新权重 85 | self.alpha[k] = SUM / self.N # 更新权重 86 | # 更新均值向量 87 | new_mu = sum([gamma * y for gamma, y in zip(gamma_k, data)]) / SUM # 1*d 88 | self.mu[k] = new_mu 89 | # 更新协方差阵 90 | delta_ = data - new_mu # n*d 91 | self.sigma[k] = sum([gamma * (np.outer(np.transpose([delta]), delta)) for gamma, delta in zip(gamma_k, delta_)]) / SUM # d*d 92 | alpha_delta = self.alpha - old_alpha 93 | if np.linalg.norm(alpha_delta, 1) < self.epsilon: 94 | break 95 | self.gamma_all_final = gamma_all_arr 96 | return 97 | 98 | def predict(self): 99 | cluster = defaultdict(list) 100 | for j in range(self.N): 101 | max_ind = np.argmax(self.gamma_all_final[j]) 102 | cluster[max_ind].append(j) 103 | return cluster 104 | 105 | if __name__ == '__main__': 106 | 107 | def generate_data(N=500): 108 | X = np.zeros((N, 2)) # N*2, 初始化X 109 | mu = np.array([[5, 35], [20, 40], [20, 35], [45, 15]]) 110 | sigma = np.array([[30, 0], [0, 25]]) 111 | for i in range(N): # alpha_list=[0.3, 0.2, 0.3, 0.2] 112 | prob = np.random.random(1) 113 | if prob < 0.1: # 生成0-1之间随机数 114 | X[i, :] = 
np.random.multivariate_normal(mu[0], sigma, 1) # 用第一个高斯模型生成2维数据 115 | elif 0.1 <= prob < 0.3: 116 | X[i, :] = np.random.multivariate_normal(mu[1], sigma, 1) # 用第二个高斯模型生成2维数据 117 | elif 0.3 <= prob < 0.6: 118 | X[i, :] = np.random.multivariate_normal(mu[2], sigma, 1) # 用第三个高斯模型生成2维数据 119 | else: 120 | X[i, :] = np.random.multivariate_normal(mu[3], sigma, 1) # 用第四个高斯模型生成2维数据 121 | return X 122 | 123 | 124 | data = generate_data() 125 | gem = GEM() 126 | gem.fit(data) 127 | # print(gem.alpha, '\n', gem.sigma, '\n', gem.mu) 128 | cluster = gem.predict() 129 | 130 | import matplotlib.pyplot as plt 131 | from itertools import cycle 132 | colors = cycle('grbk') 133 | for color, inds in zip(colors, cluster.values()): 134 | partial_data = data[inds] 135 | plt.scatter(partial_data[:,0], partial_data[:, 1], edgecolors=color) 136 | plt.show() -------------------------------------------------------------------------------- /classification_regression/cart_clf.py: -------------------------------------------------------------------------------- 1 | """ 2 | CART分类树,是一颗二叉树,以某个特征以及该特征对应的一个值为节点,故相对ID3算法,最大的不同就是特征可以使用多次 3 | """ 4 | from collections import Counter, defaultdict 5 | 6 | import numpy as np 7 | 8 | 9 | class node: 10 | def __init__(self, fea=-1, val=None, res=None, right=None, left=None): 11 | self.fea = fea # 特征 12 | self.val = val # 特征对应的值 13 | self.res = res # 叶节点标记 14 | self.right = right 15 | self.left = left 16 | 17 | 18 | class CART_CLF: 19 | def __init__(self, epsilon=1e-3, min_sample=1): 20 | self.epsilon = epsilon 21 | self.min_sample = min_sample # 叶节点含有的最少样本数 22 | self.tree = None 23 | 24 | def getGini(self, y_data): 25 | # 计算基尼指数 26 | c = Counter(y_data) 27 | return 1 - sum([(val / y_data.shape[0]) ** 2 for val in c.values()]) 28 | 29 | def getFeaGini(self, set1, set2): 30 | # 计算某个特征及相应的某个特征值组成的切分节点的基尼指数 31 | num = set1.shape[0] + set2.shape[0] 32 | return set1.shape[0] / num * self.getGini(set1) + set2.shape[0] / num * self.getGini(set2) 33 | 34 | def bestSplit(self, splits_set, X_data, y_data): 35 | # 返回所有切分点的基尼指数,以字典形式存储。键为split,是一个元组,第一个元素为最优切分特征,第二个为该特征对应的最优切分值 36 | pre_gini = self.getGini(y_data) 37 | subdata_inds = defaultdict(list) # 切分点以及相应的样本点的索引 38 | for split in splits_set: 39 | for ind, sample in enumerate(X_data): 40 | if sample[split[0]] == split[1]: 41 | subdata_inds[split].append(ind) 42 | min_gini = 1 43 | best_split = None 44 | best_set = None 45 | for split, data_ind in subdata_inds.items(): 46 | set1 = y_data[data_ind] # 满足切分点的条件,则为左子树 47 | set2_inds = list(set(range(y_data.shape[0])) - set(data_ind)) 48 | set2 = y_data[set2_inds] 49 | if set1.shape[0] < 1 or set2.shape[0] < 1: 50 | continue 51 | now_gini = self.getFeaGini(set1, set2) 52 | if now_gini < min_gini: 53 | min_gini = now_gini 54 | best_split = split 55 | best_set = (data_ind, set2_inds) 56 | if abs(pre_gini - min_gini) < self.epsilon: # 若切分后基尼指数下降未超过阈值则停止切分 57 | best_split = None 58 | return best_split, best_set, min_gini 59 | 60 | def buildTree(self, splits_set, X_data, y_data): 61 | if y_data.shape[0] < self.min_sample: # 数据集小于阈值直接设为叶节点 62 | return node(res=Counter(y_data).most_common(1)[0][0]) 63 | best_split, best_set, min_gini = self.bestSplit(splits_set, X_data, y_data) 64 | if best_split is None: # 基尼指数下降小于阈值,则终止切分,设为叶节点 65 | return node(res=Counter(y_data).most_common(1)[0][0]) 66 | else: 67 | splits_set.remove(best_split) 68 | left = self.buildTree(splits_set, X_data[best_set[0]], y_data[best_set[0]]) 69 | right = self.buildTree(splits_set, X_data[best_set[1]], y_data[best_set[1]]) 70 | return 
node(fea=best_split[0], val=best_split[1], right=right, left=left) 71 | 72 | def fit(self, X_data, y_data): 73 | # 训练模型,CART分类树与ID3最大的不同是,CART建立的是二叉树,每个节点是特征及其对应的某个值组成的元组 74 | # 特征可以多次使用 75 | splits_set = [] 76 | for fea in range(X_data.shape[1]): 77 | unique_vals = np.unique(X_data[:, fea]) 78 | if unique_vals.shape[0] < 2: 79 | continue 80 | elif unique_vals.shape[0] == 2: # 若特征取值只有2个,则只有一个切分点,非此即彼 81 | splits_set.append((fea, unique_vals[0])) 82 | else: 83 | for val in unique_vals: 84 | splits_set.append((fea, val)) 85 | self.tree = self.buildTree(splits_set, X_data, y_data) 86 | return 87 | 88 | def predict(self, x): 89 | def helper(x, tree): 90 | if tree.res is not None: # 表明到达叶节点 91 | return tree.res 92 | else: 93 | if x[tree.fea] == tree.val: # "是" 返回左子树 94 | branch = tree.left 95 | else: 96 | branch = tree.right 97 | return helper(x, branch) 98 | 99 | return helper(x, self.tree) 100 | 101 | def disp_tree(self): 102 | # 打印树 103 | self.disp_helper(self.tree) 104 | return 105 | 106 | def disp_helper(self, current_node): 107 | # 前序遍历 108 | print(current_node.fea, current_node.val, current_node.res) 109 | if current_node.res is not None: 110 | return 111 | self.disp_helper(current_node.left) 112 | self.disp_helper(current_node.right) 113 | return 114 | 115 | 116 | if __name__ == '__main__': 117 | from sklearn.datasets import load_iris 118 | 119 | X_data = load_iris().data 120 | y_data = load_iris().target 121 | 122 | from machine_learning_algorithm.cross_validation import validate 123 | 124 | g = validate(X_data, y_data, ratio=0.2) 125 | for item in g: 126 | X_data_train, y_data_train, X_data_test, y_data_test = item 127 | clf = CART_CLF() 128 | clf.fit(X_data_train, y_data_train) 129 | score = 0 130 | for X, y in zip(X_data_test,y_data_test): 131 | if clf.predict(X) == y: 132 | score += 1 133 | print(score / len(y_data_test)) 134 | -------------------------------------------------------------------------------- /classification_regression/decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | ID3&C4.5决策树算法 3 | """ 4 | import math 5 | from collections import Counter, defaultdict 6 | 7 | import numpy as np 8 | 9 | 10 | class node: 11 | # 这里构建树的节点类,也可用字典来表示树结构 12 | def __init__(self, fea=-1, res=None, child=None): 13 | self.fea = fea 14 | self.res = res 15 | self.child = child # 特征的每个值对应一颗子树,特征值为键,相应子树为值 16 | 17 | 18 | class DecisionTree: 19 | def __init__(self, epsilon=1e-3, metric='C4.5'): 20 | self.epsilon = epsilon 21 | self.tree = None 22 | self.metric = metric 23 | 24 | def exp_ent(self, y_data): 25 | # 计算经验熵 26 | c = Counter(y_data) # 统计各个类标记的个数 27 | ent = 0 28 | N = len(y_data) 29 | for val in c.values(): 30 | p = val / N 31 | ent += -p * math.log2(p) 32 | return ent 33 | 34 | def con_ent(self, fea, X_data, y_data): 35 | # 计算条件熵并返回,同时返回切分后的各个子数据集 36 | fea_val_unique = Counter(X_data[:, fea]) 37 | subdata_inds = defaultdict(list) # 根据特征fea下的值切分数据集 38 | for ind, sample in enumerate(X_data): 39 | subdata_inds[sample[fea]].append(ind) # 挑选某个值对应的所有样本点的索引 40 | 41 | ent = 0 42 | N = len(y_data) 43 | for key, val in fea_val_unique.items(): 44 | pi = val / N 45 | ent += pi * self.exp_ent(y_data[subdata_inds[key]]) 46 | return ent, subdata_inds 47 | 48 | def infoGain(self, fea, X_data, y_data): 49 | # 计算信息增益 50 | exp_ent = self.exp_ent(y_data) 51 | con_ent, subdata_inds = self.con_ent(fea, X_data, y_data) 52 | return exp_ent - con_ent, subdata_inds 53 | 54 | def infoGainRatio(self, fea, X_data, y_data): 55 | # 计算信息增益比 56 | g, 
subdata_inds = self.infoGain(fea, X_data, y_data) 57 | N = len(y_data) 58 | split_info = 0 59 | for val in subdata_inds.values(): 60 | p = len(val) / N 61 | split_info -= p * math.log2(p) 62 | return g / split_info, subdata_inds 63 | 64 | def bestfea(self, fea_list, X_data, y_data): 65 | # 获取最优切分特征、相应的信息增益(比)以及切分后的子数据集 66 | score_func = self.infoGainRatio 67 | if self.metric == 'ID3': 68 | score_func = self.infoGain 69 | bestfea = fea_list[0] # 初始化最优特征 70 | gmax, bestsubdata_inds = score_func(bestfea, X_data, y_data) # 初始化最大信息增益及切分后的子数据集 71 | for fea in fea_list[1:]: 72 | g, subdata_inds = score_func(fea, X_data, y_data) 73 | if g > gmax: 74 | bestfea = fea 75 | bestsubdata_inds = subdata_inds 76 | gmax = g 77 | return gmax, bestfea, bestsubdata_inds 78 | 79 | def buildTree(self, fea_list, X_data, y_data): 80 | # 递归构建树 81 | label_unique = np.unique(y_data) 82 | if label_unique.shape[0] == 1: # 数据集只有一个类,直接返回该类 83 | return node(res=label_unique[0]) 84 | if not fea_list: 85 | return node(res=Counter(y_data).most_common(1)[0][0]) 86 | gmax, bestfea, bestsubdata_inds = self.bestfea(fea_list, X_data, y_data) 87 | if gmax < self.epsilon: # 信息增益比小于阈值,返回数据集中出现最多的类 88 | return node(res=Counter(y_data).most_common(1)[0][0]) 89 | else: 90 | fea_list.remove(bestfea) 91 | child = {} 92 | for key, val in bestsubdata_inds.items(): 93 | child[key] = self.buildTree(fea_list, X_data[val], y_data[val]) 94 | return node(fea=bestfea, child=child) 95 | 96 | def fit(self, X_data, y_data): 97 | fea_list = list(range(X_data.shape[1])) 98 | self.tree = self.buildTree(fea_list, X_data, y_data) 99 | return 100 | 101 | def predict(self, X): 102 | def helper(X, tree): 103 | if tree.res is not None: # 表明到达叶节点 104 | return tree.res 105 | else: 106 | try: 107 | sub_tree = tree.child[X[tree.fea]] 108 | return helper(X, sub_tree) # 根据对应特征下的值返回相应的子树 109 | except: 110 | print('input data is out of scope') 111 | 112 | return helper(X, self.tree) 113 | 114 | 115 | if __name__ == '__main__': 116 | data = np.array([['青年', '青年', '青年', '青年', '青年', '中年', '中年', 117 | '中年', '中年', '中年', '老年', '老年', '老年', '老年', '老年'], 118 | ['否', '否', '是', '是', '否', '否', '否', '是', '否', 119 | '否', '否', '否', '是', '是', '否'], 120 | ['否', '否', '否', '是', '否', '否', '否', '是', 121 | '是', '是', '是', '是', '否', '否', '否'], 122 | ['一般', '好', '好', '一般', '一般', '一般', '好', '好', 123 | '非常好', '非常好', '非常好', '好', '好', '非常好', '一般'], 124 | ['否', '否', '是', '是', '否', '否', '否', '是', '是', 125 | '是', '是', '是', '是', '是', '否']]) 126 | data = data.T 127 | X_data = data[:, :-1] 128 | y_data = data[:, -1] 129 | 130 | import time 131 | from machine_learning_algorithm.cross_validation import validate 132 | start = time.clock() 133 | 134 | g = validate(X_data, y_data, ratio=0.2) 135 | for item in g: 136 | X_data_train, y_data_train, X_data_test, y_data_test = item 137 | clf = DecisionTree() 138 | clf.fit(X_data_train, y_data_train) 139 | score = 0 140 | for X, y in zip(X_data_test,y_data_test): 141 | if clf.predict(X) == y: 142 | score += 1 143 | print(score / len(y_data_test)) 144 | print(time.clock() - start) -------------------------------------------------------------------------------- /association/apriori.py: -------------------------------------------------------------------------------- 1 | """ 2 | Apriori关联分析算法 3 | 核心思想:先验原理 4 | """ 5 | from collections import Counter, defaultdict 6 | 7 | 8 | class Apriori: 9 | def __init__(self, minsup, minconf): 10 | self.minsup = minsup 11 | self.minconf = minconf 12 | self.data = None 13 | self.N = None # 购物篮数据的总数 14 | self.D = None # 频繁项集的最大项个数 
15 | self.fre_list = [] # 频繁项集,[[[],[]],[[],[]]] 16 | self.sup_list = [] # 存储每个频繁项的支持度 17 | self.fre_dict = defaultdict(lambda: 0) # 键为频繁项集的tuple,值为支持度 18 | self.rules_dict = defaultdict(lambda: 0) # 规则,键为规则前件和规则后件的tuple, 值为置信度 19 | 20 | def init_param(self, data): 21 | # 根据传入的数据初始化参数 22 | self.data = sorted(data) 23 | self.N = len(data) 24 | self.D = 0 25 | 26 | item_counter = Counter() 27 | for itemset in data: 28 | if len(itemset) > self.D: 29 | self.D = len(itemset) 30 | item_counter += Counter(itemset) 31 | itemset = sorted(item_counter) # 保证有序 32 | c1 = [] 33 | sup_c1 = [] 34 | for item in itemset: 35 | sup = item_counter[item] / self.N 36 | if sup > self.minsup: 37 | c1.append([item]) 38 | sup_c1.append(sup) 39 | 40 | self.fre_list.append(c1) 41 | self.sup_list.append(sup_c1) 42 | return 43 | 44 | def apriori_fre_itemset(self): 45 | # 使用Apriori算法获取频繁项集 46 | for i in range(1, self.D): # 逐渐增加频繁项大小 47 | ck_1 = self.fre_list[i - 1] 48 | if len(ck_1) < 2: # 若k-1频繁项集不足两个,则跳出循环 49 | break 50 | cand_ck_set = self.ck_itemset(i, ck_1) 51 | 52 | sup_ck = [] 53 | ck = [] 54 | for item in cand_ck_set: # 计算ck的支持度 55 | sup = self.cal_sup(item) 56 | if sup > self.minsup: 57 | ck.append(item) 58 | sup_ck.append(sup) 59 | 60 | if len(ck) > 0: 61 | self.fre_list.append(ck) 62 | self.sup_list.append(sup_ck) 63 | 64 | for ck, sup_ck in zip(self.fre_list, self.sup_list): 65 | for itemset, sup in zip(ck, sup_ck): 66 | self.fre_dict[tuple(itemset)] = sup 67 | 68 | return 69 | 70 | def ck_itemset(self, ind, ck_1): 71 | # 根据k-1频繁项集产生k频繁项集, 产生候选然后减枝, 返回频繁项的list 72 | cand_ck_set = [] 73 | for i in range(len(ck_1)): # 合并两个k-1频繁项集 74 | cand_ck = ck_1[i] 75 | for j in range(i + 1, len(ck_1)): 76 | if ck_1[i][:ind - 1] == ck_1[j][:ind - 1]: # 若前k-2项相同则合并 77 | cand_ck.append(ck_1[j][-1]) # 合并形成频繁k项 78 | if self.prune(cand_ck, ck_1): # 检查其他k-1项集是否为频繁项集,进而减枝 79 | cand_ck_set.append(cand_ck.copy()) 80 | cand_ck.pop() 81 | return cand_ck_set 82 | 83 | def prune(self, cand_ck_item, ck_1): 84 | # 根据k-1频繁项集来对k频繁项是否频繁 85 | for item in cand_ck_item[:-2]: 86 | sub_item = cand_ck_item.copy() 87 | sub_item.remove(item) 88 | if sub_item not in ck_1: 89 | return False 90 | return True 91 | 92 | def cal_sup(self, item): 93 | # 支持度计数 94 | s = set(item) 95 | sup = 0 96 | for t in self.data: 97 | if s.issubset(t): 98 | sup += 1 99 | return sup / self.N 100 | 101 | def cal_conf(self, sxy, X): 102 | # 计算置信度, sxy为产生规则的频繁项集的支持度, X为规则前件 103 | return sxy / self.fre_dict[tuple(X)] 104 | 105 | def gen_rules(self): 106 | # 从频繁项集中提取规则 107 | for i in range(1, len(self.fre_list)): 108 | for ind, itemset in enumerate(self.fre_list[i]): 109 | cand_rules = [] # 由该频繁项集产生的规则的list, 记录规则前件 110 | sxy = self.sup_list[i][ind] 111 | for item in itemset: # 初始化后件为1个项的规则 112 | X = itemset.copy() 113 | X.remove(item) 114 | cand_rules.append(X) 115 | 116 | while len(cand_rules) > 0: 117 | itemset_rules = [] 118 | for X in cand_rules: 119 | conf = self.cal_conf(sxy, X) 120 | if conf > self.minconf: 121 | itemset_rules.append(X) 122 | Y = list(set(itemset) - set(X)) 123 | Y = sorted(Y) 124 | self.rules_dict[(tuple(X), tuple(Y))] = conf 125 | cand_rules = self.apriori_rules(itemset_rules) 126 | return 127 | 128 | def apriori_rules(self, itemset_rules): 129 | # 根据先验原理产生候选规则 130 | cand_rules = [] 131 | for i in range(len(itemset_rules)): 132 | for j in range(i + 1, len(itemset_rules)): 133 | X = list(set(itemset_rules[i]) & set(itemset_rules[j])) # 合并生成新的规则前件 134 | X = sorted(X) 135 | if X in cand_rules or len(X) < 1: # 若该规则前件已经产生或者为空则跳过 136 | continue 137 | 
cand_rules.append(X) 138 | return cand_rules 139 | 140 | def fit(self, data): 141 | self.init_param(data) 142 | self.apriori_fre_itemset() 143 | self.gen_rules() 144 | return 145 | 146 | 147 | if __name__ == '__main__': 148 | data = [['l1', 'l2', 'l5'], ['l2', 'l4'], ['l2', 'l3'], 149 | ['l1', 'l2', 'l4'], ['l1', 'l3'], ['l2', 'l3'], 150 | ['l1', 'l3'], ['l1', 'l2', 'l3', 'l5'], ['l1', 'l2', 'l3']] 151 | 152 | AP = Apriori(minsup=0.2, minconf=0.6) 153 | AP.fit(data) 154 | print(AP.rules_dict) 155 | -------------------------------------------------------------------------------- /recommendation/collaborative_filter.py: -------------------------------------------------------------------------------- 1 | """ 2 | 协同过滤算法 3 | """ 4 | from abc import ABCMeta, abstractmethod 5 | import numpy as np 6 | from collections import defaultdict 7 | 8 | 9 | class CF_base(metaclass=ABCMeta): 10 | def __init__(self, k=3): 11 | self.k = k 12 | self.n_user = None 13 | self.n_item = None 14 | 15 | @abstractmethod 16 | def init_param(self, data): 17 | pass 18 | 19 | @abstractmethod 20 | def cal_prediction(self, *args): 21 | pass 22 | 23 | @abstractmethod 24 | def cal_recommendation(self, user_id, data): 25 | pass 26 | 27 | def fit(self, data): 28 | # 计算所有用户的推荐物品 29 | self.init_param(data) 30 | all_users = [] 31 | for i in range(self.n_user): 32 | all_users.append(self.cal_recommendation(i, data)) 33 | return all_users 34 | 35 | 36 | class CF_knearest(CF_base): 37 | """ 38 | 基于物品的K近邻协同过滤推荐算法 39 | """ 40 | 41 | def __init__(self, k, criterion='cosine'): 42 | super(CF_knearest, self).__init__(k) 43 | self.criterion = criterion 44 | self.simi_mat = None 45 | return 46 | 47 | def init_param(self, data): 48 | # 初始化参数 49 | self.n_user = data.shape[0] 50 | self.n_item = data.shape[1] 51 | self.simi_mat = self.cal_simi_mat(data) 52 | return 53 | 54 | def cal_similarity(self, i, j, data): 55 | # 计算物品i和物品j的相似度 56 | items = data[:, [i, j]] 57 | del_inds = np.where(items == 0)[0] 58 | items = np.delete(items, del_inds, axis=0) 59 | if items.size == 0: 60 | similarity = 0 61 | else: 62 | v1 = items[:, 0] 63 | v2 = items[:, 1] 64 | if self.criterion == 'cosine': 65 | if np.std(v1) > 1e-3: # 方差过大,表明用户间评价尺度差别大需要进行调整 66 | v1 = v1 - v1.mean() 67 | if np.std(v2) > 1e-3: 68 | v2 = v2 - v2.mean() 69 | similarity = (v1 @ v2) / np.linalg.norm(v1, 2) / np.linalg.norm(v2, 2) 70 | elif self.criterion == 'pearson': 71 | similarity = np.corrcoef(v1, v2)[0, 1] 72 | else: 73 | raise ValueError('the method is not supported now') 74 | return similarity 75 | 76 | def cal_simi_mat(self, data): 77 | # 计算物品间的相似度矩阵 78 | simi_mat = np.ones((self.n_item, self.n_item)) 79 | for i in range(self.n_item): 80 | for j in range(i + 1, self.n_item): 81 | simi_mat[i, j] = self.cal_similarity(i, j, data) 82 | simi_mat[j, i] = simi_mat[i, j] 83 | return simi_mat 84 | 85 | def cal_prediction(self, user_row, item_ind): 86 | # 计算预推荐物品i对目标活跃用户u的吸引力 87 | purchase_item_inds = np.where(user_row > 0)[0] 88 | rates = user_row[purchase_item_inds] 89 | simi = self.simi_mat[item_ind][purchase_item_inds] 90 | return np.sum(rates * simi) / np.linalg.norm(simi, 1) 91 | 92 | def cal_recommendation(self, user_ind, data): 93 | # 计算目标用户的最具吸引力的k个物品list 94 | item_prediction = defaultdict(float) 95 | user_row = data[user_ind] 96 | un_purchase_item_inds = np.where(user_row == 0)[0] 97 | for item_ind in un_purchase_item_inds: 98 | item_prediction[item_ind] = self.cal_prediction(user_row, item_ind) 99 | res = sorted(item_prediction, key=item_prediction.get, reverse=True) 100 | return 
res[:self.k] 101 | 102 | 103 | class CF_svd(CF_base): 104 | """ 105 | 基于矩阵分解的协同过滤算法 106 | """ 107 | 108 | def __init__(self, k=3, r=3): 109 | super(CF_svd, self).__init__(k) 110 | self.r = r # 选取前k个奇异值 111 | self.uk = None # 用户的隐因子向量 112 | self.vk = None # 物品的隐因子向量 113 | return 114 | 115 | def init_param(self, data): 116 | # 初始化,预处理 117 | self.n_user = data.shape[0] 118 | self.n_item = data.shape[1] 119 | self.svd_simplify(data) 120 | return data 121 | 122 | def svd_simplify(self, data): 123 | # 奇异值分解以及简化 124 | u, s, v = np.linalg.svd(data) 125 | u, s, v = u[:, :self.r], s[:self.r], v[:self.r, :] # 简化 126 | sk = np.diag(np.sqrt(s)) # r*r 127 | self.uk = u @ sk # m*r 128 | self.vk = sk @ v # r*n 129 | return 130 | 131 | def cal_prediction(self, user_ind, item_ind, user_row): 132 | rate_ave = np.mean(user_row) # 用户已购物品的评价的平均值 133 | return rate_ave + self.uk[user_ind] @ self.vk[:, item_ind] # 两个隐因子向量的内积加上平均值就是最终的预测分值 134 | 135 | def cal_recommendation(self, user_ind, data): 136 | # 计算目标用户的最具吸引力的k个物品list 137 | item_prediction = defaultdict(float) 138 | user_row = data[user_ind] 139 | un_purchase_item_inds = np.where(user_row == 0)[0] 140 | for item_ind in un_purchase_item_inds: 141 | item_prediction[item_ind] = self.cal_prediction(user_ind, item_ind, user_row) 142 | res = sorted(item_prediction, key=item_prediction.get, reverse=True) 143 | return res[:self.k] 144 | 145 | 146 | if __name__ == '__main__': 147 | # data = np.array([[4, 3, 0, 5, 0], 148 | # [4, 0, 4, 4, 0], 149 | # [4, 0, 5, 0, 3], 150 | # [2, 3, 0, 1, 0], 151 | # [0, 4, 2, 0, 5]]) 152 | data = np.array([[3.5, 1.0, 0.0, 0.0, 0.0, 0.0], 153 | [2.5, 3.5, 3.0, 3.5, 2.5, 3.0], 154 | [3.0, 3.5, 1.5, 5.0, 3.0, 3.5], 155 | [2.5, 3.5, 0.0, 3.5, 4.0, 0.0], 156 | [3.5, 2.0, 4.5, 0.0, 3.5, 2.0], 157 | [3.0, 4.0, 2.0, 3.0, 3.0, 2.0], 158 | [4.5, 1.5, 3.0, 5.0, 3.5, 0.0]]) 159 | # cf = CF_svd(k=1, r=3) 160 | cf = CF_knearest(k=1) 161 | print(cf.fit(data)) 162 | -------------------------------------------------------------------------------- /classification_regression/maximum_entropy.py: -------------------------------------------------------------------------------- 1 | """ 2 | 最大熵模型: 采用IIS最优化算法(M为常数时等同于GIS算法) 3 | """ 4 | 5 | import math 6 | from collections import defaultdict 7 | 8 | import numpy as np 9 | 10 | 11 | class MaxEnt: 12 | def __init__(self, epsilon=1e-3, maxstep=100): 13 | self.epsilon = epsilon 14 | self.maxstep = maxstep 15 | self.w = None # 特征函数的权重 16 | self.labels = None # 标签 17 | self.fea_list = [] # 特征函数 18 | self.px = defaultdict(lambda: 0) # 经验边缘分布概率 19 | self.pxy = defaultdict(lambda: 0) # 经验联合分布概率,由于特征函数为取值为0,1的二值函数,所以等同于特征的经验期望值 20 | self.exp_fea = defaultdict(lambda: 0) # 每个特征在数据集上的期望 21 | self.data_list = [] # 样本集,元素为tuple((X),y) 22 | self.N = None # 样本总量 23 | self.M = None # 某个训练样本包含特征的总数,这里假设每个样本的M值相同,即M为常数。其倒数类似于学习率 24 | self.n_fea = None # 特征函数的总数 25 | 26 | def init_param(self, X_data, y_data): 27 | # 根据传入的数据集(数组)初始化模型参数 28 | self.N = X_data.shape[0] 29 | self.labels = np.unique(y_data) 30 | 31 | self.fea_func(X_data, y_data) 32 | self.n_fea = len(self.fea_list) 33 | self.w = np.zeros(self.n_fea) 34 | self._exp_fea(X_data, y_data) 35 | return 36 | 37 | def fea_func(self, X_data, y_data, rules=None): 38 | # 特征函数 39 | if rules is None: # 若没有特征提取规则,则直接构造特征,此时每个样本没有缺失值的情况下的特征个数相同,等于维度 40 | for X, y in zip(X_data, y_data): 41 | X = tuple(X) 42 | self.px[X] += 1.0 / self.N # X的经验边缘分布 43 | self.pxy[(X, y)] += 1.0 / self.N # X,y的经验联合分布 44 | 45 | for dimension, val in enumerate(X): 46 | key = (dimension, val, y) 47 | if 
not key in self.fea_list: 48 | self.fea_list.append(key) # 特征函数,由 维度+维度下的值+标签 构成的元组 49 | self.M = X_data.shape[1] 50 | else: 51 | self.M = defaultdict(int) # 字典存储每个样本的特征总数 52 | for i in range(self.N): 53 | self.M[i] = X_data.shape[1] 54 | pass # 根据具体规则构建 55 | 56 | def _exp_fea(self, X_data, y_data): 57 | # 计算特征的经验期望值 58 | for X, y in zip(X_data, y_data): 59 | for dimension, val in enumerate(X): 60 | fea = (dimension, val, y) 61 | self.exp_fea[fea] += self.pxy[(tuple(X), y)] # 特征存在取值为1,否则为0 62 | return 63 | 64 | def _py_X(self, X): 65 | # 当前w下的条件分布概率,输入向量X和y的条件概率 66 | py_X = defaultdict(float) 67 | 68 | for y in self.labels: 69 | s = 0 70 | for dimension, val in enumerate(X): 71 | tmp_fea = (dimension, val, y) 72 | if tmp_fea in self.fea_list: # 输入X包含的特征 73 | s += self.w[self.fea_list.index(tmp_fea)] 74 | py_X[y] = math.exp(s) 75 | 76 | normalizer = sum(py_X.values()) 77 | for key, val in py_X.items(): 78 | py_X[key] = val / normalizer 79 | return py_X 80 | 81 | def _est_fea(self, X_data, y_data): 82 | # 基于当前模型,获取每个特征估计期望 83 | est_fea = defaultdict(float) 84 | for X, y in zip(X_data, y_data): 85 | py_x = self._py_X(X)[y] 86 | for dimension, val in enumerate(X): 87 | est_fea[(dimension, val, y)] += self.px[tuple(X)] * py_x 88 | return est_fea 89 | 90 | def GIS(self): 91 | # GIS算法更新delta 92 | est_fea = self._est_fea(X_data, y_data) 93 | delta = np.zeros(self.n_fea) 94 | for j in range(self.n_fea): 95 | try: 96 | delta[j] = 1 / self.M * math.log(self.exp_fea[self.fea_list[j]] / est_fea[self.fea_list[j]]) 97 | except: 98 | continue 99 | delta = delta / delta.sum() # 归一化,防止某一个特征权重过大导致,后续计算超过范围 100 | return delta 101 | 102 | def IIS(self, delta, X_data, y_data): 103 | # IIS算法更新delta 104 | g = np.zeros(self.n_fea) 105 | g_diff = np.zeros(self.n_fea) 106 | for j in range(self.n_fea): 107 | for k in range(self.N): 108 | g[j] += self.px[tuple(X_data[k])] * self._py_X(X_data[k])[y_data[k]] * math.exp(delta[j] * self.M[k]) 109 | g_diff[j] += g[j] * self.M[k] 110 | g[j] -= self.exp_fea[j] 111 | delta[j] -= g[j] / g_diff[j] 112 | return delta 113 | 114 | def fit(self, X_data, y_data): 115 | # 训练,迭代更新wi 116 | self.init_param(X_data, y_data) 117 | if isinstance(self.M, int): 118 | i = 0 119 | while i < self.maxstep: 120 | i += 1 121 | delta = self.GIS() 122 | # if max(abs(delta)) < self.epsilon: # 所有的delta都小于阈值时,停止迭代 123 | # break 124 | self.w += delta 125 | else: 126 | i = 0 127 | delta = np.random.rand(self.n_fea) 128 | while i < self.maxstep: 129 | i += 1 130 | delta = self.IIS(delta, X_data, y_data) 131 | # if max(abs(delta)) < self.epsilon: 132 | # break 133 | self.w += delta 134 | return 135 | 136 | def predict(self, X): 137 | # 输入x(数组),返回条件概率最大的标签 138 | py_x = self._py_X(X) 139 | best_label = max(py_x, key=py_x.get) 140 | return best_label 141 | 142 | 143 | if __name__ == '__main__': 144 | from sklearn.datasets import load_iris, load_digits 145 | 146 | data = load_iris() 147 | 148 | X_data = data['data'] 149 | y_data = data['target'] 150 | 151 | from machine_learning_algorithm.cross_validation import validate 152 | 153 | g = validate(X_data, y_data, ratio=0.2) 154 | for item in g: 155 | X_train, y_train, X_test, y_test = item 156 | ME = MaxEnt(maxstep=10) 157 | ME.fit(X_train, y_train) 158 | score = 0 159 | for X, y in zip(X_test, y_test): 160 | if ME.predict(X) == y: 161 | score += 1 162 | print(score / len(y_test)) 163 | -------------------------------------------------------------------------------- /classification_regression/xgboost.py: 
-------------------------------------------------------------------------------- 1 | # 实现XGBoost回归, 以MSE损失函数为例 2 | import numpy as np 3 | 4 | 5 | class Node: 6 | def __init__(self, sp=None, left=None, right=None, w=None): 7 | self.sp = sp # 非叶节点的切分,特征以及对应的特征下的值组成的元组 8 | self.left = left 9 | self.right = right 10 | self.w = w # 叶节点权重,也即叶节点输出值 11 | 12 | def isLeaf(self): 13 | return self.w 14 | 15 | 16 | class Tree: 17 | def __init__(self, _gamma, _lambda, max_depth): 18 | self._gamma = _gamma # 正则化项中T前面的系数 19 | self._lambda = _lambda # 正则化项w前面的系数 20 | self.max_depth = max_depth 21 | self.root = None 22 | 23 | def _candSplits(self, X_data): 24 | # 计算候选切分点 25 | splits = [] 26 | for fea in range(X_data.shape[1]): 27 | for val in X_data[fea]: 28 | splits.append((fea, val)) 29 | return splits 30 | 31 | def split(self, X_data, sp): 32 | # 劈裂数据集,返回左右子数据集索引 33 | lind = np.where(X_data[:, sp[0]] <= sp[1])[0] 34 | rind = list(set(range(X_data.shape[0])) - set(lind)) 35 | return lind, rind 36 | 37 | def calWeight(self, garr, harr): 38 | # 计算叶节点权重,也即位于该节点上的样本预测值 39 | return - sum(garr) / (sum(harr) + self._lambda) 40 | 41 | def calObj(self, garr, harr): 42 | # 计算某个叶节点的目标(损失)函数值 43 | return (-1.0 / 2) * sum(garr) ** 2 / (sum(harr) + self._lambda) + self._gamma 44 | 45 | def getBestSplit(self, X_data, garr, harr, splits): 46 | # 搜索最优切分点 47 | if not splits: 48 | return None 49 | else: 50 | bestSplit = None 51 | maxScore = -float('inf') 52 | score_pre = self.calObj(garr, harr) 53 | subinds = None 54 | for sp in splits: 55 | lind, rind = self.split(X_data, sp) 56 | if len(rind) < 2 or len(lind) < 2: 57 | continue 58 | gl = garr[lind] 59 | gr = garr[rind] 60 | hl = harr[lind] 61 | hr = harr[rind] 62 | score = score_pre - self.calObj(gl, hl) - self.calObj(gr, hr) # 切分后目标函数值下降量 63 | if score > maxScore: 64 | maxScore = score 65 | bestSplit = sp 66 | subinds = (lind, rind) 67 | if maxScore < 0: # pre-stopping 68 | return None 69 | else: 70 | return bestSplit, subinds 71 | 72 | def buildTree(self, X_data, garr, harr, splits, depth): 73 | # 递归构建树 74 | res = self.getBestSplit(X_data, garr, harr, splits) 75 | depth += 1 76 | if not res or depth >= self.max_depth: 77 | return Node(w=self.calWeight(garr, harr)) 78 | bestSplit, subinds = res 79 | splits.remove(bestSplit) 80 | left = self.buildTree(X_data[subinds[0]], garr[subinds[0]], harr[subinds[0]], splits, depth) 81 | right = self.buildTree(X_data[subinds[1]], garr[subinds[1]], harr[subinds[1]], splits, depth) 82 | return Node(sp=bestSplit, right=right, left=left) 83 | 84 | def fit(self, X_data, garr, harr): 85 | splits = self._candSplits(X_data) 86 | self.root = self.buildTree(X_data, garr, harr, splits, 0) 87 | 88 | def predict(self, x): 89 | def helper(currentNode): 90 | if currentNode.isLeaf(): 91 | return currentNode.w 92 | fea, val = currentNode.sp 93 | if x[fea] <= val: 94 | return helper(currentNode.left) 95 | else: 96 | return helper(currentNode.right) 97 | 98 | return helper(self.root) 99 | 100 | def _display(self): 101 | def helper(currentNode): 102 | if currentNode.isLeaf(): 103 | print(currentNode.w) 104 | else: 105 | print(currentNode.sp) 106 | if currentNode.left: 107 | helper(currentNode.left) 108 | if currentNode.right: 109 | helper(currentNode.right) 110 | 111 | helper(self.root) 112 | 113 | 114 | class Forest: 115 | def __init__(self, n_iter, _gamma, _lambda, max_depth, eta=1.0): 116 | self.n_iter = n_iter # 迭代次数,即基本树的个数 117 | self._gamma = _gamma 118 | self._lambda = _lambda 119 | self.max_depth = max_depth # 单颗基本树最大深度 120 | self.eta = eta # 
收缩系数, 默认1.0,即不收缩 121 | self.trees = [] 122 | 123 | def calGrad(self, y_pred, y_data): 124 | # 计算一阶导数 125 | return 2 * (y_pred - y_data) 126 | 127 | def calHess(self, y_pred, y_data): 128 | # 计算二阶导数 129 | return 2 * np.ones_like(y_data) 130 | 131 | def fit(self, X_data, y_data): 132 | step = 0 133 | while step < self.n_iter: 134 | tree = Tree(self._gamma, self._lambda, self.max_depth) 135 | y_pred = self.predict(X_data) 136 | garr, harr = self.calGrad(y_pred, y_data), self.calHess(y_pred, y_data) 137 | tree.fit(X_data, garr, harr) 138 | self.trees.append(tree) 139 | step += 1 140 | 141 | def predict(self, X_data): 142 | if self.trees: 143 | y_pred = [] 144 | for x in X_data: 145 | y_pred.append(self.eta * sum([tree.predict(x) for tree in self.trees])) 146 | return np.array(y_pred) 147 | else: 148 | return np.zeros(X_data.shape[0]) 149 | 150 | 151 | if __name__ == '__main__': 152 | from sklearn.datasets import load_boston 153 | from sklearn.model_selection import train_test_split 154 | from sklearn.metrics import mean_absolute_error 155 | import matplotlib.pyplot as plt 156 | 157 | boston = load_boston() 158 | y = boston['target'] 159 | X = boston['data'] 160 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 161 | f = Forest(50, 0, 1.0, 4, eta=0.8) 162 | f.fit(X_train, y_train) 163 | y_pred = f.predict(X_test) 164 | print(mean_absolute_error(y_test, y_pred)) 165 | plt.scatter(np.arange(y_pred.shape[0]), y_test - y_pred) 166 | plt.show() 167 | -------------------------------------------------------------------------------- /classification_regression/svm_wss3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 支持向量机 3 | 学习方法:不等式约束的最优化问题,求解凸二次规划的最优化算法, 本程序采用序列最小最优化算法(SOM) 4 | """ 5 | import numpy as np 6 | import math 7 | 8 | 9 | class SVM: 10 | def __init__(self, epsilon=1e-5, maxstep=500, C=1.0, kernel_option=True, gamma=None): 11 | self.epsilon = epsilon 12 | self.maxstep = maxstep 13 | self.C = C 14 | self.kernel_option = kernel_option # 是否选择核函数 15 | self.gamma = gamma # 高斯核参数 16 | 17 | self.kernel_arr = None # n*n 存储核内积 18 | self.Q = None # n*n yi*yj*K(i,j) 19 | self.grad = None # 1*n 存储每个alpha的梯度,相对于对偶问题的最优化函数而言 20 | self.X = None # 训练数据集 21 | self.y = None # 类标记值,是计算w,b的参数,故存入模型中 22 | self.alpha_arr = None # 1*n 存储拉格朗日乘子, 每个样本对应一个拉格朗日乘子 23 | self.b = 0 # 阈值b, 初始化为0 24 | 25 | self.N = None 26 | 27 | def init_param(self, X_data, y_data): 28 | # 初始化参数, 包括核内积矩阵、alpha和预测误差 29 | self.N = X_data.shape[0] 30 | self.X = X_data 31 | self.y = y_data 32 | if self.gamma is None: 33 | self.gamma = 1.0 / X_data.shape[1] 34 | self.cal_kernel(X_data) 35 | self.alpha_arr = np.zeros(self.N) 36 | self.grad = - np.ones(self.N) 37 | _y = np.array([y_data]) # 1*n 38 | self.Q = _y.T @ _y * self.kernel_arr 39 | return 40 | 41 | def _gaussian_dot(self, x1, x2): 42 | # 计算两个样本之间的高斯内积 43 | return math.exp(-self.gamma * np.square(x1 - x2).sum()) 44 | 45 | def cal_kernel(self, X_data): 46 | # 计算核内积矩阵 47 | if self.kernel_option: 48 | self.kernel_arr = np.ones((self.N, self.N)) 49 | for i in range(self.N): 50 | for j in range(i + 1, self.N): 51 | self.kernel_arr[i, j] = self._gaussian_dot(X_data[i], X_data[j]) 52 | self.kernel_arr[j, i] = self.kernel_arr[i, j] 53 | else: 54 | self.kernel_arr = X_data @ X_data.T # 不使用高斯核,线性分类器 55 | return 56 | 57 | def get_working_set(self, y_data): 58 | # 挑选两个变量alpha, 返回索引 59 | ind1 = -1 60 | ind2 = -1 61 | max_grad = - float('inf') 62 | min_grad = float('inf') 63 | # 挑选第一个alpha, 正类 64 | for i in 
range(self.N): 65 | if (y_data[i] == 1 and self.alpha_arr[i] < self.C) or (y_data[i] == -1 and self.alpha_arr[i] > 0): 66 | tmp = -y_data[i] * self.grad[i] 67 | if tmp >= max_grad: 68 | ind1 = i 69 | max_grad = tmp 70 | # 挑选第二个alpha, 负类 71 | ab_obj = float('inf') 72 | for i in range(self.N): 73 | if (y_data[i] == 1 and self.alpha_arr[i] > 0) or (y_data[i] == -1 and self.alpha_arr[i] < self.C): 74 | tmp = y_data[i] * self.grad[i] 75 | b = max_grad + tmp 76 | if -tmp < min_grad: 77 | min_grad = -tmp 78 | if b > 0: 79 | a = self.Q[ind1][ind1] + self.Q[i][i] - 2 * y_data[ind1] * y_data[i] * \ 80 | self.Q[ind1][i] 81 | if a <= 0: 82 | a = 1e-12 83 | if - b ** 2 / a < ab_obj: 84 | ind2 = i 85 | ab_obj = - b ** 2 / a 86 | if max_grad - min_grad >= self.epsilon: # 收敛条件 87 | return ind1, ind2 88 | return -1, -1 89 | 90 | def update(self, ind1, ind2): 91 | # 更新挑选出的两个样本的alpha、对应的预测值及误差和阈值b 92 | old_alpha1 = self.alpha_arr[ind1] 93 | old_alpha2 = self.alpha_arr[ind2] 94 | y1 = self.y[ind1] 95 | y2 = self.y[ind2] 96 | a = self.Q[ind1][ind1] + self.Q[ind2][ind2] - 2 * y1 * y2 * self.Q[ind1][ind2] 97 | if a <= 0: 98 | a = 1e-12 99 | b = -y1 * self.grad[ind1] + y2 * self.grad[ind2] 100 | new_alpha1 = old_alpha1 + y1 * b / a 101 | # 剪辑 102 | s = y1 * old_alpha1 + y2 * old_alpha2 103 | if new_alpha1 > self.C: 104 | new_alpha1 = self.C 105 | if new_alpha1 < 0: 106 | new_alpha1 = 0 107 | new_alpha2 = y2 * (s - y1 * new_alpha1) 108 | if new_alpha2 > self.C: 109 | new_alpha2 = self.C 110 | if new_alpha2 < 0: 111 | new_alpha2 = 0 112 | new_alpha1 = y1 * (s - y2 * new_alpha2) 113 | self.alpha_arr[ind1] = new_alpha1 114 | self.alpha_arr[ind2] = new_alpha2 115 | # 更新梯度 116 | delta1 = new_alpha1 - old_alpha1 117 | delta2 = new_alpha2 - old_alpha2 118 | for i in range(self.N): 119 | self.grad[i] += self.Q[i][ind1] * delta1 + self.Q[i][ind2] * delta2 120 | return 121 | 122 | def fit(self, X_data, y_data): 123 | # 训练主函数 124 | self.init_param(X_data, y_data) 125 | step = 0 126 | while step < self.maxstep: 127 | step += 1 128 | ind1, ind2 = self.get_working_set(y_data) 129 | if ind2 == -1: 130 | break 131 | self.update(ind1, ind2) 132 | # 计算阈值b 133 | alpha0_inds = set(np.where(self.grad == 0)[0]) 134 | alphaC_inds = set(np.where(self.grad == self.C)[0]) 135 | alpha_inds = set(range(self.N)) - alphaC_inds - alpha0_inds 136 | 137 | label_inds1 = set(np.where(y_data == 1)[0]) 138 | r1_inds = list(label_inds1 & alpha_inds) 139 | if r1_inds: 140 | r1 = self.grad[r1_inds].sum() 141 | else: 142 | min_r1 = self.grad[list(alpha0_inds & label_inds1)].min() 143 | max_r1 = self.grad[list(alphaC_inds & label_inds1)].max() 144 | r1 = (min_r1 + max_r1) / 2 145 | 146 | label_inds2 = set(np.where(y_data == -1)[0]) 147 | r2_inds = list(label_inds1 & alpha_inds) 148 | if r2_inds: 149 | r2 = self.grad[r2_inds].sum() 150 | else: 151 | min_r2 = self.grad[list(alpha0_inds & label_inds2)].min() 152 | max_r2 = self.grad[list(alphaC_inds & label_inds2)].max() 153 | r2 = (min_r2 + max_r2) / 2 154 | self.b = (r2 - r1) / 2 155 | return 156 | 157 | def predict(self, x): 158 | # 预测x的类别 159 | if self.kernel_option: 160 | kernel = np.array([self._gaussian_dot(x, sample) for sample in self.X]) 161 | g = np.sum(self.y * self.alpha_arr * kernel) 162 | else: 163 | g = np.sum(self.alpha_arr * self.y * (np.array([x]) @ self.X.T)[0]) 164 | return np.sign(g + self.b) 165 | 166 | 167 | if __name__ == "__main__": 168 | from sklearn.datasets import load_digits 169 | 170 | data = load_digits(n_class=2) 171 | X_data = data['data'] 172 | y_data = data['target'] 173 | 
inds = np.where(y_data == 0)[0] 174 | y_data[inds] = -1 175 | 176 | from machine_learning_algorithm.cross_validation import validate 177 | 178 | g = validate(X_data, y_data) 179 | for item in g: 180 | X_train, y_train, X_test, y_test = item 181 | S = SVM(kernel_option=False, maxstep=1000, epsilon=1e-3, C=1.0) 182 | S.fit(X_train, y_train) 183 | score = 0 184 | for X, y in zip(X_test, y_test): 185 | if S.predict(X) == y: 186 | score += 1 187 | print(score / len(y_test)) 188 | -------------------------------------------------------------------------------- /classification_regression/knearest.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from itertools import cycle 3 | 4 | 5 | class node: 6 | def __init__(self, val, label, left=None, right=None, visit=False): 7 | self.val = val 8 | self.label = label 9 | self.left = left 10 | self.right = right 11 | self.visit = visit # 用于回退时,父节点搜索另一个子节点 12 | 13 | 14 | class heap: 15 | # 建立一个最大堆来保存最近的k个样本点 16 | def __init__(self): 17 | self.h = [] 18 | 19 | def push(self, item): 20 | # 加入元素 21 | self.h.append(item) 22 | self._move_up(item) 23 | return 24 | 25 | def get(self): 26 | # 获取堆顶元素 27 | if self.h: 28 | return self.h[0] 29 | else: 30 | raise ValueError('the heap is empty') 31 | 32 | def delete(self): 33 | # 删除堆顶元素 34 | if self.h: 35 | last_item = self.h.pop() 36 | if self.h: 37 | self.h[0] = last_item 38 | self._move_down(last_item) 39 | else: 40 | raise ValueError('the heap is empty') 41 | 42 | def _move_up(self, item): 43 | # 将新加入的元素上移 44 | startpos = len(self.h) - 1 45 | pos = (startpos - 1) >> 1 46 | while pos >= 0: 47 | if item[1] > self.h[pos][1]: 48 | self.h[pos], self.h[startpos] = self.h[startpos], self.h[pos] 49 | startpos = pos 50 | pos = (pos - 1) >> 1 51 | else: 52 | break 53 | return 54 | 55 | def _move_down(self, item): 56 | # 删除堆顶元素,将末尾元素加入堆顶,重新调整堆 57 | pos = 0 58 | child_pos = 2 * pos + 1 # 暂设左树为预交换节点 59 | while child_pos < len(self.h): 60 | rightpos = 2 * pos + 2 61 | if rightpos < len(self.h) and self.h[rightpos][1] > self.h[child_pos][1]: 62 | child_pos = rightpos 63 | if item[1] < self.h[child_pos][1]: 64 | self.h[pos], self.h[child_pos] = self.h[child_pos], self.h[pos] 65 | pos = child_pos 66 | child_pos = 2 * pos + 1 67 | continue 68 | else: 69 | break 70 | return 71 | 72 | 73 | def get_split(data, d): 74 | # 根据切分维度返回切分点索引以及,切分后的两个子数据集 75 | vector = data[:, d] 76 | median = int(len(vector) / 2) 77 | inds = np.argpartition(vector, median) 78 | left = inds[:median] 79 | right = inds[median + 1:] 80 | return inds[median], left, right 81 | 82 | 83 | def build_tree(data): 84 | # 构建kd树, 存储索引 85 | dimension_cycle = cycle(range(data.shape[1] - 1)) 86 | 87 | def helper(dataset): 88 | if dataset.shape[0] < 1: 89 | return 90 | d = next(dimension_cycle) 91 | split, left, right = get_split(dataset, d) 92 | left = helper(dataset[left]) 93 | right = helper(dataset[right]) 94 | return node(val=dataset[split][:-1], label=dataset[split][-1], left=left, right=right) 95 | 96 | return helper(data) 97 | 98 | 99 | def cal_dis(node, X): 100 | # 计算输入X与节点之间的距离 101 | delta = node.val - X 102 | return delta @ delta 103 | 104 | 105 | def add_node(res, current_node, X, k): 106 | # 检查是否加入当前节点为k近邻之一 107 | dis = cal_dis(current_node, X) 108 | if len(res.h) < k: 109 | res.push([current_node, dis]) 110 | else: 111 | if res.get()[1] > dis: 112 | res.delete() 113 | res.push([current_node, dis]) 114 | return 115 | 116 | 117 | def check_cross(X, d, current_node, dis): 118 | # 判断超球体是否与分割平面相交 
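    # i.e. dis is the squared distance to the farthest of the current candidates (the max-heap top); the
    # hypersphere around X can cross the splitting plane only if the squared gap along dimension d is
    # smaller than dis, so both quantities stay squared and no square root is needed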
119 | plane_dis = (X[d] - current_node.val[d])**2 120 | if plane_dis < dis: 121 | return True 122 | else: 123 | return False 124 | 125 | 126 | def search_tree(X, tree, k=3): 127 | # k近邻搜索, 最大堆存储k个最近的元素 128 | dimension_cycle = cycle(range(X.shape[0])) 129 | res = heap() 130 | 131 | def helper(current_node): # 从叶节点开始回退搜索 132 | nonlocal res 133 | # 寻找叶节点 134 | if current_node.left is None and current_node.right is None: 135 | current_node.visit = True 136 | dis = cal_dis(current_node, X) 137 | res.push([current_node, dis]) 138 | return current_node 139 | d = next(dimension_cycle) 140 | if X[d] < current_node.val[d]: 141 | if current_node.left is not None: 142 | helper(current_node.left) 143 | elif current_node.right is not None: 144 | helper(current_node.right) 145 | # 回退搜索 146 | current_node.visit = True 147 | add_node(res, current_node, X, k) # 回退到父节点,并检查父节点 148 | if check_cross(X, d, current_node, res.get()[1]): # 如果与分割平面相交,则搜索另一个子节点 149 | if current_node.left is not None and current_node.left.visit: # 检查另一个子节点 150 | add_node(res, current_node.left, X, k) 151 | if current_node.right is not None and current_node.right.visit: 152 | add_node(res, current_node.right, X, k) 153 | return 154 | 155 | helper(tree) 156 | 157 | return res 158 | 159 | 160 | class KNearest: 161 | def __init__(self, k=5): 162 | self.k = k 163 | 164 | def predict(self, X_data, y_data, X): 165 | data = np.hstack((X_data, np.transpose([y_data]))) 166 | tree = build_tree(data) 167 | pred_res = [] 168 | for x in X: 169 | res = search_tree(x, tree, self.k) 170 | klabel = [] 171 | for item in res.h: 172 | klabel.append(item[0].label) 173 | c = Counter(klabel) 174 | pred_res.append(max(c, key=c.get)) 175 | return pred_res 176 | 177 | 178 | if __name__ == '__main__': 179 | import numpy as np 180 | 181 | # X_data = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]) 182 | # y_data = np.array([0, 0, 1, 1, 0, 1]) 183 | # data = np.hstack((X_data, np.transpose([y_data]))) 184 | # tree = build_tree(data) 185 | 186 | # def disp_tree(tree): 187 | # # 打印树 188 | # def disp_helper(current_node): 189 | # # 前序遍历 190 | # print(current_node.val, current_node.label) 191 | # if current_node.left is not None: 192 | # disp_helper(current_node.left) 193 | # if current_node.right is not None: 194 | # disp_helper(current_node.right) 195 | # return 196 | # 197 | # disp_helper(tree) 198 | # return 199 | 200 | # disp_tree(tree) 201 | 202 | # res = search_tree(np.array([4, 3]), tree, k=3) 203 | # for node, dis in res: 204 | # print(node.val, node.label, dis) 205 | 206 | from sklearn.datasets import make_blobs 207 | from machine_learning_algorithm.cross_validation import validate 208 | 209 | X_data, y_data = make_blobs(n_samples=200) 210 | g = validate(X_data, y_data, ratio=0.2) 211 | for item in g: 212 | X_data_train, y_data_train, X_data_test, y_data_test = item 213 | knn = KNearest() 214 | score = 0 215 | y_pred = knn.predict(X_data_train, y_data_train, X_data_test) 216 | for y_test, y_pred in zip(y_data_test, y_pred): 217 | if y_test == y_pred: 218 | score += 1 219 | print(score / len(y_data_test)) 220 | -------------------------------------------------------------------------------- /probability_algorithm/hmm.py: -------------------------------------------------------------------------------- 1 | """ 2 | 隐马尔科夫模型 3 | 三类问题:1.概率计算 2.学习问题(参数估计) 3.预测问题(状态序列的预测) 4 | """ 5 | import numpy as np 6 | from itertools import accumulate 7 | 8 | 9 | class GenData: 10 | """ 11 | 根据隐马尔科夫模型生成相应的观测数据 12 | """ 13 | 14 | def __init__(self, hmm, n_sample): 15 | 
self.hmm = hmm 16 | self.n_sample = n_sample 17 | 18 | def _locate(self, prob_arr): 19 | # 给定概率向量,返回状态 20 | seed = np.random.rand(1) 21 | for state, cdf in enumerate(accumulate(prob_arr)): 22 | if seed <= cdf: 23 | return state 24 | return 25 | 26 | def init_state(self): 27 | # 根据初始状态概率向量,生成初始状态 28 | return self._locate(self.hmm.S) 29 | 30 | def state_trans(self, current_state): 31 | # 转移状态 32 | return self._locate(self.hmm.A[current_state]) 33 | 34 | def gen_obs(self, current_state): 35 | # 生成观测 36 | return self._locate(self.hmm.B[current_state]) 37 | 38 | def gen_data(self): 39 | # 根据模型产生观测数据 40 | current_state = self.init_state() 41 | start_obs = self.gen_obs(current_state) 42 | state = [current_state] 43 | obs = [start_obs] 44 | n = 0 45 | while n < self.n_sample - 1: 46 | n += 1 47 | current_state = self.state_trans(current_state) 48 | state.append(current_state) 49 | obs.append(self.gen_obs(current_state)) 50 | return state, obs 51 | 52 | 53 | class HMM: 54 | def __init__(self, n_state, n_obs, S=None, A=None, B=None): 55 | self.n_state = n_state # 状态的个数n 56 | self.n_obs = n_obs # 观测的种类数m 57 | self.S = S # 1*n, 初始状态概率向量 58 | self.A = A # n*n, 状态转移概率矩阵 59 | self.B = B # n*m, 观测生成概率矩阵 60 | 61 | 62 | def _alpha(hmm, obs, t): 63 | # 计算时刻t各个状态的前向概率 64 | b = hmm.B[:, obs[0]] 65 | alpha = np.array([hmm.S * b]) # n*1 66 | for i in range(1, t + 1): 67 | alpha = (alpha @ hmm.A) * np.array([hmm.B[:, obs[i]]]) 68 | return alpha[0] 69 | 70 | 71 | def forward_prob(hmm, obs): 72 | # 前向算法计算最终生成观测序列的概率, 即各个状态下概率之和 73 | alpha = _alpha(hmm, obs, len(obs) - 1) 74 | return np.sum(alpha) 75 | 76 | 77 | def _beta(hmm, obs, t): 78 | # 计算时刻t各个状态的后向概率 79 | beta = np.ones(hmm.n_state) 80 | for i in reversed(range(t + 1, len(obs))): 81 | beta = np.sum(hmm.A * hmm.B[:, obs[i]] * beta, axis=1) 82 | return beta 83 | 84 | 85 | def backward_prob(hmm, obs): 86 | # 后向算法计算生成观测序列的概率 87 | beta = _beta(hmm, obs, 0) 88 | return np.sum(hmm.S * hmm.B[:, obs[0]] * beta) 89 | 90 | 91 | def fb_prob(hmm, obs, t=None): 92 | # 将前向和后向合并 93 | if t is None: 94 | t = 0 95 | res = _alpha(hmm, obs, t) * _beta(hmm, obs, t) 96 | return res.sum() 97 | 98 | 99 | def _gamma(hmm, obs, t): 100 | # 计算时刻t处于各个状态的概率 101 | alpha = _alpha(hmm, obs, t) 102 | beta = _beta(hmm, obs, t) 103 | prob = alpha * beta 104 | return prob / prob.sum() 105 | 106 | 107 | def point_prob(hmm, obs, t, i): 108 | # 计算时刻t处于状态i的概率 109 | prob = _gamma(hmm, obs, t) 110 | return prob[i] 111 | 112 | 113 | def _xi(hmm, obs, t): 114 | alpha = np.mat(_alpha(hmm, obs, t)) 115 | beta_p = _beta(hmm, obs, t + 1) 116 | obs_prob = hmm.B[:, obs[t + 1]] 117 | obs_beta = np.mat(obs_prob * beta_p) 118 | alpha_obs_beta = np.asarray(alpha.T * obs_beta) 119 | xi = alpha_obs_beta * hmm.A 120 | return xi / xi.sum() 121 | 122 | 123 | def fit(hmm, obs_data, maxstep=100): 124 | # 利用Baum-Welch算法学习 125 | hmm.A = np.ones((hmm.n_state, hmm.n_state)) / hmm.n_state 126 | hmm.B = np.ones((hmm.n_state, hmm.n_obs)) / hmm.n_obs 127 | hmm.S = np.random.sample(hmm.n_state) # 初始状态概率矩阵(向量),的初始化必须随机状态,否则容易陷入局部最优 128 | hmm.S = hmm.S / hmm.S.sum() 129 | step = 0 130 | while step < maxstep: 131 | xi = np.zeros_like(hmm.A) 132 | gamma = np.zeros_like(hmm.S) 133 | B = np.zeros_like(hmm.B) 134 | S = _gamma(hmm, obs_data, 0) 135 | for t in range(len(obs_data) - 1): 136 | tmp_gamma = _gamma(hmm, obs_data, t) 137 | gamma += tmp_gamma 138 | xi += _xi(hmm, obs_data, t) 139 | B[:, obs_data[t]] += tmp_gamma 140 | 141 | # 更新 A 142 | for i in range(hmm.n_state): 143 | hmm.A[i] = xi[i] / gamma[i] 144 | # 更新 B 145 | 
tmp_gamma_end = _gamma(hmm, obs_data, len(obs_data) - 1) 146 | gamma += tmp_gamma_end 147 | B[:, obs_data[-1]] += tmp_gamma_end 148 | for i in range(hmm.n_state): 149 | hmm.B[i] = B[i] / gamma[i] 150 | # 更新 S 151 | hmm.S = S 152 | step += 1 153 | return hmm 154 | 155 | 156 | def predict(hmm, obs): 157 | # 采用Viterbi算法预测状态序列 158 | N = len(obs) 159 | nodes_graph = np.zeros((hmm.n_state, N), dtype=int) # 存储时刻t且状态为i时, 前一个时刻t-1的状态,用于构建最终的状态序列 160 | delta = hmm.S * hmm.B[:, obs[0]] # 存储到t时刻,且此刻状态为i的最大概率 161 | nodes_graph[:, 0] = range(hmm.n_state) 162 | 163 | for t in range(1, N): 164 | new_delta = [] 165 | for i in range(hmm.n_state): 166 | temp = [hmm.A[j, i] * d for j, d in enumerate(delta)] # 当前状态为i时, 选取最优的前一时刻状态 167 | max_d = max(temp) 168 | new_delta.append(max_d * hmm.B[i, obs[t]]) 169 | nodes_graph[i, t] = temp.index(max_d) 170 | delta = new_delta 171 | 172 | current_state = np.argmax(nodes_graph[:, -1]) 173 | path = [] 174 | t = N 175 | while t > 0: 176 | path.append(current_state) 177 | current_state = nodes_graph[current_state, t - 1] 178 | t -= 1 179 | return list(reversed(path)) 180 | 181 | 182 | if __name__ == '__main__': 183 | # S = np.array([0.2, 0.4, 0.4]) 184 | # A = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]]) 185 | # B = np.array([[0.5, 0.2, 0.3], [0.4, 0.2, 0.4], [0.6, 0.3, 0.1]]) 186 | # hmm_real = HMM(3, 3, S, A, B) 187 | # g = GenData(hmm_real, 500) 188 | # state, obs = g.gen_data() 189 | # 检测生成的数据 190 | # state, obs = np.array(state), np.array(obs) 191 | # ind = np.where(state==2)[0] 192 | # from collections import Counter 193 | # obs_ind = obs[ind] 194 | # c1 = Counter(obs_ind) 195 | # n = sum(c1.values()) 196 | # for o, val in c.items(): 197 | # print(o, val/n) 198 | # ind_next = ind + 1 199 | # ind_out = np.where(ind_next==1000) 200 | # ind_next = np.delete(ind_next, ind_out) 201 | # state_next = state[ind_next] 202 | # c2 = Counter(state_next) 203 | # n = sum(c2.values()) 204 | # for s, val in c2.items(): 205 | # print(s, val/n) 206 | 207 | # 预测 208 | S = np.array([0.5, 0.5]) 209 | A = np.array([[0.8, 1], [0.8, 0.8]]) 210 | B = np.array([[0.2, 0.0, 0.8], [0, 0.8, 0.2]]) 211 | hmm = HMM(2, 3, S, A, B) 212 | g = GenData(hmm, 200) 213 | state, obs = g.gen_data() 214 | print(obs) 215 | path = predict(hmm, obs) 216 | score = sum([int(i == j) for i, j in zip(state, path)]) 217 | print(score / len(path)) 218 | 219 | 220 | 221 | 222 | # 学习 223 | # import matplotlib.pyplot as plt 224 | # 225 | # 226 | # def triangle_data(n_sample): 227 | # # 生成三角波形状的序列 228 | # data = [] 229 | # for x in range(n_sample): 230 | # x = x % 6 231 | # if x <= 3: 232 | # data.append(x) 233 | # else: 234 | # data.append(6 - x) 235 | # return data 236 | # 237 | # 238 | # hmm = HMM(10, 4) 239 | # data = triangle_data(30) 240 | # hmm = fit(hmm, data) 241 | # g = GenData(hmm, 30) 242 | # state, obs = g.gen_data() 243 | # 244 | # x = [i for i in range(30)] 245 | # plt.scatter(x, obs, marker='*', color='r') 246 | # plt.plot(x, data, color='g') 247 | # plt.show() 248 | 249 | 250 | -------------------------------------------------------------------------------- /classification_regression/svm.py: -------------------------------------------------------------------------------- 1 | """ 2 | 支持向量机 3 | 学习方法:不等式约束的最优化问题,求解凸二次规划的最优化算法, 本程序采用序列最小最优化算法(SOM) 4 | """ 5 | import numpy as np 6 | import math 7 | 8 | 9 | class SVM: 10 | def __init__(self, epsilon=1e-5, maxstep=500, C=1.0, kernel_option=True, gamma=None): 11 | self.epsilon = epsilon 12 | self.maxstep = maxstep 13 | self.C = C 14 | 
self.kernel_option = kernel_option # 是否选择核函数 15 | self.gamma = gamma # 高斯核参数 16 | 17 | self.kernel_arr = None # n*n 存储核内积 18 | self.X = None # 训练数据集 19 | self.y = None # 类标记值,是计算w,b的参数,故存入模型中 20 | self.alpha_arr = None # 1*n 存储拉格朗日乘子, 每个样本对应一个拉格朗日乘子 21 | self.b = 0 # 阈值b, 初始化为0 22 | self.err_arr = None # 1*n 记录每个样本的预测误差 23 | 24 | self.N = None 25 | 26 | def init_param(self, X_data, y_data): 27 | # 初始化参数, 包括核内积矩阵、alpha和预测误差 28 | self.N = X_data.shape[0] 29 | self.X = X_data 30 | self.y = y_data 31 | if self.gamma is None: 32 | self.gamma = 1.0 / X_data.shape[1] 33 | self.cal_kernel(X_data) 34 | self.alpha_arr = np.zeros(self.N) 35 | self.err_arr = - self.y # 将拉格朗日乘子全部初始化为0,则相应的预测值初始化为0,预测误差就是-y_data 36 | return 37 | 38 | def _gaussian_dot(self, x1, x2): 39 | # 计算两个样本之间的高斯内积 40 | return math.exp(-self.gamma * np.square(x1 - x2).sum()) 41 | 42 | def cal_kernel(self, X_data): 43 | # 计算核内积矩阵 44 | if self.kernel_option: 45 | self.kernel_arr = np.ones((self.N, self.N)) 46 | for i in range(self.N): 47 | for j in range(i + 1, self.N): 48 | self.kernel_arr[i, j] = self._gaussian_dot(X_data[i], X_data[j]) 49 | self.kernel_arr[j, i] = self.kernel_arr[i, j] 50 | else: 51 | self.kernel_arr = X_data @ X_data.T # 不使用高斯核,线性分类器 52 | return 53 | 54 | def select_second_alpha(self, ind1): 55 | # 挑选第二个变量alpha, 返回索引 56 | E1 = self.err_arr[ind1] 57 | ind2 = None 58 | max_diff = 0 # 初始化最大的|E1-E2| 59 | candidate_alpha_inds = np.nonzero(self.err_arr)[0] # 存在预测误差的样本作为候选样本 60 | if len(candidate_alpha_inds) > 1: 61 | for i in candidate_alpha_inds: 62 | if i == ind1: 63 | continue 64 | tmp = abs(self.err_arr[i] - E1) 65 | if tmp > max_diff: 66 | max_diff = tmp 67 | ind2 = i 68 | if ind2 is None: # 随机选择一个不与ind1相等的样本索引 69 | ind2 = ind1 70 | while ind2 == ind1: 71 | ind2 = np.random.choice(self.N) 72 | return ind2 73 | 74 | def update(self, ind1, ind2): 75 | # 更新挑选出的两个样本的alpha、对应的预测值及误差和阈值b 76 | old_alpha1 = self.alpha_arr[ind1] 77 | old_alpha2 = self.alpha_arr[ind2] 78 | y1 = self.y[ind1] 79 | y2 = self.y[ind2] 80 | if y1 == y2: 81 | L = max(0.0, old_alpha2 + old_alpha1 - self.C) 82 | H = min(self.C, old_alpha2 + old_alpha1) 83 | else: 84 | L = max(0.0, old_alpha2 - old_alpha1) 85 | H = min(self.C, self.C + old_alpha2 - old_alpha1) 86 | if L == H: 87 | return 0 88 | E1 = self.err_arr[ind1] 89 | E2 = self.err_arr[ind2] 90 | K11 = self.kernel_arr[ind1, ind1] 91 | K12 = self.kernel_arr[ind1, ind2] 92 | K22 = self.kernel_arr[ind2, ind2] 93 | # 更新alpha2 94 | eta = K11 + K22 - 2 * K12 95 | if eta <= 0: 96 | return 0 97 | new_unc_alpha2 = old_alpha2 + y2 * (E1 - E2) / eta # 未经剪辑的alpha2 98 | if new_unc_alpha2 > H: 99 | new_alpha2 = H 100 | elif new_unc_alpha2 < L: 101 | new_alpha2 = L 102 | else: 103 | new_alpha2 = new_unc_alpha2 104 | # 更新alpha1 105 | if abs(old_alpha2 - new_alpha2) < self.epsilon * ( 106 | old_alpha2 + new_alpha2 + self.epsilon): # 若alpha2更新变化很小,则忽略本次更新 107 | return 0 108 | new_alpha1 = old_alpha1 + y1 * y2 * (old_alpha2 - new_alpha2) 109 | self.alpha_arr[ind1] = new_alpha1 110 | self.alpha_arr[ind2] = new_alpha2 111 | # 更新阈值b 112 | new_b1 = -E1 - y1 * K11 * (new_alpha1 - old_alpha1) - y2 * K12 * (new_alpha2 - old_alpha2) + self.b 113 | new_b2 = -E2 - y1 * K12 * (new_alpha1 - old_alpha1) - y2 * K22 * (new_alpha2 - old_alpha2) + self.b 114 | if 0 < new_alpha1 < self.C: 115 | self.b = new_b1 116 | elif 0 < new_alpha2 < self.C: 117 | self.b = new_b2 118 | else: 119 | self.b = (new_b1 + new_b2) / 2 120 | # 更新对应的预测误差 121 | self.err_arr[ind1] = np.sum(self.y * self.alpha_arr * self.kernel_arr[ind1, :]) + self.b - y1 
122 | self.err_arr[ind2] = np.sum(self.y * self.alpha_arr * self.kernel_arr[ind2, :]) + self.b - y2 123 | return 1 124 | 125 | def satisfy_kkt(self, y, err, alpha): 126 | # 在精度范围内判断是否满足KTT条件 127 | r = y * err 128 | # r<=0,则y(g-y)<=0,yg<1, alpha=C则符合;r>0,则yg>1, alpha=0则符合 129 | if (r < -self.epsilon and alpha < self.C) or (r > self.epsilon and alpha > 0): 130 | return False 131 | return True 132 | 133 | def fit(self, X_data, y_data): 134 | # 训练主函数 135 | self.init_param(X_data, y_data) 136 | # 启发式搜索第一个alpha时,当间隔边界上的支持向量全都满足KKT条件时,就搜索整个数据集。 137 | # 整个训练过程需要在边界支持向量与所有样本集之间进行切换搜索,以防止无法收敛 138 | entire_set = True 139 | step = 0 140 | change_pairs = 0 141 | while step < self.maxstep and (change_pairs > 0 or entire_set): # 当搜寻全部样本,依然没有改变,则停止迭代 142 | step += 1 143 | change_pairs = 0 144 | if entire_set: # 搜索整个样本集 145 | for ind1 in range(self.N): 146 | if not self.satisfy_kkt(y_data[ind1], self.err_arr[ind1], self.alpha_arr[ind1]): 147 | ind2 = self.select_second_alpha(ind1) 148 | change_pairs += self.update(ind1, ind2) 149 | else: # 搜索间隔边界上的支持向量(bound_search) 150 | bound_inds = np.where((0 < self.alpha_arr) & (self.alpha_arr < self.C))[0] 151 | for ind1 in bound_inds: 152 | if not self.satisfy_kkt(y_data[ind1], self.err_arr[ind1], self.alpha_arr[ind1]): 153 | ind2 = self.select_second_alpha(ind1) 154 | change_pairs += self.update(ind1, ind2) 155 | if entire_set: # 当前是对整个数据集进行搜索,则下一次搜索间隔边界上的支持向量 156 | entire_set = False 157 | elif change_pairs == 0: 158 | entire_set = True # 当前是对间隔边界上的支持向量进行搜索,若未发生任何改变,则下一次搜索整个数据集 159 | return 160 | 161 | def predict(self, x): 162 | # 预测x的类别 163 | if self.kernel_option: 164 | kernel = np.array([self._gaussian_dot(x, sample) for sample in self.X]) 165 | g = np.sum(self.y * self.alpha_arr * kernel) 166 | else: 167 | g = np.sum(self.alpha_arr * self.y * (np.array([x]) @ self.X.T)[0]) 168 | return np.sign(g + self.b) 169 | 170 | 171 | if __name__ == "__main__": 172 | # xx, yy = np.meshgrid(np.linspace(-3, 3, 500), 173 | # # np.linspace(-3, 3, 500)) 174 | # # np.random.seed(0) 175 | # # X_data = np.random.randn(500, 2) 176 | # # Y = np.logical_xor(X_data[:, 0] > 0, X_data[:, 1] > 0) 177 | # # y = [] 178 | # # for i in Y: 179 | # # if i: 180 | # # y.append(1) 181 | # # else: 182 | # # y.append(-1) 183 | # # y_data = np.array(y) 184 | from sklearn.datasets import load_digits 185 | 186 | data = load_digits(n_class=2) 187 | X_data = data['data'] 188 | y_data = data['target'] 189 | inds = np.where(y_data == 0)[0] 190 | y_data[inds] = -1 191 | 192 | from machine_learning_algorithm.cross_validation import validate 193 | 194 | g = validate(X_data, y_data) 195 | for item in g: 196 | X_train, y_train, X_test, y_test = item 197 | S = SVM(kernel_option=False, maxstep=1000, epsilon=1e-6, C=1.0) 198 | S.fit(X_train, y_train) 199 | score = 0 200 | for X, y in zip(X_test, y_test): 201 | if S.predict(X) == y: 202 | score += 1 203 | print(score / len(y_test)) 204 | -------------------------------------------------------------------------------- /association/fp_growth.py: -------------------------------------------------------------------------------- 1 | """ 2 | FP树增长算法发现频繁项集 3 | """ 4 | from collections import defaultdict, Counter, deque 5 | import math 6 | import copy 7 | 8 | 9 | class node: 10 | def __init__(self, item, count, parent): # 本程序将节点之间的链接信息存储到项头表中,后续可遍历项头表添加该属性 11 | self.item = item # 该节点的项 12 | self.count = count # 项的计数 13 | self.parent = parent # 该节点父节点的id 14 | self.children = [] # 该节点的子节点的list 15 | 16 | 17 | class FP: 18 | def __init__(self, minsup=0.5): 19 | self.minsup = 
minsup 20 | self.minsup_num = None # 支持度计数 21 | 22 | self.N = None 23 | self.item_head = defaultdict(list) # 项头表 24 | self.fre_one_itemset = defaultdict(lambda: 0) # 频繁一项集,值为支持度 25 | self.sort_rules = None # 项头表中的项排序规则,按照支持度从大到小有序排列 26 | self.tree = defaultdict() # fp树, 键为节点的id, 值为node 27 | self.max_node_id = 0 # 当前树中最大的node_id, 用于插入新节点时,新建node_id 28 | self.fre_itemsets = [] # 频繁项集 29 | self.fre_itemsets_sups = [] # 频繁项集的支持度计数 30 | 31 | def init_param(self, data): 32 | self.N = len(data) 33 | self.minsup_num = math.ceil(self.minsup * self.N) 34 | self.get_fre_one_itemset(data) 35 | self.build_tree(data) 36 | return 37 | 38 | def get_fre_one_itemset(self, data): 39 | # 获取频繁1项,并排序,第一次扫描数据集 40 | c = Counter() 41 | for t in data: 42 | c += Counter(t) 43 | for key, val in c.items(): 44 | if val >= self.minsup_num: 45 | self.fre_one_itemset[key] = val 46 | sort_keys = sorted(self.fre_one_itemset, key=self.fre_one_itemset.get, reverse=True) 47 | self.sort_rules = {k: i for i, k in enumerate(sort_keys)} # 频繁一项按照支持度降低的顺序排列,构建排序规则 48 | return 49 | 50 | def insert_item(self, parent, item): 51 | # 将事务中的项插入到FP树中,并返回插入节点的id 52 | children = self.tree[parent].children 53 | for child_id in children: 54 | child_node = self.tree[child_id] 55 | if child_node.item == item: 56 | self.tree[child_id].count += 1 57 | next_node_id = child_id 58 | break 59 | else: # 循环正常结束,表明当前父节点的子节点中没有项与之匹配,所以新建子节点,更新项头表和树 60 | self.max_node_id += 1 61 | next_node_id = copy.copy(self.max_node_id) # 注意self.max_node_id 是可变的,引用时需要copy 62 | self.tree[next_node_id] = node(item=item, count=1, parent=parent) # 更新树,添加节点 63 | self.tree[parent].children.append(next_node_id) # 更新父节点的孩子列表 64 | self.item_head[item].append(next_node_id) # 更新项头表 65 | return next_node_id 66 | 67 | def build_tree(self, data): 68 | # 构建项头表以及FP树, 第二次扫描数据集 69 | one_itemset = set(self.fre_one_itemset.keys()) 70 | self.tree[0] = node(item=None, count=0, parent=-1) 71 | for t in data: 72 | t = list(set(t) & one_itemset) # 去除该事务中非频繁项 73 | if len(t) > 0: 74 | t = sorted(t, key=lambda x: self.sort_rules[x]) # 按照项的频繁程度从大到小排序 75 | parent = 0 # 每个事务都是从树根开始插起 76 | for item in t: 77 | parent = self.insert_item(parent, item) # 将排序后的事务中每个项依次插入FP树 78 | return 79 | 80 | def get_path(self, pre_tree, condition_tree, node_id, suffix_items_count): 81 | # 根据后缀的某个叶节点的父节点出发,选取出路径,并更新计数。suffix_item_count为后缀的计数 82 | if node_id == 0: 83 | return 84 | else: 85 | if node_id not in condition_tree.keys(): 86 | current_node = copy.deepcopy(pre_tree[node_id]) 87 | current_node.count = suffix_items_count # 更新计数 88 | condition_tree[node_id] = current_node 89 | 90 | else: # 若叶节点有多个,则路径可能有重复,计数叠加 91 | condition_tree[node_id].count += suffix_items_count 92 | node_id = condition_tree[node_id].parent 93 | self.get_path(pre_tree, condition_tree, node_id, suffix_items_count) # 递归构建路径 94 | return 95 | 96 | def get_condition_tree(self, pre_tree, suffix_items_ids): 97 | # 构建后缀为一个项的条件模式基。可能对应多个叶节点,综合后缀的各个叶节点的路径 98 | condition_tree = defaultdict() # 字典存储条件FP树,值为父节点 99 | for suffix_id in suffix_items_ids: # 从各个后缀叶节点出发,综合各条路径形成条件FP树 100 | suffix_items_count = copy.copy(pre_tree[suffix_id].count) # 叶节点计数 101 | node_id = pre_tree[suffix_id].parent # 注意条件FP树不包括后缀 102 | if node_id == 0: 103 | continue 104 | self.get_path(pre_tree, condition_tree, node_id, suffix_items_count) 105 | return condition_tree 106 | 107 | def extract_suffix_set(self, condition_tree, suffix_items): 108 | # 根据条件模式基,提取频繁项集, suffix_item为该条件模式基对应的后缀 109 | # 返回新的后缀,以及新添加项(将作为下轮的叶节点)的id 110 | new_suffix_items_list = [] # 后缀中添加的新项 111 | 
new_item_head = defaultdict(list) # 基于当前的条件FP树,更新项头表, 新添加的后缀项 112 | item_sup_dict = defaultdict(int) 113 | for key, val in condition_tree.items(): 114 | item_sup_dict[val.item] += val.count # 对项出现次数进行统计 115 | new_item_head[val.item].append(key) 116 | 117 | for item, sup in item_sup_dict.items(): 118 | if sup >= self.minsup_num: # 若条件FP树中某个项是频繁的,则添加到后缀中 119 | current_item_set = [item] + suffix_items 120 | self.fre_itemsets.append(current_item_set) 121 | self.fre_itemsets_sups.append(sup) 122 | new_suffix_items_list.append(current_item_set) 123 | else: 124 | new_item_head.pop(item) 125 | return new_suffix_items_list, new_item_head.values() 126 | 127 | def get_fre_set(self, data): 128 | # 构建以每个频繁1项为后缀的频繁项集 129 | self.init_param(data) 130 | suffix_items_list = [] 131 | suffix_items_id_list = [] 132 | for key, val in self.fre_one_itemset.items(): 133 | suffix_items = [key] 134 | suffix_items_list.append(suffix_items) 135 | suffix_items_id_list.append(self.item_head[key]) 136 | self.fre_itemsets.append(suffix_items) 137 | self.fre_itemsets_sups.append(val) 138 | pre_tree = copy.deepcopy(self.tree) # pre_tree 是尚未去除任何后缀的前驱,若其叶节点的项有多种,则可以形成多种条件FP树 139 | self.dfs_search(pre_tree, suffix_items_list, suffix_items_id_list) 140 | return 141 | 142 | def bfs_search(self, pre_tree, suffix_items_list, suffix_items_id_list): 143 | # 宽度优先,递增构建频繁k项集 144 | q = deque() 145 | q.appendleft((pre_tree, suffix_items_list, suffix_items_id_list)) 146 | while len(q) > 0: 147 | param_tuple = q.pop() 148 | pre_tree = param_tuple[0] 149 | for suffix_items, suffix_items_ids in zip(param_tuple[1], param_tuple[2]): 150 | condition_tree = self.get_condition_tree(pre_tree, suffix_items_ids) 151 | new_suffix_items_list, new_suffix_items_id_list = self.extract_suffix_set(condition_tree, suffix_items) 152 | if new_suffix_items_list: 153 | q.appendleft( 154 | (condition_tree, new_suffix_items_list, new_suffix_items_id_list)) # 储存前驱,以及产生该前驱的后缀的信息 155 | return 156 | 157 | def dfs_search(self, pre_tree, suffix_items_list, suffix_items_id_list): 158 | # 深度优先,递归构建以某个项为后缀的频繁k项集 159 | for suffix_items, suffix_items_ids in zip(suffix_items_list, suffix_items_id_list): 160 | condition_tree = self.get_condition_tree(pre_tree, suffix_items_ids) 161 | new_suffix_items_list, new_suffix_items_id_list = self.extract_suffix_set(condition_tree, suffix_items) 162 | if new_suffix_items_list: # 如果后缀有新的项添加进来,则继续深度搜索 163 | self.dfs_search(condition_tree, new_suffix_items_list, new_suffix_items_id_list) 164 | return 165 | 166 | 167 | if __name__ == '__main__': 168 | data1 = [list('ABCEFO'), list('ACG'), list('ET'), list('ACDEG'), list('ACEGL'), 169 | list('EJ'), list('ABCEFP'), list('ACD'), list('ACEGM'), list('ACEGN')] 170 | data2 = [list('ab'), list('bcd'), list('acde'), list('ade'), list('abc'), 171 | list('abcd'), list('a'), list('abc'), list('abd'), list('bce')] 172 | data3 = [['r', 'z', 'h', 'j', 'p'], ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], ['z'], ['r', 'x', 'n', 'o', 's'], 173 | ['y', 'r', 'x', 'z', 'q', 't', 'p'], ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']] 174 | 175 | fp = FP(minsup=0.2) 176 | fp.get_fre_set(data2) 177 | 178 | for itemset, sup in zip(fp.fre_itemsets, fp.fre_itemsets_sups): 179 | print(itemset, sup) 180 | -------------------------------------------------------------------------------- /optimization/optimization_algorithm.py: -------------------------------------------------------------------------------- 1 | """ 2 | 优化算法的核心两点:1.寻找下降方向 2.一维搜索下降的步长 3 | 梯度下降算法以负梯度方向为下降方向 4 | 牛顿法以海塞矩阵逆矩阵与梯度乘积为下降方向(此时学习率恒为1) 5 | 拟牛顿法同牛顿法,仅仅以B代替海塞矩阵 6 
| 7 | 一维不精确搜索:Wolfe和Armijo确定搜索步长(通常应用于拟牛顿法) 8 | Armijo: f(Xk + eta * Dk) - f(Xk) <= rho * eta * Grad.T * Dk, eta=beta*gamma**m(m为非负整数,beta>0) 9 | Goldstein(需要联合Armijo): f(Xk + eta * Dk) - f(Xk) >= (1-rho) * eta * Grad.T * Dk 10 | Wolfe-Powell(需要联合Armijo): Grad(k+1).T * Dk >= sigma * Grad(k).T * Dk, rho 1 32 | W = np.ones((X.shape[1], 1)) 33 | indx = np.random.permutation(np.arange(X.shape[0])) 34 | X, y = X[indx], y[indx] 35 | indx_cycle = cycle(np.arange(X.shape[0] // batch_size + 1) * batch_size) 36 | return W, indx_cycle 37 | 38 | 39 | def bgd(X, y, epsilon=1e-2, max_iter=1000, eta=0.1): 40 | # 批量梯度下降算法 41 | i = 0 42 | W = np.ones((X.shape[1], 1)) # 初始化权值 d*1 43 | while i < max_iter: 44 | Grad = grad(X, y, W) # 1*d 计算梯度 45 | if norm(Grad) < epsilon: # 如果梯度的第二范数小于阈值,则终止训练 46 | break 47 | W -= eta * Grad # 负梯度方向下降,更新自变量,即权值w 48 | i += 1 49 | 50 | return W 51 | 52 | 53 | def sgd(X, y, epsilon=1e-2, max_iter=3600, eta=0.1): 54 | # 随机梯度下降 55 | i = 0 56 | W = np.ones((X.shape[1], 1)) 57 | while i < max_iter: 58 | indx = np.random.randint(X.shape[0], size=1) # 随机选择一个样本 59 | X_s, y_s = X[(indx)], y[(indx)] 60 | Grad = grad(X_s, y_s, W) 61 | if norm(Grad) < epsilon: 62 | break 63 | W -= eta * Grad 64 | i += 1 65 | return W 66 | 67 | 68 | def msgd(X, y, epsilon=1e-2, max_iter=1000, eta=0.1, batch_size=4): 69 | # 小批量梯度下降 70 | W, indx_cycle = initial(X, y, batch_size) 71 | i = 0 72 | while i < max_iter: 73 | j = next(indx_cycle) 74 | X_s, y_s = X[j:j + batch_size], y[j:j + batch_size] 75 | Grad = grad(X_s, y_s, W) 76 | if norm(Grad) < epsilon: 77 | break 78 | W -= eta * Grad 79 | i += 1 80 | return W 81 | 82 | 83 | def momentum(X, y, epsilon=1e-2, max_iter=3600, eta=0.1, batch_size=4, alpha=0.01, nesterov=False): 84 | # 随机梯度下降 85 | W, indx_cycle = initial(X, y, batch_size) 86 | i = 0 87 | v = np.zeros((X.shape[1], 1)) # 初始化动量 88 | while i < max_iter: 89 | j = next(indx_cycle) 90 | X_s, y_s = X[j:j + batch_size], y[j:j + batch_size] 91 | if nesterov: 92 | W += alpha * v 93 | Grad = grad(X_s, y_s, W) 94 | if norm(Grad) < epsilon: 95 | break 96 | v = alpha * v - eta * Grad 97 | W += v 98 | i += 1 99 | return W 100 | 101 | 102 | def adag(X, y, epsilon=1e-2, max_iter=1000, eta=0.1, batch_size=4, eps_station=1e-10): 103 | # adaptive gradient descent自适应学习率梯度下降 104 | W, indx_cycle = initial(X, y, batch_size) 105 | r = np.zeros((X.shape[1], 1)) 106 | i = 0 107 | while i < max_iter: 108 | j = next(indx_cycle) 109 | X_s, y_s = X[j:j + batch_size], y[j:j + batch_size] 110 | Grad = grad(X_s, y_s, W) 111 | if norm(Grad) < epsilon: 112 | break 113 | r += Grad ** 2 114 | W -= eta * Grad / (np.sqrt(r) + eps_station) 115 | i += 1 116 | return W 117 | 118 | 119 | def rms_prop(X, y, epsilon=1e-2, max_iter=1000, eta=0.01, rho=0.9, batch_size=4, eps_station=1e-10): 120 | # 均方根反向传播算法 121 | W, indx_cycle = initial(X, y, batch_size) 122 | m = np.zeros((X.shape[1], 1)) 123 | i = 0 124 | while i < max_iter: 125 | j = next(indx_cycle) 126 | X_s, y_s = X[j:j + batch_size], y[j:j + batch_size] 127 | Grad = grad(X_s, y_s, W) 128 | if norm(Grad) < epsilon: 129 | break 130 | m = rho * m + (1 - rho) * Grad ** 2 131 | W -= eta * Grad / (np.sqrt(m) + eps_station) 132 | i += 1 133 | return W 134 | 135 | 136 | def adam(X, y, epsilon=1e-2, max_iter=1000, eta=0.01, beta1=0.9, beta2=0.999, batch_size=4, eps_station=1e-10): 137 | # adam算法 138 | W, indx_cycle = initial(X, y, batch_size) 139 | m = np.zeros((X.shape[1], 1)) 140 | v = np.zeros((X.shape[1], 1)) 141 | i = 0 142 | while i < max_iter: 143 | j = next(indx_cycle) 144 | X_s, y_s = 
136 | def adam(X, y, epsilon=1e-2, max_iter=1000, eta=0.01, beta1=0.9, beta2=0.999, batch_size=4, eps_station=1e-10):
137 |     # Adam (adaptive moment estimation)
138 |     W, indx_cycle = initial(X, y, batch_size)
139 |     m = np.zeros((X.shape[1], 1))
140 |     v = np.zeros((X.shape[1], 1))
141 |     i = 0
142 |     while i < max_iter:
143 |         j = next(indx_cycle)
144 |         X_s, y_s = X[j:j + batch_size], y[j:j + batch_size]
145 |         Grad = grad(X_s, y_s, W)
146 |         if norm(Grad) < epsilon:
147 |             break
148 |         m = beta1 * m + (1 - beta1) * Grad  # first-moment (mean) estimate
149 |         v = beta2 * v + (1 - beta2) * Grad ** 2  # second-moment estimate
150 |         m_bar = m / (1 - beta1 ** (i + 1))  # bias correction
151 |         v_bar = v / (1 - beta2 ** (i + 1))
152 |         W -= eta * m_bar / (np.sqrt(v_bar) + eps_station)
153 |         i += 1
154 |     return W
155 | 
156 | 
157 | def newton(X, y, epsilon=1e-2, max_iter=1000):
158 |     # Newton's method
159 |     i = 0
160 |     W = np.ones((X.shape[1], 1))
161 |     x1 = X[:, 0]  # first feature column, an n-vector
162 |     x2 = X[:, 1]  # second feature column (this implementation assumes exactly two features)
163 |     while i < max_iter:
164 |         err = X @ W - y  # n*1
165 |         Grad = X.T @ err / X.shape[0]
166 |         if norm(Grad) < epsilon:  # stop training once the 2-norm of the gradient falls below the threshold
167 |             break
168 |         # the Hessian of the squared-error loss is constant and equals X.T @ X / n
169 |         H11 = x1 @ x1
170 |         H12 = x1 @ x2
171 |         H22 = x2 @ x2
172 |         H = np.array([[H11, H12], [H12, H22]]) / X.shape[0]  # assemble the 2*2 Hessian, matching the 1/n scaling of Grad
173 |         W -= inv(H) @ Grad  # Newton step: unit step size along the inverse-Hessian-scaled negative gradient
174 |         i += 1
175 |     return W
176 | 
177 | 
178 | def bfgs(X, y, epsilon=1e-4, max_iter=1000):
179 |     # quasi-Newton (BFGS) algorithm
180 |     i = 0
181 |     W = np.ones((X.shape[1], 1))
182 |     N = X.shape[0]
183 |     B = np.eye(X.shape[1])  # initialize B, the d*d approximation of the Hessian
184 |     while i < max_iter:
185 |         err = X @ W - y
186 |         fx = err.T @ err / 2 / N
187 |         Grad = X.T @ err / N  # d*1, compute the gradient
188 |         if norm(Grad) < epsilon:  # stop training once the 2-norm of the gradient falls below the threshold
189 |             break
190 |         Dk = - inv(B) @ Grad  # d*1, descent direction
191 |         eta, W = wp_search(W, Dk, fx, Grad, X, y, N)  # step size and updated W; note that when using the WP rule the new gradient could also be returned to avoid recomputation
192 |         delta = eta * Dk  # d*1, increment of the variable W
193 |         yk = X.T @ (X @ W - y) / N - Grad  # d*1, change in the gradient (the secant pair used by the BFGS update)
194 |         B = B + yk @ yk.T / (yk.T @ delta)[0] - B @ delta @ (delta.T @ B) / (delta.T @ B @ delta)[0]  # BFGS update of B
195 |         i += 1
196 |     return W
197 | 
198 | def wp_search(W, Dk, fx, Grad, X, y, N, sigma=0.75, gamma=0.5, rho=1e-4, beta=1, maxm=100):
199 |     # inexact line search based on the Wolfe-Powell conditions
200 |     assert (rho < 1.0 / 2) and (rho > 0)
201 |     assert (gamma < 1.0) and (gamma > 0.0)
202 |     assert (sigma > rho) and (sigma < 1)
203 |     assert beta > 0
204 |     m = 0
205 |     W_new = None
206 |     eta = None
207 |     while m < maxm:
208 |         eta = beta * gamma ** m  # search over m for a suitable step size eta
209 |         W_new = W + eta * Dk
210 |         err_new = X @ W_new - y
211 |         fx_new = err_new.T @ err_new / 2 / N  # objective value after the step
212 |         diff_val = fx_new - fx  # actual decrease
213 |         gdk = Grad.T @ Dk
214 |         exp_diff = eta * gdk  # expected (first-order) decrease
215 |         Grad_new = X.T @ err_new / N  # gradient at the updated W
216 |         if (diff_val[0] <= rho * exp_diff[0]) and (Grad_new.T @ Dk >= sigma * gdk):  # Armijo + curvature conditions
217 |             break
218 |         m += 1
219 |     return eta, W_new
220 | 
221 | 
222 | def ag_search(W, Dk, fx, Grad, X, y, N, gamma=0.5, rho=1e-4, beta=1, maxm=100):
223 |     # inexact line search based on the Armijo-Goldstein conditions
224 |     assert (rho < 1.0 / 2) and (rho > 0)
225 |     assert (gamma < 1.0) and (gamma > 0.0)
226 |     assert beta > 0
227 |     m = 0
228 |     eta = None
229 |     W_new = None
230 |     while m < maxm:
231 |         eta = beta * gamma ** m  # search over m for a suitable step size eta
232 |         W_new = W + eta * Dk
233 |         err_new = X @ W_new - y
234 |         fx_new = err_new.T @ err_new / 2 / N  # objective value after the step
235 |         diff_val = fx_new - fx  # actual decrease
236 |         exp_diff = eta * Grad.T @ Dk  # expected (first-order) decrease
237 |         if (diff_val[0] <= rho * exp_diff[0]) and (diff_val[0] >= (1 - rho) * exp_diff[0]):
238 |             break
239 |         m += 1
240 |     return eta, W_new
241 | 
242 | 
243 | def predict(X, W):
244 |     return X @ W
245 | 
246 | 
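# --- Illustrative aside (not part of the original optimization_algorithm.py): wp_search and ag_search
# --- are written for the quasi-Newton direction used in bfgs, but they work for any descent direction.
# --- The hypothetical helper below sketches a single steepest-descent step (Dk = -Grad) driven by one
# --- of the two line searches, using the same loss scaling fx = ||X @ W - y||^2 / (2N) they expect.
def _one_line_search_step(X, y, W, use_wolfe_powell=True):
    N = X.shape[0]
    err = X @ W - y
    fx = err.T @ err / 2 / N      # current objective value, same scaling as in bfgs
    Grad = X.T @ err / N          # current gradient, d*1
    Dk = -Grad                    # steepest-descent direction
    search = wp_search if use_wolfe_powell else ag_search
    eta, W_new = search(W, Dk, fx, Grad, X, y, N)
    return eta, W_new             # accepted step size and the updated weights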
247 | if __name__ == '__main__':
248 |     from sklearn.metrics import mean_squared_error
249 |     from pprint import pprint
250 | 
251 |     train_data = np.array([[1.1, 1.5, 2.5],
252 |                            [1.3, 1.9, 3.2],
253 |                            [1.5, 2.3, 3.9],
254 |                            [1.7, 2.7, 4.6],
255 |                            [1.9, 3.1, 5.3],
256 |                            [2.1, 3.5, 6.0],
257 |                            [2.3, 3.9, 6.7],
258 |                            [2.5, 4.3, 7.4],
259 |                            [2.7, 4.7, 8.1],
260 |                            [2.9, 5.1, 8.8]])
261 |     X_train, y_train = train_data[:, :-1], train_data[:, [-1]]
262 |     test_data = np.array([[3.1, 5.5, 9.5],
263 |                           [3.3, 5.9, 10.2],
264 |                           [3.5, 6.3, 10.9],
265 |                           [3.7, 6.7, 11.6],
266 |                           [3.9, 7.1, 12.3]])
267 |     X_test, y_test = test_data[:, :-1], test_data[:, [-1]]
268 | 
269 |     W = bfgs(X_train, y_train, epsilon=1e-5, max_iter=3000)
270 |     y_pred = predict(X_test, W)
271 |     pprint(W)
272 |     print(y_pred, '\n', mean_squared_error(y_test, y_pred))
273 | 
--------------------------------------------------------------------------------
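For reference, the toy data in the `__main__` block above is almost exactly linear in its two features, so every optimizer in the file should move toward the ordinary least-squares solution; how close each one gets depends on its default learning rate and iteration budget. A minimal cross-check sketch, assuming optimization_algorithm.py can be imported as a module (the import path below is only an example):

import numpy as np

from optimization.optimization_algorithm import bgd, adam, bfgs  # hypothetical import path

train_data = np.array([[1.1, 1.5, 2.5], [1.3, 1.9, 3.2], [1.5, 2.3, 3.9], [1.7, 2.7, 4.6],
                       [1.9, 3.1, 5.3], [2.1, 3.5, 6.0], [2.3, 3.9, 6.7], [2.5, 4.3, 7.4],
                       [2.7, 4.7, 8.1], [2.9, 5.1, 8.8]])
X_train, y_train = train_data[:, :-1], train_data[:, [-1]]

# closed-form least squares (no intercept, matching the model used throughout the file)
W_ls = np.linalg.lstsq(X_train, y_train, rcond=None)[0]

for name, solver in [('bgd', bgd), ('adam', adam), ('bfgs', bfgs)]:
    W = solver(X_train, y_train)
    print(name, np.round(W.ravel(), 3), 'vs. least squares', np.round(W_ls.ravel(), 3))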