├── .gitattributes ├── 10混合高斯 ├── Gaussian_mixture_model.py ├── expectation_maximization.py └── sklearn中的混合高斯.txt ├── 11隐马尔可夫 └── HMM.py ├── 12EM算法 └── em.py ├── 13关联分析 ├── apriori_ys.py └── fpGrowth_ys.py ├── 1手写KNN.ipynb ├── 1线性回归 ├── .ipynb_checkpoints │ ├── linear_regression_example-checkpoint.ipynb │ └── 维度测试-checkpoint.py ├── LinearRegression.py ├── linear.py ├── linear_regression_data1.txt ├── linear_regression_example.ipynb ├── np测试.py └── 维度测试.py ├── 2手写决策树.ipynb ├── 2逻辑回归 ├── LogisticRegression-l2-线性不可分.py ├── LogisticRegression-l2.py ├── LogisticRegression.py ├── data2.html ├── data2.txt ├── lg.py └── logistic_regression_example.html ├── 3SVM ├── SVM │ ├── svm-digits.py │ ├── svm-simple.py │ ├── svm-smo.py │ ├── svm-svc.py │ ├── svmMLiA.py │ ├── testSet.txt │ ├── testSetRBF.txt │ └── testSetRBF2.txt ├── data2.txt ├── svm.py ├── svm_2.py ├── testSet.txt ├── testSetRBF.txt └── testSetRBF2.txt ├── 3线性回归.ipynb ├── 4决策树 ├── DT.py ├── Decision Tree ys.py ├── Dtree_id3.py └── data.csv ├── 4逻辑回归.ipynb ├── 5kmeans聚类.ipynb ├── 6贝叶斯 ├── iris.txt ├── naiveBayesBase.py ├── naiveBayes_mnist.py ├── 多项式贝叶斯hand_极大似然估计.py ├── 多项式贝叶斯hand_贝叶斯估计.py ├── 贝叶斯高斯.txt ├── 高斯贝叶斯 - 菊安酱.py └── 高斯贝叶斯.py ├── 7K近邻-KNN └── kNN.py ├── 8K-means ├── 1笔记.txt └── K-Means │ ├── K-Means_scikit-learn.py │ ├── K-Menas.py │ ├── bird.mat │ ├── bird.png │ ├── data.mat │ └── kmeansplusplus_ys.py ├── 9降维 ├── PCA-ceshi.py ├── PCA-mnist.py ├── PCA.py ├── __pycache__ │ └── tsne.cpython-36.pyc ├── data.mat ├── data_faces.mat ├── outfile.png ├── pca_juanjiang.py ├── testSet.txt └── tsne.py └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | *.html linguist-language=python 4 | -------------------------------------------------------------------------------- /10混合高斯/Gaussian_mixture_model.py: -------------------------------------------------------------------------------- 1 | # @Time : 2019/1/9 14:07 2 | # @Author : Xu Huipeng 3 | # @Blog : https://brycexxx.github.io/ 4 | import numpy as np 5 | from typing import Any 6 | 7 | 8 | class GaussianMixture: 9 | """ 10 | 高斯混合模型 11 | """ 12 | 13 | def __init__(self, n_components: int = 1, eps: float = 1e-3, 14 | max_iter: int = 200, random_state: Any = None): 15 | self.n_components = n_components 16 | self.eps = eps 17 | self.max_iter = max_iter 18 | self.random_state = random_state 19 | self.alpha = None 20 | self.miu = None 21 | self.sigma_square = None 22 | 23 | def fit(self, X: np.ndarray): 24 | m = X.shape[0] 25 | rs = np.random.RandomState(self.random_state) 26 | # 初始化模型参数 27 | alpha = rs.random_sample((1, self.n_components)) 28 | self.alpha = alpha / alpha.sum() 29 | self.miu = rs.random_sample((1, self.n_components)) 30 | self.sigma_square = rs.random_sample((1, self.n_components)) 31 | for _ in range(self.max_iter): 32 | # E 步 33 | gamma = self.alpha * (1.0 / np.sqrt(2 * np.pi * self.sigma_square) * 34 | (np.exp(-(X - self.miu) ** 2 / (2 * self.sigma_square)) + 1e-9)) 35 | gamma = gamma / gamma.sum(axis=1, keepdims=True) 36 | # M 步 37 | miu_old = self.miu.copy() 38 | sigma_square_old = self.sigma_square.copy() 39 | alpha_old = self.alpha.copy() 40 | self.miu = (gamma * X).sum(axis=0) / gamma.sum(axis=0, keepdims=True) 41 | self.sigma_square = (gamma * (X - self.miu) ** 2).sum(axis=0, keepdims=True) \ 42 | / gamma.sum(axis=0, keepdims=True) 43 | self.alpha = gamma.sum(axis=0, keepdims=True) / m 44 | delta_alpha = self.alpha - alpha_old 45 | 
delta_miu = self.miu - miu_old 46 | delta_sigma_square = self.sigma_square - sigma_square_old 47 | if np.linalg.norm(delta_miu) < self.eps and np.linalg.norm(delta_sigma_square) < self.eps \ 48 | and np.linalg.norm(delta_alpha) < self.eps: 49 | break 50 | return self 51 | 52 | 53 | if __name__ == "__main__": 54 | def generate_data(length, alpha0, alpha1, miu0, miu1, sigma0, sigma1): 55 | data = np.zeros((length, 1)) 56 | data0 = np.random.normal(miu0, sigma0, int(alpha0 * length)) 57 | data1 = np.random.normal(miu1, sigma1, int(alpha1 * length)) 58 | data[:int(alpha0 * length), 0] = data0[:] 59 | data[int(alpha0 * length):, 0] = data1[:] 60 | np.random.shuffle(data) 61 | return data 62 | 63 | 64 | data = generate_data(1000, 0.1, 0.9, 12, 11, 0.2, 6) 65 | # 初始化观测数据 66 | data=np.array([-67, -48, 6, 8, 14, 16, 23, 24, 28, 29, 41, 49, 56, 60, 75]).reshape(-1, 1) 67 | gmm = GaussianMixture(n_components=2, eps=1e-5, max_iter=1000) 68 | gmm.fit(data) 69 | print(gmm.alpha) 70 | print(gmm.miu) 71 | print(np.sqrt(gmm.sigma_square)) 72 | -------------------------------------------------------------------------------- /10混合高斯/expectation_maximization.py: -------------------------------------------------------------------------------- 1 | # @Time : 2019/1/8 13:54 2 | # @Author : Xu Huipeng 3 | # @Blog : https://brycexxx.github.io/ 4 | 5 | """ 6 | 双硬币模型 7 | 8 | 假设有两枚硬币A、B,以相同的概率随机选择一个硬币,进行如下的抛硬币实验:共做5次实验,每次实验独立的抛十次, 9 | 例如某次实验产生了H、T、T、T、H、H、T、H、T、H,H代表正面朝上。 10 | 11 | 假设实习生忘了记录每次试验选择的是 A 还是 B,我们无法观测实验数据中选择的硬币是哪个,数据如下: 12 | 硬币投掷结果观测序列 13 | observations = np.array([[1, 0, 0, 0, 1, 1, 0, 1, 0, 1], 14 | [1, 1, 1, 1, 0, 1, 1, 1, 1, 1], 15 | [1, 0, 1, 1, 1, 1, 1, 0, 1, 1], 16 | [1, 0, 1, 0, 0, 0, 1, 1, 0, 0], 17 | [0, 1, 1, 1, 0, 1, 1, 1, 0, 1]]) 18 | 19 | 问如何估计两个硬币正面出现的概率? 
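A sketch of the updates implemented below (the w, h, n notation is introduced here only for explanation): for a trial with h heads out of n tosses, the E step weights the trial by the responsibility of each coin,

    w_A = Binom(h; n, theta_A) / (Binom(h; n, theta_A) + Binom(h; n, theta_B)),    w_B = 1 - w_A,

and the M step re-estimates each probability from the expected head counts accumulated over all trials,

    theta_A = sum_i(w_A_i * h_i) / sum_i(w_A_i * n),    and likewise for theta_B.

The loop stops once the change in (theta_A, theta_B) falls below eps.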
20 | 21 | 题目来源:http://www.hankcs.com/ml/em-algorithm-and-its-generalization.html 22 | """ 23 | import numpy as np 24 | from scipy.stats import binom 25 | 26 | 27 | class ExpectationMaximization: 28 | """ 29 | 简明 EM(期望最大化) 算法实现 30 | """ 31 | 32 | def __init__(self, theta_a: float = 0.5, theta_b: float = 0.5, eps: float = 1e-3): 33 | self.eps = eps 34 | self.theta_a = theta_a 35 | self.theta_b = theta_b 36 | 37 | def fit(self, X: np.ndarray): 38 | # 初始化两枚硬币出现正面的概率 39 | n = X.shape[1] 40 | while True: 41 | counts = np.zeros((2, 2)) 42 | for x in X: 43 | obverse_freq = x.sum() 44 | p_from_a = binom.pmf(obverse_freq, n, self.theta_a) 45 | p_from_b = binom.pmf(obverse_freq, n, self.theta_b) 46 | # 正规化 47 | p_from_a_normalized = p_from_a / (p_from_a + p_from_b) 48 | p_from_b_normalized = p_from_b / (p_from_a + p_from_b) 49 | counts[0, 0] += p_from_a_normalized * obverse_freq 50 | counts[0, 1] += p_from_a_normalized * (n - obverse_freq) 51 | counts[1, 0] += p_from_b_normalized * obverse_freq 52 | counts[1, 1] += p_from_b_normalized * (n - obverse_freq) 53 | # 更新 theta 54 | theta_a_old, theta_b_old = self.theta_a, self.theta_b 55 | self.theta_a = counts[0, 0] / counts[0, :].sum() 56 | self.theta_b = counts[1, 0] / counts[1, :].sum() 57 | if np.linalg.norm([self.theta_a - theta_a_old, self.theta_b - theta_b_old]) < self.eps: 58 | break 59 | return self 60 | 61 | 62 | if __name__ == "__main__": 63 | x = np.array([[1, 0, 0, 0, 1, 1, 0, 1, 0, 1], 64 | [1, 1, 1, 1, 0, 1, 1, 1, 1, 1], 65 | [1, 0, 1, 1, 1, 1, 1, 0, 1, 1], 66 | [1, 0, 1, 0, 0, 0, 1, 1, 0, 0], 67 | [0, 1, 1, 1, 0, 1, 1, 1, 0, 1]]) 68 | em = ExpectationMaximization(0.999999999, 0.000001) 69 | em.fit(x) 70 | print(em.theta_a, em.theta_b) 71 | -------------------------------------------------------------------------------- /10混合高斯/sklearn中的混合高斯.txt: -------------------------------------------------------------------------------- 1 | sklearn.mixture.BayesianGaussianMixture 2 | sklearn.mixture.GaussianMixture 3 | 4 | 5 | -------------------------------------------------------------------------------- /11隐马尔可夫/HMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class HiddenMarkov: 5 | def forward(self, Q, V, A, B, O, PI): # 使用前向算法 6 | N = len(Q) # 状态序列的大小 7 | M = len(O) # 观测序列的大小 8 | alphas = np.zeros((N, M)) # alpha值 9 | T = M # 有几个时刻,有几个观测序列,就有几个时刻 10 | for t in range(T): # 遍历每一时刻,算出alpha值 11 | indexOfO = V.index(O[t]) # 找出序列对应的索引 12 | for i in range(N): 13 | if t == 0: # 计算初值 14 | alphas[i][t] = PI[t][i] * B[i][indexOfO] # P176(10.15) 15 | print('alpha1(%d)=p%db%db(o1)=%f' % (i, i, i, alphas[i][t])) 16 | else: 17 | alphas[i][t] = np.dot([alpha[t - 1] for alpha in alphas], [a[i] for a in A]) * B[i][ 18 | indexOfO] # 对应P176(10.16) 19 | print('alpha%d(%d)=[sigma alpha%d(i)ai%d]b%d(o%d)=%f' % (t, i, t - 1, i, i, t, alphas[i][t])) 20 | # print(alphas) 21 | P = np.sum([alpha[M - 1] for alpha in alphas]) # P176(10.17) 22 | # alpha11 = pi[0][0] * B[0][0] #代表a1(1) 23 | # alpha12 = pi[0][1] * B[1][0] #代表a1(2) 24 | # alpha13 = pi[0][2] * B[2][0] #代表a1(3) 25 | 26 | def backward(self, Q, V, A, B, O, PI): # 后向算法 27 | N = len(Q) # 状态序列的大小 28 | M = len(O) # 观测序列的大小 29 | betas = np.ones((N, M)) # beta 30 | for i in range(N): 31 | print('beta%d(%d)=1' % (M, i)) 32 | for t in range(M - 2, -1, -1): 33 | indexOfO = V.index(O[t + 1]) # 找出序列对应的索引 34 | for i in range(N): 35 | betas[i][t] = np.dot(np.multiply(A[i], [b[indexOfO] for b in B]), [beta[t + 1] for beta in betas]) 36 | realT = t + 1 37 | 
realI = i + 1 38 | print('beta%d(%d)=[sigma a%djbj(o%d)]beta%d(j)=(' % (realT, realI, realI, realT + 1, realT + 1), 39 | end='') 40 | for j in range(N): 41 | print("%.2f*%.2f*%.2f+" % (A[i][j], B[j][indexOfO], betas[j][t + 1]), end='') 42 | print("0)=%.3f" % betas[i][t]) 43 | # print(betas) 44 | indexOfO = V.index(O[0]) 45 | P = np.dot(np.multiply(PI, [b[indexOfO] for b in B]), [beta[0] for beta in betas]) 46 | print("P(O|lambda)=", end="") 47 | for i in range(N): 48 | print("%.1f*%.1f*%.5f+" % (PI[0][i], B[i][indexOfO], betas[i][0]), end="") 49 | print("0=%f" % P) 50 | 51 | def viterbi(self, Q, V, A, B, O, PI): 52 | N = len(Q) # 状态序列的大小 53 | M = len(O) # 观测序列的大小 54 | deltas = np.zeros((N, M)) 55 | psis = np.zeros((N, M)) 56 | I = np.zeros((1, M)) 57 | for t in range(M): 58 | realT = t+1 59 | indexOfO = V.index(O[t]) # 找出序列对应的索引 60 | for i in range(N): 61 | realI = i+1 62 | if t == 0: 63 | deltas[i][t] = PI[0][i] * B[i][indexOfO] 64 | psis[i][t] = 0 65 | print('delta1(%d)=pi%d * b%d(o1)=%.2f * %.2f=%.2f'%(realI, realI, realI, PI[0][i], B[i][indexOfO], deltas[i][t])) 66 | print('psis1(%d)=0' % (realI)) 67 | else: 68 | deltas[i][t] = np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) * B[i][indexOfO] 69 | print('delta%d(%d)=max[delta%d(j)aj%d]b%d(o%d)=%.2f*%.2f=%.5f'%(realT, realI, realT-1, realI, realI, realT, np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])), B[i][indexOfO], deltas[i][t])) 70 | psis[i][t] = np.argmax(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) 71 | print('psis%d(%d)=argmax[delta%d(j)aj%d]=%d' % (realT, realI, realT-1, realI, psis[i][t])) 72 | print(deltas) 73 | print(psis) 74 | I[0][M-1] = np.argmax([delta[M-1] for delta in deltas]) 75 | print('i%d=argmax[deltaT(i)]=%d' % (M, I[0][M-1]+1)) 76 | for t in range(M-2, -1, -1): 77 | I[0][t] = psis[int(I[0][t+1])][t+1] 78 | print('i%d=psis%d(i%d)=%d' % (t+1, t+2, t+2, I[0][t]+1)) 79 | print(I) 80 | 81 | #习题10.1 82 | Q = [1, 2, 3] 83 | V = ['红', '白'] 84 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 85 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 86 | # O = ['红', '白', '红', '红', '白', '红', '白', '白'] 87 | O = ['红', '白', '红', '白'] #习题10.1的例子 88 | # O = ['红', '白', '红'] 89 | PI = [[0.2, 0.4, 0.4]] 90 | 91 | HMM = HiddenMarkov() 92 | # HMM.forward(Q, V, A, B, O, PI) 93 | HMM.backward(Q, V, A, B, O, PI) 94 | # HMM.viterbi(Q, V, A, B, O, PI) -------------------------------------------------------------------------------- /12EM算法/em.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 数据集:伪造数据集(两个高斯分布混合) 3 | 数据集长度:1000 4 | ------------------------------ 5 | 运行结果: 6 | ---------------------------- 7 | the Parameters set is: 8 | alpha0:0.3, mu0:0.7, sigmod0:-2.0, alpha1:0.5, mu1:0.5, sigmod1:1.0 9 | ---------------------------- 10 | the Parameters predict is: 11 | alpha0:0.4, mu0:0.6, sigmod0:-1.7, alpha1:0.7, mu1:0.7, sigmod1:0.9 12 | ---------------------------- 13 | ''' 14 | 15 | import numpy as np 16 | import random 17 | import math 18 | import time 19 | 20 | def loadData(mu0, sigma0, mu1, sigma1, alpha0, alpha1): 21 | ''' 22 | 初始化数据集 23 | 这里通过服从高斯分布的随机函数来伪造数据集 24 | :param mu0: 高斯0的均值 25 | :param sigma0: 高斯0的方差 26 | :param mu1: 高斯1的均值 27 | :param sigma1: 高斯1的方差 28 | :param alpha0: 高斯0的系数 29 | :param alpha1: 高斯1的系数 30 | :return: 混合了两个高斯分布的数据 31 | ''' 32 | # 定义数据集长度为1000 33 | length = 1000 34 | 35 | # 初始化第一个高斯分布,生成数据,数据长度为length * alpha系数,以此来 36 | # 满足alpha的作用 37 | data0 = np.random.normal(mu0, sigma0, int(length * 
alpha0)) 38 | # 第二个高斯分布的数据 39 | data1 = np.random.normal(mu1, sigma1, int(length * alpha1)) 40 | 41 | # 初始化总数据集 42 | # 两个高斯分布的数据混合后会放在该数据集中返回 43 | dataSet = [] 44 | # 将第一个数据集的内容添加进去 45 | dataSet.extend(data0) 46 | # 添加第二个数据集的数据 47 | dataSet.extend(data1) 48 | # 对总的数据集进行打乱(其实不打乱也没事,只不过打乱一下直观上让人感觉已经混合了 49 | # 读者可以将下面这句话屏蔽以后看看效果是否有差别) 50 | random.shuffle(dataSet) 51 | 52 | #返回伪造好的数据集 53 | return dataSet 54 | 55 | def calcGauss(dataSetArr, mu, sigmod): 56 | ''' 57 | 根据高斯密度函数计算值 58 | 依据:“9.3.1 高斯混合模型” 式9.25 59 | 注:在公式中y是一个实数,但是在EM算法中(见算法9.2的E步),需要对每个j 60 | 都求一次yjk,在本实例中有1000个可观测数据,因此需要计算1000次。考虑到 61 | 在E步时进行1000次高斯计算,程序上比较不简洁,因此这里的y是向量,在numpy 62 | 的exp中如果exp内部值为向量,则对向量中每个值进行exp,输出仍是向量的形式。 63 | 所以使用向量的形式1次计算即可将所有计算结果得出,程序上较为简洁 64 | :param dataSetArr: 可观测数据集 65 | :param mu: 均值 66 | :param sigmod: 方差 67 | :return: 整个可观测数据集的高斯分布密度(向量形式) 68 | ''' 69 | # 计算过程就是依据式9.25写的,没有别的花样 70 | result = (1 / (math.sqrt(2*math.pi)*sigmod**2)) * np.exp(-1 * (dataSetArr-mu) * (dataSetArr-mu) / (2*sigmod**2)) 71 | # 返回结果 72 | return result 73 | 74 | 75 | def E_step(dataSetArr, alpha0, mu0, sigmod0, alpha1, mu1, sigmod1): 76 | ''' 77 | EM算法中的E步 78 | 依据当前模型参数,计算分模型k对观数据y的响应度 79 | :param dataSetArr: 可观测数据y 80 | :param alpha0: 高斯模型0的系数 81 | :param mu0: 高斯模型0的均值 82 | :param sigmod0: 高斯模型0的方差 83 | :param alpha1: 高斯模型1的系数 84 | :param mu1: 高斯模型1的均值 85 | :param sigmod1: 高斯模型1的方差 86 | :return: 两个模型各自的响应度 87 | ''' 88 | # 计算y0的响应度 89 | # 先计算模型0的响应度的分子 90 | gamma0 = alpha0 * calcGauss(dataSetArr, mu0, sigmod0) 91 | # 模型1响应度的分子 92 | gamma1 = alpha1 * calcGauss(dataSetArr, mu1, sigmod1) 93 | 94 | # 两者相加为E步中的分布 95 | sum = gamma0 + gamma1 96 | # 各自相除,得到两个模型的响应度 97 | gamma0 = gamma0 / sum 98 | gamma1 = gamma1 / sum 99 | 100 | # 返回两个模型响应度 101 | return gamma0, gamma1 102 | 103 | def M_step(muo, mu1, gamma0, gamma1, dataSetArr): 104 | # 依据算法9.2计算各个值 105 | # 这里没什么花样,对照书本公式看看这里就好了 106 | mu0_new = np.dot(gamma0, dataSetArr) / np.sum(gamma0) 107 | mu1_new = np.dot(gamma1, dataSetArr) / np.sum(gamma1) 108 | 109 | sigmod0_new = math.sqrt(np.dot(gamma0, (dataSetArr - muo)**2) / np.sum(gamma0)) 110 | sigmod1_new = math.sqrt(np.dot(gamma1, (dataSetArr - mu1)**2) / np.sum(gamma1)) 111 | 112 | alpha0_new = np.sum(gamma0) / len(gamma0) 113 | alpha1_new = np.sum(gamma1) / len(gamma1) 114 | 115 | # 将更新的值返回 116 | return mu0_new, mu1_new, sigmod0_new, sigmod1_new, alpha0_new, alpha1_new 117 | 118 | 119 | def EM_Train(dataSetList, iter=500): 120 | ''' 121 | 根据EM算法进行参数估计 122 | 算法依据“9.3.2 高斯混合模型参数估计的EM算法” 算法9.2 123 | :param dataSetList:数据集(可观测数据) 124 | :param iter: 迭代次数 125 | :return: 估计的参数 126 | ''' 127 | # 将可观测数据y转换为数组形式,主要是为了方便后续运算 128 | dataSetArr = np.array(dataSetList) 129 | 130 | # 步骤1:对参数取初值,开始迭代 131 | alpha0 = 0.5 132 | mu0 = 0 133 | sigmod0 = 1 134 | alpha1 = 0.5 135 | mu1 = 1 136 | sigmod1 = 1 137 | 138 | # 开始迭代 139 | step = 0 140 | while (step < iter): 141 | # 每次进入一次迭代后迭代次数加1 142 | step += 1 143 | # 步骤2:E步:依据当前模型参数,计算分模型k对观测数据y的响应度 144 | gamma0, gamma1 = E_step(dataSetArr, alpha0, mu0, sigmod0, alpha1, mu1, sigmod1) 145 | # 步骤3:M步 146 | mu0, mu1, sigmod0, sigmod1, alpha0, alpha1 = M_step(mu0, mu1, gamma0, gamma1, dataSetArr) 147 | 148 | # 迭代结束后将更新后的各参数返回 149 | return alpha0, mu0, sigmod0, alpha1, mu1, sigmod1 150 | 151 | 152 | if __name__ == '__main__': 153 | start = time.time() 154 | 155 | # 设置两个高斯模型进行混合,这里是初始化两个模型各自的参数 156 | # 见“9.3 EM算法在高斯混合模型学习中的应用” 157 | # alpha是“9.3.1 高斯混合模型” 定义9.2中的系数α 158 | # mu0是均值μ 159 | # sigmod是方差σ 160 | # 在设置上两个alpha的和必须为1,其他没有什么具体要求,符合高斯定义就可以 161 | alpha0 = 0.3 # 系数α 162 | mu0 = -2 # 均值μ 163 | sigmod0 = 0.5 # 方差σ 
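    # Note: np.random.normal(loc, scale, size) takes the standard deviation as
    # its third argument, so the sigmod values set here act as the std σ rather
    # than the variance σ² suggested by the comments. Relatedly, calcGauss above
    # normalizes by sqrt(2π)·σ**2 where 式9.25 uses sqrt(2π)·σ, and the two
    # print calls further down pass the parameters as (alpha0, alpha1, mu0, mu1,
    # sigmod0, sigmod1) while their labels read alpha0, mu0, sigmod0, ...; that
    # mismatch is why the sample output above shows pairs like "sigmod0:-2.0".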
164 | 165 | alpha1 = 0.7 # 系数α 166 | mu1 = 0.5 # 均值μ 167 | sigmod1 = 1 # 方差σ 168 | 169 | # 初始化数据集 170 | dataSetList = loadData(mu0, sigmod0, mu1, sigmod1, alpha0, alpha1) 171 | 172 | #打印设置的参数 173 | print('---------------------------') 174 | print('the Parameters set is:') 175 | print('alpha0:%.1f, mu0:%.1f, sigmod0:%.1f, alpha1:%.1f, mu1:%.1f, sigmod1:%.1f' % ( 176 | alpha0, alpha1, mu0, mu1, sigmod0, sigmod1 177 | )) 178 | 179 | # 开始EM算法,进行参数估计 180 | alpha0, mu0, sigmod0, alpha1, mu1, sigmod1 = EM_Train(dataSetList) 181 | 182 | # 打印参数预测结果 183 | print('----------------------------') 184 | print('the Parameters predict is:') 185 | print('alpha0:%.1f, mu0:%.1f, sigmod0:%.1f, alpha1:%.1f, mu1:%.1f, sigmod1:%.1f' % ( 186 | alpha0, alpha1, mu0, mu1, sigmod0, sigmod1 187 | )) 188 | 189 | # 打印时间 190 | print('----------------------------') 191 | print('time span:', time.time() - start) 192 | 193 | # --------------------------- 194 | # the Parameters set is: 195 | # alpha0:0.3, mu0:0.7, sigmod0:-2.0, alpha1:0.5, mu1:0.5, sigmod1:1.0 196 | # ---------------------------- 197 | # the Parameters predict is: 198 | # alpha0:0.4, mu0:0.6, sigmod0:-1.7, alpha1:0.7, mu1:0.7, sigmod1:0.9 199 | # ---------------------------- 200 | # time span: 0.06240034103393555 201 | -------------------------------------------------------------------------------- /13关联分析/fpGrowth_ys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding:utf8 3 | 4 | # ''' 5 | # FP-Growth FP means frequent pattern 6 | # the FP-Growth algorithm needs: 7 | # 1. FP-tree (class treeNode) 8 | # 2. header table (use dict) 9 | # This finds frequent itemsets similar to apriori but does not find association rules. 10 | # Author: Peter/片刻 11 | # GitHub: https://github.com/apachecn/AiLearning 12 | 13 | # ''' 14 | 15 | 16 | class treeNode: 17 | def __init__(self, nameValue, numOccur, parentNode): 18 | self.name = nameValue 19 | self.count = numOccur 20 | self.nodeLink = None 21 | # needs to be updated 22 | self.parent = parentNode 23 | self.children = {} 24 | 25 | def inc(self, numOccur): 26 | """inc(对count变量增加给定值) 27 | """ 28 | self.count += numOccur 29 | 30 | def disp(self, ind=1): 31 | """disp(用于将树以文本形式显示) 32 | 33 | """ 34 | print(' '*ind, self.name, ' ', self.count) 35 | for child in self.children.values(): 36 | child.disp(ind+1) 37 | 38 | 39 | def loadSimpDat(): 40 | simpDat = [['r', 'z', 'h', 'j', 'p'], 41 | ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], 42 | ['z'], 43 | ['r', 'x', 'n', 'o', 's'], 44 | # ['r', 'x', 'n', 'o', 's'], 45 | ['y', 'r', 'x', 'z', 'q', 't', 'p'], 46 | ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']] 47 | return simpDat 48 | 49 | 50 | def createInitSet(dataSet): 51 | retDict = {} 52 | for trans in dataSet: 53 | if frozenset(trans) not in retDict.keys(): 54 | retDict[frozenset(trans)] = 1 55 | else: 56 | retDict[frozenset(trans)] += 1 57 | return retDict 58 | 59 | 60 | # this version does not use recursion 61 | def updateHeader(nodeToTest, targetNode): 62 | """updateHeader(更新头指针,建立相同元素之间的关系,例如: 左边的r指向右边的r值,就是后出现的相同元素 指向 已经出现的元素) 63 | 64 | 从头指针的nodeLink开始,一直沿着nodeLink直到到达链表末尾。这就是链表。 65 | 性能:如果链表很长可能会遇到迭代调用的次数限制。 66 | 67 | Args: 68 | nodeToTest 满足minSup {所有的元素+(value, treeNode)} 69 | targetNode Tree对象的子节点 70 | """ 71 | # 建立相同元素之间的关系,例如: 左边的r指向右边的r值 72 | while (nodeToTest.nodeLink != None): 73 | nodeToTest = nodeToTest.nodeLink 74 | nodeToTest.nodeLink = targetNode 75 | 76 | 77 | def updateTree(items, inTree, headerTable, count): 78 | """updateTree(更新FP-tree,第二次遍历) 79 | 80 | 
# 针对每一行的数据 81 | # 最大的key, 添加 82 | Args: 83 | items 满足minSup 排序后的元素key的数组(大到小的排序) 84 | inTree 空的Tree对象 85 | headerTable 满足minSup {所有的元素+(value, treeNode)} 86 | count 原数据集中每一组Kay出现的次数 87 | """ 88 | # 取出 元素 出现次数最高的 89 | # 如果该元素在 inTree.children 这个字典中,就进行累加 90 | # 如果该元素不存在 就 inTree.children 字典中新增key,value为初始化的 treeNode 对象 91 | if items[0] in inTree.children: 92 | # 更新 最大元素,对应的 treeNode 对象的count进行叠加 93 | inTree.children[items[0]].inc(count) 94 | else: 95 | # 如果不存在子节点,我们为该inTree添加子节点 96 | inTree.children[items[0]] = treeNode(items[0], count, inTree) 97 | # 如果满足minSup的dist字典的value值第二位为null, 我们就设置该元素为 本节点对应的tree节点 98 | # 如果元素第二位不为null,我们就更新header节点 99 | if headerTable[items[0]][1] is None: 100 | # headerTable只记录第一次节点出现的位置 101 | headerTable[items[0]][1] = inTree.children[items[0]] 102 | else: 103 | # 本质上是修改headerTable的key对应的Tree,的nodeLink值 104 | updateHeader(headerTable[items[0]][1], inTree.children[items[0]]) 105 | if len(items) > 1: 106 | # 递归的调用,在items[0]的基础上,添加item0[1]做子节点, count只要循环的进行累计加和而已,统计出节点的最后的统计值。 107 | updateTree(items[1:], inTree.children[items[0]], headerTable, count) 108 | 109 | 110 | def createTree(dataSet, minSup=1): 111 | """createTree(生成FP-tree) 112 | 113 | Args: 114 | dataSet dist{行:出现次数}的样本数据 115 | minSup 最小的支持度 116 | Returns: 117 | retTree FP-tree 118 | headerTable 满足minSup {所有的元素+(value, treeNode)} 119 | """ 120 | # 支持度>=minSup的dist{所有元素:出现的次数} 121 | headerTable = {} 122 | # 循环 dist{行:出现次数}的样本数据 123 | for trans in dataSet: 124 | # 对所有的行进行循环,得到行里面的所有元素 125 | # 统计每一行中,每个元素出现的总次数 126 | for item in trans: 127 | # 例如: {'ababa': 3} count(a)=3+3+3=9 count(b)=3+3=6 128 | headerTable[item] = headerTable.get(item, 0) + dataSet[trans] 129 | 130 | 131 | # 删除 headerTable中,元素次数<最小支持度minSup的元素 132 | for k in list(headerTable.keys()): # python3中.keys()返回的是迭代器不是list,不能在遍历时对其改变。 133 | if headerTable[k] < minSup: 134 | del(headerTable[k]) 135 | 136 | # 满足minSup: set(各元素集合) 137 | freqItemSet = set(headerTable.keys()) 138 | # 如果不存在,直接返回None 139 | if len(freqItemSet) == 0: 140 | return None, None 141 | for k in headerTable: 142 | # 格式化: dist{元素key: [元素次数, None]} 143 | headerTable[k] = [headerTable[k], None] 144 | 145 | # create tree 146 | retTree = treeNode('Null Set', 1, None) 147 | # 循环 dist{行:出现次数}的样本数据 148 | for tranSet, count in dataSet.items(): 149 | # print('tranSet, count=', tranSet, count) 150 | # localD = dist{元素key: 元素总出现次数} 151 | localD = {} 152 | for item in tranSet: 153 | # 判断是否在满足minSup的集合中 154 | if item in freqItemSet: 155 | # print('headerTable[item][0]=', headerTable[item][0], headerTable[item]) 156 | localD[item] = headerTable[item][0] 157 | # print('localD=', localD) 158 | # 对每一行的key 进行排序,然后开始往树添加枝丫,直到丰满 159 | # 第二次,如果在同一个排名下出现,那么就对该枝丫的值进行追加,继续递归调用! 
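        # Worked example (sample data above, minSup=3, frequent counts
        # z:5, x:4, y:3, s:3, r:3, t:3): the transaction ['r','z','h','j','p']
        # keeps only {z:5, r:3} and is inserted as the ordered path ['z','r'];
        # ['z','y','x','w','v','u','t','s'] becomes ['z','x', ...] with the
        # three count-3 items following in a tie-broken order. Putting the
        # globally most frequent items first lets different transactions share
        # prefixes, which is what keeps the FP-tree compact.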
160 | if len(localD) > 0: 161 | # p=key,value; 所以是通过value值的大小,进行从大到小进行排序 162 | # orderedItems 表示取出元组的key值,也就是字母本身,但是字母本身是大到小的顺序 163 | orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)] 164 | # print 'orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n' 165 | # 填充树,通过有序的orderedItems的第一位,进行顺序填充 第一层的子节点。 166 | updateTree(orderedItems, retTree, headerTable, count) 167 | 168 | return retTree, headerTable 169 | 170 | 171 | def ascendTree(leafNode, prefixPath): 172 | """ascendTree(如果存在父节点,就记录当前节点的name值) 173 | 174 | Args: 175 | leafNode 查询的节点对于的nodeTree 176 | prefixPath 要查询的节点值 177 | """ 178 | if leafNode.parent is not None: 179 | prefixPath.append(leafNode.name) 180 | ascendTree(leafNode.parent, prefixPath) 181 | 182 | 183 | def findPrefixPath(basePat, treeNode): 184 | """findPrefixPath 基础数据集 185 | 186 | Args: 187 | basePat 要查询的节点值 188 | treeNode 查询的节点所在的当前nodeTree 189 | Returns: 190 | condPats 对非basePat的倒叙值作为key,赋值为count数 191 | """ 192 | condPats = {} 193 | # 对 treeNode的link进行循环 194 | while treeNode is not None: 195 | prefixPath = [] 196 | # 寻找改节点的父节点,相当于找到了该节点的频繁项集 197 | ascendTree(treeNode, prefixPath) 198 | # 排除自身这个元素,判断是否存在父元素(所以要>1, 说明存在父元素) 199 | if len(prefixPath) > 1: 200 | # 对非basePat的倒叙值作为key,赋值为count数 201 | # prefixPath[1:] 变frozenset后,字母就变无序了 202 | # condPats[frozenset(prefixPath)] = treeNode.count 203 | condPats[frozenset(prefixPath[1:])] = treeNode.count 204 | # 递归,寻找改节点的下一个 相同值的链接节点 205 | treeNode = treeNode.nodeLink 206 | # print(treeNode) 207 | return condPats 208 | 209 | 210 | def mineTree(inTree, headerTable, minSup, preFix, freqItemList): 211 | """mineTree(创建条件FP树) 212 | 213 | Args: 214 | inTree myFPtree 215 | headerTable 满足minSup {所有的元素+(value, treeNode)} 216 | minSup 最小支持项集 217 | preFix preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新 218 | freqItemList 用来存储频繁子项的列表 219 | """ 220 | # 通过value进行从小到大的排序, 得到频繁项集的key 221 | # 最小支持项集的key的list集合 222 | bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: (p[1][0],p[0]))] 223 | # print('-----', sorted(headerTable.items(), key=lambda p: p[1][0])) 224 | # print('bigL=', bigL) 225 | # 循环遍历 最频繁项集的key,从小到大的递归寻找对应的频繁项集 226 | for basePat in bigL: 227 | # preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新 228 | newFreqSet = preFix.copy() 229 | newFreqSet.add(basePat) 230 | # print('newFreqSet=', newFreqSet, preFix) 231 | 232 | freqItemList.append(newFreqSet) 233 | # print('freqItemList=', freqItemList) 234 | 235 | # 通过条件模式基找到的频繁项集 236 | condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) 237 | # print('condPattBases=', basePat, condPattBases) 238 | 239 | # 构建FP-tree 240 | myCondTree, myHead = createTree(condPattBases, minSup) 241 | # print('myHead=', myHead) 242 | # 挖掘条件 FP-tree, 如果myHead不为空,表示满足minSup {所有的元素+(value, treeNode)} 243 | if myHead is not None: 244 | print('condPattBases:',basePat,condPattBases) 245 | myCondTree.disp() 246 | print('*'*30) 247 | # 递归 myHead 找出频繁项集 248 | mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList) 249 | # print('\n') 250 | 251 | 252 | 253 | 254 | if __name__ == "__main__": 255 | # rootNode = treeNode('pyramid', 9, None) 256 | # rootNode.children['eye'] = treeNode('eye', 13, None) 257 | # rootNode.children['phoenix'] = treeNode('phoenix', 3, None) 258 | # # 将树以文本形式显示 259 | # # print(rootNode.disp()) 260 | 261 | # load样本数据 262 | simpDat = loadSimpDat() 263 | # print(simpDat, '\n') 264 | # frozen set 格式化 并 重新装载 样本数据,对所有的行进行统计求和,格式: {行:出现次数} 265 | initSet = createInitSet(simpDat) 266 | print(initSet) 267 | 268 | # 创建FP树 269 | # 
输入:dist{行:出现次数}的样本数据 和 最小的支持度 270 | # 输出:最终的PF-tree,通过循环获取第一层的节点,然后每一层的节点进行递归的获取每一行的字节点,也就是分支。然后所谓的指针,就是后来的指向已存在的 271 | myFPtree, myHeaderTab = createTree(initSet, 3) 272 | myFPtree.disp() 273 | 274 | # 抽取条件模式基 报错 275 | # 查询树节点的,频繁子项 276 | print('x --->', findPrefixPath('x', myHeaderTab['x'][1])) 277 | print('z --->', findPrefixPath('z', myHeaderTab['z'][1])) 278 | print('r --->', findPrefixPath('r', myHeaderTab['r'][1])) 279 | 280 | # 创建条件模式基 281 | freqItemList = [] 282 | mineTree(myFPtree, myHeaderTab, 3, set([]), freqItemList) 283 | print("freqItemList: \n", freqItemList) 284 | -------------------------------------------------------------------------------- /1手写KNN.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"1手写KNN.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"metadata":{"id":"-StIebn4df07","colab_type":"code","colab":{}},"cell_type":"code","source":["import numpy as np\n","import operator\n","import os\n","from collections import Counter"],"execution_count":0,"outputs":[]},{"metadata":{"id":"pNFTsZ7-rmHS","colab_type":"text"},"cell_type":"markdown","source":["# 手写KNN"]},{"metadata":{"id":"alz3qLVarpCG","colab_type":"code","colab":{}},"cell_type":"code","source":["def createDataSet():\n"," group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])\n"," labels = ['A', 'A', 'B', 'B']\n"," return group, labels"],"execution_count":0,"outputs":[]},{"metadata":{"id":"DOyywTMdrpHa","colab_type":"code","colab":{}},"cell_type":"code","source":["def classify0(inX, dataSet, labels, k): \n"," # 1. 计算距离 \n"," # inx - dataset 使用了numpy broadcasting\n","\tdist = np.sum((inX - dataSet)**2, axis=1)**0.5\n","\t# print(dist.shape)\n"," \n","\n"," # 2. k个最近的标签 \n"," # 对距离排序使用numpy中的argsort函数, 见 https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sort.html#numpy.sort\n"," # 函数返回的是数组值从小到大的索引值 ,因此取前k个索引使用[0 : k]\n"," # 将这k个标签存在列表k_labels中 \n","\tk_labels = [ labels[index] for index in dist.argsort()[0 : k] ]\n","\n"," # 3. 出现次数最多的标签即为最终类别 \n"," # 使用collections.Counter可以统计各个标签的出现次数,most_common返回出现次数最多的标签tuple,例如[('lable1', 2)],因此[0][0]可以取出标签值\n","\n","\tlabel = Counter(k_labels).most_common(1)[0][0]\n","\treturn label"],"execution_count":0,"outputs":[]},{"metadata":{"id":"b7JtoTD1rpKa","colab_type":"code","colab":{}},"cell_type":"code","source":["def test1():\n"," \"\"\"\n"," 第一个例子演示\n"," \"\"\"\n"," group, labels = createDataSet()\n"," print(str(group))\n"," print(str(labels))\n"," print(classify0([0.1, 0.1], group, labels, 3))"],"execution_count":0,"outputs":[]},{"metadata":{"id":"8bAC7vIkrpM6","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":119},"outputId":"04b4d547-08cc-4612-8898-b47e517bcb24","executionInfo":{"status":"ok","timestamp":1550998048385,"user_tz":-480,"elapsed":4025,"user":{"displayName":"Sen Yang","photoUrl":"","userId":"00832503676208839570"}}},"cell_type":"code","source":["test1()"],"execution_count":5,"outputs":[{"output_type":"stream","text":["[[1. 1.1]\n"," [1. 1. ]\n"," [0. 0. ]\n"," [0. 
0.1]]\n","['A', 'A', 'B', 'B']\n","B\n"],"name":"stdout"}]},{"metadata":{"id":"AAtvwaHXvBdA","colab_type":"code","colab":{}},"cell_type":"code","source":["from sklearn.datasets import load_iris\n","from sklearn.model_selection import train_test_split\n","import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt"],"execution_count":0,"outputs":[]},{"metadata":{"id":"M9rif3XPvBhg","colab_type":"code","colab":{}},"cell_type":"code","source":["iris = load_iris()\n","df = pd.DataFrame(iris.data, columns=iris.feature_names)\n","df['label'] = iris.target\n","df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']"],"execution_count":0,"outputs":[]},{"metadata":{"id":"QoMncMFNvBkK","colab_type":"code","colab":{}},"cell_type":"code","source":["data = np.array(df.iloc[:100, [0, 1, -1]])\n","X, y = data[:,:-1], data[:,-1]\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"4f4xoVrlwpwI","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"876ccff9-18cb-4498-999a-45cd9ab53666","executionInfo":{"status":"ok","timestamp":1550999274393,"user_tz":-480,"elapsed":1595,"user":{"displayName":"Sen Yang","photoUrl":"","userId":"00832503676208839570"}}},"cell_type":"code","source":["X_train.shape"],"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(80, 2)"]},"metadata":{"tags":[]},"execution_count":13}]},{"metadata":{"id":"rjZtvg-9vBmp","colab_type":"code","colab":{}},"cell_type":"code","source":["def irisTest():\n","\n"," # 设置测试的样本数量\n"," numTestVecs = X_test.shape[0]\n"," \n"," errorCount = 0\n"," for i in range(numTestVecs):\n"," # 对数据测试\n"," classifierResult = classify0(X_test[i], X_train, y_train, 3)\n"," errorCount += classifierResult != y_test[i]\n"," print(\"the total error rate is: %f\" % (errorCount / numTestVecs))\n"," print(errorCount)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"Gx4kNUudvBo7","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"72d07a42-2d43-4994-df37-66a8d687dced","executionInfo":{"status":"ok","timestamp":1550999329549,"user_tz":-480,"elapsed":651,"user":{"displayName":"Sen Yang","photoUrl":"","userId":"00832503676208839570"}}},"cell_type":"code","source":["irisTest()"],"execution_count":15,"outputs":[{"output_type":"stream","text":["the total error rate is: 0.050000\n","1\n"],"name":"stdout"}]},{"metadata":{"id":"4xLlKZVsvBrb","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]},{"metadata":{"id":"nJAvi5dTvBt6","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]},{"metadata":{"id":"ToEYRXs6rpPS","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]},{"metadata":{"id":"kAcPVI24di9O","colab_type":"text"},"cell_type":"markdown","source":["# 李航版KNN"]},{"metadata":{"id":"GtcMb9Ajdnrm","colab_type":"code","colab":{}},"cell_type":"code","source":["class KNN:\n"," def __init__(self, X_train, y_train, n_neighbors=3, p=2):\n"," \"\"\"\n"," parameter: n_neighbors 临近点个数\n"," parameter: p 距离度量\n"," \"\"\"\n"," self.n = n_neighbors\n"," self.p = p\n"," self.X_train = X_train\n"," self.y_train = y_train\n"," \n"," def predict(self, X):\n"," # 取出n个点\n"," knn_list = []\n"," for i in range(self.n):\n","# 计算每个样本到训练样本的距离\n"," dist = np.linalg.norm(X - self.X_train[i], ord=self.p)\n"," knn_list.append((dist, 
self.y_train[i]))\n"," \n"," for i in range(self.n, len(self.X_train)):\n"," max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))\n"," dist = np.linalg.norm(X - self.X_train[i], ord=self.p)\n"," if knn_list[max_index][0] > dist:\n"," knn_list[max_index] = (dist, self.y_train[i])\n"," \n"," # 统计\n"," knn = [k[-1] for k in knn_list]\n"," count_pairs = Counter(knn)\n"," max_count = sorted(count_pairs, key=lambda x:x)[-1]\n"," return max_count\n"," \n"," def score(self, X_test, y_test):\n"," right_count = 0\n"," n = 10\n"," for X, y in zip(X_test, y_test):\n"," label = self.predict(X)\n"," if label == y:\n"," right_count += 1\n"," return right_count / len(X_test)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"EE091sMJxKfW","colab_type":"code","colab":{}},"cell_type":"code","source":["clf = KNN(X_train, y_train)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"Up_odiRSxMjJ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"6cf0a9c6-672e-48e7-c100-ab89e053d651","executionInfo":{"status":"ok","timestamp":1550999418732,"user_tz":-480,"elapsed":1106,"user":{"displayName":"Sen Yang","photoUrl":"","userId":"00832503676208839570"}}},"cell_type":"code","source":["clf.score(X_test, y_test)"],"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.95"]},"metadata":{"tags":[]},"execution_count":18}]},{"metadata":{"id":"PHmF75qxxPKq","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /1线性回归/.ipynb_checkpoints/维度测试-checkpoint.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1线性回归/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.utils import shuffle 5 | import matplotlib.pyplot as plt 6 | 7 | # 形状非常重要,而且容易错误 8 | 9 | def fit_normal(X_train, y_train): 10 | """根据训练数据集X_train, y_train训练Linear Regression模型""" 11 | assert X_train.shape[0] == y_train.shape[0], \ 12 | "the size of X_train must be equal to the size of y_train" 13 | 14 | # np.vstack():在竖直方向上堆叠 15 | # np.hstack():在水平方向上平铺 16 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) # 为了增加常数项 17 | theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train) 18 | 19 | intercept = theta[0] 20 | coef = theta[1:] 21 | 22 | return theta 23 | 24 | def fit_bgd(X_train, y_train, eta=0.01, n_iters=1e5): 25 | """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型""" 26 | assert X_train.shape[0] == y_train.shape[0], \ 27 | "the size of X_train must be equal to the size of y_train" 28 | 29 | 30 | def costfunc(theta, X_b, y): 31 | # 计算损失函数 32 | try: 33 | return np.sum((y - X_b.dot(theta)) ** 2) / len(y)/2 34 | except: 35 | return float('inf') 36 | 37 | def dJ(theta, X_b, y): 38 | # 损失函数求导 39 | return X_b.T.dot(X_b.dot(theta) - y) / len(y) 40 | 41 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=n_iters, epsilon=1e-8): 42 | 43 | theta = initial_theta 44 | cur_iter = 0 45 | print('X_b.dot(theta)=',(X_b.dot(theta)).shape) 46 | print('(X_b.dot(theta) - y).shape=',(X_b.dot(theta) - y).shape) 47 | print('X_b.T.dot(X_b.dot(theta) - y).shape=',X_b.T.dot(X_b.dot(theta) - y).shape) 48 | 49 | # y = np.array(data[:,1])时的维度 50 | # 
y_train.shape= (97,) 51 | # theta.shape= (2,) 52 | # X_b.dot(theta)= (97,) 53 | # (X_b.dot(theta) - y).shape= (97,) 54 | # X_b.T.dot(X_b.dot(theta) - y).shape= (2,) 55 | 56 | 57 | # y = np.c_[data[:,1]]时的维度 58 | # y_train.shape= (97, 1) 59 | # theta.shape= (2,) 60 | # X_b.dot(theta)= (97,) 61 | # (X_b.dot(theta) - y).shape= (97, 97) 62 | # X_b.T.dot(X_b.dot(theta) - y).shape= (2, 97) 63 | # ValueError: operands could not be broadcast together with shapes (2,) (2,97) 64 | 65 | 66 | while cur_iter < n_iters: 67 | gradient = dJ(theta, X_b, y) 68 | # print((X_b.dot(theta)).shape) 69 | last_theta = theta 70 | # print(gradient.shape) 71 | theta = theta - eta * gradient 72 | if (abs(costfunc(theta, X_b, y) - costfunc(last_theta, X_b, y)) < epsilon): 73 | break 74 | 75 | cur_iter += 1 76 | 77 | return theta 78 | 79 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 80 | print('X_b.shape=',X_b.shape) 81 | print('y_train.shape=',y_train.shape) 82 | initial_theta = np.zeros(X_b.shape[1]) #初始化theta 83 | print('theta.shape=',initial_theta.shape) 84 | theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters) 85 | 86 | intercept_ = theta[0] 87 | coef_ = theta[1:] 88 | 89 | return theta 90 | 91 | def predict(X_predict,theta): 92 | """给定待预测数据集X_predict,返回表示X_predict的结果向量""" 93 | 94 | X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict]) 95 | return X_b.dot(theta) 96 | 97 | def test(): 98 | data = np.loadtxt('linear_regression_data1.txt', delimiter=',') 99 | X = np.c_[data[:,0]] 100 | y = np.array(data[:,1]) 101 | y1 = np.c_[data[:,1]] 102 | print(fit_normal(X,y)) 103 | print(fit_bgd(X,y)) 104 | 105 | regr = LinearRegression() 106 | regr.fit(X, y) 107 | print(regr.intercept_,regr.coef_) 108 | 109 | def test0425(): 110 | # 加载数据 111 | diabets = load_diabetes() 112 | data = diabets.data 113 | target = diabets.target 114 | 115 | # 打乱数据 116 | X, y = shuffle(data, target, random_state=13) 117 | 118 | # 划分训练集和测试集 119 | offset = int(X.shape[0] * 0.9) 120 | X_train, y_train = X[:offset], y[:offset] 121 | X_test, y_test = X[offset:], y[offset:] 122 | y_train = y_train.reshape((-1, 1)) 123 | y_test = y_test.reshape((-1, 1)) 124 | 125 | print(X_train.shape) 126 | print(X_test.shape) 127 | print(y_train.shape) 128 | print(y_test.shape) 129 | 130 | X=X_train 131 | y=y_train 132 | print(fit_normal(X,y)) 133 | print(fit_bgd(X,y.reshape(len(y)))) 134 | 135 | regr = LinearRegression() 136 | regr.fit(X, y) 137 | print(regr.intercept_,regr.coef_) 138 | 139 | 140 | if __name__ == '__main__': 141 | test0425() 142 | 143 | # ValueError: operands could not be broadcast together with shapes (2,) (2,97) 144 | -------------------------------------------------------------------------------- /1线性回归/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_diabetes 3 | from sklearn.utils import shuffle 4 | import matplotlib.pyplot as plt 5 | from sklearn.linear_model import LinearRegression 6 | 7 | def initialize_params(dims): 8 | w = np.zeros((dims, 1)) 9 | b = 0 10 | return w, b 11 | 12 | 13 | def linear_loss(X, y, w, b): 14 | num_train = X.shape[0] 15 | # 模型公式 16 | y_hat = np.dot(X, w) + b 17 | # 损失函数 18 | loss = np.sum((y_hat - y) ** 2) / num_train 19 | # 参数偏导 20 | dw = np.dot(X.T, (y_hat - y)) / num_train 21 | db = np.sum(y_hat - y) / num_train 22 | return y_hat, loss, dw, db 23 | 24 | 25 | def linear_train(X, y, learning_rate, epochs): 26 | # 参数初始化 27 | w, b = initialize_params(X.shape[1]) 28 | 29 | loss_list = [] 30 
| for i in range(1, epochs): 31 | # 计算当前预测值、损失和梯度 32 | y_hat, loss, dw, db = linear_loss(X, y, w, b) 33 | loss_list.append(loss) 34 | 35 | # 基于梯度下降的参数更新 36 | w += -learning_rate * dw 37 | b += -learning_rate * db 38 | 39 | # 打印迭代次数和损失 40 | if i % 10000 == 0: 41 | print('epoch %d loss %f' % (i, loss)) 42 | 43 | # 保存参数 44 | params = { 45 | 'w': w, 46 | 'b': b 47 | } 48 | 49 | # 保存梯度 50 | grads = { 51 | 'dw': dw, 52 | 'db': db 53 | } 54 | return loss_list, loss, params, grads 55 | 56 | 57 | def predict(X, params): 58 | w = params['w'] 59 | b = params['b'] 60 | y_pred = np.dot(X, w) + b 61 | return y_pred 62 | 63 | 64 | if __name__ == "__main__": 65 | # 加载数据 66 | diabets = load_diabetes() 67 | data = diabets.data 68 | target = diabets.target 69 | 70 | # 打乱数据 71 | X, y = shuffle(data, target, random_state=13) 72 | 73 | # 划分训练集和测试集 74 | offset = int(X.shape[0] * 0.9) 75 | X_train, y_train = X[:offset], y[:offset] 76 | X_test, y_test = X[offset:], y[offset:] 77 | y_train = y_train.reshape((-1, 1)) 78 | y_test = y_test.reshape((-1, 1)) 79 | 80 | print(X_train.shape) 81 | print(X_test.shape) 82 | print(y_train.shape) 83 | print(y_test.shape) 84 | 85 | # 训练 86 | loss_list, loss, params, grads = linear_train(X_train, y_train, 0.01, 100000) 87 | print(params) 88 | 89 | regr = LinearRegression() 90 | regr.fit(X, y) 91 | print(regr.intercept_,regr.coef_) 92 | 93 | # 预测 94 | y_pred = predict(X_test, params) 95 | print(y_pred[:5]) 96 | 97 | # 画图 98 | f = X_test.dot(params['w']) + params['b'] 99 | plt.scatter(range(X_test.shape[0]), y_test) 100 | plt.plot(f, color='darkorange') 101 | plt.xlabel('x') 102 | plt.xlabel('y') 103 | plt.show() 104 | 105 | plt.plot(loss_list, color='blue') 106 | plt.xlabel('epochs') 107 | plt.ylabel('loss') 108 | plt.show() -------------------------------------------------------------------------------- /1线性回归/linear_regression_data1.txt: -------------------------------------------------------------------------------- 1 | 6.1101,17.592 2 | 5.5277,9.1302 3 | 8.5186,13.662 4 | 7.0032,11.854 5 | 5.8598,6.8233 6 | 8.3829,11.886 7 | 7.4764,4.3483 8 | 8.5781,12 9 | 6.4862,6.5987 10 | 5.0546,3.8166 11 | 5.7107,3.2522 12 | 14.164,15.505 13 | 5.734,3.1551 14 | 8.4084,7.2258 15 | 5.6407,0.71618 16 | 5.3794,3.5129 17 | 6.3654,5.3048 18 | 5.1301,0.56077 19 | 6.4296,3.6518 20 | 7.0708,5.3893 21 | 6.1891,3.1386 22 | 20.27,21.767 23 | 5.4901,4.263 24 | 6.3261,5.1875 25 | 5.5649,3.0825 26 | 18.945,22.638 27 | 12.828,13.501 28 | 10.957,7.0467 29 | 13.176,14.692 30 | 22.203,24.147 31 | 5.2524,-1.22 32 | 6.5894,5.9966 33 | 9.2482,12.134 34 | 5.8918,1.8495 35 | 8.2111,6.5426 36 | 7.9334,4.5623 37 | 8.0959,4.1164 38 | 5.6063,3.3928 39 | 12.836,10.117 40 | 6.3534,5.4974 41 | 5.4069,0.55657 42 | 6.8825,3.9115 43 | 11.708,5.3854 44 | 5.7737,2.4406 45 | 7.8247,6.7318 46 | 7.0931,1.0463 47 | 5.0702,5.1337 48 | 5.8014,1.844 49 | 11.7,8.0043 50 | 5.5416,1.0179 51 | 7.5402,6.7504 52 | 5.3077,1.8396 53 | 7.4239,4.2885 54 | 7.6031,4.9981 55 | 6.3328,1.4233 56 | 6.3589,-1.4211 57 | 6.2742,2.4756 58 | 5.6397,4.6042 59 | 9.3102,3.9624 60 | 9.4536,5.4141 61 | 8.8254,5.1694 62 | 5.1793,-0.74279 63 | 21.279,17.929 64 | 14.908,12.054 65 | 18.959,17.054 66 | 7.2182,4.8852 67 | 8.2951,5.7442 68 | 10.236,7.7754 69 | 5.4994,1.0173 70 | 20.341,20.992 71 | 10.136,6.6799 72 | 7.3345,4.0259 73 | 6.0062,1.2784 74 | 7.2259,3.3411 75 | 5.0269,-2.6807 76 | 6.5479,0.29678 77 | 7.5386,3.8845 78 | 5.0365,5.7014 79 | 10.274,6.7526 80 | 5.1077,2.0576 81 | 5.7292,0.47953 82 | 5.1884,0.20421 83 | 6.3557,0.67861 84 | 9.7687,7.5435 85 | 
6.5159,5.3436 86 | 8.5172,4.2415 87 | 9.1802,6.7981 88 | 6.002,0.92695 89 | 5.5204,0.152 90 | 5.0594,2.8214 91 | 5.7077,1.8451 92 | 7.6366,4.2959 93 | 5.8707,7.2029 94 | 5.3054,1.9869 95 | 8.2934,0.14454 96 | 13.394,9.0551 97 | 5.4369,0.61705 98 | -------------------------------------------------------------------------------- /1线性回归/np测试.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | a = np.array([1, 2, 3]) 3 | b = np.array([4, 5, 6]) 4 | c = np.c_[a,b] 5 | 6 | print(np.r_[a,b]) 7 | print(c) 8 | print(np.c_[c,a]) 9 | print(np.c_[c,a].size) 10 | 11 | # np.c_是按行连接两个矩阵,就是把两矩阵左右相加,要求行数相等, 12 | # 类似于pandas中的merge()。 13 | # [1 2 3 4 5 6] 14 | # [[1 4] 15 | # [2 5] 16 | # [3 6]] 17 | # [[1 4 1] 18 | # [2 5 2] 19 | # [3 6 3]] 20 | # 9 -------------------------------------------------------------------------------- /1线性回归/维度测试.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.linear_model import LinearRegression 6 | from mpl_toolkits.mplot3d import axes3d 7 | 8 | 9 | data = np.loadtxt('linear_regression_data1.txt', delimiter=',') 10 | X = np.c_[data[:,0]] 11 | X1 = np.c_[np.ones(data.shape[0]),data[:,0]] 12 | y1 = np.c_[data[:,1]] 13 | y2 = np.array(data[:,1]) 14 | print('X1.shape=',X1.shape) 15 | print('y1.shape=',y1.shape) 16 | print('len(y1)=',len(y1)) 17 | print('y2.shape=',y2.shape) 18 | 19 | initial_theta = np.zeros((X1.shape[1],1)) 20 | # 计算损失函数 21 | def computeCost(X, y, theta=initial_theta): 22 | m = y.size 23 | J = 0 24 | 25 | h = X.dot(theta) 26 | 27 | J = 1.0/(2*m)*(np.sum(np.square(h-y))) 28 | 29 | return J 30 | 31 | # 梯度下降 32 | def gradientDescent(X, y, theta=initial_theta, alpha=0.01, num_iters=5000): 33 | m = y.size 34 | J_history = np.zeros(num_iters) 35 | 36 | 37 | theta1 = np.zeros(X.shape[1]) 38 | print('theta1.shape=',theta1.shape) 39 | 40 | print('theta.shape=',theta.shape) 41 | for iter in np.arange(num_iters): 42 | h = X.dot(theta) 43 | theta = theta - alpha*(1.0/m)*(X.T.dot(h-y)) 44 | J_history[iter] = computeCost(X, y, theta) 45 | return(theta, J_history) 46 | 47 | 48 | theta , Cost_J = gradientDescent(X1, y1) 49 | print('theta: ',theta.ravel()) 50 | 51 | 52 | # y1.shape= (97, 1) 53 | # y2.shape= (97,) 54 | # theta1.shape= (2,) 55 | # theta.shape= (2, 1) -------------------------------------------------------------------------------- /2逻辑回归/LogisticRegression-l2-线性不可分.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | # 带正则化的LR 11 | # data 12 | def create_data(): 13 | iris = load_iris() 14 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 15 | df['label'] = iris.target 16 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 17 | data = np.array(df.iloc[:100, [0,1,-1]]) 18 | # print(data) 19 | return data[:,:2], data[:,-1] 20 | 21 | 22 | #定义sigmoid函数 23 | def sigmoid(z): 24 | return(1 / (1 + np.exp(-z))) 25 | 26 | def fit(X_train, y_train,lam, eta=0.01, n_iters=1e4): 27 | """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型""" 28 | assert X_train.shape[0] == y_train.shape[0], \ 29 | "the 
size of X_train must be equal to the size of y_train" 30 | 31 | 32 | def costfunc(theta, X_b, y,lam =1): 33 | # 计算损失函数 34 | y_hat = sigmoid(X_b.dot(theta)) 35 | try: 36 | return -np.sum(y*np.log(y_hat) + (1-y)*np.log(1-y_hat)) / len(y) \ 37 | + (lam/(2.0*len(y)))*np.sum(np.square(theta[1:])) 38 | except: 39 | return float('inf') 40 | 41 | def dJ(theta, X_b, y,lam): 42 | # 损失函数求导 43 | y_hat = sigmoid(X_b.dot(theta)) 44 | return X_b.T.dot(y_hat - y) / len(y) + (lam/len(y))*np.r_[[0],theta[1:]] 45 | 46 | def gradient_descent(X_b, y, initial_theta, lam,eta, n_iters=1e4, epsilon=1e-8): 47 | 48 | theta = initial_theta 49 | cur_iter = 0 50 | 51 | while cur_iter < n_iters: 52 | gradient = dJ(theta, X_b, y,lam) 53 | last_theta = theta 54 | theta = theta - eta * gradient 55 | if (abs(costfunc(theta, X_b, y,lam) - costfunc(last_theta, X_b, y,lam)) < epsilon): 56 | break 57 | 58 | cur_iter += 1 59 | 60 | return theta 61 | 62 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 63 | # X_b = np.hstack([np.zeros((len(X_train), 1)), X_train]) 64 | initial_theta = np.zeros(X_b.shape[1]) #初始化theta 65 | theta = gradient_descent(X_b, y_train,initial_theta,lam ,eta, n_iters) 66 | 67 | intercept_ = theta[0] 68 | coef_ = theta[1:] 69 | 70 | return theta 71 | 72 | def predict(X_predict,theta): 73 | 74 | X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict]) 75 | # X_b = np.hstack([np.zeros((len(X_predict), 1)), X_predict]) 76 | proba = sigmoid(X_b.dot(theta)) 77 | return np.array(proba >= 0.6, dtype='int') 78 | 79 | def test(): 80 | X, y = create_data() 81 | 82 | weight = fit(X,y) 83 | 84 | x_ponits = np.arange(4, 8) 85 | y_ = -(weight[1]*x_ponits + weight[0])/weight[2] 86 | 87 | 88 | print(weight) 89 | 90 | clf = LogisticRegression( 91 | # max_iter=200, 92 | C=1) 93 | clf.fit(X, y) 94 | print(clf.intercept_,clf.coef_) 95 | 96 | y_2 = -(clf.coef_[0][0]*x_ponits + clf.intercept_[0])/clf.coef_[0][1] 97 | 98 | theta2 = np.array([clf.intercept_[0],clf.coef_[0][0],clf.coef_[0][1]]) 99 | 100 | plt.plot(x_ponits, y_,label='ys-lr') 101 | plt.plot(x_ponits, y_2,label='sklearn') 102 | plt.scatter(X[y==0,0], X[y==0,1]) 103 | plt.scatter(X[y==1,0], X[y==1,1]) 104 | plt.legend() 105 | plt.show() 106 | 107 | def testsklearn(): 108 | clf = LogisticRegression() 109 | X, y = create_data() 110 | clf.fit(X, y) 111 | print(clf.intercept_,clf.coef_) 112 | 113 | # if __name__ == '__main__': 114 | # test() 115 | # # testsklearn() 116 | X = np.random.normal(0, 1, size=(200, 2)) 117 | y = np.array((X[:,0]**2+X[:,1])<1.5, dtype='int') 118 | 119 | 120 | # data2 = np.loadtxt('data2.txt', delimiter=',') 121 | # y = np.array(data2[:,2]) 122 | # X = np.array(data2[:,0:2]) 123 | 124 | 125 | poly = PolynomialFeatures(6) 126 | XX = poly.fit_transform(X) 127 | 128 | fig, axes = plt.subplots(1,3, sharey = True, figsize=(17,5)) 129 | 130 | # 决策边界,咱们分别来看看正则化系数lambda太大太小分别会出现什么情况 131 | # Lambda = 0 : 就是没有正则化,这样的话,就过拟合咯 132 | # Lambda = 1 : 这才是正确的打开方式 133 | # Lambda = 100 : 卧槽,正则化项太激进,导致基本就没拟合出决策边界 134 | 135 | for i, C in enumerate([0.0, 10.0, 1000.0]): 136 | # 最优化 costFunctionReg 137 | weight = fit(XX,y,lam=C) 138 | 139 | # 准确率 140 | accuracy = 100.0*sum(predict(XX,weight) == y.ravel())/y.size 141 | 142 | # 对X,y的散列绘图 143 | 144 | # plt.scatter(X[y==0,0], X[y==0,1]) 145 | # plt.scatter(X[y==1,0], X[y==1,1]) 146 | axes.flatten()[i].scatter(X[y==0,0], X[y==0,1]) 147 | axes.flatten()[i].scatter(X[y==1,0], X[y==1,1]) 148 | # 画出决策边界 149 | x1_min, x1_max = X[:,0].min(), X[:,0].max(), 150 | x2_min, x2_max = X[:,1].min(), X[:,1].max(), 151 | xx1, xx2 = 
np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max)) 152 | h = sigmoid(poly.fit_transform(np.c_[xx1.ravel(), xx2.ravel()]).dot(weight[1:])+1*weight[0]) 153 | h = h.reshape(xx1.shape) 154 | axes.flatten()[i].contour(xx1, xx2, h, [0.5], linewidths=1, colors='g'); 155 | axes.flatten()[i].set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), C)) 156 | plt.show() -------------------------------------------------------------------------------- /2逻辑回归/LogisticRegression-l2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | # 带正则化的LR 11 | # data 12 | def create_data(): 13 | iris = load_iris() 14 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 15 | df['label'] = iris.target 16 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 17 | data = np.array(df.iloc[:100, [0,1,-1]]) 18 | # print(data) 19 | return data[:,:2], data[:,-1] 20 | 21 | 22 | #定义sigmoid函数 23 | def sigmoid(z): 24 | return(1 / (1 + np.exp(-z))) 25 | 26 | def fit(X_train, y_train, eta=0.01, n_iters=1e4): 27 | """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型""" 28 | assert X_train.shape[0] == y_train.shape[0], \ 29 | "the size of X_train must be equal to the size of y_train" 30 | 31 | 32 | def costfunc(theta, X_b, y,lam =1): 33 | # 计算损失函数 34 | y_hat = sigmoid(X_b.dot(theta)) 35 | try: 36 | return -np.sum(y*np.log(y_hat) + (1-y)*np.log(1-y_hat)) / len(y) \ 37 | + (lam/(2.0*len(y)))*np.sum(np.square(theta[1:])) 38 | # 不对截距做限制 39 | except: 40 | return float('inf') 41 | 42 | def dJ(theta, X_b, y,lam=1): 43 | # 损失函数求导 44 | y_hat = sigmoid(X_b.dot(theta)) 45 | return X_b.T.dot(y_hat - y) / len(y) + (lam/len(y))*np.r_[[0],theta[1:]] 46 | 47 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8): 48 | 49 | theta = initial_theta 50 | cur_iter = 0 51 | print('X_b.dot(theta)=',(X_b.dot(theta)).shape) 52 | print('(X_b.dot(theta) - y).shape=',(X_b.dot(theta) - y).shape) 53 | print('X_b.T.dot(X_b.dot(theta) - y).shape=',X_b.T.dot(X_b.dot(theta) - y).shape) 54 | 55 | 56 | while cur_iter < n_iters: 57 | gradient = dJ(theta, X_b, y) 58 | last_theta = theta 59 | theta = theta - eta * gradient 60 | if (abs(costfunc(theta, X_b, y) - costfunc(last_theta, X_b, y)) < epsilon): 61 | break 62 | 63 | cur_iter += 1 64 | 65 | return theta 66 | 67 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 68 | print('X_b.shape=',X_b.shape) 69 | print('y_train.shape=',y_train.shape) 70 | initial_theta = np.zeros(X_b.shape[1]) #初始化theta 71 | print('theta.shape=',initial_theta.shape) 72 | theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters) 73 | 74 | intercept_ = theta[0] 75 | coef_ = theta[1:] 76 | 77 | return theta 78 | 79 | def predict(X_predict,theta): 80 | 81 | X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict]) 82 | proba = sigmoid(X_b.dot(theta)) 83 | return np.array(proba >= 0.5, dtype='int') 84 | 85 | def test(): 86 | X, y = create_data() 87 | 88 | weight = fit(X,y) 89 | 90 | x_ponits = np.arange(4, 8) 91 | y_ = -(weight[1]*x_ponits + weight[0])/weight[2] 92 | 93 | 94 | print(weight) 95 | 96 | clf = LogisticRegression( 97 | # max_iter=200, 98 | C=1) 99 | clf.fit(X, y) 100 | 
print(clf.intercept_,clf.coef_) 101 | 102 | y_2 = -(clf.coef_[0][0]*x_ponits + clf.intercept_[0])/clf.coef_[0][1] 103 | 104 | theta2 = np.array([clf.intercept_[0],clf.coef_[0][0],clf.coef_[0][1]]) 105 | 106 | plt.plot(x_ponits, y_,label='ys-lr') 107 | plt.plot(x_ponits, y_2,label='sklearn') 108 | plt.scatter(X[y==0,0], X[y==0,1]) 109 | plt.scatter(X[y==1,0], X[y==1,1]) 110 | plt.legend() 111 | plt.show() 112 | 113 | def testsklearn(): 114 | clf = LogisticRegression() 115 | X, y = create_data() 116 | clf.fit(X, y) 117 | print(clf.intercept_,clf.coef_) 118 | 119 | if __name__ == '__main__': 120 | test() 121 | # testsklearn() 122 | 123 | -------------------------------------------------------------------------------- /2逻辑回归/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | 9 | # data 10 | def create_data(): 11 | iris = load_iris() 12 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 13 | df['label'] = iris.target 14 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 15 | data = np.array(df.iloc[:100, [0,1,-1]]) 16 | # print(data) 17 | return data[:,:2], data[:,-1] 18 | 19 | 20 | #定义sigmoid函数 21 | def sigmoid(z): 22 | return(1 / (1 + np.exp(-z))) 23 | 24 | def fit(X_train, y_train, eta=0.01, n_iters=1e4): 25 | """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型""" 26 | assert X_train.shape[0] == y_train.shape[0], \ 27 | "the size of X_train must be equal to the size of y_train" 28 | 29 | 30 | def costfunc(theta, X_b, y): 31 | # 计算损失函数 32 | y_hat = sigmoid(X_b.dot(theta)) 33 | try: 34 | return -np.sum(y*np.log(y_hat) + (1-y)*np.log(1-y_hat)) / len(y) 35 | except: 36 | return float('inf') 37 | 38 | def dJ(theta, X_b, y): 39 | # 损失函数求导 40 | y_hat = sigmoid(X_b.dot(theta)) 41 | return X_b.T.dot(y_hat - y) / len(y) 42 | 43 | def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8): 44 | 45 | theta = initial_theta 46 | cur_iter = 0 47 | # print('X_b.dot(theta)=',(X_b.dot(theta)).shape) 48 | # print('(X_b.dot(theta) - y).shape=',(X_b.dot(theta) - y).shape) 49 | # print('X_b.T.dot(X_b.dot(theta) - y).shape=',X_b.T.dot(X_b.dot(theta) - y).shape) 50 | 51 | 52 | while cur_iter < n_iters: 53 | gradient = dJ(theta, X_b, y) 54 | last_theta = theta 55 | # print(gradient.shape) 56 | theta = theta - eta * gradient 57 | if (abs(costfunc(theta, X_b, y) - costfunc(last_theta, X_b, y)) < epsilon): 58 | break 59 | 60 | cur_iter += 1 61 | 62 | return theta 63 | 64 | X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) 65 | print('X_b.shape=',X_b.shape) 66 | print('y_train.shape=',y_train.shape) 67 | initial_theta = np.zeros(X_b.shape[1]) #初始化theta 68 | print('theta.shape=',initial_theta.shape) 69 | theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters) 70 | 71 | intercept_ = theta[0] 72 | coef_ = theta[1:] 73 | 74 | return theta 75 | 76 | 77 | def test(): 78 | X, y = create_data() 79 | 80 | 81 | weight = fit(X,y) 82 | 83 | x_ponits = np.arange(4, 8) 84 | y_ = -(weight[1]*x_ponits + weight[0])/weight[2] 85 | 86 | 87 | print(weight) 88 | 89 | clf = LogisticRegression(max_iter=200) 90 | clf.fit(X, y) 91 | print(clf.intercept_,clf.coef_) 92 | 93 | y_2 = -(clf.coef_[0][0]*x_ponits + clf.intercept_[0])/clf.coef_[0][1] 94 | plt.plot(x_ponits, 
y_) 95 | plt.plot(x_ponits, y_2) 96 | #lr_clf.show_graph() 97 | plt.scatter(X[:50,0],X[:50,1], label='0') 98 | plt.scatter(X[50:,0],X[50:,1], label='1') 99 | plt.legend() 100 | plt.show() 101 | 102 | def testsklearn(): 103 | clf = LogisticRegression() 104 | X, y = create_data() 105 | clf.fit(X, y) 106 | print(clf.intercept_,clf.coef_) 107 | 108 | if __name__ == '__main__': 109 | test() 110 | # testsklearn() 111 | 112 | -------------------------------------------------------------------------------- /2逻辑回归/data2.txt: -------------------------------------------------------------------------------- 1 | 0.051267,0.69956,1 2 | -0.092742,0.68494,1 3 | -0.21371,0.69225,1 4 | -0.375,0.50219,1 5 | -0.51325,0.46564,1 6 | -0.52477,0.2098,1 7 | -0.39804,0.034357,1 8 | -0.30588,-0.19225,1 9 | 0.016705,-0.40424,1 10 | 0.13191,-0.51389,1 11 | 0.38537,-0.56506,1 12 | 0.52938,-0.5212,1 13 | 0.63882,-0.24342,1 14 | 0.73675,-0.18494,1 15 | 0.54666,0.48757,1 16 | 0.322,0.5826,1 17 | 0.16647,0.53874,1 18 | -0.046659,0.81652,1 19 | -0.17339,0.69956,1 20 | -0.47869,0.63377,1 21 | -0.60541,0.59722,1 22 | -0.62846,0.33406,1 23 | -0.59389,0.005117,1 24 | -0.42108,-0.27266,1 25 | -0.11578,-0.39693,1 26 | 0.20104,-0.60161,1 27 | 0.46601,-0.53582,1 28 | 0.67339,-0.53582,1 29 | -0.13882,0.54605,1 30 | -0.29435,0.77997,1 31 | -0.26555,0.96272,1 32 | -0.16187,0.8019,1 33 | -0.17339,0.64839,1 34 | -0.28283,0.47295,1 35 | -0.36348,0.31213,1 36 | -0.30012,0.027047,1 37 | -0.23675,-0.21418,1 38 | -0.06394,-0.18494,1 39 | 0.062788,-0.16301,1 40 | 0.22984,-0.41155,1 41 | 0.2932,-0.2288,1 42 | 0.48329,-0.18494,1 43 | 0.64459,-0.14108,1 44 | 0.46025,0.012427,1 45 | 0.6273,0.15863,1 46 | 0.57546,0.26827,1 47 | 0.72523,0.44371,1 48 | 0.22408,0.52412,1 49 | 0.44297,0.67032,1 50 | 0.322,0.69225,1 51 | 0.13767,0.57529,1 52 | -0.0063364,0.39985,1 53 | -0.092742,0.55336,1 54 | -0.20795,0.35599,1 55 | -0.20795,0.17325,1 56 | -0.43836,0.21711,1 57 | -0.21947,-0.016813,1 58 | -0.13882,-0.27266,1 59 | 0.18376,0.93348,0 60 | 0.22408,0.77997,0 61 | 0.29896,0.61915,0 62 | 0.50634,0.75804,0 63 | 0.61578,0.7288,0 64 | 0.60426,0.59722,0 65 | 0.76555,0.50219,0 66 | 0.92684,0.3633,0 67 | 0.82316,0.27558,0 68 | 0.96141,0.085526,0 69 | 0.93836,0.012427,0 70 | 0.86348,-0.082602,0 71 | 0.89804,-0.20687,0 72 | 0.85196,-0.36769,0 73 | 0.82892,-0.5212,0 74 | 0.79435,-0.55775,0 75 | 0.59274,-0.7405,0 76 | 0.51786,-0.5943,0 77 | 0.46601,-0.41886,0 78 | 0.35081,-0.57968,0 79 | 0.28744,-0.76974,0 80 | 0.085829,-0.75512,0 81 | 0.14919,-0.57968,0 82 | -0.13306,-0.4481,0 83 | -0.40956,-0.41155,0 84 | -0.39228,-0.25804,0 85 | -0.74366,-0.25804,0 86 | -0.69758,0.041667,0 87 | -0.75518,0.2902,0 88 | -0.69758,0.68494,0 89 | -0.4038,0.70687,0 90 | -0.38076,0.91886,0 91 | -0.50749,0.90424,0 92 | -0.54781,0.70687,0 93 | 0.10311,0.77997,0 94 | 0.057028,0.91886,0 95 | -0.10426,0.99196,0 96 | -0.081221,1.1089,0 97 | 0.28744,1.087,0 98 | 0.39689,0.82383,0 99 | 0.63882,0.88962,0 100 | 0.82316,0.66301,0 101 | 0.67339,0.64108,0 102 | 1.0709,0.10015,0 103 | -0.046659,-0.57968,0 104 | -0.23675,-0.63816,0 105 | -0.15035,-0.36769,0 106 | -0.49021,-0.3019,0 107 | -0.46717,-0.13377,0 108 | -0.28859,-0.060673,0 109 | -0.61118,-0.067982,0 110 | -0.66302,-0.21418,0 111 | -0.59965,-0.41886,0 112 | -0.72638,-0.082602,0 113 | -0.83007,0.31213,0 114 | -0.72062,0.53874,0 115 | -0.59389,0.49488,0 116 | -0.48445,0.99927,0 117 | -0.0063364,0.99927,0 118 | 0.63265,-0.030612,0 119 | -------------------------------------------------------------------------------- /2逻辑回归/lg.py: 
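Both hand-rolled scripts above recover the decision boundary directly from the learned weights: the boundary is where sigmoid(theta0 + theta1*x1 + theta2*x2) = 0.5, i.e. theta0 + theta1*x1 + theta2*x2 = 0, which rearranges to the line plotted in test(), x2 = -(theta1*x1 + theta0)/theta2. A minimal sketch of that rearrangement (the weight values are hypothetical, for illustration only):

import numpy as np

def boundary_x2(theta, x1):
    # solve theta[0] + theta[1]*x1 + theta[2]*x2 = 0 for x2 (assumes theta[2] != 0)
    return -(theta[1] * x1 + theta[0]) / theta[2]

theta = np.array([-25.0, 4.0, 2.0])           # hypothetical weights
print(boundary_x2(theta, np.arange(4, 8)))    # boundary x2 values at x1 = 4, 5, 6, 7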
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets.samples_generator import make_classification 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | def initialize_params(dims): 7 | w = np.zeros((dims, 1)) 8 | b = 0 9 | return w, b 10 | 11 | def sigmoid(x): 12 | z = 1 / (1 + np.exp(-x)) 13 | return z 14 | 15 | def logistic(X, y, w, b): 16 | num_train = X.shape[0] 17 | y_hat = sigmoid(np.dot(X, w) + b) 18 | loss = -1 / num_train * np.sum(y * np.log(y_hat) + (1-y) * np.log(1-y_hat)) 19 | cost = -1 / num_train * np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) 20 | dw = np.dot(X.T, (y_hat - y)) / num_train 21 | db = np.sum(y_hat - y) / num_train 22 | return y_hat, cost, dw, db 23 | 24 | def linear_train(X, y, learning_rate, epochs): 25 | # 参数初始化 26 | w, b = initialize_params(X.shape[1]) 27 | 28 | loss_list = [] 29 | for i in range(epochs): 30 | # 计算当前的预测值、损失和梯度 31 | y_hat, loss, dw, db = logistic(X, y, w, b) 32 | loss_list.append(loss) 33 | 34 | # 基于梯度下降的参数更新 35 | w += -learning_rate * dw 36 | b += -learning_rate * db 37 | 38 | # 打印迭代次数和损失 39 | if i % 10000 == 0: 40 | print("epoch %d loss %f" % (i, loss)) 41 | 42 | # 保存参数 43 | params = { 44 | 'w': w, 45 | 'b': b 46 | } 47 | 48 | # 保存梯度 49 | grads = { 50 | 'dw': dw, 51 | 'db': db 52 | } 53 | 54 | return loss_list, loss, params, grads 55 | 56 | def predict(X, params): 57 | w = params['w'] 58 | b = params['b'] 59 | y_pred = sigmoid(np.dot(X, w) + b) 60 | return y_pred 61 | 62 | 63 | if __name__ == "__main__": 64 | # 生成数据 65 | X, labels = make_classification(n_samples=100, 66 | n_features=2, 67 | n_informative=2, 68 | n_redundant=0, 69 | random_state=1, 70 | n_clusters_per_class=2) 71 | print(X.shape) 72 | print(labels.shape) 73 | 74 | X = np.array([[3, 3, 3], 75 | [4, 3, 2], 76 | [2, 1, 2], 77 | [1, 1, 1], 78 | [-1, 0, 1], 79 | [2, -2, 1]]) 80 | labels = np.array([1, 1, 1, 0, 0, 0]) 81 | 82 | print(X.shape) 83 | print(labels.shape) 84 | 85 | 86 | # 生成伪随机数 87 | rng = np.random.RandomState(2) 88 | # X += 2 * rng.uniform(size=X.shape) 89 | 90 | # 划分训练集和测试集 91 | offset = int(X.shape[0] * 0.9) 92 | X_train, y_train = X[:offset], labels[:offset] 93 | X_test, y_test = X[offset:], labels[offset:] 94 | y_train = y_train.reshape((-1, 1)) 95 | y_test = y_test.reshape((-1, 1)) 96 | # 一般情况y的shape为[samples,] 97 | print('X_train=', X_train.shape) 98 | print('y_train=', y_train.shape) 99 | print('X_test=', X_test.shape) 100 | print('y_test=', y_test.shape) 101 | 102 | # 训练 103 | loss_list, loss, params, grads = linear_train(X_train, y_train, 0.01, 100000) 104 | print(params) 105 | 106 | clf = LogisticRegression(max_iter=20000) 107 | clf.fit(X_train, y_train) 108 | print(clf.intercept_,clf.coef_) 109 | 110 | # 预测 111 | y_pred = predict(X_train, params) 112 | print(y_pred[:10]) -------------------------------------------------------------------------------- /3SVM/SVM/svm-digits.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import random 5 | 6 | """ 7 | SVM 8 | """ 9 | 10 | class optStruct: 11 | """ 12 | 数据结构,维护所有需要操作的值 13 | Parameters: 14 | dataMatIn - 数据矩阵 15 | classLabels - 数据标签 16 | C - 松弛变量 17 | toler - 容错率 18 | kTup - 包含核函数信息的元组,第一个参数存放核函数类别,第二个参数存放必要的核函数需要用到的参数 19 | """ 20 | def __init__(self, dataMatIn, classLabels, C, toler, kTup): 21 | self.X = dataMatIn #数据矩阵 22 | self.labelMat = classLabels #数据标签 
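# Note on the fields initialized below: eCache is an m x 2 cache where column 0 flags
# whether the stored error is valid and column 1 holds E_k = f(x_k) - y_k, so selectJ can
# pick the j that maximizes |Ei - Ek| without recomputing every error; K is the m x m Gram
# matrix filled once by kernelTrans, letting calcEk read a whole column via oS.K[:, k].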
23 | self.C = C #松弛变量 24 | self.tol = toler #容错率 25 | self.m = np.shape(dataMatIn)[0] #数据矩阵行数 26 | self.alphas = np.mat(np.zeros((self.m,1))) #根据矩阵行数初始化alpha参数为0 27 | self.b = 0 #初始化b参数为0 28 | self.eCache = np.mat(np.zeros((self.m,2))) #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。 29 | self.K = np.mat(np.zeros((self.m,self.m))) #初始化核K 30 | for i in range(self.m): #计算所有数据的核K 31 | self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup) 32 | 33 | def kernelTrans(X, A, kTup): 34 | """ 35 | 通过核函数将数据转换更高维的空间 36 | Parameters: 37 | X - 数据矩阵 38 | A - 单个数据的向量 39 | kTup - 包含核函数信息的元组 40 | Returns: 41 | K - 计算的核K 42 | """ 43 | m,n = np.shape(X) 44 | K = np.mat(np.zeros((m,1))) 45 | if kTup[0] == 'lin': K = X * A.T #线性核函数,只进行内积。 46 | elif kTup[0] == 'rbf': #高斯核函数,根据高斯核函数公式进行计算 47 | for j in range(m): 48 | deltaRow = X[j,:] - A 49 | K[j] = deltaRow*deltaRow.T 50 | K = np.exp(K/(-1*kTup[1]**2)) #计算高斯核K 51 | else: raise NameError('核函数无法识别') 52 | return K #返回计算的核K 53 | 54 | def loadDataSet(fileName): 55 | """ 56 | 读取数据 57 | Parameters: 58 | fileName - 文件名 59 | Returns: 60 | dataMat - 数据矩阵 61 | labelMat - 数据标签 62 | """ 63 | dataMat = []; labelMat = [] 64 | fr = open(fileName) 65 | for line in fr.readlines(): #逐行读取,滤除空格等 66 | lineArr = line.strip().split('\t') 67 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) #添加数据 68 | labelMat.append(float(lineArr[2])) #添加标签 69 | return dataMat,labelMat 70 | 71 | def calcEk(oS, k): 72 | """ 73 | 计算误差 74 | Parameters: 75 | oS - 数据结构 76 | k - 标号为k的数据 77 | Returns: 78 | Ek - 标号为k的数据误差 79 | """ 80 | fXk = float(np.multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b) 81 | Ek = fXk - float(oS.labelMat[k]) 82 | return Ek 83 | 84 | def selectJrand(i, m): 85 | """ 86 | 函数说明:随机选择alpha_j的索引值 87 | 88 | Parameters: 89 | i - alpha_i的索引值 90 | m - alpha参数个数 91 | Returns: 92 | j - alpha_j的索引值 93 | """ 94 | j = i #选择一个不等于i的j 95 | while (j == i): 96 | j = int(random.uniform(0, m)) 97 | return j 98 | 99 | def selectJ(i, oS, Ei): 100 | """ 101 | 内循环启发方式2 102 | Parameters: 103 | i - 标号为i的数据的索引值 104 | oS - 数据结构 105 | Ei - 标号为i的数据误差 106 | Returns: 107 | j, maxK - 标号为j或maxK的数据的索引值 108 | Ej - 标号为j的数据误差 109 | """ 110 | maxK = -1; maxDeltaE = 0; Ej = 0 #初始化 111 | oS.eCache[i] = [1,Ei] #根据Ei更新误差缓存 112 | validEcacheList = np.nonzero(oS.eCache[:,0].A)[0] #返回误差不为0的数据的索引值 113 | if (len(validEcacheList)) > 1: #有不为0的误差 114 | for k in validEcacheList: #遍历,找到最大的Ek 115 | if k == i: continue #不计算i,浪费时间 116 | Ek = calcEk(oS, k) #计算Ek 117 | deltaE = abs(Ei - Ek) #计算|Ei-Ek| 118 | if (deltaE > maxDeltaE): #找到maxDeltaE 119 | maxK = k; maxDeltaE = deltaE; Ej = Ek 120 | return maxK, Ej #返回maxK,Ej 121 | else: #没有不为0的误差 122 | j = selectJrand(i, oS.m) #随机选择alpha_j的索引值 123 | Ej = calcEk(oS, j) #计算Ej 124 | return j, Ej #j,Ej 125 | 126 | def updateEk(oS, k): 127 | """ 128 | 计算Ek,并更新误差缓存 129 | Parameters: 130 | oS - 数据结构 131 | k - 标号为k的数据的索引值 132 | Returns: 133 | 无 134 | """ 135 | Ek = calcEk(oS, k) #计算Ek 136 | oS.eCache[k] = [1,Ek] #更新误差缓存 137 | 138 | 139 | def clipAlpha(aj,H,L): 140 | """ 141 | 修剪alpha_j 142 | Parameters: 143 | aj - alpha_j的值 144 | H - alpha上限 145 | L - alpha下限 146 | Returns: 147 | aj - 修剪后的alpah_j的值 148 | """ 149 | if aj > H: 150 | aj = H 151 | if L > aj: 152 | aj = L 153 | return aj 154 | 155 | def innerL(i, oS): 156 | """ 157 | 优化的SMO算法 158 | Parameters: 159 | i - 标号为i的数据的索引值 160 | oS - 数据结构 161 | Returns: 162 | 1 - 有任意一对alpha值发生变化 163 | 0 - 没有任意一对alpha值发生变化或变化太小 164 | """ 165 | #步骤1:计算误差Ei 166 | Ei = calcEk(oS, i) 167 | #优化alpha,设定一定的容错率。 168 | if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < 
oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)): 169 | #使用内循环启发方式2选择alpha_j,并计算Ej 170 | j,Ej = selectJ(i, oS, Ei) 171 | #保存更新前的aplpha值,使用深拷贝 172 | alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy(); 173 | #步骤2:计算上下界L和H 174 | if (oS.labelMat[i] != oS.labelMat[j]): 175 | L = max(0, oS.alphas[j] - oS.alphas[i]) 176 | H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 177 | else: 178 | L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) 179 | H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 180 | if L == H: 181 | print("L==H") 182 | return 0 183 | #步骤3:计算eta 184 | eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j] 185 | if eta >= 0: 186 | print("eta>=0") 187 | return 0 188 | #步骤4:更新alpha_j 189 | oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta 190 | #步骤5:修剪alpha_j 191 | oS.alphas[j] = clipAlpha(oS.alphas[j],H,L) 192 | #更新Ej至误差缓存 193 | updateEk(oS, j) 194 | if (abs(oS.alphas[j] - alphaJold) < 0.00001): 195 | print("alpha_j变化太小") 196 | return 0 197 | #步骤6:更新alpha_i 198 | oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j]) 199 | #更新Ei至误差缓存 200 | updateEk(oS, i) 201 | #步骤7:更新b_1和b_2 202 | b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j] 203 | b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j] 204 | #步骤8:根据b_1和b_2更新b 205 | if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1 206 | elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2 207 | else: oS.b = (b1 + b2)/2.0 208 | return 1 209 | else: 210 | return 0 211 | 212 | def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin',0)): 213 | """ 214 | 完整的线性SMO算法 215 | Parameters: 216 | dataMatIn - 数据矩阵 217 | classLabels - 数据标签 218 | C - 松弛变量 219 | toler - 容错率 220 | maxIter - 最大迭代次数 221 | kTup - 包含核函数信息的元组 222 | Returns: 223 | oS.b - SMO算法计算的b 224 | oS.alphas - SMO算法计算的alphas 225 | """ 226 | oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup) #初始化数据结构 227 | iter = 0 #初始化当前迭代次数 228 | entireSet = True; alphaPairsChanged = 0 229 | while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)): #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环 230 | alphaPairsChanged = 0 231 | if entireSet: #遍历整个数据集 232 | for i in range(oS.m): 233 | alphaPairsChanged += innerL(i,oS) #使用优化的SMO算法 234 | print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 235 | iter += 1 236 | else: #遍历非边界值 237 | nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] #遍历不在边界0和C的alpha 238 | for i in nonBoundIs: 239 | alphaPairsChanged += innerL(i,oS) 240 | print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 241 | iter += 1 242 | if entireSet: #遍历一次后改为非边界遍历 243 | entireSet = False 244 | elif (alphaPairsChanged == 0): #如果alpha没有更新,计算全样本遍历 245 | entireSet = True 246 | print("迭代次数: %d" % iter) 247 | return oS.b,oS.alphas #返回SMO算法计算的b和alphas 248 | 249 | 250 | def img2vector(filename): 251 | """ 252 | 将32x32的二进制图像转换为1x1024向量。 253 | Parameters: 254 | filename - 文件名 255 | Returns: 256 | returnVect - 返回的二进制图像的1x1024向量 257 | """ 258 | returnVect = np.zeros((1,1024)) 259 | fr = open(filename) 260 | for i in range(32): 261 | lineStr = fr.readline() 262 | for j in range(32): 263 | returnVect[0,32*i+j] = int(lineStr[j]) 264 | return returnVect 265 | 266 | def loadImages(dirName): 267 | """ 268 | 加载图片 269 | Parameters: 270 | dirName - 文件夹的名字 271 | Returns: 272 | trainingMat - 数据矩阵 273 | hwLabels - 数据标签 274 | """ 275 | from os import listdir 276 | hwLabels 
= [] 277 | trainingFileList = listdir(dirName) 278 | m = len(trainingFileList) 279 | trainingMat = np.zeros((m,1024)) 280 | for i in range(m): 281 | fileNameStr = trainingFileList[i] 282 | fileStr = fileNameStr.split('.')[0] 283 | classNumStr = int(fileStr.split('_')[0]) 284 | if classNumStr == 9: hwLabels.append(-1) 285 | else: hwLabels.append(1) 286 | trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr)) 287 | return trainingMat, hwLabels 288 | 289 | def testDigits(kTup=('rbf', 10)): 290 | """ 291 | 测试函数 292 | Parameters: 293 | kTup - 包含核函数信息的元组 294 | Returns: 295 | 无 296 | """ 297 | dataArr,labelArr = loadImages('trainingDigits') 298 | b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10, kTup) 299 | datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose() 300 | svInd = np.nonzero(alphas.A>0)[0] 301 | sVs=datMat[svInd] 302 | labelSV = labelMat[svInd]; 303 | print("支持向量个数:%d" % np.shape(sVs)[0]) 304 | m,n = np.shape(datMat) 305 | errorCount = 0 306 | for i in range(m): 307 | kernelEval = kernelTrans(sVs,datMat[i,:],kTup) 308 | predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b 309 | if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 310 | print("训练集错误率: %.2f%%" % (float(errorCount)/m)) 311 | dataArr,labelArr = loadImages('testDigits') 312 | errorCount = 0 313 | datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose() 314 | m,n = np.shape(datMat) 315 | for i in range(m): 316 | kernelEval = kernelTrans(sVs,datMat[i,:],kTup) 317 | predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b 318 | if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 319 | print("测试集错误率: %.2f%%" % (float(errorCount)/m)) 320 | 321 | if __name__ == '__main__': 322 | testDigits() -------------------------------------------------------------------------------- /3SVM/SVM/svm-simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import random 5 | 6 | """ 7 | 函数说明:读取数据 8 | 9 | Parameters: 10 | fileName - 文件名 11 | Returns: 12 | dataMat - 数据矩阵 13 | labelMat - 数据标签 14 | Author: 15 | Jack Cui 16 | Blog: 17 | http://blog.csdn.net/c406495762 18 | Zhihu: 19 | https://www.zhihu.com/people/Jack--Cui/ 20 | Modify: 21 | 2017-09-21 22 | """ 23 | def loadDataSet(fileName): 24 | dataMat = []; labelMat = [] 25 | fr = open(fileName) 26 | for line in fr.readlines(): #逐行读取,滤除空格等 27 | lineArr = line.strip().split('\t') 28 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) #添加数据 29 | labelMat.append(float(lineArr[2])) #添加标签 30 | return dataMat,labelMat 31 | 32 | 33 | """ 34 | 函数说明:随机选择alpha 35 | 36 | Parameters: 37 | i - alpha_i的索引值 38 | m - alpha参数个数 39 | Returns: 40 | j - alpha_j的索引值 41 | """ 42 | def selectJrand(i, m): 43 | j = i #选择一个不等于i的j 44 | while (j == i): 45 | j = int(random.uniform(0, m)) 46 | return j 47 | 48 | """ 49 | 函数说明:修剪alpha 50 | 51 | Parameters: 52 | aj - alpha_j值 53 | H - alpha上限 54 | L - alpha下限 55 | Returns: 56 | aj - alpah值 57 | """ 58 | def clipAlpha(aj,H,L): 59 | if aj > H: 60 | aj = H 61 | if L > aj: 62 | aj = L 63 | return aj 64 | 65 | """ 66 | 函数说明:数据可视化 67 | 68 | Parameters: 69 | dataMat - 数据矩阵 70 | labelMat - 数据标签 71 | Returns: 72 | 无 73 | Author: 74 | Jack Cui 75 | Blog: 76 | http://blog.csdn.net/c406495762 77 | Zhihu: 78 | https://www.zhihu.com/people/Jack--Cui/ 79 | Modify: 80 | 2017-09-21 81 | """ 82 | def showDataSet(dataMat, labelMat): 83 | data_plus = [] #正样本 84 | data_minus = [] #负样本 85 | for i in 
range(len(dataMat)): 86 | if labelMat[i] > 0: 87 | data_plus.append(dataMat[i]) 88 | else: 89 | data_minus.append(dataMat[i]) 90 | data_plus_np = np.array(data_plus) #转换为numpy矩阵 91 | data_minus_np = np.array(data_minus) #转换为numpy矩阵 92 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图 93 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图 94 | plt.show() 95 | 96 | 97 | """ 98 | 函数说明:简化版SMO算法 99 | 100 | Parameters: 101 | dataMatIn - 数据矩阵 102 | classLabels - 数据标签 103 | C - 松弛变量 104 | toler - 容错率 105 | maxIter - 最大迭代次数 106 | Returns: 107 | 无 108 | Author: 109 | Jack Cui 110 | Blog: 111 | http://blog.csdn.net/c406495762 112 | Zhihu: 113 | https://www.zhihu.com/people/Jack--Cui/ 114 | Modify: 115 | 2017-09-23 116 | """ 117 | def smoSimple(dataMatIn, classLabels, C, toler, maxIter): 118 | #转换为numpy的mat存储 119 | dataMatrix = np.mat(dataMatIn); labelMat = np.mat(classLabels).transpose() 120 | #初始化b参数,统计dataMatrix的维度 121 | b = 0; m,n = np.shape(dataMatrix) 122 | #初始化alpha参数,设为0 123 | alphas = np.mat(np.zeros((m,1))) 124 | #初始化迭代次数 125 | iter_num = 0 126 | #最多迭代matIter次 127 | while (iter_num < maxIter): 128 | # 记录alpha是否已经进行优化,每次循环时设为0,然后再对整个集合顺序遍历 129 | alphaPairsChanged = 0 130 | for i in range(m): 131 | # print(i) 测试continue 每次continue后m会加1 132 | #步骤1:计算误差Ei 133 | fXi = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b 134 | Ei = fXi - float(labelMat[i]) 135 | 136 | #优化alpha,设定一定的容错率。 137 | # KKT条件的判断,找到违背KKT条件的点 138 | # 约束条件 (KKT条件是解决最优化问题的时用到的一种方法。我们这里提到的最优化问题通常是指对于给定的某一函数,求其在指定作用域上的全局最小值) 139 | # 0<=alphas[i]<=C,但由于0和C是边界值,我们无法进行优化,因为需要增加一个alphas和降低一个alphas。 140 | # 表示发生错误的概率:labelMat[i]*Ei 如果超出了 toler, 才需要优化。至于正负号,我们考虑绝对值就对了。 141 | # 检验训练样本(xi, yi)是否满足KKT条件 142 | # yi*f(i) >= 1 and alpha = 0 (outside the boundary) 143 | # yi*f(i) == 1 and 0 toler) and (alphas[i] > 0)): 147 | #随机选择另一个与alpha_i成对优化的alpha_j 148 | j = selectJrand(i,m) 149 | #步骤1:计算误差Ej 150 | fXj = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b 151 | Ej = fXj - float(labelMat[j]) 152 | #保存更新前的aplpha值,使用深拷贝 153 | alphaIold = alphas[i].copy() 154 | alphaJold = alphas[j].copy() 155 | #步骤2:计算上下界L和H 156 | # L和H用于将alphas[j]调整到0-C之间。如果L==H,就不做任何改变,直接执行continue语句 157 | # labelMat[i] != labelMat[j] 表示异侧,就相减,否则是同侧,就相加。 158 | if (labelMat[i] != labelMat[j]): 159 | L = max(0, alphas[j] - alphas[i]) 160 | H = min(C, C + alphas[j] - alphas[i]) 161 | else: 162 | L = max(0, alphas[j] + alphas[i] - C) 163 | H = min(C, alphas[j] + alphas[i]) 164 | if L==H: 165 | print("L==H") 166 | continue 167 | #步骤3:计算eta 与公式中的符号相反 168 | # eta是alphas[j]的最优修改量,如果eta==0,需要退出for循环的当前迭代过程 169 | eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T 170 | if eta >= 0: 171 | print("eta>=0") 172 | continue 173 | #步骤4:更新alpha_j 无约束条件下的解 174 | alphas[j] -= labelMat[j]*(Ei - Ej)/eta 175 | #步骤5:修剪alpha_j 176 | alphas[j] = clipAlpha(alphas[j],H,L) 177 | if (abs(alphas[j] - alphaJold) < 0.00001): 178 | print("alpha_j变化太小") 179 | continue 180 | #步骤6:更新alpha_i 181 | alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j]) 182 | #步骤7:更新b_1和b_2 183 | b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T 184 | b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T 185 | #步骤8:根据b_1和b_2更新b 186 | if (0 < alphas[i]) 
and (C > alphas[i]): b = b1 187 | elif (0 < alphas[j]) and (C > alphas[j]): b = b2 188 | else: b = (b1 + b2)/2.0 189 | #统计优化次数 190 | alphaPairsChanged += 1 191 | #打印统计信息 192 | print("第%d次迭代 样本:%d, alpha优化次数:%d" % (iter_num,i,alphaPairsChanged)) 193 | #更新迭代次数 194 | if (alphaPairsChanged == 0): 195 | iter_num += 1 196 | else: 197 | iter_num = 0 198 | print("迭代次数: %d" % iter_num) 199 | return b,alphas 200 | 201 | """ 202 | 函数说明:分类结果可视化 203 | 204 | Parameters: 205 | dataMat - 数据矩阵 206 | w - 直线法向量 207 | b - 直线解决 208 | Returns: 209 | 无 210 | """ 211 | def showClassifer(dataMat, w, b): 212 | #绘制样本点 213 | data_plus = [] #正样本 214 | data_minus = [] #负样本 215 | for i in range(len(dataMat)): 216 | if labelMat[i] > 0: 217 | data_plus.append(dataMat[i]) 218 | else: 219 | data_minus.append(dataMat[i]) 220 | data_plus_np = np.array(data_plus) #转换为numpy矩阵 221 | data_minus_np = np.array(data_minus) #转换为numpy矩阵 222 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1], s=30, alpha=0.7) #正样本散点图 223 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1], s=30, alpha=0.7) #负样本散点图 224 | #绘制直线 225 | x1 = max(dataMat)[0] 226 | x2 = min(dataMat)[0] 227 | a1, a2 = w 228 | b = float(b) 229 | a1 = float(a1[0]) 230 | a2 = float(a2[0]) 231 | y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2 232 | plt.plot([x1, x2], [y1, y2]) 233 | #找出支持向量点 234 | for i, alpha in enumerate(alphas): 235 | if abs(alpha) > 0: 236 | x, y = dataMat[i] 237 | plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red') 238 | plt.show() 239 | 240 | 241 | """ 242 | 函数说明:计算w 243 | 244 | Parameters: 245 | dataMat - 数据矩阵 246 | labelMat - 数据标签 247 | alphas - alphas值 248 | Returns: 249 | 无 250 | """ 251 | def get_w(dataMat, labelMat, alphas): 252 | alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat) 253 | w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas) 254 | return w.tolist() 255 | 256 | 257 | if __name__ == '__main__': 258 | dataMat, labelMat = loadDataSet('testSet.txt') 259 | b,alphas = smoSimple(dataMat, labelMat, 0.6, 0.001, 4) 260 | w = get_w(dataMat, labelMat, alphas) 261 | showClassifer(dataMat, w, b) 262 | -------------------------------------------------------------------------------- /3SVM/SVM/svm-smo.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import random 5 | 6 | """ 7 | 完整版SMO算法 8 | """ 9 | 10 | class optStruct: 11 | """ 12 | 数据结构,维护所有需要操作的值 13 | Parameters: 14 | dataMatIn - 数据矩阵 15 | classLabels - 数据标签 16 | C - 松弛变量 17 | toler - 容错率 18 | """ 19 | def __init__(self, dataMatIn, classLabels, C, toler): 20 | self.X = dataMatIn #数据矩阵 21 | self.labelMat = classLabels #数据标签 22 | self.C = C #松弛变量 23 | self.tol = toler #容错率 24 | self.m = np.shape(dataMatIn)[0] #数据矩阵行数 25 | self.alphas = np.mat(np.zeros((self.m,1))) #根据矩阵行数初始化alpha参数为0 26 | self.b = 0 #初始化b参数为0 27 | self.eCache = np.mat(np.zeros((self.m,2))) #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。 28 | 29 | def loadDataSet(fileName): 30 | """ 31 | 读取数据 32 | Parameters: 33 | fileName - 文件名 34 | Returns: 35 | dataMat - 数据矩阵 36 | labelMat - 数据标签 37 | """ 38 | dataMat = []; labelMat = [] 39 | fr = open(fileName) 40 | for line in fr.readlines(): #逐行读取,滤除空格等 41 | lineArr = line.strip().split('\t') 42 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) #添加数据 43 | labelMat.append(float(lineArr[2])) #添加标签 44 | return dataMat,labelMat 45 | 46 | def calcEk(oS, 
k): 47 | """ 48 | 计算误差 49 | Parameters: 50 | oS - 数据结构 51 | k - 标号为k的数据 52 | Returns: 53 | Ek - 标号为k的数据误差 54 | """ 55 | fXk = float(np.multiply(oS.alphas,oS.labelMat).T*(oS.X*oS.X[k,:].T) + oS.b) 56 | Ek = fXk - float(oS.labelMat[k]) 57 | return Ek 58 | 59 | def selectJrand(i, m): 60 | """ 61 | 函数说明:随机选择alpha_j的索引值 62 | 63 | Parameters: 64 | i - alpha_i的索引值 65 | m - alpha参数个数 66 | Returns: 67 | j - alpha_j的索引值 68 | """ 69 | j = i #选择一个不等于i的j 70 | while (j == i): 71 | j = int(random.uniform(0, m)) 72 | return j 73 | 74 | # 选择J 75 | def selectJ(i, oS, Ei): 76 | """ 77 | 内循环启发方式2 78 | Parameters: 79 | i - 标号为i的数据的索引值 80 | oS - 数据结构 81 | Ei - 标号为i的数据误差 82 | Returns: 83 | j, maxK - 标号为j或maxK的数据的索引值 84 | Ej - 标号为j的数据误差 85 | """ 86 | maxK = -1; maxDeltaE = 0; Ej = 0 #初始化 87 | oS.eCache[i] = [1,Ei] #根据Ei更新误差缓存 88 | validEcacheList = np.nonzero(oS.eCache[:,0].A)[0] #返回误差不为0的数据的索引值 89 | if (len(validEcacheList)) > 1: #有不为0的误差 90 | for k in validEcacheList: #遍历,找到最大的Ek 91 | if k == i: continue #不计算i,浪费时间 92 | Ek = calcEk(oS, k) #计算Ek 93 | deltaE = abs(Ei - Ek) #计算|Ei-Ek| 94 | if (deltaE > maxDeltaE): #找到maxDeltaE 95 | maxK = k; maxDeltaE = deltaE; Ej = Ek 96 | return maxK, Ej #返回maxK,Ej 97 | else: #没有不为0的误差 98 | j = selectJrand(i, oS.m) #随机选择alpha_j的索引值 99 | Ej = calcEk(oS, j) #计算Ej 100 | return j, Ej #j,Ej 101 | 102 | def updateEk(oS, k): 103 | """ 104 | 计算Ek,并更新误差缓存 105 | Parameters: 106 | oS - 数据结构 107 | k - 标号为k的数据的索引值 108 | Returns: 109 | 无 110 | """ 111 | Ek = calcEk(oS, k) #计算Ek 112 | oS.eCache[k] = [1,Ek] #更新误差缓存 113 | 114 | 115 | def clipAlpha(aj,H,L): 116 | """ 117 | 修剪alpha_j 118 | Parameters: 119 | aj - alpha_j的值 120 | H - alpha上限 121 | L - alpha下限 122 | Returns: 123 | aj - 修剪后的alpah_j的值 124 | """ 125 | if aj > H: 126 | aj = H 127 | if L > aj: 128 | aj = L 129 | return aj 130 | 131 | def innerL(i, oS): 132 | """ 133 | 优化的SMO算法 134 | Parameters: 135 | i - 标号为i的数据的索引值 136 | oS - 数据结构 137 | Returns: 138 | 1 - 有任意一对alpha值发生变化 139 | 0 - 没有任意一对alpha值发生变化或变化太小 140 | """ 141 | #步骤1:计算误差Ei 142 | Ei = calcEk(oS, i) 143 | #优化alpha,设定一定的容错率。 144 | if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)): 145 | #使用内循环启发方式2选择alpha_j,并计算Ej 146 | j,Ej = selectJ(i, oS, Ei) 147 | #保存更新前的aplpha值,使用深拷贝 148 | alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy(); 149 | #步骤2:计算上下界L和H 150 | if (oS.labelMat[i] != oS.labelMat[j]): 151 | L = max(0, oS.alphas[j] - oS.alphas[i]) 152 | H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 153 | else: 154 | L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) 155 | H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 156 | if L == H: 157 | print("L==H") 158 | return 0 159 | #步骤3:计算eta 160 | eta = 2.0 * oS.X[i,:] * oS.X[j,:].T - oS.X[i,:] * oS.X[i,:].T - oS.X[j,:] * oS.X[j,:].T 161 | if eta >= 0: 162 | print("eta>=0") 163 | return 0 164 | #步骤4:更新alpha_j 165 | oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta 166 | #步骤5:修剪alpha_j 167 | oS.alphas[j] = clipAlpha(oS.alphas[j],H,L) 168 | #更新Ej至误差缓存 169 | updateEk(oS, j) 170 | if (abs(oS.alphas[j] - alphaJold) < 0.00001): 171 | print("alpha_j变化太小") 172 | return 0 173 | #步骤6:更新alpha_i 174 | oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j]) 175 | #更新Ei至误差缓存 176 | updateEk(oS, i) 177 | #步骤7:更新b_1和b_2 178 | b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T 179 | b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - 
oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T 180 | #步骤8:根据b_1和b_2更新b 181 | if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1 182 | elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2 183 | else: oS.b = (b1 + b2)/2.0 184 | return 1 185 | else: 186 | return 0 187 | 188 | def smoP(dataMatIn, classLabels, C, toler, maxIter): 189 | """ 190 | 完整的线性SMO算法 191 | Parameters: 192 | dataMatIn - 数据矩阵 193 | classLabels - 数据标签 194 | C - 松弛变量 195 | toler - 容错率 196 | maxIter - 最大迭代次数 197 | Returns: 198 | oS.b - SMO算法计算的b 199 | oS.alphas - SMO算法计算的alphas 200 | """ 201 | oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler) #初始化数据结构 202 | iter = 0 #初始化当前迭代次数 203 | entireSet = True; alphaPairsChanged = 0 204 | while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)): #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环 205 | alphaPairsChanged = 0 206 | if entireSet: #遍历整个数据集 207 | for i in range(oS.m): 208 | alphaPairsChanged += innerL(i,oS) #使用优化的SMO算法 209 | print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 210 | iter += 1 211 | else: #遍历非边界值 212 | nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] #遍历不在边界0和C的alpha 213 | for i in nonBoundIs: 214 | alphaPairsChanged += innerL(i,oS) 215 | print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 216 | iter += 1 217 | if entireSet: #遍历一次后改为非边界遍历 218 | entireSet = False 219 | elif (alphaPairsChanged == 0): #如果alpha没有更新,计算全样本遍历 220 | entireSet = True 221 | print("迭代次数: %d" % iter) 222 | return oS.b,oS.alphas #返回SMO算法计算的b和alphas 223 | 224 | 225 | def showClassifer(dataMat, classLabels, w, b): 226 | """ 227 | 分类结果可视化 228 | Parameters: 229 | dataMat - 数据矩阵 230 | w - 直线法向量 231 | b - 直线解决 232 | Returns: 233 | 无 234 | """ 235 | #绘制样本点 236 | data_plus = [] #正样本 237 | data_minus = [] #负样本 238 | for i in range(len(dataMat)): 239 | if classLabels[i] > 0: 240 | data_plus.append(dataMat[i]) 241 | else: 242 | data_minus.append(dataMat[i]) 243 | data_plus_np = np.array(data_plus) #转换为numpy矩阵 244 | data_minus_np = np.array(data_minus) #转换为numpy矩阵 245 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1], s=30, alpha=0.7) #正样本散点图 246 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1], s=30, alpha=0.7) #负样本散点图 247 | #绘制直线 248 | x1 = max(dataMat)[0] 249 | x2 = min(dataMat)[0] 250 | a1, a2 = w 251 | b = float(b) 252 | a1 = float(a1[0]) 253 | a2 = float(a2[0]) 254 | y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2 255 | plt.plot([x1, x2], [y1, y2]) 256 | #找出支持向量点 257 | for i, alpha in enumerate(alphas): 258 | if alpha > 0: 259 | x, y = dataMat[i] 260 | plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red') 261 | plt.show() 262 | 263 | 264 | def calcWs(alphas,dataArr,classLabels): 265 | """ 266 | 计算w 267 | Parameters: 268 | dataArr - 数据矩阵 269 | classLabels - 数据标签 270 | alphas - alphas值 271 | Returns: 272 | w - 计算得到的w 273 | """ 274 | X = np.mat(dataArr); labelMat = np.mat(classLabels).transpose() 275 | m,n = np.shape(X) 276 | w = np.zeros((n,1)) 277 | for i in range(m): 278 | w += np.multiply(alphas[i]*labelMat[i],X[i,:].T) 279 | return w 280 | 281 | if __name__ == '__main__': 282 | dataArr, classLabels = loadDataSet('testSet.txt') 283 | b, alphas = smoP(dataArr, classLabels, 0.6, 0.001, 40) 284 | w = calcWs(alphas,dataArr, classLabels) 285 | showClassifer(dataArr, classLabels, w, b) 286 | -------------------------------------------------------------------------------- /3SVM/SVM/svm-svc.py: 
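svm-smo.py above stops at recovering w and b; it never shows how a new point would be classified. A minimal sketch of that last step (the predict helper is not part of the original script; the commented usage assumes loadDataSet, smoP and calcWs from svm-smo.py are available in the same session):

import numpy as np

def predict(X_new, w, b):
    # linear SVM decision function: sign(w^T x + b), giving labels in {-1, +1}
    scores = np.asarray(X_new, dtype=float) @ np.asarray(w, dtype=float).reshape(-1, 1) + float(b)
    return np.sign(scores).ravel()

# dataArr, classLabels = loadDataSet('testSet.txt')
# b, alphas = smoP(dataArr, classLabels, 0.6, 0.001, 40)
# w = calcWs(alphas, dataArr, classLabels)
# print(predict([[3.5, 2.0], [8.1, 0.6]], w, b))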
-------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import operator 4 | from os import listdir 5 | from sklearn.svm import SVC 6 | 7 | """ 8 | Author: 9 | Jack Cui 10 | Blog: 11 | http://blog.csdn.net/c406495762 12 | Zhihu: 13 | https://www.zhihu.com/people/Jack--Cui/ 14 | Modify: 15 | 2017-10-04 16 | """ 17 | 18 | def img2vector(filename): 19 | """ 20 | 将32x32的二进制图像转换为1x1024向量。 21 | Parameters: 22 | filename - 文件名 23 | Returns: 24 | returnVect - 返回的二进制图像的1x1024向量 25 | """ 26 | #创建1x1024零向量 27 | returnVect = np.zeros((1, 1024)) 28 | #打开文件 29 | fr = open(filename) 30 | #按行读取 31 | for i in range(32): 32 | #读一行数据 33 | lineStr = fr.readline() 34 | #每一行的前32个元素依次添加到returnVect中 35 | for j in range(32): 36 | returnVect[0, 32*i+j] = int(lineStr[j]) 37 | #返回转换后的1x1024向量 38 | return returnVect 39 | 40 | def handwritingClassTest(): 41 | """ 42 | 手写数字分类测试 43 | Parameters: 44 | 无 45 | Returns: 46 | 无 47 | """ 48 | #测试集的Labels 49 | hwLabels = [] 50 | #返回trainingDigits目录下的文件名 51 | trainingFileList = listdir('trainingDigits') 52 | #返回文件夹下文件的个数 53 | m = len(trainingFileList) 54 | #初始化训练的Mat矩阵,测试集 55 | trainingMat = np.zeros((m, 1024)) 56 | #从文件名中解析出训练集的类别 57 | for i in range(m): 58 | #获得文件的名字 59 | fileNameStr = trainingFileList[i] 60 | #获得分类的数字 61 | classNumber = int(fileNameStr.split('_')[0]) 62 | #将获得的类别添加到hwLabels中 63 | hwLabels.append(classNumber) 64 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 65 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 66 | clf = SVC(C=200,kernel='rbf') 67 | clf.fit(trainingMat,hwLabels) 68 | #返回testDigits目录下的文件列表 69 | testFileList = listdir('testDigits') 70 | #错误检测计数 71 | errorCount = 0.0 72 | #测试数据的数量 73 | mTest = len(testFileList) 74 | #从文件中解析出测试集的类别并进行分类测试 75 | for i in range(mTest): 76 | #获得文件的名字 77 | fileNameStr = testFileList[i] 78 | #获得分类的数字 79 | classNumber = int(fileNameStr.split('_')[0]) 80 | #获得测试集的1x1024向量,用于训练 81 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 82 | #获得预测结果 83 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 84 | classifierResult = clf.predict(vectorUnderTest) 85 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 86 | if(classifierResult != classNumber): 87 | errorCount += 1.0 88 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 89 | 90 | if __name__ == '__main__': 91 | handwritingClassTest() -------------------------------------------------------------------------------- /3SVM/SVM/svmMLiA.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import random 5 | 6 | """ 7 | 加上核函数的完整版SMO 8 | """ 9 | 10 | class optStruct: 11 | """ 12 | 数据结构,维护所有需要操作的值 13 | Parameters: 14 | dataMatIn - 数据矩阵 15 | classLabels - 数据标签 16 | C - 松弛变量 17 | toler - 容错率 18 | kTup - 包含核函数信息的元组,第一个参数存放核函数类别,第二个参数存放必要的核函数需要用到的参数 19 | """ 20 | def __init__(self, dataMatIn, classLabels, C, toler, kTup): 21 | self.X = dataMatIn #数据矩阵 22 | self.labelMat = classLabels #数据标签 23 | self.C = C #松弛变量 24 | self.tol = toler #容错率 25 | self.m = np.shape(dataMatIn)[0] #数据矩阵行数 26 | self.alphas = np.mat(np.zeros((self.m,1))) #根据矩阵行数初始化alpha参数为0 27 | self.b = 0 #初始化b参数为0 28 | self.eCache = np.mat(np.zeros((self.m,2))) #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。 29 | self.K = np.mat(np.zeros((self.m,self.m))) #初始化核K 30 | for i in range(self.m): #计算所有数据的核K 31 | self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup) 32 
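# Note: kernelTrans below evaluates the Gaussian kernel as exp(-||x_j - A||^2 / sigma^2),
# with sigma passed as kTup[1]. sklearn's SVC (used in svm-svc.py above) parameterizes the
# same kernel as exp(-gamma * ||x - y||^2), so matching the two presumably requires
# gamma = 1 / kTup[1]**2; for the default k1 = 1.3 in testRbf that is about 0.59.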
| 33 | def kernelTrans(X, A, kTup): 34 | """ 35 | 通过核函数将数据转换更高维的空间 36 | Parameters: 37 | X - 数据矩阵 38 | A - 单个数据的向量 39 | kTup - 包含核函数信息的元组 40 | Returns: 41 | K - 计算的核K 42 | """ 43 | m,n = np.shape(X) 44 | K = np.mat(np.zeros((m,1))) 45 | if kTup[0] == 'lin': K = X * A.T #线性核函数,只进行内积。 46 | elif kTup[0] == 'rbf': #高斯核函数,根据高斯核函数公式进行计算 47 | for j in range(m): 48 | deltaRow = X[j,:] - A 49 | K[j] = deltaRow*deltaRow.T 50 | K = np.exp(K/(-1*kTup[1]**2)) #计算高斯核K 51 | else: raise NameError('核函数无法识别') 52 | return K #返回计算的核K 53 | 54 | def loadDataSet(fileName): 55 | """ 56 | 读取数据 57 | Parameters: 58 | fileName - 文件名 59 | Returns: 60 | dataMat - 数据矩阵 61 | labelMat - 数据标签 62 | """ 63 | dataMat = []; labelMat = [] 64 | fr = open(fileName) 65 | for line in fr.readlines(): #逐行读取,滤除空格等 66 | lineArr = line.strip().split('\t') 67 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) #添加数据 68 | labelMat.append(float(lineArr[2])) #添加标签 69 | return dataMat,labelMat 70 | 71 | def calcEk(oS, k): 72 | """ 73 | 计算误差 74 | Parameters: 75 | oS - 数据结构 76 | k - 标号为k的数据 77 | Returns: 78 | Ek - 标号为k的数据误差 79 | """ 80 | fXk = float(np.multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b) 81 | Ek = fXk - float(oS.labelMat[k]) 82 | return Ek 83 | 84 | def selectJrand(i, m): 85 | """ 86 | 函数说明:随机选择alpha_j的索引值 87 | 88 | Parameters: 89 | i - alpha_i的索引值 90 | m - alpha参数个数 91 | Returns: 92 | j - alpha_j的索引值 93 | """ 94 | j = i #选择一个不等于i的j 95 | while (j == i): 96 | j = int(random.uniform(0, m)) 97 | return j 98 | 99 | def selectJ(i, oS, Ei): 100 | """ 101 | 内循环启发方式2 102 | Parameters: 103 | i - 标号为i的数据的索引值 104 | oS - 数据结构 105 | Ei - 标号为i的数据误差 106 | Returns: 107 | j, maxK - 标号为j或maxK的数据的索引值 108 | Ej - 标号为j的数据误差 109 | """ 110 | maxK = -1; maxDeltaE = 0; Ej = 0 #初始化 111 | oS.eCache[i] = [1,Ei] #根据Ei更新误差缓存 112 | validEcacheList = np.nonzero(oS.eCache[:,0].A)[0] #返回误差不为0的数据的索引值 113 | if (len(validEcacheList)) > 1: #有不为0的误差 114 | for k in validEcacheList: #遍历,找到最大的Ek 115 | if k == i: continue #不计算i,浪费时间 116 | Ek = calcEk(oS, k) #计算Ek 117 | deltaE = abs(Ei - Ek) #计算|Ei-Ek| 118 | if (deltaE > maxDeltaE): #找到maxDeltaE 119 | maxK = k; maxDeltaE = deltaE; Ej = Ek 120 | return maxK, Ej #返回maxK,Ej 121 | else: #没有不为0的误差 122 | j = selectJrand(i, oS.m) #随机选择alpha_j的索引值 123 | Ej = calcEk(oS, j) #计算Ej 124 | return j, Ej #j,Ej 125 | 126 | def updateEk(oS, k): 127 | """ 128 | 计算Ek,并更新误差缓存 129 | Parameters: 130 | oS - 数据结构 131 | k - 标号为k的数据的索引值 132 | Returns: 133 | 无 134 | """ 135 | Ek = calcEk(oS, k) #计算Ek 136 | oS.eCache[k] = [1,Ek] #更新误差缓存 137 | 138 | 139 | def clipAlpha(aj,H,L): 140 | """ 141 | 修剪alpha_j 142 | Parameters: 143 | aj - alpha_j的值 144 | H - alpha上限 145 | L - alpha下限 146 | Returns: 147 | aj - 修剪后的alpah_j的值 148 | """ 149 | if aj > H: 150 | aj = H 151 | if L > aj: 152 | aj = L 153 | return aj 154 | 155 | def innerL(i, oS): 156 | """ 157 | 优化的SMO算法 158 | Parameters: 159 | i - 标号为i的数据的索引值 160 | oS - 数据结构 161 | Returns: 162 | 1 - 有任意一对alpha值发生变化 163 | 0 - 没有任意一对alpha值发生变化或变化太小 164 | """ 165 | #步骤1:计算误差Ei 166 | Ei = calcEk(oS, i) 167 | #优化alpha,设定一定的容错率。 168 | if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)): 169 | #使用内循环启发方式2选择alpha_j,并计算Ej 170 | j,Ej = selectJ(i, oS, Ei) 171 | #保存更新前的aplpha值,使用深拷贝 172 | alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy(); 173 | #步骤2:计算上下界L和H 174 | if (oS.labelMat[i] != oS.labelMat[j]): 175 | L = max(0, oS.alphas[j] - oS.alphas[i]) 176 | H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 177 | else: 178 | L = max(0, 
oS.alphas[j] + oS.alphas[i] - oS.C) 179 | H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 180 | if L == H: 181 | print("L==H") 182 | return 0 183 | #步骤3:计算eta 184 | eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j] 185 | if eta >= 0: 186 | print("eta>=0") 187 | return 0 188 | #步骤4:更新alpha_j 189 | oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta 190 | #步骤5:修剪alpha_j 191 | oS.alphas[j] = clipAlpha(oS.alphas[j],H,L) 192 | #更新Ej至误差缓存 193 | updateEk(oS, j) 194 | if (abs(oS.alphas[j] - alphaJold) < 0.00001): 195 | print("alpha_j变化太小") 196 | return 0 197 | #步骤6:更新alpha_i 198 | oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j]) 199 | #更新Ei至误差缓存 200 | updateEk(oS, i) 201 | #步骤7:更新b_1和b_2 202 | b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j] 203 | b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j] 204 | #步骤8:根据b_1和b_2更新b 205 | if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1 206 | elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2 207 | else: oS.b = (b1 + b2)/2.0 208 | return 1 209 | else: 210 | return 0 211 | 212 | def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin',0)): 213 | """ 214 | 完整的线性SMO算法 215 | Parameters: 216 | dataMatIn - 数据矩阵 217 | classLabels - 数据标签 218 | C - 松弛变量 219 | toler - 容错率 220 | maxIter - 最大迭代次数 221 | kTup - 包含核函数信息的元组 222 | Returns: 223 | oS.b - SMO算法计算的b 224 | oS.alphas - SMO算法计算的alphas 225 | """ 226 | oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup) #初始化数据结构 227 | iter = 0 #初始化当前迭代次数 228 | entireSet = True; alphaPairsChanged = 0 229 | while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)): #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环 230 | alphaPairsChanged = 0 231 | if entireSet: #遍历整个数据集 232 | for i in range(oS.m): 233 | alphaPairsChanged += innerL(i,oS) #使用优化的SMO算法 234 | print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 235 | iter += 1 236 | else: #遍历非边界值 237 | nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] #遍历不在边界0和C的alpha 238 | for i in nonBoundIs: 239 | alphaPairsChanged += innerL(i,oS) 240 | print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) 241 | iter += 1 242 | if entireSet: #遍历一次后改为非边界遍历 243 | entireSet = False 244 | elif (alphaPairsChanged == 0): #如果alpha没有更新,计算全样本遍历 245 | entireSet = True 246 | print("迭代次数: %d" % iter) 247 | return oS.b,oS.alphas #返回SMO算法计算的b和alphas 248 | 249 | 250 | def testRbf(k1 = 1.3): 251 | """ 252 | 测试函数 253 | Parameters: 254 | k1 - 使用高斯核函数的时候表示到达率 255 | Returns: 256 | 无 257 | """ 258 | dataArr,labelArr = loadDataSet('testSetRBF.txt') #加载训练集 259 | b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 100, ('rbf', k1)) #根据训练集计算b和alphas 260 | datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose() 261 | svInd = np.nonzero(alphas.A > 0)[0] #获得支持向量 262 | sVs = datMat[svInd] 263 | labelSV = labelMat[svInd]; 264 | print("支持向量个数:%d" % np.shape(sVs)[0]) 265 | m,n = np.shape(datMat) 266 | errorCount = 0 267 | for i in range(m): 268 | kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1)) #计算各个点的核 269 | predict = kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b #根据支持向量的点,计算超平面,返回预测结果 270 | if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数 271 | print("训练集错误率: %.2f%%" % ((float(errorCount)/m)*100)) #打印错误率 272 | dataArr,labelArr = loadDataSet('testSetRBF2.txt') #加载测试集 273 | errorCount = 0 274 | datMat = np.mat(dataArr); labelMat 
= np.mat(labelArr).transpose() 275 | m,n = np.shape(datMat) 276 | for i in range(m): 277 | kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1)) #计算各个点的核 278 | predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b #根据支持向量的点,计算超平面,返回预测结果 279 | if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数 280 | print("测试集错误率: %.2f%%" % ((float(errorCount)/m)*100)) #打印错误率 281 | 282 | 283 | def showDataSet(dataMat, labelMat): 284 | """ 285 | 数据可视化 286 | Parameters: 287 | dataMat - 数据矩阵 288 | labelMat - 数据标签 289 | Returns: 290 | 无 291 | """ 292 | data_plus = [] #正样本 293 | data_minus = [] #负样本 294 | for i in range(len(dataMat)): 295 | if labelMat[i] > 0: 296 | data_plus.append(dataMat[i]) 297 | else: 298 | data_minus.append(dataMat[i]) 299 | data_plus_np = np.array(data_plus) #转换为numpy矩阵 300 | data_minus_np = np.array(data_minus) #转换为numpy矩阵 301 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图 302 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图 303 | plt.show() 304 | 305 | if __name__ == '__main__': 306 | testRbf() -------------------------------------------------------------------------------- /3SVM/SVM/testSet.txt: -------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 -0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 
87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /3SVM/SVM/testSetRBF.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 -1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 -0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | -0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 
-1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | -0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /3SVM/SVM/testSetRBF2.txt: -------------------------------------------------------------------------------- 1 | 0.676771 -0.486687 -1.000000 2 | 0.008473 0.186070 1.000000 3 | -0.727789 0.594062 -1.000000 4 | 0.112367 0.287852 1.000000 5 | 0.383633 -0.038068 1.000000 6 | -0.927138 -0.032633 -1.000000 7 | -0.842803 -0.423115 -1.000000 8 | -0.003677 -0.367338 1.000000 9 | 0.443211 -0.698469 -1.000000 10 | -0.473835 0.005233 1.000000 11 | 0.616741 0.590841 -1.000000 12 | 0.557463 -0.373461 -1.000000 13 | -0.498535 -0.223231 -1.000000 14 | -0.246744 0.276413 1.000000 15 | -0.761980 -0.244188 -1.000000 16 | 0.641594 -0.479861 -1.000000 17 | -0.659140 0.529830 -1.000000 18 | -0.054873 -0.238900 1.000000 19 | -0.089644 -0.244683 1.000000 20 | -0.431576 -0.481538 -1.000000 21 | -0.099535 0.728679 -1.000000 22 | -0.188428 0.156443 1.000000 23 | 0.267051 0.318101 1.000000 24 | 0.222114 -0.528887 -1.000000 25 | 0.030369 0.113317 1.000000 26 | 0.392321 0.026089 1.000000 27 | 0.298871 -0.915427 -1.000000 28 | -0.034581 -0.133887 1.000000 29 | 0.405956 0.206980 1.000000 30 | 0.144902 -0.605762 -1.000000 31 | 0.274362 -0.401338 1.000000 32 | 0.397998 -0.780144 -1.000000 33 | 0.037863 0.155137 1.000000 34 | -0.010363 -0.004170 1.000000 35 | 0.506519 0.486619 -1.000000 36 | 0.000082 -0.020625 1.000000 37 | 0.057761 -0.155140 1.000000 38 | 0.027748 -0.553763 -1.000000 39 | -0.413363 -0.746830 -1.000000 40 | 0.081500 -0.014264 1.000000 41 | 0.047137 -0.491271 1.000000 42 | -0.267459 0.024770 1.000000 43 | -0.148288 -0.532471 -1.000000 44 | -0.225559 -0.201622 1.000000 45 | 0.772360 -0.518986 -1.000000 46 | -0.440670 0.688739 -1.000000 47 | 0.329064 -0.095349 1.000000 48 | 0.970170 -0.010671 -1.000000 49 | -0.689447 -0.318722 -1.000000 50 | -0.465493 -0.227468 -1.000000 51 | -0.049370 0.405711 1.000000 52 | -0.166117 0.274807 1.000000 53 | 0.054483 0.012643 1.000000 54 | 0.021389 0.076125 1.000000 55 | -0.104404 -0.914042 -1.000000 56 | 0.294487 0.440886 -1.000000 57 | 0.107915 -0.493703 -1.000000 58 | 0.076311 0.438860 1.000000 59 | 0.370593 -0.728737 -1.000000 60 | 0.409890 0.306851 -1.000000 61 | 0.285445 0.474399 -1.000000 62 | -0.870134 -0.161685 -1.000000 63 | -0.654144 -0.675129 -1.000000 64 | 0.285278 -0.767310 -1.000000 65 | 0.049548 -0.000907 1.000000 66 | 0.030014 -0.093265 1.000000 67 | -0.128859 0.278865 1.000000 68 | 0.307463 0.085667 1.000000 69 | 0.023440 0.298638 1.000000 70 | 0.053920 0.235344 1.000000 71 | 0.059675 0.533339 -1.000000 72 | 0.817125 0.016536 -1.000000 73 | -0.108771 0.477254 1.000000 74 | -0.118106 0.017284 1.000000 75 | 0.288339 0.195457 1.000000 76 | 0.567309 -0.200203 -1.000000 77 | -0.202446 0.409387 1.000000 78 | -0.330769 -0.240797 1.000000 79 | -0.422377 0.480683 -1.000000 80 | -0.295269 0.326017 1.000000 81 | 0.261132 0.046478 1.000000 82 | -0.492244 -0.319998 -1.000000 83 | -0.384419 0.099170 1.000000 84 | 0.101882 -0.781145 -1.000000 85 | 0.234592 -0.383446 1.000000 86 | -0.020478 -0.901833 -1.000000 87 | 0.328449 0.186633 1.000000 88 | -0.150059 -0.409158 1.000000 89 | -0.155876 -0.843413 -1.000000 90 | -0.098134 -0.136786 1.000000 91 
| 0.110575 -0.197205 1.000000 92 | 0.219021 0.054347 1.000000 93 | 0.030152 0.251682 1.000000 94 | 0.033447 -0.122824 1.000000 95 | -0.686225 -0.020779 -1.000000 96 | -0.911211 -0.262011 -1.000000 97 | 0.572557 0.377526 -1.000000 98 | -0.073647 -0.519163 -1.000000 99 | -0.281830 -0.797236 -1.000000 100 | -0.555263 0.126232 -1.000000 101 | -------------------------------------------------------------------------------- /3SVM/data2.txt: -------------------------------------------------------------------------------- 1 | 0.051267,0.69956,1 2 | -0.092742,0.68494,1 3 | -0.21371,0.69225,1 4 | -0.375,0.50219,1 5 | -0.51325,0.46564,1 6 | -0.52477,0.2098,1 7 | -0.39804,0.034357,1 8 | -0.30588,-0.19225,1 9 | 0.016705,-0.40424,1 10 | 0.13191,-0.51389,1 11 | 0.38537,-0.56506,1 12 | 0.52938,-0.5212,1 13 | 0.63882,-0.24342,1 14 | 0.73675,-0.18494,1 15 | 0.54666,0.48757,1 16 | 0.322,0.5826,1 17 | 0.16647,0.53874,1 18 | -0.046659,0.81652,1 19 | -0.17339,0.69956,1 20 | -0.47869,0.63377,1 21 | -0.60541,0.59722,1 22 | -0.62846,0.33406,1 23 | -0.59389,0.005117,1 24 | -0.42108,-0.27266,1 25 | -0.11578,-0.39693,1 26 | 0.20104,-0.60161,1 27 | 0.46601,-0.53582,1 28 | 0.67339,-0.53582,1 29 | -0.13882,0.54605,1 30 | -0.29435,0.77997,1 31 | -0.26555,0.96272,1 32 | -0.16187,0.8019,1 33 | -0.17339,0.64839,1 34 | -0.28283,0.47295,1 35 | -0.36348,0.31213,1 36 | -0.30012,0.027047,1 37 | -0.23675,-0.21418,1 38 | -0.06394,-0.18494,1 39 | 0.062788,-0.16301,1 40 | 0.22984,-0.41155,1 41 | 0.2932,-0.2288,1 42 | 0.48329,-0.18494,1 43 | 0.64459,-0.14108,1 44 | 0.46025,0.012427,1 45 | 0.6273,0.15863,1 46 | 0.57546,0.26827,1 47 | 0.72523,0.44371,1 48 | 0.22408,0.52412,1 49 | 0.44297,0.67032,1 50 | 0.322,0.69225,1 51 | 0.13767,0.57529,1 52 | -0.0063364,0.39985,1 53 | -0.092742,0.55336,1 54 | -0.20795,0.35599,1 55 | -0.20795,0.17325,1 56 | -0.43836,0.21711,1 57 | -0.21947,-0.016813,1 58 | -0.13882,-0.27266,1 59 | 0.18376,0.93348,0 60 | 0.22408,0.77997,0 61 | 0.29896,0.61915,0 62 | 0.50634,0.75804,0 63 | 0.61578,0.7288,0 64 | 0.60426,0.59722,0 65 | 0.76555,0.50219,0 66 | 0.92684,0.3633,0 67 | 0.82316,0.27558,0 68 | 0.96141,0.085526,0 69 | 0.93836,0.012427,0 70 | 0.86348,-0.082602,0 71 | 0.89804,-0.20687,0 72 | 0.85196,-0.36769,0 73 | 0.82892,-0.5212,0 74 | 0.79435,-0.55775,0 75 | 0.59274,-0.7405,0 76 | 0.51786,-0.5943,0 77 | 0.46601,-0.41886,0 78 | 0.35081,-0.57968,0 79 | 0.28744,-0.76974,0 80 | 0.085829,-0.75512,0 81 | 0.14919,-0.57968,0 82 | -0.13306,-0.4481,0 83 | -0.40956,-0.41155,0 84 | -0.39228,-0.25804,0 85 | -0.74366,-0.25804,0 86 | -0.69758,0.041667,0 87 | -0.75518,0.2902,0 88 | -0.69758,0.68494,0 89 | -0.4038,0.70687,0 90 | -0.38076,0.91886,0 91 | -0.50749,0.90424,0 92 | -0.54781,0.70687,0 93 | 0.10311,0.77997,0 94 | 0.057028,0.91886,0 95 | -0.10426,0.99196,0 96 | -0.081221,1.1089,0 97 | 0.28744,1.087,0 98 | 0.39689,0.82383,0 99 | 0.63882,0.88962,0 100 | 0.82316,0.66301,0 101 | 0.67339,0.64108,0 102 | 1.0709,0.10015,0 103 | -0.046659,-0.57968,0 104 | -0.23675,-0.63816,0 105 | -0.15035,-0.36769,0 106 | -0.49021,-0.3019,0 107 | -0.46717,-0.13377,0 108 | -0.28859,-0.060673,0 109 | -0.61118,-0.067982,0 110 | -0.66302,-0.21418,0 111 | -0.59965,-0.41886,0 112 | -0.72638,-0.082602,0 113 | -0.83007,0.31213,0 114 | -0.72062,0.53874,0 115 | -0.59389,0.49488,0 116 | -0.48445,0.99927,0 117 | -0.0063364,0.99927,0 118 | 0.63265,-0.030612,0 119 | -------------------------------------------------------------------------------- /3SVM/testSet.txt: -------------------------------------------------------------------------------- 1 | 3.542485 
1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 -0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /3SVM/testSetRBF.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 -1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 
-0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | -0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 -1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | -0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /3SVM/testSetRBF2.txt: -------------------------------------------------------------------------------- 1 | -0.315195 0.179853 1.000000 2 | 0.107595 -0.029051 1.000000 3 | -0.185186 -0.046594 1.000000 4 | 0.417559 -0.611041 -1.000000 5 | -0.709961 -0.378453 -1.000000 6 | 0.817966 -0.339780 -1.000000 7 | -0.426304 0.246405 1.000000 8 | -0.629090 -0.034066 -1.000000 9 | -0.101373 -0.342936 1.000000 10 | -0.115091 0.589987 -1.000000 11 | -0.742537 -0.230896 -1.000000 12 | -0.176015 0.202490 1.000000 13 | 0.482520 0.077956 1.000000 14 | -0.451300 0.058579 1.000000 15 | -0.101936 0.560738 -1.000000 16 | 0.117699 0.025009 1.000000 17 | -0.277058 -0.163113 1.000000 18 | -0.841363 -0.237808 -1.000000 19 | 0.102499 -0.524597 -1.000000 20 | 0.741458 -0.476078 -1.000000 21 | -0.530935 -0.381524 -1.000000 22 | -0.000124 0.000154 1.000000 23 | -0.218510 -0.283071 1.000000 24 | 0.064006 -0.075869 1.000000 
25 | 0.187447 0.015254 1.000000 26 | 0.388434 -0.291113 1.000000 27 | -0.296896 0.210095 1.000000 28 | -0.378333 0.635760 -1.000000 29 | 0.574020 -0.146708 -1.000000 30 | 0.801734 -0.525407 -1.000000 31 | -0.060222 0.052230 1.000000 32 | -0.214719 -0.483126 -1.000000 33 | 0.045105 0.413242 1.000000 34 | 0.663627 -0.719413 -1.000000 35 | 0.468531 0.420537 -1.000000 36 | -0.049267 0.030175 1.000000 37 | -0.343603 -0.301825 1.000000 38 | 0.083517 0.135366 1.000000 39 | 0.179711 0.710245 -1.000000 40 | 0.250455 -0.125666 1.000000 41 | -0.268469 -0.270694 1.000000 42 | 0.470130 -0.445839 -1.000000 43 | 0.024252 0.963110 -1.000000 44 | 0.027918 -0.870127 -1.000000 45 | -0.166923 0.377378 1.000000 46 | -0.115090 -0.208590 1.000000 47 | -0.001965 0.199851 1.000000 48 | 0.072049 -0.234439 1.000000 49 | -0.290787 0.495726 -1.000000 50 | -0.726125 -0.073736 -1.000000 51 | 0.167871 -0.017862 1.000000 52 | -0.378729 0.010530 1.000000 53 | -0.090217 -0.013859 1.000000 54 | 0.076733 0.039887 1.000000 55 | 0.144850 0.102434 1.000000 56 | 0.182131 -0.122162 1.000000 57 | 0.438593 0.318277 -1.000000 58 | -0.392669 -0.128420 1.000000 59 | -0.012482 0.058373 1.000000 60 | -0.054475 -0.034040 1.000000 61 | 0.069161 -0.112257 1.000000 62 | 0.010766 0.147746 1.000000 63 | -0.463447 -0.185588 1.000000 64 | 0.251545 -0.207783 1.000000 65 | -0.338931 0.222263 1.000000 66 | -0.364475 0.023580 1.000000 67 | -0.093060 0.191695 1.000000 68 | -0.608280 0.212288 -1.000000 69 | -0.139076 0.026915 1.000000 70 | -0.362995 0.189204 1.000000 71 | 0.429399 0.593252 -1.000000 72 | 0.300878 0.494624 -1.000000 73 | 0.639163 -0.203734 -1.000000 74 | 0.387114 -0.101099 1.000000 75 | -0.075003 0.352444 1.000000 76 | -0.811758 -0.314541 -1.000000 77 | -0.567449 -0.051839 -1.000000 78 | -0.049327 -0.468576 1.000000 79 | -0.264613 -0.207274 1.000000 80 | -0.113365 0.096580 1.000000 81 | 0.903412 -0.127743 -1.000000 82 | 0.798913 0.514763 -1.000000 83 | 0.086340 -0.424824 1.000000 84 | 0.762617 0.498092 -1.000000 85 | 0.515900 0.229150 -1.000000 86 | 0.565364 0.620983 -1.000000 87 | -0.183505 -0.310291 1.000000 88 | -0.723313 -0.158385 -1.000000 89 | 0.429548 0.718230 -1.000000 90 | 0.195963 -0.309003 1.000000 91 | -0.202018 -0.008604 1.000000 92 | -0.463432 0.594036 -1.000000 93 | -0.922401 0.299753 -1.000000 94 | 0.313284 0.136471 1.000000 95 | -0.576804 0.220114 -1.000000 96 | 0.263942 0.326218 1.000000 97 | 0.153849 0.134195 1.000000 98 | -0.768446 -0.172051 -1.000000 99 | -0.056146 0.497200 -1.000000 100 | 0.602243 0.117171 -1.000000 101 | -------------------------------------------------------------------------------- /3线性回归.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"3线性回归.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"metadata":{"id":"6El6dFGc72vu","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]},{"metadata":{"id":"SCYwL6I177ax","colab_type":"text"},"cell_type":"markdown","source":["# 线性回归"]},{"metadata":{"id":"M37GNpka789I","colab_type":"code","colab":{}},"cell_type":"code","source":["import numpy as np\n","from sklearn.linear_model import LinearRegression"],"execution_count":0,"outputs":[]},{"metadata":{"id":"52H4hCpz8K0l","colab_type":"code","colab":{}},"cell_type":"code","source":["# 形状非常重要,而且容易错误\n","\n","def fit_normal(X_train, y_train):\n"," 
\"\"\"根据训练数据集X_train, y_train训练Linear Regression模型\"\"\"\n"," assert X_train.shape[0] == y_train.shape[0], \\\n"," \"the size of X_train must be equal to the size of y_train\"\n","\n"," # np.vstack():在竖直方向上堆叠\n"," # np.hstack():在水平方向上平铺\n"," X_b = np.hstack([np.ones((len(X_train), 1)), X_train]) # 为了增加常数项\n"," theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)\n","\n"," intercept = theta[0]\n"," coef = theta[1:]\n","\n"," return theta"],"execution_count":0,"outputs":[]},{"metadata":{"id":"amxYYK3b8M9h","colab_type":"code","colab":{}},"cell_type":"code","source":["def fit_bgd(X_train, y_train, eta=0.01, n_iters=1e4):\n"," \"\"\"根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型\"\"\"\n"," assert X_train.shape[0] == y_train.shape[0], \\\n"," \"the size of X_train must be equal to the size of y_train\"\n","\n","\n"," def costfunc(theta, X_b, y):\n"," # 计算损失函数\n"," try:\n"," return np.sum((y - X_b.dot(theta)) ** 2) / len(y)/2\n"," except:\n"," return float('inf')\n","\n"," def dJ(theta, X_b, y):\n"," # 损失函数求导\n"," return X_b.T.dot(X_b.dot(theta) - y) / len(y)\n","\n"," def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):\n","\n"," theta = initial_theta\n"," cur_iter = 0\n"," print('X_b.dot(theta)=',(X_b.dot(theta)).shape)\n"," print('(X_b.dot(theta) - y).shape=',(X_b.dot(theta) - y).shape)\n"," print('X_b.T.dot(X_b.dot(theta) - y).shape=',X_b.T.dot(X_b.dot(theta) - y).shape)\n","\n"," # y = np.array(data[:,1])时的维度\n"," # y_train.shape= (97,)\n"," # theta.shape= (2,)\n"," # X_b.dot(theta)= (97,)\n"," # (X_b.dot(theta) - y).shape= (97,)\n"," # X_b.T.dot(X_b.dot(theta) - y).shape= (2,)\n","\n","\n"," # y = np.c_[data[:,1]]时的维度\n"," # y_train.shape= (97, 1)\n"," # theta.shape= (2,)\n"," # X_b.dot(theta)= (97,)\n"," # (X_b.dot(theta) - y).shape= (97, 97)\n"," # X_b.T.dot(X_b.dot(theta) - y).shape= (2, 97)\n"," # ValueError: operands could not be broadcast together with shapes (2,) (2,97) \n","\n","\n"," while cur_iter < n_iters:\n"," gradient = dJ(theta, X_b, y)\n"," # print((X_b.dot(theta)).shape)\n"," last_theta = theta\n"," # print(gradient.shape)\n"," theta = theta - eta * gradient\n"," if (abs(costfunc(theta, X_b, y) - costfunc(last_theta, X_b, y)) < epsilon):\n"," break\n","\n"," cur_iter += 1\n","\n"," return theta\n","\n"," X_b = np.hstack([np.ones((len(X_train), 1)), X_train])\n"," print('X_b.shape=',X_b.shape)\n"," print('y_train.shape=',y_train.shape)\n"," initial_theta = np.zeros(X_b.shape[1]) #初始化theta\n"," print('theta.shape=',initial_theta.shape)\n"," theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)\n","\n"," intercept_ = theta[0]\n"," coef_ = theta[1:]\n","\n"," return theta"],"execution_count":0,"outputs":[]},{"metadata":{"id":"HkwsMDhm8QGd","colab_type":"code","colab":{}},"cell_type":"code","source":["def predict(X_predict,theta):\n"," \"\"\"给定待预测数据集X_predict,返回表示X_predict的结果向量\"\"\"\n","\n"," X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])\n"," return X_b.dot(theta)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"lxWyKr3d8SQj","colab_type":"code","colab":{}},"cell_type":"code","source":["def test():\n"," data = np.loadtxt('linear_regression_data1.txt', delimiter=',')\n"," X = np.c_[data[:,0]]\n"," y = np.array(data[:,1])\n"," y1 = np.c_[data[:,1]]\n"," print(fit_normal(X,y))\n"," print(fit_bgd(X,y))\n","\n"," regr = LinearRegression()\n"," regr.fit(X, y)\n"," 
print(regr.intercept_,regr.coef_)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"YNS67dKj8cGQ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":170},"outputId":"2b40d359-4408-477a-c8e8-89d56ea54a76","executionInfo":{"status":"ok","timestamp":1551254029786,"user_tz":-480,"elapsed":841,"user":{"displayName":"Sen Yang","photoUrl":"","userId":"00832503676208839570"}}},"cell_type":"code","source":["test()\n","\n","# ValueError: operands could not be broadcast together with shapes (2,) (2,97) \n"],"execution_count":6,"outputs":[{"output_type":"stream","text":["[-3.89578088 1.19303364]\n","X_b.shape= (97, 2)\n","y_train.shape= (97,)\n","theta.shape= (2,)\n","X_b.dot(theta)= (97,)\n","(X_b.dot(theta) - y).shape= (97,)\n","X_b.T.dot(X_b.dot(theta) - y).shape= (2,)\n","[-3.89027341 1.19248036]\n","-3.8957808783118484 [1.19303364]\n"],"name":"stdout"}]},{"metadata":{"id":"0C99Wllc8gIi","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /4决策树/DT.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 数据集:Mnist 3 | 训练集数量:60000 4 | 测试集数量:10000 5 | ------------------------------ 6 | 运行结果:ID3(未剪枝) 7 | 正确率:85.9% 8 | 运行时长:356s 9 | ''' 10 | 11 | import time 12 | import numpy as np 13 | 14 | 15 | def loadData(fileName): 16 | ''' 17 | 加载文件 18 | :param fileName:要加载的文件路径 19 | :return: 数据集和标签集 20 | ''' 21 | # 存放数据及标记 22 | dataArr = []; 23 | labelArr = [] 24 | # 读取文件 25 | fr = open(fileName) 26 | # 遍历文件中的每一行 27 | for line in fr.readlines(): 28 | # 获取当前行,并按“,”切割成字段放入列表中 29 | # strip:去掉每行字符串首尾指定的字符(默认空格或换行符) 30 | # split:按照指定的字符将字符串切割成每个字段,返回列表形式 31 | curLine = line.strip().split(',') 32 | # 将每行中除标记外的数据放入数据集中(curLine[0]为标记信息) 33 | # 在放入的同时将原先字符串形式的数据转换为整型 34 | # 此外将数据进行了二值化处理,大于128的转换成1,小于的转换成0,方便后续计算 35 | dataArr.append([int(int(num) > 128) for num in curLine[1:]]) 36 | # 将标记信息放入标记集中 37 | # 放入的同时将标记转换为整型 38 | labelArr.append(int(curLine[0])) 39 | # 返回数据集和标记 40 | return dataArr, labelArr 41 | 42 | 43 | def majorClass(labelArr): 44 | ''' 45 | 找到当前标签集中占数目最大的标签 46 | :param labelArr: 标签集 47 | :return: 最大的标签 48 | ''' 49 | # 建立字典,用于不同类别的标签技术 50 | classDict = {} 51 | # 遍历所有标签 52 | for i in range(len(labelArr)): 53 | # 当第一次遇到A标签时,字典内还没有A标签,这时候直接幅值加1是错误的, 54 | # 所以需要判断字典中是否有该键,没有则创建,有就直接自增 55 | if labelArr[i] in classDict.keys(): 56 | # 若在字典中存在该标签,则直接加1 57 | classDict[labelArr[i]] += 1 58 | else: 59 | # 若无该标签,设初值为1,表示出现了1次了 60 | classDict[labelArr[i]] = 1 61 | # 对字典依据值进行降序排序 62 | classSort = sorted(classDict.items(), key=lambda x: x[1], reverse=True) 63 | # 返回最大一项的标签,即占数目最多的标签 64 | return classSort[0][0] 65 | 66 | 67 | def calc_H_D(trainLabelArr): 68 | ''' 69 | 计算数据集D的经验熵,参考公式5.7 经验熵的计算 70 | :param trainLabelArr:当前数据集的标签集 71 | :return: 经验熵 72 | ''' 73 | # 初始化为0 74 | H_D = 0 75 | # 将当前所有标签放入集合中,这样只要有的标签都会在集合中出现,且出现一次。 76 | # 遍历该集合就可以遍历所有出现过的标记并计算其Ck 77 | # 这么做有一个很重要的原因:首先假设一个背景,当前标签集中有一些标记已经没有了,比如说标签集中 78 | # 没有0(这是很正常的,说明当前分支不存在这个标签)。 式5.7中有一项Ck,那按照式中的针对不同标签k 79 | # 计算Cl和D并求和时,由于没有0,那么C0=0,此时C0/D0=0,log2(C0/D0) = log2(0),事实上0并不在log的 80 | # 定义区间内,出现了问题 81 | # 所以使用集合的方式先知道当前标签中都出现了那些标签,随后对每个标签进行计算,如果没出现的标签那一项就 82 | # 不在经验熵中出现(未参与,对经验熵无影响),保证log的计算能一直有定义 83 | trainLabelSet = set([label for label in trainLabelArr]) 84 | # 遍历每一个出现过的标签 85 | for i in trainLabelSet: 86 | # 计算|Ck|/|D| 87 | # trainLabelArr == i:当前标签集中为该标签的的位置 88 | # 例如a = [1, 0, 0, 1], c = (a == 1): c == [True, false, false, True] 89 | # trainLabelArr[trainLabelArr 
== i]:获得为指定标签的样本 90 | # trainLabelArr[trainLabelArr == i].size:获得为指定标签的样本的大小,即标签为i的样本 91 | # 数量,就是|Ck| 92 | # trainLabelArr.size:整个标签集的数量(也就是样本集的数量),即|D| 93 | p = trainLabelArr[trainLabelArr == i].size / trainLabelArr.size 94 | # 对经验熵的每一项累加求和 95 | H_D += -1 * p * np.log2(p) 96 | 97 | # 返回经验熵 98 | return H_D 99 | 100 | 101 | def calcH_D_A(trainDataArr_DevFeature, trainLabelArr): 102 | ''' 103 | 计算经验条件熵 104 | :param trainDataArr_DevFeature:切割后只有feature那列数据的数组 105 | :param trainLabelArr: 标签集数组 106 | :return: 经验条件熵 107 | ''' 108 | # 初始为0 109 | H_D_A = 0 110 | # 在featue那列放入集合中,是为了根据集合中的数目知道该feature目前可取值数目是多少 111 | trainDataSet = set([label for label in trainDataArr_DevFeature]) 112 | 113 | # 对于每一个特征取值遍历计算条件经验熵的每一项 114 | for i in trainDataSet: 115 | # 计算H(D|A) 116 | # trainDataArr_DevFeature[trainDataArr_DevFeature == i].size / trainDataArr_DevFeature.size:|Di| / |D| 117 | # calc_H_D(trainLabelArr[trainDataArr_DevFeature == i]):H(Di) 118 | H_D_A += trainDataArr_DevFeature[trainDataArr_DevFeature == i].size / trainDataArr_DevFeature.size \ 119 | * calc_H_D(trainLabelArr[trainDataArr_DevFeature == i]) 120 | # 返回得出的条件经验熵 121 | return H_D_A 122 | 123 | 124 | def calcBestFeature(trainDataList, trainLabelList): 125 | ''' 126 | 计算信息增益最大的特征 127 | :param trainDataList: 当前数据集 128 | :param trainLabelList: 当前标签集 129 | :return: 信息增益最大的特征及最大信息增益值 130 | ''' 131 | # 将数据集和标签集转换为数组形式 132 | # trainLabelArr转换后需要转置,这样在取数时方便 133 | # 例如a = np.array([1, 2, 3]); b = np.array([1, 2, 3]).T 134 | # 若不转置,a[0] = [1, 2, 3],转置后b[0] = 1, b[1] = 2 135 | # 对于标签集来说,能够很方便地取到每一位是很重要的 136 | trainDataArr = np.array(trainDataList) 137 | trainLabelArr = np.array(trainLabelList).T 138 | 139 | # 获取当前特征数目,也就是数据集的横轴大小 140 | featureNum = trainDataArr.shape[1] 141 | 142 | # 初始化最大信息增益 143 | maxG_D_A = -1 144 | # 初始化最大信息增益的特征 145 | maxFeature = -1 146 | # 对每一个特征进行遍历计算 147 | for feature in range(featureNum): 148 | # “5.2.2 信息增益”中“算法5.1(信息增益的算法)”第一步: 149 | # 1.计算数据集D的经验熵H(D) 150 | H_D = calc_H_D(trainLabelArr) 151 | # 2.计算条件经验熵H(D|A) 152 | # 由于条件经验熵的计算过程中只涉及到标签以及当前特征,为了提高运算速度(全部样本 153 | # 做成的矩阵运算速度太慢,需要剔除不需要的部分),将数据集矩阵进行切割 154 | # 数据集在初始时刻是一个Arr = 60000*784的矩阵,针对当前要计算的feature,在训练集中切割下 155 | # Arr[:, feature]这么一条来,因为后续计算中数据集中只用到这个(没明白的跟着算一遍例5.2) 156 | # trainDataArr[:, feature]:在数据集中切割下这么一条 157 | # trainDataArr[:, feature].flat:将这么一条转换成竖着的列表 158 | # np.array(trainDataArr[:, feature].flat):再转换成一条竖着的矩阵,大小为60000*1(只是初始是 159 | # 这么大,运行过程中是依据当前数据集大小动态变的) 160 | trainDataArr_DevideByFeature = np.array(trainDataArr[:, feature].flat) 161 | # 3.计算信息增益G(D|A) G(D|A) = H(D) - H(D | A) 162 | G_D_A = H_D - calcH_D_A(trainDataArr_DevideByFeature, trainLabelArr) 163 | # 不断更新最大的信息增益以及对应的feature 164 | if G_D_A > maxG_D_A: 165 | maxG_D_A = G_D_A 166 | maxFeature = feature 167 | return maxFeature, maxG_D_A 168 | 169 | 170 | def getSubDataArr(trainDataArr, trainLabelArr, A, a): 171 | ''' 172 | 更新数据集和标签集 173 | :param trainDataArr:要更新的数据集 174 | :param trainLabelArr: 要更新的标签集 175 | :param A: 要去除的特征索引 176 | :param a: 当data[A]== a时,说明该行样本时要保留的 177 | :return: 新的数据集和标签集 178 | ''' 179 | # 返回的数据集 180 | retDataArr = [] 181 | # 返回的标签集 182 | retLabelArr = [] 183 | # 对当前数据的每一个样本进行遍历 184 | for i in range(len(trainDataArr)): 185 | # 如果当前样本的特征为指定特征值a 186 | if trainDataArr[i][A] == a: 187 | # 那么将该样本的第A个特征切割掉,放入返回的数据集中 188 | retDataArr.append(trainDataArr[i][0:A] + trainDataArr[i][A + 1:]) 189 | # 将该样本的标签放入返回标签集中 190 | retLabelArr.append(trainLabelArr[i]) 191 | # 返回新的数据集和标签集 192 | return retDataArr, retLabelArr 193 | 194 | 195 | def createTree(*dataSet): 196 | ''' 197 | 递归创建决策树 198 | 
:param dataSet:(trainDataList, trainLabelList) <<-- 元祖形式 199 | :return:新的子节点或该叶子节点的值 200 | ''' 201 | # 设置Epsilon,“5.3.1 ID3算法”第4步提到需要将信息增益与阈值Epsilon比较,若小于则直接处理后返回T 202 | Epsilon = 0.1 203 | # 从参数中获取trainDataList和trainLabelList 204 | trainDataList = dataSet[0][0] 205 | trainLabelList = dataSet[0][1] 206 | # 打印信息:开始一个子节点创建,打印当前特征向量数目及当前剩余样本数目 207 | print('start a node', len(trainDataList[0]), len(trainLabelList)) 208 | 209 | # 将标签放入一个字典中,当前样本有多少类,在字典中就会有多少项 210 | # 也相当于去重,多次出现的标签就留一次。举个例子,假如处理结束后字典的长度为1,那说明所有的样本 211 | # 都是同一个标签,那就可以直接返回该标签了,不需要再生成子节点了。 212 | classDict = {i for i in trainLabelList} 213 | # 如果D中所有实例属于同一类Ck,则置T为单节点数,并将Ck作为该节点的类,返回T 214 | # 即若所有样本的标签一致,也就不需要再分化,返回标记作为该节点的值,返回后这就是一个叶子节点 215 | if len(classDict) == 1: 216 | # 因为所有样本都是一致的,在标签集中随便拿一个标签返回都行,这里用的第0个(因为你并不知道 217 | # 当前标签集的长度是多少,但运行中所有标签只要有长度都会有第0位。 218 | return trainLabelList[0] 219 | 220 | # 如果A为空集,则置T为单节点数,并将D中实例数最大的类Ck作为该节点的类,返回T 221 | # 即如果已经没有特征可以用来再分化了,就返回占大多数的类别 222 | if len(trainDataList[0]) == 0: 223 | # 返回当前标签集中占数目最大的标签 224 | return majorClass(trainLabelList) 225 | 226 | # 否则,按式5.10计算A中个特征值的信息增益,选择信息增益最大的特征Ag 227 | Ag, EpsilonGet = calcBestFeature(trainDataList, trainLabelList) 228 | 229 | # 如果Ag的信息增益比小于阈值Epsilon,则置T为单节点树,并将D中实例数最大的类Ck 230 | # 作为该节点的类,返回T 231 | if EpsilonGet < Epsilon: 232 | return majorClass(trainLabelList) 233 | 234 | # 否则,对Ag的每一可能值ai,依Ag=ai将D分割为若干非空子集Di,将Di中实例数最大的 235 | # 类作为标记,构建子节点,由节点及其子节点构成树T,返回T 236 | treeDict = {Ag: {}} 237 | # 特征值为0时,进入0分支 238 | # getSubDataArr(trainDataList, trainLabelList, Ag, 0):在当前数据集中切割当前feature,返回新的数据集和标签集 239 | treeDict[Ag][0] = createTree(getSubDataArr(trainDataList, trainLabelList, Ag, 0)) 240 | treeDict[Ag][1] = createTree(getSubDataArr(trainDataList, trainLabelList, Ag, 1)) 241 | 242 | return treeDict 243 | 244 | 245 | def predict(testDataList, tree): 246 | ''' 247 | 预测标签 248 | :param testDataList:样本 249 | :param tree: 决策树 250 | :return: 预测结果 251 | ''' 252 | # treeDict = copy.deepcopy(tree) 253 | 254 | # 死循环,直到找到一个有效地分类 255 | while True: 256 | # 因为有时候当前字典只有一个节点 257 | # 例如{73: {0: {74:6}}}看起来节点很多,但是对于字典的最顶层来说,只有73一个key,其余都是value 258 | # 若还是采用for来读取的话不太合适,所以使用下行这种方式读取key和value 259 | (key, value), = tree.items() 260 | # 如果当前的value是字典,说明还需要遍历下去 261 | if type(tree[key]).__name__ == 'dict': 262 | # 获取目前所在节点的feature值,需要在样本中删除该feature 263 | # 因为在创建树的过程中,feature的索引值永远是对于当时剩余的feature来设置的 264 | # 所以需要不断地删除已经用掉的特征,保证索引相对位置的一致性 265 | dataVal = testDataList[key] 266 | del testDataList[key] 267 | # 将tree更新为其子节点的字典 268 | tree = value[dataVal] 269 | # 如果当前节点的子节点的值是int,就直接返回该int值 270 | # 例如{403: {0: 7, 1: {297:7}},dataVal=0 271 | # 此时上一行tree = value[dataVal],将tree定位到了7,而7不再是一个字典了, 272 | # 这里就可以直接返回7了,如果tree = value[1],那就是一个新的子节点,需要继续遍历下去 273 | if type(tree).__name__ == 'int': 274 | # 返回该节点值,也就是分类值 275 | return tree 276 | else: 277 | # 如果当前value不是字典,那就返回分类值 278 | return value 279 | 280 | 281 | def accuracy(testDataList, testLabelList, tree): 282 | ''' 283 | 测试准确率 284 | :param testDataList:待测试数据集 285 | :param testLabelList: 待测试标签集 286 | :param tree: 训练集生成的树 287 | :return: 准确率 288 | ''' 289 | # 错误次数计数 290 | errorCnt = 0 291 | # 遍历测试集中每一个测试样本 292 | for i in range(len(testDataList)): 293 | # 判断预测与标签中结果是否一致 294 | if testLabelList[i] != predict(testDataList[i], tree): 295 | errorCnt += 1 296 | # 返回准确率 297 | return 1 - errorCnt / len(testDataList) 298 | 299 | 300 | if __name__ == '__main__': 301 | # 开始时间 302 | start = time.time() 303 | 304 | # 获取训练集 305 | trainDataList, trainLabelList = loadData('../data/Mnist/mnist_train.csv') 306 | # 获取测试集 307 | testDataList, testLabelList = 
loadData('../data/Mnist/mnist_test.csv') 308 | 309 | # 创建决策树 310 | print('start create tree') 311 | tree = createTree((trainDataList, trainLabelList)) 312 | print('tree is:', tree) 313 | 314 | # 测试准确率 315 | print('start test') 316 | accur = accuracy(testDataList, testLabelList, tree) 317 | print('the accur is:', accur) 318 | 319 | # 结束时间 320 | end = time.time() 321 | print('time span:', end - start) 322 | 323 | # the accur is: 0.8589 324 | # time span: 313.45452547073364 -------------------------------------------------------------------------------- /4决策树/Dtree_id3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | 6 | class Node(object): 7 | def __init__(self, x=None, label=None, y=None, data=None): 8 | self.label = label # label:子节点分类依据的特征 9 | self.x = x # x:特征 10 | self.child = [] # child:子节点 11 | self.y = y # y:类标记(叶节点才有) 12 | self.data = data # data:包含数据(叶节点才有) 13 | 14 | def append(self, node): # 添加子节点 15 | self.child.append(node) 16 | 17 | def predict(self, features): # 预测数据所述类 18 | if self.y is not None: 19 | return self.y 20 | for c in self.child: 21 | if c.x == features[self.label]: 22 | return c.predict(features) 23 | 24 | 25 | def printnode(node, depth=0): # 打印树所有节点 26 | if node.label is None: 27 | print(depth, (node.label, node.x, node.y, len(node.data))) 28 | else: 29 | print(depth, (node.label, node.x)) 30 | for c in node.child: 31 | printnode(c, depth+1) 32 | 33 | 34 | class DTreeID3(object): 35 | def __init__(self, epsilon=0, alpha=0): 36 | # 信息增益阈值 37 | self.epsilon = epsilon 38 | self.alpha = alpha 39 | self.tree = Node() 40 | 41 | # 求概率 42 | def prob(self, datasets): 43 | datalen = len(datasets) 44 | labelx = set(datasets) 45 | p = {l: 0 for l in labelx} 46 | for d in datasets: 47 | p[d] += 1 48 | for i in p.items(): 49 | p[i[0]] /= datalen 50 | return p 51 | 52 | # 求数据集的熵 53 | def calc_ent(self, datasets): 54 | p = self.prob(datasets) 55 | value = list(p.values()) 56 | return -np.sum(np.multiply(value, np.log2(value))) 57 | 58 | # 求条件熵 59 | def cond_ent(self, datasets, col): 60 | labelx = set(datasets.iloc[col]) 61 | p = {x: [] for x in labelx} 62 | for i, d in enumerate(datasets.iloc[-1]): 63 | p[datasets.iloc[col][i]].append(d) 64 | return sum([self.prob(datasets.iloc[col])[k] * self.calc_ent(p[k]) for k in p.keys()]) 65 | 66 | 67 | # 求信息增益 68 | def info_gain_train(self, datasets, datalabels): 69 | datasets = datasets.T 70 | ent = self.calc_ent(datasets.iloc[-1]) 71 | gainmax = {} 72 | for i in range(len(datasets) - 1): 73 | cond = self.cond_ent(datasets, i) 74 | gainmax[ent - cond] = i 75 | m = max(gainmax.keys()) 76 | return gainmax[m], m 77 | 78 | 79 | def train(self, datasets, node): 80 | labely = datasets.columns[-1] 81 | # 判断样本是否为同一类输出Di,如果是则返回单节点树T。标记类别为Di 82 | if len(datasets[labely].value_counts()) == 1: 83 | node.data = datasets[labely] 84 | node.y = datasets[labely][0] 85 | return 86 | # 判断特征是否为空,如果是则返回单节点树T,标记类别为样本中输出类别D实例数最多的类别 87 | if len(datasets.columns[:-1]) == 0: 88 | node.data = datasets[labely] 89 | node.y = datasets[labely].value_counts().index[0] 90 | return 91 | # 计算A中的各个特征(一共n个)对输出D的信息增益,选择信息增益最大的特征Ag。 92 | gainmaxi, gainmax = self.info_gain_train(datasets, datasets.columns) 93 | # 如果Ag的信息增益小于阈值ε,则返回单节点树T,标记类别为样本中输出类别D实例数最多的类别。 94 | if gainmax <= self.epsilon: 95 | node.data = datasets[labely] 96 | node.y = datasets[labely].value_counts().index[0] 97 | return 98 | # 
按特征Ag的不同取值Agi将对应的样本输出D分成不同的类别Di。每个类别产生一个子节点。对应特征值为Agi。返回增加了节点的数T。 99 | vc = datasets[datasets.columns[gainmaxi]].value_counts() 100 | for Di in vc.index: 101 | node.label = gainmaxi 102 | child = Node(Di) 103 | node.append(child) 104 | new_datasets = pd.DataFrame([list(i) for i in datasets.values if i[gainmaxi]==Di], columns=datasets.columns) 105 | self.train(new_datasets, child) 106 | 107 | #训练数据 108 | def fit(self, datasets): 109 | self.train(datasets, self.tree) 110 | 111 | # 找到所有节点 112 | def findleaf(self, node, leaf): 113 | for t in node.child: 114 | if t.y is not None: 115 | leaf.append(t.data) 116 | else: 117 | for c in node.child: 118 | self.findleaf(c, leaf) 119 | 120 | def findfather(self, node, errormin): 121 | if node.label is not None: 122 | cy = [c.y for c in node.child] 123 | if None not in cy: # 全是叶节点 124 | childdata = [] 125 | for c in node.child: 126 | for d in list(c.data): 127 | childdata.append(d) 128 | childcounter = Counter(childdata) 129 | 130 | old_child = node.child # 剪枝前先拷贝一下 131 | old_label = node.label 132 | old_y = node.y 133 | old_data = node.data 134 | 135 | node.label = None # 剪枝 136 | node.y = childcounter.most_common(1)[0][0] 137 | node.data = childdata 138 | 139 | error = self.c_error() 140 | if error <= errormin: # 剪枝前后损失比较 141 | errormin = error 142 | return 1 143 | else: 144 | node.child = old_child # 剪枝效果不好,则复原 145 | node.label = old_label 146 | node.y = old_y 147 | node.data = old_data 148 | else: 149 | re = 0 150 | i = 0 151 | while i < len(node.child): 152 | if_re = self.findfather(node.child[i], errormin) # 若剪过枝,则其父节点要重新检测 153 | if if_re == 1: 154 | re = 1 155 | elif if_re == 2: 156 | i -= 1 157 | i += 1 158 | if re: 159 | return 2 160 | return 0 161 | 162 | def c_error(self): # 求C(T) 163 | leaf = [] 164 | self.findleaf(self.tree, leaf) 165 | leafnum = [len(l) for l in leaf] 166 | ent = [self.calc_ent(l) for l in leaf] 167 | print("Ent:", ent) 168 | error = self.alpha*len(leafnum) 169 | for l, e in zip(leafnum, ent): 170 | error += l*e 171 | print("C(T):", error) 172 | return error 173 | 174 | def cut(self, alpha=0): # 剪枝 175 | if alpha: 176 | self.alpha = alpha 177 | errormin = self.c_error() 178 | self.findfather(self.tree, errormin) 179 | 180 | if __name__ == "__main__": 181 | 182 | 183 | datasets = np.array([ 184 | ['青年', '否', '否', '一般', '否'], 185 | ['青年', '否', '否', '好', '否'], 186 | ['青年', '是', '否', '好', '是'], 187 | ['青年', '是', '是', '一般', '是'], 188 | ['青年', '否', '否', '一般', '否'], 189 | ['中年', '否', '否', '一般', '否'], 190 | ['中年', '否', '否', '好', '否'], 191 | ['中年', '是', '是', '好', '是'], 192 | ['中年', '否', '是', '非常好', '是'], 193 | ['中年', '否', '是', '非常好', '是'], 194 | ['老年', '否', '是', '非常好', '是'], 195 | ['老年', '否', '是', '好', '是'], 196 | ['老年', '是', '否', '好', '是'], 197 | ['老年', '是', '否', '非常好', '是'], 198 | ['老年', '否', '否', '一般', '否'], 199 | ['青年', '否', '否', '一般', '是']]) # 在李航原始数据上多加了最后这行数据,以便体现剪枝效果 200 | 201 | datalabels = np.array(['年龄', '有工作', '有自己的房子', '信贷情况', '类别']) 202 | train_data = pd.DataFrame(datasets, columns=datalabels) 203 | test_data = ['老年', '否', '否', '一般'] 204 | 205 | dt = DTreeID3(epsilon=0) # 可修改epsilon查看预剪枝效果 206 | dt.fit(train_data) 207 | 208 | print('DTree:') 209 | printnode(dt.tree) 210 | y = dt.tree.predict(test_data) 211 | print('result:', y) 212 | 213 | dt.cut(alpha=0.5) # 可修改正则化参数alpha查看后剪枝效果 214 | 215 | print('DTree:') 216 | printnode(dt.tree) 217 | y = dt.tree.predict(test_data) 218 | print('result:', y) 219 | -------------------------------------------------------------------------------- /4决策树/data.csv: 
-------------------------------------------------------------------------------- 1 | x,y 2 | 3.3,1.7 3 | 4.4,2.76 4 | 5.5,2.09 5 | 6.71,3.19 6 | 6.93,1.694 7 | 4.168,1.573 8 | 9.779,3.366 9 | 6.182,2.596 10 | 7.59,2.53 11 | 2.167,1.221 12 | 7.042,2.827 13 | 10.791,3.465 14 | 5.313,1.65 15 | 7.997,2.904 16 | 3.1,1.3 17 | -------------------------------------------------------------------------------- /6贝叶斯/iris.txt: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 
5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | -------------------------------------------------------------------------------- /6贝叶斯/naiveBayesBase.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # from utils.word_utils import * 3 | # 类似邮件识别 4 | def createVocabList(dataSet): 5 | vocabSet = set() #创建一个空的集合 6 | for doc in dataSet: #遍历dataSet中的每一条言论 7 | vocabSet = vocabSet | set(doc) #取并集 8 | vocabList = list(vocabSet) 9 | return vocabList 10 | 11 | def setOfWord2Vec(vocabList, inputSet): 12 | returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 13 | for word in inputSet: #遍历每个词条 14 | if word in vocabList: #如果词条存在于词汇表中,则变为1 15 | returnVec[vocabList.index(word)] = 1 16 | else: 17 | print(f" {word} is not in my Vocabulary!" 
) 18 | return returnVec #返回文档向量 19 | 20 | def get_trainMat(dataSet): 21 | trainMat = [] #初始化向量列表 22 | vocabList = createVocabList(dataSet) #生成词汇表 23 | for inputSet in dataSet: #遍历样本词条中的每一条样本 24 | returnVec=setOfWords2Vec(vocabList, inputSet) #将当前词条向量化 25 | trainMat.append(returnVec) #追加到向量列表中 26 | return trainMat 27 | 28 | class NaiveBayesBase(object): 29 | 30 | def __init__(self): 31 | pass 32 | 33 | 34 | def fit(self, trainMatrix, trainCategory): 35 | ''' 36 | 朴素贝叶斯分类器训练函数,求:p(Ci),基于词汇表的p(w|Ci) 37 | Args: 38 | trainMatrix : 训练矩阵,即向量化表示后的文档(词条集合) 39 | trainCategory : 文档中每个词条的列表标注 40 | Return: 41 | p0Vect : 属于0类别的概率向量(p(w1|C0),p(w2|C0),...,p(wn|C0)) 42 | p1Vect : 属于1类别的概率向量(p(w1|C1),p(w2|C1),...,p(wn|C1)) 43 | pAbusive : 属于1类别文档的概率 44 | ''' 45 | numTrainDocs = len(trainMatrix) 46 | # 长度为词汇表长度 47 | numWords = len(trainMatrix[0]) 48 | # p(ci) 49 | self.pAbusive = sum(trainCategory) / float(numTrainDocs) 50 | # 由于后期要计算p(w|Ci)=p(w1|Ci)*p(w2|Ci)*...*p(wn|Ci),若wj未出现,则p(wj|Ci)=0,因此p(w|Ci)=0,这样显然是不对的 51 | # 故在初始化时,将所有词的出现数初始化为1,分母即出现词条总数初始化为2 52 | p0Num = np.ones(numWords) 53 | p1Num = np.ones(numWords) 54 | p0Denom = 2.0 55 | p1Denom = 2.0 56 | for i in range(numTrainDocs): 57 | if trainCategory[i] == 1: 58 | p1Num += trainMatrix[i] 59 | p1Denom += sum(trainMatrix[i]) 60 | else: 61 | p0Num += trainMatrix[i] 62 | p0Denom += sum(trainMatrix[i]) 63 | # p(wi | c1) 64 | # 为了避免下溢出(当所有的p都很小时,再相乘会得到0.0,使用log则会避免得到0.0) 65 | self.p1Vect = np.log(p1Num / p1Denom) 66 | # p(wi | c2) 67 | self.p0Vect = np.log(p0Num / p0Denom) 68 | return self 69 | 70 | 71 | def predict(self, testX): 72 | ''' 73 | 朴素贝叶斯分类器 74 | Args: 75 | testX : 待分类的文档向量(已转换成array) 76 | p0Vect : p(w|C0) 77 | p1Vect : p(w|C1) 78 | pAbusive : p(C1) 79 | Return: 80 | 1 : 为侮辱性文档 (基于当前文档的p(w|C1)*p(C1)=log(基于当前文档的p(w|C1))+log(p(C1))) 81 | 0 : 非侮辱性文档 (基于当前文档的p(w|C0)*p(C0)=log(基于当前文档的p(w|C0))+log(p(C0))) 82 | ''' 83 | 84 | p1 = np.sum(testX * self.p1Vect) + np.log(self.pAbusive) 85 | p0 = np.sum(testX * self.p0Vect) + np.log(1 - self.pAbusive) 86 | if p1 > p0: 87 | return 1 88 | else: 89 | return 0 90 | 91 | def loadDataSet(): 92 | '''数据加载函数。这里是一个小例子''' 93 | postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 94 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 95 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 96 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 97 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 98 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']] 99 | classVec = [0, 1, 0, 1, 0, 1] # 1代表侮辱性文字,0代表正常言论,代表上面6个样本的类别 100 | return postingList, classVec 101 | 102 | 103 | def checkNB(): 104 | '''测试''' 105 | listPosts, listClasses = loadDataSet() 106 | myVocabList = createVocabList(listPosts) 107 | trainMat = [] 108 | for postDoc in listPosts: 109 | trainMat.append(setOfWord2Vec(myVocabList, postDoc)) 110 | 111 | nb = NaiveBayesBase() 112 | nb.fit(np.array(trainMat), np.array(listClasses)) 113 | 114 | testEntry1 = ['love', 'my', 'dalmation'] 115 | thisDoc = np.array(setOfWord2Vec(myVocabList, testEntry1)) 116 | print(testEntry1, 'classified as:', nb.predict(thisDoc)) 117 | 118 | testEntry2 = ['stupid', 'garbage'] 119 | thisDoc2 = np.array(setOfWord2Vec(myVocabList, testEntry2)) 120 | print(testEntry2, 'classified as:', nb.predict(thisDoc2)) 121 | 122 | 123 | if __name__ == "__main__": 124 | checkNB() -------------------------------------------------------------------------------- /6贝叶斯/naiveBayes_mnist.py: 
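A quick cross-check for the hand-rolled estimates in naiveBayes_mnist.py: scikit-learn's BernoulliNB with alpha=1.0 applies the same +1/+2 Laplace smoothing to binarized features. This is a minimal illustrative sketch, not part of the original file; trainDataArr, trainLabelArr, testDataArr and testLabelArr are assumed to be the arrays returned by the file's own loadData().

from sklearn.naive_bayes import BernoulliNB

# alpha=1.0 matches the (count + 1) / (count0 + count1 + 2) smoothing used below for Px_y;
# the class prior differs marginally because sklearn's default prior is the unsmoothed
# class frequency rather than (count + 1) / (N + 10).
clf = BernoulliNB(alpha=1.0)
clf.fit(trainDataArr, trainLabelArr)
print(clf.class_log_prior_)                  # roughly comparable to Py
print(clf.score(testDataArr, testLabelArr))  # roughly comparable to accuracy()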
-------------------------------------------------------------------------------- 1 | ''' 2 | 数据集:Mnist 3 | 训练集数量:60000 4 | 测试集数量:10000 5 | ''' 6 | 7 | import numpy as np 8 | import time 9 | 10 | 11 | def loadData(fileName): 12 | ''' 13 | 加载文件 14 | :param fileName:要加载的文件路径 15 | :return: 数据集和标签集 16 | ''' 17 | # 存放数据及标记 18 | dataArr = []; 19 | labelArr = [] 20 | # 读取文件 21 | fr = open(fileName) 22 | # 遍历文件中的每一行 23 | for line in fr.readlines(): 24 | # 获取当前行,并按“,”切割成字段放入列表中 25 | # strip:去掉每行字符串首尾指定的字符(默认空格或换行符) 26 | # split:按照指定的字符将字符串切割成每个字段,返回列表形式 27 | curLine = line.strip().split(',') 28 | # 将每行中除标记外的数据放入数据集中(curLine[0]为标记信息) 29 | # 在放入的同时将原先字符串形式的数据转换为整型 30 | # 此外将数据进行了二值化处理,大于128的转换成1,小于的转换成0,方便后续计算 31 | dataArr.append([int(int(num) > 128) for num in curLine[1:]]) 32 | # 将标记信息放入标记集中 33 | # 放入的同时将标记转换为整型 34 | labelArr.append(int(curLine[0])) 35 | # 返回数据集和标记 36 | return dataArr, labelArr 37 | 38 | 39 | def NaiveBayes(Py, Px_y, x): 40 | ''' 41 | 通过朴素贝叶斯进行概率估计 42 | :param Py: 先验概率分布 43 | :param Px_y: 条件概率分布 44 | :param x: 要估计的样本x 45 | :return: 返回所有label的估计概率 46 | ''' 47 | # 设置特征数目 48 | featrueNum = 784 49 | # 设置类别数目 50 | classNum = 10 51 | # 建立存放所有标记的估计概率数组 52 | P = [0] * classNum 53 | # 对于每一个类别,单独估计其概率 54 | for i in range(classNum): 55 | # 初始化sum为0,sum为求和项。 56 | # 在训练过程中对概率进行了log处理,所以这里原先应当是连乘所有概率,最后比较哪个概率最大 57 | # 但是当使用log处理时,连乘变成了累加,所以使用sum 58 | sum = 0 59 | # 获取每一个条件概率值,进行累加 60 | for j in range(featrueNum): 61 | sum += Px_y[i][j][x[j]] 62 | # 最后再和先验概率相加(也就是式4.7中的先验概率乘以后头那些东西,乘法因为log全变成了加法) 63 | P[i] = sum + Py[i] 64 | 65 | # max(P):找到概率最大值 66 | # P.index(max(P)):找到该概率最大值对应的所有(索引值和标签值相等) 67 | return P.index(max(P)) 68 | 69 | 70 | def accuracy(Py, Px_y, testDataArr, testLabelArr): 71 | ''' 72 | 对测试集进行测试 73 | :param Py: 先验概率分布 74 | :param Px_y: 条件概率分布 75 | :param testDataArr: 测试集数据 76 | :param testLabelArr: 测试集标记 77 | :return: 准确率 78 | ''' 79 | # 错误值计数 80 | errorCnt = 0 81 | # 循环遍历测试集中的每一个样本 82 | for i in range(len(testDataArr)): 83 | # 获取预测值 84 | presict = NaiveBayes(Py, Px_y, testDataArr[i]) 85 | # 与答案进行比较 86 | if presict != testLabelArr[i]: 87 | # 若错误 错误值计数加1 88 | errorCnt += 1 89 | # 返回准确率 90 | return 1 - (errorCnt / len(testDataArr)) 91 | 92 | 93 | def getAllProbability(trainDataArr, trainLabelArr): 94 | ''' 95 | 通过训练集计算先验概率分布和条件概率分布 96 | :param trainDataArr: 训练数据集 97 | :param trainLabelArr: 训练标记集 98 | :return: 先验概率分布和条件概率分布 99 | ''' 100 | # 设置样本特诊数目,数据集中手写图片为28*28,转换为向量是784维。 101 | # (我们的数据集已经从图像转换成784维的形式了,CSV格式内就是) 102 | featureNum = 784 103 | # 设置类别数目,0-9共十个类别 104 | classNum = 10 105 | 106 | # 初始化先验概率分布存放数组,后续计算得到的P(Y = 0)放在Py[0]中,以此类推 107 | # 数据长度为10行1列 108 | # 各个类别的先验概率分布 109 | Py = np.zeros((classNum, 1)) 110 | # 对每个类别进行一次循环,分别计算它们的先验概率分布 111 | # 计算公式为书中"4.2节 朴素贝叶斯法的参数估计 公式4.8" 112 | for i in range(classNum): 113 | # 下方式子拆开分析 114 | # np.mat(trainLabelArr) == i:将标签转换为矩阵形式,里面的每一位与i比较,若相等,该位变为Ture,反之False 115 | # np.sum(np.mat(trainLabelArr) == i):计算上一步得到的矩阵中Ture的个数,进行求和(直观上就是找所有label中有多少个 116 | # 为i的标记,求得4.8式P(Y = Ck)中的分子) 117 | # np.sum(np.mat(trainLabelArr) == i)) + 1:参考“4.2.3节 贝叶斯估计”,例如若数据集总不存在y=1的标记,也就是说 118 | # 手写数据集中没有1这张图,那么如果不加1,由于没有y=1,所以分子就会变成0,那么在最后求后验概率时这一项就变成了0,再 119 | # 和条件概率乘,结果同样为0,不允许存在这种情况,所以分子加1,分母加上K(K为标签可取的值数量,这里有10个数,取值为10) 120 | # 参考公式4.11 121 | # (len(trainLabelArr) + 10):标签集的总长度+10. 
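# (Illustrative example with assumed counts: with 60000 training labels and 10 classes the
# denominator is 60000 + 10 = 60010; if, say, 5923 labels equal 0, then Py[0] = 5924 / 60010 ≈ 0.0987,
# barely different from the unsmoothed 5923 / 60000; the +1 / +10 smoothing only matters when a
# class is absent from the training set, where it keeps that prior from collapsing to 0.)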
122 | # ((np.sum(np.mat(trainLabelArr) == i)) + 1) / (len(trainLabelArr) + 10):最后求得的先验概率 123 | Py[i] = ((np.sum(np.mat(trainLabelArr) == i)) + 1) / (len(trainLabelArr) + 10) 124 | # 转换为log对数形式 125 | # log书中没有写到,但是实际中需要考虑到,原因是这样: 126 | # 最后求后验概率估计的时候,形式是各项的相乘(“4.1 朴素贝叶斯法的学习” 式4.7),这里存在两个问题:1.某一项为0时,结果为0. 127 | # 这个问题通过分子和分母加上一个相应的数可以排除,前面已经做好了处理。2.如果特征特别多(例如在这里,需要连乘的项目有784个特征 128 | # 加一个先验概率分布一共795项相乘,所有数都是0-1之间,结果一定是一个很小的接近0的数。)理论上可以通过结果的大小值判断, 但在 129 | # 程序运行中很可能会向下溢出无法比较,因为值太小了。所以人为把值进行log处理。log在定义域内是一个递增函数,也就是说log(x)中, 130 | # x越大,log也就越大,单调性和原数据保持一致。所以加上log对结果没有影响。此外连乘项通过log以后,可以变成各项累加,简化了计算。 131 | # 在似然函数中通常会使用log的方式进行处理(至于此书中为什么没涉及,我也不知道) 132 | Py = np.log(Py) 133 | 134 | # 计算条件概率 Px_y=P(X=x|Y = y) 135 | # 计算条件概率分成了两个步骤,下方第一个大for循环用于累加,参考书中“4.2.3 贝叶斯估计 式4.10”,下方第一个大for循环内部是 136 | # 用于计算式4.10的分子,至于分子的+1以及分母的计算在下方第二个大For内 137 | # 初始化为全0矩阵,用于存放所有情况下的条件概率 138 | Px_y = np.zeros((classNum, featureNum, 2)) 139 | # 对标记集进行遍历 140 | for i in range(len(trainLabelArr)): 141 | # 获取当前循环所使用的标记 142 | label = trainLabelArr[i] 143 | # 获取当前要处理的样本 144 | x = trainDataArr[i] 145 | # 对该样本的每一维特诊进行遍历 146 | for j in range(featureNum): 147 | # 在矩阵中对应位置加1 148 | # 这里还没有计算条件概率,先把所有数累加,全加完以后,在后续步骤中再求对应的条件概率 149 | Px_y[label][j][x[j]] += 1 150 | 151 | # 第二个大for,计算式4.10的分母,以及分子和分母之间的除法 152 | # 循环每一个标记(共10个) 153 | for label in range(classNum): 154 | # 循环每一个标记对应的每一个特征 155 | for j in range(featureNum): 156 | # 获取y=label,第j个特诊为0的个数 157 | Px_y0 = Px_y[label][j][0] 158 | # 获取y=label,第j个特诊为1的个数 159 | Px_y1 = Px_y[label][j][1] 160 | # 对式4.10的分子和分母进行相除,再除之前依据贝叶斯估计,分母需要加上2(为每个特征可取值个数) 161 | # 分别计算对于y= label,x第j个特征为0和1的条件概率分布 162 | Px_y[label][j][0] = np.log((Px_y0 + 1) / (Px_y0 + Px_y1 + 2)) 163 | Px_y[label][j][1] = np.log((Px_y1 + 1) / (Px_y0 + Px_y1 + 2)) 164 | 165 | # 返回先验概率分布和条件概率分布 166 | return Py, Px_y 167 | 168 | 169 | if __name__ == "__main__": 170 | start = time.time() 171 | # 获取训练集 172 | print('start read transSet') 173 | trainDataArr, trainLabelArr = loadData('../data/Mnist/mnist_train.csv') 174 | 175 | # 获取测试集 176 | print('start read testSet') 177 | testDataArr, testLabelArr = loadData('../data/Mnist/mnist_test.csv') 178 | 179 | # 开始训练,学习先验概率分布和条件概率分布 180 | print('start to train') 181 | Py, Px_y = getAllProbability(trainDataArr, trainLabelArr) 182 | 183 | # 使用习得的先验概率分布和条件概率分布对测试集进行测试 184 | print('start to test') 185 | accuracy = accuracy(Py, Px_y, testDataArr, testLabelArr) 186 | 187 | # 打印准确率 188 | print('the accuracy is:', accuracy) 189 | # 打印时间 190 | print('time span:', time.time() - start) 191 | 192 | # start read transSet 193 | # start read testSet 194 | # start to train 195 | # start to test 196 | # the accuracy is: 0.8432999999999999 197 | # time span: 97.44515252113342 198 | -------------------------------------------------------------------------------- /6贝叶斯/多项式贝叶斯hand_极大似然估计.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | 4 | class naiveBayes_MLE: 5 | def __init__(self, X, Y, N, n, K, x): 6 | self.X = X # 训练数据的特征 7 | self.Y = Y # 训练数据的类标记 8 | self.N = N # 训练数据个数 9 | self.n = n # 特征的个数 10 | self.K = K # 类标记的个数 11 | self.x = x # 待分类实例 12 | 13 | def prob(self): 14 | # 先验概率 15 | prior = {} 16 | # 条件概率 17 | conditional = {} 18 | # Y 训练数据的类标记 19 | # set(Y):总共几类,1和-1 20 | for c in set(self.Y): 21 | prior[c] = 0 22 | conditional[c] = {} 23 | for j in range(self.n): 24 | for a in set(self.X[j]): 25 | conditional[c][a] = 0 26 | print(conditional) 27 | # set(self.X[1])为 1,2,3 ; set(self.X[2) 为S,L,M 28 | # {1: {1: 0, 2: 0, 3: 0, 'S': 0, 'L': 0, 'M': 
0}, 29 | # -1: {1: 0, 2: 0, 3: 0, 'S': 0, 'L': 0, 'M': 0}} 30 | 31 | # 计算先验概率和条件概率 32 | # N为样本数量 33 | # prior:{1:0,-1:0} 记录每个类别的数量 34 | for i in range(self.N): 35 | # Y[i]=1或-1 36 | prior[self.Y[i]] += 1 37 | # n为特征的个数 38 | for j in range(self.n): 39 | # X[1][i]的取值为1,2,3 40 | # X[2][i]的取值为S,L,M 41 | conditional[self.Y[i]][self.X[j][i]] += 1 42 | # 除以样本总数N之后,得到真正意义上的先验概率和条件概率 43 | for c in set(self.Y): 44 | for j in range(self.n): 45 | for a in set(self.X[j]): 46 | conditional[c][a] /= prior[c] 47 | 48 | prior[c] /= self.N 49 | 50 | return prior, conditional 51 | 52 | # 确定实例x的类 53 | def classifier(self): 54 | prior, conditional = self.prob() 55 | # 计算各类别的后验概率 56 | posterior = {} 57 | for c in set(self.Y): 58 | cond = 1 59 | for j in range(self.n): 60 | cond *= conditional[c][self.x[j]] 61 | posterior[c] = prior[c] * cond 62 | 63 | # 取最大后验概率的类别max(dict, key=dict.get) 64 | argmax = max(posterior, key=posterior.get) 65 | 66 | return posterior, argmax 67 | 68 | 69 | # + 70 | 71 | X = [[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3], 72 | ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L']] 73 | Y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1] 74 | N = len(Y) 75 | n = len(X) 76 | K = len(set(Y)) 77 | x = [2, 'S'] 78 | 79 | nb = naiveBayes_MLE(X, Y, N, n, K, x) 80 | posterior, argmax = nb.classifier() 81 | print("每个类别的后验概率:", posterior) 82 | print("x=", x, "的类标记y为", argmax) 83 | # - 84 | 85 | nb.prob() 86 | 87 | 88 | -------------------------------------------------------------------------------- /6贝叶斯/多项式贝叶斯hand_贝叶斯估计.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | """ 3 | @author:hanmy 4 | @file:nb_be.py 5 | @time:2019/04/29 6 | """ 7 | class naiveBayes_BE: 8 | def __init__(self, X, Y, N, n, K, x, lamb): 9 | self.X = X # 训练数据的特征 10 | self.Y = Y # 训练数据的类标记 11 | self.N = N # 训练数据个数 12 | self.n = n # 特征的个数 13 | self.K = K # 类标记的个数 14 | self.x = x # 待分类实例 15 | self.lamb = lamb # 贝叶斯估计的lambda 16 | 17 | def prob(self): 18 | # 先验概率 19 | prior = {} 20 | # 条件概率 21 | conditional = {} 22 | for c in set(self.Y): 23 | prior[c] = 0 24 | conditional[c] = {} 25 | for j in range(self.n): 26 | for a in set(self.X[j]): 27 | conditional[c][a] = 0 28 | # 每个特征有多少个不同的特征值 29 | S = [0]*self.n 30 | for j in range(self.n): 31 | for _ in set(self.X[j]): 32 | S[j] += 1 33 | 34 | # 计算先验概率和条件概率 35 | for i in range(self.N): 36 | prior[self.Y[i]] += 1 37 | for j in range(self.n): 38 | conditional[self.Y[i]][self.X[j][i]] += 1 39 | 40 | for c in set(self.Y): 41 | for j in range(self.n): 42 | for a in set(self.X[j]): 43 | conditional[c][a] = (conditional[c][a] + self.lamb) / (prior[c] + S[j]*self.lamb) 44 | 45 | prior[c] = (prior[c] + self.lamb) / (self.N + self.K*self.lamb) 46 | 47 | return prior, conditional 48 | 49 | # 确定实例x的类 50 | def classifier(self): 51 | prior, conditional = self.prob() 52 | # 计算各类别的后验概率 53 | posterior = {} 54 | for c in set(self.Y): 55 | cond = 1 56 | for j in range(self.n): 57 | cond *= conditional[c][self.x[j]] 58 | posterior[c] = prior[c] * cond 59 | 60 | # 取最大后验概率的类别max(dict, key=dict.get) 61 | argmax = max(posterior, key=posterior.get) 62 | 63 | return posterior, argmax 64 | 65 | 66 | if __name__ == "__main__": 67 | X = [[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3], 68 | ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L']] 69 | Y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1] 70 | N = len(Y) 71 | n = len(X) 72 | K = len(set(Y)) 73 | x = [2, 'S'] 74 | lamb = 0.2 75 | 76 | nb = 
naiveBayes_BE(X, Y, N, n, K, x, lamb) 77 | posterior, argmax = nb.classifier() 78 | print("每个类别的后验概率:", posterior) 79 | print("x=", x, "的类标记y为", argmax) 80 | -------------------------------------------------------------------------------- /6贝叶斯/贝叶斯高斯.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/6贝叶斯/贝叶斯高斯.txt -------------------------------------------------------------------------------- /6贝叶斯/高斯贝叶斯 - 菊安酱.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import random 4 | dataSet =pd.read_csv('iris.txt',header = None) 5 | dataSet.head() 6 | 7 | def randSplit(dataSet, rate): 8 | l = list(dataSet.index) #提取出索引 9 | random.shuffle(l) #随机打乱索引 10 | dataSet.index = l #将打乱后的索引重新赋值给原数据集 11 | n = dataSet.shape[0] #总行数 12 | m = int(n * rate) #训练集的数量 13 | train = dataSet.loc[range(m), :] #提取前m个记录作为训练集 14 | test = dataSet.loc[range(m, n), :] #剩下的作为测试集 15 | dataSet.index = range(dataSet.shape[0]) #更新原数据集的索引 16 | test.index = range(test.shape[0]) #更新测试集的索引 17 | return train, test 18 | 19 | def gnb_classify(train,test): 20 | labels = train.iloc[:,-1].value_counts().index #提取训练集的标签种类 21 | mean =[] #存放每个类别的均值 22 | std =[] #存放每个类别的方差 23 | result = [] #存放测试集的预测结果 24 | for i in labels: 25 | item = train.loc[train.iloc[:,-1]==i,:] #分别提取出每一种类别 26 | m = item.iloc[:,:-1].mean() #当前类别的平均值 27 | s = np.sum((item.iloc[:,:-1]-m)**2)/(item.shape[0]) #当前类别的方差 28 | mean.append(m) #将当前类别的平均值追加至列表 29 | std.append(s) #将当前类别的方差追加至列表 30 | means = pd.DataFrame(mean,index=labels) #变成DF格式,索引为类标签 31 | stds = pd.DataFrame(std,index=labels) #变成DF格式,索引为类标签 32 | for j in range(test.shape[0]): 33 | iset = test.iloc[j,:-1].tolist() #当前测试实例 34 | iprob = np.exp(-1*(iset-means)**2/(stds*2))/(np.sqrt(2*np.pi*stds)) #正态分布公式 35 | # print(iprob.shape) 36 | # 3,4 3对应的是三个类别,4对应的是每个样本有四个特征 37 | 38 | # 用log求和 39 | prob = np.sum(np.log(iprob),axis=1) 40 | 41 | # prob = 1 #初始化当前实例总概率 42 | # for k in range(test.shape[1]-1): #遍历每个特征 43 | # prob *= iprob[k] #特征概率之积即为当前实例概率 44 | cla = prob.index[np.argmax(prob.values)] #返回最大概率的类别 45 | result.append(cla) 46 | test['predict']=result 47 | acc = (test.iloc[:,-1]==test.iloc[:,-2]).mean() #计算预测准确率 48 | print(f'模型预测准确率为{acc}') 49 | return test 50 | train,test=randSplit(dataSet, 0.8) 51 | 52 | from sklearn.naive_bayes import GaussianNB 53 | clf = GaussianNB() 54 | # 不包括第四列 55 | clf.fit(train.iloc[:,:4], train.iloc[:,-1]) 56 | print(clf.score(test.iloc[:,:4], test.iloc[:,-1])) 57 | 58 | gnb_classify(train,test) 59 | 60 | -------------------------------------------------------------------------------- /6贝叶斯/高斯贝叶斯.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.model_selection import train_test_split 6 | 7 | import math 8 | 9 | # data 10 | def create_data(): 11 | iris = load_iris() 12 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 13 | df['label'] = iris.target 14 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 15 | data = np.array(df.iloc[:150, :]) 16 | # print(data) 17 | return data[:,:-1], data[:,-1] 18 | 19 | class NaiveBayes: 20 | def __init__(self): 21 | self.model = None 22 | 23 | # 数学期望 24 | @staticmethod 25 | def mean(X): 26 | return sum(X) / float(len(X)) 27 | 28 
| # 标准差(方差) 29 | def stdev(self, X): 30 | avg = self.mean(X) 31 | return math.sqrt(sum([pow(x-avg, 2) for x in X]) / float(len(X))) 32 | 33 | # 概率密度函数 34 | def gaussian_probability(self, x, mean, stdev): 35 | exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))) 36 | return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent 37 | 38 | # 处理X_train 39 | def summarize(self, train_data): 40 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)] 41 | return summaries 42 | 43 | # 分类别求出数学期望和标准差 44 | def fit(self, X, y): 45 | labels = list(set(y)) 46 | data = {label:[] for label in labels} 47 | for f, label in zip(X, y): 48 | data[label].append(f) 49 | self.model = {label: self.summarize(value) for label, value in data.items()} 50 | return 'gaussianNB train done!' 51 | 52 | # 计算概率 53 | def calculate_probabilities(self, input_data): 54 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]} 55 | # input_data:[1.1, 2.2] 56 | probabilities = {} 57 | for label, value in self.model.items(): 58 | probabilities[label] = 1 59 | for i in range(len(value)): 60 | mean, stdev = value[i] 61 | probabilities[label] *= self.gaussian_probability(input_data[i], mean, stdev) 62 | return probabilities 63 | 64 | # 类别 65 | def predict(self, X_test): 66 | # print(self.calculate_probabilities(X_test).items()) 67 | # dict_items([(0.0, 0.46216164346529376), (1.0, 2.917762081359877e-18), (2.0, 9.836470021805279e-31)]) 68 | # print(sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])) 69 | # 从小到大排序 70 | # [(2.0, 1.0446327049122698e-26), (1.0, 1.612020738629649e-18), (0.0, 0.9025722438875156)] 71 | 72 | label = sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])[-1][0] 73 | 74 | return label 75 | 76 | def score(self, X_test, y_test): 77 | right = 0 78 | for X, y in zip(X_test, y_test): 79 | label = self.predict(X) 80 | if label == y: 81 | right += 1 82 | 83 | return right / float(len(X_test)) 84 | 85 | X, y = create_data() 86 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 87 | 88 | model = NaiveBayes() 89 | model.fit(X_train, y_train) 90 | print(model.predict([4.4, 3.2, 1.3, 0.2])) 91 | print(model.score(X_test, y_test)) 92 | 93 | from sklearn.naive_bayes import GaussianNB 94 | clf = GaussianNB() 95 | clf.fit(X_train, y_train) 96 | print(clf.score(X_test, y_test)) 97 | 98 | -------------------------------------------------------------------------------- /7K近邻-KNN/kNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import operator 3 | import os 4 | from collections import Counter 5 | 6 | 7 | def createDataSet(): 8 | group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]) 9 | labels = ['A', 'A', 'B', 'B'] 10 | return group, labels 11 | 12 | 13 | def classify01(inX, dataSet, labels, k): 14 | 15 | # -----------实现 classify0() 方法的第一种方式---------------------------------------------------------------------------------------------------------------------------- 16 | # 1. 距离计算 17 | dataSetSize = dataSet.shape[0] 18 | 19 | # tile生成和训练样本对应的矩阵,并与训练样本求差 20 | # 计算每一个样本到训练样本的距离 21 | diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet 22 | 23 | # 取平方 24 | sqDiffMat = diffMat ** 2 25 | # 将矩阵的每一行相加 26 | sqDistances = sqDiffMat.sum(axis=1) 27 | # 开方 28 | distances = sqDistances ** 0.5 29 | # 根据距离排序从小到大的排序,返回对应的索引位置 30 | # argsort() 是将x中的元素从小到大排列,提取其对应的index(索引),然后输出到y。 31 | sortedDistIndicies = distances.argsort() 32 | 33 | # 2. 
选择距离最小的k个点 34 | classCount = {} 35 | for i in range(k): 36 | # 找到该样本的类型 37 | voteIlabel = labels[sortedDistIndicies[i]] 38 | # 在字典中将该类型加一 39 | # 字典的get方法 40 | # 如:list.get(k,d) 其中 get相当于一条if...else...语句,参数k在字典中,字典将返回list[k];如果参数k不在字典中则返回参数d,如果K在字典中则返回k对应的value值 41 | classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 42 | # 3. 排序并返回出现最多的那个类型 43 | # 字典的 items() 方法,以列表返回可遍历的(键,值)元组数组。 44 | # 例如:dict = {'Name': 'Zara', 'Age': 7} print "Value : %s" % dict.items() Value : [('Age', 7), ('Name', 'Zara')] 45 | # sorted 中的第2个参数 key=operator.itemgetter(1) 这个参数的意思是先比较第几个元素 46 | # 例如:a=[('b',2),('a',1),('c',0)] b=sorted(a,key=operator.itemgetter(1)) >>>b=[('c',0),('a',1),('b',2)] 可以看到排序是按照后边的0,1,2进行排序的,而不是a,b,c 47 | # b=sorted(a,key=operator.itemgetter(0)) >>>b=[('a',1),('b',2),('c',0)] 这次比较的是前边的a,b,c而不是0,1,2 48 | # b=sorted(a,key=opertator.itemgetter(1,0)) >>>b=[('c',0),('a',1),('b',2)] 这个是先比较第2个元素,然后对第一个元素进行排序,形成多级排序。 49 | sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True) 50 | return sortedClassCount[0][0] 51 | 52 | def classify0(inX, dataSet, labels, k): 53 | # """ 54 | # 1. 计算距离 55 | 56 | # inx - dataset 使用了numpy broadcasting,见 https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html 57 | # """ 58 | dist = np.sum((inX - dataSet)**2, axis=1)**0.5 59 | # 不使用广播,使用一个一个样本进行计算距离 60 | # dist = [sqrt(np.sum((x_train - inX) ** 2)) 61 | # for x_train in dataSet] 62 | 63 | # print(dist.shape) 64 | 65 | # """ 66 | # 2. k个最近的标签 67 | 68 | # 对距离排序使用numpy中的argsort函数, 见 https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sort.html#numpy.sort 69 | # 函数返回的是数组值从小到大的索引值 ,因此取前k个索引使用[0 : k] 70 | # 将这k个标签存在列表k_labels中 71 | # """ 72 | 73 | k_labels = [ labels[index] for index in dist.argsort()[0 : k] ] 74 | 75 | # 3. 
出现次数最多的标签即为最终类别 76 | # 使用collections.Counter可以统计各个标签的出现次数,most_common返回出现次数最多的标签tuple,例如[('lable1', 2)],因此[0][0]可以取出标签值 77 | 78 | label = Counter(k_labels).most_common(1)[0][0] 79 | return label 80 | 81 | # ------------------------------------------------------------------------------------------------------------------------------------------ 82 | 83 | 84 | def test1(): 85 | """ 86 | 第一个例子演示 87 | """ 88 | group, labels = createDataSet() 89 | print(str(group)) 90 | print(str(labels)) 91 | print(classify0([0.1, 0.1], group, labels, 3)) 92 | 93 | 94 | # ---------------------------------------------------------------------------------------- 95 | def file2matrix(filename): 96 | """ 97 | 导入训练数据 98 | :param filename: 数据文件路径 99 | :return: 数据矩阵returnMat和对应的类别classLabelVector 100 | """ 101 | fr = open(filename, 'r') 102 | # 获得文件中的数据行的行数 103 | numberOfLines = len(fr.readlines()) 104 | # 生成对应的空矩阵 105 | # 例如:zeros(2,3)就是生成一个 2*3 的矩阵,各个位置上全是 0 106 | returnMat = np.zeros((numberOfLines, 3)) # prepare matrix to return 107 | classLabelVector = [] # prepare labels return 108 | fr = open(filename, 'r') 109 | index = 0 110 | for line in fr.readlines(): 111 | # str.strip([chars]) --返回移除字符串头尾指定的字符生成的新字符串 112 | line = line.strip() 113 | # 以 '\t' 切割字符串 114 | listFromLine = line.split('\t') 115 | # 每列的属性数据,即 features 116 | returnMat[index] = listFromLine[0 : 3] 117 | # 每列的类别数据,就是 label 标签数据 118 | classLabelVector.append(int(listFromLine[-1])) 119 | index += 1 120 | # 返回数据矩阵returnMat和对应的类别classLabelVector 121 | return returnMat, classLabelVector 122 | 123 | 124 | def autoNorm(dataSet): 125 | """ 126 | 归一化特征值,消除属性之间量级不同导致的影响 127 | """ 128 | # 计算每种属性的最大值、最小值、范围 129 | minVals = dataSet.min(0) 130 | maxVals = dataSet.max(0) 131 | # 极差 132 | ranges = maxVals - minVals 133 | 134 | normDataSet = (dataSet - minVals) / ranges 135 | 136 | return normDataSet, ranges, minVals 137 | 138 | 139 | def datingClassTest(): 140 | """ 141 | Desc: 142 | 对约会网站的测试方法,并将分类错误的数量和分类错误率打印出来 143 | Args: 144 | None 145 | Returns: 146 | None 147 | """ 148 | # 设置测试数据的的一个比例(训练数据集比例=1-hoRatio) 149 | hoRatio = 0.1 # 测试范围,一部分测试一部分作为样本 150 | # 从文件中加载数据 151 | datingDataMat, datingLabels = file2matrix("../data/2.KNN/datingTestSet2.txt") # load data setfrom file 152 | # 归一化数据 153 | normMat, ranges, minVals = autoNorm(datingDataMat) 154 | # m 表示数据的行数,即矩阵的第一维;样本的总数 155 | m = normMat.shape[0] 156 | 157 | # 设置测试的样本数量 158 | numTestVecs = int(m * hoRatio) 159 | print('numTestVecs=', numTestVecs) 160 | 161 | errorCount = 0 162 | for i in range(numTestVecs): 163 | # 对数据测试 164 | classifierResult = classify0(normMat[i], normMat[numTestVecs : m], datingLabels[numTestVecs : m], 3) 165 | print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])) 166 | errorCount += classifierResult != datingLabels[i] 167 | print("the total error rate is: %f" % (errorCount / numTestVecs)) 168 | print(errorCount) 169 | 170 | 171 | def img2vector(filename): 172 | """ 173 | Desc: 174 | 将图像数据转换为向量 175 | Args: 176 | filename -- 图片文件 因为我们的输入数据的图片格式是 32 * 32的 177 | Returns: 178 | returnVect -- 图片文件处理完成后的一维矩阵 179 | 180 | 该函数将图像转换为向量:该函数创建 1 * 1024 的NumPy数组,然后打开给定的文件, 181 | 循环读出文件的前32行,并将每行的头32个字符值存储在NumPy数组中,最后返回数组。 182 | """ 183 | returnVect = np.zeros((1, 1024)) 184 | fr = open(filename, 'r') 185 | for i in range(32): 186 | lineStr = fr.readline() 187 | for j in range(32): 188 | returnVect[0, 32 * i + j] = int(lineStr[j]) 189 | return returnVect 190 | 191 | 192 | def handwritingClassTest(): 193 | """ 194 | Desc: 195 | 
手写数字识别分类器,并将分类错误数和分类错误率打印出来 196 | Args: 197 | None 198 | Returns: 199 | None 200 | """ 201 | # 1. 导入数据 202 | hwLabels = [] 203 | trainingFileList = os.listdir("../data/2.KNN/trainingDigits") # load the training set 204 | m = len(trainingFileList) 205 | trainingMat = np.zeros((m, 1024)) 206 | # hwLabels存储0~9对应的index位置, trainingMat存放的每个位置对应的图片向量 207 | for i in range(m): 208 | fileNameStr = trainingFileList[i] 209 | fileStr = fileNameStr.split('.')[0] # take off .txt 210 | classNumStr = int(fileStr.split('_')[0]) 211 | hwLabels.append(classNumStr) 212 | # 将 32*32的矩阵->1*1024的矩阵 213 | trainingMat[i] = img2vector('../data/2.KNN/trainingDigits/%s' % fileNameStr) 214 | 215 | # 2. 导入测试数据 216 | testFileList = os.listdir('../data/2.KNN/testDigits') # iterate through the test set 217 | errorCount = 0 218 | mTest = len(testFileList) 219 | for i in range(mTest): 220 | fileNameStr = testFileList[i] 221 | fileStr = fileNameStr.split('.')[0] # take off .txt 222 | classNumStr = int(fileStr.split('_')[0]) 223 | vectorUnderTest = img2vector('../data/2.KNN/testDigits/%s' % fileNameStr) 224 | classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 225 | print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)) 226 | errorCount += classifierResult != classNumStr 227 | print("\nthe total number of errors is: %d" % errorCount) 228 | print("\nthe total error rate is: %f" % (errorCount / mTest)) 229 | 230 | 231 | if __name__ == '__main__': 232 | # test1() 233 | datingClassTest() 234 | # handwritingClassTest() 235 | 236 | 237 | -------------------------------------------------------------------------------- /8K-means/1笔记.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/8K-means/1笔记.txt -------------------------------------------------------------------------------- /8K-means/K-Means/K-Means_scikit-learn.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | import numpy as np 3 | from scipy import io as spio 4 | from matplotlib import pyplot as plt 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def kMenas(): 9 | data = spio.loadmat("data.mat") 10 | X = data['X'] 11 | model = KMeans(n_clusters=3).fit(X) # n_clusters指定3类,拟合数据 12 | centroids = model.cluster_centers_ # 聚类中心 13 | 14 | plt.scatter(X[:,0], X[:,1]) # 原数据的散点图 15 | plt.plot(centroids[:,0],centroids[:,1],'r^',markersize=10) # 聚类中心 16 | plt.show() 17 | 18 | if __name__ == "__main__": 19 | kMenas() 20 | -------------------------------------------------------------------------------- /8K-means/K-Means/K-Menas.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from matplotlib import colors 6 | from scipy import io as spio 7 | from scipy import misc # 图片操作 8 | import numbers 9 | from matplotlib.font_manager import FontProperties 10 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 11 | 12 | def check_random_state(seed): 13 | if seed is None or seed is np.random: 14 | return np.random.mtrand._rand 15 | if isinstance(seed, (numbers.Integral, np.integer)): 16 | return np.random.RandomState(seed) 17 | if isinstance(seed, np.random.RandomState): 18 | return seed 19 | raise ValueError('%r cannot be used to seed a 
numpy.random.RandomState' 20 | ' instance' % seed) 21 | random_state = check_random_state(2) 22 | 23 | def distance(point1, point2): 24 | # 欧氏距离 25 | return np.sqrt(np.sum(np.square(point1 - point2), axis=1)) 26 | 27 | 28 | def KMeans(): 29 | '''二维数据聚类过程演示''' 30 | print(u'聚类过程展示...\n') 31 | data = spio.loadmat("data.mat") 32 | X = data['X'] 33 | K = 3 # 总类数 34 | # initial_centroids = np.array([[3,3],[6,2],[8,5]]) # 初始化类中心 35 | initial_centroids = _k_means_plus_plus(X,K) 36 | max_iters = 10 37 | runKMeans(X,initial_centroids,max_iters,True) # 执行K-Means聚类算法 38 | ''' 39 | 图片压缩 40 | ''' 41 | print(u'K-Means压缩图片\n') 42 | img_data = misc.imread("bird.png") # 读取图片像素数据 43 | img_data = img_data/255.0 # 像素值映射到0-1 44 | img_size = img_data.shape 45 | X = img_data.reshape(img_size[0]*img_size[1],3) # 调整为N*3的矩阵,N是所有像素点个数 46 | 47 | K = 16 48 | max_iters = 5 49 | # initial_centroids = kMeansInitCentroids(X,K) 50 | initial_centroids = _k_means_plus_plus(X,K) 51 | centroids,idx = runKMeans(X, initial_centroids, max_iters, False) 52 | print(u'\nK-Means运行结束\n') 53 | print(u'\n压缩图片...\n') 54 | idx = findClosestCentroids(X, centroids) 55 | X_recovered = centroids[idx,:] 56 | X_recovered = X_recovered.reshape(img_size[0],img_size[1],3) 57 | 58 | print(u'绘制图片...\n') 59 | plt.subplot(1,2,1) 60 | plt.imshow(img_data) 61 | plt.title(u"原先图片",fontproperties=font) 62 | plt.subplot(1,2,2) 63 | plt.imshow(X_recovered) 64 | plt.title(u"压缩图像",fontproperties=font) 65 | plt.show() 66 | print(u'运行结束!') 67 | 68 | 69 | # 找到每条数据距离哪个类中心最近 70 | def findClosestCentroids(X,initial_centroids): 71 | m = X.shape[0] # 数据条数 72 | K = initial_centroids.shape[0] # 类的总数 73 | dis = np.zeros((m,K)) # 存储计算每个点分别到K个类的距离 74 | idx = np.zeros((m,1)) # 要返回的每条数据属于哪个类 75 | 76 | '''计算每个点到每个类中心的距离''' 77 | for i in range(m): 78 | for j in range(K): 79 | dis[i,j] = np.dot((X[i,:]-initial_centroids[j,:]).reshape(1,-1),(X[i,:]-initial_centroids[j,:]).reshape(-1,1)) 80 | 81 | '''返回dis每一行的最小值对应的列号,即为对应的类别 82 | - np.min(dis, axis=1)返回每一行的最小值 83 | - np.where(dis == np.min(dis, axis=1).reshape(-1,1)) 返回对应最小值的坐标 84 | - 注意:可能最小值对应的坐标有多个,where都会找出来,所以返回时返回前m个需要的即可(因为对于多个最小值,属于哪个类别都可以) 85 | ''' 86 | dummy,idx = np.where(dis == np.min(dis, axis=1).reshape(-1,1)) 87 | return idx[0:dis.shape[0]] # 注意截取一下 88 | 89 | 90 | # 计算类中心 91 | def computerCentroids(X,idx,K): 92 | n = X.shape[1] 93 | centroids = np.zeros((K,n)) 94 | for i in range(K): 95 | centroids[i,:] = np.mean(X[np.ravel(idx==i),:], axis=0).reshape(1,-1) # 索引要是一维的,axis=0为每一列,idx==i一次找出属于哪一类的,然后计算均值 96 | return centroids 97 | 98 | # 聚类算法 99 | def runKMeans(X,initial_centroids,max_iters,plot_process): 100 | m,n = X.shape # 数据条数和维度 101 | K = initial_centroids.shape[0] # 类数 102 | centroids = initial_centroids # 记录当前类中心 103 | previous_centroids = centroids # 记录上一次类中心 104 | idx = np.zeros((m,1)) # 每条数据属于哪个类 105 | 106 | for i in range(max_iters): # 迭代次数 107 | print(u'迭代计算次数:%d'%(i+1)) 108 | idx = findClosestCentroids(X, centroids) 109 | if plot_process: # 如果绘制图像 110 | plt = plotProcessKMeans(X,centroids,previous_centroids) # 画聚类中心的移动过程 111 | previous_centroids = centroids # 重置 112 | centroids = computerCentroids(X, idx, K) # 重新计算类中心 113 | if plot_process: # 显示最终的绘制结果 114 | plt.show() 115 | return centroids,idx # 返回聚类中心和数据属于哪个类 116 | 117 | # 画图,聚类中心的移动过程 118 | def plotProcessKMeans(X,centroids,previous_centroids): 119 | plt.scatter(X[:,0], X[:,1]) # 原数据的散点图 120 | plt.plot(previous_centroids[:,0],previous_centroids[:,1],'rx',markersize=10,linewidth=5.0) # 上一次聚类中心 121 | 
plt.plot(centroids[:,0],centroids[:,1],'rx',markersize=10,linewidth=5.0) # 当前聚类中心 122 | for j in range(centroids.shape[0]): # 遍历每个类,画类中心的移动直线 123 | p1 = centroids[j,:] 124 | p2 = previous_centroids[j,:] 125 | plt.plot([p1[0],p2[0]],[p1[1],p2[1]],"->",linewidth=2.0) 126 | return plt 127 | 128 | 129 | # 初始化类中心--随机取K个点作为聚类中心 130 | def kMeansInitCentroids(X,K): 131 | m = X.shape[0] 132 | m_arr = np.arange(0,m) # 生成0-m-1 133 | centroids = np.zeros((K,X.shape[1])) 134 | np.random.shuffle(m_arr) # 打乱m_arr顺序 135 | rand_indices = m_arr[:K] # 取前K个 136 | centroids = X[rand_indices,:] 137 | return centroids 138 | 139 | # kmeans++的初始化方式,加速聚类速度 140 | # 第一个点是随机选择出来的 141 | def _k_means_plus_plus(dataset,k): 142 | n_samples, n_features = dataset.shape 143 | centers = np.empty((k, n_features)) 144 | # n_local_trials是每次选择候选点个数 145 | n_local_trials = None 146 | if n_local_trials is None: 147 | n_local_trials = 2 + int(np.log(k)) 148 | 149 | 150 | 151 | # 第一个随机点 152 | center_id = random_state.randint(n_samples) 153 | centers[0] = dataset[center_id] 154 | 155 | # closest_dist_sq是每个样本,到所有中心点最近距离 156 | # 假设现在有3个中心点,closest_dist_sq = [min(样本1到3个中心距离),min(样本2到3个中心距离),...min(样本n到3个中心距离)] 157 | closest_dist_sq = distance(centers[0, np.newaxis], dataset) 158 | # newaxis可以给原数组增加一个维度 159 | 160 | # current_pot所有最短距离的和 161 | current_pot = closest_dist_sq.sum() 162 | 163 | for c in range(1, k): 164 | # 选出n_local_trials随机址,并映射到current_pot的长度 165 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 166 | # 选择出来的候选节点是按照概率选择出来的 167 | # 然后再根据所有样本到候选节点的距离选择出来距离最小的节点 168 | 169 | # np.cumsum([1,2,3,4]) = [1, 3, 6, 10],就是累加当前索引前面的值 170 | # np.searchsorted搜索随机出的rand_vals落在np.cumsum(closest_dist_sq)中的位置。 171 | # candidate_ids候选节点的索引 172 | candidate_ids = np.searchsorted(np.cumsum(closest_dist_sq), rand_vals) 173 | 174 | # best_candidate最好的候选节点 175 | # best_pot最好的候选节点计算出的距离和 176 | # best_dist_sq最好的候选节点计算出的距离列表 177 | best_candidate = None 178 | best_pot = None 179 | best_dist_sq = None 180 | for trial in range(n_local_trials): 181 | # 计算每个样本到候选节点的欧式距离 182 | distance_to_candidate = distance(dataset[candidate_ids[trial], np.newaxis], dataset) 183 | 184 | # 计算每个候选节点的距离序列new_dist_sq, 距离总和new_pot 185 | 186 | # closest_dist_sq 每个样本,到所有已知的中心点的距离 187 | # new_dist_sq 每个样本,到所有中心点(已知的中心点+当前的候选点)最近距离 188 | # 如果中心点变成了两个,那么样本到中心点的最近距离就可能会发生变化 189 | new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidate) 190 | new_pot = new_dist_sq.sum() 191 | 192 | # 选择最小的new_pot 193 | if (best_candidate is None) or (new_pot < best_pot): 194 | best_candidate = candidate_ids[trial] 195 | best_pot = new_pot 196 | best_dist_sq = new_dist_sq 197 | 198 | centers[c] = dataset[best_candidate] 199 | current_pot = best_pot 200 | closest_dist_sq = best_dist_sq 201 | 202 | return centers 203 | 204 | if __name__ == "__main__": 205 | KMeans() 206 | -------------------------------------------------------------------------------- /8K-means/K-Means/bird.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/8K-means/K-Means/bird.mat -------------------------------------------------------------------------------- /8K-means/K-Means/bird.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/8K-means/K-Means/bird.png -------------------------------------------------------------------------------- 
/8K-means/K-Means/data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/8K-means/K-Means/data.mat -------------------------------------------------------------------------------- /8K-means/K-Means/kmeansplusplus_ys.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from matplotlib import colors 4 | from scipy import io as spio 5 | from scipy import misc # 图片操作 6 | import numbers 7 | from matplotlib.font_manager import FontProperties 8 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 9 | 10 | def distance(point1, point2): 11 | # 欧氏距离 12 | return np.sqrt(np.sum(np.square(point1 - point2), axis=1)) 13 | # return np.sqrt(np.sum(np.power(point1 - point2,2))) 14 | 15 | 16 | def check_random_state(seed): 17 | if seed is None or seed is np.random: 18 | return np.random.mtrand._rand 19 | if isinstance(seed, (numbers.Integral, np.integer)): 20 | return np.random.RandomState(seed) 21 | if isinstance(seed, np.random.RandomState): 22 | return seed 23 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 24 | ' instance' % seed) 25 | 26 | random_state = check_random_state(None) 27 | # kmeans++的初始化方式,加速聚类速度 28 | # 第一个点是随机选择出来的 29 | def k_means_plus_plus(dataset,k): 30 | n_samples, n_features = dataset.shape 31 | centers = np.empty((k, n_features)) 32 | 33 | 34 | # n_local_trials是每次选择候选点个数 35 | n_local_trials = None 36 | if n_local_trials is None: 37 | n_local_trials = 2 + int(np.log(k)) 38 | 39 | # 第一个随机点 40 | center_id = random_state.randint(n_samples) 41 | centers[0] = dataset[center_id] 42 | 43 | # closest_dist_sq是每个样本到所有中心点最近距离 44 | # 假设现在有3个中心点,closest_dist_sq = 45 | # [min(样本1到3个中心距离),min(样本2到3个中心距离),...min(样本n到3个中心距离)] 46 | closest_dist_sq = distance(centers[0, np.newaxis], dataset) 47 | # newaxis可以给原数组增加一个维度 48 | 49 | # current_pot所有最短距离的和 50 | current_pot = closest_dist_sq.sum() 51 | 52 | for c in range(1, k): 53 | # 选出n_local_trials随机址,并映射到current_pot的长度 54 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 55 | # 选择出来的候选节点是按照概率选择出来的 56 | # 然后再根据所有样本到候选节点的距离选择出来距离最小的节点 57 | 58 | # np.cumsum([1,2,3,4]) = [1, 3, 6, 10],就是累加当前索引前面的值 59 | # np.searchsorted搜索随机出的rand_vals落在np.cumsum(closest_dist_sq)中的位置。 60 | # candidate_ids候选节点的索引 61 | candidate_ids = np.searchsorted(np.cumsum(closest_dist_sq), rand_vals) 62 | print(candidate_ids) 63 | 64 | # best_candidate最好的候选节点 65 | # best_pot最好的候选节点计算出的距离和 66 | # best_dist_sq最好的候选节点计算出的距离列表 67 | best_candidate = None 68 | best_pot = None 69 | best_dist_sq = None 70 | for trial in range(n_local_trials): 71 | # 计算每个样本到候选节点的欧式距离 72 | distance_to_candidate = distance(dataset[candidate_ids[trial], np.newaxis], dataset) 73 | 74 | # 计算每个候选节点的距离序列new_dist_sq, 距离总和new_pot 75 | 76 | # closest_dist_sq 每个样本,到所有已知的中心点的距离 77 | # new_dist_sq 每个样本,到所有中心点(已知的中心点+当前的候选点)最近距离 78 | # 如果中心点变成了两个,那么样本到中心点的最近距离就可能会发生变化 79 | new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidate) 80 | new_pot = new_dist_sq.sum() 81 | 82 | # 选择最小的new_pot 83 | if (best_candidate is None) or (new_pot < best_pot): 84 | best_candidate = candidate_ids[trial] 85 | best_pot = new_pot 86 | best_dist_sq = new_dist_sq 87 | 88 | centers[c] = dataset[best_candidate] 89 | current_pot = best_pot 90 | closest_dist_sq = best_dist_sq 91 | 92 | return centers 93 | 94 | 95 | ##################### 
相对简单版--理解 96 | # 但是可能会出现初值不好的情况 97 | # [[3. 0. ] 98 | # [0. 0. ] 99 | # [3.1 3.1]] 100 | 101 | def distance1(point1, point2): 102 | # 欧氏距离 103 | return np.sqrt(np.sum(np.power(point1 - point2,2))) 104 | 105 | #对一个样本找到与该样本距离最近的聚类中心 106 | def nearest(point, cluster_centers): 107 | min_dist = np.inf 108 | m = np.shape(cluster_centers)[0] # 当前已经初始化的聚类中心的个数 109 | for i in range(m): 110 | # 计算point与每个聚类中心之间的距离 111 | d = distance1(point, cluster_centers[i, ]) 112 | # 选择最短距离 113 | if min_dist > d: 114 | min_dist = d 115 | return min_dist 116 | 117 | #选择尽可能相距较远的类中心 118 | def get_centroids(dataset, k): 119 | m, n = np.shape(dataset) 120 | cluster_centers = np.zeros((k , n)) 121 | index = np.random.randint(0, m) 122 | # index = random_state.randint(0,m) 123 | # 返回一个随机整型数,范围从低(包括)到高(不包括) 124 | # print(index) 125 | cluster_centers[0] = dataset[index] 126 | # 2、初始化一个距离的序列 127 | d = [0.0 for _ in range(m)] 128 | for i in range(1, k): 129 | sum_all = 0 130 | for j in range(m): 131 | # 3、对每一个样本找到最近的聚类中心点 132 | d[j] = nearest(dataset[j], cluster_centers[0:i]) 133 | # 4、将所有的最短距离相加 134 | sum_all += d[j] 135 | # 5、取得sum_all之间的随机值 136 | # print(d) 137 | sum_all *= np.random.rand() 138 | 139 | # np.searchsorted搜索随机出的sum_all落在np.cumsum(d)中的位置。等价于下面6的代码 140 | candidate_ids = np.searchsorted(np.cumsum(d), sum_all) 141 | cluster_centers[i] = dataset[candidate_ids] 142 | 143 | 144 | # ##6、获得距离最远的样本点作为聚类中心点 145 | # for j, di in enumerate(d): 146 | # sum_all=sum_all - di 147 | # if sum_all > 0: 148 | # continue 149 | # cluster_centers[i] = dataset[j] 150 | # break 151 | return cluster_centers 152 | 153 | 154 | 155 | data = np.array([[0.,0.], 156 | [0.1,0.1],[0.2,0.2],[3.0,0.0],[3.1,3.1],[3.2,3.2],[9.0,9.0],[9.1,9.1],[9.2,9.2] 157 | ]) 158 | 159 | print(data) 160 | 161 | print(k_means_plus_plus(data,3)) 162 | print(get_centroids(data,3)) -------------------------------------------------------------------------------- /9降维/PCA-ceshi.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | # Author: Bob 3 | # Date: 2016.11.24 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.datasets import load_iris 7 | from sklearn.decomposition import PCA 8 | from scipy import linalg 9 | iris = load_iris() 10 | yc = iris.target 11 | Xc = iris.data 12 | 13 | print(Xc.mean(axis=0)) 14 | print(Xc.std(axis=0)) 15 | # print(np.cov(Xc.T)) 16 | # 等于减去均值后的cov 17 | # [[ 0.68569351 -0.03926846 1.27368233 0.5169038 ] 18 | # [-0.03926846 0.18800403 -0.32171275 -0.11798121] 19 | # [ 1.27368233 -0.32171275 3.11317942 1.29638747] 20 | # [ 0.5169038 -0.11798121 1.29638747 0.58241432]] 21 | k=2 22 | pcac = PCA(n_components=k) #实例化 23 | pcac = pcac.fit(Xc) #拟合模型 24 | Xc_dr = pcac.transform(Xc) #获取新矩阵 25 | Xc_dr.shape 26 | print(pcac.get_covariance()) 27 | # 不是简单的返回样本的协方差,只有当k=4时返回的才是样本的协方差,源代码见下面 28 | # [[ 0.67919741 -0.03258618 1.27066452 0.5321852 ] 29 | # [-0.03258618 0.18113034 -0.31863564 -0.13363564] 30 | # [ 1.27066452 -0.31863564 3.11934547 1.28541527] 31 | # [ 0.5321852 -0.13363564 1.28541527 0.58961806]] 32 | print(pcac.explained_variance_) 33 | # [4.22484077 0.24224357] 34 | print(pcac.explained_variance_ratio_) 35 | # [0.92461621 0.05301557] 36 | Vc = pcac.components_ 37 | print(Vc) 38 | # [[ 0.36158968 -0.08226889 0.85657211 0.35884393] 39 | # [ 0.65653988 0.72971237 -0.1757674 -0.07470647]] 40 | 41 | # print(Xc_dr-(Xc-Xc.mean(axis=0)).dot(Vc.T)) 42 | print('*'*30) 43 | 44 | ### 只减去均值 45 | print('second') 46 | mean_ = np.mean(Xc, axis=0) 47 | Xc1 = Xc-mean_ 
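# (Added sketch, not part of the original script) With whiten=False, sklearn's
# PCA.transform is just a projection of the mean-centered data onto the fitted
# principal axes (components_), so the transform computed above should equal
# Xc1.dot(Vc.T) up to floating point error:
print(np.allclose(Xc_dr, Xc1.dot(Vc.T)))  # expected: True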
48 | U, S, VT = linalg.svd(Xc1) 49 | unp,snp,vnp = np.linalg.svd(Xc1) 50 | print(unp.shape) 51 | #(150, 150) 52 | print(snp.shape) 53 | #(4,) 54 | print(snp) 55 | # [25.08986398 6.00785254 3.42053538 1.87850234] 56 | print(S) 57 | # [25.08986398 6.00785254 3.42053538 1.87850234] 58 | 59 | print(VT) 60 | # [[ 0.36158968 -0.08226889 0.85657211 0.35884393] 61 | # [-0.65653988 -0.72971237 0.1757674 0.07470647] 62 | # [ 0.58099728 -0.59641809 -0.07252408 -0.54906091] 63 | # [ 0.31725455 -0.32409435 -0.47971899 0.75112056]] 64 | 65 | # 协方差矩阵的特征值 66 | explained_variance_ = (S ** 2) / (150 - 1) 67 | # print(explained_variance_) 68 | # # [4.22484077 0.24224357 0.07852391 0.02368303] 69 | # total_var = explained_variance_.sum() 70 | # explained_variance_ratio_ = explained_variance_ / total_var 71 | # print(explained_variance_ratio_) 72 | # # [0.92461621 0.05301557 0.01718514 0.00518309] 73 | 74 | VT[1,:]=VT[1,:]*(-1) 75 | VT[2,:]=VT[2,:]*(-1) 76 | # print(S) 77 | # print(VT[:k,:]) 78 | # [[ 0.36158968 -0.08226889 0.85657211 0.35884393] 79 | # [ 0.65653988 0.72971237 -0.1757674 -0.07470647]] 80 | Xc_dr1 = Xc1.dot(VT[:k,:].T) 81 | # print(Xc_dr-Xc_dr1) 82 | # print((Xc_dr-Xc_dr1).sum(axis=1)) 83 | # [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 84 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 85 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 86 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 87 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 88 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 89 | # 0. 0. 0. 0. 0. 0.] 90 | 91 | ### 只减去均值 92 | print('third') 93 | cov = Xc1.T.dot(Xc1)/149 94 | # print(np.cov(Xc1.T)) 95 | # [[ 0.68569351 -0.03926846 1.27368233 0.5169038 ] 96 | # [-0.03926846 0.18800403 -0.32171275 -0.11798121] 97 | # [ 1.27368233 -0.32171275 3.11317942 1.29638747] 98 | # [ 0.5169038 -0.11798121 1.29638747 0.58241432]] 99 | # print(cov) 100 | # [[ 0.68569351 -0.03926846 1.27368233 0.5169038 ] 101 | # [-0.03926846 0.18800403 -0.32171275 -0.11798121] 102 | # [ 1.27368233 -0.32171275 3.11317942 1.29638747] 103 | # [ 0.5169038 -0.11798121 1.29638747 0.58241432]] 104 | u,d,v = np.linalg.svd(cov) 105 | 106 | # print((u-v.T)) 107 | # [[ 2.77555756e-16 6.66133815e-16 6.66133815e-16 1.11022302e-16] 108 | # [ 2.77555756e-17 -7.77156117e-16 0.00000000e+00 -5.55111512e-17] 109 | # [-1.11022302e-16 5.55111512e-17 -1.11022302e-16 5.55111512e-17] 110 | # [-5.55111512e-17 -1.11022302e-16 0.00000000e+00 1.11022302e-16]] 111 | 112 | # print(u) 113 | # [[-0.36158968 -0.65653988 0.58099728 0.31725455] 114 | # [ 0.08226889 -0.72971237 -0.59641809 -0.32409435] 115 | # [-0.85657211 0.1757674 -0.07252408 -0.47971899] 116 | # [-0.35884393 0.07470647 -0.54906091 0.75112056]] 117 | # d:[4.22484077 0.24224357 0.07852391 0.02368303] 118 | u = -1*u 119 | print(u[:,:k]) 120 | Xc_dr2 = np.dot(Xc1, u[:,:k]) 121 | # print((Xc_dr-Xc_dr2).sum(axis=1)) 122 | 123 | ### 不使用SVD 124 | U,V = np.linalg.eigh(cov) 125 | U = U[::-1] 126 | print(U) 127 | # [4.22484077 0.24224357 0.07852391 0.02368303] 128 | for i in range(4): 129 | V[i,:] = V[i,:][::-1] 130 | v = V[:,:k] 131 | v[:,0]=-1*v[:,0] 132 | # print(v) 133 | # [[ 0.36158968 0.65653988] 134 | # [-0.08226889 0.72971237] 135 | # [ 0.85657211 -0.1757674 ] 136 | # [ 0.35884393 -0.07470647]] 137 | 138 | print('未除以样本数减1') 139 | U,V = np.linalg.eigh(Xc1.T.dot(Xc1)) 140 | print(U) 141 | print(V) 142 | 
print('################') 143 | Xc_dr3 = np.dot(Xc1, v) 144 | # print((Xc_dr-Xc_dr3).sum(axis=1)) 145 | # ### 除以标准差 146 | print('fourth') 147 | n = Xc.shape[1] 148 | std_ = np.std(Xc,axis=0) 149 | print(std_) 150 | Xc2 = (Xc-mean_)/std_ 151 | print(Xc2.mean()) 152 | print(Xc2.std()) 153 | U2, S2, V2 = linalg.svd(Xc2) 154 | print(V2) 155 | # , full_matrices=False 156 | V2[1,:]=V2[1,:]*(-1) 157 | # print(S2) 158 | Xc_dr2 = Xc2.dot(V2[:k,:].T) 159 | # print(V2[:k,:]) 160 | # print((Xc_dr-Xc_dr2).sum(axis=1)) 161 | 162 | 163 | # def get_covariance(self): 164 | # """Compute data covariance with the generative model. 165 | # ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` 166 | # where S**2 contains the explained variances, and sigma2 contains the 167 | # noise variances. 168 | # Returns 169 | # ------- 170 | # cov : array, shape=(n_features, n_features) 171 | # Estimated covariance of data. 172 | # """ 173 | # components_ = self.components_ 174 | # exp_var = self.explained_variance_ 175 | # if self.whiten: 176 | # components_ = components_ * np.sqrt(exp_var[:, np.newaxis]) 177 | # exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.) 178 | # cov = np.dot(components_.T * exp_var_diff, components_) 179 | # cov.flat[::len(cov) + 1] += self.noise_variance_ # modify diag inplace 180 | # return cov 181 | 182 | noise_variance_ = explained_variance_[2:].mean() 183 | components_ = pcac.components_ 184 | exp_var = pcac.explained_variance_ 185 | exp_var_diff = np.maximum(exp_var - noise_variance_, 0.) 186 | covsk = np.dot(components_.T * exp_var_diff, components_) 187 | covsk.flat[::len(covsk) + 1] += noise_variance_ 188 | print(covsk) -------------------------------------------------------------------------------- /9降维/PCA-mnist.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | # Author: Bob 3 | # Date: 2016.11.24 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from sklearn.datasets import load_iris 8 | from sklearn.decomposition import PCA 9 | from scipy import linalg 10 | iris = load_iris() 11 | yc = iris.target 12 | 13 | Xc = pd.read_csv('../data/Mnist/mnist_train.csv',header = None,nrows =500) 14 | print(Xc.head()) 15 | 16 | # Xc = iris.data 17 | Xc=Xc/255. 
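# (Added note) Dividing by 255 only rescales the pixel values into [0, 1]; PCA centers
# the data itself, so this constant scaling leaves the principal directions unchanged
# and just multiplies every explained variance by (1/255)**2, while
# explained_variance_ratio_ stays the same.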
18 | k=3 19 | pcac = PCA(n_components=k,whiten=False) #实例化 20 | pcac = pcac.fit(Xc) #拟合模型 21 | Xc_dr = pcac.transform(Xc) #获取新矩阵 22 | Xc_dr.shape 23 | # print(pcac.get_covariance()) 24 | # [[ 0.67919741 -0.03258618 1.27066452 0.5321852 ] 25 | # [-0.03258618 0.18113034 -0.31863564 -0.13363564] 26 | # [ 1.27066452 -0.31863564 3.11934547 1.28541527] 27 | # [ 0.5321852 -0.13363564 1.28541527 0.58961806]] 28 | print(pcac.explained_variance_) 29 | # [4.22484077 0.24224357] 30 | print(pcac.explained_variance_ratio_) 31 | # [0.92461621 0.05301557] 32 | Vc = pcac.components_ 33 | print(Vc) 34 | # [[ 0.36158968 -0.08226889 0.85657211 0.35884393] 35 | # [ 0.65653988 0.72971237 -0.1757674 -0.07470647]] 36 | print('*'*30) 37 | 38 | ### 只减去均值 39 | print('second') 40 | mean_ = np.mean(Xc, axis=0) 41 | std_ = np.std(Xc,axis=0) 42 | # Xc1 = (Xc-mean_)/(std_+10**-7) 43 | Xc1 = Xc-mean_ 44 | U, S, V = linalg.svd(Xc1) 45 | 46 | V[0,:]=V[0,:]*(-1) 47 | # V[1,:]=V[1,:]*(-1) 48 | 49 | # print(S) 50 | print(V[:k,:]) 51 | # [[ 0.36158968 -0.08226889 0.85657211 0.35884393] 52 | # [ 0.65653988 0.72971237 -0.1757674 -0.07470647]] 53 | Xc_dr1 = Xc1.dot(V[:k,:].T) 54 | # print(Xc_dr-Xc_dr1) 55 | 56 | Xc_dr11 = Xc1.dot(Vc.T) 57 | print((Xc_dr-Xc_dr1).sum(axis=1)) 58 | 59 | # [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 60 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 61 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 62 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 63 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 64 | # 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 65 | # 0. 0. 0. 0. 0. 0.] 66 | 67 | ### 只减去均值 68 | # print('third') 69 | # cov = Xc1.T.dot(Xc1)/149 70 | # # print(cov) 71 | # # [[ 0.68569351 -0.03926846 1.27368233 0.5169038 ] 72 | # # [-0.03926846 0.18800403 -0.32171275 -0.11798121] 73 | # # [ 1.27368233 -0.32171275 3.11317942 1.29638747] 74 | # # [ 0.5169038 -0.11798121 1.29638747 0.58241432]] 75 | # u,d,v = np.linalg.svd(cov) 76 | # # d:[4.22484077 0.24224357 0.07852391 0.02368303] 77 | # u = -1*u 78 | # print(u[:,:k]) 79 | # Xc_dr2 = np.dot(Xc1, u[:,:k]) 80 | # print((Xc_dr-Xc_dr2).sum(axis=1)) 81 | 82 | ### 不使用SVD 83 | # U,V = np.linalg.eigh(cov) 84 | # U = U[::-1] 85 | # # [4.22484077 0.24224357 0.07852391 0.02368303] 86 | # for i in range(4): 87 | # V[i,:] = V[i,:][::-1] 88 | # v = V[:,:k] 89 | # v[:,0]=-1*v[:,0] 90 | # # print(v) 91 | # # [[ 0.36158968 0.65653988] 92 | # # [-0.08226889 0.72971237] 93 | # # [ 0.85657211 -0.1757674 ] 94 | # # [ 0.35884393 -0.07470647]] 95 | 96 | # Xc_dr3 = np.dot(Xc1, v) 97 | # print((Xc_dr-Xc_dr3).sum(axis=1)) 98 | # ### 除以标准差 99 | # print('fourth') 100 | # n = Xc.shape[1] 101 | # std_ = np.std(Xc,axis=0) 102 | # print(std_) 103 | # Xc2 = (Xc-mean_)/std_ 104 | # print(Xc2.mean()) 105 | # print(Xc2.std()) 106 | # U2, S2, V2 = linalg.svd(Xc2) 107 | # print(V2) 108 | # # , full_matrices=False 109 | # V2[1,:]=V2[1,:]*(-1) 110 | # # print(S2) 111 | # Xc_dr2 = Xc2.dot(V2[:k,:].T) 112 | # print(V2[:k,:]) 113 | # print((Xc_dr-Xc_dr2).sum(axis=1)) 114 | 115 | -------------------------------------------------------------------------------- /9降维/PCA.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | # Author: Bob 3 | # Date: 2016.11.24 4 | import numpy as np 5 | from matplotlib import pyplot as plt 6 | from scipy import io as spio 7 | from 
sklearn.decomposition import pca 8 | 9 | ''' 10 | 主成分分析_2维数据降维1维演示函数 11 | ''' 12 | def PCA_2D(): 13 | data_2d = spio.loadmat("data.mat") 14 | X = data_2d['X'] 15 | m = X.shape[0] 16 | plt = plot_data_2d(X,'bo') # 显示二维的数据 17 | plt.show() 18 | 19 | X_copy = X.copy() 20 | X_norm,mu,sigma = featureNormalize(X_copy) # 归一化数据 21 | #plot_data_2d(X_norm) # 显示归一化后的数据 22 | #plt.show() 23 | 24 | Sigma = np.dot(np.transpose(X_norm),X_norm)/m # 求Sigma 25 | U,S,V = np.linalg.svd(Sigma) # 求Sigma的奇异值分解 26 | 27 | plt = plot_data_2d(X,'bo') # 显示原本数据 28 | drawline(plt, mu, mu+S[0]*(U[:,0]), 'r-') # 线,为投影的方向 29 | 30 | plt.axis('square') 31 | plt.show() 32 | 33 | K = 1 # 定义降维多少维(本来是2维的,这里降维1维) 34 | '''投影之后数据(降维之后)''' 35 | Z = projectData(X_norm,U,K) # 投影 36 | '''恢复数据''' 37 | X_rec = recoverData(Z,U,K) # 恢复 38 | '''作图-----原数据与恢复的数据''' 39 | plt = plot_data_2d(X_norm,'bo') 40 | plot_data_2d(X_rec,'ro') 41 | for i in range(X_norm.shape[0]): 42 | drawline(plt, X_norm[i,:], X_rec[i,:], '--k') 43 | plt.axis('square') 44 | plt.show() 45 | 46 | 47 | '''主成分分析_PCA图像数据降维''' 48 | def PCA_faceImage(): 49 | print (u'加载图像数据.....') 50 | data_image = spio.loadmat('data_faces.mat') 51 | X = data_image['X'] 52 | display_imageData(X[0:100,:]) 53 | m = X.shape[0] # 数据条数 54 | 55 | print (u'运行PCA....') 56 | X_norm,mu,sigma = featureNormalize(X) # 归一化 57 | 58 | Sigma = np.dot(np.transpose(X_norm),X_norm)/m # 求Sigma 59 | U,S,V = np.linalg.svd(Sigma) # 奇异值分解 60 | display_imageData(np.transpose(U[:,0:36])) # 显示U的数据 61 | 62 | print (u'对face数据降维.....') 63 | K = 100 # 降维100维(原先是32*32=1024维的) 64 | Z = projectData(X_norm, U, K) 65 | print (u'投影之后Z向量的大小:%d %d' %Z.shape) 66 | 67 | print (u'显示降维之后的数据......') 68 | X_rec = recoverData(Z, U, K) # 恢复数据 69 | display_imageData(X_rec[0:100,:]) 70 | 71 | 72 | 73 | 74 | # 可视化二维数据 75 | def plot_data_2d(X,marker): 76 | plt.plot(X[:,0],X[:,1],marker) 77 | return plt 78 | 79 | # 归一化数据 80 | def featureNormalize(X): 81 | '''(每一个数据-当前列的均值)/当前列的标准差''' 82 | n = X.shape[1] 83 | mu = np.zeros((1,n)); 84 | sigma = np.zeros((1,n)) 85 | 86 | mu = np.mean(X,axis=0) # axis=0表示列 87 | sigma = np.std(X,axis=0) 88 | for i in range(n): 89 | # X[:,i] = (X[:,i]-mu[i])/sigma[i] 90 | X[:,i] = (X[:,i]-mu[i]) 91 | return X,mu,sigma 92 | 93 | 94 | # 映射数据 95 | def projectData(X_norm,U,K): 96 | Z = np.zeros((X_norm.shape[0],K)) 97 | 98 | U_reduce = U[:,0:K] # 取前K个 99 | Z = np.dot(X_norm,U_reduce) 100 | return Z 101 | 102 | # 画一条线 103 | def drawline(plt,p1,p2,line_type): 104 | plt.plot(np.array([p1[0],p2[0]]),np.array([p1[1],p2[1]]),line_type) 105 | 106 | 107 | 108 | # 恢复数据 109 | def recoverData(Z,U,K): 110 | X_rec = np.zeros((Z.shape[0],U.shape[0])) 111 | U_recude = U[:,0:K] 112 | X_rec = np.dot(Z,np.transpose(U_recude)) # 还原数据(近似) 113 | return X_rec 114 | 115 | # 显示图片 116 | def display_imageData(imgData): 117 | sum = 0 118 | ''' 119 | 显示100个数(若是一个一个绘制将会非常慢,可以将要画的图片整理好,放到一个矩阵中,显示这个矩阵即可) 120 | - 初始化一个二维数组 121 | - 将每行的数据调整成图像的矩阵,放进二维数组 122 | - 显示即可 123 | ''' 124 | m,n = imgData.shape 125 | width = np.int32(np.round(np.sqrt(n))) 126 | height = np.int32(n/width); 127 | rows_count = np.int32(np.floor(np.sqrt(m))) 128 | cols_count = np.int32(np.ceil(m/rows_count)) 129 | pad = 1 130 | display_array = -np.ones((pad+rows_count*(height+pad),pad+cols_count*(width+pad))) 131 | for i in range(rows_count): 132 | for j in range(cols_count): 133 | max_val = np.max(np.abs(imgData[sum,:])) 134 | display_array[pad+i*(height+pad):pad+i*(height+pad)+height,pad+j*(width+pad):pad+j*(width+pad)+width] = imgData[sum,:].reshape(height,width,order="F")/max_val 
# order=F指定以列优先,在matlab中是这样的,python中需要指定,默认以行 135 | sum += 1 136 | 137 | plt.imshow(display_array,cmap='gray') #显示灰度图像 138 | plt.axis('off') 139 | plt.show() 140 | 141 | if __name__ == "__main__": 142 | PCA_2D() 143 | PCA_faceImage() -------------------------------------------------------------------------------- /9降维/__pycache__/tsne.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/9降维/__pycache__/tsne.cpython-36.pyc -------------------------------------------------------------------------------- /9降维/data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/9降维/data.mat -------------------------------------------------------------------------------- /9降维/data_faces.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/9降维/data_faces.mat -------------------------------------------------------------------------------- /9降维/outfile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ys1305/ML-hand/8e1c3a4a1e34f329a4d6bd70d561c1988325e57c/9降维/outfile.png -------------------------------------------------------------------------------- /9降维/pca_juanjiang.py: -------------------------------------------------------------------------------- 1 | #导入相应包 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | 6 | # def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,aweights=None) 7 | 8 | # m:一维或则二维的数组,默认情况下每一行代表一个变量(属性),每一列代表一个观测样本 9 | # y:与m具有一样的形式的一组数据 10 | # rowvar:默认为True,此时每一行代表一个变量(属性),每一列代表一个观测;为False时,则反之 11 | # bias:默认为False,此时标准化时除以n-1;反之为n。其中n为观测数 12 | # ddof:类型是int,当其值非None时,bias参数作用将失效。当ddof=1时,将会返回无偏估计(除以n-1), 13 | # 即使指定了fweights和aweights参数;当ddof=0时,则返回简单平均值。 14 | # frequency weights:一维数组,代表每个观测要重复的次数(相当于给观测赋予权重) 15 | # analytic weights:一维数组,代表观测矢量权重。对于被认为“重要”的观察, 16 | # 这些相对权重通常很大,而对于被认为不太重要的观察,这些相对权重较小。如果ddof = 0,则可以使用权重数组将概率分配给观测向量。 17 | 18 | 19 | #导入数据 20 | testSet = pd.read_table('testSet.txt',header=None) 21 | dataSet = testSet 22 | #计算均值 23 | meanVals = dataSet.mean(0) 24 | #去均值化,均值变为0 25 | meanRemoved = dataSet - meanVals 26 | #计算协方差矩阵 27 | covMat = np.mat(np.cov(meanRemoved, rowvar=0)) 28 | 29 | print(covMat) 30 | # print(np.cov(meanRemoved)) 31 | print(np.cov(meanRemoved, rowvar=0)) 32 | print(np.cov(meanRemoved, rowvar=0,bias=1)) 33 | print(meanRemoved.T.dot(meanRemoved)/(len(meanRemoved)-1)) 34 | 35 | def pca(dataSet, N=9999999): 36 | meanVals = dataSet.mean(0) 37 | meanRemoved = dataSet - meanVals 38 | covMat = np.mat(np.cov(meanRemoved, rowvar=0)) 39 | eigVals,eigVects = np.linalg.eig(covMat) 40 | # 对特征值排序,.argsort()函数默认从小到大排序,返回的是索引 41 | eigValInd = np.argsort(eigVals) 42 | 43 | # 提取出最大的N个特征 44 | eigValInd = eigValInd[:-(N+1):-1] 45 | 46 | redEigVects = eigVects[:,eigValInd] 47 | 48 | # 降维后的数据 49 | lowDDataMat = np.mat(meanRemoved) * redEigVects 50 | 51 | # 降维数据重构为原来数据 52 | reconMat = (lowDDataMat * redEigVects.T) + np.mat(meanVals) 53 | return lowDDataMat, reconMat 54 | 55 | lowDDataMat, reconMat = pca(testSet, N=1) 56 | 57 | from sklearn.decomposition import PCA 58 | pcac = PCA(n_components=1) 59 | pcac = pcac.fit(testSet) #拟合模型 60 | Xc_dr = pcac.transform(testSet) 61 | 
print(Xc_dr.shape) 62 | pcaniv = pcac.inverse_transform(Xc_dr) 63 | 64 | print(reconMat.shape) 65 | print((reconMat-pcaniv).sum()) 66 | print(pcaniv[:,0].shape) # (1000,) 67 | print(reconMat[:,0].A.flatten().shape) # (1000,) 68 | print(reconMat[:,0].shape) # (1000, 1) 不能进行绘图 69 | print(reconMat[:,0].flatten().shape) # (1, 1000) 70 | print(reconMat[:,0].A.shape) # (1000, 1) 71 | plt.scatter(testSet.iloc[:,0],testSet.iloc[:,1],marker = '.',c='orange') 72 | plt.scatter(reconMat[:,0].A.flatten(),reconMat[:,1].A.flatten(), marker='*',c='g') 73 | plt.scatter(pcaniv[:,0],pcaniv[:,1],marker = '.',c='b',s=3) 74 | plt.show() -------------------------------------------------------------------------------- /9降维/tsne.py: -------------------------------------------------------------------------------- 1 | print(__doc__) 2 | from time import time 3 | 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from matplotlib import offsetbox 7 | from sklearn import (manifold, datasets, decomposition, ensemble, 8 | discriminant_analysis, random_projection) 9 | 10 | digits = datasets.load_digits(n_class=10) 11 | X = digits.data 12 | y = digits.target 13 | n_samples, n_features = X.shape 14 | n_neighbors = 30 15 | 16 | 17 | #---------------------------------------------------------------------- 18 | # Scale and visualize the embedding vectors 19 | def plot_embedding(X, title=None): 20 | x_min, x_max = np.min(X, 0), np.max(X, 0) 21 | X = (X - x_min) / (x_max - x_min) 22 | 23 | plt.figure() 24 | ax = plt.subplot(111) 25 | for i in range(X.shape[0]): 26 | plt.text(X[i, 0], X[i, 1], str(y[i]), 27 | color=plt.cm.Set1(y[i] / 10.), 28 | fontdict={'weight': 'bold', 'size': 9}) 29 | 30 | if hasattr(offsetbox, 'AnnotationBbox'): 31 | # only print thumbnails with matplotlib > 1.0 32 | shown_images = np.array([[1., 1.]]) # just something big 33 | for i in range(X.shape[0]): 34 | dist = np.sum((X[i] - shown_images) ** 2, 1) 35 | if np.min(dist) < 4e-3: 36 | # don't show points that are too close 37 | continue 38 | shown_images = np.r_[shown_images, [X[i]]] 39 | imagebox = offsetbox.AnnotationBbox( 40 | offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), 41 | X[i]) 42 | ax.add_artist(imagebox) 43 | plt.xticks([]), plt.yticks([]) 44 | if title is not None: 45 | plt.title(title) 46 | 47 | 48 | #---------------------------------------------------------------------- 49 | # Plot images of the digits 50 | n_img_per_row = 20 51 | img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row)) 52 | for i in range(n_img_per_row): 53 | ix = 10 * i + 1 54 | for j in range(n_img_per_row): 55 | iy = 10 * j + 1 56 | img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8)) 57 | 58 | plt.imshow(img, cmap=plt.cm.binary) 59 | plt.xticks([]) 60 | plt.yticks([]) 61 | plt.title('A selection from the 64-dimensional digits dataset') 62 | 63 | 64 | #---------------------------------------------------------------------- 65 | # Random 2D projection using a random unitary matrix 66 | print("Computing random projection") 67 | rp = random_projection.SparseRandomProjection(n_components=2, random_state=42) 68 | X_projected = rp.fit_transform(X) 69 | plot_embedding(X_projected, "Random Projection of the digits") 70 | 71 | 72 | #---------------------------------------------------------------------- 73 | # Projection on to the first 2 principal components 74 | 75 | print("Computing PCA projection") 76 | t0 = time() 77 | X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X) 78 | plot_embedding(X_pca, 79 | "Principal 
Components projection of the digits (time %.2fs)" % 80 | (time() - t0)) 81 | 82 | #---------------------------------------------------------------------- 83 | # Projection on to the first 2 linear discriminant components 84 | 85 | print("Computing Linear Discriminant Analysis projection") 86 | X2 = X.copy() 87 | X2.flat[::X.shape[1] + 1] += 0.01 # Make X invertible 88 | t0 = time() 89 | X_lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2).fit_transform(X2, y) 90 | plot_embedding(X_lda, 91 | "Linear Discriminant projection of the digits (time %.2fs)" % 92 | (time() - t0)) 93 | 94 | 95 | #---------------------------------------------------------------------- 96 | # Isomap projection of the digits dataset 97 | print("Computing Isomap embedding") 98 | t0 = time() 99 | X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X) 100 | print("Done.") 101 | plot_embedding(X_iso, 102 | "Isomap projection of the digits (time %.2fs)" % 103 | (time() - t0)) 104 | 105 | 106 | #---------------------------------------------------------------------- 107 | # Locally linear embedding of the digits dataset 108 | print("Computing LLE embedding") 109 | clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, 110 | method='standard') 111 | t0 = time() 112 | X_lle = clf.fit_transform(X) 113 | print("Done. Reconstruction error: %g" % clf.reconstruction_error_) 114 | plot_embedding(X_lle, 115 | "Locally Linear Embedding of the digits (time %.2fs)" % 116 | (time() - t0)) 117 | 118 | 119 | #---------------------------------------------------------------------- 120 | # Modified Locally linear embedding of the digits dataset 121 | print("Computing modified LLE embedding") 122 | clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, 123 | method='modified') 124 | t0 = time() 125 | X_mlle = clf.fit_transform(X) 126 | print("Done. Reconstruction error: %g" % clf.reconstruction_error_) 127 | plot_embedding(X_mlle, 128 | "Modified Locally Linear Embedding of the digits (time %.2fs)" % 129 | (time() - t0)) 130 | 131 | 132 | #---------------------------------------------------------------------- 133 | # HLLE embedding of the digits dataset 134 | print("Computing Hessian LLE embedding") 135 | clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, 136 | method='hessian') 137 | t0 = time() 138 | X_hlle = clf.fit_transform(X) 139 | print("Done. Reconstruction error: %g" % clf.reconstruction_error_) 140 | plot_embedding(X_hlle, 141 | "Hessian Locally Linear Embedding of the digits (time %.2fs)" % 142 | (time() - t0)) 143 | 144 | 145 | #---------------------------------------------------------------------- 146 | # LTSA embedding of the digits dataset 147 | print("Computing LTSA embedding") 148 | clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, 149 | method='ltsa') 150 | t0 = time() 151 | X_ltsa = clf.fit_transform(X) 152 | print("Done. Reconstruction error: %g" % clf.reconstruction_error_) 153 | plot_embedding(X_ltsa, 154 | "Local Tangent Space Alignment of the digits (time %.2fs)" % 155 | (time() - t0)) 156 | 157 | #---------------------------------------------------------------------- 158 | # MDS embedding of the digits dataset 159 | print("Computing MDS embedding") 160 | clf = manifold.MDS(n_components=2, n_init=1, max_iter=100) 161 | t0 = time() 162 | X_mds = clf.fit_transform(X) 163 | print("Done. 
Stress: %f" % clf.stress_) 164 | plot_embedding(X_mds, 165 | "MDS embedding of the digits (time %.2fs)" % 166 | (time() - t0)) 167 | 168 | #---------------------------------------------------------------------- 169 | # Random Trees embedding of the digits dataset 170 | print("Computing Totally Random Trees embedding") 171 | hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, 172 | max_depth=5) 173 | t0 = time() 174 | X_transformed = hasher.fit_transform(X) 175 | pca = decomposition.TruncatedSVD(n_components=2) 176 | X_reduced = pca.fit_transform(X_transformed) 177 | 178 | plot_embedding(X_reduced, 179 | "Random forest embedding of the digits (time %.2fs)" % 180 | (time() - t0)) 181 | 182 | #---------------------------------------------------------------------- 183 | # Spectral embedding of the digits dataset 184 | print("Computing Spectral embedding") 185 | embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, 186 | eigen_solver="arpack") 187 | t0 = time() 188 | X_se = embedder.fit_transform(X) 189 | 190 | plot_embedding(X_se, 191 | "Spectral embedding of the digits (time %.2fs)" % 192 | (time() - t0)) 193 | 194 | #---------------------------------------------------------------------- 195 | # t-SNE embedding of the digits dataset 196 | print("Computing t-SNE embedding") 197 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) 198 | t0 = time() 199 | X_tsne = tsne.fit_transform(X) 200 | 201 | plot_embedding(X_tsne, 202 | "t-SNE embedding of the digits (time %.2fs)" % 203 | (time() - t0)) 204 | 205 | plt.show() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML-hand 2 | 主要是各种机器学习算法的手写实现 3 | 4 | 参考了很多大佬的代码,由于时间过得太久了,好多参考的来源都忘记了。这里只好对各位大佬们说声抱歉 5 | 6 | 希望可以对机器学习的初学者有所帮助。 7 | --------------------------------------------------------------------------------
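A minimal usage sketch for the hand-written KNN classifier (it mirrors `test1()` in `7K近邻-KNN/kNN.py`; the import assumes the snippet is run from that directory):

```python
from kNN import createDataSet, classify0

group, labels = createDataSet()                  # 4 toy samples with labels 'A'/'B'
print(classify0([0.1, 0.1], group, labels, 3))   # the 3 nearest neighbours vote -> 'B'
```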