├── .idea
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── machine_learning.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── README.md
├── ex1
│   ├── multi_variable.py
│   └── one_variable.py
├── ex2
│   ├── Regularized_logistic_regression.py
│   └── logistic_regression.py
├── ex3
│   ├── multi_logistic_regression.py
│   └── neural_network.py
├── ex4
│   └── bp_neural_network.py
├── ex5
│   └── bias_variance.py
├── ex6
│   ├── spam_classification.py
│   ├── support_vector_machines.py
│   └── svm_with_gaussian.py
├── ex7
│   ├── Kmeans.py
│   └── pca.py
└── ex8
    ├── anomaly_detection.py
    └── recommender_systems.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Andrew Ng Machine Learning Assignments

After much procrastination, I finally finished Andrew Ng's machine learning videos and completed the assignments in Python. The assignments are .py files with thorough comments.
Jupyter feels like the better tool for this kind of work, so for the next stage, the deep learning part, I will switch to Jupyter.

**Many notes were consulted along the way; my thanks to all of them:**

Andrew Ng's machine learning videos:

Andrew Ng's machine learning and deep learning assignments:

Chinese notes for the Stanford 2014 machine learning course:

machine-learning:

Backpropagation, explained simply:

Singular value decomposition (SVD): theory and its application to dimensionality reduction:

SVM: from getting started, to giving up, to mastery:

Principal component analysis (PCA): theory and implementation:

Some of these notes are older and may have been missed here; my thanks to their authors as well.

--------------------------------------------------------------------------------
/ex1/multi_variable.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Cost function
def computeCost(X, y, theta):
    inner = np.power((X * theta.T - y), 2)
    return np.sum(inner) / (2 * (len(X)))


# Gradient descent
def gradientDescent(X, y, theta, alpha, epoch):
    cost = np.zeros(epoch)
    m = X.shape[0]
    for i in range(epoch):
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        cost[i] = computeCost(X, y, theta)
    return theta, cost


path = 'D:\\Documents\\machine-Leanring\\machine-learning-ex1\\ex1\\ex1data2.txt'
data = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data = (data - data.mean()) / data.std()  # z-score normalize the features
data.insert(0, 'Ones', 1)
print(data.head())

cols = data.shape[1]
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:]

X = np.mat(X.values)
y = np.mat(y.values)
theta = np.mat([0, 0, 0])
alpha = 0.01
epoch = 1000

final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
print(final_theta)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(np.arange(epoch), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
--------------------------------------------------------------------------------
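Since one_variable.py below also derives the closed-form normal equation, the gradient-descent result above can be sanity-checked the same way. A minimal sketch, assuming X, y, and final_theta from multi_variable.py are still in scope:

import numpy as np

# Normal equation: theta = (X^T X)^(-1) X^T y -- no learning rate, no iterations.
theta_ne = np.linalg.inv(X.T @ X) @ X.T @ y
print(theta_ne.flatten())  # should land close to final_theta after 1000 epochs
print(final_theta)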
--------------------------------------------------------------------------------
/ex1/one_variable.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Compute the cost function
def computeCost(X, y, theta):
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))


# Gradient descent; returns theta, cost
def gradientDescent(X, y, theta, alpha, epoch):
    cost = np.zeros(epoch)
    m = X.shape[0]
    for i in range(epoch):
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        cost[i] = computeCost(X, y, theta)
    return theta, cost


# Solve for theta with the normal equation
def normalEqn(X, y):
    return np.linalg.inv(X.T @ X) @ X.T @ y  # np.linalg.inv computes the inverse


path = 'D:\\Documents\\machine-Leanring\\machine-learning-ex1\\ex1\\ex1data1.txt'
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])

# Insert a column of ones
data.insert(0, 'Ones', 1)

print(data.head())      # inspect the imported rows
print(data.describe())  # summary statistics of the imported data

'''
# Visualize the data
data.plot(kind='scatter', x='Population', y='Profit', figsize=(8, 5))
plt.show()
'''

cols = data.shape[1]
X = data.iloc[:, 0:cols - 1]  # input vectors
y = data.iloc[:, cols - 1:]   # output vector

X = np.mat(X.values)
y = np.mat(y.values)
theta = np.mat([0, 0])
alpha = 0.01
epoch = 2000

# Solve for theta via gradient descent
final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
print(final_theta)

# Solve for theta via the normal equation
print(normalEqn(X, y).flatten())

x = np.linspace(data.Population.min(), data.Population.max(), 100)  # x-axis values
f = final_theta[0, 0] + (final_theta[0, 1] * x)  # y-axis values: predicted profit
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data['Population'], data.Profit, label='Training Data')
ax.legend(loc=2)  # 2 places the legend in the upper left
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

'''
# Inspect the cost curve
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(epoch), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
'''
--------------------------------------------------------------------------------
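scikit-learn (already a dependency of the later exercises) gives an independent reference fit; a hedged sketch, assuming the DataFrame `data` from one_variable.py is in scope:

from sklearn.linear_model import LinearRegression

# sklearn fits the intercept itself, so only the raw Population column is passed.
reg = LinearRegression()
reg.fit(data[['Population']].values, data['Profit'].values)
print(reg.intercept_, reg.coef_)  # should roughly match theta0 and theta1 above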
--------------------------------------------------------------------------------
/ex2/Regularized_logistic_regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.metrics import classification_report


# Generate polynomial feature combinations up to the given power
def feature_mapping(x1, x2, power):
    data = {}
    for i in np.arange(power + 1):
        for p in np.arange(i + 1):
            data["f{}{}".format(i - p, p)] = np.power(x1, i - p) * np.power(x2, p)
    return pd.DataFrame(data)


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Compute the cost
def computeCost(theta, X, y):
    first = (-y) * np.log(sigmoid(X @ theta))
    second = (1 - y) * np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)


# Cost function with regularization
def costReg(theta, X, y, l=1):
    _theta = theta[1:]  # do not penalize the bias term
    reg = (l / (2 * len(X))) * (_theta @ _theta)
    return computeCost(theta, X, y) + reg


# Compute the gradient
def gradient(theta, X, y):
    return (X.T @ (sigmoid(X @ theta) - y)) / len(X)


# Gradient with regularization
def gradientReg(theta, X, y, l=1):
    reg = (l / len(X)) * theta
    reg[0] = 0  # do not penalize the bias term
    return gradient(theta, X, y) + reg


# Predict labels
def predict(theta, X):
    probability = sigmoid(X @ theta)
    return [1 if x >= 0.5 else 0 for x in probability]


path = 'D:\\Documents\\machine-Leanring\\machine-learning-ex2\\ex2\\ex2data2.txt'
data = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])

'''
# Visualize the data
positive = data[data['Accepted'].isin([1])]
negative = data[data['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
plt.show()
'''

x1 = data['Test 1'].values
x2 = data['Test 2'].values

# The two classes are not linearly separable, so higher-order features are added
_data = feature_mapping(x1, x2, power=6)
print(_data.head())

X = _data.values
y = data['Accepted'].values
theta = np.zeros(X.shape[1])
result = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg, args=(X, y, 1), messages=0)
print(result)

final_theta = result[0]

# Evaluate training accuracy
predictions = predict(final_theta, X)
correct = [1 if a == b else 0 for (a, b) in zip(predictions, y)]
accuracy = sum(correct) / len(X)
print(accuracy)

# Verify with sklearn's classification report
print(classification_report(predictions, y))

# Compute the decision boundary
x = np.linspace(-1, 1.5, 250)
xx, yy = np.meshgrid(x, x)  # build coordinate matrices from coordinate vectors
z = feature_mapping(xx.ravel(), yy.ravel(), 6).values
z = z @ final_theta
z = z.reshape(xx.shape)

# Plot the data points and the decision boundary
positive = data[data.Accepted.isin([1])]
negative = data[data.Accepted.isin([0])]
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(positive['Test 1'], positive['Test 2'], s=50, marker='o', c='b', label='Accepted')
ax.scatter(negative['Test 1'], negative['Test 2'], s=50, marker='x', c='r', label='Rejected')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width, box.height * 0.9])
ax.legend(loc='center left', bbox_to_anchor=(0.2, 1.12), ncol=2)
ax.set_xlabel('Test 1 score')
ax.set_ylabel('Test 2 score')
plt.contour(xx, yy, z, 0)
plt.ylim(-.8, 1.2)
plt.show()
--------------------------------------------------------------------------------
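The regularization strength l trades bias against variance over the 28 mapped features; re-running the optimizer for a few values makes that visible. A sketch, assuming costReg, gradientReg, predict, X, y, and theta from the script above are in scope:

import numpy as np
import scipy.optimize as opt

# l = 0 tends to overfit the mapped features; a large l underfits.
for l in (0, 1, 100):
    res = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg,
                       args=(X, y, l), messages=0)
    acc = np.mean(predict(res[0], X) == y)
    print('lambda={}: train accuracy={:.3f}'.format(l, acc))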
--------------------------------------------------------------------------------
/ex2/logistic_regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.metrics import classification_report


# Logistic (sigmoid) function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Cost function
def computeCost(theta, X, y):
    first = (-y) * np.log(sigmoid(X @ theta))
    second = (1 - y) * np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)


# Compute the gradient
def gradient(theta, X, y):
    return (X.T @ (sigmoid(X @ theta) - y)) / len(X)


# Predict labels
def predict(theta, X):
    probability = sigmoid(X @ theta)
    return [1 if x >= 0.5 else 0 for x in probability]


path = 'D:\\Documents\\machine-Leanring\\machine-learning-ex2\\ex2\\ex2data1.txt'
data = pd.read_csv(path, header=None, names=['exam1', 'exam2', 'admitted'])
data.insert(0, 'Ones', 1)

positive = data[data.admitted.isin([1])]
negative = data[data.admitted.isin([0])]

# show the exam1 and exam2 scores
'''
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(positive['exam1'], positive['exam2'], c='b', label='Admitted')
ax.scatter(negative['exam1'], negative['exam2'], s=50, c='r', marker='x', label='Not Admitted')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width, box.height*0.8])
ax.legend(loc='center left', bbox_to_anchor=(0.2, 1.12), ncol=2)  # loc: position; bbox_to_anchor: relative anchor; ncol: number of columns
ax.set_xlabel('Exam1 Score')
ax.set_ylabel('Exam2 Score')
plt.show()
'''

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
theta = np.zeros(X.shape[1])
result = opt.fmin_tnc(func=computeCost, x0=theta, fprime=gradient, args=(X, y), messages=0)

print(result)

# Evaluate training accuracy
final_theta = result[0]
predictions = predict(final_theta, X)
correct = [1 if a == b else 0 for (a, b) in zip(predictions, y)]
accuracy = sum(correct) / len(X)
print(accuracy)

# Verify with sklearn's classification report
print(classification_report(predictions, y))

# Plot the decision boundary
x1 = np.arange(130, step=0.1)
x2 = -(final_theta[0] + x1 * final_theta[1]) / final_theta[2]
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(positive['exam1'], positive['exam2'], c='b', label='Admitted')
ax.scatter(negative['exam1'], negative['exam2'], s=50, c='r', marker='x', label='Not Admitted')
ax.plot(x1, x2)
ax.set_xlim(0, 130)
ax.set_ylim(0, 130)
ax.set_xlabel('x1')
ax.set_ylabel('y1')
ax.set_title('Decision Boundary')
plt.show()
--------------------------------------------------------------------------------
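With the fitted parameters, a single prediction can be read straight off the model; the exercise expects an admission probability of about 0.776 for exam scores of 45 and 85. A sketch, assuming sigmoid and final_theta above are in scope:

import numpy as np

# P(admitted) = g(theta^T x) with x = [1, exam1, exam2]
prob = sigmoid(np.array([1, 45, 85]) @ final_theta)
print('admission probability: {:.3f}'.format(prob))  # expected ~0.776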
--------------------------------------------------------------------------------
/ex3/multi_logistic_regression.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy.optimize import minimize


# Load the data
def loadData(path):
    data = loadmat(path)
    X = data['X']
    y = data['y']
    return X, y


# Display one randomly chosen image
def plot_an_image(X):
    pick_one = np.random.randint(0, 5000)  # pick a random row
    image = X[pick_one, :]  # extract that image's pixels from X
    fig, ax = plt.subplots(figsize=(1, 1))
    ax.matshow(image.reshape((20, 20)), cmap='gray_r')  # grayscale image
    plt.xticks([])  # remove the ticks for a cleaner look
    plt.yticks([])
    print('this should be {}'.format(y[pick_one]))
    plt.show()


# Display 100 randomly chosen images
def plot_100_image(X):
    sample_idx = np.random.choice(np.arange(X.shape[0]), 100)
    sample_img = X[sample_idx, :]
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
    for row in range(10):
        for col in range(10):
            ax_array[row, col].matshow(sample_img[10 * row + col].reshape((20, 20)), cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Regularized cost function
def regularized_cost(theta, X, y, l):
    reg = theta[1:]
    first = -y * np.log(sigmoid(X @ theta)) - (1 - y) * np.log(1 - sigmoid(X @ theta))
    reg = (reg @ reg) * l / (2 * len(X))
    return np.mean(first) + reg


# Regularized gradient
def regularized_gradient(theta, X, y, l):
    reg = theta[1:]
    first = (1 / len(X)) * X.T @ (sigmoid(X @ theta) - y)
    reg = np.concatenate([np.array([0]), (1 / len(X)) * reg])  # np.concatenate joins arrays; a leading 0 means the bias term is not penalized
    return first + reg


# One-vs-all training
def one_vs_all(X, y, l, K):
    all_theta = np.zeros((K, X.shape[1]))
    for i in range(1, K + 1):
        theta = np.zeros(X.shape[1])
        y_i = np.array([1 if label == i else 0 for label in y])  # for class i, y_i is 1 where the label equals i, else 0
        ret = minimize(fun=regularized_cost, x0=theta, args=(X, y_i, l), method='TNC', jac=regularized_gradient,
                       options={'disp': True})  # disp=True prints detailed iteration info
        all_theta[i - 1, :] = ret.x
    return all_theta


def predict_all(X, all_theta):
    h = sigmoid(X @ all_theta.T)
    h_argmax = np.argmax(h, axis=1)  # index of the largest value along each row
    h_argmax += 1
    return h_argmax


X, y = loadData('D:\\Documents\\machine-Leanring\\machine-learning-ex3\\ex3\\ex3data1.mat')
print(np.unique(y))  # inspect the distinct class labels
print(X.shape, y.shape)

# plot_an_image(X)
# plot_100_image(X)

X = np.insert(X, 0, 1, axis=1)
y = y.flatten()  # drop the extra dimension to simplify the later computations

# Train
all_theta = one_vs_all(X, y, 1, 10)

# Predict
y_pred = predict_all(X, all_theta)

# Compute accuracy
accuracy = np.mean(y_pred == y)
print('accuracy = {0}%'.format(accuracy * 100))
--------------------------------------------------------------------------------
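scikit-learn's one-vs-rest logistic regression offers an independent check of the accuracy printed above; a sketch with illustrative parameters, assuming X (with its added bias column) and y are in scope:

from sklearn.linear_model import LogisticRegression

# One-vs-rest across the 10 digit classes; C acts like 1/lambda.
clf = LogisticRegression(multi_class='ovr', C=1.0, max_iter=1000)
clf.fit(X[:, 1:], y)  # drop the manually added bias column
print('sklearn ovr accuracy: {:.4f}'.format(clf.score(X[:, 1:], y)))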
--------------------------------------------------------------------------------
/ex3/neural_network.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
from scipy.io import loadmat


# Load the pre-trained weights
def load_weight(path):
    data = loadmat(path)
    return data['Theta1'], data['Theta2']


# Load the data
def load_data(path):
    data = loadmat(path)
    return data['X'], data['y']


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


theta1, theta2 = load_weight('D:\\Documents\\machine-Leanring\\machine-learning-ex3\\ex3\\ex3weights.mat')

X, y = load_data('D:\\Documents\\machine-Leanring\\machine-learning-ex3\\ex3\\ex3data1.mat')
y = y.flatten()
X = np.insert(X, 0, values=np.ones(X.shape[0]), axis=1)

a1 = X

z2 = a1 @ theta1.T                # linear step into the hidden layer
a2 = sigmoid(z2)                  # hidden-layer activation
a2 = np.insert(a2, 0, 1, axis=1)  # add the bias unit after the activation, so it stays exactly 1

z3 = a2 @ theta2.T  # linear step into the output layer
a3 = sigmoid(z3)    # output activation; with only two weight layers, this is the final prediction

y_pred = np.argmax(a3, axis=1) + 1

# Accuracy
accuracy = np.mean(y_pred == y)
print('accuracy = {0}%'.format(accuracy * 100))
--------------------------------------------------------------------------------
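The shapes make the insert/transpose steps above concrete; a small sanity sketch, assuming the arrays from neural_network.py are in scope:

# 5000 examples; 400 pixels + 1 bias unit per example.
assert a1.shape == (5000, 401)
assert a2.shape == (5000, 26)   # 25 hidden units + 1 bias unit
assert a3.shape == (5000, 10)   # one score per digit class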
--------------------------------------------------------------------------------
/ex4/bp_neural_network.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder


# Load the data
def load_mat(path):
    data = loadmat(path)
    return data['X'], data['y'].flatten()


# Load the pre-trained weights
def load_weight(path):
    data = loadmat(path)
    return data['Theta1'], data['Theta2']


# Display 100 images
def plot_100_img(X):
    index = np.random.choice(range(5000), 100)
    image = X[index]
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharex=True, sharey=True, figsize=(6, 6))
    for row in range(10):
        for col in range(10):
            ax_array[row, col].matshow(image[10 * row + col].reshape(20, 20), cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()


# Expand y into one-hot vectors
def expand_y(y):
    result = []

    # Turn each label in y into a vector with a 1 at the position of that label
    for i in y:
        y_array = np.zeros(10)
        y_array[i - 1] = 1
        result.append(y_array)

    # Alternatively, use sklearn's OneHotEncoder
    '''
    encoder = OneHotEncoder(sparse=False)  # return an array instead of a matrix
    y_onehot = encoder.fit_transform(y.reshape(-1, 1))
    return y_onehot
    '''

    return np.array(result)


# Flatten the theta matrices so they can be passed to an advanced optimizer
def serialize(a, b):
    return np.r_[a.flatten(), b.flatten()]  # np.r_ stacks the arrays end to end along the first axis


# Restore the original theta shapes
def deserialize(seq):
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Forward propagation
def feed_forward(theta, X):
    t1, t2 = deserialize(theta)
    a1 = X

    # layer 1 to layer 2
    z2 = a1 @ t1.T
    a2 = np.insert(sigmoid(z2), 0, 1, axis=1)

    # layer 2 to layer 3
    z3 = a2 @ t2.T
    a3 = sigmoid(z3)

    return a1, z2, a2, z3, a3


# Compute the cost
def cost(theta, X, y):
    a1, z2, a2, z3, h = feed_forward(theta, X)
    J = 0
    for i in range(len(X)):
        first = - y[i] * np.log(h[i])
        second = (1 - y[i]) * np.log(1 - h[i])
        J = J + np.sum(first - second)
    J = J / len(X)
    return J


# Cost function with regularization
def regularized_cost(theta, X, y, l=1):
    t1, t2 = deserialize(theta)
    reg = np.sum(t1[:, 1:] ** 2) + np.sum(t2[:, 1:] ** 2)  # or use np.power(a, 2)
    return l / (2 * len(X)) * reg + cost(theta, X, y)


# Derivative of the sigmoid, used during backpropagation
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))


# Randomly initialize the parameters to break symmetry
def random_init(size):
    return np.random.uniform(-0.12, 0.12, size)


# Compute the gradient
def gradient(theta, X, y):
    t1, t2 = deserialize(theta)
    a1, z2, a2, z3, h = feed_forward(theta, X)
    d3 = h - y  # error at the output layer
    d2 = d3 @ t2[:, 1:] * sigmoid_gradient(z2)  # error at the hidden layer
    D2 = d3.T @ a2
    D1 = d2.T @ a1
    D = (1 / len(X)) * serialize(D1, D2)
    return D


# Regularized gradient
def regularized_gradient(theta, X, y, l=1):
    D1, D2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)
    t1[:, 0] = 0
    t2[:, 0] = 0
    reg_D1 = D1 + (l / len(X)) * t1
    reg_D2 = D2 + (l / len(X)) * t2
    return serialize(reg_D1, reg_D2)


# Gradient checking: pick two points very close to theta along each coordinate and use their average slope
# to estimate the gradient, i.e. for a given theta, compute the cost at theta-e and theta+e and take the centered difference
def gradient_checking(theta, X, y, e):
    def a_number_grad(plus, minus):
        return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (e * 2)

    numeric_grad = []
    for i in range(len(theta)):
        plus = theta.copy()
        minus = theta.copy()
        plus[i] = plus[i] + e
        minus[i] = minus[i] - e
        grad_i = a_number_grad(plus, minus)
        numeric_grad.append(grad_i)

    numeric_grad = np.array(numeric_grad)
    analytic_grad = regularized_gradient(theta, X, y)
    diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)
    print('If your backpropagation implementation is correct, \nthe relative difference will be smaller than '
          '10e-9.\nRelative Difference:{}\n'.format(diff))


def nn_training(X, y):
    init_theta = random_init(10285)  # 25*401 + 10*26
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'maxiter': 400})
    return res


def accuracy(theta, X, y):
    _, _, _, _, h = feed_forward(theta, X)
    y_pred = np.argmax(h, axis=1) + 1
    print(classification_report(y, y_pred))


# Visualize the hidden layer
def plot_hidden(theta):
    t1, _ = deserialize(theta)
    t1 = t1[:, 1:]
    fig, ax_array = plt.subplots(5, 5, sharex=True, sharey=True, figsize=(6, 6))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(t1[r * 5 + c].reshape(20, 20), cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()


# Load the data
X, raw_y = load_mat('D:\\Documents\\machine-Leanring\\machine-learning-ex4\\ex4\\ex4data1.mat')
# plot_100_img(X)

X = np.insert(X, 0, 1, axis=1)

# y must be expanded into one-hot vectors
y = expand_y(raw_y)

# Load the weight data
t1, t2 = load_weight('D:\\Documents\\machine-Leanring\\machine-learning-ex4\\ex4\\ex4weights.mat')

# The parameter matrices must be flattened before they can be passed to the optimizer, then reshaped back afterwards
theta = serialize(t1, t2)
a1, z2, a2, z3, h = feed_forward(theta, X)

# Gradient checking
# gradient_checking(theta, X, y, e=0.0001)

# Training
res = nn_training(X, y)
print(res)

# Compute accuracy
accuracy(res.x, X, raw_y)

# Visualize the hidden layer
plot_hidden(res.x)
--------------------------------------------------------------------------------
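gradient_checking above validates the full backpropagation; the same centered-difference idea can be applied to sigmoid_gradient alone. A self-contained sketch:

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

# Centered difference at a test point; both values should agree to ~8 decimal places.
z, e = 0.5, 1e-4
numeric = (sigmoid(z + e) - sigmoid(z - e)) / (2 * e)
print(numeric, sigmoid_gradient(z))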
--------------------------------------------------------------------------------
/ex5/bias_variance.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt


def plot_Data():
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(X[:, 1:], y, c='r', marker='x')
    plt.grid(True)  # show grid lines


# Regularized cost function
def costReg(theta, X, y, l):
    cost = ((X @ theta - y.flatten()) ** 2).sum()
    regterm = l * (theta[1:] @ theta[1:])
    return (cost + regterm) / (2 * len(X))


# Regularized gradient
def gradientReg(theta, X, y, l):
    grad = (X @ theta - y.flatten()) @ X
    regterm = l * theta
    regterm[0] = 0
    return (grad + regterm) / len(X)


# Fit linear regression
def trainLinearReg(X, y, l):
    theta = np.zeros(X.shape[1])
    res = opt.minimize(fun=costReg,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=gradientReg)
    return res.x


# Plot the learning curve: how the training and cross-validation errors change as the number of samples grows
def plot_learning_curve(X, y, Xval, yval, l):
    x = range(1, len(X) + 1)
    training_cost, cv_cost = [], []
    for i in x:
        res = trainLinearReg(X[:i], y[:i], l)  # train on progressively larger subsets
        training_cost_i = costReg(res, X[:i], y[:i], 0)
        cv_cost_i = costReg(res, Xval, yval, 0)
        training_cost.append(training_cost_i)
        cv_cost.append(cv_cost_i)
    plt.figure(figsize=(8, 5))
    plt.plot(x, training_cost, label='training cost', c='r')
    plt.plot(x, cv_cost, label='cv cost', c='b')
    plt.legend()
    plt.xlabel('Number of training examples')
    plt.ylabel('Error')
    plt.title('Learning curve for linear regression: lambda=' + str(l))
    plt.grid(True)


# Add polynomial features, inserting powers starting from the square term
def genPolyFeatures(X, power):
    Xpoly = X.copy()
    for i in range(2, power + 1):
        Xpoly = np.insert(Xpoly, Xpoly.shape[1], np.power(Xpoly[:, 1], i), axis=1)
    return Xpoly


# Compute the means and standard deviations
def get_means_std(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0, ddof=1)  # ddof=1 gives the sample standard deviation
    return means, stds


# Standardize the data
def featureNormalize(myX, means, stds):
    X_norm = myX.copy()
    X_norm[:, 1:] = X_norm[:, 1:] - means[1:]
    X_norm[:, 1:] = X_norm[:, 1:] / stds[1:]
    return X_norm


# Plot the fitted curve
def plot_fit(means, stds, l):
    theta = trainLinearReg(X_norm, y, l)
    x = np.linspace(-75, 55, 50)
    xmat = x.reshape(-1, 1)
    xmat = np.insert(xmat, 0, 1, axis=1)
    Xmat = genPolyFeatures(xmat, power)
    Xmat_norm = featureNormalize(Xmat, means, stds)
    plot_Data()
    plt.plot(x, Xmat_norm @ theta, 'b--')


data = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex5\\ex5\\ex5data1.mat')

X, y = data['X'], data['y']
Xval, yval = data['Xval'], data['yval']
Xtest, ytest = data['Xtest'], data['ytest']

X = np.insert(X, 0, 1, axis=1)
Xval = np.insert(Xval, 0, 1, axis=1)
Xtest = np.insert(Xtest, 0, 1, axis=1)
print('X={},y={}'.format(X.shape, y.shape))
print('Xval={},yval={}'.format(Xval.shape, yval.shape))
print('Xtest={},ytest={}'.format(Xtest.shape, ytest.shape))

# plot_Data()
# plt.show()

theta = np.ones(X.shape[1])

fit_theta = trainLinearReg(X, y, 0)
plot_Data()
plt.plot(X[:, 1], X @ fit_theta)
plt.show()

power = 6
train_means, train_stds = get_means_std(genPolyFeatures(X, power))
X_norm = featureNormalize(genPolyFeatures(X, power), train_means, train_stds)
Xval_norm = featureNormalize(genPolyFeatures(Xval, power), train_means, train_stds)
Xtest_norm = featureNormalize(genPolyFeatures(Xtest, power), train_means, train_stds)

# Inspect the curves under different lambdas
plot_fit(train_means, train_stds, 0)
plot_learning_curve(X_norm, y, Xval_norm, yval, 0)
plt.show()

plot_fit(train_means, train_stds, 1)
plot_learning_curve(X_norm, y, Xval_norm, yval, 1)
plt.show()

# Cost under different lambda values
lambdas = [0., 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1., 3., 10.]
errors_train, errors_val = [], []
for l in lambdas:
    theta = trainLinearReg(X_norm, y, l)
    errors_train.append(costReg(theta, X_norm, y, 0))  # remember to use lambda = 0 when measuring the error
    errors_val.append(costReg(theta, Xval_norm, yval, 0))

plt.figure(figsize=(8, 5))
plt.plot(lambdas, errors_train, label='Train')
plt.plot(lambdas, errors_val, label='Cross Validation')
plt.legend()
plt.xlabel('lambda')
plt.ylabel('Error')
plt.grid(True)
plt.show()
print('lambda={}'.format(lambdas[np.argmin(errors_val)]))
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ex6/spam_classification.py:
--------------------------------------------------------------------------------
import re

import nltk
import nltk.stem.porter
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn import svm


# Preprocess an email: lowercase everything and replace all HTML tags, URLs, email addresses, dollar signs, and numbers
def processEmail(email):
    email = email.lower()
    email = re.sub(r'<[^<>]+>', '', email)  # strip HTML tags
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub(r'[$]+', 'dollar', email)
    email = re.sub(r'[0-9]+', 'number', email)
    return email


# Convert an email into a list of word stems
def email2TokenList(email):
    stemmer = nltk.stem.porter.PorterStemmer()
    email = processEmail(email)

    # split the text by specifying many delimiters in a single regex
    tokens = re.split(r'[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)

    tokenlist = []
    for token in tokens:
        token = re.sub('[^a-zA-Z0-9]', '', token)  # drop non-alphanumeric characters
        stemmed = stemmer.stem(token)  # reduce each word to its stem, e.g. "depends" becomes "depend"
        if not len(stemmed): continue  # skip empty tokens
        tokenlist.append(stemmed)
    return tokenlist


# Convert an email into a feature vector over the vocabulary
def email2FeatureVector(email):
    token = email2TokenList(email)
    vector = [1 if vocab[i] in token else 0 for i in range(len(vocab))]
    return np.array(vector)


# Read a sample email
with open('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\emailSample1.txt', 'r') as f:
    email = f.read()
    print(email)

# Read the vocabulary list
vocab = pd.read_csv('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\vocab.txt', names=['words'], sep='\t').values
vector = email2FeatureVector(email)
print('length of vector = {}\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))

# The steps above show how to vectorize a single email; the same pipeline can preprocess a large set of emails as input data

# Load already-preprocessed emails and their labels, split into a training set and a test set

mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\spamTrain.mat')
X, y = mat['X'], mat['y']

mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\spamTest.mat')
Xtest, ytest = mat['Xtest'], mat['ytest']

# Train an SVM
clf = svm.SVC(C=0.1, kernel='linear')
clf.fit(X, y.flatten())

# Check the training results
predTrain = clf.score(X, y)
predTest = clf.score(Xtest, ytest)
print('PredTrain={}, PredTest={}'.format(predTrain, predTest))

# One could keep tuning C or try a Gaussian kernel; since the accuracy is already high, no further attempts are made
--------------------------------------------------------------------------------
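For a linear kernel, clf.coef_ holds one weight per vocabulary word, so the strongest spam indicators can be listed directly; a sketch, assuming clf and vocab from spam_classification.py are in scope:

import numpy as np

# The largest positive weights mark the words most indicative of spam.
weights = clf.coef_.flatten()
top = np.argsort(weights)[::-1][:15]
for i in top:
    print('{:15s} {:.3f}'.format(vocab[i][0], weights[i]))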
--------------------------------------------------------------------------------
/ex6/support_vector_machines.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm


def plot_data():
    plt.figure(figsize=(8, 5))
    plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='rainbow')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()


def plotBoundary(clf, X):
    x_min, x_max = X[:, 0].min() * 1.2, X[:, 0].max() * 1.1
    y_min, y_max = X[:, 1].min() * 1.1, X[:, 1].max() * 1.1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                         np.linspace(y_min, y_max, 500))  # turn the raw data range into a prediction grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z)  # contour draws the level curves of the matrix


mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\ex6data1.mat')
X = mat['X']
y = mat['y']

# plot_data()
# plt.show()

models = [svm.SVC(C, kernel='linear') for C in [1, 100]]  # an SVM with a linear kernel works for this dataset
clfs = [model.fit(X, y.ravel()) for model in models]  # ravel flattens the labels to one dimension

title = ['SVM Decision Boundary with C = {} (Example Dataset 1)'.format(C) for C in [1, 100]]
print(title)
for model, title in zip(clfs, title):
    plot_data()
    plt.title(title)
    plotBoundary(model, X)
    plt.show()
--------------------------------------------------------------------------------
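With kernel='linear' the boundary also has a closed form, w0*x1 + w1*x2 + b = 0, read off coef_ and intercept_ instead of the 500x500 prediction grid; a sketch for one of the fitted models, assuming clfs and X above are in scope:

import numpy as np
import matplotlib.pyplot as plt

# Solve w0*x1 + w1*x2 + b = 0 for x2 to draw the boundary line directly.
clf = clfs[0]
w, b = clf.coef_.flatten(), clf.intercept_[0]
x1 = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
plt.plot(x1, -(w[0] * x1 + b) / w[1])
plt.show()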
--------------------------------------------------------------------------------
/ex6/svm_with_gaussian.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm


def plot_data():
    plt.figure(figsize=(8, 5))
    plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='rainbow')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()


# Gaussian kernel function; in practice we can simply use SVC's built-in RBF kernel
def gaussKernel(x1, x2, sigma):
    return np.exp(-((x1 - x2) ** 2).sum() / (2 * sigma ** 2))


# Plot the decision boundary
def plotBoundary(clf, X):
    x_min, x_max = X[:, 0].min() * 1.2, X[:, 0].max() * 1.1
    y_min, y_max = X[:, 1].min() * 1.1, X[:, 1].max() * 1.1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                         np.linspace(y_min, y_max, 500))  # turn the raw data range into a prediction grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z)  # contour draws the level curves of the matrix


mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\ex6data2.mat')
X = mat['X']
y = mat['y']

sigma = 0.1
gamma = np.power(sigma, -2) / 2

# C: penalty parameter
# With kernel='rbf' (Gaussian), a smaller gamma gives a smoother decision surface, while a larger gamma gives a more
# fragmented surface that fits the training data more closely but may overfit
# gamma: RBF kernel coefficient; the default 'auto' uses 1/n_features
clf = svm.SVC(C=1, kernel='rbf', gamma=gamma)
model = clf.fit(X, y.flatten())
plot_data()
plotBoundary(model, X)
plt.show()


# A worked example of how to search for the best C and sigma
# Load the data
mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex6\\ex6\\ex6data3.mat')
X, y = mat['X'], mat['y']
Xval, yval = mat['Xval'], mat['yval']

Cvalues = (0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30)
sigmavalues = Cvalues
best_pair, best_score = (0, 0), 0

for C in Cvalues:
    for sigma in sigmavalues:
        gamma = np.power(sigma, -2.) / 2
        clf = svm.SVC(C=C, kernel='rbf', gamma=gamma)
        clf.fit(X, y.flatten())
        score = clf.score(Xval, yval)  # Return the mean accuracy on the given test data and labels.
        if score > best_score:
            best_score = score
            best_pair = (C, sigma)

print('best_pair={}, best_score={}'.format(best_pair, best_score))

# Show the effect of the best C and sigma
gamma = np.power(best_pair[1], -2) / 2
clf = svm.SVC(C=best_pair[0], kernel='rbf', gamma=gamma)
clf.fit(X, y.flatten())
plot_data()
plotBoundary(clf, X)
plt.show()
--------------------------------------------------------------------------------
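gaussKernel can be checked by hand: for x1 = [1, 2, 1], x2 = [0, 4, -1] and sigma = 2, ||x1 - x2||^2 = 9, so the kernel value is exp(-9/8) ≈ 0.3247, matching the exercise's expected output. A sketch:

import numpy as np

def gaussKernel(x1, x2, sigma):
    return np.exp(-((x1 - x2) ** 2).sum() / (2 * sigma ** 2))

print(gaussKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), 2))  # ~0.32465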
--------------------------------------------------------------------------------
/ex7/Kmeans.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import imageio


# For each sample, find the closest cluster centroid
def findClosestCentroids(X, centroids):
    idx = []
    max_dist = 1000000  # upper bound on the distance
    for i in range(len(X)):
        minus = X[i] - centroids
        dist = np.diag(minus @ minus.T)
        if dist.min() < max_dist:
            ci = np.argmin(dist)
            idx.append(ci)
    return np.array(idx)


# Compute the centroid of each cluster
def computeCentroids(X, idx):
    centroids = []
    for i in range(len(np.unique(idx))):
        u_k = X[idx == i].mean(axis=0)  # mean of the points in each cluster
        centroids.append(u_k)
    return np.array(centroids)


# Plot the data points, the centroids, and the centroids' iteration paths
def plot_data(X, centroids, idx=None, title=''):
    colors = ['b', 'g', 'gold', 'darkorange', 'salmon', 'olivedrab',
              'maroon', 'navy', 'sienna', 'tomato', 'lightgray', 'gainsboro',
              'coral', 'aliceblue', 'dimgray', 'mintcream',
              'mintcream']

    # assert raises an exception if the expression is false
    assert len(centroids[0]) <= len(colors), 'colors not enough'

    # if idx is given, split the points of X by cluster
    subX = []
    if idx is not None:
        for i in range(centroids[0].shape[0]):
            x_i = X[idx == i]
            subX.append(x_i)
    else:
        subX = [X]

    # plot the clustered points of X, one color per cluster
    plt.figure(figsize=(8, 5))
    for i in range(len(subX)):
        xx = subX[i]
        plt.scatter(xx[:, 0], xx[:, 1], c=colors[i], label='Cluster %d' % i)

    plt.legend()
    plt.grid(True)
    plt.xlabel('x1', fontsize=14)
    plt.ylabel('y1', fontsize=14)
    plt.title('plot of x points' + title, fontsize=16)

    # plot the centroids and their paths
    xx, yy = [], []
    for centroid in centroids:
        xx.append(centroid[:, 0])
        yy.append(centroid[:, 1])
    plt.plot(xx, yy, 'rx--', markersize=8)  # the third argument is '[color][marker][line]'


# Run the given number of iterations, repeatedly refining the centroids
def runKmeans(X, centroids, max_iters):
    centroids_all = [centroids]
    centroid_i = centroids

    # Each iteration first assigns every sample to its nearest centroid, then recomputes each centroid as the cluster mean.
    for i in range(max_iters):
        idx = findClosestCentroids(X, centroid_i)
        centroid_i = computeCentroids(X, idx)
        centroids_all.append(centroid_i)
    return idx, centroids_all


# Randomly initialize the centroids: the K initial centroids are chosen at random from X
def initCentroids(X, K):
    idx = np.random.choice(X.shape[0], K)  # choose K random row indices from X
    centroids = X[idx]
    return centroids


# Load the data
mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex7\\ex7\\ex7data2.mat')
X = mat['X']

# Specify an initial list of centroids
init_centroids = np.array([[3, 3], [6, 2], [8, 5]])

# For each point, determine which centroid is closest
idx = findClosestCentroids(X, init_centroids)

# Compute each cluster's centroid
# print(computeCentroids(X, idx))

plot_data(X, [init_centroids], idx, '->init_centroids')
plt.show()

# Run 20 iterations
idx, centroids_all = runKmeans(X, init_centroids, 20)
plot_data(X, centroids_all, idx, '->iters=20')
plt.show()

# Try three random initializations to see how K-means behaves
for i in range(3):
    centroids = initCentroids(X, 3)
    idx, centroids_all = runKmeans(X, centroids, 10)
    plot_data(X, centroids_all, idx, '->The result of the initCentroids')
    plt.show()


# Use K-means to compress an image
A = imageio.imread('D:\\Documents\\machine-Leanring\\machine-learning-ex7\\ex7\\bird_small.png')
# A is 128*128*3: 128*128 pixels with 3 color channels

# plt.imshow(A)
# plt.show()

A = A / 255  # scale A's values into [0, 1]
X = A.reshape(-1, 3)
K = 8  # compress the image palette down to K colors
centroids = initCentroids(X, K)  # randomly initialize the K colors
idx, centroids_all = runKmeans(X, centroids, 10)  # refine the K colors over 10 iterations

# Rebuild the image
img = np.zeros(X.shape)
centroids = centroids_all[-1]  # use the centroids from the final iteration
for i in range(len(centroids)):
    img[idx == i] = centroids[i]  # pixels assigned to cluster i get the color centroids[i]

# Restore the original dimensions
img = img.reshape((128, 128, 3))
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].set_title('original')
ax[0].imshow(A)
ax[1].set_title('compress')
ax[1].imshow(img)
plt.show()
--------------------------------------------------------------------------------
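sklearn.cluster.KMeans (k-means++ initialization by default) reproduces the compression step and is a useful reference for the hand-rolled loop; a sketch, assuming X is the (16384, 3) pixel array from the compression section above:

from sklearn.cluster import KMeans

# 8 color clusters, matching K above; labels_ plays the role of idx.
km = KMeans(n_clusters=8, n_init=10).fit(X)
img = km.cluster_centers_[km.labels_].reshape(128, 128, 3)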
plt.title("Example Dataset: Reduced Dimension Points Shown", fontsize=14) 84 | plt.xlabel('x1 [Feature Normalized]', fontsize=14) 85 | plt.ylabel('x2 [Feature Normalized]', fontsize=14) 86 | plt.grid(True) 87 | 88 | # 将原始点和恢复后的点对应起来 89 | for x in range(X_norm.shape[0]): 90 | plt.plot([X_norm[x, 0], X_rec[x, 0]], [X_norm[x, 1], X_rec[x, 1]], 'k--') 91 | plt.legend() 92 | plt.show() 93 | 94 | # 将PCA应用到人脸图像压缩上 95 | mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex7\\ex7\\ex7faces.mat') 96 | X = mat['X'] 97 | 98 | displayData(X, 10, 10) 99 | plt.show() 100 | 101 | X_norm, means, stds = featureNormalize(X) 102 | U, S, V = pca(X_norm) 103 | 104 | # 看一下主成分U 105 | displayData(U[:, :36].T, 6, 6) 106 | plt.show() 107 | 108 | # 将数据投影到主成分U上 109 | Z = projectData(X_norm, U, 36) 110 | 111 | # 将数据恢复到高维空间,但是恢复只能恢复近似值 112 | X_rec = recoverData(Z, U, 36) 113 | displayData(X_rec, 10, 10) 114 | plt.show() 115 | -------------------------------------------------------------------------------- /ex8/anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io import loadmat 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | 6 | 7 | def plot_data(): 8 | plt.figure(figsize=(8, 5)) 9 | plt.plot(X[:, 0], X[:, 1], 'bx') 10 | 11 | 12 | # 获取高斯参数 13 | def getGaussianParams(X): 14 | mu = X.mean(axis=0) # 均值 15 | sigma = np.cov(X.T) # 求协方差 16 | return mu, sigma 17 | 18 | 19 | # 高斯模型 20 | def gaussian(X, mu, sigma): 21 | norm = 1. / (np.power(2 * np.pi, X.shape[1] / 2) * np.sqrt(np.linalg.det(sigma))) # np.linalg.det计算行列式 22 | exp = np.zeros((X.shape[0], 1)) 23 | for i in range(X.shape[0]): 24 | exp[i] = np.exp(-0.5 * (X[i] - mu).T @ np.linalg.inv(sigma) @ (X[i] - mu)) # np.linalg.inv求逆 25 | return norm * exp 26 | 27 | 28 | # 绘制等高线 29 | def plotContours(mu, sigma): 30 | x, y = np.mgrid[0:30:.3, 0:30:.3] 31 | points = np.c_[x.ravel(), y.ravel()] # 按行连接 32 | z = gaussian(points, mu, sigma) 33 | z = z.reshape(x.shape) 34 | 35 | # 可以调用函数直接进行高斯分布 36 | # multi_normal = stats.multivariate_normal(mu, sigma) 37 | # z = multi_normal.pdf(np.dstack((x, y))) 38 | 39 | cont_levels = [10 ** h for h in range(-20, 0, 3)] # 该变量设置等高线的个数,可以为整数或者类数组.此处由于高斯函数设置等高线个数太密集,所以采用数组形式 40 | plt.contour(x, y, z, cont_levels) 41 | plt.title('Gaussian Contours', fontsize=16) 42 | 43 | 44 | # 根据计算F1值选择合适的epsilons值 45 | def selectThreshold(yval, pval): 46 | # 计算F1 47 | def computeF1(yval, pval): 48 | m = len(yval) 49 | tp = float(len([i for i in range(m) if pval[i] and yval[i]])) 50 | fp = float(len([i for i in range(m) if pval[i] and not yval[i]])) 51 | fn = float(len([i for i in range(m) if not pval[i] and yval[i]])) 52 | prec = tp / (tp + fp) if (tp + fp) else 0 53 | rec = tp / (tp + fn) if (tp + fn) else 0 54 | F1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0 55 | return F1 56 | 57 | # 产生1000个epsilons值分别计算F1 58 | epsilons = np.linspace(min(pval), max(pval), 1000) 59 | bestF1, bestEpsilon = 0, 0 60 | for e in epsilons: 61 | pval_ = pval < e 62 | thisF1 = computeF1(yval, pval_) 63 | if thisF1 > bestF1: 64 | bestF1 = thisF1 65 | bestEpsilon = e 66 | return bestF1, bestEpsilon 67 | 68 | 69 | mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex8\\ex8\\ex8data1.mat') 70 | X = mat['X'] 71 | Xval, yval = mat['Xval'], mat['yval'] 72 | 73 | plot_data() 74 | # plt.show() 75 | 76 | plotContours(*getGaussianParams(X)) # *代表有多个参数 77 | plt.show() 78 | 79 | # 获取合适的epslion值 80 | mu, sigma = getGaussianParams(X) 81 | pval = 
--------------------------------------------------------------------------------
/ex8/anomaly_detection.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from scipy import stats


def plot_data():
    plt.figure(figsize=(8, 5))
    plt.plot(X[:, 0], X[:, 1], 'bx')


# Estimate the Gaussian parameters
def getGaussianParams(X):
    mu = X.mean(axis=0)  # mean
    sigma = np.cov(X.T)  # covariance matrix
    return mu, sigma


# Multivariate Gaussian density
def gaussian(X, mu, sigma):
    norm = 1. / (np.power(2 * np.pi, X.shape[1] / 2) * np.sqrt(np.linalg.det(sigma)))  # np.linalg.det computes the determinant
    exp = np.zeros((X.shape[0], 1))
    for i in range(X.shape[0]):
        exp[i] = np.exp(-0.5 * (X[i] - mu).T @ np.linalg.inv(sigma) @ (X[i] - mu))  # np.linalg.inv computes the inverse
    return norm * exp


# Plot the density contours
def plotContours(mu, sigma):
    x, y = np.mgrid[0:30:.3, 0:30:.3]
    points = np.c_[x.ravel(), y.ravel()]  # stack the grid coordinates column-wise
    z = gaussian(points, mu, sigma)
    z = z.reshape(x.shape)

    # scipy can evaluate the same Gaussian density directly:
    # multi_normal = stats.multivariate_normal(mu, sigma)
    # z = multi_normal.pdf(np.dstack((x, y)))

    cont_levels = [10 ** h for h in range(-20, 0, 3)]  # the contour levels (an int or array-like); an explicit array is used here because evenly spaced levels would be far too dense for this density
    plt.contour(x, y, z, cont_levels)
    plt.title('Gaussian Contours', fontsize=16)


# Choose a suitable epsilon value by maximizing the F1 score
def selectThreshold(yval, pval):
    # Compute F1
    def computeF1(yval, pval):
        m = len(yval)
        tp = float(len([i for i in range(m) if pval[i] and yval[i]]))
        fp = float(len([i for i in range(m) if pval[i] and not yval[i]]))
        fn = float(len([i for i in range(m) if not pval[i] and yval[i]]))
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        F1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0
        return F1

    # generate 1000 candidate epsilon values and compute F1 for each
    epsilons = np.linspace(min(pval), max(pval), 1000)
    bestF1, bestEpsilon = 0, 0
    for e in epsilons:
        pval_ = pval < e
        thisF1 = computeF1(yval, pval_)
        if thisF1 > bestF1:
            bestF1 = thisF1
            bestEpsilon = e
    return bestF1, bestEpsilon


mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex8\\ex8\\ex8data1.mat')
X = mat['X']
Xval, yval = mat['Xval'], mat['yval']

plot_data()
# plt.show()

plotContours(*getGaussianParams(X))  # * unpacks the returned (mu, sigma) tuple
plt.show()

# Find a suitable epsilon value
mu, sigma = getGaussianParams(X)
pval = gaussian(Xval, mu, sigma)
bestF1, bestEpsilon = selectThreshold(yval, pval)
print('bestF1={},bestEpsilon={}'.format(bestF1, bestEpsilon))

# Select the outliers
y = gaussian(X, mu, sigma)
x = np.array([X[i] for i in range(len(y)) if y[i] < bestEpsilon])

# Plot
plot_data()
plotContours(mu, sigma)
plt.scatter(x[:, 0], x[:, 1], s=80, facecolor='none', edgecolors='r')
plt.title('mark the negatives')
plt.show()
--------------------------------------------------------------------------------
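As the commented lines in plotContours hint, scipy provides the same density directly; a cross-check sketch, assuming gaussian, X, mu, and sigma above are in scope:

import numpy as np
from scipy import stats

# Both paths should produce identical densities for the fitted Gaussian.
p_scipy = stats.multivariate_normal(mu, sigma).pdf(X)
p_own = gaussian(X, mu, sigma).flatten()
print(np.allclose(p_scipy, p_own))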
--------------------------------------------------------------------------------
/ex8/recommender_systems.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
import scipy.optimize as opt


# Estimate the Gaussian parameters
def getGaussianParams(X):
    mu = X.mean(axis=0)  # mean
    sigma = np.cov(X.T)  # covariance matrix
    return mu, sigma


# Multivariate Gaussian density
def gaussian(X, mu, sigma):
    norm = 1. / (np.power(2 * np.pi, X.shape[1] / 2) * np.sqrt(np.linalg.det(sigma)))  # np.linalg.det computes the determinant
    exp = np.zeros((X.shape[0], 1))
    for i in range(X.shape[0]):
        exp[i] = np.exp(-0.5 * (X[i] - mu).T @ np.linalg.inv(sigma) @ (X[i] - mu))  # np.linalg.inv computes the inverse
    return norm * exp


# Flatten X and Theta so they can be passed to an advanced optimizer
def serialize(X, Theta):
    return np.r_[X.flatten(), Theta.flatten()]


# Restore the original X and Theta shapes
def deserialize(seq, nm, nu, nf):
    return seq[:nm * nf].reshape(nm, nf), seq[nm * nf:].reshape(nu, nf)


# Collaborative-filtering cost function
def cofiCostFunc(params, Ynorm, R, nm, nu, nf, l=0):
    X, Theta = deserialize(params, nm, nu, nf)
    error = 0.5 * np.square((X @ Theta.T - Ynorm) * R).sum()  # multiplying by R keeps only the movies that were actually rated
    reg1 = .5 * l * np.square(Theta).sum()
    reg2 = .5 * l * np.square(X).sum()
    return error + reg1 + reg2


# Collaborative-filtering gradient
def cofiGradient(params, Y, R, nm, nu, nf, l=0):
    X, Theta = deserialize(params, nm, nu, nf)
    X_grad = ((X @ Theta.T - Y) * R) @ Theta + l * X
    Theta_grad = ((X @ Theta.T - Y) * R).T @ X + l * Theta
    return serialize(X_grad, Theta_grad)


# Check that the gradient computation works correctly
def checkGradient(params, Y, myR, nm, nu, nf, l=0):
    grad = cofiGradient(params, Y, myR, nm, nu, nf, l)
    e = 0.0001
    nparams = len(params)
    e_vec = np.zeros(nparams)

    for i in range(10):
        idx = np.random.randint(0, nparams)
        e_vec[idx] = e
        loss1 = cofiCostFunc(params - e_vec, Y, myR, nm, nu, nf, l)
        loss2 = cofiCostFunc(params + e_vec, Y, myR, nm, nu, nf, l)
        numgrad = (loss2 - loss1) / (2 * e)
        e_vec[idx] = 0
        diff = np.linalg.norm(numgrad - grad[idx]) / np.linalg.norm(numgrad + grad[idx])
        print('%0.15f \t %0.15f \t %0.15f' % (numgrad, grad[idx], diff))


# Mean normalization
def normalizeRating(Y, R):
    Ymean = (Y.sum(axis=1) / R.sum(axis=1)).reshape(-1, 1)
    Ynorm = (Y - Ymean) * R
    return Ynorm, Ymean


mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex8\\ex8\\ex8_movies.mat')
Y, R = mat['Y'], mat['R']  # Y holds each user's 1-5 rating for each movie; R marks whether a user rated a movie

plt.figure(figsize=(8, 8 * (1682. / 943.)))
plt.imshow(Y, cmap='rainbow')
plt.colorbar()
plt.ylabel('Movies', fontsize=20)
plt.xlabel('User', fontsize=20)
plt.show()

mat = loadmat('D:\\Documents\\machine-Leanring\\machine-learning-ex8\\ex8\\ex8_movieParams.mat')
X = mat['X']  # the feature vector Xi of movie i
Theta = mat['Theta']
nu = int(mat['num_users'])     # number of users
nm = int(mat['num_movies'])    # number of movies
nf = int(mat['num_features'])  # number of features
print("nu={},nm={},nf={}".format(nu, nm, nf))

# Test whether the computations are correct
'''
nu = 4
nm = 5
nf = 3
X = X[:nm, :nf]
Theta = Theta[:nu, :nf]
Y = Y[:nm, :nu]
R = R[:nm, :nu]
print(cofiCostFunc(serialize(X, Theta), Y, R, nm, nu, nf))
print(cofiCostFunc(serialize(X, Theta), Y, R, nm, nu, nf, 1.5))
print('Checking gradient with lambda = 0...')
checkGradient(serialize(X, Theta), Y, R, nm, nu, nf)
print('Checking gradient with lambda = 1.5...')
checkGradient(serialize(X, Theta), Y, R, nm, nu, nf, 1.5)
'''

# Load the movie list
movies = []
with open('D:\\Documents\\machine-Leanring\\machine-learning-ex8\\ex8\\movie_ids.txt', 'r', encoding='ISO-8859-1') as f:
    for line in f:
        movies.append(' '.join(line.strip().split(' ')[1:]))

# Our own rating data
my_ratings = np.zeros((1682, 1))
my_ratings[0] = 4
my_ratings[97] = 2
my_ratings[6] = 3
my_ratings[11] = 5
my_ratings[53] = 4
my_ratings[63] = 5
my_ratings[65] = 3
my_ratings[68] = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354] = 5

# See which movies we rated
'''
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(my_ratings[i], movies[i])
'''

# Append our ratings to the original data
Y = np.c_[Y, my_ratings]
R = np.c_[R, my_ratings != 0]

# Mean-normalize the data
Ynorm, Ymean = normalizeRating(Y, R)
nm = Ynorm.shape[0]
nu = Ynorm.shape[1]

# Randomly initialize the feature matrices X and Theta
X = np.random.random((nm, nf))
Theta = np.random.random((nu, nf))
params = serialize(X, Theta)
l = 10

# Check that the gradient computation works correctly
# checkGradient(params, Ynorm, R, nm, nu, nf, l)

# Minimize the cost function
res = opt.minimize(fun=cofiCostFunc, x0=params, args=(Ynorm, R, nm, nu, nf, l), method='TNC', jac=cofiGradient,
                   options={'maxiter': 100})
ret = res.x
fit_X, fit_Theta = deserialize(ret, nm, nu, nf)

# Compute the predictions
pred_mat = fit_X @ fit_Theta.T

# Predicted scores for the last user, i.e. the one we just added
pred = pred_mat[:, -1] + Ymean.flatten()
pred_sorted_idx = np.argsort(pred)[::-1]  # [::-1] reverses, giving descending order

print("Top recommendations for you:")
for i in range(10):
    print('Predicting rating %0.1f for movie %s.' \
          % (pred[pred_sorted_idx[i]], movies[pred_sorted_idx[i]]))

print("\nOriginal ratings provided:")
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated %d for movie %s.' % (my_ratings[i], movies[i]))
--------------------------------------------------------------------------------
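One rough way to gauge the collaborative-filtering fit is the RMSE over the entries that were actually rated (R == 1); a closing sketch, assuming pred_mat, Ymean, Y, and R from recommender_systems.py are in scope:

import numpy as np

# Add the removed means back, then compare only against the observed ratings.
pred_full = pred_mat + Ymean
rmse = np.sqrt(np.square((pred_full - Y)[R == 1]).mean())
print('RMSE on rated entries: {:.3f}'.format(rmse))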