├── .gitignore ├── README.md ├── algorithm_analysis ├── data │ └── water.mat ├── diagnose.py └── linear_regression.py ├── anomaly_detection ├── anomaly.py ├── data │ ├── ex8data1.mat │ └── ex8data2.mat └── test_anomaly_detection.py ├── kmeans ├── data │ ├── places.txt │ ├── portlandClubs.txt │ ├── testSet.txt │ └── testSet2.txt ├── kmeans.py ├── test_bi_kmeans.py └── test_normal_kmeans.py ├── linear_regression ├── data │ ├── ex0.txt │ ├── ex1.txt │ ├── houses.txt │ ├── lwr.txt │ └── temperature.txt ├── regression.py ├── test_bgd.py ├── test_feature_scaling.py ├── test_lwr.py ├── test_multiple.py ├── test_sgd.py ├── test_temperature_normal.py └── test_temperature_polynomial.py ├── logical_regression ├── data │ ├── ex3data1.mat │ ├── linear.txt │ └── non_linear.txt ├── logical_regression.py ├── test_linear_boundry.py ├── test_non_linear_boundry.py └── test_onevsall.py ├── neural_network ├── data │ ├── ex4weights.mat │ └── handwritten_digits.mat ├── nn.py ├── test_handwritten_digits.py └── test_logic_and.py ├── pca ├── data │ ├── bird_small.mat │ ├── ex7data1.mat │ ├── ex7data2.mat │ └── ex7faces.mat ├── kmeans.py ├── pca.py ├── test_pca4performance.py └── test_pca4visualization.py ├── recommender_system ├── data │ ├── ex8_movieParams.mat │ ├── ex8_movies.mat │ └── movie_ids.txt ├── recommender.py └── test_movies_rating.py └── svm ├── data ├── emailSample1.txt ├── emailSample2.txt ├── ex6data1.mat ├── ex6data2.mat ├── ex6data3.mat ├── spamSample1.txt ├── spamSample2.txt ├── spamTest.mat └── spamTrain.mat ├── smo.py ├── spam.py ├── test_linear.py ├── test_model_selection.py ├── test_non_linear.py ├── test_spam.py └── vocab.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stanford Machine Learning Notes 2 | ================== 3 | 4 | Gitbook address: [click here](https://www.gitbook.com/book/yoyoyohamapi/mit-ml/details) 5 | 6 | This book is a set of notes for Professor Andrew Ng's Stanford [Machine Learning course](https://zh.coursera.org/learn/machine-learning) on Coursera. It covers most of the topics and material discussed in class. For reasons of space, some formula derivations are not written out in full, but I still recommend working through them on scratch paper to reinforce them: know not only that something is so, but why it is so. 7 | 8 | All of the program code involved in this book is hosted in my personal [github](https://github.com/yoyoyohamapi/mit-ml) repository and is implemented in Python; most of it consists of complete implementations and tests of the learning algorithms concerned. I have not included the course homework code, because the homework programming assignments are fill-in-the-blank exercises, while implementing an algorithm from scratch, though a rougher road, is a better check of how well you understand and have mastered it. 9 | 10 | The chapters of this book correspond to the course as follows: 11 | 12 | | Stanford course | Book chapter | 13 | |:-----------|:-------------------| 14 | | Week 2 | Linear Regression | 15 | | Week 3 | Logistic Regression | 16 | | Week 4-5 | Neural Networks | 17 | | Week 6 | Algorithm Analysis and Optimization | 18 | | Week 7 | SVM (Support Vector Machines) | 19 | | Week 8 | K-Means, Dimensionality Reduction | 20 | | Week 9 | Anomaly Detection, Recommender Systems | 21 | | Week 10 | Large-Scale Machine Learning | 22 | | Week 11 | Case Study: Optical Character Recognition | 23 | 24 | As a student I am still only a beginner in machine learning, so the text inevitably contains slips and even serious errors; please point them out, as that is the greatest help to me. The chief purpose of this book is shared learning, not stars or circulation. The road is long; let us press on together. 25 | -------------------------------------------------------------------------------- /algorithm_analysis/data/water.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/algorithm_analysis/data/water.mat -------------------------------------------------------------------------------- /algorithm_analysis/diagnose.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # algorithm_analysis/diagnose.py 3 | """Algorithm diagnostics 4 | """ 5 | import linear_regression 6 | import numpy as np 7 | from
scipy.io import loadmat 8 | import matplotlib.pyplot as plt 9 | from sklearn.preprocessing import PolynomialFeatures 10 | 11 | data = loadmat('data/water.mat') 12 | ##### 13 | # 数据集划分 14 | ##### 15 | # 训练集 16 | X = np.mat(data['X']) 17 | # 为X添加偏置 18 | X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) 19 | y = np.mat(data['y']) 20 | # 交叉验证集 21 | Xval = np.mat(data['Xval']) 22 | Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1) 23 | yval = np.mat(data['yval']) 24 | # 测试集 25 | Xtest = np.mat(data['Xtest']) 26 | Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1) 27 | ytest = np.mat(data['ytest']) 28 | 29 | def diagnoseLR(): 30 | """线性回归诊断 31 | """ 32 | initTheta = np.mat(np.ones((X.shape[1], 1))) 33 | result, timeConsumed = linear_regression.gradient( 34 | X, y, rate=0.001, maxLoop=5000, epsilon=0.1, initTheta=initTheta) 35 | theta, errors = result 36 | 37 | # 绘制拟合成果 38 | Xmin = X[:, 1].min() 39 | Xmax = X[:, 1].max() 40 | ymax = y[:, 0].max() 41 | ymin = y[:, 0].min() 42 | fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1)) 43 | fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1) 44 | h = fitX * theta 45 | plt.xlim(Xmin, Xmax) 46 | plt.ylim(ymin, ymax) 47 | # 绘制训练样本 48 | plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0],marker='x',color='r', linewidth=2) 49 | # 绘制拟合曲线 50 | plt.plot(fitX[:, 1], h, color='b') 51 | plt.xlabel('Change in water level(x)') 52 | plt.ylabel('Water flowing out of the dam(y)') 53 | plt.show() 54 | 55 | # 绘制随样本规模学习曲线 56 | m, n = X.shape 57 | trainErrors = np.zeros((1,m)) 58 | valErrors = np.zeros((1,m)) 59 | for i in range(m): 60 | Xtrain = X[0:i+1] 61 | ytrain = y[0:i+1] 62 | res, timeConsumed = linear_regression.gradient( 63 | Xtrain, ytrain, rate=0.001, maxLoop=5000, epsilon=0.1) 64 | theta, errors = res 65 | trainErrors[0,i] = errors[-1] 66 | valErrors[0,i] = linear_regression.J(theta, Xval, yval) 67 | 68 | plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label='Training Error') 69 | plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label='Validation Error') 70 | plt.title('Learning curve for linear regression') 71 | plt.xlabel('Number of training examples') 72 | plt.ylabel('Error') 73 | plt.legend() 74 | plt.show() 75 | 76 | def diagnosePR(): 77 | """多项式回归诊断 78 | """ 79 | # 多项式回归 80 | poly = PolynomialFeatures(degree=8) 81 | XX, XXval, XXtest = [linear_regression.normalize( 82 | np.mat(poly.fit_transform(data[:, 1:]))) for data in [X, Xval, Xtest]] 83 | initTheta = np.mat(np.ones((XX.shape[1], 1))) 84 | theLambdas = [1.0, 0.001, 0.003, 0.01, 0.003, 0.1, 0.3, 1.0, 3.0, 10.0] 85 | numTheLambdas = len(theLambdas) 86 | trainErrors = np.zeros((1, numTheLambdas)) 87 | valErrors = np.zeros((1, numTheLambdas)) 88 | thetas = [] 89 | for idx, theLambda in enumerate(theLambdas): 90 | res, timeConsumed = linear_regression.gradient( 91 | XX, y, rate=0.3, maxLoop=500, epsilon=0.01, 92 | theLambda=theLambda, initTheta=initTheta) 93 | theta, errors = res 94 | thetas.append(theta) 95 | trainErrors[0, idx] = errors[-1] 96 | valErrors[0, idx] = linear_regression.J( 97 | theta, XXval, yval, theLambda=theLambda) 98 | bestLambda = theLambdas[np.argmin(valErrors)] 99 | theta = thetas[np.argmin(valErrors)] 100 | error = np.min(valErrors) 101 | 102 | # # 绘制随样本规模学习曲线 103 | plt.plot(np.arange(1, numTheLambdas + 1).ravel(), 104 | trainErrors.ravel(), color='b', label='Training Error') 105 | plt.plot(np.arange(1, numTheLambdas + 1).ravel(), 106 | valErrors.ravel(), color='g', 
label='Validation Error') 107 | plt.title('Learning curve for polynomial regression') 108 | plt.xlabel('lambda') 109 | plt.ylabel('Error') 110 | plt.legend() 111 | plt.show() 112 | 113 | # 绘制拟合曲线 114 | fitX = np.mat(np.linspace(-60, 45).reshape(-1, 1)) 115 | fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1) 116 | fitXX = linear_regression.normalize(np.mat(poly.fit_transform(fitX[:, 1:]))) 117 | h = fitXX * theta 118 | plt.title('Polynomial regression learning curve(lambda=%.3f) \n validation error=%.3f' % (bestLambda, error)) 119 | plt.scatter(X[:, 1].ravel(), y[:, 0].flatten().A[0], marker='x', color='r', linewidth=3) 120 | plt.plot(fitX[:, 1], h, color='b') 121 | plt.show() 122 | 123 | diagnoseLR() 124 | diagnosePR() 125 | -------------------------------------------------------------------------------- /algorithm_analysis/linear_regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # algorithm_analysis/linear_regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | 8 | def exeTime(func): 9 | """ 耗时计算装饰器 10 | """ 11 | def newFunc(*args, **args2): 12 | t0 = time.time() 13 | back = func(*args, **args2) 14 | return back, time.time() - t0 15 | return newFunc 16 | 17 | 18 | def h(theta, x): 19 | """预测函数 20 | 21 | Args: 22 | theta 相关系数矩阵 23 | x 特征向量 24 | 25 | Returns: 26 | 预测结果 27 | """ 28 | return (theta.T * x)[0, 0] 29 | 30 | 31 | def J(theta, X, y, theLambda=0): 32 | """代价函数 33 | 34 | Args: 35 | theta 相关系数矩阵 36 | X 样本集矩阵 37 | y 标签集矩阵 38 | 39 | Returns: 40 | 预测误差(代价) 41 | """ 42 | m = len(X) 43 | return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta)) / (2*m) 44 | 45 | 46 | @exeTime 47 | def gradient(X, y, rate=1, maxLoop=50, epsilon=1e-1, theLambda=0, initTheta=None): 48 | """批量梯度下降法 49 | 50 | Args: 51 | X 样本矩阵 52 | y 标签矩阵 53 | rate 学习率 54 | maxLoop 最大迭代次数 55 | epsilon 收敛精度 56 | theLambda 正规化参数 57 | Returns: 58 | (theta, errors), timeConsumed 59 | """ 60 | m, n = X.shape 61 | # 初始化theta 62 | if initTheta is None: 63 | theta = np.zeros((n, 1)) 64 | else: 65 | theta = initTheta 66 | count = 0 67 | converged = False 68 | error = float('inf') 69 | errors = [] 70 | for i in range(maxLoop): 71 | theta = theta + (1.0 / m) * rate * ((y - X * theta).T * X).T 72 | error = J(theta, X, y, theLambda) 73 | if np.isnan(error) is True: 74 | error = np.inf 75 | else: 76 | error = error[0, 0] 77 | errors.append(error) 78 | # 如果已经收敛 79 | if(error < epsilon): 80 | break 81 | return theta, errors 82 | 83 | def standardize(X): 84 | """特征标准化处理 85 | 86 | Args: 87 | X 样本集 88 | Returns: 89 | 标准后的样本集 90 | """ 91 | m, n = X.shape 92 | # 归一化每一个特征 93 | for j in range(n): 94 | features = X[:,j] 95 | meanVal = features.mean(axis=0) 96 | std = features.std(axis=0) 97 | if std != 0: 98 | X[:, j] = (features-meanVal)/std 99 | else: 100 | X[:, j] = 0 101 | return X 102 | 103 | def normalize(X): 104 | """特征归一化处理 105 | 106 | Args: 107 | X 样本集 108 | Returns: 109 | 归一化后的样本集 110 | """ 111 | m, n = X.shape 112 | # 归一化每一个特征 113 | for j in range(n): 114 | features = X[:,j] 115 | minVal = features.min(axis=0) 116 | maxVal = features.max(axis=0) 117 | diff = maxVal - minVal 118 | if diff != 0: 119 | X[:,j] = (features-minVal)/diff 120 | else: 121 | X[:,j] = 0 122 | return X 123 | 124 | def getLearningCurves(X, y, Xval, yval, rate=1, maxLoop=50, epsilon=0.1, theLambda=0): 125 | """获得学习曲线 126 | 127 | Args: 128 | X 样本集 129 | y 标签集 130 | Xval 交叉验证集 131 | yval 交叉验证集标签 132 | Returns: 133 | 
trainErrors 训练误差随样本规模的变化 134 | valErrors 校验验证集误差随样本规模的变化 135 | """ 136 | # 绘制随样本规模学习曲线 137 | m, n = X.shape 138 | trainErrors = np.zeros((1,m)) 139 | valErrors = np.zeros((1,m)) 140 | for i in range(m): 141 | Xtrain = X[0:i+1] 142 | ytrain = y[0:i+1] 143 | res, timeConsumed = gradient( 144 | Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon,theLambda=theLambda) 145 | theta, errors = res 146 | trainErrors[0,i] = errors[-1] 147 | valErrors[0,i] = J(theta, Xval, yval, theLambda=theLambda) 148 | return trainErrors, valErrors 149 | -------------------------------------------------------------------------------- /anomaly_detection/anomaly.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # anomaly_detection/anomaly.py 3 | 4 | import numpy as np 5 | 6 | def F1(predictions, y): 7 | """F_1Score 8 | 9 | Args: 10 | predictions 预测 11 | y 真实值 12 | Returns: 13 | F_1Score 14 | """ 15 | TP = np.sum((predictions == 1) & (y == 1)) 16 | FP = np.sum((predictions == 1) & (y == 0)) 17 | FN = np.sum((predictions == 0) & (y == 1)) 18 | if TP + FP == 0: 19 | precision = 0 20 | else: 21 | precision = float(TP) / (TP + FP) 22 | if TP + FN == 0: 23 | recall = 0 24 | else: 25 | recall = float(TP) / (TP + FN) 26 | if precision + recall == 0: 27 | return 0 28 | else: 29 | return (2.0 * precision * recall) / (precision + recall) 30 | 31 | 32 | def gaussianModel(X): 33 | """高斯模型 34 | 35 | Args: 36 | X 样本集 37 | Returns: 38 | p 模型 39 | """ 40 | # 参数估计 41 | m, n = X.shape 42 | mu = np.mean(X, axis=0) 43 | delta2 = np.var(X, axis=0) 44 | def p(x): 45 | """p(x) 46 | 47 | Args: 48 | x x 49 | mu mu 50 | delta2 delta2 51 | Returns: 52 | p 53 | """ 54 | total = 1 55 | for j in range(x.shape[0]): 56 | total *= np.exp(-np.power((x[j, 0] - mu[0, j]), 2) / (2 * delta2[0, j]**2) 57 | ) / (np.sqrt(2 * np.pi * delta2[0, j])) 58 | return total 59 | return p 60 | 61 | 62 | def multivariateGaussianModel(X): 63 | """多元高斯模型 64 | 65 | Args: 66 | X 样本集 67 | Returns: 68 | p 模型 69 | """ 70 | # 参数估计 71 | m, n = X.shape 72 | mu = np.mean(X.T, axis=1) 73 | Sigma = np.var(X, axis=0) 74 | Sigma = np.diagflat(Sigma) 75 | # Sigma = np.mat(np.cov(X.T)) 76 | detSigma = np.linalg.det(Sigma) 77 | 78 | def p(x): 79 | """p(x) 80 | 81 | Args: 82 | x x 83 | mu mu 84 | delta2 delta2 85 | Returns: 86 | p 87 | """ 88 | x = x - mu 89 | return np.exp(-x.T * np.linalg.pinv(Sigma) * x / 2).A[0] * \ 90 | ((2*np.pi)**(-n/2) * (detSigma**(-0.5) )) 91 | return p 92 | 93 | 94 | def train(X, model=gaussianModel): 95 | """训练函数 96 | 97 | Args: 98 | X 样本集 99 | Returns: 100 | p 概率模型 101 | """ 102 | return model(X) 103 | -------------------------------------------------------------------------------- /anomaly_detection/data/ex8data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/anomaly_detection/data/ex8data1.mat -------------------------------------------------------------------------------- /anomaly_detection/data/ex8data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/anomaly_detection/data/ex8data2.mat -------------------------------------------------------------------------------- /anomaly_detection/test_anomaly_detection.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # 
anomaly_detection/test_anomaly_detection.py 3 | 4 | import numpy as np 5 | from scipy.io import loadmat 6 | import matplotlib.pyplot as plt 7 | import anomaly 8 | 9 | def selectEpsilon(XVal, yVal, p): 10 | # 通过交叉验证集,选择最好的 epsilon 参数 11 | pVal = np.mat([p(x.T) for x in XVal]).reshape(-1, 1) 12 | step = (np.max(pVal) - np.min(pVal)) / 1000 13 | bestEpsilon = 0 14 | bestF1 = 0 15 | for epsilon in np.arange(np.min(pVal), np.max(pVal), step): 16 | predictions = pVal < epsilon 17 | F1 = anomaly.F1(predictions, yVal) 18 | if F1 > bestF1: 19 | bestF1 = F1 20 | bestEpsilon = epsilon 21 | return bestEpsilon, bestF1 22 | 23 | # 小维度测试...... 24 | data = loadmat('data/ex8data1.mat') 25 | X = np.mat(data['X']) 26 | XVal = np.mat(data['Xval']) 27 | yVal = np.mat(data['yval']) 28 | 29 | # p = anomaly.train(X) 30 | p = anomaly.train(X, model=anomaly.multivariateGaussianModel) 31 | pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) 32 | 33 | # 绘制数据点 34 | plt.xlabel('Latency (ms)') 35 | plt.ylabel('Throughput (mb/s)') 36 | plt.plot(X[:, 0], X[:, 1], 'bx') 37 | epsilon, F1 = selectEpsilon(XVal, yVal, p) 38 | 39 | print 'Best epsilon found using cross-validation: %e\n'%epsilon 40 | print 'Best F1 on Cross Validation Set: %f\n'%F1 41 | print '# Outliers found: %d' % np.sum(pTest < epsilon) 42 | 43 | # 获得训练集的异常点 44 | outliers = np.where(pTest < epsilon, True, False).ravel() 45 | plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1) 46 | n = np.linspace(0, 35, 100) 47 | X1 = np.meshgrid(n,n) 48 | XFit = np.mat(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten()))) 49 | pFit = np.mat([p(x.T) for x in XFit]).reshape(-1, 1) 50 | pFit = pFit.reshape(X1[0].shape) 51 | # Do not plot if there are infinities 52 | if not np.isinf(np.sum(pFit)): 53 | plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3).T) 54 | plt.show() 55 | 56 | 57 | # 大维度测试...... 58 | data = loadmat('data/ex8data2.mat') 59 | X = np.mat(data['X']) 60 | XVal = np.mat(data['Xval']) 61 | yVal = np.mat(data['yval']) 62 | 63 | # p = anomaly.train(X) 64 | p = anomaly.train(X, model=anomaly.multivariateGaussianModel) 65 | pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) 66 | 67 | epsilon, F1 = selectEpsilon(XVal, yVal, p) 68 | 69 | print 'Best epsilon found using cross-validation: %e\n'%epsilon 70 | print 'Best F1 on Cross Validation Set: %f\n'%F1 71 | print '# Outliers found: %d' % np.sum(pTest < epsilon) 72 | -------------------------------------------------------------------------------- /kmeans/data/places.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 45.486502 -122.788346 2 | Hotties 10140 SW Canyon Rd. 
Beaverton, OR 45.493150 -122.781021 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 45.498187 -122.766147 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 45.485943 -122.800311 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 45.508203 -122.781853 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 45.493398 -122.779628 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 45.430319 -122.376304 8 | 505 Club 505 Burnside Rd Gresham, OR 45.507621 -122.425553 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 45.399070 -122.618893 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 45.427072 -122.634159 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 45.462173 -122.638846 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 45.485396 -122.646587 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 45.464826 -122.699212 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 45.646831 -122.842918 15 | Cabaret II 17544 Stark St Portland, OR 45.519142 -122.482480 16 | Cabaret Lounge 503 W Burnside Portland, OR 45.523094 -122.675528 17 | Carnaval 330 SW 3rd Avenue Portland, OR 45.520682 -122.674206 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 45.543016 -122.720828 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 45.562715 -122.593078 20 | Club 205 9939 Stark St Portland, OR 45.519052 -122.561510 21 | Club Rouge 403 SW Stark Portland, OR 45.520561 -122.675605 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 45.584124 -122.682725 23 | Devil's Point 5305 SE Foster Rd Portland, OR 45.495365 -122.608366 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 45.497750 -122.524073 25 | Dream on Saloon 15920 Stark St Portland, OR 45.519142 -122.499672 26 | DV8 5003 Powell Blvd Portland, OR 45.497498 -122.611177 27 | Exotica 240 Columbia Blvd Portland, OR 45.583048 -122.668350 28 | Frolics 8845 Sandy Blvd Portland, OR 45.555384 -122.571475 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 45.554263 -122.574167 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 45.547229 -122.578746 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 45.484823 -122.589208 32 | Glimmers 3532 Powell Blvd Portland, OR 45.496918 -122.627920 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 45.520714 -122.674189 34 | Heat 12131 SE Holgate Blvd. Portland, OR 45.489637 -122.538196 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 45.548651 -122.578730 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 45.497765 -122.523985 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 45.555811 -122.600881 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 45.526306 -122.538833 39 | Landing Strip 6210 Columbia Blvd Portland, OR 45.595042 -122.728825 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 45.501585 -122.659310 41 | Lure 11051 Barbur Blvd Portland, OR 45.445233 -122.732606 42 | Magic Garden 217 4th Ave Portland, OR 45.524692 -122.674466 43 | Mary's Club 129 Broadway Portland, OR 45.535101 -122.667390 44 | Montego's 15826 SE Division Portland, OR 45.504448 -122.500034 45 | Mr. Peeps 709 122nd Ave Portland, OR 45.527863 -122.537726 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 45.532426 -122.628865 47 | Mystic 9950 SE Stark St. 
Portland, OR 45.519037 -122.561283 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 45.540098 -122.641114 49 | Oh Zone 6218 Columbia Blvd Portland, OR 45.595069 -122.728961 50 | Pallas Club 13639 Powell Blvd Portland, OR 45.497990 -122.522849 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 45.549288 -122.586505 52 | Private Pleasures 10931 53rd Ave Portland, OR 45.446442 -122.731034 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 45.547337 -122.578744 54 | Riverside Corral 545 Tacoma St Portland, OR 45.464338 -122.660285 55 | Rooster's 605 Columbia Blvd Portland, OR 45.583693 -122.672462 56 | Rose City Strip 3620 35th Pl Portland, OR 45.496601 -122.627688 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 45.497091 -122.634581 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 45.517225 -122.656367 59 | Secret Rendezvous 12503 Division St Portland, OR 45.504087 -122.534481 60 | Shimmers 7944 Foster Rd Portland, OR 45.483836 -122.581608 61 | Soobie's 333 SE 122nd Ave Portland, OR 45.520162 -122.537787 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 45.523370 -122.672388 63 | Sugar Shack 6732 Killingsworth St Portland, OR 45.562699 -122.593048 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 45.512220 -122.655527 65 | Tommy's Too 10335 Foster Rd Portland, OR 45.476721 -122.557005 66 | Union Jacks 938 Burnside St Portland, OR 45.522902 -122.656249 67 | Video Visions 6723 Killingsworth St Portland, OR 45.562715 -122.593078 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 45.425788 -122.765754 69 | Jiggles 7455 SW Nyberg St Tualatin, OR 45.382682 -122.753932 70 | -------------------------------------------------------------------------------- /kmeans/data/portlandClubs.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 2 | Hotties 10140 SW Canyon Rd. Beaverton, OR 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 8 | 505 Club 505 Burnside Rd Gresham, OR 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 15 | Cabaret II 17544 Stark St Portland, OR 16 | Cabaret Lounge 503 W Burnside Portland, OR 17 | Carnaval 330 SW 3rd Avenue Portland, OR 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 20 | Club 205 9939 Stark St Portland, OR 21 | Club Rouge 403 SW Stark Portland, OR 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 23 | Devil's Point 5305 SE Foster Rd Portland, OR 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 25 | Dream on Saloon 15920 Stark St Portland, OR 26 | DV8 5003 Powell Blvd Portland, OR 27 | Exotica 240 Columbia Blvd Portland, OR 28 | Frolics 8845 Sandy Blvd Portland, OR 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 32 | Glimmers 3532 Powell Blvd Portland, OR 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 34 | Heat 12131 SE Holgate Blvd. 
Portland, OR 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 39 | Landing Strip 6210 Columbia Blvd Portland, OR 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 41 | Lure 11051 Barbur Blvd Portland, OR 42 | Magic Garden 217 4th Ave Portland, OR 43 | Mary's Club 129 Broadway Portland, OR 44 | Montego's 15826 SE Division Portland, OR 45 | Mr. Peeps 709 122nd Ave Portland, OR 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 47 | Mystic 9950 SE Stark St. Portland, OR 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 49 | Oh Zone 6218 Columbia Blvd Portland, OR 50 | Pallas Club 13639 Powell Blvd Portland, OR 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 52 | Private Pleasures 10931 53rd Ave Portland, OR 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 54 | Riverside Corral 545 Tacoma St Portland, OR 55 | Rooster's 605 Columbia Blvd Portland, OR 56 | Rose City Strip 3620 35th Pl Portland, OR 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 59 | Secret Rendezvous 12503 Division St Portland, OR 60 | Shimmers 7944 Foster Rd Portland, OR 61 | Soobie's 333 SE 122nd Ave Portland, OR 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 63 | Sugar Shack 6732 Killingsworth St Portland, OR 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 65 | Tommy's Too 10335 Foster Rd Portland, OR 66 | Union Jacks 938 Burnside St Portland, OR 67 | Video Visions 6723 Killingsworth St Portland, OR 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 69 | Jiggles 7455 SW Nyberg St Tualatin, OR -------------------------------------------------------------------------------- /kmeans/data/testSet.txt: -------------------------------------------------------------------------------- 1 | 1.658985 4.285136 2 | -3.453687 3.424321 3 | 4.838138 -1.151539 4 | -5.379713 -3.362104 5 | 0.972564 2.924086 6 | -3.567919 1.531611 7 | 0.450614 -3.302219 8 | -3.487105 -1.724432 9 | 2.668759 1.594842 10 | -3.156485 3.191137 11 | 3.165506 -3.999838 12 | -2.786837 -3.099354 13 | 4.208187 2.984927 14 | -2.123337 2.943366 15 | 0.704199 -0.479481 16 | -0.392370 -3.963704 17 | 2.831667 1.574018 18 | -0.790153 3.343144 19 | 2.943496 -3.357075 20 | -3.195883 -2.283926 21 | 2.336445 2.875106 22 | -1.786345 2.554248 23 | 2.190101 -1.906020 24 | -3.403367 -2.778288 25 | 1.778124 3.880832 26 | -1.688346 2.230267 27 | 2.592976 -2.054368 28 | -4.007257 -3.207066 29 | 2.257734 3.387564 30 | -2.679011 0.785119 31 | 0.939512 -4.023563 32 | -3.674424 -2.261084 33 | 2.046259 2.735279 34 | -3.189470 1.780269 35 | 4.372646 -0.822248 36 | -2.579316 -3.497576 37 | 1.889034 5.190400 38 | -0.798747 2.185588 39 | 2.836520 -2.658556 40 | -3.837877 -3.253815 41 | 2.096701 3.886007 42 | -2.709034 2.923887 43 | 3.367037 -3.184789 44 | -2.121479 -4.232586 45 | 2.329546 3.179764 46 | -3.284816 3.273099 47 | 3.091414 -3.815232 48 | -3.762093 -2.432191 49 | 3.542056 2.778832 50 | -1.736822 4.241041 51 | 2.127073 -2.983680 52 | -4.323818 -3.938116 53 | 3.792121 5.135768 54 | -4.786473 3.358547 55 | 2.624081 -3.260715 56 | -4.009299 -2.978115 57 | 2.493525 1.963710 58 | -2.513661 2.642162 59 | 1.864375 -3.176309 60 | -3.171184 -3.572452 61 | 2.894220 2.489128 62 | -2.562539 2.884438 63 | 3.491078 -3.947487 64 | -2.565729 -2.012114 65 | 3.332948 3.983102 66 | -1.616805 3.573188 67 | 2.280615 -2.559444 68 | 
-2.651229 -3.103198 69 | 2.321395 3.154987 70 | -1.685703 2.939697 71 | 3.031012 -3.620252 72 | -4.599622 -2.185829 73 | 4.196223 1.126677 74 | -2.133863 3.093686 75 | 4.668892 -2.562705 76 | -2.793241 -2.149706 77 | 2.884105 3.043438 78 | -2.967647 2.848696 79 | 4.479332 -1.764772 80 | -4.905566 -2.911070 81 | -------------------------------------------------------------------------------- /kmeans/data/testSet2.txt: -------------------------------------------------------------------------------- 1 | 3.275154 2.957587 2 | -3.344465 2.603513 3 | 0.355083 -3.376585 4 | 1.852435 3.547351 5 | -2.078973 2.552013 6 | -0.993756 -0.884433 7 | 2.682252 4.007573 8 | -3.087776 2.878713 9 | -1.565978 -1.256985 10 | 2.441611 0.444826 11 | -0.659487 3.111284 12 | -0.459601 -2.618005 13 | 2.177680 2.387793 14 | -2.920969 2.917485 15 | -0.028814 -4.168078 16 | 3.625746 2.119041 17 | -3.912363 1.325108 18 | -0.551694 -2.814223 19 | 2.855808 3.483301 20 | -3.594448 2.856651 21 | 0.421993 -2.372646 22 | 1.650821 3.407572 23 | -2.082902 3.384412 24 | -0.718809 -2.492514 25 | 4.513623 3.841029 26 | -4.822011 4.607049 27 | -0.656297 -1.449872 28 | 1.919901 4.439368 29 | -3.287749 3.918836 30 | -1.576936 -2.977622 31 | 3.598143 1.975970 32 | -3.977329 4.900932 33 | -1.791080 -2.184517 34 | 3.914654 3.559303 35 | -1.910108 4.166946 36 | -1.226597 -3.317889 37 | 1.148946 3.345138 38 | -2.113864 3.548172 39 | 0.845762 -3.589788 40 | 2.629062 3.535831 41 | -1.640717 2.990517 42 | -1.881012 -2.485405 43 | 4.606999 3.510312 44 | -4.366462 4.023316 45 | 0.765015 -3.001270 46 | 3.121904 2.173988 47 | -4.025139 4.652310 48 | -0.559558 -3.840539 49 | 4.376754 4.863579 50 | -1.874308 4.032237 51 | -0.089337 -3.026809 52 | 3.997787 2.518662 53 | -3.082978 2.884822 54 | 0.845235 -3.454465 55 | 1.327224 3.358778 56 | -2.889949 3.596178 57 | -0.966018 -2.839827 58 | 2.960769 3.079555 59 | -3.275518 1.577068 60 | 0.639276 -3.412840 61 | -------------------------------------------------------------------------------- /kmeans/kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/kmeans.py 3 | import numpy as np 4 | 5 | def loadDataSet(filename): 6 | """ 7 | 读取数据集 8 | 9 | Args: 10 | filename: 文件名 11 | Returns: 12 | dataMat: 数据样本矩阵 13 | """ 14 | dataMat = [] 15 | fr = open(filename) 16 | for line in fr.readlines(): 17 | curLine = line.strip().split('\t') 18 | # 通过map函数批量转换 19 | fitLine = map(float, curLine) 20 | dataMat.append(fitLine) 21 | return dataMat 22 | 23 | def distEclud(vecA, vecB): 24 | """ 25 | 计算两向量的欧氏距离 26 | 27 | Args: 28 | vecA: 向量A 29 | vecB: 向量B 30 | Returns: 31 | 欧式距离 32 | """ 33 | return np.sqrt(np.sum(np.power(vecA - vecB, 2))) 34 | 35 | def randCent(dataSet, k): 36 | """ 37 | 随机生成k个聚类中心 38 | 39 | Args: 40 | dataSet: 数据集 41 | k: 簇数目 42 | Returns: 43 | centroids: 聚类中心矩阵 44 | """ 45 | _, n = dataSet.shape 46 | centroids = np.mat(np.zeros((k, n))) 47 | for j in range(n): 48 | # 随机聚类中心落在数据集的边界之内 49 | minJ = np.min(dataSet[:, j]) 50 | maxJ = np.max(dataSet[:, j]) 51 | rangeJ = float(maxJ - minJ) 52 | centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1) 53 | return centroids 54 | 55 | def kMeans(dataSet, k, maxIter = 5): 56 | """ 57 | K-Means 58 | 59 | Args: 60 | dataSet: 数据集 61 | k: 聚类数 62 | Returns: 63 | centroids: 聚类中心 64 | clusterAssment: 点分配结果 65 | """ 66 | # 随机初始化聚类中心 67 | centroids = randCent(dataSet, k) 68 | m, n = np.shape(dataSet) 69 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 70 | clusterAssment = np.mat(np.zeros((m, 2))) 71 | # 
tracks whether any cluster assignment changed in this pass 72 | clusterChanged = True 73 | # iterate until the centroids stop changing 74 | iterCount = 0 75 | while clusterChanged and iterCount < maxIter: 76 | iterCount += 1 77 | clusterChanged = False 78 | # assign every sample to a cluster 79 | for i in range(m): 80 | # distance from sample i to each centroid 81 | minIndex = 0 82 | minDist = np.inf 83 | for j in range(k): 84 | dist = distEclud(dataSet[i, :], centroids[j, :]) 85 | if(dist < minDist): 86 | minIndex = j 87 | minDist = dist 88 | # did the assignment change? 89 | if(clusterAssment[i, 0] != minIndex): 90 | clusterChanged = True 91 | clusterAssment[i, :] = minIndex, minDist**2 92 | # refresh the centroids: move each centroid to the mean of its cluster 93 | for cent in range(k): 94 | # array filtering picks out the points in this cluster 95 | ptsInCluster = dataSet[np.nonzero( 96 | clusterAssment[:, 0].A == cent)[0]] 97 | if ptsInCluster.shape[0] > 0: 98 | # compute the mean and move the centroid there 99 | centroids[cent, :] = np.mean(ptsInCluster, axis=0) 100 | return centroids, clusterAssment 101 | 102 | def biKmeans(dataSet, k): 103 | """ 104 | Bisecting k-means 105 | Args: 106 | dataSet: data set 107 | k: number of clusters 108 | Returns: 109 | centroids: cluster centroids 110 | clusterAssment: point assignments 111 | """ 112 | m, n = np.shape(dataSet) 113 | # at the start there is a single cluster whose centroid is the mean of all samples 114 | centroid0 = np.mean(dataSet, axis=0).tolist()[0] 115 | # list holding the current centroids 116 | currentCentroids = [centroid0] 117 | # assignments: column 0 holds the cluster index, column 1 the squared distance to that centroid 118 | clusterAssment = np.mat(np.zeros((m, 2))) 119 | # initialize the assignments: every sample starts in the initial cluster 120 | for j in range(m): 121 | clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2 122 | # until we have k clusters 123 | while len(currentCentroids) < k: 124 | # lowest total cost seen so far 125 | lowestError = np.inf 126 | # for each existing cluster 127 | for j in range(len(currentCentroids)): 128 | # samples belonging to this cluster 129 | ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :] 130 | # run 2-means on this cluster 131 | # note: the returned assignments are labelled 0 and 1 132 | centroids, clusterAss = kMeans(ptsInCluster, 2) 133 | # total error of the split cluster 134 | splitedError = np.sum(clusterAss[:, 1]) 135 | # error of the remaining clusters: the sum of their stored 136 | # squared distances in clusterAssment 137 | nonSplitedError = np.sum(clusterAssment[np.nonzero( 138 | clusterAssment[:, 0].A != j)[0], 1]) 139 | # is this split the cheapest so far? 140 | if (splitedError + nonSplitedError) < lowestError: 141 | # if so, record the new lowest total error 142 | lowestError = splitedError + nonSplitedError 143 | # remember which cluster should be split 144 | needToSplit = j 145 | # and keep the resulting centroids and assignments 146 | newCentroids = centroids.A 147 | newClusterAss = clusterAss.copy() 148 | # relabel the split result; label 1 must be handled first, otherwise 149 | # rows just relabelled to needToSplit == 1 would be caught again below 150 | # label 1 becomes the newest cluster 151 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[ 152 | 0], 0] = len(currentCentroids) 153 | # label 0 replaces the cluster that was split 154 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[ 155 | 0], 0] = needToSplit 156 | # the split centroid is updated in place 157 | currentCentroids[needToSplit] = newCentroids[0, :] 158 | # and the second half is appended as a new cluster 159 | currentCentroids.append(newCentroids[1, :]) 160 | # refresh the assignments of the points that were split 161 | clusterAssment[np.nonzero( 162 | clusterAssment[:, 0].A == needToSplit 163 | )[0], :] = newClusterAss 164 | return np.mat(currentCentroids), clusterAssment 165 | -------------------------------------------------------------------------------- /kmeans/test_bi_kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/test_bi_kmeans.py 3 | 4 | import kmeans 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == "__main__": 9 | dataMat = np.mat(kmeans.loadDataSet('data/testSet2.txt')) 10 | centroids, clusterAssment = kmeans.biKmeans(dataMat, 3) 11 | clusterCount = centroids.shape[0] 12 | m = dataMat.shape[0] 13 | # scatter plot of the result 14 | patterns = ['o', 'D', '^'] 15 | colors = ['b', 'g', 'y'] 16 | fig = plt.figure()
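# How biKmeans produced these clusters: starting from one cluster holding all
# points, it repeatedly 2-means-splits whichever cluster gives the lowest
# total SSE (split error plus the error of the untouched points) until three
# clusters exist; the red '+' markers drawn below are the final centroids.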
17 | title = 'bi-kmeans with k=3' 18 | ax = fig.add_subplot(111, title=title) 19 | for k in range(clusterCount): 20 | # 绘制聚类中心 21 | ax.scatter(centroids[k,0], centroids[k,1], color='r', marker='+', linewidth=20) 22 | for i in range(m): 23 | # 绘制属于该聚类中心的样本 24 | ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A==k)[0]] 25 | ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0], marker=patterns[k], color=colors[k]) 26 | plt.show() 27 | -------------------------------------------------------------------------------- /kmeans/test_normal_kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/test_normal_kmeans.py 3 | import kmeans 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | if __name__ == "__main__": 8 | dataMat = np.mat(kmeans.loadDataSet('data/testSet.txt')) 9 | centroids, clusterAssment = kmeans.kMeans(dataMat, 2) 10 | clusterCount = np.shape(centroids)[0] 11 | m = np.shape(dataMat)[0] 12 | # 绘制散点图 13 | patterns = ['o', 'D', '^', 's'] 14 | colors = ['b', 'g', 'y', 'black'] 15 | fig = plt.figure() 16 | title = 'kmeans with k=2' 17 | ax = fig.add_subplot(111, title=title) 18 | for k in range(clusterCount): 19 | # 绘制聚类中心 20 | ax.scatter(centroids[k, 0], centroids[k, 1], color='r', marker='+', linewidth=20) 21 | for i in range(m): 22 | # 绘制属于该聚类中心的样本 23 | ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A==k)[0]] 24 | ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0], marker=patterns[k], color=colors[k]) 25 | plt.show() 26 | -------------------------------------------------------------------------------- /linear_regression/data/ex0.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.067732 3.176513 2 | 1.000000 0.427810 3.816464 3 | 1.000000 0.995731 4.550095 4 | 1.000000 0.738336 4.256571 5 | 1.000000 0.981083 4.560815 6 | 1.000000 0.526171 3.929515 7 | 1.000000 0.378887 3.526170 8 | 1.000000 0.033859 3.156393 9 | 1.000000 0.132791 3.110301 10 | 1.000000 0.138306 3.149813 11 | 1.000000 0.247809 3.476346 12 | 1.000000 0.648270 4.119688 13 | 1.000000 0.731209 4.282233 14 | 1.000000 0.236833 3.486582 15 | 1.000000 0.969788 4.655492 16 | 1.000000 0.607492 3.965162 17 | 1.000000 0.358622 3.514900 18 | 1.000000 0.147846 3.125947 19 | 1.000000 0.637820 4.094115 20 | 1.000000 0.230372 3.476039 21 | 1.000000 0.070237 3.210610 22 | 1.000000 0.067154 3.190612 23 | 1.000000 0.925577 4.631504 24 | 1.000000 0.717733 4.295890 25 | 1.000000 0.015371 3.085028 26 | 1.000000 0.335070 3.448080 27 | 1.000000 0.040486 3.167440 28 | 1.000000 0.212575 3.364266 29 | 1.000000 0.617218 3.993482 30 | 1.000000 0.541196 3.891471 31 | 1.000000 0.045353 3.143259 32 | 1.000000 0.126762 3.114204 33 | 1.000000 0.556486 3.851484 34 | 1.000000 0.901144 4.621899 35 | 1.000000 0.958476 4.580768 36 | 1.000000 0.274561 3.620992 37 | 1.000000 0.394396 3.580501 38 | 1.000000 0.872480 4.618706 39 | 1.000000 0.409932 3.676867 40 | 1.000000 0.908969 4.641845 41 | 1.000000 0.166819 3.175939 42 | 1.000000 0.665016 4.264980 43 | 1.000000 0.263727 3.558448 44 | 1.000000 0.231214 3.436632 45 | 1.000000 0.552928 3.831052 46 | 1.000000 0.047744 3.182853 47 | 1.000000 0.365746 3.498906 48 | 1.000000 0.495002 3.946833 49 | 1.000000 0.493466 3.900583 50 | 1.000000 0.792101 4.238522 51 | 1.000000 0.769660 4.233080 52 | 1.000000 0.251821 3.521557 53 | 1.000000 0.181951 3.203344 54 | 1.000000 0.808177 4.278105 55 | 1.000000 0.334116 3.555705 56 | 
1.000000 0.338630 3.502661 57 | 1.000000 0.452584 3.859776 58 | 1.000000 0.694770 4.275956 59 | 1.000000 0.590902 3.916191 60 | 1.000000 0.307928 3.587961 61 | 1.000000 0.148364 3.183004 62 | 1.000000 0.702180 4.225236 63 | 1.000000 0.721544 4.231083 64 | 1.000000 0.666886 4.240544 65 | 1.000000 0.124931 3.222372 66 | 1.000000 0.618286 4.021445 67 | 1.000000 0.381086 3.567479 68 | 1.000000 0.385643 3.562580 69 | 1.000000 0.777175 4.262059 70 | 1.000000 0.116089 3.208813 71 | 1.000000 0.115487 3.169825 72 | 1.000000 0.663510 4.193949 73 | 1.000000 0.254884 3.491678 74 | 1.000000 0.993888 4.533306 75 | 1.000000 0.295434 3.550108 76 | 1.000000 0.952523 4.636427 77 | 1.000000 0.307047 3.557078 78 | 1.000000 0.277261 3.552874 79 | 1.000000 0.279101 3.494159 80 | 1.000000 0.175724 3.206828 81 | 1.000000 0.156383 3.195266 82 | 1.000000 0.733165 4.221292 83 | 1.000000 0.848142 4.413372 84 | 1.000000 0.771184 4.184347 85 | 1.000000 0.429492 3.742878 86 | 1.000000 0.162176 3.201878 87 | 1.000000 0.917064 4.648964 88 | 1.000000 0.315044 3.510117 89 | 1.000000 0.201473 3.274434 90 | 1.000000 0.297038 3.579622 91 | 1.000000 0.336647 3.489244 92 | 1.000000 0.666109 4.237386 93 | 1.000000 0.583888 3.913749 94 | 1.000000 0.085031 3.228990 95 | 1.000000 0.687006 4.286286 96 | 1.000000 0.949655 4.628614 97 | 1.000000 0.189912 3.239536 98 | 1.000000 0.844027 4.457997 99 | 1.000000 0.333288 3.513384 100 | 1.000000 0.427035 3.729674 101 | 1.000000 0.466369 3.834274 102 | 1.000000 0.550659 3.811155 103 | 1.000000 0.278213 3.598316 104 | 1.000000 0.918769 4.692514 105 | 1.000000 0.886555 4.604859 106 | 1.000000 0.569488 3.864912 107 | 1.000000 0.066379 3.184236 108 | 1.000000 0.335751 3.500796 109 | 1.000000 0.426863 3.743365 110 | 1.000000 0.395746 3.622905 111 | 1.000000 0.694221 4.310796 112 | 1.000000 0.272760 3.583357 113 | 1.000000 0.503495 3.901852 114 | 1.000000 0.067119 3.233521 115 | 1.000000 0.038326 3.105266 116 | 1.000000 0.599122 3.865544 117 | 1.000000 0.947054 4.628625 118 | 1.000000 0.671279 4.231213 119 | 1.000000 0.434811 3.791149 120 | 1.000000 0.509381 3.968271 121 | 1.000000 0.749442 4.253910 122 | 1.000000 0.058014 3.194710 123 | 1.000000 0.482978 3.996503 124 | 1.000000 0.466776 3.904358 125 | 1.000000 0.357767 3.503976 126 | 1.000000 0.949123 4.557545 127 | 1.000000 0.417320 3.699876 128 | 1.000000 0.920461 4.613614 129 | 1.000000 0.156433 3.140401 130 | 1.000000 0.656662 4.206717 131 | 1.000000 0.616418 3.969524 132 | 1.000000 0.853428 4.476096 133 | 1.000000 0.133295 3.136528 134 | 1.000000 0.693007 4.279071 135 | 1.000000 0.178449 3.200603 136 | 1.000000 0.199526 3.299012 137 | 1.000000 0.073224 3.209873 138 | 1.000000 0.286515 3.632942 139 | 1.000000 0.182026 3.248361 140 | 1.000000 0.621523 3.995783 141 | 1.000000 0.344584 3.563262 142 | 1.000000 0.398556 3.649712 143 | 1.000000 0.480369 3.951845 144 | 1.000000 0.153350 3.145031 145 | 1.000000 0.171846 3.181577 146 | 1.000000 0.867082 4.637087 147 | 1.000000 0.223855 3.404964 148 | 1.000000 0.528301 3.873188 149 | 1.000000 0.890192 4.633648 150 | 1.000000 0.106352 3.154768 151 | 1.000000 0.917886 4.623637 152 | 1.000000 0.014855 3.078132 153 | 1.000000 0.567682 3.913596 154 | 1.000000 0.068854 3.221817 155 | 1.000000 0.603535 3.938071 156 | 1.000000 0.532050 3.880822 157 | 1.000000 0.651362 4.176436 158 | 1.000000 0.901225 4.648161 159 | 1.000000 0.204337 3.332312 160 | 1.000000 0.696081 4.240614 161 | 1.000000 0.963924 4.532224 162 | 1.000000 0.981390 4.557105 163 | 1.000000 0.987911 4.610072 164 | 1.000000 0.990947 4.636569 165 | 
1.000000 0.736021 4.229813 166 | 1.000000 0.253574 3.500860 167 | 1.000000 0.674722 4.245514 168 | 1.000000 0.939368 4.605182 169 | 1.000000 0.235419 3.454340 170 | 1.000000 0.110521 3.180775 171 | 1.000000 0.218023 3.380820 172 | 1.000000 0.869778 4.565020 173 | 1.000000 0.196830 3.279973 174 | 1.000000 0.958178 4.554241 175 | 1.000000 0.972673 4.633520 176 | 1.000000 0.745797 4.281037 177 | 1.000000 0.445674 3.844426 178 | 1.000000 0.470557 3.891601 179 | 1.000000 0.549236 3.849728 180 | 1.000000 0.335691 3.492215 181 | 1.000000 0.884739 4.592374 182 | 1.000000 0.918916 4.632025 183 | 1.000000 0.441815 3.756750 184 | 1.000000 0.116598 3.133555 185 | 1.000000 0.359274 3.567919 186 | 1.000000 0.814811 4.363382 187 | 1.000000 0.387125 3.560165 188 | 1.000000 0.982243 4.564305 189 | 1.000000 0.780880 4.215055 190 | 1.000000 0.652565 4.174999 191 | 1.000000 0.870030 4.586640 192 | 1.000000 0.604755 3.960008 193 | 1.000000 0.255212 3.529963 194 | 1.000000 0.730546 4.213412 195 | 1.000000 0.493829 3.908685 196 | 1.000000 0.257017 3.585821 197 | 1.000000 0.833735 4.374394 198 | 1.000000 0.070095 3.213817 199 | 1.000000 0.527070 3.952681 200 | 1.000000 0.116163 3.129283 201 | -------------------------------------------------------------------------------- /linear_regression/data/ex1.txt: -------------------------------------------------------------------------------- 1 | 6.1101 17.592 2 | 5.5277 9.1302 3 | 8.5186 13.662 4 | 7.0032 11.854 5 | 5.8598 6.8233 6 | 8.3829 11.886 7 | 7.4764 4.3483 8 | 8.5781 12 9 | 6.4862 6.5987 10 | 5.0546 3.8166 11 | 5.7107 3.2522 12 | 14.164 15.505 13 | 5.734 3.1551 14 | 8.4084 7.2258 15 | 5.6407 0.71618 16 | 5.3794 3.5129 17 | 6.3654 5.3048 18 | 5.1301 0.56077 19 | 6.4296 3.6518 20 | 7.0708 5.3893 21 | 6.1891 3.1386 22 | 20.27 21.767 23 | 5.4901 4.263 24 | 6.3261 5.1875 25 | 5.5649 3.0825 26 | 18.945 22.638 27 | 12.828 13.501 28 | 10.957 7.0467 29 | 13.176 14.692 30 | 22.203 24.147 31 | 5.2524 -1.22 32 | 6.5894 5.9966 33 | 9.2482 12.134 34 | 5.8918 1.8495 35 | 8.2111 6.5426 36 | 7.9334 4.5623 37 | 8.0959 4.1164 38 | 5.6063 3.3928 39 | 12.836 10.117 40 | 6.3534 5.4974 41 | 5.4069 0.55657 42 | 6.8825 3.9115 43 | 11.708 5.3854 44 | 5.7737 2.4406 45 | 7.8247 6.7318 46 | 7.0931 1.0463 47 | 5.0702 5.1337 48 | 5.8014 1.844 49 | 11.7 8.0043 50 | 5.5416 1.0179 51 | 7.5402 6.7504 52 | 5.3077 1.8396 53 | 7.4239 4.2885 54 | 7.6031 4.9981 55 | 6.3328 1.4233 56 | 6.3589 -1.4211 57 | 6.2742 2.4756 58 | 5.6397 4.6042 59 | 9.3102 3.9624 60 | 9.4536 5.4141 61 | 8.8254 5.1694 62 | 5.1793 -0.74279 63 | 21.279 17.929 64 | 14.908 12.054 65 | 18.959 17.054 66 | 7.2182 4.8852 67 | 8.2951 5.7442 68 | 10.236 7.7754 69 | 5.4994 1.0173 70 | 20.341 20.992 71 | 10.136 6.6799 72 | 7.3345 4.0259 73 | 6.0062 1.2784 74 | 7.2259 3.3411 75 | 5.0269 -2.6807 76 | 6.5479 0.29678 77 | 7.5386 3.8845 78 | 5.0365 5.7014 79 | 10.274 6.7526 80 | 5.1077 2.0576 81 | 5.7292 0.47953 82 | 5.1884 0.20421 83 | 6.3557 0.67861 84 | 9.7687 7.5435 85 | 6.5159 5.3436 86 | 8.5172 4.2415 87 | 9.1802 6.7981 88 | 6.0020 .92695 89 | 5.5204 0.152 90 | 5.0594 2.8214 91 | 5.7077 1.8451 92 | 7.6366 4.2959 93 | 5.8707 7.2029 94 | 5.3054 1.9869 95 | 8.2934 0.14454 96 | 13.394 9.0551 97 | 5.4369 0.61705 98 | -------------------------------------------------------------------------------- /linear_regression/data/houses.txt: -------------------------------------------------------------------------------- 1 | 2104 3 399900 2 | 1600 3 329900 3 | 2400 3 369000 4 | 1416 2 232000 5 | 3000 4 539900 6 | 1985 4 299900 7 | 1534 3 314900 8 | 
1427 3 198999 9 | 1380 3 212000 10 | 1494 3 242500 11 | 1940 4 239999 12 | 2000 3 347000 13 | 1890 3 329999 14 | 4478 5 699900 15 | 1268 3 259900 16 | 2300 4 449900 17 | 1320 2 299900 18 | 1236 3 199900 19 | 2609 4 499998 20 | 3031 4 599000 21 | 1767 3 252900 22 | 1888 2 255000 23 | 1604 3 242900 24 | 1962 4 259900 25 | 3890 3 573900 26 | 1100 3 249900 27 | 1458 3 464500 28 | 2526 3 469000 29 | 2200 3 475000 30 | 2637 3 299900 31 | 1839 2 349900 32 | 1000 1 169900 33 | 2040 4 314900 34 | 3137 3 579900 35 | 1811 4 285900 36 | 1437 3 249900 37 | 1239 3 229900 38 | 2132 4 345000 39 | 4215 4 549000 40 | 2162 4 287000 41 | 1664 2 368500 42 | 2238 3 329900 43 | 2567 4 314000 44 | 1200 3 299000 45 | 852 2 179900 46 | 1852 4 299900 47 | 1203 3 239500 48 | -------------------------------------------------------------------------------- /linear_regression/data/lwr.txt: -------------------------------------------------------------------------------- 1 | 1 1.0 2 | 2 2.0 3 | 3 3.0 4 | 4 3.15 5 | 5 3.25 6 | 6 3.5 7 | -------------------------------------------------------------------------------- /linear_regression/data/temperature.txt: -------------------------------------------------------------------------------- 1 | 50 3.3 2 | 50 2.8 3 | 50 2.9 4 | 70 2.3 5 | 70 2.6 6 | 70 2.1 7 | 80 2.5 8 | 80 2.9 9 | 80 2.4 10 | 90 3.0 11 | 90 3.1 12 | 90 2.8 13 | 100 3.3 14 | 100 3.5 15 | 100 3.0 16 | -------------------------------------------------------------------------------- /linear_regression/regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | def exeTime(func): 8 | """ 耗时计算装饰器 9 | """ 10 | def newFunc(*args, **args2): 11 | t0 = time.time() 12 | back = func(*args, **args2) 13 | return back, time.time() - t0 14 | return newFunc 15 | 16 | def loadDataSet(filename): 17 | """ 读取数据 18 | 19 | 从文件中获取数据,在《机器学习实战中》,数据格式如下 20 | "feature1 TAB feature2 TAB feature3 TAB label" 21 | 22 | Args: 23 | filename 文件名 24 | 25 | Returns: 26 | X 训练样本集矩阵 27 | y 标签集矩阵 28 | """ 29 | numFeat = len(open(filename).readline().split('\t')) - 1 30 | X = [] 31 | y = [] 32 | file = open(filename) 33 | for line in file.readlines(): 34 | lineArr = [] 35 | curLine = line.strip().split('\t') 36 | for i in range(numFeat): 37 | lineArr.append(float(curLine[i])) 38 | X.append(lineArr) 39 | y.append(float(curLine[-1])) 40 | return np.mat(X), np.mat(y).T 41 | 42 | def h(theta, x): 43 | """预测函数 44 | 45 | Args: 46 | theta 相关系数矩阵 47 | x 特征向量 48 | 49 | Returns: 50 | 预测结果 51 | """ 52 | return (theta.T*x)[0,0] 53 | 54 | def J(theta, X, y): 55 | """代价函数 56 | 57 | Args: 58 | theta 相关系数矩阵 59 | X 样本集矩阵 60 | y 标签集矩阵 61 | 62 | Returns: 63 | 预测误差(代价) 64 | """ 65 | m = len(X) 66 | return (X*theta-y).T*(X*theta-y)/(2*m) 67 | 68 | @exeTime 69 | def bgd(rate, maxLoop, epsilon, X, y): 70 | """批量梯度下降法 71 | 72 | Args: 73 | rate 学习率 74 | maxLoop 最大迭代次数 75 | epsilon 收敛精度 76 | X 样本矩阵 77 | y 标签矩阵 78 | 79 | Returns: 80 | (theta, errors, thetas), timeConsumed 81 | """ 82 | m,n = X.shape 83 | # 初始化theta 84 | theta = np.zeros((n,1)) 85 | count = 0 86 | converged = False 87 | error = float('inf') 88 | errors = [] 89 | thetas = {} 90 | for j in range(n): 91 | thetas[j] = [theta[j,0]] 92 | while count<=maxLoop: 93 | if(converged): 94 | break 95 | count = count + 1 96 | for j in range(n): 97 | deriv = (y-X*theta).T*X[:, j]/m 98 | theta[j,0] = theta[j,0]+rate*deriv 99 | thetas[j].append(theta[j,0]) 100 | 
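# deriv above equals (1/m)*(y - X*theta).T*X[:, j], the negative gradient of
# J with respect to theta_j, so adding rate*deriv is a standard batch descent
# step. After each full sweep over the n coordinates the cost is recomputed
# below, and the loop exits once it falls under epsilon or maxLoop is reached.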
error = J(theta, X, y) 101 | errors.append(error[0,0]) 102 | # 如果已经收敛 103 | if(error < epsilon): 104 | converged = True 105 | return theta,errors,thetas 106 | 107 | @exeTime 108 | def sgd(rate, maxLoop, epsilon, X, y): 109 | """随机梯度下降法 110 | Args: 111 | rate 学习率 112 | maxLoop 最大迭代次数 113 | epsilon 收敛精度 114 | X 样本矩阵 115 | y 标签矩阵 116 | Returns: 117 | (theta, error, thetas), timeConsumed 118 | """ 119 | m,n = X.shape 120 | # 初始化theta 121 | theta = np.zeros((n,1)) 122 | count = 0 123 | converged = False 124 | error = float('inf') 125 | errors = [] 126 | thetas = {} 127 | for j in range(n): 128 | thetas[j] = [theta[j,0]] 129 | while count <= maxLoop: 130 | if converged: 131 | break 132 | count = count + 1 133 | errors.append(float('inf')) 134 | for i in range(m): 135 | if converged: 136 | break 137 | diff = y[i,0]-h(theta, X[i].T) 138 | for j in range(n): 139 | theta[j,0] = theta[j,0] + rate*diff*X[i, j] 140 | thetas[j].append(theta[j,0]) 141 | error = J(theta, X, y) 142 | errors[-1] = error[0,0] 143 | # 如果已经收敛 144 | if(error < epsilon): 145 | converged = True 146 | return theta, errors, thetas 147 | 148 | def JLwr(theta, X, y, x, c): 149 | """局部加权线性回归的代价函数计算式 150 | 151 | Args: 152 | theta 相关系数矩阵 153 | X 样本集矩阵 154 | y 标签集矩阵 155 | x 待预测输入 156 | c tau 157 | Returns: 158 | 预测代价 159 | """ 160 | m,n = X.shape 161 | summerize = 0 162 | for i in range(m): 163 | diff = (X[i]-x)*(X[i]-x).T 164 | w = np.exp(-diff/(2*c*c)) 165 | predictDiff = np.power(y[i] - X[i]*theta,2) 166 | summerize = summerize + w*predictDiff 167 | return summerize 168 | 169 | @exeTime 170 | def lwr(rate, maxLoop, epsilon, X, y, x, c=1): 171 | """局部加权线性回归 172 | 173 | Args: 174 | rate 学习率 175 | maxLoop 最大迭代次数 176 | epsilon 预测精度 177 | X 输入样本 178 | y 标签向量 179 | x 待预测向量 180 | c tau 181 | """ 182 | m,n = X.shape 183 | # 初始化theta 184 | theta = np.zeros((n,1)) 185 | count = 0 186 | converged = False 187 | error = float('inf') 188 | errors = [] 189 | thetas = {} 190 | for j in range(n): 191 | thetas[j] = [theta[j,0]] 192 | # 执行批量梯度下降 193 | while count<=maxLoop: 194 | if(converged): 195 | break 196 | count = count + 1 197 | for j in range(n): 198 | deriv = (y-X*theta).T*X[:, j]/m 199 | theta[j,0] = theta[j,0]+rate*deriv 200 | thetas[j].append(theta[j,0]) 201 | error = JLwr(theta, X, y, x, c) 202 | errors.append(error[0,0]) 203 | # 如果已经收敛 204 | if(error < epsilon): 205 | converged = True 206 | return theta,errors,thetas 207 | 208 | def standarize(X): 209 | """特征标准化处理 210 | 211 | Args: 212 | X 样本集 213 | Returns: 214 | 标准后的样本集 215 | """ 216 | m, n = X.shape 217 | # 归一化每一个特征 218 | for j in range(n): 219 | features = X[:,j] 220 | meanVal = features.mean(axis=0) 221 | std = features.std(axis=0) 222 | if std != 0: 223 | X[:, j] = (features-meanVal)/std 224 | else: 225 | X[:, j] = 0 226 | return X 227 | 228 | def normalize(X): 229 | """特征归一化处理 230 | 231 | Args: 232 | X 样本集 233 | Returns: 234 | 归一化后的样本集 235 | """ 236 | m, n = X.shape 237 | # 归一化每一个特征 238 | for j in range(n): 239 | features = X[:,j] 240 | minVal = features.min(axis=0) 241 | maxVal = features.max(axis=0) 242 | diff = maxVal - minVal 243 | if diff != 0: 244 | X[:,j] = (features-minVal)/diff 245 | else: 246 | X[:,j] = 0 247 | return X 248 | -------------------------------------------------------------------------------- /linear_regression/test_bgd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_bgd.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | 
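# This script fits ex1.txt (city population vs. profit) with batch gradient
# descent, then draws the fitted line, the cost-per-iteration curve, the
# J(theta) surface, and the contour plot overlaid with the descent path.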
import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/ex1.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.02 17 | maxLoop = 1500 18 | epsilon = 0.01 19 | 20 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 21 | 22 | theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('Population of City in 10,000s') 36 | ax.set_ylabel('Profit in $10,000s') 37 | 38 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | 52 | # 绘制能量下降曲面 53 | size = 100 54 | theta0Vals = np.linspace(-10,10, size) 55 | theta1Vals = np.linspace(-2, 4, size) 56 | JVals = np.zeros((size, size)) 57 | for i in range(size): 58 | for j in range(size): 59 | col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]]) 60 | JVals[i,j] = regression.J(col, X, y) 61 | 62 | theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals) 63 | JVals = JVals.T 64 | contourSurf = plt.figure() 65 | ax = contourSurf.gca(projection='3d') 66 | 67 | ax.plot_surface(theta0Vals, theta1Vals, JVals, rstride=2, cstride=2, alpha=0.3, 68 | cmap=cm.rainbow, linewidth=0, antialiased=False) 69 | ax.plot(thetas[0], thetas[1], 'rx') 70 | ax.set_xlabel(r'$\theta_0$') 71 | ax.set_ylabel(r'$\theta_1$') 72 | ax.set_zlabel(r'$J(\theta)$') 73 | 74 | plt.show() 75 | 76 | # 绘制能量轮廓 77 | contourFig = plt.figure() 78 | ax = contourFig.add_subplot(111) 79 | ax.set_xlabel(r'$\theta_0$') 80 | ax.set_ylabel(r'$\theta_1$') 81 | 82 | CS = ax.contour(theta0Vals, theta1Vals, JVals, np.logspace(-2,3,20)) 83 | plt.clabel(CS, inline=1, fontsize=10) 84 | 85 | # 绘制最优解 86 | ax.plot(theta[0,0], theta[1,0], 'rx', markersize=10, linewidth=2) 87 | 88 | # 绘制梯度下降过程 89 | ax.plot(thetas[0], thetas[1], 'rx', markersize=3, linewidth=1) 90 | ax.plot(thetas[0], thetas[1], 'r-') 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /linear_regression/test_feature_scaling.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_feature_scaling.py 3 | import regression 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.mplot3d import axes3d 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/houses.txt') 10 | 11 | m, n = srcX.shape 12 | X = np.concatenate((np.ones((m,1)), srcX), axis=1) 13 | 14 | rate = 1 15 | maxLoop = 1000 16 | epsilon = 1 17 | 18 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 19 | theta, errors, thetas = result 20 | 21 | # 打印拟合曲线 22 | fittingFig = plt.figure() 23 | title = 'bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 
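import matplotlib.ticker as mtick  # used by the error-curve plot at the
# bottom of this script; without this import the call to
# mtick.FormatStrFormatter would raise a NameError, since the header
# imports do not bring in matplotlib.ticker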
24 | ax = fittingFig.add_subplot(111, title=title) 25 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 26 | 27 | xCopy = X.copy() 28 | xCopy.sort(0) 29 | yHat = xCopy*theta 30 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 31 | 32 | ax.set_xlabel('Population of City in 10,000s') 33 | ax.set_ylabel('Profit in $10,000s') 34 | 35 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 36 | plt.show() 37 | 38 | # 绘制能量函数的轮廓 39 | theta1Vals = np.linspace(min(thetas[1]), max(thetas[1]), 100) 40 | theta2Vals = np.linspace(min(thetas[2]), max(thetas[2]), 100) 41 | JVals = np.zeros((100, 100)) 42 | for i in range(100): 43 | for j in range(100): 44 | theta = np.matrix([[0], [theta1Vals[i]], [theta2Vals[j]]]) 45 | JVals[i,j] = regression.J(theta, X, y) 46 | contourFig = plt.figure() 47 | ax = contourFig.add_subplot(111) 48 | ax.contour(theta1Vals, theta2Vals, JVals, np.logspace(-2,3,20)) 49 | 50 | plt.show() 51 | 52 | # 打印误差曲线 53 | errorsFig = plt.figure() 54 | ax = errorsFig.add_subplot(111) 55 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 56 | 57 | ax.plot(range(len(errors)), errors) 58 | ax.set_xlabel('Number of iterations') 59 | ax.set_ylabel('Cost J') 60 | 61 | plt.show() 62 | -------------------------------------------------------------------------------- /linear_regression/test_lwr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_lwr.py 3 | import regression 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as mtick 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/lwr.txt'); 10 | 11 | m,n = srcX.shape 12 | srcX = np.concatenate((srcX[:, 0], np.power(srcX[:, 0],2)), axis=1) 13 | # 特征缩放 14 | X = regression.standardize(srcX.copy()) 15 | X = np.concatenate((np.ones((m,1)), X), axis=1) 16 | 17 | rate = 0.1 18 | maxLoop = 1000 19 | epsilon = 0.01 20 | 21 | predicateX = regression.standardize(np.matrix([[8, 64]])) 22 | 23 | predicateX = np.concatenate((np.ones((1,1)), predicateX), axis=1) 24 | 25 | result, t = regression.lwr(rate, maxLoop, epsilon, X, y, predicateX, 1) 26 | theta, errors, thetas = result 27 | 28 | result2, t = regression.lwr(rate, maxLoop, epsilon, X, y, predicateX, 0.1) 29 | theta2, errors2, thetas2 = result2 30 | 31 | 32 | # 打印特征点 33 | fittingFig = plt.figure() 34 | title = 'polynomial with bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f'%(rate,maxLoop,epsilon) 35 | ax = fittingFig.add_subplot(111, title=title) 36 | trainingSet = ax.scatter(srcX[:, 0].flatten().A[0], y[:,0].flatten().A[0]) 37 | 38 | print theta 39 | print theta2 40 | 41 | # 打印拟合曲线 42 | xx = np.linspace(1, 7, 50) 43 | xx2 = np.power(xx,2) 44 | yHat1 = [] 45 | yHat2 = [] 46 | for i in range(50): 47 | normalizedSize = (xx[i]-xx.mean())/xx.std(0) 48 | normalizedSize2 = (xx2[i]-xx2.mean())/xx2.std(0) 49 | x = np.matrix([[1,normalizedSize, normalizedSize2]]) 50 | yHat1.append(regression.h(theta, x.T)) 51 | yHat2.append(regression.h(theta2, x.T)) 52 | fittingLine1, = ax.plot(xx, yHat1, color='g') 53 | fittingLine2, = ax.plot(xx, yHat2, color='r') 54 | 55 | ax.set_xlabel('temperature') 56 | ax.set_ylabel('yield') 57 | 58 | plt.legend([trainingSet, fittingLine1, fittingLine2], ['Training Set', r'LWR with $\tau$=1', r'LWR with $\tau$=0.1']) 59 | plt.show() 60 | 61 | # 打印误差曲线 62 | errorsFig = plt.figure() 63 | ax = errorsFig.add_subplot(111) 64 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 65 
| 66 | ax.plot(range(len(errors)), errors) 67 | ax.set_xlabel('Number of iterations') 68 | ax.set_ylabel('Cost J') 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /linear_regression/test_multiple.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_multiple.py 3 | import regression 4 | import numpy as np 5 | from mpl_toolkits.mplot3d import Axes3D 6 | from matplotlib import cm 7 | import matplotlib.pyplot as plt 8 | import matplotlib.ticker as mtick 9 | 10 | if __name__ == "__main__": 11 | srcX, y = regression.loadDataSet('data/houses.txt') 12 | 13 | # 新建特征 14 | m,n= srcX.shape 15 | X = regression.normalize(srcX.copy()) 16 | X = np.concatenate((np.ones((m,1)), X), axis=1) 17 | 18 | rate = 1 19 | maxLoop = 50 20 | epsilon = 1 21 | 22 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 23 | theta,errors = result 24 | 25 | print 'theta is:' 26 | print theta 27 | print '........' 28 | 29 | # 预测价格 30 | normalizedSize = (1650-srcX[:,0].mean(0))/srcX[:,0].std(0) 31 | normalizedBr = (3-srcX[:,1].mean(0))/srcX[:,1].std(0) 32 | predicateX = np.matrix([[1, normalizedSize, normalizedBr]]) 33 | price = regression.h(theta, predicateX.T) 34 | print 'Predicted price of a 1650 sq-ft, 3 br house: $%.4f'%price 35 | print '........' 36 | 37 | # 打印拟合平面 38 | fittingFig = plt.figure(figsize=(16, 12)) 39 | title = 'polynomial with bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 40 | ax = fittingFig.add_subplot(111, projection='3d', title=title) 41 | 42 | xx = np.linspace(0,5000,25) 43 | yy = np.linspace(0,5,25) 44 | zz = np.zeros((25,25)) 45 | for i in range(25): 46 | for j in range(25): 47 | normalizedSize = (xx[i]-srcX[:,0].mean(0))/srcX[:,0].std(0) 48 | normalizedSize = (xx[i]-srcX[:,0].mean(0))/srcX[:,0].std(0) 49 | x = np.matrix([[1,normalizedSize, normalizedBr]]) 50 | zz[i,j] = regression.h(theta, x.T) 51 | xx, yy = np.meshgrid(xx,yy) 52 | ax.zaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 53 | ax.plot_surface(xx, yy, zz, rstride=1, cstride=1, cmap=cm.rainbow, alpha=0.1, antialiased=True) 54 | 55 | xs = srcX[:, 0].flatten().A[0] 56 | ys = srcX[:, 1].flatten().A[0] 57 | zs = y[:, 0].flatten().A[0] 58 | ax.scatter(xs, ys, zs, c='b', marker='o') 59 | 60 | ax.set_xlabel('sq-ft of room') 61 | ax.set_ylabel('bedrooms') 62 | ax.set_zlabel('price') 63 | 64 | plt.show() 65 | 66 | # 打印误差曲线 67 | errorsFig = plt.figure() 68 | ax = errorsFig.add_subplot(111) 69 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 70 | 71 | ax.plot(range(len(errors)), errors) 72 | ax.set_xlabel('Number of iterations') 73 | ax.set_ylabel('Cost J') 74 | 75 | plt.show() 76 | -------------------------------------------------------------------------------- /linear_regression/test_sgd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_sgd.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/ex1.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.01 17 | maxLoop = 100 18 | epsilon =0.01 19 | 20 | result, timeConsumed = regression.sgd(rate, maxLoop, epsilon, X, y) 21 | 22 | 
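    # Stochastic gradient descent updates theta from one training sample at a
    # time, theta := theta - alpha * (h(x(i)) - y(i)) * x(i), instead of summing
    # the error over all m samples per step as bgd does; the cost therefore
    # falls noisily rather than monotonically (see the error curve below).
    # A minimal single-sample update for reference (hypothetical helper, not
    # part of regression.py):
    def sgdStep(theta, xi, yi, alpha):
        # xi: 1 x n sample (np.matrix), theta: n x 1, yi: scalar label
        return theta - alpha * float(xi * theta - yi) * xi.T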
theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'sgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('Population of City in 10,000s') 36 | ax.set_ylabel('Profit in $10,000s') 37 | 38 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | 52 | # 绘制能量下降曲面 53 | size = 100 54 | theta0Vals = np.linspace(-10,10, size) 55 | theta1Vals = np.linspace(-2, 4, size) 56 | JVals = np.zeros((size, size)) 57 | for i in range(size): 58 | for j in range(size): 59 | col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]]) 60 | JVals[i,j] = regression.J(col, X, y) 61 | 62 | theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals) 63 | JVals = JVals.T 64 | contourSurf = plt.figure() 65 | ax = contourSurf.gca(projection='3d') 66 | 67 | ax.plot_surface(theta0Vals, theta1Vals, JVals, rstride=8, cstride=8, alpha=0.3, 68 | cmap=cm.rainbow, linewidth=0, antialiased=False) 69 | ax.plot(thetas[0], thetas[1], 'rx') 70 | ax.set_xlabel(r'$\theta_0$') 71 | ax.set_ylabel(r'$\theta_1$') 72 | ax.set_zlabel(r'$J(\theta)$') 73 | 74 | plt.show() 75 | 76 | # 绘制能量轮廓 77 | contourFig = plt.figure() 78 | ax = contourFig.add_subplot(111) 79 | ax.set_xlabel(r'$\theta_0$') 80 | ax.set_ylabel(r'$\theta_1$') 81 | 82 | CS = ax.contour(theta0Vals, theta1Vals, JVals, np.logspace(-2,3,20)) 83 | plt.clabel(CS, inline=1, fontsize=10) 84 | 85 | # 绘制最优解 86 | ax.plot(theta[0,0], theta[1,0], 'rx', markersize=10, linewidth=2) 87 | 88 | # 绘制梯度下降过程 89 | ax.plot(thetas[0], thetas[1], 'r', linewidth=1) 90 | 91 | plt.show() 92 | -------------------------------------------------------------------------------- /linear_regression/test_temperature_normal.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_temperature_normal.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/temperature.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.0001 17 | maxLoop = 1000 18 | epsilon =0.01 19 | 20 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 21 | 22 | theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('temperature') 36 | ax.set_ylabel('yield') 37 | 38 | plt.legend([trainingSet, fittingLine], 
['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | -------------------------------------------------------------------------------- /linear_regression/test_temperature_polynomial.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_temperature_polynomial.py 3 | import regression 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as mtick 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/temperature.txt'); 10 | 11 | m,n = srcX.shape 12 | srcX = np.concatenate((srcX[:, 0], np.power(srcX[:, 0],2)), axis=1) 13 | # 特征缩放 14 | X = regression.standardize(srcX.copy()) 15 | X = np.concatenate((np.ones((m,1)), X), axis=1) 16 | 17 | rate = 0.1 18 | maxLoop = 1000 19 | epsilon = 0.01 20 | 21 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 22 | theta, errors, thetas = result 23 | 24 | # 打印特征点 25 | fittingFig = plt.figure() 26 | title = 'polynomial with bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(srcX[:, 0].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | print theta 31 | 32 | # 打印拟合曲线 33 | xx = np.linspace(50,100,50) 34 | xx2 = np.power(xx,2) 35 | yHat = [] 36 | for i in range(50): 37 | normalizedSize = (xx[i]-xx.mean())/xx.std(0) 38 | normalizedSize2 = (xx2[i]-xx2.mean())/xx2.std(0) 39 | x = np.matrix([[1,normalizedSize, normalizedSize2]]) 40 | yHat.append(regression.h(theta, x.T)) 41 | fittingLine, = ax.plot(xx, yHat, color='g') 42 | 43 | ax.set_xlabel('temperature') 44 | ax.set_ylabel('yield') 45 | 46 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Polynomial Regression']) 47 | plt.show() 48 | 49 | # 打印误差曲线 50 | errorsFig = plt.figure() 51 | ax = errorsFig.add_subplot(111) 52 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 53 | 54 | ax.plot(range(len(errors)), errors) 55 | ax.set_xlabel('Number of iterations') 56 | ax.set_ylabel('Cost J') 57 | 58 | plt.show() 59 | -------------------------------------------------------------------------------- /logical_regression/data/ex3data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/logical_regression/data/ex3data1.mat -------------------------------------------------------------------------------- /logical_regression/data/linear.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 
25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /logical_regression/data/non_linear.txt: -------------------------------------------------------------------------------- 1 | 0.051267 0.69956 1 2 | -0.092742 0.68494 1 3 | -0.21371 0.69225 1 4 | -0.375 0.50219 1 5 | -0.51325 0.46564 1 6 | -0.52477 0.2098 1 7 | -0.39804 0.034357 1 8 | -0.30588 -0.19225 1 9 | 0.016705 -0.40424 1 10 | 0.13191 -0.51389 1 11 | 0.38537 -0.56506 1 12 | 0.52938 -0.5212 1 13 | 0.63882 -0.24342 1 14 | 0.73675 -0.18494 1 15 | 0.54666 0.48757 1 16 | 0.322 0.5826 1 17 | 0.16647 0.53874 1 18 | -0.046659 0.81652 1 19 | -0.17339 0.69956 1 20 | -0.47869 0.63377 1 21 | -0.60541 0.59722 1 22 | -0.62846 0.33406 1 23 | -0.59389 0.005117 1 24 | -0.42108 -0.27266 1 25 | -0.11578 -0.39693 1 26 | 0.20104 -0.60161 1 27 | 0.46601 -0.53582 1 28 | 0.67339 -0.53582 1 29 | -0.13882 0.54605 1 30 | -0.29435 0.77997 1 31 | -0.26555 0.96272 1 32 | -0.16187 0.8019 1 33 | -0.17339 0.64839 1 34 | -0.28283 0.47295 1 35 | -0.36348 0.31213 1 36 | -0.30012 0.027047 1 37 | -0.23675 -0.21418 1 38 | -0.06394 -0.18494 1 39 | 0.062788 -0.16301 1 40 | 0.22984 -0.41155 1 41 | 0.2932 -0.2288 1 42 | 0.48329 -0.18494 1 43 | 0.64459 -0.14108 1 44 | 0.46025 0.012427 1 45 | 0.6273 0.15863 1 46 | 0.57546 0.26827 1 47 | 0.72523 0.44371 1 48 | 0.22408 0.52412 1 49 | 0.44297 0.67032 1 50 | 0.322 0.69225 1 51 | 0.13767 0.57529 1 52 | -0.0063364 0.39985 1 53 | -0.092742 0.55336 1 54 | -0.20795 0.35599 1 55 | -0.20795 0.17325 1 56 | -0.43836 0.21711 1 57 | -0.21947 -0.016813 1 58 | -0.13882 -0.27266 1 
59 | 0.18376 0.93348 0 60 | 0.22408 0.77997 0 61 | 0.29896 0.61915 0 62 | 0.50634 0.75804 0 63 | 0.61578 0.7288 0 64 | 0.60426 0.59722 0 65 | 0.76555 0.50219 0 66 | 0.92684 0.3633 0 67 | 0.82316 0.27558 0 68 | 0.96141 0.085526 0 69 | 0.93836 0.012427 0 70 | 0.86348 -0.082602 0 71 | 0.89804 -0.20687 0 72 | 0.85196 -0.36769 0 73 | 0.82892 -0.5212 0 74 | 0.79435 -0.55775 0 75 | 0.59274 -0.7405 0 76 | 0.51786 -0.5943 0 77 | 0.46601 -0.41886 0 78 | 0.35081 -0.57968 0 79 | 0.28744 -0.76974 0 80 | 0.085829 -0.75512 0 81 | 0.14919 -0.57968 0 82 | -0.13306 -0.4481 0 83 | -0.40956 -0.41155 0 84 | -0.39228 -0.25804 0 85 | -0.74366 -0.25804 0 86 | -0.69758 0.041667 0 87 | -0.75518 0.2902 0 88 | -0.69758 0.68494 0 89 | -0.4038 0.70687 0 90 | -0.38076 0.91886 0 91 | -0.50749 0.90424 0 92 | -0.54781 0.70687 0 93 | 0.10311 0.77997 0 94 | 0.057028 0.91886 0 95 | -0.10426 0.99196 0 96 | -0.081221 1.1089 0 97 | 0.28744 1.087 0 98 | 0.39689 0.82383 0 99 | 0.63882 0.88962 0 100 | 0.82316 0.66301 0 101 | 0.67339 0.64108 0 102 | 1.0709 0.10015 0 103 | -0.046659 -0.57968 0 104 | -0.23675 -0.63816 0 105 | -0.15035 -0.36769 0 106 | -0.49021 -0.3019 0 107 | -0.46717 -0.13377 0 108 | -0.28859 -0.060673 0 109 | -0.61118 -0.067982 0 110 | -0.66302 -0.21418 0 111 | -0.59965 -0.41886 0 112 | -0.72638 -0.082602 0 113 | -0.83007 0.31213 0 114 | -0.72062 0.53874 0 115 | -0.59389 0.49488 0 116 | -0.48445 0.99927 0 117 | -0.0063364 0.99927 0 118 | 0.63265 -0.030612 0 119 | -------------------------------------------------------------------------------- /logical_regression/logical_regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/logical_regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | def exeTime(func): 8 | """耗时计算装饰器 9 | 10 | Args: 11 | func 待装饰函数 12 | Returns: 13 | newFunc 装饰后的函数 14 | """ 15 | def newFunc(*args, **args2): 16 | t0 = time.time() 17 | back = func(*args, **args2) 18 | return back, time.time() - t0 19 | return newFunc 20 | 21 | def loadDataSet(filename): 22 | """读取数据集 23 | 数据以TAB进行分割 24 | 25 | Args: 26 | filename 文件名 27 | Returns: 28 | X 训练样本集矩阵 29 | y 标签集矩阵 30 | """ 31 | numFeat = len(open(filename).readline().split('\t')) - 1 32 | X = [] 33 | y = [] 34 | file = open(filename) 35 | for line in file.readlines(): 36 | lineArr = [] 37 | curLine = line.strip().split('\t') 38 | for i in range(numFeat): 39 | lineArr.append(float(curLine[i])) 40 | X.append([1.0, float(lineArr[0]), float(lineArr[1])]) 41 | y.append(float(curLine[-1])) 42 | return np.mat(X), np.mat(y).T 43 | 44 | def sigmoid(z): 45 | """sigmoid函数 46 | """ 47 | return 1.0/(1.0+np.exp(-z)) 48 | 49 | def J(theta, X, y, theLambda=0): 50 | """预测代价函数 51 | """ 52 | m, n = X.shape 53 | h = sigmoid(X.dot(theta)) 54 | J = (-1.0/m)*(np.log(h).T.dot(y)+np.log(1-h).T.dot(1-y)) + (theLambda/(2.0*m))*np.sum(np.square(theta[1:])) 55 | if np.isnan(J[0]): 56 | return(np.inf) 57 | return J.flatten()[0,0] 58 | 59 | @exeTime 60 | def gradient(X, y, options): 61 | """随机梯度下降法 62 | Args: 63 | X 样本矩阵 64 | y 标签矩阵 65 | rate 学习率 66 | options.theLambda 正规参数 67 | options.maxLoop 最大迭代次数 68 | options.epsilon 收敛精度 69 | options.method 70 | - 'sgd' 随机梯度下降法 71 | - 'bgd' 批量梯度下降法 72 | Returns: 73 | (thetas, errors), timeConsumed 74 | """ 75 | m,n = X.shape 76 | # 初始化参数矩阵 77 | theta = np.ones((n,1)) 78 | count = 0 # 迭代次数 79 | # 初始化误差无限大 80 | error = float('inf') 81 | # 保存误差变化状况 82 | errors = [] 83 | # 保存参数的变化状况 84 | thetas = [] 85 | rate = options.get('rate', 
0.01) 86 | epsilon = options.get('epsilon', 0.1) 87 | maxLoop = options.get('maxLoop', 1000) 88 | theLambda = options.get('theLambda', 0) 89 | method = options['method'] 90 | def _sgd(theta): 91 | converged = False 92 | for i in range(maxLoop): 93 | if converged: 94 | break 95 | for j in range(m): 96 | h = sigmoid(X[j] *theta) 97 | diff = h - y[j] 98 | theta = theta - rate*(1.0/m)*X[j].T*diff 99 | error = J(theta, X, y) 100 | errors.append(error) 101 | if error < epsilon: 102 | converged = True 103 | break 104 | thetas.append(theta) 105 | return thetas, errors, i+1 106 | def _bgd(theta): 107 | for i in range(maxLoop): 108 | h = sigmoid(X.dot(theta)) 109 | diff = h - y 110 | # theta0 should not be regularized 111 | theta = theta - rate*((1.0/m)*X.T*diff + (theLambda/m)*np.r_[[[0]], theta[1:]]) 112 | error = J(theta, X, y, theLambda) 113 | errors.append(error) 114 | if error < epsilon: 115 | break 116 | thetas.append(theta) 117 | return thetas, errors, i+1 118 | methods = { 119 | 'sgd': _sgd, 120 | 'bgd': _bgd 121 | } 122 | return methods[method](theta) 123 | 124 | def oneVsAll(X, y, options): 125 | """One-vs-All 多分类 126 | 127 | Args: 128 | X 样本 129 | y 标签 130 | options 训练配置 131 | Returns: 132 | Thetas 权值矩阵 133 | """ 134 | # 类型数 135 | classes = set(np.ravel(y)) 136 | # 决策边界矩阵 137 | Thetas = np.zeros((len(classes), X.shape[1])) 138 | # 一次选定每种分类对应的样本为正样本,其他样本标识为负样本,进行逻辑回归 139 | for idx, c in enumerate(classes): 140 | newY = np.zeros(y.shape) 141 | newY[np.where(y == c)] = 1 142 | result, timeConsumed = gradient(X, newY, options) 143 | thetas,errors,iterations = result 144 | Thetas[idx] = thetas[-1].ravel() 145 | return Thetas 146 | 147 | def predictOneVsAll(X,Thetas): 148 | """One-vs-All下的多分类预测 149 | 150 | Args: 151 | X 样本 152 | Thetas 权值矩阵 153 | Returns: 154 | H 预测结果 155 | """ 156 | H = sigmoid(Thetas * X.T) 157 | return H 158 | -------------------------------------------------------------------------------- /logical_regression/test_linear_boundry.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_linear_boundry.py 3 | import numpy as np 4 | import logical_regression as regression 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as mtick 7 | 8 | if __name__ == "__main__": 9 | X, y = regression.loadDataSet('data/linear.txt') 10 | m, n = X.shape 11 | options = [{ 12 | 'rate': 0.1, 13 | 'epsilon': 0.01, 14 | 'maxLoop': 500, 15 | 'method': 'bgd' 16 | },{ 17 | 'rate': 1, 18 | 'epsilon': 0.01, 19 | 'maxLoop': 200, 20 | 'method': 'sgd' 21 | }] 22 | for option in options: 23 | result, timeConsumed = regression.gradient(X, y, option) 24 | thetas, errors, iterationCount = result 25 | theta = thetas[-1] 26 | print theta, errors[-1], iterationCount 27 | # 绘制数据点 28 | fittingFig = plt.figure() 29 | title = '%s: rate=%.2f, iterationCount=%d, error=%.2f \n time: %.2fs' % ( 30 | option['method'], option['rate'], iterationCount, errors[-1], timeConsumed) 31 | ax = fittingFig.add_subplot(111, title=title) 32 | ax.set_xlabel('X1') 33 | ax.set_ylabel('X2') 34 | for i in range(m): 35 | x = X[i].A[0] 36 | if y[i] == 1: 37 | ax.scatter(x[1], x[2], marker='*', color='black', s=50) 38 | else: 39 | ax.scatter(x[1], x[2], marker='o', color='green', s=50) 40 | # 绘制决策边界 41 | x1Min = X[:, 1].min() 42 | x1Max = X[:, 1].max() 43 | x2Min = X[:, 2].min() 44 | x2Max = X[:, 2].max() 45 | xx1, xx2 = np.meshgrid(np.linspace(x1Min, x1Max), 46 | np.linspace(x2Min, x2Max)) 47 | h = 
regression.sigmoid(np.c_[np.ones((xx1.ravel().shape[0],1)), xx1.ravel(), xx2.ravel()].dot(theta)) 48 | h = h.reshape(xx1.shape) 49 | plt.contour(xx1, xx2, h, [0.5], colors='b', linewidth=.5) 50 | plt.show() 51 | 52 | # 绘制误差曲线 53 | errorsFig = plt.figure() 54 | ax = errorsFig.add_subplot(111) 55 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 56 | 57 | ax.plot(range(len(errors)), errors) 58 | ax.set_xlabel('Number of iterations') 59 | ax.set_ylabel('Cost J') 60 | plt.show() 61 | 62 | # 绘制theta的变化情况 63 | thetasFig, ax = plt.subplots(len(thetas[0])) 64 | thetas = np.asarray(thetas) 65 | for idx, sp in enumerate(ax): 66 | thetaList = thetas[:, idx] 67 | sp.plot(range(len(thetaList)), thetaList) 68 | sp.set_xlabel('Number of iteration') 69 | sp.set_ylabel(r'$\theta_%d$'%idx) 70 | plt.show() 71 | -------------------------------------------------------------------------------- /logical_regression/test_non_linear_boundry.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_non_linear_boundry.py 3 | import numpy as np 4 | import logical_regression as regression 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as mtick 7 | from sklearn.preprocessing import PolynomialFeatures 8 | 9 | if __name__ == "__main__": 10 | X, y = regression.loadDataSet('data/non_linear.txt') 11 | poly = PolynomialFeatures(6) 12 | XX = poly.fit_transform(X[:,1:3]) 13 | m, n = XX.shape 14 | options = [{ 15 | 'rate': 1, 16 | 'epsilon': 0.01, 17 | 'theLambda': theLambda, 18 | 'maxLoop': 3000, 19 | 'method': 'bgd' 20 | } for theLambda in [0, 1.0, 100.0]] 21 | figures, axes = plt.subplots(1,3, sharey = True, figsize=(17,5)) 22 | for idx, option in enumerate(options): 23 | result, timeConsumed = regression.gradient(XX, y, option) 24 | thetas, errors, iterationCount = result 25 | theta = thetas[-1] 26 | print theta, errors[-1], iterationCount 27 | ax = axes[idx] 28 | # 绘制数据点 29 | title = '%s: rate=%.2f, iterationCount=%d, \n theLambda=%d, \n error=%.2f time: %.2fs' % ( 30 | option['method'], option['rate'], iterationCount, option['theLambda'], errors[-1], timeConsumed) 31 | ax.set_title(title) 32 | ax.set_xlabel('X1') 33 | ax.set_ylabel('X2') 34 | for i in range(m): 35 | x = X[i].A[0] 36 | if y[i] == 1: 37 | ax.scatter(x[1], x[2], marker='*', color='black', s=50) 38 | else: 39 | ax.scatter(x[1], x[2], marker='o', color='green', s=50) 40 | # 绘制决策边界 41 | x1Min = X[:, 1].min() 42 | x1Max = X[:, 1].max() 43 | x2Min = X[:, 2].min() 44 | x2Max = X[:, 2].max() 45 | xx1, xx2 = np.meshgrid(np.linspace(x1Min, x1Max), 46 | np.linspace(x2Min, x2Max)) 47 | h = regression.sigmoid(poly.fit_transform(np.c_[xx1.ravel(), xx2.ravel()]).dot(theta)) 48 | h = h.reshape(xx1.shape) 49 | ax.contour(xx1, xx2, h, [0.5], colors='b', linewidth=.5) 50 | plt.show() 51 | -------------------------------------------------------------------------------- /logical_regression/test_onevsall.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_onevsall.py 3 | """OneVsAll 多分类测试 4 | """ 5 | import numpy as np 6 | import logical_regression as regression 7 | from scipy.io import loadmat 8 | 9 | if __name__ == "__main__": 10 | data = loadmat('data/ex3data1.mat') 11 | X = np.mat(data['X']) 12 | y = np.mat(data['y']) 13 | # 为X添加偏置 14 | X = np.append(np.ones((X.shape[0], 1)), X, axis=1) 15 | # 采用批量梯度下降法 16 | options = { 17 | 'rate': 0.1, 18 | 'epsilon': 0.1, 19 | 'maxLoop': 5000, 20 | 
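        # one binary 'bgd' classifier is trained per digit class (10 in total);
        # predictOneVsAll then takes the argmax over the 10 sigmoid outputs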
'method': 'bgd' 21 | } 22 | # 训练 23 | Thetas = regression.oneVsAll(X,y,options) 24 | # 预测 25 | H = regression.predictOneVsAll(X, Thetas) 26 | pred = np.argmax(H,axis=0)+1 27 | # 计算准确率 28 | print 'Training accuracy is: %.2f%'%(np.mean(pred == y.ravel())*100) 29 | -------------------------------------------------------------------------------- /neural_network/data/ex4weights.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/neural_network/data/ex4weights.mat -------------------------------------------------------------------------------- /neural_network/data/handwritten_digits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/neural_network/data/handwritten_digits.mat -------------------------------------------------------------------------------- /neural_network/nn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/nn.py 3 | import numpy as np 4 | from scipy.optimize import minimize 5 | from scipy import stats 6 | 7 | def sigmoid(z): 8 | """sigmoid 9 | """ 10 | return 1 / (1 + np.exp(-z)) 11 | 12 | def sigmoidDerivative(a): 13 | """sigmoid求导 14 | """ 15 | return np.multiply(a, (1-a)) 16 | 17 | def initThetas(hiddenNum, unitNum, inputSize, classNum, epsilon): 18 | """初始化权值矩阵 19 | 20 | Args: 21 | hiddenNum 隐层数目 22 | unitNum 每个隐层的神经元数目 23 | inputSize 输入层规模 24 | classNum 分类数目 25 | epsilon epsilon 26 | Returns: 27 | Thetas 权值矩阵序列 28 | """ 29 | hiddens = [unitNum for i in range(hiddenNum)] 30 | units = [inputSize] + hiddens + [classNum] 31 | Thetas = [] 32 | for idx, unit in enumerate(units): 33 | if idx == len(units) - 1: 34 | break 35 | nextUnit = units[idx + 1] 36 | # 考虑偏置 37 | Theta = np.random.rand(nextUnit, unit + 1) * 2 * epsilon - epsilon 38 | Thetas.append(Theta) 39 | return Thetas 40 | 41 | def computeCost(Thetas, y, theLambda, X=None, a=None): 42 | """计算代价 43 | 44 | Args: 45 | Thetas 权值矩阵序列 46 | X 样本 47 | y 标签集 48 | a 各层激活值 49 | Returns: 50 | J 预测代价 51 | """ 52 | m = y.shape[0] 53 | if a is None: 54 | a = fp(Thetas, X) 55 | error = -np.sum(np.multiply(y.T,np.log(a[-1]))+np.multiply((1-y).T, np.log(1-a[-1]))) 56 | # 正规化参数 57 | reg = -np.sum([np.sum(Theta[:, 1:]) for Theta in Thetas]) 58 | return (1.0 / m) * error + (1.0 / (2 * m)) * theLambda * reg 59 | 60 | def gradientCheck(Thetas,X,y,theLambda): 61 | """梯度校验 62 | 63 | Args: 64 | Thetas 权值矩阵 65 | X 样本 66 | y 标签 67 | theLambda 正规化参数 68 | Returns: 69 | checked 是否检测通过 70 | """ 71 | m, n = X.shape 72 | # 前向传播计算各个神经元的激活值 73 | a = fp(Thetas, X) 74 | # 反向传播计算梯度增量 75 | D = bp(Thetas, a, y, theLambda) 76 | # 计算预测代价 77 | J = computeCost(Thetas, y, theLambda, a=a) 78 | DVec = unroll(D) 79 | # 求梯度近似 80 | epsilon = 1e-4 81 | gradApprox = np.zeros(DVec.shape) 82 | ThetaVec = unroll(Thetas) 83 | shapes = [Theta.shape for Theta in Thetas] 84 | for i,item in enumerate(ThetaVec): 85 | ThetaVec[i] = item - epsilon 86 | JMinus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) 87 | ThetaVec[i] = item + epsilon 88 | JPlus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) 89 | gradApprox[i] = (JPlus-JMinus) / (2*epsilon) 90 | # 用欧氏距离表示近似程度 91 | diff = np.linalg.norm(gradApprox - DVec) 92 | if diff < 1e-2: 93 | return True 94 | else: 95 | return False 96 | 97 | def adjustLabels(y): 98 | """校正分类标签 99 | 100 | Args: 101 | y 
标签集 102 | Returns: 103 | yAdjusted 校正后的标签集 104 | """ 105 | # 保证标签对类型的标识是逻辑标识 106 | if y.shape[1] == 1: 107 | classes = set(np.ravel(y)) 108 | classNum = len(classes) 109 | minClass = min(classes) 110 | if classNum > 2: 111 | yAdjusted = np.zeros((y.shape[0], classNum), np.float64) 112 | for row, label in enumerate(y): 113 | yAdjusted[row, label - minClass] = 1 114 | else: 115 | yAdjusted = np.zeros((y.shape[0], 1), np.float64) 116 | for row, label in enumerate(y): 117 | if label != minClass: 118 | yAdjusted[row, 0] = 1.0 119 | return yAdjusted 120 | return y 121 | 122 | 123 | def unroll(matrixes): 124 | """参数展开 125 | 126 | Args: 127 | matrixes 矩阵 128 | Return: 129 | vec 向量 130 | """ 131 | vec = [] 132 | for matrix in matrixes: 133 | vector = matrix.reshape(1, -1)[0] 134 | vec = np.concatenate((vec, vector)) 135 | return vec 136 | 137 | 138 | def roll(vector, shapes): 139 | """参数恢复 140 | 141 | Args: 142 | vector 向量 143 | shapes shape list 144 | Returns: 145 | matrixes 恢复的矩阵序列 146 | """ 147 | matrixes = [] 148 | begin = 0 149 | for shape in shapes: 150 | end = begin + shape[0] * shape[1] 151 | matrix = vector[begin:end].reshape(shape) 152 | begin = end 153 | matrixes.append(matrix) 154 | return matrixes 155 | 156 | 157 | def fp(Thetas, X): 158 | """前向反馈过程 159 | 160 | Args: 161 | Thetas 权值矩阵 162 | X 输入样本 163 | Returns: 164 | a 各层激活向量 165 | """ 166 | layers = range(len(Thetas) + 1) 167 | layerNum = len(layers) 168 | # 激活向量序列 169 | a = range(layerNum) 170 | # 前向传播计算各层输出 171 | for l in layers: 172 | if l == 0: 173 | a[l] = X.T 174 | else: 175 | z = Thetas[l - 1] * a[l - 1] 176 | a[l] = sigmoid(z) 177 | # 除输出层外,需要添加偏置 178 | if l != layerNum - 1: 179 | a[l] = np.concatenate((np.ones((1, a[l].shape[1])), a[l])) 180 | return a 181 | 182 | 183 | def bp(Thetas, a, y, theLambda): 184 | """反向传播过程 185 | 186 | Args: 187 | a 激活值 188 | y 标签 189 | Returns: 190 | D 权值梯度 191 | """ 192 | m = y.shape[0] 193 | layers = range(len(Thetas) + 1) 194 | layerNum = len(layers) 195 | d = range(len(layers)) 196 | delta = [np.zeros(Theta.shape) for Theta in Thetas] 197 | for l in layers[::-1]: 198 | if l == 0: 199 | # 输入层不计算误差 200 | break 201 | if l == layerNum - 1: 202 | # 输出层误差 203 | d[l] = a[l] - y.T 204 | else: 205 | # 忽略偏置 206 | d[l] = np.multiply((Thetas[l][:,1:].T * d[l + 1]), sigmoidDerivative(a[l][1:, :])) 207 | for l in layers[0:layerNum - 1]: 208 | delta[l] = d[l + 1] * (a[l].T) 209 | D = [np.zeros(Theta.shape) for Theta in Thetas] 210 | for l in range(len(Thetas)): 211 | Theta = Thetas[l] 212 | # 偏置更新增量 213 | D[l][:, 0] = (1.0 / m) * (delta[l][0:, 0].reshape(1, -1)) 214 | # 权值更新增量 215 | D[l][:, 1:] = (1.0 / m) * (delta[l][0:, 1:] + 216 | theLambda * Theta[:, 1:]) 217 | return D 218 | 219 | def updateThetas(m, Thetas, D, alpha, theLambda): 220 | """更新权值 221 | 222 | Args: 223 | m 样本数 224 | Thetas 各层权值矩阵 225 | D 梯度 226 | alpha 学习率 227 | theLambda 正规化参数 228 | Returns: 229 | Thetas 更新后的权值矩阵 230 | """ 231 | for l in range(len(Thetas)): 232 | Thetas[l] = Thetas[l] - alpha * D[l] 233 | return Thetas 234 | 235 | 236 | def gradientDescent(Thetas, X, y, alpha, theLambda): 237 | """梯度下降 238 | 239 | Args: 240 | X 样本 241 | y 标签 242 | alpha 学习率 243 | theLambda 正规化参数 244 | Returns: 245 | J 预测代价 246 | Thetas 更新后的各层权值矩阵 247 | """ 248 | # 样本数,特征数 249 | m, n = X.shape 250 | # 前向传播计算各个神经元的激活值 251 | a = fp(Thetas, X) 252 | # 反向传播计算梯度增量 253 | D = bp(Thetas, a, y, theLambda) 254 | # 计算预测代价 255 | J = computeCost(Thetas,y,theLambda,a=a) 256 | # 更新权值 257 | Thetas = updateThetas(m, Thetas, D, alpha, theLambda) 258 | if np.isnan(J): 259 | 
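        # a NaN cost (log(0) from a saturated sigmoid) is reported as np.inf,
        # which the train() loop below treats as divergence and aborts on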
J = np.inf 260 | return J, Thetas 261 | 262 | def train(X, y, Thetas=None, hiddenNum=0, unitNum=5, epsilon=1, alpha=1, theLambda=0, precision=0.01, maxIters=50): 263 | """网络训练 264 | 265 | Args: 266 | X 训练样本 267 | y 标签集 268 | Thetas 初始化的Thetas,如果为None,由系统随机初始化Thetas 269 | hiddenNum 隐藏层数目 270 | unitNum 隐藏层的单元数 271 | epsilon 初始化权值的范围[-epsilon, epsilon] 272 | alpha 学习率 273 | theLambda 正规化参数 274 | precision 误差精度 275 | maxIters 最大迭代次数 276 | """ 277 | # 样本数,特征数 278 | m, n = X.shape 279 | # 矫正标签集 280 | y = adjustLabels(y) 281 | classNum = y.shape[1] 282 | # 初始化Theta 283 | if Thetas is None: 284 | Thetas = initThetas( 285 | inputSize=n, 286 | hiddenNum=hiddenNum, 287 | unitNum=unitNum, 288 | classNum=classNum, 289 | epsilon=epsilon 290 | ) 291 | # 先进性梯度校验 292 | print 'Doing Gradient Checking....' 293 | checked = gradientCheck(Thetas, X, y, theLambda) 294 | if checked: 295 | for i in range(maxIters): 296 | error, Thetas = gradientDescent( 297 | Thetas, X, y, alpha=alpha, theLambda=theLambda) 298 | if error < precision: 299 | break 300 | if error == np.inf: 301 | break 302 | if error < precision: 303 | success = True 304 | else: 305 | success = False 306 | return { 307 | 'error': error, 308 | 'Thetas': Thetas, 309 | 'iters': i, 310 | 'success': error 311 | } 312 | else: 313 | print 'Error: Gradient Cheching Failed!!!' 314 | return { 315 | 'error': None, 316 | 'Thetas': None, 317 | 'iters': 0, 318 | 'success': False 319 | } 320 | 321 | def predict(X, Thetas): 322 | """预测函数 323 | 324 | Args: 325 | X: 样本 326 | Thetas: 训练后得到的参数 327 | Return: 328 | a 329 | """ 330 | a = fp(Thetas,X) 331 | return a[-1] 332 | -------------------------------------------------------------------------------- /neural_network/test_handwritten_digits.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/test_handwritten_digits.py 3 | """手写字符集 4 | """ 5 | import nn 6 | import numpy as np 7 | from sklearn import datasets 8 | from scipy.io import loadmat 9 | 10 | # digits = datasets.load_digits() 11 | # 12 | # 13 | # X = digits.images.reshape((len(digits.images), -1)) 14 | # y = digits.target.reshape(-1, 1) 15 | 16 | data = loadmat('data/handwritten_digits.mat') 17 | Thetas = loadmat('data/ex4weights.mat') 18 | Thetas = [Thetas['Theta1'], Thetas['Theta2']] 19 | 20 | 21 | X = np.mat(data['X']) 22 | y = np.mat(data['y']) 23 | 24 | res = nn.train(X,y,hiddenNum=1,unitNum=25,Thetas=Thetas, precision = 0.5) 25 | print 'Error is: %.4f'%res['error'] 26 | -------------------------------------------------------------------------------- /neural_network/test_logic_and.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/test_logic_and.py 3 | """逻辑AND运算 4 | """ 5 | import nn 6 | import numpy as np 7 | 8 | data = np.mat([ 9 | [0, 0, 0], 10 | [1, 0, 0], 11 | [0, 1, 0], 12 | [1, 1, 1] 13 | ]) 14 | 15 | X = data[:, 0:2] 16 | y = data[:, 2] 17 | 18 | res = nn.train(X, y, hiddenNum=0, alpha=10, maxIters=5000, precision=0.01) 19 | print 'Run %d iterations'%res['iters'] 20 | print 'Error is: %.4f'%res['error'] 21 | print 'Theta is: ', res['Thetas'][0] 22 | -------------------------------------------------------------------------------- /pca/data/bird_small.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/bird_small.mat 
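bird_small.mat stores a small RGB image as a height x width x 3 array A; test_pca4visualization.py (below) flattens it so that every pixel becomes one 3-feature sample for K-Means. A minimal sketch of that flattening, plus an illustrative 16-color reconstruction that is not part of the repo's test scripts (assumes the kMeans defined in pca/kmeans.py below):

import numpy as np
from scipy.io import loadmat
import kmeans

A = loadmat('data/bird_small.mat')['A'] / 255.0
h, w, c = A.shape
X = np.mat(A.reshape(h * w, c))               # one row per pixel, features = (R, G, B)
centroids, assignment = kmeans.kMeans(X, 16)  # cluster pixels into 16 colors
idx = assignment[:, 0].A.ravel().astype(int)  # cluster id of every pixel
compressed = np.asarray(centroids)[idx].reshape(h, w, c)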
-------------------------------------------------------------------------------- /pca/data/ex7data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7data1.mat -------------------------------------------------------------------------------- /pca/data/ex7data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7data2.mat -------------------------------------------------------------------------------- /pca/data/ex7faces.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7faces.mat -------------------------------------------------------------------------------- /pca/kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # svm/kmeans.py 3 | import numpy as np 4 | 5 | def loadDataSet(filename): 6 | """ 7 | 读取数据集 8 | 9 | Args: 10 | filename: 文件名 11 | Returns: 12 | dataMat: 数据样本矩阵 13 | """ 14 | dataMat = [] 15 | fr = open(filename) 16 | for line in fr.readlines(): 17 | curLine = line.strip().split('\t') 18 | # 通过map函数批量转换 19 | fitLine = map(float, curLine) 20 | dataMat.append(fitLine) 21 | return dataMat 22 | 23 | def distEclud(vecA, vecB): 24 | """ 25 | 计算两向量的欧氏距离 26 | 27 | Args: 28 | vecA: 向量A 29 | vecB: 向量B 30 | Returns: 31 | 欧式距离 32 | """ 33 | return np.sqrt(np.sum(np.power(vecA - vecB, 2))) 34 | 35 | def randCent(dataSet, k): 36 | """ 37 | 随机生成k个聚类中心 38 | 39 | Args: 40 | dataSet: 数据集 41 | k: 簇数目 42 | Returns: 43 | centroids: 聚类中心矩阵 44 | """ 45 | _, n = dataSet.shape 46 | centroids = np.mat(np.zeros((k, n))) 47 | for j in range(n): 48 | # 随机聚类中心落在数据集的边界之内 49 | minJ = np.min(dataSet[:, j]) 50 | maxJ = np.max(dataSet[:, j]) 51 | rangeJ = float(maxJ - minJ) 52 | centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1) 53 | return centroids 54 | 55 | def kMeans(dataSet, k, maxIter = 5): 56 | """ 57 | K-Means 58 | 59 | Args: 60 | dataSet: 数据集 61 | k: 聚类数 62 | Returns: 63 | centroids: 聚类中心 64 | clusterAssment: 点分配结果 65 | """ 66 | # 随机初始化聚类中心 67 | centroids = randCent(dataSet, k) 68 | m, n = np.shape(dataSet) 69 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 70 | clusterAssment = np.mat(np.zeros((m, 2))) 71 | # 标识聚类中心是否仍在改变 72 | clusterChanged = True 73 | # 直至聚类中心不再变化 74 | iterCount = 0 75 | while clusterChanged and iterCount < maxIter: 76 | iterCount += 1 77 | clusterChanged = False 78 | # 分配样本到簇 79 | for i in range(m): 80 | # 计算第i个样本到各个聚类中心的距离 81 | minIndex = 0 82 | minDist = np.inf 83 | for j in range(k): 84 | dist = distEclud(dataSet[i, :], centroids[j, :]) 85 | if(dist < minDist): 86 | minIndex = j 87 | minDist = dist 88 | # 判断cluster是否改变 89 | if(clusterAssment[i, 0] != minIndex): 90 | clusterChanged = True 91 | clusterAssment[i, :] = minIndex, minDist**2 92 | # 刷新聚类中心: 移动聚类中心到所在簇的均值位置 93 | for cent in range(k): 94 | # 通过数组过滤获得簇中的点 95 | ptsInCluster = dataSet[np.nonzero( 96 | clusterAssment[:, 0].A == cent)[0]] 97 | if ptsInCluster.shape[0] > 0: 98 | # 计算均值并移动 99 | centroids[cent, :] = np.mean(ptsInCluster, axis=0) 100 | return centroids, clusterAssment 101 | 102 | def biKmeans(dataSet, k): 103 | """ 104 | 二分kmeans算法 105 | Args: 106 | dataSet: 数据集 107 | k: 聚类数 108 | Returns: 109 | centroids: 聚类中心 110 | clusterAssment: 点分配结果 
111 | """ 112 | m, n = np.shape(dataSet) 113 | # 起始时,只有一个簇,该簇的聚类中心为所有样本的平均位置 114 | centroid0 = np.mean(dataSet, axis=0).tolist()[0] 115 | # 设置一个列表保存当前的聚类中心 116 | currentCentroids = [centroid0] 117 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 118 | clusterAssment = np.mat(np.zeros((m, 2))) 119 | # 初始化点分配结果,默认将所有样本先分配到初始簇 120 | for j in range(m): 121 | clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2 122 | # 直到簇的数目达标 123 | while len(currentCentroids) < k: 124 | # 当前最小的代价 125 | lowestError = np.inf 126 | # 对于每一个簇 127 | for j in range(len(currentCentroids)): 128 | # 获得该簇的样本 129 | ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :] 130 | # 在该簇上进行2-means聚类 131 | # 注意,得到的centroids,其聚类编号含0,1 132 | centroids, clusterAss = kMeans(ptsInCluster, 2) 133 | # 获得划分后的误差之和 134 | splitedError = np.sum(clusterAss[:, 1]) 135 | # 获得其他簇的样本 136 | ptsNoInCluster = dataSet[np.nonzero( 137 | clusterAssment[:, 0].A != j)[0]] 138 | # 获得剩余数据集的误差 139 | nonSplitedError = np.sum(ptsNoInCluster[:, 1]) 140 | # 比较,判断此次划分是否划算 141 | if (splitedError + nonSplitedError) < lowestError: 142 | # 如果划算,刷新总误差 143 | lowestError = splitedError + nonSplitedError 144 | # 记录当前的应当划分的簇 145 | needToSplit = j 146 | # 新获得的簇以及点分配结果 147 | newCentroids = centroids.A 148 | newClusterAss = clusterAss.copy() 149 | # 更新簇的分配结果 150 | # 第0簇应当修正为被划分的簇 151 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[ 152 | 0], 0] = needToSplit 153 | # 第1簇应当修正为最新一簇 154 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[ 155 | 0], 0] = len(currentCentroids) 156 | # 被划分的簇需要更新 157 | currentCentroids[needToSplit] = newCentroids[0, :] 158 | # 加入新的划分后的簇 159 | currentCentroids.append(newCentroids[1, :]) 160 | # 刷新点分配结果 161 | clusterAssment[np.nonzero( 162 | clusterAssment[:, 0].A == needToSplit 163 | )[0], :] = newClusterAss 164 | return np.mat(currentCentroids), clusterAssment 165 | -------------------------------------------------------------------------------- /pca/pca.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/pca.py 3 | 4 | import numpy as np 5 | 6 | def normalize(X): 7 | """数据标准化处理 8 | 9 | Args: 10 | X 样本 11 | Returns: 12 | XNorm 标准化后的样本 13 | """ 14 | XNorm = X.copy() 15 | m,n = XNorm.shape 16 | mean = np.mean(XNorm, axis=0) 17 | std = np.std(XNorm, axis=0) 18 | XNorm = (XNorm - mean) / std 19 | return XNorm 20 | 21 | def PCA(X, k = 1): 22 | """PCA 23 | 24 | Args: 25 | X 样本 26 | k 目的维度 27 | Returns: 28 | XNorm 标准化后的样本 29 | Z 降维后的新样本 30 | U U 31 | UReduce UReduce 32 | S S 33 | V V 34 | """ 35 | m, n = X.shape 36 | # 数据归一化 37 | XNorm = normalize(X) 38 | # 计算协方差矩阵 39 | Coef = XNorm.T * XNorm/m 40 | # 奇异值分解 41 | U, S, V = np.linalg.svd(Coef) 42 | # 取出前 k 个向量 43 | UReduce = U[:, 0:k] 44 | Z = XNorm * UReduce 45 | return XNorm, Z, U, UReduce, S, V 46 | 47 | def recover(UReduce, Z): 48 | """数据恢复 49 | 50 | Args: 51 | UReduce UReduce 52 | Z 降维后的样本 53 | """ 54 | return Z * UReduce.T 55 | -------------------------------------------------------------------------------- /pca/test_pca4performance.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/test_pca4visualization.py 3 | 4 | import pca 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from scipy.io import loadmat 8 | 9 | def display(images, width, height): 10 | """展示图片 11 | 12 | Args: 13 | images 图像样本 14 | width 图像宽 15 | height 图像高 16 | """ 17 | m, n = images.shape 18 | rows = int(np.floor(np.sqrt(m))) 19 | cols = int(np.ceil(m / rows)) 20 | 
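    # the m images are tiled into a rows x cols mosaic: image idx = cols*i + j
    # fills rows [i*height, (i+1)*height) and cols [j*width, (j+1)*width)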
# 图像拼接 21 | dstImage = images.copy() 22 | dstImage = np.zeros((rows * height, cols * width)) 23 | for i in range(rows): 24 | for j in range(cols): 25 | idx = cols * i + j 26 | image = images[idx].reshape(height, width) 27 | dstImage[i * height:i * height + height, 28 | j * width: j * width + width] = image 29 | plt.imshow(dstImage.T, cmap='gray') 30 | plt.axis('off') 31 | plt.show() 32 | 33 | data = loadmat('data/ex7faces.mat') 34 | X = np.mat(data['X'],dtype=np.float32) 35 | m, n = X.shape 36 | 37 | # 展示原图 38 | display(X[0:100, :], 32, 32) 39 | 40 | XNorm, Z, U, UReduce, S, V = pca.PCA(X, k=100) 41 | XRec = pca.recover(UReduce, Z) 42 | 43 | # 显示修复后的图,可以看出,PCA 损失了一部分细节 44 | display(XRec[0:100, :], 32, 32) 45 | -------------------------------------------------------------------------------- /pca/test_pca4visualization.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/test_pca4visualization.py 3 | 4 | import numpy as np 5 | import kmeans 6 | import pca 7 | from scipy.io import loadmat 8 | from mpl_toolkits.mplot3d import Axes3D 9 | import matplotlib.pyplot as plt 10 | import matplotlib.cm as cmx 11 | import matplotlib.colors as colors 12 | 13 | def getCmap(count): 14 | color_norm = colors.Normalize(vmin=0, vmax=count-1) 15 | scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') 16 | def map_index_to_rgb_color(index): 17 | return scalar_map.to_rgba(index) 18 | return map_index_to_rgb_color 19 | 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111, projection='3d') 22 | 23 | data = loadmat('data/bird_small.mat') 24 | A = data['A'] 25 | 26 | A = A / 255.0; 27 | 28 | height, width, channels = A.shape 29 | X = np.mat(A.reshape(height * width, channels)) 30 | 31 | m, n = X.shape 32 | 33 | clusterNum = 16 34 | cmap = getCmap(clusterNum) 35 | centroids, clusterAssment = kmeans.kMeans(X, clusterNum) 36 | # 随机选择 1000 个样本绘制 37 | sampleSize = 1000 38 | sampleIndexs = np.random.choice(m, sampleSize) 39 | clusters = clusterAssment[sampleIndexs] 40 | samples = X[sampleIndexs] 41 | 42 | # 三维下观察 43 | for i in range(sampleSize): 44 | x, y, z = samples[i,:].A[0] 45 | center = clusters[i, 0] 46 | color = cmap(center) 47 | ax.scatter([x], [y], [z], color=color, marker='o') 48 | plt.show() 49 | 50 | # 二维下观察 51 | reducedSamples = pca.PCA(samples, k=2)[1] 52 | for i in range(sampleSize): 53 | x, y = reducedSamples[i,:].A[0] 54 | center = clusters[i, 0] 55 | color = cmap(center) 56 | plt.scatter([x], [y], color=color, marker='o') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /recommender_system/data/ex8_movieParams.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/ex8_movieParams.mat -------------------------------------------------------------------------------- /recommender_system/data/ex8_movies.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/ex8_movies.mat -------------------------------------------------------------------------------- /recommender_system/data/movie_ids.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/movie_ids.txt -------------------------------------------------------------------------------- /recommender_system/recommender.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # recommender_system/recommender.py 3 | import numpy as np 4 | from scipy.optimize import minimize, check_grad 5 | from pydash import py_ 6 | 7 | def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100): 8 | """训练方法 9 | 10 | Args: 11 | Y 评价矩阵 12 | R 是否评价矩阵 13 | params 是否具有初始化参数 14 | n 商品特征数 15 | theLambda 正规化参数 16 | maxIter 最大迭代次数 17 | Returns: 18 | train 训练方法 19 | predict 预测方法 20 | """ 21 | 22 | # 商品数,用户数 23 | nm, nu = Y.shape 24 | 25 | # normalize YMean 26 | mu = np.mean(Y, axis=0) 27 | mu = np.zeros((Y.shape[0], 1), dtype=np.float) 28 | for i in range(nm): 29 | totalRates = np.sum(Y[i]) 30 | validCount = len(np.nonzero(R[i])[0]) 31 | mu[i] = totalRates / validCount 32 | Y = Y - mu 33 | 34 | def unroll(Theta, X): 35 | """参数折叠 36 | 37 | Args: 38 | Theta 用户偏好矩阵 39 | X 商品内容矩阵 40 | Returns: 41 | vector 折叠后的参数 42 | """ 43 | 44 | return np.hstack((X.A.T.flatten(), Theta.A.T.flatten())) 45 | 46 | def roll(vector): 47 | """参数回复 48 | 49 | Args: 50 | vector 参数向量 51 | Returns: 52 | Theta 用户偏好矩阵 53 | X 商品内容矩阵 54 | """ 55 | X = np.mat(vector[:nm * n].reshape(n, nm).T) 56 | Theta = np.mat(vector[nm * n:].reshape(n, nu).T) 57 | return Theta, X 58 | 59 | def initParams(): 60 | """初始化参数 61 | 62 | Returns: 63 | Theta 用户对内容的偏好矩阵 64 | X 商品内容矩阵 65 | """ 66 | Theta = np.mat(np.random.rand(nu, n)) 67 | X = np.mat(np.random.rand(nm, n)) 68 | return Theta, X 69 | 70 | def regParams(param): 71 | """正规化参数 72 | Args: 73 | param 参数 74 | Return: 75 | regParam 正规化后的参数 76 | """ 77 | return theLambda * 0.5 * np.sum(np.power(param, 2)) 78 | 79 | def J(params): 80 | """代价函数 81 | 82 | Args: 83 | params 参数向量 84 | nu 用户数 85 | nm 商品数 86 | n 特征数 87 | Return: 88 | J 预测代价 89 | """ 90 | # 参数展开 91 | Theta, X = roll(params) 92 | # 计算误差 93 | rows, cols = np.nonzero(R) 94 | # 预测 95 | H = predict(Theta, X) 96 | Diff = H - Y 97 | Diff[R != 1] = 0 98 | error = 0.5 * np.sum(np.power(Diff, 2)) 99 | # 正规化 Theta 100 | regTheta = regParams(Theta) 101 | # 正规化 x 102 | regX = regParams(X) 103 | return error + regTheta + regX 104 | 105 | def gradient(params): 106 | """梯度下降 107 | 108 | Args: 109 | params 参数向量 110 | Returns: 111 | grad 梯度向量 112 | """ 113 | Theta, X = roll(params) 114 | ThetaGrad = np.mat(np.zeros(Theta.shape)) 115 | XGrad = np.mat(np.zeros(X.shape)) 116 | error = predict(Theta, X) - Y 117 | error[R != 1] = 0 118 | ThetaGrad = error.T * X + theLambda * Theta 119 | XGrad = error * Theta + theLambda * X 120 | return unroll(ThetaGrad, XGrad) 121 | 122 | def train(): 123 | """训练方法 124 | 125 | Returns: 126 | Theta 用户的偏好矩阵 127 | X 商品的内容矩阵 128 | """ 129 | # 初始化参数 130 | if not params: 131 | Theta, X = initParams() 132 | else: 133 | Theta = params['Theta'] 134 | X = params['X'] 135 | # 最小化目标函数 136 | res = minimize(J, x0=unroll(Theta, X), jac=gradient, 137 | method='CG', options={'disp': True, 'maxiter': maxIter}) 138 | Theta, X = roll(res.x) 139 | return Theta, X 140 | 141 | def predict(Theta, X): 142 | """预测 143 | Args: 144 | Theta 用户对内容的偏好矩阵 145 | X 商品内容矩阵 146 | Return: 147 | h 预测 148 | """ 149 | return X * Theta.T + mu 150 | 151 | def getTopRecommends(Theta, X, i, count, rated, items): 152 | """获得推荐 153 | 154 | Args: 155 | Theta Theta 156 | X X 157 | i 用户下标 158 | count 获得推荐的数目 
159 | rated 已经评价的类目id 160 | items 商品清单 161 | Returns: 162 | topRecommends 推荐项目 163 | """ 164 | predictions = predict(Theta, X)[:, i] 165 | return py_(items) \ 166 | .map(lambda item, idx: (item, predictions[idx])) \ 167 | .sort_by(lambda item: item[1], reverse = True) \ 168 | .take(count) \ 169 | .value() 170 | 171 | return train, predict, getTopRecommends 172 | -------------------------------------------------------------------------------- /recommender_system/test_movies_rating.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # recommender_system/test_movies_rating.py 3 | 4 | import numpy as np 5 | import recommender 6 | from scipy.io import loadmat 7 | 8 | data = loadmat('data/ex8_movies.mat') 9 | # 评价矩阵 10 | Y = data['Y'] 11 | # 是否评价矩阵 12 | R = data['R'] 13 | 14 | movieParams = loadmat('data/ex8_movieParams.mat') 15 | numMovies = movieParams['num_movies'][0,0] 16 | numFeatures = movieParams['num_features'][0,0] 17 | 18 | # 获得movies 19 | def getMovie(line): 20 | return ' '.join(line.split()[1:]) 21 | 22 | with open('data/movie_ids.txt') as f: 23 | movieList = [getMovie(f.readline()) for i in range(numMovies)] 24 | 25 | myRatings = np.mat(np.zeros((numMovies,1))) 26 | 27 | myRatings[0] = 4 28 | myRatings[97] = 2 29 | myRatings[6] = 3 30 | myRatings[11] = 5 31 | myRatings[53] = 4 32 | myRatings[63] = 5 33 | myRatings[65] = 3 34 | myRatings[68] = 5 35 | myRatings[182] = 4 36 | myRatings[225] = 5 37 | myRatings[354] = 5 38 | print 'New user ratings:' 39 | for i in range(numMovies): 40 | if myRatings[i] > 0: 41 | print 'Rated %d for %s' % (myRatings[i], movieList[i]) 42 | 43 | # 训练推荐模型 44 | Y = np.column_stack((myRatings, Y)) 45 | R = np.column_stack((myRatings, R)).astype(bool) 46 | 47 | print '\nTraing Result:' 48 | train, predict, getTopRecommends = recommender.getRecommender( 49 | Y, R, n=numFeatures, theLambda=10.0) 50 | Theta, X = train() 51 | rated = np.nonzero(myRatings)[0].tolist() 52 | topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList) 53 | 54 | print '\nTop recommendations for you:' 55 | for recommend in topRecommends: 56 | print 'Predicting rating %.1f for movie %s' % (recommend[1], recommend[0]) 57 | -------------------------------------------------------------------------------- /svm/data/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | 11 | -------------------------------------------------------------------------------- /svm/data/emailSample2.txt: -------------------------------------------------------------------------------- 1 | Folks, 2 | 3 | my first time posting - have a bit of Unix experience, but am new to Linux. 4 | 5 | 6 | Just got a new PC at home - Dell box with Windows XP. Added a second hard disk 7 | for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went 8 | fine except it didn't pick up my monitor. 9 | 10 | I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4 11 | Ti4200 video card, both of which are probably too new to feature in Suse's default 12 | set. 
I downloaded a driver from the nVidia website and installed it using RPM. 13 | Then I ran Sax2 (as was recommended in some postings I found on the net), but 14 | it still doesn't feature my video card in the available list. What next? 15 | 16 | Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice, 17 | the whole machine crashes (in Linux, not Windows) - even the on/off switch is 18 | inactive, leaving me to reach for the power cable instead. 19 | 20 | If anyone can help me in any way with these probs., I'd be really grateful - 21 | I've searched the 'net but have run out of ideas. 22 | 23 | Or should I be going for a different version of Linux such as RedHat? Opinions 24 | welcome. 25 | 26 | Thanks a lot, 27 | Peter 28 | 29 | -- 30 | Irish Linux Users' Group: ilug@linux.ie 31 | http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information. 32 | List maintainer: listmaster@linux.ie 33 | 34 | 35 | -------------------------------------------------------------------------------- /svm/data/ex6data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data1.mat -------------------------------------------------------------------------------- /svm/data/ex6data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data2.mat -------------------------------------------------------------------------------- /svm/data/ex6data3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data3.mat -------------------------------------------------------------------------------- /svm/data/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /svm/data/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 
6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /svm/data/spamTest.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/spamTest.mat -------------------------------------------------------------------------------- /svm/data/spamTrain.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/spamTrain.mat -------------------------------------------------------------------------------- /svm/smo.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # svm/smo.py 3 | 4 | import numpy as np 5 | from sklearn.metrics.pairwise import rbf_kernel 6 | 7 | """ 8 | svm模型 9 | """ 10 | 11 | def linearKernel(): 12 | """线性核函数 13 | """ 14 | def calc(X, A): 15 | return X * A.T 16 | return calc 17 | 18 | def rbfKernel(delta): 19 | """rbf核函数 20 | """ 21 | gamma = 1.0 / (2 * delta**2) 22 | 23 | def calc(X, A): 24 | return np.mat(rbf_kernel(X, A, gamma=gamma)) 25 | return calc 26 | 27 | def getSmo(X, y, C, tol, maxIter, kernel=linearKernel()): 28 | """SMO 29 | 30 | Args: 31 | X 训练样本 32 | y 标签集 33 | C 正规化参数 34 | tol 容忍值 35 | maxIter 最大迭代次数 36 | K 所用核函数 37 | 38 | Returns: 39 | trainSimple 简化版训练算法 40 | train 完整版训练算法 41 | predict 预测函数 42 | """ 43 | m, n = X.shape 44 | # 存放核函数的转化结果 45 | K = kernel(X, X) 46 | # Cache存放预测误差,用以加快计算速度 47 | ECache = np.zeros((m,2)) 48 | 49 | def predict(X, alphas, b, supportVectorsIndex, supportVectors): 50 | """计算权值向量 51 | 52 | Args: 53 | X 预测矩阵 54 | alphas alphas 55 | b b 56 | supportVectorsIndex 支持向量坐标集 57 | supportVectors 支持向量 58 | Returns: 59 | predicts 预测结果 60 | """ 61 | Ks = kernel(supportVectors, X) 62 | predicts = (np.multiply(alphas[supportVectorsIndex], y[ 63 | supportVectorsIndex]).T * Ks + b).T 64 | predicts = np.sign(predicts) 65 | return predicts 66 | 67 | def w(alphas, b, supportVectorsIndex, supportVectors): 68 | """计算权值 69 | 70 | Args: 71 | alphas alphas 72 | b b 73 | supportVectorsIndex 支持向量坐标 74 | supportVectors 支持向量 75 | Returns: 76 | w 权值向量 77 | """ 78 | return (np.multiply(alphas[supportVectorsIndex], y[ 79 | supportVectorsIndex]).T * supportVectors).T 80 | 81 | def E(i, alphas, b): 82 | """计算预测误差 83 | 84 | Args: 85 | i i 86 | alphas alphas 87 | b b 88 | Returns: 89 | E_i 第i个样本的预测误差 90 | """ 91 | FXi = float(np.multiply(alphas, y).T * K[:, i]) + b 92 | E = FXi - float(y[i]) 93 | return E 94 | 95 | def updateE(i, alphas, b): 96 | ECache[i] = [1, E(i, alphas, b)] 97 | 98 | def selectJRand(i): 99 | """ 100 | """ 101 | j = i 102 | while j == i: 103 | j = int(np.random.uniform(0, m)) 104 | return j 105 | 106 | def selectJ(i, Ei, alphas, b): 107 | """选择权值 $$\alpha^{(i)}$$ 108 | """ 109 | maxJ = 0; maxDist=0; Ej = 0 110 | ECache[i] = [1, Ei] 111 | validCaches = np.nonzero(ECache[:, 0])[0] 112 | if len(validCaches) > 1: 113 | for k in validCaches: 114 | if k==i: continue 115 | Ek = E(k, alphas, b) 116 | dist = np.abs(abs(Ei-Ek)) 117 | if maxDist < dist: 118 | Ej = Ek 119 | maxJ = k 120 | maxDist = dist 121 | return maxJ, Ej 122 | else: 123 | ### 随机选择 124 | j = selectJRand(i) 125 | Ej = E(j, alphas, b) 126 | return j, Ej 127 | 128 | def select(i, alphas, b): 129 | """alpha对选择 130 | """ 131 | Ei = E(i, alphas, b) 132 | # 选择违背KKT条件的,作为alpha2 133 | Ri = y[i] * Ei 134 | if (Ri < -tol and 
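                # KKT check: Ri = y_i*E_i = y_i*f(x_i) - 1, so Ri < -tol with
                # alpha_i < C means the margin is violated (alpha_i should grow),
                # and Ri > tol with alpha_i > 0 means alpha_i should shrink to 0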
134 |         if (Ri < -tol and alphas[i] < C) or \
135 |                 (Ri > tol and alphas[i] > 0):
136 |             # Choose the second multiplier
137 |             j = selectJRand(i)
138 |             Ej = E(j, alphas, b)
139 |             # j, Ej = selectJ(i, Ei, alphas, b)
140 |             # Compute the clipping bounds L and H
141 |             if y[i] != y[j]:
142 |                 L = max(0, alphas[j] - alphas[i])
143 |                 H = min(C, C + alphas[j] - alphas[i])
144 |             else:
145 |                 L = max(0, alphas[j] + alphas[i] - C)
146 |                 H = min(C, alphas[j] + alphas[i])
147 |             if L == H:
148 |                 return 0, alphas, b
149 |             Kii = K[i, i]
150 |             Kjj = K[j, j]
151 |             Kij = K[i, j]
152 |             eta = 2.0 * Kij - Kii - Kjj
153 |             if eta >= 0:
154 |                 return 0, alphas, b
155 |             iOld = alphas[i].copy()
156 |             jOld = alphas[j].copy()
157 |             alphas[j] = jOld - y[j] * (Ei - Ej) / eta
158 |             if alphas[j] > H:
159 |                 alphas[j] = H
160 |             elif alphas[j] < L:
161 |                 alphas[j] = L
162 |             if abs(alphas[j] - jOld) < tol:
163 |                 alphas[j] = jOld
164 |                 return 0, alphas, b
165 |             alphas[i] = iOld + y[i] * y[j] * (jOld - alphas[j])
166 |             # Update b
167 |             bINew = b - Ei - y[i] * (alphas[i] - iOld) * Kii - y[j] * \
168 |                 (alphas[j] - jOld) * Kij
169 |             bJNew = b - Ej - y[i] * (alphas[i] - iOld) * Kij - y[j] * \
170 |                 (alphas[j] - jOld) * Kjj
171 |             if alphas[i] > 0 and alphas[i] < C:
172 |                 b = bINew
173 |             elif alphas[j] > 0 and alphas[j] < C:
174 |                 b = bJNew
175 |             else:
176 |                 b = (bINew + bJNew) / 2
177 |             # Refresh the error cache with the updated alphas and b
178 |             updateE(i, alphas, b)
179 |             updateE(j, alphas, b)
180 |             return 1, alphas, b
181 |         else:
182 |             return 0, alphas, b
183 | 
184 |     def train():
185 |         """Full training routine
186 | 
187 |         Returns:
188 |             alphas alphas
189 |             w weight vector
190 |             b b
191 |             supportVectorsIndex indices of the support vectors
192 |             supportVectors support vectors
193 |             iterCount number of iterations
194 |         """
195 |         numChanged = 0
196 |         examineAll = True
197 |         iterCount = 0
198 |         alphas = np.mat(np.zeros((m, 1)))
199 |         b = 0
200 |         # Alternate between full passes over the training set and passes over
201 |         # the non-bound alphas, i.e. those strictly inside (0, C)
202 |         while (numChanged > 0 or examineAll) and (iterCount < maxIter):
203 |             numChanged = 0
204 |             if examineAll:
205 |                 for i in range(m):
206 |                     changed, alphas, b = select(i, alphas, b)
207 |                     numChanged += changed
208 |             else:
209 |                 nonBoundIds = np.nonzero((alphas.A > 0) * (alphas.A < C))[0]
210 |                 for i in nonBoundIds:
211 |                     changed, alphas, b = select(i, alphas, b)
212 |                     numChanged += changed
213 |             iterCount += 1
214 | 
215 |             if examineAll:
216 |                 examineAll = False
217 |             elif numChanged == 0:
218 |                 examineAll = True
219 |         supportVectorsIndex = np.nonzero(alphas.A > 0)[0]
220 |         supportVectors = np.mat(X[supportVectorsIndex])
221 |         return alphas, w(alphas, b, supportVectorsIndex, supportVectors), b, \
222 |             supportVectorsIndex, supportVectors, iterCount
223 | 
224 |     def trainSimple():
225 |         """Simplified training routine
226 | 
227 |         Returns:
228 |             alphas alphas
229 |             w weight vector
230 |             b b
231 |             supportVectorsIndex indices of the support vectors
232 |             supportVectors support vectors
233 |             iterCount number of iterations
234 |         """
235 |         numChanged = 0
236 |         iterCount = 0
237 |         alphas = np.mat(np.zeros((m, 1)))
238 |         b = 0
239 |         L = 0
240 |         H = 0
241 |         while iterCount < maxIter:
242 |             numChanged = 0
243 |             for i in range(m):
244 |                 Ei = E(i, alphas, b)
245 |                 Ri = y[i] * Ei
246 |                 # Only optimize multipliers that violate the KKT conditions
247 |                 if (Ri < -tol and alphas[i] < C) or \
248 |                         (Ri > tol and alphas[i] > 0):
249 |                     # Choose the second multiplier at random
250 |                     j = selectJRand(i)
251 |                     Ej = E(j, alphas, b)
252 |                     # Compute the clipping bounds L and H
253 |                     if y[i] != y[j]:
254 |                         L = max(0, alphas[j] - alphas[i])
255 |                         H = min(C, C + alphas[j] - alphas[i])
256 |                     else:
257 |                         L = max(0, alphas[j] + alphas[i] - C)
258 |                         H = min(C, alphas[j] + alphas[i])
259 |                     if L == H:
260 |                         continue
261 |                     Kii = K[i, i]
262 |                     Kjj = K[j, j]
263 |                     Kij = K[i, j]
264 |                     eta = 2.0 * Kij - Kii - Kjj
265 |                     if eta >= 0:
266 |                         continue
267 |                     iOld = alphas[i].copy()
268 |                     jOld = alphas[j].copy()
269 |                     alphas[j] = jOld - y[j] * (Ei - Ej) / eta
270 |                     if alphas[j] > H:
271 |                         alphas[j] = H
272 |                     elif alphas[j] < L:
273 |                         alphas[j] = L
274 |                     if abs(alphas[j] - jOld) < tol:
275 |                         alphas[j] = jOld
276 |                         continue
277 |                     alphas[i] = iOld + y[i] * y[j] * (jOld - alphas[j])
278 |                     # Update b
279 |                     bINew = b - Ei - y[i] * (alphas[i] - iOld) * Kii - y[j] * \
280 |                         (alphas[j] - jOld) * Kij
281 |                     bJNew = b - Ej - y[i] * (alphas[i] - iOld) * Kij - y[j] * \
282 |                         (alphas[j] - jOld) * Kjj
283 |                     if alphas[i] > 0 and alphas[i] < C:
284 |                         b = bINew
285 |                     elif alphas[j] > 0 and alphas[j] < C:
286 |                         b = bJNew
287 |                     else:
288 |                         b = (bINew + bJNew) / 2.0
289 |                     numChanged += 1
290 |             if numChanged == 0:
291 |                 iterCount += 1
292 |             else:
293 |                 iterCount = 0
294 |         supportVectorsIndex = np.nonzero(alphas.A > 0)[0]
295 |         supportVectors = np.mat(X[supportVectorsIndex])
296 |         return alphas, w(alphas, b, supportVectorsIndex, supportVectors), b, \
297 |             supportVectorsIndex, supportVectors, iterCount
298 |     return trainSimple, train, predict
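For reference, a minimal usage sketch of the `getSmo` API above, on a toy data set invented for illustration (the real test scripts below run it on the course's .mat files). Note that `rbfKernel(delta)` calls sklearn's `rbf_kernel` with gamma = 1 / (2 * delta**2), i.e. the kernel $$K(x, z) = \exp(-\|x - z\|^2 / (2\delta^2))$$:

    # Hedged sketch: toy data invented for illustration
    import numpy as np
    import smo

    X = np.mat([[1.0, 2.0], [2.0, 1.0], [6.0, 5.0], [7.0, 8.0]])
    y = np.mat([[-1.0], [-1.0], [1.0], [1.0]])  # labels must be -1/+1 column vectors

    trainSimple, train, predict = smo.getSmo(X, y, C=1.0, tol=1e-3, maxIter=20)
    alphas, w, b, svIndex, svs, iterCount = train()
    predictions = predict(X, alphas, b, svIndex, svs)  # column of -1/+1

    # A non-linear model is built the same way:
    # smo.getSmo(X, y, 1.0, 1e-3, 20, kernel=smo.rbfKernel(0.1))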
--------------------------------------------------------------------------------
/svm/spam.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/spam.py
3 | 
4 | """Spam classifier helpers
5 | """
6 | 
7 | import numpy as np
8 | from stemming.porter2 import stem
9 | from pydash import py_
10 | 
11 | # Load the vocabulary
12 | vocabList = []
13 | with open('vocab.txt') as f:
14 |     for line in f:
15 |         idx, w = line.split()
16 |         vocabList.append(w)
17 | 
18 | 
19 | def processEmail(email):
20 |     """Preprocess an email
21 | 
22 |     Args:
23 |         email email body
24 |     Returns:
25 |         indices 1-based positions of the email's words in the vocabulary
26 |                 (0 for words not in the vocabulary)
27 |     """
28 |     # Strip HTML tags --> normalize URLs --> normalize email addresses
29 |     # --> normalize numbers --> normalize dollar signs --> lower-case
30 |     # --> split into words --> stem --> map words to vocabulary indices
31 |     return py_(email) \
32 |         .strip_tags() \
33 |         .reg_exp_replace(r'(http|https)://[^\s]*', 'httpaddr') \
34 |         .reg_exp_replace(r'[^\s]+@[^\s]+', 'emailaddr') \
35 |         .reg_exp_replace(r'\d+', 'number') \
36 |         .reg_exp_replace(r'[$]+', 'dollar') \
37 |         .lower_case() \
38 |         .trim() \
39 |         .words() \
40 |         .map(stem) \
41 |         .map(lambda word : py_.index_of(vocabList, word) + 1) \
42 |         .value()
43 | 
44 | def extractFeatures(indices):
45 |     """Extract a binary feature vector
46 | 
47 |     Args:
48 |         indices word indices
49 |     Returns:
50 |         feature feature vector: 1 if the vocabulary word occurs in the email
51 |     """
52 |     feature = py_.map(range(1, len(vocabList) + 1),
53 |                       lambda index: py_.index_of(indices, index) > -1)
54 |     return np.array(feature, dtype=np.uint)
55 | 
56 | def getTopPredictors(weights, count):
57 |     """Get the words that most strongly indicate spam
58 | 
59 |     Args:
60 |         weights weights
61 |         count top count
62 |     Returns:
63 |         predictors (word, weight) pairs, sorted by weight
64 |     """
65 |     return py_(vocabList) \
66 |         .map(lambda word, idx: (word, weights[idx])) \
67 |         .sort_by(lambda item: item[1], reverse = True) \
68 |         .take(count) \
69 |         .value()
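A hedged sketch of how these helpers chain together, raw text --> vocabulary indices --> binary features (the email string is invented; like spam.py itself, it assumes vocab.txt plus the pydash and stemming packages are available):

    import spam

    email = "Do you want to make $1000 per week? Visit http://example.com now"
    indices = spam.processEmail(email)        # vocabulary positions, 0 if unknown
    features = spam.extractFeatures(indices)  # 0/1 vector over the 1899 vocabulary words
    print features.shape, features.sum()      # (1899,) and the count of distinct known words

Feature vectors built this way can be stacked into a matrix and fed to a linear SVM, which is what test_spam.py does with its raw sample emails.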
--------------------------------------------------------------------------------
/svm/test_linear.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_linear.py
3 | import smo
4 | import numpy as np
5 | from scipy.io import loadmat
6 | import matplotlib.pyplot as plt
7 | 
8 | data = loadmat('data/ex6data1.mat')
9 | 
10 | X = np.mat(data['X'])
11 | y = np.mat(data['y'], dtype=np.float)
12 | y[y==0] = -1
13 | 
14 | m, n = X.shape
15 | tol = 1e-3
16 | maxIter = 20
17 | # C = 1.0
18 | C = 100.0
19 | 
20 | trainSimple, train, predict = smo.getSmo(X, y, C, tol, maxIter)
21 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = trainSimple()
22 | print w
23 | print b
24 | print len(supportVectorsIndex)
25 | print 'iterCount:%d' % iterCount
26 | 
27 | predictions = predict(X, alphas, b, supportVectorsIndex, supportVectors)
28 | errorCount = (np.multiply(predictions, y).A < 0).sum()
29 | print 'error rate: %.2f' % (float(errorCount) / m)
30 | 
31 | # Plot the data points; support vectors are drawn in red
32 | x1Min = X[:, 0].min()
33 | x1Max = X[:, 0].max()
34 | x2Min = X[:, 1].min()
35 | x2Max = X[:, 1].max()
36 | plt.xlabel('X1')
37 | plt.ylabel('X2')
38 | plt.xlim(x1Min - 1, x1Max + 1)
39 | plt.ylim(x2Min - 1, x2Max + 1)
40 | plt.title('C=%.1f' % C)
41 | for i in range(m):
42 |     x = X[i].A[0]
43 |     if y[i] == 1:
44 |         color = 'black'
45 |         if i in supportVectorsIndex:
46 |             color = 'red'
47 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
48 |     else:
49 |         color = 'green'
50 |         if i in supportVectorsIndex:
51 |             color = 'red'
52 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
53 | 
54 | # Plot the decision boundary
55 | x = np.arange(x1Min, x1Max, 0.1)
56 | h = (-w[0,0] * x - b[0,0]) / w[1,0]
57 | plt.plot(x, h)
58 | plt.show()
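The boundary drawn above follows from the learned hyperplane: points on it satisfy $$w_1 x_1 + w_2 x_2 + b = 0$$, so for a grid of $$x_1$$ values the script plots $$x_2 = -(w_1 x_1 + b) / w_2$$, which is exactly the `h` computed from `w` and `b`.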
--------------------------------------------------------------------------------
/svm/test_model_selection.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_model_selection.py
3 | 
4 | import numpy as np
5 | import smo
6 | import matplotlib.pyplot as plt
7 | from scipy.io import loadmat
8 | 
9 | data = loadmat('data/ex6data3.mat')
10 | 
11 | X = np.mat(data['X'], dtype=np.float)
12 | y = np.mat(data['y'], dtype=np.float)
13 | XVal = np.mat(data['Xval'], dtype=np.float)
14 | yVal = np.mat(data['yval'], dtype=np.float)
15 | 
16 | m, n = X.shape
17 | mVal, _ = XVal.shape
18 | 
19 | # Relabel negative samples from 0 to -1
20 | y[y == 0] = -1
21 | yVal[yVal == 0] = -1
22 | 
23 | Cs = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
24 | deltas = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
25 | 
26 | # All combinations of C and delta
27 | deltaCPairs = [[delta, C] for C in Cs for delta in deltas]
28 | 
29 | # Build one model per combination
30 | tol = 1e-3
31 | maxIter = 5
32 | models = [smo.getSmo(X, y, C, tol, maxIter, kernel=smo.rbfKernel(delta))
33 |           for delta, C in deltaCPairs]
34 | 
35 | # Train every model
36 | results = [train() for _, train, _ in models]
37 | 
38 | # Pick the model with the lowest error on the cross-validation set
39 | predictions = [models[idx][2](XVal, alphas, b, supportVectorsIndex, supportVectors)
40 |                for idx, (alphas, w, b, supportVectorsIndex, supportVectors, iterCount) in enumerate(results)]
41 | errorRates = [(np.multiply(prediction, yVal).A < 0).sum() /
42 |               float(mVal) for prediction in predictions]
43 | minIdx = np.argmin(errorRates)
44 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = results[minIdx]
45 | delta, C = deltaCPairs[minIdx]
46 | 
47 | # Plot the data points; support vectors are drawn in red
48 | x1Min = X[:, 0].min()
49 | x1Max = X[:, 0].max()
50 | x2Min = X[:, 1].min()
51 | x2Max = X[:, 1].max()
52 | plt.title(r'C=%.2f, $\delta$=%.2f, error=%.2f' % (C, delta, errorRates[minIdx]))
53 | plt.xlabel('X1')
54 | plt.ylabel('X2')
55 | plt.xlim(x1Min, x1Max)
56 | plt.ylim(x2Min, x2Max)
57 | 
58 | for i in range(m):
59 |     x = X[i].A[0]
60 |     if y[i] == 1:
61 |         color = 'black'
62 |         if i in supportVectorsIndex:
63 |             color = 'red'
64 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
65 |     else:
66 |         color = 'green'
67 |         if i in supportVectorsIndex:
68 |             color = 'red'
69 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
70 | 
71 | 
72 | # Plot the decision boundary of the selected model
73 | xx1, xx2 = np.meshgrid(
74 |     np.linspace(x1Min, x1Max, 100),
75 |     np.linspace(x2Min, x2Max, 100)
76 | )
77 | _, _, predict = models[minIdx]
78 | predictX = np.mat(np.c_[xx1.ravel(), xx2.ravel()])
79 | predictions = predict(predictX, alphas, b, supportVectorsIndex, supportVectors)
80 | predictions = predictions.reshape(xx1.shape)
81 | plt.contour(xx1, xx2, predictions, [0.5], linewidths=5)
82 | plt.show()
--------------------------------------------------------------------------------
/svm/test_non_linear.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_non_linear.py
3 | import smo
4 | import numpy as np
5 | from scipy.io import loadmat
6 | import matplotlib.pyplot as plt
7 | 
8 | data = loadmat('data/ex6data2.mat')
9 | 
10 | X = np.mat(data['X'])
11 | y = np.mat(data['y'], dtype=np.float)
12 | y[y==0] = -1
13 | 
14 | m, n = X.shape
15 | C = 1.0
16 | tol = 1e-3
17 | maxIter = 5
18 | kernel = smo.rbfKernel(0.1)
19 | 
20 | trainSimple, train, predict = smo.getSmo(X, y, C, tol, maxIter, kernel=kernel)
21 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = train()
22 | print supportVectorsIndex
23 | print len(supportVectorsIndex)
24 | print 'iterCount:%d' % iterCount
25 | 
26 | predictions = predict(X, alphas, b, supportVectorsIndex, supportVectors)
27 | errorCount = (np.multiply(predictions, y).A < 0).sum()
28 | print errorCount
29 | print 'error rate: %.2f' % (float(errorCount) / m)
30 | 
31 | # Plot the data points; support vectors are drawn in red
32 | x1Min = X[:, 0].min()
33 | x1Max = X[:, 0].max()
34 | x2Min = X[:, 1].min()
35 | x2Max = X[:, 1].max()
36 | plt.title('C=%.1f' % C)
37 | plt.xlabel('X1')
38 | plt.ylabel('X2')
39 | plt.xlim(x1Min, x1Max)
40 | plt.ylim(x2Min, x2Max)
41 | 
42 | for i in range(m):
43 |     x = X[i].A[0]
44 |     if y[i] == 1:
45 |         color = 'black'
46 |         if i in supportVectorsIndex:
47 |             color = 'red'
48 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
49 |     else:
50 |         color = 'green'
51 |         if i in supportVectorsIndex:
52 |             color = 'red'
53 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
54 | 
55 | 
56 | # Plot the decision boundary on a 100x100 grid
57 | xx1, xx2 = np.meshgrid(
58 |     np.linspace(x1Min, x1Max, 100),
59 |     np.linspace(x2Min, x2Max, 100)
60 | )
61 | predictX = np.mat(np.c_[xx1.ravel(), xx2.ravel()])
62 | predictions = predict(predictX, alphas, b, supportVectorsIndex, supportVectors)
63 | predictions = predictions.reshape(xx1.shape)
64 | plt.contour(xx1, xx2, predictions, [0.5], linewidths=5)
65 | plt.show()
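The scripts above all count misclassifications with the same trick: with labels in {-1, +1}, a prediction is wrong exactly when prediction x label is negative, so the error rate is the fraction of negative entries in the elementwise product. A tiny self-contained illustration with invented numbers:

    import numpy as np

    predictions = np.mat([[1.0], [-1.0], [1.0], [-1.0]])
    labels = np.mat([[1.0], [-1.0], [-1.0], [-1.0]])  # the third entry is a mistake
    errorCount = (np.multiply(predictions, labels).A < 0).sum()
    print 'error rate: %.2f' % (float(errorCount) / labels.shape[0])  # 0.25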
--------------------------------------------------------------------------------
/svm/test_spam.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_spam.py
3 | import spam
4 | import numpy as np
5 | from scipy.io import loadmat
6 | from sklearn.svm import SVC
7 | 
8 | # Spam classifier
9 | data = loadmat('data/spamTrain.mat')
10 | X = np.mat(data['X'])
11 | y = data['y']
12 | m, n = X.shape
13 | C = 0.1
14 | tol = 1e-3
15 | 
16 | # Train the classifier on the training set
17 | clf = SVC(C=C, kernel='linear', tol=tol)
18 | clf.fit(X, y.ravel())
19 | predictions = np.mat([clf.predict(X[i, :]) for i in range(m)])
20 | accuracy = 100 * np.mean(predictions == y)
21 | print 'Training set accuracy: %0.2f %%' % accuracy
22 | 
23 | # Evaluate the trained classifier on the held-out test set
24 | data = loadmat('data/spamTest.mat')
25 | XTest = np.mat(data['Xtest'])
26 | yTest = data['ytest']
27 | mTest, _ = XTest.shape
28 | 
29 | predictions = np.mat([clf.predict(XTest[i, :]) for i in range(mTest)])
30 | accuracy = 100 * np.mean(predictions == yTest)
31 | print 'Test set accuracy: %0.2f %%' % accuracy
32 | 
33 | # Words most indicative of spam (those given the largest weights by the model)
34 | weights = clf.coef_.flatten()
35 | top = 15
36 | predictors = spam.getTopPredictors(weights, top)
37 | print '\nTop %d predictors of spam:' % top
38 | for word, weight in predictors:
39 |     print '%-15s (%f)' % (word, weight)
40 | 
41 | # Classify raw example emails
42 | def genExample(f):
43 |     email = open(f).read()
44 |     indices = spam.processEmail(email)
45 |     features = spam.extractFeatures(indices)
46 |     return features
47 | 
48 | files = [
49 |     'data/emailSample1.txt',
50 |     'data/emailSample2.txt',
51 |     'data/spamSample1.txt',
52 |     'data/spamSample2.txt'
53 | ]
54 | 
55 | emails = np.mat([genExample(f) for f in files], dtype=np.uint8)
56 | labels = np.array([[0, 0, 1, 1]]).reshape(-1, 1)
57 | predictions = np.mat([clf.predict(emails[i, :]) for i in range(len(files))])
58 | accuracy = 100 * np.mean(predictions == labels)
59 | print '\nTest set accuracy for own datasets: %0.2f %%' % accuracy
--------------------------------------------------------------------------------
/svm/vocab.txt:
--------------------------------------------------------------------------------
1 | 1 aa 2 | 2 ab 3 | 3 abil 4 | 4 abl 5 | 5 about 6 | 6 abov 7 | 7 absolut 8 | 8 abus 9 | 9 ac 10 | 10 accept 11 | 11 access 12 | 12 accord 13 | 13 account 14 | 14 achiev 15 | 15 acquir 16 | 16 across 17 | 17 act 18 | 18 action 19 | 19 activ 20 | 20 actual 21 | 21 ad 22 | 22 adam 23 | 23 add 24 | 24 addit 25 | 25 address 26 | 26 administr 27 | 27 adult 28 | 28 advanc 29 | 29 advantag 30 | 30 advertis 31 | 31 advic 32 | 32 advis 33 | 33 ae 34 | 34 af 35 | 35 affect 36 | 36 affili 37 | 37 afford 38 | 38 africa 39 | 39 after 40 | 40 ag 41 | 41 again 42 | 42 against 43 | 43 agenc 44 | 44 agent 45 | 45 ago 46 | 46 agre 47 | 47 agreement 48 | 48 aid 49 | 49 air 50 | 50 al 51 | 51 alb 52 | 52 align 53 | 53 all 54 | 54 allow 55 | 55 almost 56 | 56 alon 57 | 57 along 58 | 58 alreadi 59 | 59 alsa 60 | 60 also 61 | 61 altern 62 | 62 although 63 | 63 alwai 64 | 64 am 65 | 65 amaz 66 | 66 america 67 | 67 american 68 | 68 among 69 | 69 amount 70 | 70 amp 71 | 71 an 72 | 72 analysi 73 | 73 analyst 74 | 74 and 75 | 75 ani 76 | 76 anim 77 | 77 announc 78 | 78 annual 79 | 79 annuiti 80 | 80 anoth 81 | 81 answer 82 | 82 anti 83 | 83 anumb 84 | 84 anybodi 85 | 85 anymor 86 | 86 anyon 87 | 87 anyth 88 | 88 anywai 89 | 89 anywher 90 | 90 aol 91 | 91 ap 92 | 92 apolog 93 | 93 app 94 | 94 appar 95 | 95 appear 96 | 96 appl 97 | 97 appli 98 | 98 applic 99 | 99 appreci 100 | 100 approach 101 | 101 approv 102 | 102 apt 103 | 103 ar 104 | 104 archiv 105 | 105 area 106 | 106 aren 107 | 107 argument 108 | 108 arial 109 | 109 arm 110 | 110 around 111 | 111 arrai 112 | 112 arriv 113 | 113 art 114 | 114 articl 115 | 115 artist 116 | 116 as 117 | 117 ascii 118 | 118 ask 119 | 119 asset 120 | 120 assist 121 | 121 associ 122 | 122 assum 123 | 123 assur 124 | 124 at 125 | 125 atol 126 | 126 attach 127 | 127 attack 128 | 128 attempt 129 | 129 attent 130 | 130 attornei 131 | 131 attract 132 | 132 audio 133 | 133 aug 134 | 134 august 135 | 135 author 136 | 136 auto 137 | 137 autom 138 | 138 automat 139 | 139 avail 140 | 140 averag 141 | 141 avoid 142 | 142 awai 143 | 143 awar 144 | 144 award 145 | 145 ba 146 | 146 babi 147 | 147 back 148 | 148 background 149 | 149 backup 150 | 150 bad 151 | 151 balanc 152 | 152 ban 153 | 153 bank 154 | 154 bar 155 | 155 base 156 | 156 basenumb 157 | 157 basi 158 | 158 basic 159 | 159 bb 160 | 160 bc 161 | 161 bd 162 | 162 be 163 | 163 beat 164 | 164 beberg 165 | 165 becaus 166 | 166 becom 167 | 167 been 168 | 168 befor
169 | 169 begin 170 | 170 behalf 171 | 171 behavior 172 | 172 behind 173 | 173 believ 174 | 174 below 175 | 175 benefit 176 | 176 best 177 | 177 beta 178 | 178 better 179 | 179 between 180 | 180 bf 181 | 181 big 182 | 182 bill 183 | 183 billion 184 | 184 bin 185 | 185 binari 186 | 186 bit 187 | 187 black 188 | 188 blank 189 | 189 block 190 | 190 blog 191 | 191 blood 192 | 192 blue 193 | 193 bnumber 194 | 194 board 195 | 195 bodi 196 | 196 boi 197 | 197 bonu 198 | 198 book 199 | 199 boot 200 | 200 border 201 | 201 boss 202 | 202 boston 203 | 203 botan 204 | 204 both 205 | 205 bottl 206 | 206 bottom 207 | 207 boundari 208 | 208 box 209 | 209 brain 210 | 210 brand 211 | 211 break 212 | 212 brian 213 | 213 bring 214 | 214 broadcast 215 | 215 broker 216 | 216 browser 217 | 217 bug 218 | 218 bui 219 | 219 build 220 | 220 built 221 | 221 bulk 222 | 222 burn 223 | 223 bush 224 | 224 busi 225 | 225 but 226 | 226 button 227 | 227 by 228 | 228 byte 229 | 229 ca 230 | 230 cabl 231 | 231 cach 232 | 232 calcul 233 | 233 california 234 | 234 call 235 | 235 came 236 | 236 camera 237 | 237 campaign 238 | 238 can 239 | 239 canada 240 | 240 cannot 241 | 241 canon 242 | 242 capabl 243 | 243 capillari 244 | 244 capit 245 | 245 car 246 | 246 card 247 | 247 care 248 | 248 career 249 | 249 carri 250 | 250 cartridg 251 | 251 case 252 | 252 cash 253 | 253 cat 254 | 254 catch 255 | 255 categori 256 | 256 caus 257 | 257 cb 258 | 258 cc 259 | 259 cd 260 | 260 ce 261 | 261 cell 262 | 262 cent 263 | 263 center 264 | 264 central 265 | 265 centuri 266 | 266 ceo 267 | 267 certain 268 | 268 certainli 269 | 269 cf 270 | 270 challeng 271 | 271 chanc 272 | 272 chang 273 | 273 channel 274 | 274 char 275 | 275 charact 276 | 276 charg 277 | 277 charset 278 | 278 chat 279 | 279 cheap 280 | 280 check 281 | 281 cheer 282 | 282 chief 283 | 283 children 284 | 284 china 285 | 285 chip 286 | 286 choic 287 | 287 choos 288 | 288 chri 289 | 289 citi 290 | 290 citizen 291 | 291 civil 292 | 292 claim 293 | 293 class 294 | 294 classifi 295 | 295 clean 296 | 296 clear 297 | 297 clearli 298 | 298 click 299 | 299 client 300 | 300 close 301 | 301 clue 302 | 302 cnet 303 | 303 cnumber 304 | 304 co 305 | 305 code 306 | 306 collect 307 | 307 colleg 308 | 308 color 309 | 309 com 310 | 310 combin 311 | 311 come 312 | 312 comfort 313 | 313 command 314 | 314 comment 315 | 315 commentari 316 | 316 commerci 317 | 317 commiss 318 | 318 commit 319 | 319 common 320 | 320 commun 321 | 321 compani 322 | 322 compar 323 | 323 comparison 324 | 324 compat 325 | 325 compet 326 | 326 competit 327 | 327 compil 328 | 328 complet 329 | 329 comprehens 330 | 330 comput 331 | 331 concentr 332 | 332 concept 333 | 333 concern 334 | 334 condit 335 | 335 conf 336 | 336 confer 337 | 337 confid 338 | 338 confidenti 339 | 339 config 340 | 340 configur 341 | 341 confirm 342 | 342 conflict 343 | 343 confus 344 | 344 congress 345 | 345 connect 346 | 346 consid 347 | 347 consolid 348 | 348 constitut 349 | 349 construct 350 | 350 consult 351 | 351 consum 352 | 352 contact 353 | 353 contain 354 | 354 content 355 | 355 continu 356 | 356 contract 357 | 357 contribut 358 | 358 control 359 | 359 conveni 360 | 360 convers 361 | 361 convert 362 | 362 cool 363 | 363 cooper 364 | 364 copi 365 | 365 copyright 366 | 366 core 367 | 367 corpor 368 | 368 correct 369 | 369 correspond 370 | 370 cost 371 | 371 could 372 | 372 couldn 373 | 373 count 374 | 374 countri 375 | 375 coupl 376 | 376 cours 377 | 377 court 378 | 378 cover 379 | 379 coverag 380 | 380 crash 381 | 381 creat 382 | 382 creativ 383 
| 383 credit 384 | 384 critic 385 | 385 cross 386 | 386 cultur 387 | 387 current 388 | 388 custom 389 | 389 cut 390 | 390 cv 391 | 391 da 392 | 392 dagga 393 | 393 dai 394 | 394 daili 395 | 395 dan 396 | 396 danger 397 | 397 dark 398 | 398 data 399 | 399 databas 400 | 400 datapow 401 | 401 date 402 | 402 dave 403 | 403 david 404 | 404 dc 405 | 405 de 406 | 406 dead 407 | 407 deal 408 | 408 dear 409 | 409 death 410 | 410 debt 411 | 411 decad 412 | 412 decid 413 | 413 decis 414 | 414 declar 415 | 415 declin 416 | 416 decor 417 | 417 default 418 | 418 defend 419 | 419 defens 420 | 420 defin 421 | 421 definit 422 | 422 degre 423 | 423 delai 424 | 424 delet 425 | 425 deliv 426 | 426 deliveri 427 | 427 dell 428 | 428 demand 429 | 429 democrat 430 | 430 depart 431 | 431 depend 432 | 432 deposit 433 | 433 describ 434 | 434 descript 435 | 435 deserv 436 | 436 design 437 | 437 desir 438 | 438 desktop 439 | 439 despit 440 | 440 detail 441 | 441 detect 442 | 442 determin 443 | 443 dev 444 | 444 devel 445 | 445 develop 446 | 446 devic 447 | 447 di 448 | 448 dial 449 | 449 did 450 | 450 didn 451 | 451 diet 452 | 452 differ 453 | 453 difficult 454 | 454 digit 455 | 455 direct 456 | 456 directli 457 | 457 director 458 | 458 directori 459 | 459 disabl 460 | 460 discount 461 | 461 discov 462 | 462 discoveri 463 | 463 discuss 464 | 464 disk 465 | 465 displai 466 | 466 disposit 467 | 467 distanc 468 | 468 distribut 469 | 469 dn 470 | 470 dnumber 471 | 471 do 472 | 472 doc 473 | 473 document 474 | 474 doe 475 | 475 doer 476 | 476 doesn 477 | 477 dollar 478 | 478 dollarac 479 | 479 dollarnumb 480 | 480 domain 481 | 481 don 482 | 482 done 483 | 483 dont 484 | 484 doubl 485 | 485 doubt 486 | 486 down 487 | 487 download 488 | 488 dr 489 | 489 draw 490 | 490 dream 491 | 491 drive 492 | 492 driver 493 | 493 drop 494 | 494 drug 495 | 495 due 496 | 496 dure 497 | 497 dvd 498 | 498 dw 499 | 499 dynam 500 | 500 ea 501 | 501 each 502 | 502 earli 503 | 503 earlier 504 | 504 earn 505 | 505 earth 506 | 506 easi 507 | 507 easier 508 | 508 easili 509 | 509 eat 510 | 510 eb 511 | 511 ebai 512 | 512 ec 513 | 513 echo 514 | 514 econom 515 | 515 economi 516 | 516 ed 517 | 517 edg 518 | 518 edit 519 | 519 editor 520 | 520 educ 521 | 521 eff 522 | 522 effect 523 | 523 effici 524 | 524 effort 525 | 525 either 526 | 526 el 527 | 527 electron 528 | 528 elimin 529 | 529 els 530 | 530 email 531 | 531 emailaddr 532 | 532 emerg 533 | 533 empir 534 | 534 employ 535 | 535 employe 536 | 536 en 537 | 537 enabl 538 | 538 encod 539 | 539 encourag 540 | 540 end 541 | 541 enemi 542 | 542 enenkio 543 | 543 energi 544 | 544 engin 545 | 545 english 546 | 546 enhanc 547 | 547 enjoi 548 | 548 enough 549 | 549 ensur 550 | 550 enter 551 | 551 enterpris 552 | 552 entertain 553 | 553 entir 554 | 554 entri 555 | 555 enumb 556 | 556 environ 557 | 557 equal 558 | 558 equip 559 | 559 equival 560 | 560 error 561 | 561 especi 562 | 562 essenti 563 | 563 establish 564 | 564 estat 565 | 565 estim 566 | 566 et 567 | 567 etc 568 | 568 euro 569 | 569 europ 570 | 570 european 571 | 571 even 572 | 572 event 573 | 573 eventu 574 | 574 ever 575 | 575 everi 576 | 576 everyon 577 | 577 everyth 578 | 578 evid 579 | 579 evil 580 | 580 exactli 581 | 581 exampl 582 | 582 excel 583 | 583 except 584 | 584 exchang 585 | 585 excit 586 | 586 exclus 587 | 587 execut 588 | 588 exercis 589 | 589 exist 590 | 590 exmh 591 | 591 expand 592 | 592 expect 593 | 593 expens 594 | 594 experi 595 | 595 expert 596 | 596 expir 597 | 597 explain 598 | 598 explor 599 | 599 express 600 | 600 extend 
601 | 601 extens 602 | 602 extra 603 | 603 extract 604 | 604 extrem 605 | 605 ey 606 | 606 fa 607 | 607 face 608 | 608 fact 609 | 609 factor 610 | 610 fail 611 | 611 fair 612 | 612 fall 613 | 613 fals 614 | 614 famili 615 | 615 faq 616 | 616 far 617 | 617 fast 618 | 618 faster 619 | 619 fastest 620 | 620 fat 621 | 621 father 622 | 622 favorit 623 | 623 fax 624 | 624 fb 625 | 625 fd 626 | 626 featur 627 | 627 feder 628 | 628 fee 629 | 629 feed 630 | 630 feedback 631 | 631 feel 632 | 632 femal 633 | 633 few 634 | 634 ffffff 635 | 635 ffnumber 636 | 636 field 637 | 637 fight 638 | 638 figur 639 | 639 file 640 | 640 fill 641 | 641 film 642 | 642 filter 643 | 643 final 644 | 644 financ 645 | 645 financi 646 | 646 find 647 | 647 fine 648 | 648 finish 649 | 649 fire 650 | 650 firewal 651 | 651 firm 652 | 652 first 653 | 653 fit 654 | 654 five 655 | 655 fix 656 | 656 flag 657 | 657 flash 658 | 658 flow 659 | 659 fnumber 660 | 660 focu 661 | 661 folder 662 | 662 folk 663 | 663 follow 664 | 664 font 665 | 665 food 666 | 666 for 667 | 667 forc 668 | 668 foreign 669 | 669 forev 670 | 670 forget 671 | 671 fork 672 | 672 form 673 | 673 format 674 | 674 former 675 | 675 fortun 676 | 676 forward 677 | 677 found 678 | 678 foundat 679 | 679 four 680 | 680 franc 681 | 681 free 682 | 682 freedom 683 | 683 french 684 | 684 freshrpm 685 | 685 fri 686 | 686 fridai 687 | 687 friend 688 | 688 from 689 | 689 front 690 | 690 ftoc 691 | 691 ftp 692 | 692 full 693 | 693 fulli 694 | 694 fun 695 | 695 function 696 | 696 fund 697 | 697 further 698 | 698 futur 699 | 699 ga 700 | 700 gain 701 | 701 game 702 | 702 gari 703 | 703 garrigu 704 | 704 gave 705 | 705 gcc 706 | 706 geek 707 | 707 gener 708 | 708 get 709 | 709 gif 710 | 710 gift 711 | 711 girl 712 | 712 give 713 | 713 given 714 | 714 global 715 | 715 gnome 716 | 716 gnu 717 | 717 gnupg 718 | 718 go 719 | 719 goal 720 | 720 god 721 | 721 goe 722 | 722 gold 723 | 723 gone 724 | 724 good 725 | 725 googl 726 | 726 got 727 | 727 govern 728 | 728 gpl 729 | 729 grand 730 | 730 grant 731 | 731 graphic 732 | 732 great 733 | 733 greater 734 | 734 ground 735 | 735 group 736 | 736 grow 737 | 737 growth 738 | 738 gt 739 | 739 guarante 740 | 740 guess 741 | 741 gui 742 | 742 guid 743 | 743 ha 744 | 744 hack 745 | 745 had 746 | 746 half 747 | 747 ham 748 | 748 hand 749 | 749 handl 750 | 750 happen 751 | 751 happi 752 | 752 hard 753 | 753 hardwar 754 | 754 hat 755 | 755 hate 756 | 756 have 757 | 757 haven 758 | 758 he 759 | 759 head 760 | 760 header 761 | 761 headlin 762 | 762 health 763 | 763 hear 764 | 764 heard 765 | 765 heart 766 | 766 heaven 767 | 767 hei 768 | 768 height 769 | 769 held 770 | 770 hello 771 | 771 help 772 | 772 helvetica 773 | 773 her 774 | 774 herba 775 | 775 here 776 | 776 hermio 777 | 777 hettinga 778 | 778 hi 779 | 779 high 780 | 780 higher 781 | 781 highli 782 | 782 highlight 783 | 783 him 784 | 784 histori 785 | 785 hit 786 | 786 hold 787 | 787 home 788 | 788 honor 789 | 789 hope 790 | 790 host 791 | 791 hot 792 | 792 hour 793 | 793 hous 794 | 794 how 795 | 795 howev 796 | 796 hp 797 | 797 html 798 | 798 http 799 | 799 httpaddr 800 | 800 huge 801 | 801 human 802 | 802 hundr 803 | 803 ibm 804 | 804 id 805 | 805 idea 806 | 806 ident 807 | 807 identifi 808 | 808 idnumb 809 | 809 ie 810 | 810 if 811 | 811 ignor 812 | 812 ii 813 | 813 iii 814 | 814 iiiiiiihnumberjnumberhnumberjnumberhnumb 815 | 815 illeg 816 | 816 im 817 | 817 imag 818 | 818 imagin 819 | 819 immedi 820 | 820 impact 821 | 821 implement 822 | 822 import 823 | 823 impress 824 | 824 improv 825 | 
825 in 826 | 826 inc 827 | 827 includ 828 | 828 incom 829 | 829 increas 830 | 830 incred 831 | 831 inde 832 | 832 independ 833 | 833 index 834 | 834 india 835 | 835 indian 836 | 836 indic 837 | 837 individu 838 | 838 industri 839 | 839 info 840 | 840 inform 841 | 841 initi 842 | 842 inlin 843 | 843 innov 844 | 844 input 845 | 845 insert 846 | 846 insid 847 | 847 instal 848 | 848 instanc 849 | 849 instant 850 | 850 instead 851 | 851 institut 852 | 852 instruct 853 | 853 insur 854 | 854 int 855 | 855 integr 856 | 856 intel 857 | 857 intellig 858 | 858 intend 859 | 859 interact 860 | 860 interest 861 | 861 interfac 862 | 862 intern 863 | 863 internet 864 | 864 interview 865 | 865 into 866 | 866 intro 867 | 867 introduc 868 | 868 inumb 869 | 869 invest 870 | 870 investig 871 | 871 investor 872 | 872 invok 873 | 873 involv 874 | 874 ip 875 | 875 ireland 876 | 876 irish 877 | 877 is 878 | 878 island 879 | 879 isn 880 | 880 iso 881 | 881 isp 882 | 882 issu 883 | 883 it 884 | 884 item 885 | 885 itself 886 | 886 jabber 887 | 887 jame 888 | 888 java 889 | 889 jim 890 | 890 jnumberiiiiiiihepihepihf 891 | 891 job 892 | 892 joe 893 | 893 john 894 | 894 join 895 | 895 journal 896 | 896 judg 897 | 897 judgment 898 | 898 jul 899 | 899 juli 900 | 900 jump 901 | 901 june 902 | 902 just 903 | 903 justin 904 | 904 keep 905 | 905 kei 906 | 906 kept 907 | 907 kernel 908 | 908 kevin 909 | 909 keyboard 910 | 910 kid 911 | 911 kill 912 | 912 kind 913 | 913 king 914 | 914 kingdom 915 | 915 knew 916 | 916 know 917 | 917 knowledg 918 | 918 known 919 | 919 la 920 | 920 lack 921 | 921 land 922 | 922 languag 923 | 923 laptop 924 | 924 larg 925 | 925 larger 926 | 926 largest 927 | 927 laser 928 | 928 last 929 | 929 late 930 | 930 later 931 | 931 latest 932 | 932 launch 933 | 933 law 934 | 934 lawrenc 935 | 935 le 936 | 936 lead 937 | 937 leader 938 | 938 learn 939 | 939 least 940 | 940 leav 941 | 941 left 942 | 942 legal 943 | 943 lender 944 | 944 length 945 | 945 less 946 | 946 lesson 947 | 947 let 948 | 948 letter 949 | 949 level 950 | 950 lib 951 | 951 librari 952 | 952 licens 953 | 953 life 954 | 954 lifetim 955 | 955 light 956 | 956 like 957 | 957 limit 958 | 958 line 959 | 959 link 960 | 960 linux 961 | 961 list 962 | 962 listen 963 | 963 littl 964 | 964 live 965 | 965 ll 966 | 966 lo 967 | 967 load 968 | 968 loan 969 | 969 local 970 | 970 locat 971 | 971 lock 972 | 972 lockergnom 973 | 973 log 974 | 974 long 975 | 975 longer 976 | 976 look 977 | 977 lose 978 | 978 loss 979 | 979 lost 980 | 980 lot 981 | 981 love 982 | 982 low 983 | 983 lower 984 | 984 lowest 985 | 985 lt 986 | 986 ma 987 | 987 mac 988 | 988 machin 989 | 989 made 990 | 990 magazin 991 | 991 mai 992 | 992 mail 993 | 993 mailer 994 | 994 main 995 | 995 maintain 996 | 996 major 997 | 997 make 998 | 998 maker 999 | 999 male 1000 | 1000 man 1001 | 1001 manag 1002 | 1002 mani 1003 | 1003 manual 1004 | 1004 manufactur 1005 | 1005 map 1006 | 1006 march 1007 | 1007 margin 1008 | 1008 mark 1009 | 1009 market 1010 | 1010 marshal 1011 | 1011 mass 1012 | 1012 master 1013 | 1013 match 1014 | 1014 materi 1015 | 1015 matter 1016 | 1016 matthia 1017 | 1017 mayb 1018 | 1018 me 1019 | 1019 mean 1020 | 1020 measur 1021 | 1021 mechan 1022 | 1022 media 1023 | 1023 medic 1024 | 1024 meet 1025 | 1025 member 1026 | 1026 membership 1027 | 1027 memori 1028 | 1028 men 1029 | 1029 mention 1030 | 1030 menu 1031 | 1031 merchant 1032 | 1032 messag 1033 | 1033 method 1034 | 1034 mh 1035 | 1035 michael 1036 | 1036 microsoft 1037 | 1037 middl 1038 | 1038 might 1039 | 1039 mike 1040 
| 1040 mile 1041 | 1041 militari 1042 | 1042 million 1043 | 1043 mime 1044 | 1044 mind 1045 | 1045 mine 1046 | 1046 mini 1047 | 1047 minimum 1048 | 1048 minut 1049 | 1049 miss 1050 | 1050 mistak 1051 | 1051 mobil 1052 | 1052 mode 1053 | 1053 model 1054 | 1054 modem 1055 | 1055 modifi 1056 | 1056 modul 1057 | 1057 moment 1058 | 1058 mon 1059 | 1059 mondai 1060 | 1060 monei 1061 | 1061 monitor 1062 | 1062 month 1063 | 1063 monthli 1064 | 1064 more 1065 | 1065 morn 1066 | 1066 mortgag 1067 | 1067 most 1068 | 1068 mostli 1069 | 1069 mother 1070 | 1070 motiv 1071 | 1071 move 1072 | 1072 movi 1073 | 1073 mpnumber 1074 | 1074 mr 1075 | 1075 ms 1076 | 1076 msg 1077 | 1077 much 1078 | 1078 multi 1079 | 1079 multipart 1080 | 1080 multipl 1081 | 1081 murphi 1082 | 1082 music 1083 | 1083 must 1084 | 1084 my 1085 | 1085 myself 1086 | 1086 name 1087 | 1087 nation 1088 | 1088 natur 1089 | 1089 nbsp 1090 | 1090 near 1091 | 1091 nearli 1092 | 1092 necessari 1093 | 1093 need 1094 | 1094 neg 1095 | 1095 net 1096 | 1096 netscap 1097 | 1097 network 1098 | 1098 never 1099 | 1099 new 1100 | 1100 newslett 1101 | 1101 next 1102 | 1102 nextpart 1103 | 1103 nice 1104 | 1104 nigeria 1105 | 1105 night 1106 | 1106 no 1107 | 1107 nobodi 1108 | 1108 non 1109 | 1109 none 1110 | 1110 nor 1111 | 1111 normal 1112 | 1112 north 1113 | 1113 not 1114 | 1114 note 1115 | 1115 noth 1116 | 1116 notic 1117 | 1117 now 1118 | 1118 nt 1119 | 1119 null 1120 | 1120 number 1121 | 1121 numbera 1122 | 1122 numberam 1123 | 1123 numberanumb 1124 | 1124 numberb 1125 | 1125 numberbit 1126 | 1126 numberc 1127 | 1127 numbercb 1128 | 1128 numbercbr 1129 | 1129 numbercfont 1130 | 1130 numbercli 1131 | 1131 numbercnumb 1132 | 1132 numbercp 1133 | 1133 numberctd 1134 | 1134 numberd 1135 | 1135 numberdari 1136 | 1136 numberdnumb 1137 | 1137 numberenumb 1138 | 1138 numberf 1139 | 1139 numberfb 1140 | 1140 numberff 1141 | 1141 numberffont 1142 | 1142 numberfp 1143 | 1143 numberftd 1144 | 1144 numberk 1145 | 1145 numberm 1146 | 1146 numbermb 1147 | 1147 numberp 1148 | 1148 numberpd 1149 | 1149 numberpm 1150 | 1150 numberpx 1151 | 1151 numberst 1152 | 1152 numberth 1153 | 1153 numbertnumb 1154 | 1154 numberx 1155 | 1155 object 1156 | 1156 oblig 1157 | 1157 obtain 1158 | 1158 obvious 1159 | 1159 occur 1160 | 1160 oct 1161 | 1161 octob 1162 | 1162 of 1163 | 1163 off 1164 | 1164 offer 1165 | 1165 offic 1166 | 1166 offici 1167 | 1167 often 1168 | 1168 oh 1169 | 1169 ok 1170 | 1170 old 1171 | 1171 on 1172 | 1172 onc 1173 | 1173 onli 1174 | 1174 onlin 1175 | 1175 open 1176 | 1176 oper 1177 | 1177 opinion 1178 | 1178 opportun 1179 | 1179 opt 1180 | 1180 optim 1181 | 1181 option 1182 | 1182 or 1183 | 1183 order 1184 | 1184 org 1185 | 1185 organ 1186 | 1186 origin 1187 | 1187 os 1188 | 1188 osdn 1189 | 1189 other 1190 | 1190 otherwis 1191 | 1191 our 1192 | 1192 out 1193 | 1193 outlook 1194 | 1194 output 1195 | 1195 outsid 1196 | 1196 over 1197 | 1197 own 1198 | 1198 owner 1199 | 1199 oz 1200 | 1200 pacif 1201 | 1201 pack 1202 | 1202 packag 1203 | 1203 page 1204 | 1204 pai 1205 | 1205 paid 1206 | 1206 pain 1207 | 1207 palm 1208 | 1208 panel 1209 | 1209 paper 1210 | 1210 paragraph 1211 | 1211 parent 1212 | 1212 part 1213 | 1213 parti 1214 | 1214 particip 1215 | 1215 particular 1216 | 1216 particularli 1217 | 1217 partit 1218 | 1218 partner 1219 | 1219 pass 1220 | 1220 password 1221 | 1221 past 1222 | 1222 patch 1223 | 1223 patent 1224 | 1224 path 1225 | 1225 pattern 1226 | 1226 paul 1227 | 1227 payment 1228 | 1228 pc 1229 | 1229 peac 1230 | 1230 peopl 1231 | 1231 per 
1232 | 1232 percent 1233 | 1233 percentag 1234 | 1234 perfect 1235 | 1235 perfectli 1236 | 1236 perform 1237 | 1237 perhap 1238 | 1238 period 1239 | 1239 perl 1240 | 1240 perman 1241 | 1241 permiss 1242 | 1242 person 1243 | 1243 pgp 1244 | 1244 phone 1245 | 1245 photo 1246 | 1246 php 1247 | 1247 phrase 1248 | 1248 physic 1249 | 1249 pick 1250 | 1250 pictur 1251 | 1251 piec 1252 | 1252 piiiiiiii 1253 | 1253 pipe 1254 | 1254 pjnumber 1255 | 1255 place 1256 | 1256 plai 1257 | 1257 plain 1258 | 1258 plan 1259 | 1259 planet 1260 | 1260 plant 1261 | 1261 planta 1262 | 1262 platform 1263 | 1263 player 1264 | 1264 pleas 1265 | 1265 plu 1266 | 1266 plug 1267 | 1267 pm 1268 | 1268 pocket 1269 | 1269 point 1270 | 1270 polic 1271 | 1271 polici 1272 | 1272 polit 1273 | 1273 poor 1274 | 1274 pop 1275 | 1275 popul 1276 | 1276 popular 1277 | 1277 port 1278 | 1278 posit 1279 | 1279 possibl 1280 | 1280 post 1281 | 1281 potenti 1282 | 1282 pound 1283 | 1283 powel 1284 | 1284 power 1285 | 1285 powershot 1286 | 1286 practic 1287 | 1287 pre 1288 | 1288 predict 1289 | 1289 prefer 1290 | 1290 premium 1291 | 1291 prepar 1292 | 1292 present 1293 | 1293 presid 1294 | 1294 press 1295 | 1295 pretti 1296 | 1296 prevent 1297 | 1297 previou 1298 | 1298 previous 1299 | 1299 price 1300 | 1300 principl 1301 | 1301 print 1302 | 1302 printabl 1303 | 1303 printer 1304 | 1304 privaci 1305 | 1305 privat 1306 | 1306 prize 1307 | 1307 pro 1308 | 1308 probabl 1309 | 1309 problem 1310 | 1310 procedur 1311 | 1311 process 1312 | 1312 processor 1313 | 1313 procmail 1314 | 1314 produc 1315 | 1315 product 1316 | 1316 profession 1317 | 1317 profil 1318 | 1318 profit 1319 | 1319 program 1320 | 1320 programm 1321 | 1321 progress 1322 | 1322 project 1323 | 1323 promis 1324 | 1324 promot 1325 | 1325 prompt 1326 | 1326 properti 1327 | 1327 propos 1328 | 1328 proprietari 1329 | 1329 prospect 1330 | 1330 protect 1331 | 1331 protocol 1332 | 1332 prove 1333 | 1333 proven 1334 | 1334 provid 1335 | 1335 proxi 1336 | 1336 pub 1337 | 1337 public 1338 | 1338 publish 1339 | 1339 pudg 1340 | 1340 pull 1341 | 1341 purchas 1342 | 1342 purpos 1343 | 1343 put 1344 | 1344 python 1345 | 1345 qnumber 1346 | 1346 qualifi 1347 | 1347 qualiti 1348 | 1348 quarter 1349 | 1349 question 1350 | 1350 quick 1351 | 1351 quickli 1352 | 1352 quit 1353 | 1353 quot 1354 | 1354 radio 1355 | 1355 ragga 1356 | 1356 rais 1357 | 1357 random 1358 | 1358 rang 1359 | 1359 rate 1360 | 1360 rather 1361 | 1361 ratio 1362 | 1362 razor 1363 | 1363 razornumb 1364 | 1364 re 1365 | 1365 reach 1366 | 1366 read 1367 | 1367 reader 1368 | 1368 readi 1369 | 1369 real 1370 | 1370 realiz 1371 | 1371 realli 1372 | 1372 reason 1373 | 1373 receiv 1374 | 1374 recent 1375 | 1375 recipi 1376 | 1376 recommend 1377 | 1377 record 1378 | 1378 red 1379 | 1379 redhat 1380 | 1380 reduc 1381 | 1381 refer 1382 | 1382 refin 1383 | 1383 reg 1384 | 1384 regard 1385 | 1385 region 1386 | 1386 regist 1387 | 1387 regul 1388 | 1388 regular 1389 | 1389 rel 1390 | 1390 relat 1391 | 1391 relationship 1392 | 1392 releas 1393 | 1393 relev 1394 | 1394 reliabl 1395 | 1395 remain 1396 | 1396 rememb 1397 | 1397 remot 1398 | 1398 remov 1399 | 1399 replac 1400 | 1400 repli 1401 | 1401 report 1402 | 1402 repositori 1403 | 1403 repres 1404 | 1404 republ 1405 | 1405 request 1406 | 1406 requir 1407 | 1407 research 1408 | 1408 reserv 1409 | 1409 resid 1410 | 1410 resourc 1411 | 1411 respect 1412 | 1412 respond 1413 | 1413 respons 1414 | 1414 rest 1415 | 1415 result 1416 | 1416 retail 1417 | 1417 return 1418 | 1418 reveal 1419 | 1419 
revenu 1420 | 1420 revers 1421 | 1421 review 1422 | 1422 revok 1423 | 1423 rh 1424 | 1424 rich 1425 | 1425 right 1426 | 1426 risk 1427 | 1427 road 1428 | 1428 robert 1429 | 1429 rock 1430 | 1430 role 1431 | 1431 roll 1432 | 1432 rom 1433 | 1433 roman 1434 | 1434 room 1435 | 1435 root 1436 | 1436 round 1437 | 1437 rpm 1438 | 1438 rss 1439 | 1439 rule 1440 | 1440 run 1441 | 1441 sa 1442 | 1442 safe 1443 | 1443 sai 1444 | 1444 said 1445 | 1445 sale 1446 | 1446 same 1447 | 1447 sampl 1448 | 1448 san 1449 | 1449 saou 1450 | 1450 sat 1451 | 1451 satellit 1452 | 1452 save 1453 | 1453 saw 1454 | 1454 scan 1455 | 1455 schedul 1456 | 1456 school 1457 | 1457 scienc 1458 | 1458 score 1459 | 1459 screen 1460 | 1460 script 1461 | 1461 se 1462 | 1462 search 1463 | 1463 season 1464 | 1464 second 1465 | 1465 secret 1466 | 1466 section 1467 | 1467 secur 1468 | 1468 see 1469 | 1469 seed 1470 | 1470 seek 1471 | 1471 seem 1472 | 1472 seen 1473 | 1473 select 1474 | 1474 self 1475 | 1475 sell 1476 | 1476 seminar 1477 | 1477 send 1478 | 1478 sender 1479 | 1479 sendmail 1480 | 1480 senior 1481 | 1481 sens 1482 | 1482 sensit 1483 | 1483 sent 1484 | 1484 sep 1485 | 1485 separ 1486 | 1486 septemb 1487 | 1487 sequenc 1488 | 1488 seri 1489 | 1489 serif 1490 | 1490 seriou 1491 | 1491 serv 1492 | 1492 server 1493 | 1493 servic 1494 | 1494 set 1495 | 1495 setup 1496 | 1496 seven 1497 | 1497 seventh 1498 | 1498 sever 1499 | 1499 sex 1500 | 1500 sexual 1501 | 1501 sf 1502 | 1502 shape 1503 | 1503 share 1504 | 1504 she 1505 | 1505 shell 1506 | 1506 ship 1507 | 1507 shop 1508 | 1508 short 1509 | 1509 shot 1510 | 1510 should 1511 | 1511 show 1512 | 1512 side 1513 | 1513 sign 1514 | 1514 signatur 1515 | 1515 signific 1516 | 1516 similar 1517 | 1517 simpl 1518 | 1518 simpli 1519 | 1519 sinc 1520 | 1520 sincer 1521 | 1521 singl 1522 | 1522 sit 1523 | 1523 site 1524 | 1524 situat 1525 | 1525 six 1526 | 1526 size 1527 | 1527 skeptic 1528 | 1528 skill 1529 | 1529 skin 1530 | 1530 skip 1531 | 1531 sleep 1532 | 1532 slow 1533 | 1533 small 1534 | 1534 smart 1535 | 1535 smoke 1536 | 1536 smtp 1537 | 1537 snumber 1538 | 1538 so 1539 | 1539 social 1540 | 1540 societi 1541 | 1541 softwar 1542 | 1542 sold 1543 | 1543 solut 1544 | 1544 solv 1545 | 1545 some 1546 | 1546 someon 1547 | 1547 someth 1548 | 1548 sometim 1549 | 1549 son 1550 | 1550 song 1551 | 1551 soni 1552 | 1552 soon 1553 | 1553 sorri 1554 | 1554 sort 1555 | 1555 sound 1556 | 1556 sourc 1557 | 1557 south 1558 | 1558 space 1559 | 1559 spain 1560 | 1560 spam 1561 | 1561 spamassassin 1562 | 1562 spamd 1563 | 1563 spammer 1564 | 1564 speak 1565 | 1565 spec 1566 | 1566 special 1567 | 1567 specif 1568 | 1568 specifi 1569 | 1569 speech 1570 | 1570 speed 1571 | 1571 spend 1572 | 1572 sponsor 1573 | 1573 sport 1574 | 1574 spot 1575 | 1575 src 1576 | 1576 ssh 1577 | 1577 st 1578 | 1578 stabl 1579 | 1579 staff 1580 | 1580 stai 1581 | 1581 stand 1582 | 1582 standard 1583 | 1583 star 1584 | 1584 start 1585 | 1585 state 1586 | 1586 statement 1587 | 1587 statu 1588 | 1588 step 1589 | 1589 steve 1590 | 1590 still 1591 | 1591 stock 1592 | 1592 stop 1593 | 1593 storag 1594 | 1594 store 1595 | 1595 stori 1596 | 1596 strategi 1597 | 1597 stream 1598 | 1598 street 1599 | 1599 string 1600 | 1600 strip 1601 | 1601 strong 1602 | 1602 structur 1603 | 1603 studi 1604 | 1604 stuff 1605 | 1605 stupid 1606 | 1606 style 1607 | 1607 subject 1608 | 1608 submit 1609 | 1609 subscrib 1610 | 1610 subscript 1611 | 1611 substanti 1612 | 1612 success 1613 | 1613 such 1614 | 1614 suffer 1615 | 1615 suggest 1616 | 1616 
suit 1617 | 1617 sum 1618 | 1618 summari 1619 | 1619 summer 1620 | 1620 sun 1621 | 1621 super 1622 | 1622 suppli 1623 | 1623 support 1624 | 1624 suppos 1625 | 1625 sure 1626 | 1626 surpris 1627 | 1627 suse 1628 | 1628 suspect 1629 | 1629 sweet 1630 | 1630 switch 1631 | 1631 system 1632 | 1632 tab 1633 | 1633 tabl 1634 | 1634 tablet 1635 | 1635 tag 1636 | 1636 take 1637 | 1637 taken 1638 | 1638 talk 1639 | 1639 tape 1640 | 1640 target 1641 | 1641 task 1642 | 1642 tax 1643 | 1643 teach 1644 | 1644 team 1645 | 1645 tech 1646 | 1646 technic 1647 | 1647 techniqu 1648 | 1648 technolog 1649 | 1649 tel 1650 | 1650 telecom 1651 | 1651 telephon 1652 | 1652 tell 1653 | 1653 temperatur 1654 | 1654 templ 1655 | 1655 ten 1656 | 1656 term 1657 | 1657 termin 1658 | 1658 terror 1659 | 1659 terrorist 1660 | 1660 test 1661 | 1661 texa 1662 | 1662 text 1663 | 1663 than 1664 | 1664 thank 1665 | 1665 that 1666 | 1666 the 1667 | 1667 thei 1668 | 1668 their 1669 | 1669 them 1670 | 1670 themselv 1671 | 1671 then 1672 | 1672 theori 1673 | 1673 there 1674 | 1674 therefor 1675 | 1675 these 1676 | 1676 thi 1677 | 1677 thing 1678 | 1678 think 1679 | 1679 thinkgeek 1680 | 1680 third 1681 | 1681 those 1682 | 1682 though 1683 | 1683 thought 1684 | 1684 thousand 1685 | 1685 thread 1686 | 1686 threat 1687 | 1687 three 1688 | 1688 through 1689 | 1689 thu 1690 | 1690 thursdai 1691 | 1691 ti 1692 | 1692 ticket 1693 | 1693 tim 1694 | 1694 time 1695 | 1695 tip 1696 | 1696 tire 1697 | 1697 titl 1698 | 1698 tm 1699 | 1699 to 1700 | 1700 todai 1701 | 1701 togeth 1702 | 1702 token 1703 | 1703 told 1704 | 1704 toll 1705 | 1705 tom 1706 | 1706 toner 1707 | 1707 toni 1708 | 1708 too 1709 | 1709 took 1710 | 1710 tool 1711 | 1711 top 1712 | 1712 topic 1713 | 1713 total 1714 | 1714 touch 1715 | 1715 toward 1716 | 1716 track 1717 | 1717 trade 1718 | 1718 tradit 1719 | 1719 traffic 1720 | 1720 train 1721 | 1721 transact 1722 | 1722 transfer 1723 | 1723 travel 1724 | 1724 treat 1725 | 1725 tree 1726 | 1726 tri 1727 | 1727 trial 1728 | 1728 trick 1729 | 1729 trip 1730 | 1730 troubl 1731 | 1731 true 1732 | 1732 truli 1733 | 1733 trust 1734 | 1734 truth 1735 | 1735 try 1736 | 1736 tue 1737 | 1737 tuesdai 1738 | 1738 turn 1739 | 1739 tv 1740 | 1740 two 1741 | 1741 type 1742 | 1742 uk 1743 | 1743 ultim 1744 | 1744 un 1745 | 1745 under 1746 | 1746 understand 1747 | 1747 unfortun 1748 | 1748 uniqu 1749 | 1749 unison 1750 | 1750 unit 1751 | 1751 univers 1752 | 1752 unix 1753 | 1753 unless 1754 | 1754 unlik 1755 | 1755 unlimit 1756 | 1756 unseen 1757 | 1757 unsolicit 1758 | 1758 unsubscrib 1759 | 1759 until 1760 | 1760 up 1761 | 1761 updat 1762 | 1762 upgrad 1763 | 1763 upon 1764 | 1764 urgent 1765 | 1765 url 1766 | 1766 us 1767 | 1767 usa 1768 | 1768 usag 1769 | 1769 usb 1770 | 1770 usd 1771 | 1771 usdollarnumb 1772 | 1772 useless 1773 | 1773 user 1774 | 1774 usr 1775 | 1775 usual 1776 | 1776 util 1777 | 1777 vacat 1778 | 1778 valid 1779 | 1779 valu 1780 | 1780 valuabl 1781 | 1781 var 1782 | 1782 variabl 1783 | 1783 varieti 1784 | 1784 variou 1785 | 1785 ve 1786 | 1786 vendor 1787 | 1787 ventur 1788 | 1788 veri 1789 | 1789 verifi 1790 | 1790 version 1791 | 1791 via 1792 | 1792 video 1793 | 1793 view 1794 | 1794 virtual 1795 | 1795 visa 1796 | 1796 visit 1797 | 1797 visual 1798 | 1798 vnumber 1799 | 1799 voic 1800 | 1800 vote 1801 | 1801 vs 1802 | 1802 vulner 1803 | 1803 wa 1804 | 1804 wai 1805 | 1805 wait 1806 | 1806 wake 1807 | 1807 walk 1808 | 1808 wall 1809 | 1809 want 1810 | 1810 war 1811 | 1811 warm 1812 | 1812 warn 1813 | 1813 warranti 1814 | 
1814 washington 1815 | 1815 wasn 1816 | 1816 wast 1817 | 1817 watch 1818 | 1818 water 1819 | 1819 we 1820 | 1820 wealth 1821 | 1821 weapon 1822 | 1822 web 1823 | 1823 weblog 1824 | 1824 websit 1825 | 1825 wed 1826 | 1826 wednesdai 1827 | 1827 week 1828 | 1828 weekli 1829 | 1829 weight 1830 | 1830 welcom 1831 | 1831 well 1832 | 1832 went 1833 | 1833 were 1834 | 1834 west 1835 | 1835 what 1836 | 1836 whatev 1837 | 1837 when 1838 | 1838 where 1839 | 1839 whether 1840 | 1840 which 1841 | 1841 while 1842 | 1842 white 1843 | 1843 whitelist 1844 | 1844 who 1845 | 1845 whole 1846 | 1846 whose 1847 | 1847 why 1848 | 1848 wi 1849 | 1849 wide 1850 | 1850 width 1851 | 1851 wife 1852 | 1852 will 1853 | 1853 william 1854 | 1854 win 1855 | 1855 window 1856 | 1856 wing 1857 | 1857 winner 1858 | 1858 wireless 1859 | 1859 wish 1860 | 1860 with 1861 | 1861 within 1862 | 1862 without 1863 | 1863 wnumberp 1864 | 1864 woman 1865 | 1865 women 1866 | 1866 won 1867 | 1867 wonder 1868 | 1868 word 1869 | 1869 work 1870 | 1870 worker 1871 | 1871 world 1872 | 1872 worldwid 1873 | 1873 worri 1874 | 1874 worst 1875 | 1875 worth 1876 | 1876 would 1877 | 1877 wouldn 1878 | 1878 write 1879 | 1879 written 1880 | 1880 wrong 1881 | 1881 wrote 1882 | 1882 www 1883 | 1883 ximian 1884 | 1884 xml 1885 | 1885 xp 1886 | 1886 yahoo 1887 | 1887 ye 1888 | 1888 yeah 1889 | 1889 year 1890 | 1890 yesterdai 1891 | 1891 yet 1892 | 1892 york 1893 | 1893 you 1894 | 1894 young 1895 | 1895 your 1896 | 1896 yourself 1897 | 1897 zdnet 1898 | 1898 zero 1899 | 1899 zip 1900 | --------------------------------------------------------------------------------