├── .gitignore ├── README.md ├── algorithm_analysis ├── data │ └── water.mat ├── diagnose.py └── linear_regression.py ├── anomaly_detection ├── anomaly.py ├── data │ ├── ex8data1.mat │ └── ex8data2.mat └── test_anomaly_detection.py ├── kmeans ├── data │ ├── places.txt │ ├── portlandClubs.txt │ ├── testSet.txt │ └── testSet2.txt ├── kmeans.py ├── test_bi_kmeans.py └── test_normal_kmeans.py ├── linear_regression ├── data │ ├── ex0.txt │ ├── ex1.txt │ ├── houses.txt │ ├── lwr.txt │ └── temperature.txt ├── regression.py ├── test_bgd.py ├── test_feature_scaling.py ├── test_lwr.py ├── test_multiple.py ├── test_sgd.py ├── test_temperature_normal.py └── test_temperature_polynomial.py ├── logical_regression ├── data │ ├── ex3data1.mat │ ├── linear.txt │ └── non_linear.txt ├── logical_regression.py ├── test_linear_boundry.py ├── test_non_linear_boundry.py └── test_onevsall.py ├── neural_network ├── data │ ├── ex4weights.mat │ └── handwritten_digits.mat ├── nn.py ├── test_handwritten_digits.py └── test_logic_and.py ├── pca ├── data │ ├── bird_small.mat │ ├── ex7data1.mat │ ├── ex7data2.mat │ └── ex7faces.mat ├── kmeans.py ├── pca.py ├── test_pca4performance.py └── test_pca4visualization.py ├── recommender_system ├── data │ ├── ex8_movieParams.mat │ ├── ex8_movies.mat │ └── movie_ids.txt ├── recommender.py └── test_movies_rating.py └── svm ├── data ├── emailSample1.txt ├── emailSample2.txt ├── ex6data1.mat ├── ex6data2.mat ├── ex6data3.mat ├── spamSample1.txt ├── spamSample2.txt ├── spamTest.mat └── spamTrain.mat ├── smo.py ├── spam.py ├── test_linear.py ├── test_model_selection.py ├── test_non_linear.py ├── test_spam.py └── vocab.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stanford Machine Learning Notes 2 | ================== 3 | 4 | Gitbook address: [click here](https://www.gitbook.com/book/yoyoyohamapi/mit-ml/details) 5 | 6 | This book is a set of notes for Professor Andrew Ng's Stanford [Machine Learning course](https://zh.coursera.org/learn/machine-learning) on Coursera. It covers most of the topics and material discussed in class. For reasons of space, some formula derivations are not written out in full, but I still recommend working through them on scratch paper to reinforce them: know not only that something is so, but why it is so. 7 | 8 | All of the program code involved in this book is hosted in my personal [github](https://github.com/yoyoyohamapi/mit-ml) repository and is implemented in Python; most of it consists of complete implementations and tests of the learning algorithms concerned. I have not included the course homework code, because the homework programming assignments are fill-in-the-blank exercises, while implementing an algorithm from scratch, though a rougher road, is a better check of how well you understand and have mastered it. 9 | 10 | The chapters of this book correspond to the course as follows: 11 | 12 | | Stanford course | Book chapter | 13 | |:-----------|:-------------------| 14 | | Week 2 | Linear Regression | 15 | | Week 3 | Logistic Regression | 16 | | Week 4-5 | Neural Networks | 17 | | Week 6 | Algorithm Analysis and Optimization | 18 | | Week 7 | SVM (Support Vector Machines) | 19 | | Week 8 | K-Means, Dimensionality Reduction | 20 | | Week 9 | Anomaly Detection, Recommender Systems | 21 | | Week 10 | Large-Scale Machine Learning | 22 | | Week 11 | Case Study: Optical Character Recognition | 23 | 24 | As a student I am still only a beginner in machine learning, so the text inevitably contains slips and even serious errors; please point them out, as that is the greatest help to me. The chief purpose of this book is shared learning, not stars or circulation. The road is long; let us press on together. 25 | -------------------------------------------------------------------------------- /algorithm_analysis/data/water.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/algorithm_analysis/data/water.mat -------------------------------------------------------------------------------- /algorithm_analysis/diagnose.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # algorithm_analysis/diagnose.py 3 | """Algorithm diagnostics 4 | """ 5 | import linear_regression 6 | import numpy as np 7 | from
scipy.io import loadmat 8 | import matplotlib.pyplot as plt 9 | from sklearn.preprocessing import PolynomialFeatures 10 | 11 | data = loadmat('data/water.mat') 12 | ##### 13 | # 数据集划分 14 | ##### 15 | # 训练集 16 | X = np.mat(data['X']) 17 | # 为X添加偏置 18 | X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) 19 | y = np.mat(data['y']) 20 | # 交叉验证集 21 | Xval = np.mat(data['Xval']) 22 | Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1) 23 | yval = np.mat(data['yval']) 24 | # 测试集 25 | Xtest = np.mat(data['Xtest']) 26 | Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1) 27 | ytest = np.mat(data['ytest']) 28 | 29 | def diagnoseLR(): 30 | """线性回归诊断 31 | """ 32 | initTheta = np.mat(np.ones((X.shape[1], 1))) 33 | result, timeConsumed = linear_regression.gradient( 34 | X, y, rate=0.001, maxLoop=5000, epsilon=0.1, initTheta=initTheta) 35 | theta, errors = result 36 | 37 | # 绘制拟合成果 38 | Xmin = X[:, 1].min() 39 | Xmax = X[:, 1].max() 40 | ymax = y[:, 0].max() 41 | ymin = y[:, 0].min() 42 | fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1)) 43 | fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1) 44 | h = fitX * theta 45 | plt.xlim(Xmin, Xmax) 46 | plt.ylim(ymin, ymax) 47 | # 绘制训练样本 48 | plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0],marker='x',color='r', linewidth=2) 49 | # 绘制拟合曲线 50 | plt.plot(fitX[:, 1], h, color='b') 51 | plt.xlabel('Change in water level(x)') 52 | plt.ylabel('Water flowing out of the dam(y)') 53 | plt.show() 54 | 55 | # 绘制随样本规模学习曲线 56 | m, n = X.shape 57 | trainErrors = np.zeros((1,m)) 58 | valErrors = np.zeros((1,m)) 59 | for i in range(m): 60 | Xtrain = X[0:i+1] 61 | ytrain = y[0:i+1] 62 | res, timeConsumed = linear_regression.gradient( 63 | Xtrain, ytrain, rate=0.001, maxLoop=5000, epsilon=0.1) 64 | theta, errors = res 65 | trainErrors[0,i] = errors[-1] 66 | valErrors[0,i] = linear_regression.J(theta, Xval, yval) 67 | 68 | plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label='Training Error') 69 | plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label='Validation Error') 70 | plt.title('Learning curve for linear regression') 71 | plt.xlabel('Number of training examples') 72 | plt.ylabel('Error') 73 | plt.legend() 74 | plt.show() 75 | 76 | def diagnosePR(): 77 | """多项式回归诊断 78 | """ 79 | # 多项式回归 80 | poly = PolynomialFeatures(degree=8) 81 | XX, XXval, XXtest = [linear_regression.normalize( 82 | np.mat(poly.fit_transform(data[:, 1:]))) for data in [X, Xval, Xtest]] 83 | initTheta = np.mat(np.ones((XX.shape[1], 1))) 84 | theLambdas = [1.0, 0.001, 0.003, 0.01, 0.003, 0.1, 0.3, 1.0, 3.0, 10.0] 85 | numTheLambdas = len(theLambdas) 86 | trainErrors = np.zeros((1, numTheLambdas)) 87 | valErrors = np.zeros((1, numTheLambdas)) 88 | thetas = [] 89 | for idx, theLambda in enumerate(theLambdas): 90 | res, timeConsumed = linear_regression.gradient( 91 | XX, y, rate=0.3, maxLoop=500, epsilon=0.01, 92 | theLambda=theLambda, initTheta=initTheta) 93 | theta, errors = res 94 | thetas.append(theta) 95 | trainErrors[0, idx] = errors[-1] 96 | valErrors[0, idx] = linear_regression.J( 97 | theta, XXval, yval, theLambda=theLambda) 98 | bestLambda = theLambdas[np.argmin(valErrors)] 99 | theta = thetas[np.argmin(valErrors)] 100 | error = np.min(valErrors) 101 | 102 | # # 绘制随样本规模学习曲线 103 | plt.plot(np.arange(1, numTheLambdas + 1).ravel(), 104 | trainErrors.ravel(), color='b', label='Training Error') 105 | plt.plot(np.arange(1, numTheLambdas + 1).ravel(), 106 | valErrors.ravel(), color='g', 
label='Validation Error') 107 | plt.title('Learning curve for polynomial regression') 108 | plt.xlabel('lambda') 109 | plt.ylabel('Error') 110 | plt.legend() 111 | plt.show() 112 | 113 | # 绘制拟合曲线 114 | fitX = np.mat(np.linspace(-60, 45).reshape(-1, 1)) 115 | fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1) 116 | fitXX = linear_regression.normalize(np.mat(poly.fit_transform(fitX[:, 1:]))) 117 | h = fitXX * theta 118 | plt.title('Polynomial regression learning curve(lambda=%.3f) \n validation error=%.3f' % (bestLambda, error)) 119 | plt.scatter(X[:, 1].ravel(), y[:, 0].flatten().A[0], marker='x', color='r', linewidth=3) 120 | plt.plot(fitX[:, 1], h, color='b') 121 | plt.show() 122 | 123 | diagnoseLR() 124 | diagnosePR() 125 | -------------------------------------------------------------------------------- /algorithm_analysis/linear_regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # algorithm_analysis/linear_regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | 8 | def exeTime(func): 9 | """ 耗时计算装饰器 10 | """ 11 | def newFunc(*args, **args2): 12 | t0 = time.time() 13 | back = func(*args, **args2) 14 | return back, time.time() - t0 15 | return newFunc 16 | 17 | 18 | def h(theta, x): 19 | """预测函数 20 | 21 | Args: 22 | theta 相关系数矩阵 23 | x 特征向量 24 | 25 | Returns: 26 | 预测结果 27 | """ 28 | return (theta.T * x)[0, 0] 29 | 30 | 31 | def J(theta, X, y, theLambda=0): 32 | """代价函数 33 | 34 | Args: 35 | theta 相关系数矩阵 36 | X 样本集矩阵 37 | y 标签集矩阵 38 | 39 | Returns: 40 | 预测误差(代价) 41 | """ 42 | m = len(X) 43 | return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta)) / (2*m) 44 | 45 | 46 | @exeTime 47 | def gradient(X, y, rate=1, maxLoop=50, epsilon=1e-1, theLambda=0, initTheta=None): 48 | """批量梯度下降法 49 | 50 | Args: 51 | X 样本矩阵 52 | y 标签矩阵 53 | rate 学习率 54 | maxLoop 最大迭代次数 55 | epsilon 收敛精度 56 | theLambda 正规化参数 57 | Returns: 58 | (theta, errors), timeConsumed 59 | """ 60 | m, n = X.shape 61 | # 初始化theta 62 | if initTheta is None: 63 | theta = np.zeros((n, 1)) 64 | else: 65 | theta = initTheta 66 | count = 0 67 | converged = False 68 | error = float('inf') 69 | errors = [] 70 | for i in range(maxLoop): 71 | theta = theta + (1.0 / m) * rate * ((y - X * theta).T * X).T 72 | error = J(theta, X, y, theLambda) 73 | if np.isnan(error) is True: 74 | error = np.inf 75 | else: 76 | error = error[0, 0] 77 | errors.append(error) 78 | # 如果已经收敛 79 | if(error < epsilon): 80 | break 81 | return theta, errors 82 | 83 | def standardize(X): 84 | """特征标准化处理 85 | 86 | Args: 87 | X 样本集 88 | Returns: 89 | 标准后的样本集 90 | """ 91 | m, n = X.shape 92 | # 归一化每一个特征 93 | for j in range(n): 94 | features = X[:,j] 95 | meanVal = features.mean(axis=0) 96 | std = features.std(axis=0) 97 | if std != 0: 98 | X[:, j] = (features-meanVal)/std 99 | else: 100 | X[:, j] = 0 101 | return X 102 | 103 | def normalize(X): 104 | """特征归一化处理 105 | 106 | Args: 107 | X 样本集 108 | Returns: 109 | 归一化后的样本集 110 | """ 111 | m, n = X.shape 112 | # 归一化每一个特征 113 | for j in range(n): 114 | features = X[:,j] 115 | minVal = features.min(axis=0) 116 | maxVal = features.max(axis=0) 117 | diff = maxVal - minVal 118 | if diff != 0: 119 | X[:,j] = (features-minVal)/diff 120 | else: 121 | X[:,j] = 0 122 | return X 123 | 124 | def getLearningCurves(X, y, Xval, yval, rate=1, maxLoop=50, epsilon=0.1, theLambda=0): 125 | """获得学习曲线 126 | 127 | Args: 128 | X 样本集 129 | y 标签集 130 | Xval 交叉验证集 131 | yval 交叉验证集标签 132 | Returns: 133 | 
trainErrors 训练误差随样本规模的变化 134 | valErrors 校验验证集误差随样本规模的变化 135 | """ 136 | # 绘制随样本规模学习曲线 137 | m, n = X.shape 138 | trainErrors = np.zeros((1,m)) 139 | valErrors = np.zeros((1,m)) 140 | for i in range(m): 141 | Xtrain = X[0:i+1] 142 | ytrain = y[0:i+1] 143 | res, timeConsumed = gradient( 144 | Xtrain, ytrain, rate=rate, maxLoop=maxLoop, epsilon=epsilon,theLambda=theLambda) 145 | theta, errors = res 146 | trainErrors[0,i] = errors[-1] 147 | valErrors[0,i] = J(theta, Xval, yval, theLambda=theLambda) 148 | return trainErrors, valErrors 149 | -------------------------------------------------------------------------------- /anomaly_detection/anomaly.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # anomaly_detection/anomaly.py 3 | 4 | import numpy as np 5 | 6 | def F1(predictions, y): 7 | """F_1Score 8 | 9 | Args: 10 | predictions 预测 11 | y 真实值 12 | Returns: 13 | F_1Score 14 | """ 15 | TP = np.sum((predictions == 1) & (y == 1)) 16 | FP = np.sum((predictions == 1) & (y == 0)) 17 | FN = np.sum((predictions == 0) & (y == 1)) 18 | if TP + FP == 0: 19 | precision = 0 20 | else: 21 | precision = float(TP) / (TP + FP) 22 | if TP + FN == 0: 23 | recall = 0 24 | else: 25 | recall = float(TP) / (TP + FN) 26 | if precision + recall == 0: 27 | return 0 28 | else: 29 | return (2.0 * precision * recall) / (precision + recall) 30 | 31 | 32 | def gaussianModel(X): 33 | """高斯模型 34 | 35 | Args: 36 | X 样本集 37 | Returns: 38 | p 模型 39 | """ 40 | # 参数估计 41 | m, n = X.shape 42 | mu = np.mean(X, axis=0) 43 | delta2 = np.var(X, axis=0) 44 | def p(x): 45 | """p(x) 46 | 47 | Args: 48 | x x 49 | mu mu 50 | delta2 delta2 51 | Returns: 52 | p 53 | """ 54 | total = 1 55 | for j in range(x.shape[0]): 56 | total *= np.exp(-np.power((x[j, 0] - mu[0, j]), 2) / (2 * delta2[0, j]**2) 57 | ) / (np.sqrt(2 * np.pi * delta2[0, j])) 58 | return total 59 | return p 60 | 61 | 62 | def multivariateGaussianModel(X): 63 | """多元高斯模型 64 | 65 | Args: 66 | X 样本集 67 | Returns: 68 | p 模型 69 | """ 70 | # 参数估计 71 | m, n = X.shape 72 | mu = np.mean(X.T, axis=1) 73 | Sigma = np.var(X, axis=0) 74 | Sigma = np.diagflat(Sigma) 75 | # Sigma = np.mat(np.cov(X.T)) 76 | detSigma = np.linalg.det(Sigma) 77 | 78 | def p(x): 79 | """p(x) 80 | 81 | Args: 82 | x x 83 | mu mu 84 | delta2 delta2 85 | Returns: 86 | p 87 | """ 88 | x = x - mu 89 | return np.exp(-x.T * np.linalg.pinv(Sigma) * x / 2).A[0] * \ 90 | ((2*np.pi)**(-n/2) * (detSigma**(-0.5) )) 91 | return p 92 | 93 | 94 | def train(X, model=gaussianModel): 95 | """训练函数 96 | 97 | Args: 98 | X 样本集 99 | Returns: 100 | p 概率模型 101 | """ 102 | return model(X) 103 | -------------------------------------------------------------------------------- /anomaly_detection/data/ex8data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/anomaly_detection/data/ex8data1.mat -------------------------------------------------------------------------------- /anomaly_detection/data/ex8data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/anomaly_detection/data/ex8data2.mat -------------------------------------------------------------------------------- /anomaly_detection/test_anomaly_detection.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # 
anomaly_detection/test_anomaly_detection.py 3 | 4 | import numpy as np 5 | from scipy.io import loadmat 6 | import matplotlib.pyplot as plt 7 | import anomaly 8 | 9 | def selectEpsilon(XVal, yVal, p): 10 | # 通过交叉验证集,选择最好的 epsilon 参数 11 | pVal = np.mat([p(x.T) for x in XVal]).reshape(-1, 1) 12 | step = (np.max(pVal) - np.min(pVal)) / 1000 13 | bestEpsilon = 0 14 | bestF1 = 0 15 | for epsilon in np.arange(np.min(pVal), np.max(pVal), step): 16 | predictions = pVal < epsilon 17 | F1 = anomaly.F1(predictions, yVal) 18 | if F1 > bestF1: 19 | bestF1 = F1 20 | bestEpsilon = epsilon 21 | return bestEpsilon, bestF1 22 | 23 | # 小维度测试...... 24 | data = loadmat('data/ex8data1.mat') 25 | X = np.mat(data['X']) 26 | XVal = np.mat(data['Xval']) 27 | yVal = np.mat(data['yval']) 28 | 29 | # p = anomaly.train(X) 30 | p = anomaly.train(X, model=anomaly.multivariateGaussianModel) 31 | pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) 32 | 33 | # 绘制数据点 34 | plt.xlabel('Latency (ms)') 35 | plt.ylabel('Throughput (mb/s)') 36 | plt.plot(X[:, 0], X[:, 1], 'bx') 37 | epsilon, F1 = selectEpsilon(XVal, yVal, p) 38 | 39 | print 'Best epsilon found using cross-validation: %e\n'%epsilon 40 | print 'Best F1 on Cross Validation Set: %f\n'%F1 41 | print '# Outliers found: %d' % np.sum(pTest < epsilon) 42 | 43 | # 获得训练集的异常点 44 | outliers = np.where(pTest < epsilon, True, False).ravel() 45 | plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1) 46 | n = np.linspace(0, 35, 100) 47 | X1 = np.meshgrid(n,n) 48 | XFit = np.mat(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten()))) 49 | pFit = np.mat([p(x.T) for x in XFit]).reshape(-1, 1) 50 | pFit = pFit.reshape(X1[0].shape) 51 | # Do not plot if there are infinities 52 | if not np.isinf(np.sum(pFit)): 53 | plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3).T) 54 | plt.show() 55 | 56 | 57 | # 大维度测试...... 58 | data = loadmat('data/ex8data2.mat') 59 | X = np.mat(data['X']) 60 | XVal = np.mat(data['Xval']) 61 | yVal = np.mat(data['yval']) 62 | 63 | # p = anomaly.train(X) 64 | p = anomaly.train(X, model=anomaly.multivariateGaussianModel) 65 | pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) 66 | 67 | epsilon, F1 = selectEpsilon(XVal, yVal, p) 68 | 69 | print 'Best epsilon found using cross-validation: %e\n'%epsilon 70 | print 'Best F1 on Cross Validation Set: %f\n'%F1 71 | print '# Outliers found: %d' % np.sum(pTest < epsilon) 72 | -------------------------------------------------------------------------------- /kmeans/data/places.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 45.486502 -122.788346 2 | Hotties 10140 SW Canyon Rd. 
Beaverton, OR 45.493150 -122.781021 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 45.498187 -122.766147 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 45.485943 -122.800311 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 45.508203 -122.781853 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 45.493398 -122.779628 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 45.430319 -122.376304 8 | 505 Club 505 Burnside Rd Gresham, OR 45.507621 -122.425553 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 45.399070 -122.618893 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 45.427072 -122.634159 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 45.462173 -122.638846 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 45.485396 -122.646587 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 45.464826 -122.699212 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 45.646831 -122.842918 15 | Cabaret II 17544 Stark St Portland, OR 45.519142 -122.482480 16 | Cabaret Lounge 503 W Burnside Portland, OR 45.523094 -122.675528 17 | Carnaval 330 SW 3rd Avenue Portland, OR 45.520682 -122.674206 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 45.543016 -122.720828 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 45.562715 -122.593078 20 | Club 205 9939 Stark St Portland, OR 45.519052 -122.561510 21 | Club Rouge 403 SW Stark Portland, OR 45.520561 -122.675605 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 45.584124 -122.682725 23 | Devil's Point 5305 SE Foster Rd Portland, OR 45.495365 -122.608366 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 45.497750 -122.524073 25 | Dream on Saloon 15920 Stark St Portland, OR 45.519142 -122.499672 26 | DV8 5003 Powell Blvd Portland, OR 45.497498 -122.611177 27 | Exotica 240 Columbia Blvd Portland, OR 45.583048 -122.668350 28 | Frolics 8845 Sandy Blvd Portland, OR 45.555384 -122.571475 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 45.554263 -122.574167 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 45.547229 -122.578746 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 45.484823 -122.589208 32 | Glimmers 3532 Powell Blvd Portland, OR 45.496918 -122.627920 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 45.520714 -122.674189 34 | Heat 12131 SE Holgate Blvd. Portland, OR 45.489637 -122.538196 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 45.548651 -122.578730 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 45.497765 -122.523985 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 45.555811 -122.600881 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 45.526306 -122.538833 39 | Landing Strip 6210 Columbia Blvd Portland, OR 45.595042 -122.728825 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 45.501585 -122.659310 41 | Lure 11051 Barbur Blvd Portland, OR 45.445233 -122.732606 42 | Magic Garden 217 4th Ave Portland, OR 45.524692 -122.674466 43 | Mary's Club 129 Broadway Portland, OR 45.535101 -122.667390 44 | Montego's 15826 SE Division Portland, OR 45.504448 -122.500034 45 | Mr. Peeps 709 122nd Ave Portland, OR 45.527863 -122.537726 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 45.532426 -122.628865 47 | Mystic 9950 SE Stark St. 
Portland, OR 45.519037 -122.561283 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 45.540098 -122.641114 49 | Oh Zone 6218 Columbia Blvd Portland, OR 45.595069 -122.728961 50 | Pallas Club 13639 Powell Blvd Portland, OR 45.497990 -122.522849 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 45.549288 -122.586505 52 | Private Pleasures 10931 53rd Ave Portland, OR 45.446442 -122.731034 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 45.547337 -122.578744 54 | Riverside Corral 545 Tacoma St Portland, OR 45.464338 -122.660285 55 | Rooster's 605 Columbia Blvd Portland, OR 45.583693 -122.672462 56 | Rose City Strip 3620 35th Pl Portland, OR 45.496601 -122.627688 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 45.497091 -122.634581 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 45.517225 -122.656367 59 | Secret Rendezvous 12503 Division St Portland, OR 45.504087 -122.534481 60 | Shimmers 7944 Foster Rd Portland, OR 45.483836 -122.581608 61 | Soobie's 333 SE 122nd Ave Portland, OR 45.520162 -122.537787 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 45.523370 -122.672388 63 | Sugar Shack 6732 Killingsworth St Portland, OR 45.562699 -122.593048 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 45.512220 -122.655527 65 | Tommy's Too 10335 Foster Rd Portland, OR 45.476721 -122.557005 66 | Union Jacks 938 Burnside St Portland, OR 45.522902 -122.656249 67 | Video Visions 6723 Killingsworth St Portland, OR 45.562715 -122.593078 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 45.425788 -122.765754 69 | Jiggles 7455 SW Nyberg St Tualatin, OR 45.382682 -122.753932 70 | -------------------------------------------------------------------------------- /kmeans/data/portlandClubs.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 2 | Hotties 10140 SW Canyon Rd. Beaverton, OR 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 8 | 505 Club 505 Burnside Rd Gresham, OR 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 15 | Cabaret II 17544 Stark St Portland, OR 16 | Cabaret Lounge 503 W Burnside Portland, OR 17 | Carnaval 330 SW 3rd Avenue Portland, OR 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 20 | Club 205 9939 Stark St Portland, OR 21 | Club Rouge 403 SW Stark Portland, OR 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 23 | Devil's Point 5305 SE Foster Rd Portland, OR 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 25 | Dream on Saloon 15920 Stark St Portland, OR 26 | DV8 5003 Powell Blvd Portland, OR 27 | Exotica 240 Columbia Blvd Portland, OR 28 | Frolics 8845 Sandy Blvd Portland, OR 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 32 | Glimmers 3532 Powell Blvd Portland, OR 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 34 | Heat 12131 SE Holgate Blvd. 
Portland, OR 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 39 | Landing Strip 6210 Columbia Blvd Portland, OR 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 41 | Lure 11051 Barbur Blvd Portland, OR 42 | Magic Garden 217 4th Ave Portland, OR 43 | Mary's Club 129 Broadway Portland, OR 44 | Montego's 15826 SE Division Portland, OR 45 | Mr. Peeps 709 122nd Ave Portland, OR 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 47 | Mystic 9950 SE Stark St. Portland, OR 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 49 | Oh Zone 6218 Columbia Blvd Portland, OR 50 | Pallas Club 13639 Powell Blvd Portland, OR 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 52 | Private Pleasures 10931 53rd Ave Portland, OR 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 54 | Riverside Corral 545 Tacoma St Portland, OR 55 | Rooster's 605 Columbia Blvd Portland, OR 56 | Rose City Strip 3620 35th Pl Portland, OR 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 59 | Secret Rendezvous 12503 Division St Portland, OR 60 | Shimmers 7944 Foster Rd Portland, OR 61 | Soobie's 333 SE 122nd Ave Portland, OR 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 63 | Sugar Shack 6732 Killingsworth St Portland, OR 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 65 | Tommy's Too 10335 Foster Rd Portland, OR 66 | Union Jacks 938 Burnside St Portland, OR 67 | Video Visions 6723 Killingsworth St Portland, OR 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 69 | Jiggles 7455 SW Nyberg St Tualatin, OR -------------------------------------------------------------------------------- /kmeans/data/testSet.txt: -------------------------------------------------------------------------------- 1 | 1.658985 4.285136 2 | -3.453687 3.424321 3 | 4.838138 -1.151539 4 | -5.379713 -3.362104 5 | 0.972564 2.924086 6 | -3.567919 1.531611 7 | 0.450614 -3.302219 8 | -3.487105 -1.724432 9 | 2.668759 1.594842 10 | -3.156485 3.191137 11 | 3.165506 -3.999838 12 | -2.786837 -3.099354 13 | 4.208187 2.984927 14 | -2.123337 2.943366 15 | 0.704199 -0.479481 16 | -0.392370 -3.963704 17 | 2.831667 1.574018 18 | -0.790153 3.343144 19 | 2.943496 -3.357075 20 | -3.195883 -2.283926 21 | 2.336445 2.875106 22 | -1.786345 2.554248 23 | 2.190101 -1.906020 24 | -3.403367 -2.778288 25 | 1.778124 3.880832 26 | -1.688346 2.230267 27 | 2.592976 -2.054368 28 | -4.007257 -3.207066 29 | 2.257734 3.387564 30 | -2.679011 0.785119 31 | 0.939512 -4.023563 32 | -3.674424 -2.261084 33 | 2.046259 2.735279 34 | -3.189470 1.780269 35 | 4.372646 -0.822248 36 | -2.579316 -3.497576 37 | 1.889034 5.190400 38 | -0.798747 2.185588 39 | 2.836520 -2.658556 40 | -3.837877 -3.253815 41 | 2.096701 3.886007 42 | -2.709034 2.923887 43 | 3.367037 -3.184789 44 | -2.121479 -4.232586 45 | 2.329546 3.179764 46 | -3.284816 3.273099 47 | 3.091414 -3.815232 48 | -3.762093 -2.432191 49 | 3.542056 2.778832 50 | -1.736822 4.241041 51 | 2.127073 -2.983680 52 | -4.323818 -3.938116 53 | 3.792121 5.135768 54 | -4.786473 3.358547 55 | 2.624081 -3.260715 56 | -4.009299 -2.978115 57 | 2.493525 1.963710 58 | -2.513661 2.642162 59 | 1.864375 -3.176309 60 | -3.171184 -3.572452 61 | 2.894220 2.489128 62 | -2.562539 2.884438 63 | 3.491078 -3.947487 64 | -2.565729 -2.012114 65 | 3.332948 3.983102 66 | -1.616805 3.573188 67 | 2.280615 -2.559444 68 | 
-2.651229 -3.103198 69 | 2.321395 3.154987 70 | -1.685703 2.939697 71 | 3.031012 -3.620252 72 | -4.599622 -2.185829 73 | 4.196223 1.126677 74 | -2.133863 3.093686 75 | 4.668892 -2.562705 76 | -2.793241 -2.149706 77 | 2.884105 3.043438 78 | -2.967647 2.848696 79 | 4.479332 -1.764772 80 | -4.905566 -2.911070 81 | -------------------------------------------------------------------------------- /kmeans/data/testSet2.txt: -------------------------------------------------------------------------------- 1 | 3.275154 2.957587 2 | -3.344465 2.603513 3 | 0.355083 -3.376585 4 | 1.852435 3.547351 5 | -2.078973 2.552013 6 | -0.993756 -0.884433 7 | 2.682252 4.007573 8 | -3.087776 2.878713 9 | -1.565978 -1.256985 10 | 2.441611 0.444826 11 | -0.659487 3.111284 12 | -0.459601 -2.618005 13 | 2.177680 2.387793 14 | -2.920969 2.917485 15 | -0.028814 -4.168078 16 | 3.625746 2.119041 17 | -3.912363 1.325108 18 | -0.551694 -2.814223 19 | 2.855808 3.483301 20 | -3.594448 2.856651 21 | 0.421993 -2.372646 22 | 1.650821 3.407572 23 | -2.082902 3.384412 24 | -0.718809 -2.492514 25 | 4.513623 3.841029 26 | -4.822011 4.607049 27 | -0.656297 -1.449872 28 | 1.919901 4.439368 29 | -3.287749 3.918836 30 | -1.576936 -2.977622 31 | 3.598143 1.975970 32 | -3.977329 4.900932 33 | -1.791080 -2.184517 34 | 3.914654 3.559303 35 | -1.910108 4.166946 36 | -1.226597 -3.317889 37 | 1.148946 3.345138 38 | -2.113864 3.548172 39 | 0.845762 -3.589788 40 | 2.629062 3.535831 41 | -1.640717 2.990517 42 | -1.881012 -2.485405 43 | 4.606999 3.510312 44 | -4.366462 4.023316 45 | 0.765015 -3.001270 46 | 3.121904 2.173988 47 | -4.025139 4.652310 48 | -0.559558 -3.840539 49 | 4.376754 4.863579 50 | -1.874308 4.032237 51 | -0.089337 -3.026809 52 | 3.997787 2.518662 53 | -3.082978 2.884822 54 | 0.845235 -3.454465 55 | 1.327224 3.358778 56 | -2.889949 3.596178 57 | -0.966018 -2.839827 58 | 2.960769 3.079555 59 | -3.275518 1.577068 60 | 0.639276 -3.412840 61 | -------------------------------------------------------------------------------- /kmeans/kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/kmeans.py 3 | import numpy as np 4 | 5 | def loadDataSet(filename): 6 | """ 7 | 读取数据集 8 | 9 | Args: 10 | filename: 文件名 11 | Returns: 12 | dataMat: 数据样本矩阵 13 | """ 14 | dataMat = [] 15 | fr = open(filename) 16 | for line in fr.readlines(): 17 | curLine = line.strip().split('\t') 18 | # 通过map函数批量转换 19 | fitLine = map(float, curLine) 20 | dataMat.append(fitLine) 21 | return dataMat 22 | 23 | def distEclud(vecA, vecB): 24 | """ 25 | 计算两向量的欧氏距离 26 | 27 | Args: 28 | vecA: 向量A 29 | vecB: 向量B 30 | Returns: 31 | 欧式距离 32 | """ 33 | return np.sqrt(np.sum(np.power(vecA - vecB, 2))) 34 | 35 | def randCent(dataSet, k): 36 | """ 37 | 随机生成k个聚类中心 38 | 39 | Args: 40 | dataSet: 数据集 41 | k: 簇数目 42 | Returns: 43 | centroids: 聚类中心矩阵 44 | """ 45 | _, n = dataSet.shape 46 | centroids = np.mat(np.zeros((k, n))) 47 | for j in range(n): 48 | # 随机聚类中心落在数据集的边界之内 49 | minJ = np.min(dataSet[:, j]) 50 | maxJ = np.max(dataSet[:, j]) 51 | rangeJ = float(maxJ - minJ) 52 | centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1) 53 | return centroids 54 | 55 | def kMeans(dataSet, k, maxIter = 5): 56 | """ 57 | K-Means 58 | 59 | Args: 60 | dataSet: 数据集 61 | k: 聚类数 62 | Returns: 63 | centroids: 聚类中心 64 | clusterAssment: 点分配结果 65 | """ 66 | # 随机初始化聚类中心 67 | centroids = randCent(dataSet, k) 68 | m, n = np.shape(dataSet) 69 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 70 | clusterAssment = np.mat(np.zeros((m, 2))) 71 | # 
tracks whether any cluster assignment changed in this pass 72 | clusterChanged = True 73 | # iterate until the centroids stop changing 74 | iterCount = 0 75 | while clusterChanged and iterCount < maxIter: 76 | iterCount += 1 77 | clusterChanged = False 78 | # assign every sample to a cluster 79 | for i in range(m): 80 | # distance from sample i to each centroid 81 | minIndex = 0 82 | minDist = np.inf 83 | for j in range(k): 84 | dist = distEclud(dataSet[i, :], centroids[j, :]) 85 | if(dist < minDist): 86 | minIndex = j 87 | minDist = dist 88 | # did the assignment change? 89 | if(clusterAssment[i, 0] != minIndex): 90 | clusterChanged = True 91 | clusterAssment[i, :] = minIndex, minDist**2 92 | # refresh the centroids: move each centroid to the mean of its cluster 93 | for cent in range(k): 94 | # array filtering picks out the points in this cluster 95 | ptsInCluster = dataSet[np.nonzero( 96 | clusterAssment[:, 0].A == cent)[0]] 97 | if ptsInCluster.shape[0] > 0: 98 | # compute the mean and move the centroid there 99 | centroids[cent, :] = np.mean(ptsInCluster, axis=0) 100 | return centroids, clusterAssment 101 | 102 | def biKmeans(dataSet, k): 103 | """ 104 | Bisecting k-means 105 | Args: 106 | dataSet: data set 107 | k: number of clusters 108 | Returns: 109 | centroids: cluster centroids 110 | clusterAssment: point assignments 111 | """ 112 | m, n = np.shape(dataSet) 113 | # at the start there is a single cluster whose centroid is the mean of all samples 114 | centroid0 = np.mean(dataSet, axis=0).tolist()[0] 115 | # list holding the current centroids 116 | currentCentroids = [centroid0] 117 | # assignments: column 0 holds the cluster index, column 1 the squared distance to that centroid 118 | clusterAssment = np.mat(np.zeros((m, 2))) 119 | # initialize the assignments: every sample starts in the initial cluster 120 | for j in range(m): 121 | clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2 122 | # until we have k clusters 123 | while len(currentCentroids) < k: 124 | # lowest total cost seen so far 125 | lowestError = np.inf 126 | # for each existing cluster 127 | for j in range(len(currentCentroids)): 128 | # samples belonging to this cluster 129 | ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :] 130 | # run 2-means on this cluster 131 | # note: the returned assignments are labelled 0 and 1 132 | centroids, clusterAss = kMeans(ptsInCluster, 2) 133 | # total error of the split cluster 134 | splitedError = np.sum(clusterAss[:, 1]) 135 | # error of the remaining clusters: the sum of their stored 136 | # squared distances in clusterAssment 137 | nonSplitedError = np.sum(clusterAssment[np.nonzero( 138 | clusterAssment[:, 0].A != j)[0], 1]) 139 | # is this split the cheapest so far? 140 | if (splitedError + nonSplitedError) < lowestError: 141 | # if so, record the new lowest total error 142 | lowestError = splitedError + nonSplitedError 143 | # remember which cluster should be split 144 | needToSplit = j 145 | # and keep the resulting centroids and assignments 146 | newCentroids = centroids.A 147 | newClusterAss = clusterAss.copy() 148 | # relabel the split result; label 1 must be handled first, otherwise 149 | # rows just relabelled to needToSplit == 1 would be caught again below 150 | # label 1 becomes the newest cluster 151 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[ 152 | 0], 0] = len(currentCentroids) 153 | # label 0 replaces the cluster that was split 154 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[ 155 | 0], 0] = needToSplit 156 | # the split centroid is updated in place 157 | currentCentroids[needToSplit] = newCentroids[0, :] 158 | # and the second half is appended as a new cluster 159 | currentCentroids.append(newCentroids[1, :]) 160 | # refresh the assignments of the points that were split 161 | clusterAssment[np.nonzero( 162 | clusterAssment[:, 0].A == needToSplit 163 | )[0], :] = newClusterAss 164 | return np.mat(currentCentroids), clusterAssment 165 | -------------------------------------------------------------------------------- /kmeans/test_bi_kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/test_bi_kmeans.py 3 | 4 | import kmeans 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == "__main__": 9 | dataMat = np.mat(kmeans.loadDataSet('data/testSet2.txt')) 10 | centroids, clusterAssment = kmeans.biKmeans(dataMat, 3) 11 | clusterCount = centroids.shape[0] 12 | m = dataMat.shape[0] 13 | # scatter plot of the result 14 | patterns = ['o', 'D', '^'] 15 | colors = ['b', 'g', 'y'] 16 | fig = plt.figure()
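# How biKmeans produced these clusters: starting from one cluster holding all
# points, it repeatedly 2-means-splits whichever cluster gives the lowest
# total SSE (split error plus the error of the untouched points) until three
# clusters exist; the red '+' markers drawn below are the final centroids.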
17 | title = 'bi-kmeans with k=3' 18 | ax = fig.add_subplot(111, title=title) 19 | for k in range(clusterCount): 20 | # 绘制聚类中心 21 | ax.scatter(centroids[k,0], centroids[k,1], color='r', marker='+', linewidth=20) 22 | for i in range(m): 23 | # 绘制属于该聚类中心的样本 24 | ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A==k)[0]] 25 | ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0], marker=patterns[k], color=colors[k]) 26 | plt.show() 27 | -------------------------------------------------------------------------------- /kmeans/test_normal_kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # kmeans/test_normal_kmeans.py 3 | import kmeans 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | if __name__ == "__main__": 8 | dataMat = np.mat(kmeans.loadDataSet('data/testSet.txt')) 9 | centroids, clusterAssment = kmeans.kMeans(dataMat, 2) 10 | clusterCount = np.shape(centroids)[0] 11 | m = np.shape(dataMat)[0] 12 | # 绘制散点图 13 | patterns = ['o', 'D', '^', 's'] 14 | colors = ['b', 'g', 'y', 'black'] 15 | fig = plt.figure() 16 | title = 'kmeans with k=2' 17 | ax = fig.add_subplot(111, title=title) 18 | for k in range(clusterCount): 19 | # 绘制聚类中心 20 | ax.scatter(centroids[k, 0], centroids[k, 1], color='r', marker='+', linewidth=20) 21 | for i in range(m): 22 | # 绘制属于该聚类中心的样本 23 | ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A==k)[0]] 24 | ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0], marker=patterns[k], color=colors[k]) 25 | plt.show() 26 | -------------------------------------------------------------------------------- /linear_regression/data/ex0.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.067732 3.176513 2 | 1.000000 0.427810 3.816464 3 | 1.000000 0.995731 4.550095 4 | 1.000000 0.738336 4.256571 5 | 1.000000 0.981083 4.560815 6 | 1.000000 0.526171 3.929515 7 | 1.000000 0.378887 3.526170 8 | 1.000000 0.033859 3.156393 9 | 1.000000 0.132791 3.110301 10 | 1.000000 0.138306 3.149813 11 | 1.000000 0.247809 3.476346 12 | 1.000000 0.648270 4.119688 13 | 1.000000 0.731209 4.282233 14 | 1.000000 0.236833 3.486582 15 | 1.000000 0.969788 4.655492 16 | 1.000000 0.607492 3.965162 17 | 1.000000 0.358622 3.514900 18 | 1.000000 0.147846 3.125947 19 | 1.000000 0.637820 4.094115 20 | 1.000000 0.230372 3.476039 21 | 1.000000 0.070237 3.210610 22 | 1.000000 0.067154 3.190612 23 | 1.000000 0.925577 4.631504 24 | 1.000000 0.717733 4.295890 25 | 1.000000 0.015371 3.085028 26 | 1.000000 0.335070 3.448080 27 | 1.000000 0.040486 3.167440 28 | 1.000000 0.212575 3.364266 29 | 1.000000 0.617218 3.993482 30 | 1.000000 0.541196 3.891471 31 | 1.000000 0.045353 3.143259 32 | 1.000000 0.126762 3.114204 33 | 1.000000 0.556486 3.851484 34 | 1.000000 0.901144 4.621899 35 | 1.000000 0.958476 4.580768 36 | 1.000000 0.274561 3.620992 37 | 1.000000 0.394396 3.580501 38 | 1.000000 0.872480 4.618706 39 | 1.000000 0.409932 3.676867 40 | 1.000000 0.908969 4.641845 41 | 1.000000 0.166819 3.175939 42 | 1.000000 0.665016 4.264980 43 | 1.000000 0.263727 3.558448 44 | 1.000000 0.231214 3.436632 45 | 1.000000 0.552928 3.831052 46 | 1.000000 0.047744 3.182853 47 | 1.000000 0.365746 3.498906 48 | 1.000000 0.495002 3.946833 49 | 1.000000 0.493466 3.900583 50 | 1.000000 0.792101 4.238522 51 | 1.000000 0.769660 4.233080 52 | 1.000000 0.251821 3.521557 53 | 1.000000 0.181951 3.203344 54 | 1.000000 0.808177 4.278105 55 | 1.000000 0.334116 3.555705 56 | 
1.000000 0.338630 3.502661 57 | 1.000000 0.452584 3.859776 58 | 1.000000 0.694770 4.275956 59 | 1.000000 0.590902 3.916191 60 | 1.000000 0.307928 3.587961 61 | 1.000000 0.148364 3.183004 62 | 1.000000 0.702180 4.225236 63 | 1.000000 0.721544 4.231083 64 | 1.000000 0.666886 4.240544 65 | 1.000000 0.124931 3.222372 66 | 1.000000 0.618286 4.021445 67 | 1.000000 0.381086 3.567479 68 | 1.000000 0.385643 3.562580 69 | 1.000000 0.777175 4.262059 70 | 1.000000 0.116089 3.208813 71 | 1.000000 0.115487 3.169825 72 | 1.000000 0.663510 4.193949 73 | 1.000000 0.254884 3.491678 74 | 1.000000 0.993888 4.533306 75 | 1.000000 0.295434 3.550108 76 | 1.000000 0.952523 4.636427 77 | 1.000000 0.307047 3.557078 78 | 1.000000 0.277261 3.552874 79 | 1.000000 0.279101 3.494159 80 | 1.000000 0.175724 3.206828 81 | 1.000000 0.156383 3.195266 82 | 1.000000 0.733165 4.221292 83 | 1.000000 0.848142 4.413372 84 | 1.000000 0.771184 4.184347 85 | 1.000000 0.429492 3.742878 86 | 1.000000 0.162176 3.201878 87 | 1.000000 0.917064 4.648964 88 | 1.000000 0.315044 3.510117 89 | 1.000000 0.201473 3.274434 90 | 1.000000 0.297038 3.579622 91 | 1.000000 0.336647 3.489244 92 | 1.000000 0.666109 4.237386 93 | 1.000000 0.583888 3.913749 94 | 1.000000 0.085031 3.228990 95 | 1.000000 0.687006 4.286286 96 | 1.000000 0.949655 4.628614 97 | 1.000000 0.189912 3.239536 98 | 1.000000 0.844027 4.457997 99 | 1.000000 0.333288 3.513384 100 | 1.000000 0.427035 3.729674 101 | 1.000000 0.466369 3.834274 102 | 1.000000 0.550659 3.811155 103 | 1.000000 0.278213 3.598316 104 | 1.000000 0.918769 4.692514 105 | 1.000000 0.886555 4.604859 106 | 1.000000 0.569488 3.864912 107 | 1.000000 0.066379 3.184236 108 | 1.000000 0.335751 3.500796 109 | 1.000000 0.426863 3.743365 110 | 1.000000 0.395746 3.622905 111 | 1.000000 0.694221 4.310796 112 | 1.000000 0.272760 3.583357 113 | 1.000000 0.503495 3.901852 114 | 1.000000 0.067119 3.233521 115 | 1.000000 0.038326 3.105266 116 | 1.000000 0.599122 3.865544 117 | 1.000000 0.947054 4.628625 118 | 1.000000 0.671279 4.231213 119 | 1.000000 0.434811 3.791149 120 | 1.000000 0.509381 3.968271 121 | 1.000000 0.749442 4.253910 122 | 1.000000 0.058014 3.194710 123 | 1.000000 0.482978 3.996503 124 | 1.000000 0.466776 3.904358 125 | 1.000000 0.357767 3.503976 126 | 1.000000 0.949123 4.557545 127 | 1.000000 0.417320 3.699876 128 | 1.000000 0.920461 4.613614 129 | 1.000000 0.156433 3.140401 130 | 1.000000 0.656662 4.206717 131 | 1.000000 0.616418 3.969524 132 | 1.000000 0.853428 4.476096 133 | 1.000000 0.133295 3.136528 134 | 1.000000 0.693007 4.279071 135 | 1.000000 0.178449 3.200603 136 | 1.000000 0.199526 3.299012 137 | 1.000000 0.073224 3.209873 138 | 1.000000 0.286515 3.632942 139 | 1.000000 0.182026 3.248361 140 | 1.000000 0.621523 3.995783 141 | 1.000000 0.344584 3.563262 142 | 1.000000 0.398556 3.649712 143 | 1.000000 0.480369 3.951845 144 | 1.000000 0.153350 3.145031 145 | 1.000000 0.171846 3.181577 146 | 1.000000 0.867082 4.637087 147 | 1.000000 0.223855 3.404964 148 | 1.000000 0.528301 3.873188 149 | 1.000000 0.890192 4.633648 150 | 1.000000 0.106352 3.154768 151 | 1.000000 0.917886 4.623637 152 | 1.000000 0.014855 3.078132 153 | 1.000000 0.567682 3.913596 154 | 1.000000 0.068854 3.221817 155 | 1.000000 0.603535 3.938071 156 | 1.000000 0.532050 3.880822 157 | 1.000000 0.651362 4.176436 158 | 1.000000 0.901225 4.648161 159 | 1.000000 0.204337 3.332312 160 | 1.000000 0.696081 4.240614 161 | 1.000000 0.963924 4.532224 162 | 1.000000 0.981390 4.557105 163 | 1.000000 0.987911 4.610072 164 | 1.000000 0.990947 4.636569 165 | 
1.000000 0.736021 4.229813 166 | 1.000000 0.253574 3.500860 167 | 1.000000 0.674722 4.245514 168 | 1.000000 0.939368 4.605182 169 | 1.000000 0.235419 3.454340 170 | 1.000000 0.110521 3.180775 171 | 1.000000 0.218023 3.380820 172 | 1.000000 0.869778 4.565020 173 | 1.000000 0.196830 3.279973 174 | 1.000000 0.958178 4.554241 175 | 1.000000 0.972673 4.633520 176 | 1.000000 0.745797 4.281037 177 | 1.000000 0.445674 3.844426 178 | 1.000000 0.470557 3.891601 179 | 1.000000 0.549236 3.849728 180 | 1.000000 0.335691 3.492215 181 | 1.000000 0.884739 4.592374 182 | 1.000000 0.918916 4.632025 183 | 1.000000 0.441815 3.756750 184 | 1.000000 0.116598 3.133555 185 | 1.000000 0.359274 3.567919 186 | 1.000000 0.814811 4.363382 187 | 1.000000 0.387125 3.560165 188 | 1.000000 0.982243 4.564305 189 | 1.000000 0.780880 4.215055 190 | 1.000000 0.652565 4.174999 191 | 1.000000 0.870030 4.586640 192 | 1.000000 0.604755 3.960008 193 | 1.000000 0.255212 3.529963 194 | 1.000000 0.730546 4.213412 195 | 1.000000 0.493829 3.908685 196 | 1.000000 0.257017 3.585821 197 | 1.000000 0.833735 4.374394 198 | 1.000000 0.070095 3.213817 199 | 1.000000 0.527070 3.952681 200 | 1.000000 0.116163 3.129283 201 | -------------------------------------------------------------------------------- /linear_regression/data/ex1.txt: -------------------------------------------------------------------------------- 1 | 6.1101 17.592 2 | 5.5277 9.1302 3 | 8.5186 13.662 4 | 7.0032 11.854 5 | 5.8598 6.8233 6 | 8.3829 11.886 7 | 7.4764 4.3483 8 | 8.5781 12 9 | 6.4862 6.5987 10 | 5.0546 3.8166 11 | 5.7107 3.2522 12 | 14.164 15.505 13 | 5.734 3.1551 14 | 8.4084 7.2258 15 | 5.6407 0.71618 16 | 5.3794 3.5129 17 | 6.3654 5.3048 18 | 5.1301 0.56077 19 | 6.4296 3.6518 20 | 7.0708 5.3893 21 | 6.1891 3.1386 22 | 20.27 21.767 23 | 5.4901 4.263 24 | 6.3261 5.1875 25 | 5.5649 3.0825 26 | 18.945 22.638 27 | 12.828 13.501 28 | 10.957 7.0467 29 | 13.176 14.692 30 | 22.203 24.147 31 | 5.2524 -1.22 32 | 6.5894 5.9966 33 | 9.2482 12.134 34 | 5.8918 1.8495 35 | 8.2111 6.5426 36 | 7.9334 4.5623 37 | 8.0959 4.1164 38 | 5.6063 3.3928 39 | 12.836 10.117 40 | 6.3534 5.4974 41 | 5.4069 0.55657 42 | 6.8825 3.9115 43 | 11.708 5.3854 44 | 5.7737 2.4406 45 | 7.8247 6.7318 46 | 7.0931 1.0463 47 | 5.0702 5.1337 48 | 5.8014 1.844 49 | 11.7 8.0043 50 | 5.5416 1.0179 51 | 7.5402 6.7504 52 | 5.3077 1.8396 53 | 7.4239 4.2885 54 | 7.6031 4.9981 55 | 6.3328 1.4233 56 | 6.3589 -1.4211 57 | 6.2742 2.4756 58 | 5.6397 4.6042 59 | 9.3102 3.9624 60 | 9.4536 5.4141 61 | 8.8254 5.1694 62 | 5.1793 -0.74279 63 | 21.279 17.929 64 | 14.908 12.054 65 | 18.959 17.054 66 | 7.2182 4.8852 67 | 8.2951 5.7442 68 | 10.236 7.7754 69 | 5.4994 1.0173 70 | 20.341 20.992 71 | 10.136 6.6799 72 | 7.3345 4.0259 73 | 6.0062 1.2784 74 | 7.2259 3.3411 75 | 5.0269 -2.6807 76 | 6.5479 0.29678 77 | 7.5386 3.8845 78 | 5.0365 5.7014 79 | 10.274 6.7526 80 | 5.1077 2.0576 81 | 5.7292 0.47953 82 | 5.1884 0.20421 83 | 6.3557 0.67861 84 | 9.7687 7.5435 85 | 6.5159 5.3436 86 | 8.5172 4.2415 87 | 9.1802 6.7981 88 | 6.0020 .92695 89 | 5.5204 0.152 90 | 5.0594 2.8214 91 | 5.7077 1.8451 92 | 7.6366 4.2959 93 | 5.8707 7.2029 94 | 5.3054 1.9869 95 | 8.2934 0.14454 96 | 13.394 9.0551 97 | 5.4369 0.61705 98 | -------------------------------------------------------------------------------- /linear_regression/data/houses.txt: -------------------------------------------------------------------------------- 1 | 2104 3 399900 2 | 1600 3 329900 3 | 2400 3 369000 4 | 1416 2 232000 5 | 3000 4 539900 6 | 1985 4 299900 7 | 1534 3 314900 8 | 
1427 3 198999 9 | 1380 3 212000 10 | 1494 3 242500 11 | 1940 4 239999 12 | 2000 3 347000 13 | 1890 3 329999 14 | 4478 5 699900 15 | 1268 3 259900 16 | 2300 4 449900 17 | 1320 2 299900 18 | 1236 3 199900 19 | 2609 4 499998 20 | 3031 4 599000 21 | 1767 3 252900 22 | 1888 2 255000 23 | 1604 3 242900 24 | 1962 4 259900 25 | 3890 3 573900 26 | 1100 3 249900 27 | 1458 3 464500 28 | 2526 3 469000 29 | 2200 3 475000 30 | 2637 3 299900 31 | 1839 2 349900 32 | 1000 1 169900 33 | 2040 4 314900 34 | 3137 3 579900 35 | 1811 4 285900 36 | 1437 3 249900 37 | 1239 3 229900 38 | 2132 4 345000 39 | 4215 4 549000 40 | 2162 4 287000 41 | 1664 2 368500 42 | 2238 3 329900 43 | 2567 4 314000 44 | 1200 3 299000 45 | 852 2 179900 46 | 1852 4 299900 47 | 1203 3 239500 48 | -------------------------------------------------------------------------------- /linear_regression/data/lwr.txt: -------------------------------------------------------------------------------- 1 | 1 1.0 2 | 2 2.0 3 | 3 3.0 4 | 4 3.15 5 | 5 3.25 6 | 6 3.5 7 | -------------------------------------------------------------------------------- /linear_regression/data/temperature.txt: -------------------------------------------------------------------------------- 1 | 50 3.3 2 | 50 2.8 3 | 50 2.9 4 | 70 2.3 5 | 70 2.6 6 | 70 2.1 7 | 80 2.5 8 | 80 2.9 9 | 80 2.4 10 | 90 3.0 11 | 90 3.1 12 | 90 2.8 13 | 100 3.3 14 | 100 3.5 15 | 100 3.0 16 | -------------------------------------------------------------------------------- /linear_regression/regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | def exeTime(func): 8 | """ 耗时计算装饰器 9 | """ 10 | def newFunc(*args, **args2): 11 | t0 = time.time() 12 | back = func(*args, **args2) 13 | return back, time.time() - t0 14 | return newFunc 15 | 16 | def loadDataSet(filename): 17 | """ 读取数据 18 | 19 | 从文件中获取数据,在《机器学习实战中》,数据格式如下 20 | "feature1 TAB feature2 TAB feature3 TAB label" 21 | 22 | Args: 23 | filename 文件名 24 | 25 | Returns: 26 | X 训练样本集矩阵 27 | y 标签集矩阵 28 | """ 29 | numFeat = len(open(filename).readline().split('\t')) - 1 30 | X = [] 31 | y = [] 32 | file = open(filename) 33 | for line in file.readlines(): 34 | lineArr = [] 35 | curLine = line.strip().split('\t') 36 | for i in range(numFeat): 37 | lineArr.append(float(curLine[i])) 38 | X.append(lineArr) 39 | y.append(float(curLine[-1])) 40 | return np.mat(X), np.mat(y).T 41 | 42 | def h(theta, x): 43 | """预测函数 44 | 45 | Args: 46 | theta 相关系数矩阵 47 | x 特征向量 48 | 49 | Returns: 50 | 预测结果 51 | """ 52 | return (theta.T*x)[0,0] 53 | 54 | def J(theta, X, y): 55 | """代价函数 56 | 57 | Args: 58 | theta 相关系数矩阵 59 | X 样本集矩阵 60 | y 标签集矩阵 61 | 62 | Returns: 63 | 预测误差(代价) 64 | """ 65 | m = len(X) 66 | return (X*theta-y).T*(X*theta-y)/(2*m) 67 | 68 | @exeTime 69 | def bgd(rate, maxLoop, epsilon, X, y): 70 | """批量梯度下降法 71 | 72 | Args: 73 | rate 学习率 74 | maxLoop 最大迭代次数 75 | epsilon 收敛精度 76 | X 样本矩阵 77 | y 标签矩阵 78 | 79 | Returns: 80 | (theta, errors, thetas), timeConsumed 81 | """ 82 | m,n = X.shape 83 | # 初始化theta 84 | theta = np.zeros((n,1)) 85 | count = 0 86 | converged = False 87 | error = float('inf') 88 | errors = [] 89 | thetas = {} 90 | for j in range(n): 91 | thetas[j] = [theta[j,0]] 92 | while count<=maxLoop: 93 | if(converged): 94 | break 95 | count = count + 1 96 | for j in range(n): 97 | deriv = (y-X*theta).T*X[:, j]/m 98 | theta[j,0] = theta[j,0]+rate*deriv 99 | thetas[j].append(theta[j,0]) 100 | 
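# deriv above equals (1/m)*(y - X*theta).T*X[:, j], the negative gradient of
# J with respect to theta_j, so adding rate*deriv is a standard batch descent
# step. After each full sweep over the n coordinates the cost is recomputed
# below, and the loop exits once it falls under epsilon or maxLoop is reached.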
error = J(theta, X, y) 101 | errors.append(error[0,0]) 102 | # 如果已经收敛 103 | if(error < epsilon): 104 | converged = True 105 | return theta,errors,thetas 106 | 107 | @exeTime 108 | def sgd(rate, maxLoop, epsilon, X, y): 109 | """随机梯度下降法 110 | Args: 111 | rate 学习率 112 | maxLoop 最大迭代次数 113 | epsilon 收敛精度 114 | X 样本矩阵 115 | y 标签矩阵 116 | Returns: 117 | (theta, error, thetas), timeConsumed 118 | """ 119 | m,n = X.shape 120 | # 初始化theta 121 | theta = np.zeros((n,1)) 122 | count = 0 123 | converged = False 124 | error = float('inf') 125 | errors = [] 126 | thetas = {} 127 | for j in range(n): 128 | thetas[j] = [theta[j,0]] 129 | while count <= maxLoop: 130 | if converged: 131 | break 132 | count = count + 1 133 | errors.append(float('inf')) 134 | for i in range(m): 135 | if converged: 136 | break 137 | diff = y[i,0]-h(theta, X[i].T) 138 | for j in range(n): 139 | theta[j,0] = theta[j,0] + rate*diff*X[i, j] 140 | thetas[j].append(theta[j,0]) 141 | error = J(theta, X, y) 142 | errors[-1] = error[0,0] 143 | # 如果已经收敛 144 | if(error < epsilon): 145 | converged = True 146 | return theta, errors, thetas 147 | 148 | def JLwr(theta, X, y, x, c): 149 | """局部加权线性回归的代价函数计算式 150 | 151 | Args: 152 | theta 相关系数矩阵 153 | X 样本集矩阵 154 | y 标签集矩阵 155 | x 待预测输入 156 | c tau 157 | Returns: 158 | 预测代价 159 | """ 160 | m,n = X.shape 161 | summerize = 0 162 | for i in range(m): 163 | diff = (X[i]-x)*(X[i]-x).T 164 | w = np.exp(-diff/(2*c*c)) 165 | predictDiff = np.power(y[i] - X[i]*theta,2) 166 | summerize = summerize + w*predictDiff 167 | return summerize 168 | 169 | @exeTime 170 | def lwr(rate, maxLoop, epsilon, X, y, x, c=1): 171 | """局部加权线性回归 172 | 173 | Args: 174 | rate 学习率 175 | maxLoop 最大迭代次数 176 | epsilon 预测精度 177 | X 输入样本 178 | y 标签向量 179 | x 待预测向量 180 | c tau 181 | """ 182 | m,n = X.shape 183 | # 初始化theta 184 | theta = np.zeros((n,1)) 185 | count = 0 186 | converged = False 187 | error = float('inf') 188 | errors = [] 189 | thetas = {} 190 | for j in range(n): 191 | thetas[j] = [theta[j,0]] 192 | # 执行批量梯度下降 193 | while count<=maxLoop: 194 | if(converged): 195 | break 196 | count = count + 1 197 | for j in range(n): 198 | deriv = (y-X*theta).T*X[:, j]/m 199 | theta[j,0] = theta[j,0]+rate*deriv 200 | thetas[j].append(theta[j,0]) 201 | error = JLwr(theta, X, y, x, c) 202 | errors.append(error[0,0]) 203 | # 如果已经收敛 204 | if(error < epsilon): 205 | converged = True 206 | return theta,errors,thetas 207 | 208 | def standarize(X): 209 | """特征标准化处理 210 | 211 | Args: 212 | X 样本集 213 | Returns: 214 | 标准后的样本集 215 | """ 216 | m, n = X.shape 217 | # 归一化每一个特征 218 | for j in range(n): 219 | features = X[:,j] 220 | meanVal = features.mean(axis=0) 221 | std = features.std(axis=0) 222 | if std != 0: 223 | X[:, j] = (features-meanVal)/std 224 | else: 225 | X[:, j] = 0 226 | return X 227 | 228 | def normalize(X): 229 | """特征归一化处理 230 | 231 | Args: 232 | X 样本集 233 | Returns: 234 | 归一化后的样本集 235 | """ 236 | m, n = X.shape 237 | # 归一化每一个特征 238 | for j in range(n): 239 | features = X[:,j] 240 | minVal = features.min(axis=0) 241 | maxVal = features.max(axis=0) 242 | diff = maxVal - minVal 243 | if diff != 0: 244 | X[:,j] = (features-minVal)/diff 245 | else: 246 | X[:,j] = 0 247 | return X 248 | -------------------------------------------------------------------------------- /linear_regression/test_bgd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_bgd.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | 
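# This script fits ex1.txt (city population vs. profit) with batch gradient
# descent, then draws the fitted line, the cost-per-iteration curve, the
# J(theta) surface, and the contour plot overlaid with the descent path.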
import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/ex1.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.02 17 | maxLoop = 1500 18 | epsilon = 0.01 19 | 20 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 21 | 22 | theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('Population of City in 10,000s') 36 | ax.set_ylabel('Profit in $10,000s') 37 | 38 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | 52 | # 绘制能量下降曲面 53 | size = 100 54 | theta0Vals = np.linspace(-10,10, size) 55 | theta1Vals = np.linspace(-2, 4, size) 56 | JVals = np.zeros((size, size)) 57 | for i in range(size): 58 | for j in range(size): 59 | col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]]) 60 | JVals[i,j] = regression.J(col, X, y) 61 | 62 | theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals) 63 | JVals = JVals.T 64 | contourSurf = plt.figure() 65 | ax = contourSurf.gca(projection='3d') 66 | 67 | ax.plot_surface(theta0Vals, theta1Vals, JVals, rstride=2, cstride=2, alpha=0.3, 68 | cmap=cm.rainbow, linewidth=0, antialiased=False) 69 | ax.plot(thetas[0], thetas[1], 'rx') 70 | ax.set_xlabel(r'$\theta_0$') 71 | ax.set_ylabel(r'$\theta_1$') 72 | ax.set_zlabel(r'$J(\theta)$') 73 | 74 | plt.show() 75 | 76 | # 绘制能量轮廓 77 | contourFig = plt.figure() 78 | ax = contourFig.add_subplot(111) 79 | ax.set_xlabel(r'$\theta_0$') 80 | ax.set_ylabel(r'$\theta_1$') 81 | 82 | CS = ax.contour(theta0Vals, theta1Vals, JVals, np.logspace(-2,3,20)) 83 | plt.clabel(CS, inline=1, fontsize=10) 84 | 85 | # 绘制最优解 86 | ax.plot(theta[0,0], theta[1,0], 'rx', markersize=10, linewidth=2) 87 | 88 | # 绘制梯度下降过程 89 | ax.plot(thetas[0], thetas[1], 'rx', markersize=3, linewidth=1) 90 | ax.plot(thetas[0], thetas[1], 'r-') 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /linear_regression/test_feature_scaling.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_feature_scaling.py 3 | import regression 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.mplot3d import axes3d 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/houses.txt') 10 | 11 | m, n = srcX.shape 12 | X = np.concatenate((np.ones((m,1)), srcX), axis=1) 13 | 14 | rate = 1 15 | maxLoop = 1000 16 | epsilon = 1 17 | 18 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 19 | theta, errors, thetas = result 20 | 21 | # 打印拟合曲线 22 | fittingFig = plt.figure() 23 | title = 'bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 
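import matplotlib.ticker as mtick  # used by the error-curve plot at the
# bottom of this script; without this import the call to
# mtick.FormatStrFormatter would raise a NameError, since the header
# imports do not bring in matplotlib.ticker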
24 | ax = fittingFig.add_subplot(111, title=title) 25 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 26 | 27 | xCopy = X.copy() 28 | xCopy.sort(0) 29 | yHat = xCopy*theta 30 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 31 | 32 | ax.set_xlabel('Population of City in 10,000s') 33 | ax.set_ylabel('Profit in $10,000s') 34 | 35 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 36 | plt.show() 37 | 38 | # 绘制能量函数的轮廓 39 | theta1Vals = np.linspace(min(thetas[1]), max(thetas[1]), 100) 40 | theta2Vals = np.linspace(min(thetas[2]), max(thetas[2]), 100) 41 | JVals = np.zeros((100, 100)) 42 | for i in range(100): 43 | for j in range(100): 44 | theta = np.matrix([[0], [theta1Vals[i]], [theta2Vals[j]]]) 45 | JVals[i,j] = regression.J(theta, X, y) 46 | contourFig = plt.figure() 47 | ax = contourFig.add_subplot(111) 48 | ax.contour(theta1Vals, theta2Vals, JVals, np.logspace(-2,3,20)) 49 | 50 | plt.show() 51 | 52 | # 打印误差曲线 53 | errorsFig = plt.figure() 54 | ax = errorsFig.add_subplot(111) 55 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 56 | 57 | ax.plot(range(len(errors)), errors) 58 | ax.set_xlabel('Number of iterations') 59 | ax.set_ylabel('Cost J') 60 | 61 | plt.show() 62 | -------------------------------------------------------------------------------- /linear_regression/test_lwr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_lwr.py 3 | import regression 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as mtick 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/lwr.txt'); 10 | 11 | m,n = srcX.shape 12 | srcX = np.concatenate((srcX[:, 0], np.power(srcX[:, 0],2)), axis=1) 13 | # 特征缩放 14 | X = regression.standardize(srcX.copy()) 15 | X = np.concatenate((np.ones((m,1)), X), axis=1) 16 | 17 | rate = 0.1 18 | maxLoop = 1000 19 | epsilon = 0.01 20 | 21 | predicateX = regression.standardize(np.matrix([[8, 64]])) 22 | 23 | predicateX = np.concatenate((np.ones((1,1)), predicateX), axis=1) 24 | 25 | result, t = regression.lwr(rate, maxLoop, epsilon, X, y, predicateX, 1) 26 | theta, errors, thetas = result 27 | 28 | result2, t = regression.lwr(rate, maxLoop, epsilon, X, y, predicateX, 0.1) 29 | theta2, errors2, thetas2 = result2 30 | 31 | 32 | # 打印特征点 33 | fittingFig = plt.figure() 34 | title = 'polynomial with bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f'%(rate,maxLoop,epsilon) 35 | ax = fittingFig.add_subplot(111, title=title) 36 | trainingSet = ax.scatter(srcX[:, 0].flatten().A[0], y[:,0].flatten().A[0]) 37 | 38 | print theta 39 | print theta2 40 | 41 | # 打印拟合曲线 42 | xx = np.linspace(1, 7, 50) 43 | xx2 = np.power(xx,2) 44 | yHat1 = [] 45 | yHat2 = [] 46 | for i in range(50): 47 | normalizedSize = (xx[i]-xx.mean())/xx.std(0) 48 | normalizedSize2 = (xx2[i]-xx2.mean())/xx2.std(0) 49 | x = np.matrix([[1,normalizedSize, normalizedSize2]]) 50 | yHat1.append(regression.h(theta, x.T)) 51 | yHat2.append(regression.h(theta2, x.T)) 52 | fittingLine1, = ax.plot(xx, yHat1, color='g') 53 | fittingLine2, = ax.plot(xx, yHat2, color='r') 54 | 55 | ax.set_xlabel('temperature') 56 | ax.set_ylabel('yield') 57 | 58 | plt.legend([trainingSet, fittingLine1, fittingLine2], ['Training Set', r'LWR with $\tau$=1', r'LWR with $\tau$=0.1']) 59 | plt.show() 60 | 61 | # 打印误差曲线 62 | errorsFig = plt.figure() 63 | ax = errorsFig.add_subplot(111) 64 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 65 
| 66 | ax.plot(range(len(errors)), errors) 67 | ax.set_xlabel('Number of iterations') 68 | ax.set_ylabel('Cost J') 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /linear_regression/test_multiple.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_multiple.py 3 | import regression 4 | import numpy as np 5 | from mpl_toolkits.mplot3d import Axes3D 6 | from matplotlib import cm 7 | import matplotlib.pyplot as plt 8 | import matplotlib.ticker as mtick 9 | 10 | if __name__ == "__main__": 11 | srcX, y = regression.loadDataSet('data/houses.txt') 12 | 13 | # 新建特征 14 | m,n= srcX.shape 15 | X = regression.normalize(srcX.copy()) 16 | X = np.concatenate((np.ones((m,1)), X), axis=1) 17 | 18 | rate = 1 19 | maxLoop = 50 20 | epsilon = 1 21 | 22 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 23 | theta,errors = result 24 | 25 | print 'theta is:' 26 | print theta 27 | print '........' 28 | 29 | # 预测价格 30 | normalizedSize = (1650-srcX[:,0].mean(0))/srcX[:,0].std(0) 31 | normalizedBr = (3-srcX[:,1].mean(0))/srcX[:,1].std(0) 32 | predicateX = np.matrix([[1, normalizedSize, normalizedBr]]) 33 | price = regression.h(theta, predicateX.T) 34 | print 'Predicted price of a 1650 sq-ft, 3 br house: $%.4f'%price 35 | print '........' 36 | 37 | # 打印拟合平面 38 | fittingFig = plt.figure(figsize=(16, 12)) 39 | title = 'polynomial with bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 40 | ax = fittingFig.add_subplot(111, projection='3d', title=title) 41 | 42 | xx = np.linspace(0,5000,25) 43 | yy = np.linspace(0,5,25) 44 | zz = np.zeros((25,25)) 45 | for i in range(25): 46 | for j in range(25): 47 | normalizedSize = (xx[i]-srcX[:,0].mean(0))/srcX[:,0].std(0) 48 | normalizedSize = (xx[i]-srcX[:,0].mean(0))/srcX[:,0].std(0) 49 | x = np.matrix([[1,normalizedSize, normalizedBr]]) 50 | zz[i,j] = regression.h(theta, x.T) 51 | xx, yy = np.meshgrid(xx,yy) 52 | ax.zaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 53 | ax.plot_surface(xx, yy, zz, rstride=1, cstride=1, cmap=cm.rainbow, alpha=0.1, antialiased=True) 54 | 55 | xs = srcX[:, 0].flatten().A[0] 56 | ys = srcX[:, 1].flatten().A[0] 57 | zs = y[:, 0].flatten().A[0] 58 | ax.scatter(xs, ys, zs, c='b', marker='o') 59 | 60 | ax.set_xlabel('sq-ft of room') 61 | ax.set_ylabel('bedrooms') 62 | ax.set_zlabel('price') 63 | 64 | plt.show() 65 | 66 | # 打印误差曲线 67 | errorsFig = plt.figure() 68 | ax = errorsFig.add_subplot(111) 69 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 70 | 71 | ax.plot(range(len(errors)), errors) 72 | ax.set_xlabel('Number of iterations') 73 | ax.set_ylabel('Cost J') 74 | 75 | plt.show() 76 | -------------------------------------------------------------------------------- /linear_regression/test_sgd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_sgd.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/ex1.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.01 17 | maxLoop = 100 18 | epsilon =0.01 19 | 20 | result, timeConsumed = regression.sgd(rate, maxLoop, epsilon, X, y) 21 | 22 | 
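    # Stochastic gradient descent updates theta from one training sample at a
    # time, theta := theta - alpha * (h(x(i)) - y(i)) * x(i), instead of summing
    # the error over all m samples per step as bgd does; the cost therefore
    # falls noisily rather than monotonically (see the error curve below).
    # A minimal single-sample update for reference (hypothetical helper, not
    # part of regression.py):
    def sgdStep(theta, xi, yi, alpha):
        # xi: 1 x n sample (np.matrix), theta: n x 1, yi: scalar label
        return theta - alpha * float(xi * theta - yi) * xi.T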
theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'sgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('Population of City in 10,000s') 36 | ax.set_ylabel('Profit in $10,000s') 37 | 38 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | 52 | # 绘制能量下降曲面 53 | size = 100 54 | theta0Vals = np.linspace(-10,10, size) 55 | theta1Vals = np.linspace(-2, 4, size) 56 | JVals = np.zeros((size, size)) 57 | for i in range(size): 58 | for j in range(size): 59 | col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]]) 60 | JVals[i,j] = regression.J(col, X, y) 61 | 62 | theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals) 63 | JVals = JVals.T 64 | contourSurf = plt.figure() 65 | ax = contourSurf.gca(projection='3d') 66 | 67 | ax.plot_surface(theta0Vals, theta1Vals, JVals, rstride=8, cstride=8, alpha=0.3, 68 | cmap=cm.rainbow, linewidth=0, antialiased=False) 69 | ax.plot(thetas[0], thetas[1], 'rx') 70 | ax.set_xlabel(r'$\theta_0$') 71 | ax.set_ylabel(r'$\theta_1$') 72 | ax.set_zlabel(r'$J(\theta)$') 73 | 74 | plt.show() 75 | 76 | # 绘制能量轮廓 77 | contourFig = plt.figure() 78 | ax = contourFig.add_subplot(111) 79 | ax.set_xlabel(r'$\theta_0$') 80 | ax.set_ylabel(r'$\theta_1$') 81 | 82 | CS = ax.contour(theta0Vals, theta1Vals, JVals, np.logspace(-2,3,20)) 83 | plt.clabel(CS, inline=1, fontsize=10) 84 | 85 | # 绘制最优解 86 | ax.plot(theta[0,0], theta[1,0], 'rx', markersize=10, linewidth=2) 87 | 88 | # 绘制梯度下降过程 89 | ax.plot(thetas[0], thetas[1], 'r', linewidth=1) 90 | 91 | plt.show() 92 | -------------------------------------------------------------------------------- /linear_regression/test_temperature_normal.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_temperature_normal.py 3 | import regression 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import axes3d 6 | import matplotlib.pyplot as plt 7 | import matplotlib.ticker as mtick 8 | import numpy as np 9 | 10 | if __name__ == "__main__": 11 | X, y = regression.loadDataSet('data/temperature.txt'); 12 | 13 | m,n = X.shape 14 | X = np.concatenate((np.ones((m,1)), X), axis=1) 15 | 16 | rate = 0.0001 17 | maxLoop = 1000 18 | epsilon =0.01 19 | 20 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 21 | 22 | theta, errors, thetas = result 23 | 24 | # 绘制拟合曲线 25 | fittingFig = plt.figure() 26 | title = 'bgd: rate=%.3f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | xCopy = X.copy() 31 | xCopy.sort(0) 32 | yHat = xCopy*theta 33 | fittingLine, = ax.plot(xCopy[:,1], yHat, color='g') 34 | 35 | ax.set_xlabel('temperature') 36 | ax.set_ylabel('yield') 37 | 38 | plt.legend([trainingSet, fittingLine], 
['Training Set', 'Linear Regression']) 39 | plt.show() 40 | 41 | # 绘制误差曲线 42 | errorsFig = plt.figure() 43 | ax = errorsFig.add_subplot(111) 44 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 45 | 46 | ax.plot(range(len(errors)), errors) 47 | ax.set_xlabel('Number of iterations') 48 | ax.set_ylabel('Cost J') 49 | 50 | plt.show() 51 | -------------------------------------------------------------------------------- /linear_regression/test_temperature_polynomial.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # linear_regression/test_temperature_polynomial.py 3 | import regression 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as mtick 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | srcX, y = regression.loadDataSet('data/temperature.txt'); 10 | 11 | m,n = srcX.shape 12 | srcX = np.concatenate((srcX[:, 0], np.power(srcX[:, 0],2)), axis=1) 13 | # 特征缩放 14 | X = regression.standardize(srcX.copy()) 15 | X = np.concatenate((np.ones((m,1)), X), axis=1) 16 | 17 | rate = 0.1 18 | maxLoop = 1000 19 | epsilon = 0.01 20 | 21 | result, timeConsumed = regression.bgd(rate, maxLoop, epsilon, X, y) 22 | theta, errors, thetas = result 23 | 24 | # 打印特征点 25 | fittingFig = plt.figure() 26 | title = 'polynomial with bgd: rate=%.2f, maxLoop=%d, epsilon=%.3f \n time: %ds'%(rate,maxLoop,epsilon,timeConsumed) 27 | ax = fittingFig.add_subplot(111, title=title) 28 | trainingSet = ax.scatter(srcX[:, 0].flatten().A[0], y[:,0].flatten().A[0]) 29 | 30 | print theta 31 | 32 | # 打印拟合曲线 33 | xx = np.linspace(50,100,50) 34 | xx2 = np.power(xx,2) 35 | yHat = [] 36 | for i in range(50): 37 | normalizedSize = (xx[i]-xx.mean())/xx.std(0) 38 | normalizedSize2 = (xx2[i]-xx2.mean())/xx2.std(0) 39 | x = np.matrix([[1,normalizedSize, normalizedSize2]]) 40 | yHat.append(regression.h(theta, x.T)) 41 | fittingLine, = ax.plot(xx, yHat, color='g') 42 | 43 | ax.set_xlabel('temperature') 44 | ax.set_ylabel('yield') 45 | 46 | plt.legend([trainingSet, fittingLine], ['Training Set', 'Polynomial Regression']) 47 | plt.show() 48 | 49 | # 打印误差曲线 50 | errorsFig = plt.figure() 51 | ax = errorsFig.add_subplot(111) 52 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) 53 | 54 | ax.plot(range(len(errors)), errors) 55 | ax.set_xlabel('Number of iterations') 56 | ax.set_ylabel('Cost J') 57 | 58 | plt.show() 59 | -------------------------------------------------------------------------------- /logical_regression/data/ex3data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/logical_regression/data/ex3data1.mat -------------------------------------------------------------------------------- /logical_regression/data/linear.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 
25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /logical_regression/data/non_linear.txt: -------------------------------------------------------------------------------- 1 | 0.051267 0.69956 1 2 | -0.092742 0.68494 1 3 | -0.21371 0.69225 1 4 | -0.375 0.50219 1 5 | -0.51325 0.46564 1 6 | -0.52477 0.2098 1 7 | -0.39804 0.034357 1 8 | -0.30588 -0.19225 1 9 | 0.016705 -0.40424 1 10 | 0.13191 -0.51389 1 11 | 0.38537 -0.56506 1 12 | 0.52938 -0.5212 1 13 | 0.63882 -0.24342 1 14 | 0.73675 -0.18494 1 15 | 0.54666 0.48757 1 16 | 0.322 0.5826 1 17 | 0.16647 0.53874 1 18 | -0.046659 0.81652 1 19 | -0.17339 0.69956 1 20 | -0.47869 0.63377 1 21 | -0.60541 0.59722 1 22 | -0.62846 0.33406 1 23 | -0.59389 0.005117 1 24 | -0.42108 -0.27266 1 25 | -0.11578 -0.39693 1 26 | 0.20104 -0.60161 1 27 | 0.46601 -0.53582 1 28 | 0.67339 -0.53582 1 29 | -0.13882 0.54605 1 30 | -0.29435 0.77997 1 31 | -0.26555 0.96272 1 32 | -0.16187 0.8019 1 33 | -0.17339 0.64839 1 34 | -0.28283 0.47295 1 35 | -0.36348 0.31213 1 36 | -0.30012 0.027047 1 37 | -0.23675 -0.21418 1 38 | -0.06394 -0.18494 1 39 | 0.062788 -0.16301 1 40 | 0.22984 -0.41155 1 41 | 0.2932 -0.2288 1 42 | 0.48329 -0.18494 1 43 | 0.64459 -0.14108 1 44 | 0.46025 0.012427 1 45 | 0.6273 0.15863 1 46 | 0.57546 0.26827 1 47 | 0.72523 0.44371 1 48 | 0.22408 0.52412 1 49 | 0.44297 0.67032 1 50 | 0.322 0.69225 1 51 | 0.13767 0.57529 1 52 | -0.0063364 0.39985 1 53 | -0.092742 0.55336 1 54 | -0.20795 0.35599 1 55 | -0.20795 0.17325 1 56 | -0.43836 0.21711 1 57 | -0.21947 -0.016813 1 58 | -0.13882 -0.27266 1 
59 | 0.18376 0.93348 0 60 | 0.22408 0.77997 0 61 | 0.29896 0.61915 0 62 | 0.50634 0.75804 0 63 | 0.61578 0.7288 0 64 | 0.60426 0.59722 0 65 | 0.76555 0.50219 0 66 | 0.92684 0.3633 0 67 | 0.82316 0.27558 0 68 | 0.96141 0.085526 0 69 | 0.93836 0.012427 0 70 | 0.86348 -0.082602 0 71 | 0.89804 -0.20687 0 72 | 0.85196 -0.36769 0 73 | 0.82892 -0.5212 0 74 | 0.79435 -0.55775 0 75 | 0.59274 -0.7405 0 76 | 0.51786 -0.5943 0 77 | 0.46601 -0.41886 0 78 | 0.35081 -0.57968 0 79 | 0.28744 -0.76974 0 80 | 0.085829 -0.75512 0 81 | 0.14919 -0.57968 0 82 | -0.13306 -0.4481 0 83 | -0.40956 -0.41155 0 84 | -0.39228 -0.25804 0 85 | -0.74366 -0.25804 0 86 | -0.69758 0.041667 0 87 | -0.75518 0.2902 0 88 | -0.69758 0.68494 0 89 | -0.4038 0.70687 0 90 | -0.38076 0.91886 0 91 | -0.50749 0.90424 0 92 | -0.54781 0.70687 0 93 | 0.10311 0.77997 0 94 | 0.057028 0.91886 0 95 | -0.10426 0.99196 0 96 | -0.081221 1.1089 0 97 | 0.28744 1.087 0 98 | 0.39689 0.82383 0 99 | 0.63882 0.88962 0 100 | 0.82316 0.66301 0 101 | 0.67339 0.64108 0 102 | 1.0709 0.10015 0 103 | -0.046659 -0.57968 0 104 | -0.23675 -0.63816 0 105 | -0.15035 -0.36769 0 106 | -0.49021 -0.3019 0 107 | -0.46717 -0.13377 0 108 | -0.28859 -0.060673 0 109 | -0.61118 -0.067982 0 110 | -0.66302 -0.21418 0 111 | -0.59965 -0.41886 0 112 | -0.72638 -0.082602 0 113 | -0.83007 0.31213 0 114 | -0.72062 0.53874 0 115 | -0.59389 0.49488 0 116 | -0.48445 0.99927 0 117 | -0.0063364 0.99927 0 118 | 0.63265 -0.030612 0 119 | -------------------------------------------------------------------------------- /logical_regression/logical_regression.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/logical_regression.py 3 | import numpy as np 4 | import matplotlib as plt 5 | import time 6 | 7 | def exeTime(func): 8 | """耗时计算装饰器 9 | 10 | Args: 11 | func 待装饰函数 12 | Returns: 13 | newFunc 装饰后的函数 14 | """ 15 | def newFunc(*args, **args2): 16 | t0 = time.time() 17 | back = func(*args, **args2) 18 | return back, time.time() - t0 19 | return newFunc 20 | 21 | def loadDataSet(filename): 22 | """读取数据集 23 | 数据以TAB进行分割 24 | 25 | Args: 26 | filename 文件名 27 | Returns: 28 | X 训练样本集矩阵 29 | y 标签集矩阵 30 | """ 31 | numFeat = len(open(filename).readline().split('\t')) - 1 32 | X = [] 33 | y = [] 34 | file = open(filename) 35 | for line in file.readlines(): 36 | lineArr = [] 37 | curLine = line.strip().split('\t') 38 | for i in range(numFeat): 39 | lineArr.append(float(curLine[i])) 40 | X.append([1.0, float(lineArr[0]), float(lineArr[1])]) 41 | y.append(float(curLine[-1])) 42 | return np.mat(X), np.mat(y).T 43 | 44 | def sigmoid(z): 45 | """sigmoid函数 46 | """ 47 | return 1.0/(1.0+np.exp(-z)) 48 | 49 | def J(theta, X, y, theLambda=0): 50 | """预测代价函数 51 | """ 52 | m, n = X.shape 53 | h = sigmoid(X.dot(theta)) 54 | J = (-1.0/m)*(np.log(h).T.dot(y)+np.log(1-h).T.dot(1-y)) + (theLambda/(2.0*m))*np.sum(np.square(theta[1:])) 55 | if np.isnan(J[0]): 56 | return(np.inf) 57 | return J.flatten()[0,0] 58 | 59 | @exeTime 60 | def gradient(X, y, options): 61 | """随机梯度下降法 62 | Args: 63 | X 样本矩阵 64 | y 标签矩阵 65 | rate 学习率 66 | options.theLambda 正规参数 67 | options.maxLoop 最大迭代次数 68 | options.epsilon 收敛精度 69 | options.method 70 | - 'sgd' 随机梯度下降法 71 | - 'bgd' 批量梯度下降法 72 | Returns: 73 | (thetas, errors), timeConsumed 74 | """ 75 | m,n = X.shape 76 | # 初始化参数矩阵 77 | theta = np.ones((n,1)) 78 | count = 0 # 迭代次数 79 | # 初始化误差无限大 80 | error = float('inf') 81 | # 保存误差变化状况 82 | errors = [] 83 | # 保存参数的变化状况 84 | thetas = [] 85 | rate = options.get('rate', 
0.01) 86 | epsilon = options.get('epsilon', 0.1) 87 | maxLoop = options.get('maxLoop', 1000) 88 | theLambda = options.get('theLambda', 0) 89 | method = options['method'] 90 | def _sgd(theta): 91 | converged = False 92 | for i in range(maxLoop): 93 | if converged: 94 | break 95 | for j in range(m): 96 | h = sigmoid(X[j] *theta) 97 | diff = h - y[j] 98 | theta = theta - rate*(1.0/m)*X[j].T*diff 99 | error = J(theta, X, y) 100 | errors.append(error) 101 | if error < epsilon: 102 | converged = True 103 | break 104 | thetas.append(theta) 105 | return thetas, errors, i+1 106 | def _bgd(theta): 107 | for i in range(maxLoop): 108 | h = sigmoid(X.dot(theta)) 109 | diff = h - y 110 | # theta0 should not be regularized 111 | theta = theta - rate*((1.0/m)*X.T*diff + (theLambda/m)*np.r_[[[0]], theta[1:]]) 112 | error = J(theta, X, y, theLambda) 113 | errors.append(error) 114 | if error < epsilon: 115 | break 116 | thetas.append(theta) 117 | return thetas, errors, i+1 118 | methods = { 119 | 'sgd': _sgd, 120 | 'bgd': _bgd 121 | } 122 | return methods[method](theta) 123 | 124 | def oneVsAll(X, y, options): 125 | """One-vs-All 多分类 126 | 127 | Args: 128 | X 样本 129 | y 标签 130 | options 训练配置 131 | Returns: 132 | Thetas 权值矩阵 133 | """ 134 | # 类型数 135 | classes = set(np.ravel(y)) 136 | # 决策边界矩阵 137 | Thetas = np.zeros((len(classes), X.shape[1])) 138 | # 一次选定每种分类对应的样本为正样本,其他样本标识为负样本,进行逻辑回归 139 | for idx, c in enumerate(classes): 140 | newY = np.zeros(y.shape) 141 | newY[np.where(y == c)] = 1 142 | result, timeConsumed = gradient(X, newY, options) 143 | thetas,errors,iterations = result 144 | Thetas[idx] = thetas[-1].ravel() 145 | return Thetas 146 | 147 | def predictOneVsAll(X,Thetas): 148 | """One-vs-All下的多分类预测 149 | 150 | Args: 151 | X 样本 152 | Thetas 权值矩阵 153 | Returns: 154 | H 预测结果 155 | """ 156 | H = sigmoid(Thetas * X.T) 157 | return H 158 | -------------------------------------------------------------------------------- /logical_regression/test_linear_boundry.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_linear_boundry.py 3 | import numpy as np 4 | import logical_regression as regression 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as mtick 7 | 8 | if __name__ == "__main__": 9 | X, y = regression.loadDataSet('data/linear.txt') 10 | m, n = X.shape 11 | options = [{ 12 | 'rate': 0.1, 13 | 'epsilon': 0.01, 14 | 'maxLoop': 500, 15 | 'method': 'bgd' 16 | },{ 17 | 'rate': 1, 18 | 'epsilon': 0.01, 19 | 'maxLoop': 200, 20 | 'method': 'sgd' 21 | }] 22 | for option in options: 23 | result, timeConsumed = regression.gradient(X, y, option) 24 | thetas, errors, iterationCount = result 25 | theta = thetas[-1] 26 | print theta, errors[-1], iterationCount 27 | # 绘制数据点 28 | fittingFig = plt.figure() 29 | title = '%s: rate=%.2f, iterationCount=%d, error=%.2f \n time: %.2fs' % ( 30 | option['method'], option['rate'], iterationCount, errors[-1], timeConsumed) 31 | ax = fittingFig.add_subplot(111, title=title) 32 | ax.set_xlabel('X1') 33 | ax.set_ylabel('X2') 34 | for i in range(m): 35 | x = X[i].A[0] 36 | if y[i] == 1: 37 | ax.scatter(x[1], x[2], marker='*', color='black', s=50) 38 | else: 39 | ax.scatter(x[1], x[2], marker='o', color='green', s=50) 40 | # 绘制决策边界 41 | x1Min = X[:, 1].min() 42 | x1Max = X[:, 1].max() 43 | x2Min = X[:, 2].min() 44 | x2Max = X[:, 2].max() 45 | xx1, xx2 = np.meshgrid(np.linspace(x1Min, x1Max), 46 | np.linspace(x2Min, x2Max)) 47 | h = 
regression.sigmoid(np.c_[np.ones((xx1.ravel().shape[0],1)), xx1.ravel(), xx2.ravel()].dot(theta)) 48 | h = h.reshape(xx1.shape) 49 | plt.contour(xx1, xx2, h, [0.5], colors='b', linewidth=.5) 50 | plt.show() 51 | 52 | # 绘制误差曲线 53 | errorsFig = plt.figure() 54 | ax = errorsFig.add_subplot(111) 55 | ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.4f')) 56 | 57 | ax.plot(range(len(errors)), errors) 58 | ax.set_xlabel('Number of iterations') 59 | ax.set_ylabel('Cost J') 60 | plt.show() 61 | 62 | # 绘制theta的变化情况 63 | thetasFig, ax = plt.subplots(len(thetas[0])) 64 | thetas = np.asarray(thetas) 65 | for idx, sp in enumerate(ax): 66 | thetaList = thetas[:, idx] 67 | sp.plot(range(len(thetaList)), thetaList) 68 | sp.set_xlabel('Number of iteration') 69 | sp.set_ylabel(r'$\theta_%d$'%idx) 70 | plt.show() 71 | -------------------------------------------------------------------------------- /logical_regression/test_non_linear_boundry.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_non_linear_boundry.py 3 | import numpy as np 4 | import logical_regression as regression 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as mtick 7 | from sklearn.preprocessing import PolynomialFeatures 8 | 9 | if __name__ == "__main__": 10 | X, y = regression.loadDataSet('data/non_linear.txt') 11 | poly = PolynomialFeatures(6) 12 | XX = poly.fit_transform(X[:,1:3]) 13 | m, n = XX.shape 14 | options = [{ 15 | 'rate': 1, 16 | 'epsilon': 0.01, 17 | 'theLambda': theLambda, 18 | 'maxLoop': 3000, 19 | 'method': 'bgd' 20 | } for theLambda in [0, 1.0, 100.0]] 21 | figures, axes = plt.subplots(1,3, sharey = True, figsize=(17,5)) 22 | for idx, option in enumerate(options): 23 | result, timeConsumed = regression.gradient(XX, y, option) 24 | thetas, errors, iterationCount = result 25 | theta = thetas[-1] 26 | print theta, errors[-1], iterationCount 27 | ax = axes[idx] 28 | # 绘制数据点 29 | title = '%s: rate=%.2f, iterationCount=%d, \n theLambda=%d, \n error=%.2f time: %.2fs' % ( 30 | option['method'], option['rate'], iterationCount, option['theLambda'], errors[-1], timeConsumed) 31 | ax.set_title(title) 32 | ax.set_xlabel('X1') 33 | ax.set_ylabel('X2') 34 | for i in range(m): 35 | x = X[i].A[0] 36 | if y[i] == 1: 37 | ax.scatter(x[1], x[2], marker='*', color='black', s=50) 38 | else: 39 | ax.scatter(x[1], x[2], marker='o', color='green', s=50) 40 | # 绘制决策边界 41 | x1Min = X[:, 1].min() 42 | x1Max = X[:, 1].max() 43 | x2Min = X[:, 2].min() 44 | x2Max = X[:, 2].max() 45 | xx1, xx2 = np.meshgrid(np.linspace(x1Min, x1Max), 46 | np.linspace(x2Min, x2Max)) 47 | h = regression.sigmoid(poly.fit_transform(np.c_[xx1.ravel(), xx2.ravel()]).dot(theta)) 48 | h = h.reshape(xx1.shape) 49 | ax.contour(xx1, xx2, h, [0.5], colors='b', linewidth=.5) 50 | plt.show() 51 | -------------------------------------------------------------------------------- /logical_regression/test_onevsall.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # logical_regression/test_onevsall.py 3 | """OneVsAll 多分类测试 4 | """ 5 | import numpy as np 6 | import logical_regression as regression 7 | from scipy.io import loadmat 8 | 9 | if __name__ == "__main__": 10 | data = loadmat('data/ex3data1.mat') 11 | X = np.mat(data['X']) 12 | y = np.mat(data['y']) 13 | # 为X添加偏置 14 | X = np.append(np.ones((X.shape[0], 1)), X, axis=1) 15 | # 采用批量梯度下降法 16 | options = { 17 | 'rate': 0.1, 18 | 'epsilon': 0.1, 19 | 'maxLoop': 5000, 20 | 
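        # one binary 'bgd' classifier is trained per digit class (10 in total);
        # predictOneVsAll then takes the argmax over the 10 sigmoid outputs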
'method': 'bgd' 21 | } 22 | # 训练 23 | Thetas = regression.oneVsAll(X,y,options) 24 | # 预测 25 | H = regression.predictOneVsAll(X, Thetas) 26 | pred = np.argmax(H,axis=0)+1 27 | # 计算准确率 28 | print 'Training accuracy is: %.2f%'%(np.mean(pred == y.ravel())*100) 29 | -------------------------------------------------------------------------------- /neural_network/data/ex4weights.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/neural_network/data/ex4weights.mat -------------------------------------------------------------------------------- /neural_network/data/handwritten_digits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/neural_network/data/handwritten_digits.mat -------------------------------------------------------------------------------- /neural_network/nn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/nn.py 3 | import numpy as np 4 | from scipy.optimize import minimize 5 | from scipy import stats 6 | 7 | def sigmoid(z): 8 | """sigmoid 9 | """ 10 | return 1 / (1 + np.exp(-z)) 11 | 12 | def sigmoidDerivative(a): 13 | """sigmoid求导 14 | """ 15 | return np.multiply(a, (1-a)) 16 | 17 | def initThetas(hiddenNum, unitNum, inputSize, classNum, epsilon): 18 | """初始化权值矩阵 19 | 20 | Args: 21 | hiddenNum 隐层数目 22 | unitNum 每个隐层的神经元数目 23 | inputSize 输入层规模 24 | classNum 分类数目 25 | epsilon epsilon 26 | Returns: 27 | Thetas 权值矩阵序列 28 | """ 29 | hiddens = [unitNum for i in range(hiddenNum)] 30 | units = [inputSize] + hiddens + [classNum] 31 | Thetas = [] 32 | for idx, unit in enumerate(units): 33 | if idx == len(units) - 1: 34 | break 35 | nextUnit = units[idx + 1] 36 | # 考虑偏置 37 | Theta = np.random.rand(nextUnit, unit + 1) * 2 * epsilon - epsilon 38 | Thetas.append(Theta) 39 | return Thetas 40 | 41 | def computeCost(Thetas, y, theLambda, X=None, a=None): 42 | """计算代价 43 | 44 | Args: 45 | Thetas 权值矩阵序列 46 | X 样本 47 | y 标签集 48 | a 各层激活值 49 | Returns: 50 | J 预测代价 51 | """ 52 | m = y.shape[0] 53 | if a is None: 54 | a = fp(Thetas, X) 55 | error = -np.sum(np.multiply(y.T,np.log(a[-1]))+np.multiply((1-y).T, np.log(1-a[-1]))) 56 | # 正规化参数 57 | reg = -np.sum([np.sum(Theta[:, 1:]) for Theta in Thetas]) 58 | return (1.0 / m) * error + (1.0 / (2 * m)) * theLambda * reg 59 | 60 | def gradientCheck(Thetas,X,y,theLambda): 61 | """梯度校验 62 | 63 | Args: 64 | Thetas 权值矩阵 65 | X 样本 66 | y 标签 67 | theLambda 正规化参数 68 | Returns: 69 | checked 是否检测通过 70 | """ 71 | m, n = X.shape 72 | # 前向传播计算各个神经元的激活值 73 | a = fp(Thetas, X) 74 | # 反向传播计算梯度增量 75 | D = bp(Thetas, a, y, theLambda) 76 | # 计算预测代价 77 | J = computeCost(Thetas, y, theLambda, a=a) 78 | DVec = unroll(D) 79 | # 求梯度近似 80 | epsilon = 1e-4 81 | gradApprox = np.zeros(DVec.shape) 82 | ThetaVec = unroll(Thetas) 83 | shapes = [Theta.shape for Theta in Thetas] 84 | for i,item in enumerate(ThetaVec): 85 | ThetaVec[i] = item - epsilon 86 | JMinus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) 87 | ThetaVec[i] = item + epsilon 88 | JPlus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) 89 | gradApprox[i] = (JPlus-JMinus) / (2*epsilon) 90 | # 用欧氏距离表示近似程度 91 | diff = np.linalg.norm(gradApprox - DVec) 92 | if diff < 1e-2: 93 | return True 94 | else: 95 | return False 96 | 97 | def adjustLabels(y): 98 | """校正分类标签 99 | 100 | Args: 101 | y 
标签集 102 | Returns: 103 | yAdjusted 校正后的标签集 104 | """ 105 | # 保证标签对类型的标识是逻辑标识 106 | if y.shape[1] == 1: 107 | classes = set(np.ravel(y)) 108 | classNum = len(classes) 109 | minClass = min(classes) 110 | if classNum > 2: 111 | yAdjusted = np.zeros((y.shape[0], classNum), np.float64) 112 | for row, label in enumerate(y): 113 | yAdjusted[row, label - minClass] = 1 114 | else: 115 | yAdjusted = np.zeros((y.shape[0], 1), np.float64) 116 | for row, label in enumerate(y): 117 | if label != minClass: 118 | yAdjusted[row, 0] = 1.0 119 | return yAdjusted 120 | return y 121 | 122 | 123 | def unroll(matrixes): 124 | """参数展开 125 | 126 | Args: 127 | matrixes 矩阵 128 | Return: 129 | vec 向量 130 | """ 131 | vec = [] 132 | for matrix in matrixes: 133 | vector = matrix.reshape(1, -1)[0] 134 | vec = np.concatenate((vec, vector)) 135 | return vec 136 | 137 | 138 | def roll(vector, shapes): 139 | """参数恢复 140 | 141 | Args: 142 | vector 向量 143 | shapes shape list 144 | Returns: 145 | matrixes 恢复的矩阵序列 146 | """ 147 | matrixes = [] 148 | begin = 0 149 | for shape in shapes: 150 | end = begin + shape[0] * shape[1] 151 | matrix = vector[begin:end].reshape(shape) 152 | begin = end 153 | matrixes.append(matrix) 154 | return matrixes 155 | 156 | 157 | def fp(Thetas, X): 158 | """前向反馈过程 159 | 160 | Args: 161 | Thetas 权值矩阵 162 | X 输入样本 163 | Returns: 164 | a 各层激活向量 165 | """ 166 | layers = range(len(Thetas) + 1) 167 | layerNum = len(layers) 168 | # 激活向量序列 169 | a = range(layerNum) 170 | # 前向传播计算各层输出 171 | for l in layers: 172 | if l == 0: 173 | a[l] = X.T 174 | else: 175 | z = Thetas[l - 1] * a[l - 1] 176 | a[l] = sigmoid(z) 177 | # 除输出层外,需要添加偏置 178 | if l != layerNum - 1: 179 | a[l] = np.concatenate((np.ones((1, a[l].shape[1])), a[l])) 180 | return a 181 | 182 | 183 | def bp(Thetas, a, y, theLambda): 184 | """反向传播过程 185 | 186 | Args: 187 | a 激活值 188 | y 标签 189 | Returns: 190 | D 权值梯度 191 | """ 192 | m = y.shape[0] 193 | layers = range(len(Thetas) + 1) 194 | layerNum = len(layers) 195 | d = range(len(layers)) 196 | delta = [np.zeros(Theta.shape) for Theta in Thetas] 197 | for l in layers[::-1]: 198 | if l == 0: 199 | # 输入层不计算误差 200 | break 201 | if l == layerNum - 1: 202 | # 输出层误差 203 | d[l] = a[l] - y.T 204 | else: 205 | # 忽略偏置 206 | d[l] = np.multiply((Thetas[l][:,1:].T * d[l + 1]), sigmoidDerivative(a[l][1:, :])) 207 | for l in layers[0:layerNum - 1]: 208 | delta[l] = d[l + 1] * (a[l].T) 209 | D = [np.zeros(Theta.shape) for Theta in Thetas] 210 | for l in range(len(Thetas)): 211 | Theta = Thetas[l] 212 | # 偏置更新增量 213 | D[l][:, 0] = (1.0 / m) * (delta[l][0:, 0].reshape(1, -1)) 214 | # 权值更新增量 215 | D[l][:, 1:] = (1.0 / m) * (delta[l][0:, 1:] + 216 | theLambda * Theta[:, 1:]) 217 | return D 218 | 219 | def updateThetas(m, Thetas, D, alpha, theLambda): 220 | """更新权值 221 | 222 | Args: 223 | m 样本数 224 | Thetas 各层权值矩阵 225 | D 梯度 226 | alpha 学习率 227 | theLambda 正规化参数 228 | Returns: 229 | Thetas 更新后的权值矩阵 230 | """ 231 | for l in range(len(Thetas)): 232 | Thetas[l] = Thetas[l] - alpha * D[l] 233 | return Thetas 234 | 235 | 236 | def gradientDescent(Thetas, X, y, alpha, theLambda): 237 | """梯度下降 238 | 239 | Args: 240 | X 样本 241 | y 标签 242 | alpha 学习率 243 | theLambda 正规化参数 244 | Returns: 245 | J 预测代价 246 | Thetas 更新后的各层权值矩阵 247 | """ 248 | # 样本数,特征数 249 | m, n = X.shape 250 | # 前向传播计算各个神经元的激活值 251 | a = fp(Thetas, X) 252 | # 反向传播计算梯度增量 253 | D = bp(Thetas, a, y, theLambda) 254 | # 计算预测代价 255 | J = computeCost(Thetas,y,theLambda,a=a) 256 | # 更新权值 257 | Thetas = updateThetas(m, Thetas, D, alpha, theLambda) 258 | if np.isnan(J): 259 | 
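        # a NaN cost (log(0) from a saturated sigmoid) is reported as np.inf,
        # which the train() loop below treats as divergence and aborts on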
J = np.inf 260 | return J, Thetas 261 | 262 | def train(X, y, Thetas=None, hiddenNum=0, unitNum=5, epsilon=1, alpha=1, theLambda=0, precision=0.01, maxIters=50): 263 | """网络训练 264 | 265 | Args: 266 | X 训练样本 267 | y 标签集 268 | Thetas 初始化的Thetas,如果为None,由系统随机初始化Thetas 269 | hiddenNum 隐藏层数目 270 | unitNum 隐藏层的单元数 271 | epsilon 初始化权值的范围[-epsilon, epsilon] 272 | alpha 学习率 273 | theLambda 正规化参数 274 | precision 误差精度 275 | maxIters 最大迭代次数 276 | """ 277 | # 样本数,特征数 278 | m, n = X.shape 279 | # 矫正标签集 280 | y = adjustLabels(y) 281 | classNum = y.shape[1] 282 | # 初始化Theta 283 | if Thetas is None: 284 | Thetas = initThetas( 285 | inputSize=n, 286 | hiddenNum=hiddenNum, 287 | unitNum=unitNum, 288 | classNum=classNum, 289 | epsilon=epsilon 290 | ) 291 | # 先进性梯度校验 292 | print 'Doing Gradient Checking....' 293 | checked = gradientCheck(Thetas, X, y, theLambda) 294 | if checked: 295 | for i in range(maxIters): 296 | error, Thetas = gradientDescent( 297 | Thetas, X, y, alpha=alpha, theLambda=theLambda) 298 | if error < precision: 299 | break 300 | if error == np.inf: 301 | break 302 | if error < precision: 303 | success = True 304 | else: 305 | success = False 306 | return { 307 | 'error': error, 308 | 'Thetas': Thetas, 309 | 'iters': i, 310 | 'success': error 311 | } 312 | else: 313 | print 'Error: Gradient Cheching Failed!!!' 314 | return { 315 | 'error': None, 316 | 'Thetas': None, 317 | 'iters': 0, 318 | 'success': False 319 | } 320 | 321 | def predict(X, Thetas): 322 | """预测函数 323 | 324 | Args: 325 | X: 样本 326 | Thetas: 训练后得到的参数 327 | Return: 328 | a 329 | """ 330 | a = fp(Thetas,X) 331 | return a[-1] 332 | -------------------------------------------------------------------------------- /neural_network/test_handwritten_digits.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/test_handwritten_digits.py 3 | """手写字符集 4 | """ 5 | import nn 6 | import numpy as np 7 | from sklearn import datasets 8 | from scipy.io import loadmat 9 | 10 | # digits = datasets.load_digits() 11 | # 12 | # 13 | # X = digits.images.reshape((len(digits.images), -1)) 14 | # y = digits.target.reshape(-1, 1) 15 | 16 | data = loadmat('data/handwritten_digits.mat') 17 | Thetas = loadmat('data/ex4weights.mat') 18 | Thetas = [Thetas['Theta1'], Thetas['Theta2']] 19 | 20 | 21 | X = np.mat(data['X']) 22 | y = np.mat(data['y']) 23 | 24 | res = nn.train(X,y,hiddenNum=1,unitNum=25,Thetas=Thetas, precision = 0.5) 25 | print 'Error is: %.4f'%res['error'] 26 | -------------------------------------------------------------------------------- /neural_network/test_logic_and.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # neural_network/test_logic_and.py 3 | """逻辑AND运算 4 | """ 5 | import nn 6 | import numpy as np 7 | 8 | data = np.mat([ 9 | [0, 0, 0], 10 | [1, 0, 0], 11 | [0, 1, 0], 12 | [1, 1, 1] 13 | ]) 14 | 15 | X = data[:, 0:2] 16 | y = data[:, 2] 17 | 18 | res = nn.train(X, y, hiddenNum=0, alpha=10, maxIters=5000, precision=0.01) 19 | print 'Run %d iterations'%res['iters'] 20 | print 'Error is: %.4f'%res['error'] 21 | print 'Theta is: ', res['Thetas'][0] 22 | -------------------------------------------------------------------------------- /pca/data/bird_small.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/bird_small.mat 
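bird_small.mat stores a small RGB image as a height x width x 3 array A; test_pca4visualization.py (below) flattens it so that every pixel becomes one 3-feature sample for K-Means. A minimal sketch of that flattening, plus an illustrative 16-color reconstruction that is not part of the repo's test scripts (assumes the kMeans defined in pca/kmeans.py below):

import numpy as np
from scipy.io import loadmat
import kmeans

A = loadmat('data/bird_small.mat')['A'] / 255.0
h, w, c = A.shape
X = np.mat(A.reshape(h * w, c))               # one row per pixel, features = (R, G, B)
centroids, assignment = kmeans.kMeans(X, 16)  # cluster pixels into 16 colors
idx = assignment[:, 0].A.ravel().astype(int)  # cluster id of every pixel
compressed = np.asarray(centroids)[idx].reshape(h, w, c)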
-------------------------------------------------------------------------------- /pca/data/ex7data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7data1.mat -------------------------------------------------------------------------------- /pca/data/ex7data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7data2.mat -------------------------------------------------------------------------------- /pca/data/ex7faces.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/pca/data/ex7faces.mat -------------------------------------------------------------------------------- /pca/kmeans.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # svm/kmeans.py 3 | import numpy as np 4 | 5 | def loadDataSet(filename): 6 | """ 7 | 读取数据集 8 | 9 | Args: 10 | filename: 文件名 11 | Returns: 12 | dataMat: 数据样本矩阵 13 | """ 14 | dataMat = [] 15 | fr = open(filename) 16 | for line in fr.readlines(): 17 | curLine = line.strip().split('\t') 18 | # 通过map函数批量转换 19 | fitLine = map(float, curLine) 20 | dataMat.append(fitLine) 21 | return dataMat 22 | 23 | def distEclud(vecA, vecB): 24 | """ 25 | 计算两向量的欧氏距离 26 | 27 | Args: 28 | vecA: 向量A 29 | vecB: 向量B 30 | Returns: 31 | 欧式距离 32 | """ 33 | return np.sqrt(np.sum(np.power(vecA - vecB, 2))) 34 | 35 | def randCent(dataSet, k): 36 | """ 37 | 随机生成k个聚类中心 38 | 39 | Args: 40 | dataSet: 数据集 41 | k: 簇数目 42 | Returns: 43 | centroids: 聚类中心矩阵 44 | """ 45 | _, n = dataSet.shape 46 | centroids = np.mat(np.zeros((k, n))) 47 | for j in range(n): 48 | # 随机聚类中心落在数据集的边界之内 49 | minJ = np.min(dataSet[:, j]) 50 | maxJ = np.max(dataSet[:, j]) 51 | rangeJ = float(maxJ - minJ) 52 | centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1) 53 | return centroids 54 | 55 | def kMeans(dataSet, k, maxIter = 5): 56 | """ 57 | K-Means 58 | 59 | Args: 60 | dataSet: 数据集 61 | k: 聚类数 62 | Returns: 63 | centroids: 聚类中心 64 | clusterAssment: 点分配结果 65 | """ 66 | # 随机初始化聚类中心 67 | centroids = randCent(dataSet, k) 68 | m, n = np.shape(dataSet) 69 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 70 | clusterAssment = np.mat(np.zeros((m, 2))) 71 | # 标识聚类中心是否仍在改变 72 | clusterChanged = True 73 | # 直至聚类中心不再变化 74 | iterCount = 0 75 | while clusterChanged and iterCount < maxIter: 76 | iterCount += 1 77 | clusterChanged = False 78 | # 分配样本到簇 79 | for i in range(m): 80 | # 计算第i个样本到各个聚类中心的距离 81 | minIndex = 0 82 | minDist = np.inf 83 | for j in range(k): 84 | dist = distEclud(dataSet[i, :], centroids[j, :]) 85 | if(dist < minDist): 86 | minIndex = j 87 | minDist = dist 88 | # 判断cluster是否改变 89 | if(clusterAssment[i, 0] != minIndex): 90 | clusterChanged = True 91 | clusterAssment[i, :] = minIndex, minDist**2 92 | # 刷新聚类中心: 移动聚类中心到所在簇的均值位置 93 | for cent in range(k): 94 | # 通过数组过滤获得簇中的点 95 | ptsInCluster = dataSet[np.nonzero( 96 | clusterAssment[:, 0].A == cent)[0]] 97 | if ptsInCluster.shape[0] > 0: 98 | # 计算均值并移动 99 | centroids[cent, :] = np.mean(ptsInCluster, axis=0) 100 | return centroids, clusterAssment 101 | 102 | def biKmeans(dataSet, k): 103 | """ 104 | 二分kmeans算法 105 | Args: 106 | dataSet: 数据集 107 | k: 聚类数 108 | Returns: 109 | centroids: 聚类中心 110 | clusterAssment: 点分配结果 
111 | """ 112 | m, n = np.shape(dataSet) 113 | # 起始时,只有一个簇,该簇的聚类中心为所有样本的平均位置 114 | centroid0 = np.mean(dataSet, axis=0).tolist()[0] 115 | # 设置一个列表保存当前的聚类中心 116 | currentCentroids = [centroid0] 117 | # 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离 118 | clusterAssment = np.mat(np.zeros((m, 2))) 119 | # 初始化点分配结果,默认将所有样本先分配到初始簇 120 | for j in range(m): 121 | clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2 122 | # 直到簇的数目达标 123 | while len(currentCentroids) < k: 124 | # 当前最小的代价 125 | lowestError = np.inf 126 | # 对于每一个簇 127 | for j in range(len(currentCentroids)): 128 | # 获得该簇的样本 129 | ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :] 130 | # 在该簇上进行2-means聚类 131 | # 注意,得到的centroids,其聚类编号含0,1 132 | centroids, clusterAss = kMeans(ptsInCluster, 2) 133 | # 获得划分后的误差之和 134 | splitedError = np.sum(clusterAss[:, 1]) 135 | # 获得其他簇的样本 136 | ptsNoInCluster = dataSet[np.nonzero( 137 | clusterAssment[:, 0].A != j)[0]] 138 | # 获得剩余数据集的误差 139 | nonSplitedError = np.sum(ptsNoInCluster[:, 1]) 140 | # 比较,判断此次划分是否划算 141 | if (splitedError + nonSplitedError) < lowestError: 142 | # 如果划算,刷新总误差 143 | lowestError = splitedError + nonSplitedError 144 | # 记录当前的应当划分的簇 145 | needToSplit = j 146 | # 新获得的簇以及点分配结果 147 | newCentroids = centroids.A 148 | newClusterAss = clusterAss.copy() 149 | # 更新簇的分配结果 150 | # 第0簇应当修正为被划分的簇 151 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[ 152 | 0], 0] = needToSplit 153 | # 第1簇应当修正为最新一簇 154 | newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[ 155 | 0], 0] = len(currentCentroids) 156 | # 被划分的簇需要更新 157 | currentCentroids[needToSplit] = newCentroids[0, :] 158 | # 加入新的划分后的簇 159 | currentCentroids.append(newCentroids[1, :]) 160 | # 刷新点分配结果 161 | clusterAssment[np.nonzero( 162 | clusterAssment[:, 0].A == needToSplit 163 | )[0], :] = newClusterAss 164 | return np.mat(currentCentroids), clusterAssment 165 | -------------------------------------------------------------------------------- /pca/pca.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/pca.py 3 | 4 | import numpy as np 5 | 6 | def normalize(X): 7 | """数据标准化处理 8 | 9 | Args: 10 | X 样本 11 | Returns: 12 | XNorm 标准化后的样本 13 | """ 14 | XNorm = X.copy() 15 | m,n = XNorm.shape 16 | mean = np.mean(XNorm, axis=0) 17 | std = np.std(XNorm, axis=0) 18 | XNorm = (XNorm - mean) / std 19 | return XNorm 20 | 21 | def PCA(X, k = 1): 22 | """PCA 23 | 24 | Args: 25 | X 样本 26 | k 目的维度 27 | Returns: 28 | XNorm 标准化后的样本 29 | Z 降维后的新样本 30 | U U 31 | UReduce UReduce 32 | S S 33 | V V 34 | """ 35 | m, n = X.shape 36 | # 数据归一化 37 | XNorm = normalize(X) 38 | # 计算协方差矩阵 39 | Coef = XNorm.T * XNorm/m 40 | # 奇异值分解 41 | U, S, V = np.linalg.svd(Coef) 42 | # 取出前 k 个向量 43 | UReduce = U[:, 0:k] 44 | Z = XNorm * UReduce 45 | return XNorm, Z, U, UReduce, S, V 46 | 47 | def recover(UReduce, Z): 48 | """数据恢复 49 | 50 | Args: 51 | UReduce UReduce 52 | Z 降维后的样本 53 | """ 54 | return Z * UReduce.T 55 | -------------------------------------------------------------------------------- /pca/test_pca4performance.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/test_pca4visualization.py 3 | 4 | import pca 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from scipy.io import loadmat 8 | 9 | def display(images, width, height): 10 | """展示图片 11 | 12 | Args: 13 | images 图像样本 14 | width 图像宽 15 | height 图像高 16 | """ 17 | m, n = images.shape 18 | rows = int(np.floor(np.sqrt(m))) 19 | cols = int(np.ceil(m / rows)) 20 | 
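    # the m images are tiled into a rows x cols mosaic: image idx = cols*i + j
    # fills rows [i*height, (i+1)*height) and cols [j*width, (j+1)*width)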
# 图像拼接 21 | dstImage = images.copy() 22 | dstImage = np.zeros((rows * height, cols * width)) 23 | for i in range(rows): 24 | for j in range(cols): 25 | idx = cols * i + j 26 | image = images[idx].reshape(height, width) 27 | dstImage[i * height:i * height + height, 28 | j * width: j * width + width] = image 29 | plt.imshow(dstImage.T, cmap='gray') 30 | plt.axis('off') 31 | plt.show() 32 | 33 | data = loadmat('data/ex7faces.mat') 34 | X = np.mat(data['X'],dtype=np.float32) 35 | m, n = X.shape 36 | 37 | # 展示原图 38 | display(X[0:100, :], 32, 32) 39 | 40 | XNorm, Z, U, UReduce, S, V = pca.PCA(X, k=100) 41 | XRec = pca.recover(UReduce, Z) 42 | 43 | # 显示修复后的图,可以看出,PCA 损失了一部分细节 44 | display(XRec[0:100, :], 32, 32) 45 | -------------------------------------------------------------------------------- /pca/test_pca4visualization.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # pca/test_pca4visualization.py 3 | 4 | import numpy as np 5 | import kmeans 6 | import pca 7 | from scipy.io import loadmat 8 | from mpl_toolkits.mplot3d import Axes3D 9 | import matplotlib.pyplot as plt 10 | import matplotlib.cm as cmx 11 | import matplotlib.colors as colors 12 | 13 | def getCmap(count): 14 | color_norm = colors.Normalize(vmin=0, vmax=count-1) 15 | scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') 16 | def map_index_to_rgb_color(index): 17 | return scalar_map.to_rgba(index) 18 | return map_index_to_rgb_color 19 | 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111, projection='3d') 22 | 23 | data = loadmat('data/bird_small.mat') 24 | A = data['A'] 25 | 26 | A = A / 255.0; 27 | 28 | height, width, channels = A.shape 29 | X = np.mat(A.reshape(height * width, channels)) 30 | 31 | m, n = X.shape 32 | 33 | clusterNum = 16 34 | cmap = getCmap(clusterNum) 35 | centroids, clusterAssment = kmeans.kMeans(X, clusterNum) 36 | # 随机选择 1000 个样本绘制 37 | sampleSize = 1000 38 | sampleIndexs = np.random.choice(m, sampleSize) 39 | clusters = clusterAssment[sampleIndexs] 40 | samples = X[sampleIndexs] 41 | 42 | # 三维下观察 43 | for i in range(sampleSize): 44 | x, y, z = samples[i,:].A[0] 45 | center = clusters[i, 0] 46 | color = cmap(center) 47 | ax.scatter([x], [y], [z], color=color, marker='o') 48 | plt.show() 49 | 50 | # 二维下观察 51 | reducedSamples = pca.PCA(samples, k=2)[1] 52 | for i in range(sampleSize): 53 | x, y = reducedSamples[i,:].A[0] 54 | center = clusters[i, 0] 55 | color = cmap(center) 56 | plt.scatter([x], [y], color=color, marker='o') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /recommender_system/data/ex8_movieParams.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/ex8_movieParams.mat -------------------------------------------------------------------------------- /recommender_system/data/ex8_movies.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/ex8_movies.mat -------------------------------------------------------------------------------- /recommender_system/data/movie_ids.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/recommender_system/data/movie_ids.txt -------------------------------------------------------------------------------- /recommender_system/recommender.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # recommender_system/recommender.py 3 | import numpy as np 4 | from scipy.optimize import minimize, check_grad 5 | from pydash import py_ 6 | 7 | def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100): 8 | """训练方法 9 | 10 | Args: 11 | Y 评价矩阵 12 | R 是否评价矩阵 13 | params 是否具有初始化参数 14 | n 商品特征数 15 | theLambda 正规化参数 16 | maxIter 最大迭代次数 17 | Returns: 18 | train 训练方法 19 | predict 预测方法 20 | """ 21 | 22 | # 商品数,用户数 23 | nm, nu = Y.shape 24 | 25 | # normalize YMean 26 | mu = np.mean(Y, axis=0) 27 | mu = np.zeros((Y.shape[0], 1), dtype=np.float) 28 | for i in range(nm): 29 | totalRates = np.sum(Y[i]) 30 | validCount = len(np.nonzero(R[i])[0]) 31 | mu[i] = totalRates / validCount 32 | Y = Y - mu 33 | 34 | def unroll(Theta, X): 35 | """参数折叠 36 | 37 | Args: 38 | Theta 用户偏好矩阵 39 | X 商品内容矩阵 40 | Returns: 41 | vector 折叠后的参数 42 | """ 43 | 44 | return np.hstack((X.A.T.flatten(), Theta.A.T.flatten())) 45 | 46 | def roll(vector): 47 | """参数回复 48 | 49 | Args: 50 | vector 参数向量 51 | Returns: 52 | Theta 用户偏好矩阵 53 | X 商品内容矩阵 54 | """ 55 | X = np.mat(vector[:nm * n].reshape(n, nm).T) 56 | Theta = np.mat(vector[nm * n:].reshape(n, nu).T) 57 | return Theta, X 58 | 59 | def initParams(): 60 | """初始化参数 61 | 62 | Returns: 63 | Theta 用户对内容的偏好矩阵 64 | X 商品内容矩阵 65 | """ 66 | Theta = np.mat(np.random.rand(nu, n)) 67 | X = np.mat(np.random.rand(nm, n)) 68 | return Theta, X 69 | 70 | def regParams(param): 71 | """正规化参数 72 | Args: 73 | param 参数 74 | Return: 75 | regParam 正规化后的参数 76 | """ 77 | return theLambda * 0.5 * np.sum(np.power(param, 2)) 78 | 79 | def J(params): 80 | """代价函数 81 | 82 | Args: 83 | params 参数向量 84 | nu 用户数 85 | nm 商品数 86 | n 特征数 87 | Return: 88 | J 预测代价 89 | """ 90 | # 参数展开 91 | Theta, X = roll(params) 92 | # 计算误差 93 | rows, cols = np.nonzero(R) 94 | # 预测 95 | H = predict(Theta, X) 96 | Diff = H - Y 97 | Diff[R != 1] = 0 98 | error = 0.5 * np.sum(np.power(Diff, 2)) 99 | # 正规化 Theta 100 | regTheta = regParams(Theta) 101 | # 正规化 x 102 | regX = regParams(X) 103 | return error + regTheta + regX 104 | 105 | def gradient(params): 106 | """梯度下降 107 | 108 | Args: 109 | params 参数向量 110 | Returns: 111 | grad 梯度向量 112 | """ 113 | Theta, X = roll(params) 114 | ThetaGrad = np.mat(np.zeros(Theta.shape)) 115 | XGrad = np.mat(np.zeros(X.shape)) 116 | error = predict(Theta, X) - Y 117 | error[R != 1] = 0 118 | ThetaGrad = error.T * X + theLambda * Theta 119 | XGrad = error * Theta + theLambda * X 120 | return unroll(ThetaGrad, XGrad) 121 | 122 | def train(): 123 | """训练方法 124 | 125 | Returns: 126 | Theta 用户的偏好矩阵 127 | X 商品的内容矩阵 128 | """ 129 | # 初始化参数 130 | if not params: 131 | Theta, X = initParams() 132 | else: 133 | Theta = params['Theta'] 134 | X = params['X'] 135 | # 最小化目标函数 136 | res = minimize(J, x0=unroll(Theta, X), jac=gradient, 137 | method='CG', options={'disp': True, 'maxiter': maxIter}) 138 | Theta, X = roll(res.x) 139 | return Theta, X 140 | 141 | def predict(Theta, X): 142 | """预测 143 | Args: 144 | Theta 用户对内容的偏好矩阵 145 | X 商品内容矩阵 146 | Return: 147 | h 预测 148 | """ 149 | return X * Theta.T + mu 150 | 151 | def getTopRecommends(Theta, X, i, count, rated, items): 152 | """获得推荐 153 | 154 | Args: 155 | Theta Theta 156 | X X 157 | i 用户下标 158 | count 获得推荐的数目 
159 | rated 已经评价的类目id 160 | items 商品清单 161 | Returns: 162 | topRecommends 推荐项目 163 | """ 164 | predictions = predict(Theta, X)[:, i] 165 | return py_(items) \ 166 | .map(lambda item, idx: (item, predictions[idx])) \ 167 | .sort_by(lambda item: item[1], reverse = True) \ 168 | .take(count) \ 169 | .value() 170 | 171 | return train, predict, getTopRecommends 172 | -------------------------------------------------------------------------------- /recommender_system/test_movies_rating.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # recommender_system/test_movies_rating.py 3 | 4 | import numpy as np 5 | import recommender 6 | from scipy.io import loadmat 7 | 8 | data = loadmat('data/ex8_movies.mat') 9 | # 评价矩阵 10 | Y = data['Y'] 11 | # 是否评价矩阵 12 | R = data['R'] 13 | 14 | movieParams = loadmat('data/ex8_movieParams.mat') 15 | numMovies = movieParams['num_movies'][0,0] 16 | numFeatures = movieParams['num_features'][0,0] 17 | 18 | # 获得movies 19 | def getMovie(line): 20 | return ' '.join(line.split()[1:]) 21 | 22 | with open('data/movie_ids.txt') as f: 23 | movieList = [getMovie(f.readline()) for i in range(numMovies)] 24 | 25 | myRatings = np.mat(np.zeros((numMovies,1))) 26 | 27 | myRatings[0] = 4 28 | myRatings[97] = 2 29 | myRatings[6] = 3 30 | myRatings[11] = 5 31 | myRatings[53] = 4 32 | myRatings[63] = 5 33 | myRatings[65] = 3 34 | myRatings[68] = 5 35 | myRatings[182] = 4 36 | myRatings[225] = 5 37 | myRatings[354] = 5 38 | print 'New user ratings:' 39 | for i in range(numMovies): 40 | if myRatings[i] > 0: 41 | print 'Rated %d for %s' % (myRatings[i], movieList[i]) 42 | 43 | # 训练推荐模型 44 | Y = np.column_stack((myRatings, Y)) 45 | R = np.column_stack((myRatings, R)).astype(bool) 46 | 47 | print '\nTraing Result:' 48 | train, predict, getTopRecommends = recommender.getRecommender( 49 | Y, R, n=numFeatures, theLambda=10.0) 50 | Theta, X = train() 51 | rated = np.nonzero(myRatings)[0].tolist() 52 | topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList) 53 | 54 | print '\nTop recommendations for you:' 55 | for recommend in topRecommends: 56 | print 'Predicting rating %.1f for movie %s' % (recommend[1], recommend[0]) 57 | -------------------------------------------------------------------------------- /svm/data/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | 11 | -------------------------------------------------------------------------------- /svm/data/emailSample2.txt: -------------------------------------------------------------------------------- 1 | Folks, 2 | 3 | my first time posting - have a bit of Unix experience, but am new to Linux. 4 | 5 | 6 | Just got a new PC at home - Dell box with Windows XP. Added a second hard disk 7 | for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went 8 | fine except it didn't pick up my monitor. 9 | 10 | I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4 11 | Ti4200 video card, both of which are probably too new to feature in Suse's default 12 | set. 
I downloaded a driver from the nVidia website and installed it using RPM. 13 | Then I ran Sax2 (as was recommended in some postings I found on the net), but 14 | it still doesn't feature my video card in the available list. What next? 15 | 16 | Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice, 17 | the whole machine crashes (in Linux, not Windows) - even the on/off switch is 18 | inactive, leaving me to reach for the power cable instead. 19 | 20 | If anyone can help me in any way with these probs., I'd be really grateful - 21 | I've searched the 'net but have run out of ideas. 22 | 23 | Or should I be going for a different version of Linux such as RedHat? Opinions 24 | welcome. 25 | 26 | Thanks a lot, 27 | Peter 28 | 29 | -- 30 | Irish Linux Users' Group: ilug@linux.ie 31 | http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information. 32 | List maintainer: listmaster@linux.ie 33 | 34 | 35 | -------------------------------------------------------------------------------- /svm/data/ex6data1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data1.mat -------------------------------------------------------------------------------- /svm/data/ex6data2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data2.mat -------------------------------------------------------------------------------- /svm/data/ex6data3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/ex6data3.mat -------------------------------------------------------------------------------- /svm/data/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /svm/data/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 
6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /svm/data/spamTest.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/spamTest.mat -------------------------------------------------------------------------------- /svm/data/spamTrain.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yoyoyohamapi/mit-ml/bafefeec9a30b80bf0c29c246e517519d69b0f20/svm/data/spamTrain.mat -------------------------------------------------------------------------------- /svm/smo.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # svm/smo.py 3 | 4 | import numpy as np 5 | from sklearn.metrics.pairwise import rbf_kernel 6 | 7 | """ 8 | svm模型 9 | """ 10 | 11 | def linearKernel(): 12 | """线性核函数 13 | """ 14 | def calc(X, A): 15 | return X * A.T 16 | return calc 17 | 18 | def rbfKernel(delta): 19 | """rbf核函数 20 | """ 21 | gamma = 1.0 / (2 * delta**2) 22 | 23 | def calc(X, A): 24 | return np.mat(rbf_kernel(X, A, gamma=gamma)) 25 | return calc 26 | 27 | def getSmo(X, y, C, tol, maxIter, kernel=linearKernel()): 28 | """SMO 29 | 30 | Args: 31 | X 训练样本 32 | y 标签集 33 | C 正规化参数 34 | tol 容忍值 35 | maxIter 最大迭代次数 36 | K 所用核函数 37 | 38 | Returns: 39 | trainSimple 简化版训练算法 40 | train 完整版训练算法 41 | predict 预测函数 42 | """ 43 | m, n = X.shape 44 | # 存放核函数的转化结果 45 | K = kernel(X, X) 46 | # Cache存放预测误差,用以加快计算速度 47 | ECache = np.zeros((m,2)) 48 | 49 | def predict(X, alphas, b, supportVectorsIndex, supportVectors): 50 | """计算权值向量 51 | 52 | Args: 53 | X 预测矩阵 54 | alphas alphas 55 | b b 56 | supportVectorsIndex 支持向量坐标集 57 | supportVectors 支持向量 58 | Returns: 59 | predicts 预测结果 60 | """ 61 | Ks = kernel(supportVectors, X) 62 | predicts = (np.multiply(alphas[supportVectorsIndex], y[ 63 | supportVectorsIndex]).T * Ks + b).T 64 | predicts = np.sign(predicts) 65 | return predicts 66 | 67 | def w(alphas, b, supportVectorsIndex, supportVectors): 68 | """计算权值 69 | 70 | Args: 71 | alphas alphas 72 | b b 73 | supportVectorsIndex 支持向量坐标 74 | supportVectors 支持向量 75 | Returns: 76 | w 权值向量 77 | """ 78 | return (np.multiply(alphas[supportVectorsIndex], y[ 79 | supportVectorsIndex]).T * supportVectors).T 80 | 81 | def E(i, alphas, b): 82 | """计算预测误差 83 | 84 | Args: 85 | i i 86 | alphas alphas 87 | b b 88 | Returns: 89 | E_i 第i个样本的预测误差 90 | """ 91 | FXi = float(np.multiply(alphas, y).T * K[:, i]) + b 92 | E = FXi - float(y[i]) 93 | return E 94 | 95 | def updateE(i, alphas, b): 96 | ECache[i] = [1, E(i, alphas, b)] 97 | 98 | def selectJRand(i): 99 | """ 100 | """ 101 | j = i 102 | while j == i: 103 | j = int(np.random.uniform(0, m)) 104 | return j 105 | 106 | def selectJ(i, Ei, alphas, b): 107 | """选择权值 $$\alpha^{(i)}$$ 108 | """ 109 | maxJ = 0; maxDist=0; Ej = 0 110 | ECache[i] = [1, Ei] 111 | validCaches = np.nonzero(ECache[:, 0])[0] 112 | if len(validCaches) > 1: 113 | for k in validCaches: 114 | if k==i: continue 115 | Ek = E(k, alphas, b) 116 | dist = np.abs(abs(Ei-Ek)) 117 | if maxDist < dist: 118 | Ej = Ek 119 | maxJ = k 120 | maxDist = dist 121 | return maxJ, Ej 122 | else: 123 | ### 随机选择 124 | j = selectJRand(i) 125 | Ej = E(j, alphas, b) 126 | return j, Ej 127 | 128 | def select(i, alphas, b): 129 | """alpha对选择 130 | """ 131 | Ei = E(i, alphas, b) 132 | # 选择违背KKT条件的,作为alpha2 133 | Ri = y[i] * Ei 134 | if (Ri < -tol and 
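                # KKT check: Ri = y_i*E_i = y_i*f(x_i) - 1, so Ri < -tol with
                # alpha_i < C means the margin is violated (alpha_i should grow),
                # and Ri > tol with alpha_i > 0 means alpha_i should shrink to 0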
134 |         if (Ri < -tol and alphas[i] < C) or \
135 |                 (Ri > tol and alphas[i] > 0):
136 |             # Choose the second multiplier
137 |             j = selectJRand(i)
138 |             Ej = E(j, alphas, b)
139 |             # j, Ej = selectJ(i, Ei, alphas, b)
140 |             # Compute the clipping bounds L and H
141 |             if y[i] != y[j]:
142 |                 L = max(0, alphas[j] - alphas[i])
143 |                 H = min(C, C + alphas[j] - alphas[i])
144 |             else:
145 |                 L = max(0, alphas[j] + alphas[i] - C)
146 |                 H = min(C, alphas[j] + alphas[i])
147 |             if L == H:
148 |                 return 0, alphas, b
149 |             Kii = K[i, i]
150 |             Kjj = K[j, j]
151 |             Kij = K[i, j]
152 |             eta = 2.0 * Kij - Kii - Kjj
153 |             if eta >= 0:
154 |                 return 0, alphas, b
155 |             iOld = alphas[i].copy()
156 |             jOld = alphas[j].copy()
157 |             alphas[j] = jOld - y[j] * (Ei - Ej) / eta
158 |             if alphas[j] > H:
159 |                 alphas[j] = H
160 |             elif alphas[j] < L:
161 |                 alphas[j] = L
162 |             if abs(alphas[j] - jOld) < tol:
163 |                 alphas[j] = jOld
164 |                 return 0, alphas, b
165 |             alphas[i] = iOld + y[i] * y[j] * (jOld - alphas[j])
166 |             # Update b
167 |             bINew = b - Ei - y[i] * (alphas[i] - iOld) * Kii - y[j] * \
168 |                 (alphas[j] - jOld) * Kij
169 |             bJNew = b - Ej - y[i] * (alphas[i] - iOld) * Kij - y[j] * \
170 |                 (alphas[j] - jOld) * Kjj
171 |             if alphas[i] > 0 and alphas[i] < C:
172 |                 b = bINew
173 |             elif alphas[j] > 0 and alphas[j] < C:
174 |                 b = bJNew
175 |             else:
176 |                 b = (bINew + bJNew) / 2
177 |             # Refresh the error cache with the updated alphas and b
178 |             updateE(i, alphas, b)
179 |             updateE(j, alphas, b)
180 |             return 1, alphas, b
181 |         else:
182 |             return 0, alphas, b
183 | 
184 |     def train():
185 |         """Full training routine
186 | 
187 |         Returns:
188 |             alphas alphas
189 |             w weight vector
190 |             b b
191 |             supportVectorsIndex indices of the support vectors
192 |             supportVectors support vectors
193 |             iterCount number of iterations
194 |         """
195 |         numChanged = 0
196 |         examineAll = True
197 |         iterCount = 0
198 |         alphas = np.mat(np.zeros((m, 1)))
199 |         b = 0
200 |         # Alternate between full passes over the training set and passes over
201 |         # the non-bound alphas, i.e. those strictly inside (0, C)
202 |         while (numChanged > 0 or examineAll) and (iterCount < maxIter):
203 |             numChanged = 0
204 |             if examineAll:
205 |                 for i in range(m):
206 |                     changed, alphas, b = select(i, alphas, b)
207 |                     numChanged += changed
208 |             else:
209 |                 nonBoundIds = np.nonzero((alphas.A > 0) * (alphas.A < C))[0]
210 |                 for i in nonBoundIds:
211 |                     changed, alphas, b = select(i, alphas, b)
212 |                     numChanged += changed
213 |             iterCount += 1
214 | 
215 |             if examineAll:
216 |                 examineAll = False
217 |             elif numChanged == 0:
218 |                 examineAll = True
219 |         supportVectorsIndex = np.nonzero(alphas.A > 0)[0]
220 |         supportVectors = np.mat(X[supportVectorsIndex])
221 |         return alphas, w(alphas, b, supportVectorsIndex, supportVectors), b, \
222 |             supportVectorsIndex, supportVectors, iterCount
223 | 
224 |     def trainSimple():
225 |         """Simplified training routine
226 | 
227 |         Returns:
228 |             alphas alphas
229 |             w weight vector
230 |             b b
231 |             supportVectorsIndex indices of the support vectors
232 |             supportVectors support vectors
233 |             iterCount number of iterations
234 |         """
235 |         numChanged = 0
236 |         iterCount = 0
237 |         alphas = np.mat(np.zeros((m, 1)))
238 |         b = 0
239 |         L = 0
240 |         H = 0
241 |         while iterCount < maxIter:
242 |             numChanged = 0
243 |             for i in range(m):
244 |                 Ei = E(i, alphas, b)
245 |                 Ri = y[i] * Ei
246 |                 # Only optimize multipliers that violate the KKT conditions
247 |                 if (Ri < -tol and alphas[i] < C) or \
248 |                         (Ri > tol and alphas[i] > 0):
249 |                     # Choose the second multiplier at random
250 |                     j = selectJRand(i)
251 |                     Ej = E(j, alphas, b)
252 |                     # Compute the clipping bounds L and H
253 |                     if y[i] != y[j]:
254 |                         L = max(0, alphas[j] - alphas[i])
255 |                         H = min(C, C + alphas[j] - alphas[i])
256 |                     else:
257 |                         L = max(0, alphas[j] + alphas[i] - C)
258 |                         H = min(C, alphas[j] + alphas[i])
259 |                     if L == H:
260 |                         continue
261 |                     Kii = K[i, i]
262 |                     Kjj = K[j, j]
263 |                     Kij = K[i, j]
264 |                     eta = 2.0 * Kij - Kii - Kjj
265 |                     if eta >= 0:
266 |                         continue
267 |                     iOld = alphas[i].copy()
268 |                     jOld = alphas[j].copy()
269 |                     alphas[j] = jOld - y[j] * (Ei - Ej) / eta
270 |                     if alphas[j] > H:
271 |                         alphas[j] = H
272 |                     elif alphas[j] < L:
273 |                         alphas[j] = L
274 |                     if abs(alphas[j] - jOld) < tol:
275 |                         alphas[j] = jOld
276 |                         continue
277 |                     alphas[i] = iOld + y[i] * y[j] * (jOld - alphas[j])
278 |                     # Update b
279 |                     bINew = b - Ei - y[i] * (alphas[i] - iOld) * Kii - y[j] * \
280 |                         (alphas[j] - jOld) * Kij
281 |                     bJNew = b - Ej - y[i] * (alphas[i] - iOld) * Kij - y[j] * \
282 |                         (alphas[j] - jOld) * Kjj
283 |                     if alphas[i] > 0 and alphas[i] < C:
284 |                         b = bINew
285 |                     elif alphas[j] > 0 and alphas[j] < C:
286 |                         b = bJNew
287 |                     else:
288 |                         b = (bINew + bJNew) / 2.0
289 |                     numChanged += 1
290 |             if numChanged == 0:
291 |                 iterCount += 1
292 |             else:
293 |                 iterCount = 0
294 |         supportVectorsIndex = np.nonzero(alphas.A > 0)[0]
295 |         supportVectors = np.mat(X[supportVectorsIndex])
296 |         return alphas, w(alphas, b, supportVectorsIndex, supportVectors), b, \
297 |             supportVectorsIndex, supportVectors, iterCount
298 |     return trainSimple, train, predict
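For reference, a minimal usage sketch of the `getSmo` API above, on a toy data set invented for illustration (the real test scripts below run it on the course's .mat files). Note that `rbfKernel(delta)` calls sklearn's `rbf_kernel` with gamma = 1 / (2 * delta**2), i.e. the kernel $$K(x, z) = \exp(-\|x - z\|^2 / (2\delta^2))$$:

    # Hedged sketch: toy data invented for illustration
    import numpy as np
    import smo

    X = np.mat([[1.0, 2.0], [2.0, 1.0], [6.0, 5.0], [7.0, 8.0]])
    y = np.mat([[-1.0], [-1.0], [1.0], [1.0]])  # labels must be -1/+1 column vectors

    trainSimple, train, predict = smo.getSmo(X, y, C=1.0, tol=1e-3, maxIter=20)
    alphas, w, b, svIndex, svs, iterCount = train()
    predictions = predict(X, alphas, b, svIndex, svs)  # column of -1/+1

    # A non-linear model is built the same way:
    # smo.getSmo(X, y, 1.0, 1e-3, 20, kernel=smo.rbfKernel(0.1))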
--------------------------------------------------------------------------------
/svm/spam.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/spam.py
3 | 
4 | """Spam classifier helpers
5 | """
6 | 
7 | import numpy as np
8 | from stemming.porter2 import stem
9 | from pydash import py_
10 | 
11 | # Load the vocabulary
12 | vocabList = []
13 | with open('vocab.txt') as f:
14 |     for line in f:
15 |         idx, w = line.split()
16 |         vocabList.append(w)
17 | 
18 | 
19 | def processEmail(email):
20 |     """Preprocess an email
21 | 
22 |     Args:
23 |         email email body
24 |     Returns:
25 |         indices 1-based positions of the email's words in the vocabulary
26 |                 (0 for words not in the vocabulary)
27 |     """
28 |     # Strip HTML tags --> normalize URLs --> normalize email addresses
29 |     # --> normalize numbers --> normalize dollar signs --> lower-case
30 |     # --> split into words --> stem --> map words to vocabulary indices
31 |     return py_(email) \
32 |         .strip_tags() \
33 |         .reg_exp_replace(r'(http|https)://[^\s]*', 'httpaddr') \
34 |         .reg_exp_replace(r'[^\s]+@[^\s]+', 'emailaddr') \
35 |         .reg_exp_replace(r'\d+', 'number') \
36 |         .reg_exp_replace(r'[$]+', 'dollar') \
37 |         .lower_case() \
38 |         .trim() \
39 |         .words() \
40 |         .map(stem) \
41 |         .map(lambda word : py_.index_of(vocabList, word) + 1) \
42 |         .value()
43 | 
44 | def extractFeatures(indices):
45 |     """Extract a binary feature vector
46 | 
47 |     Args:
48 |         indices word indices
49 |     Returns:
50 |         feature feature vector: 1 if the vocabulary word occurs in the email
51 |     """
52 |     feature = py_.map(range(1, len(vocabList) + 1),
53 |                       lambda index: py_.index_of(indices, index) > -1)
54 |     return np.array(feature, dtype=np.uint)
55 | 
56 | def getTopPredictors(weights, count):
57 |     """Get the words that most strongly indicate spam
58 | 
59 |     Args:
60 |         weights weights
61 |         count top count
62 |     Returns:
63 |         predictors (word, weight) pairs, sorted by weight
64 |     """
65 |     return py_(vocabList) \
66 |         .map(lambda word, idx: (word, weights[idx])) \
67 |         .sort_by(lambda item: item[1], reverse = True) \
68 |         .take(count) \
69 |         .value()
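A hedged sketch of how these helpers chain together, raw text --> vocabulary indices --> binary features (the email string is invented; like spam.py itself, it assumes vocab.txt plus the pydash and stemming packages are available):

    import spam

    email = "Do you want to make $1000 per week? Visit http://example.com now"
    indices = spam.processEmail(email)        # vocabulary positions, 0 if unknown
    features = spam.extractFeatures(indices)  # 0/1 vector over the 1899 vocabulary words
    print features.shape, features.sum()      # (1899,) and the count of distinct known words

Feature vectors built this way can be stacked into a matrix and fed to a linear SVM, which is what test_spam.py does with its raw sample emails.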
--------------------------------------------------------------------------------
/svm/test_linear.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_linear.py
3 | import smo
4 | import numpy as np
5 | from scipy.io import loadmat
6 | import matplotlib.pyplot as plt
7 | 
8 | data = loadmat('data/ex6data1.mat')
9 | 
10 | X = np.mat(data['X'])
11 | y = np.mat(data['y'], dtype=np.float)
12 | y[y==0] = -1
13 | 
14 | m, n = X.shape
15 | tol = 1e-3
16 | maxIter = 20
17 | # C = 1.0
18 | C = 100.0
19 | 
20 | trainSimple, train, predict = smo.getSmo(X, y, C, tol, maxIter)
21 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = trainSimple()
22 | print w
23 | print b
24 | print len(supportVectorsIndex)
25 | print 'iterCount:%d' % iterCount
26 | 
27 | predictions = predict(X, alphas, b, supportVectorsIndex, supportVectors)
28 | errorCount = (np.multiply(predictions, y).A < 0).sum()
29 | print 'error rate: %.2f' % (float(errorCount) / m)
30 | 
31 | # Plot the data points; support vectors are drawn in red
32 | x1Min = X[:, 0].min()
33 | x1Max = X[:, 0].max()
34 | x2Min = X[:, 1].min()
35 | x2Max = X[:, 1].max()
36 | plt.xlabel('X1')
37 | plt.ylabel('X2')
38 | plt.xlim(x1Min - 1, x1Max + 1)
39 | plt.ylim(x2Min - 1, x2Max + 1)
40 | plt.title('C=%.1f' % C)
41 | for i in range(m):
42 |     x = X[i].A[0]
43 |     if y[i] == 1:
44 |         color = 'black'
45 |         if i in supportVectorsIndex:
46 |             color = 'red'
47 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
48 |     else:
49 |         color = 'green'
50 |         if i in supportVectorsIndex:
51 |             color = 'red'
52 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
53 | 
54 | # Plot the decision boundary
55 | x = np.arange(x1Min, x1Max, 0.1)
56 | h = (-w[0,0] * x - b[0,0]) / w[1,0]
57 | plt.plot(x, h)
58 | plt.show()
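The boundary drawn above follows from the learned hyperplane: points on it satisfy $$w_1 x_1 + w_2 x_2 + b = 0$$, so for a grid of $$x_1$$ values the script plots $$x_2 = -(w_1 x_1 + b) / w_2$$, which is exactly the `h` computed from `w` and `b`.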
--------------------------------------------------------------------------------
/svm/test_model_selection.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_model_selection.py
3 | 
4 | import numpy as np
5 | import smo
6 | import matplotlib.pyplot as plt
7 | from scipy.io import loadmat
8 | 
9 | data = loadmat('data/ex6data3.mat')
10 | 
11 | X = np.mat(data['X'], dtype=np.float)
12 | y = np.mat(data['y'], dtype=np.float)
13 | XVal = np.mat(data['Xval'], dtype=np.float)
14 | yVal = np.mat(data['yval'], dtype=np.float)
15 | 
16 | m, n = X.shape
17 | mVal, _ = XVal.shape
18 | 
19 | # Relabel negative samples from 0 to -1
20 | y[y == 0] = -1
21 | yVal[yVal == 0] = -1
22 | 
23 | Cs = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
24 | deltas = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
25 | 
26 | # All combinations of C and delta
27 | deltaCPairs = [[delta, C] for C in Cs for delta in deltas]
28 | 
29 | # Build one model per combination
30 | tol = 1e-3
31 | maxIter = 5
32 | models = [smo.getSmo(X, y, C, tol, maxIter, kernel=smo.rbfKernel(delta))
33 |           for delta, C in deltaCPairs]
34 | 
35 | # Train every model
36 | results = [train() for _, train, _ in models]
37 | 
38 | # Pick the model with the lowest error on the cross-validation set
39 | predictions = [models[idx][2](XVal, alphas, b, supportVectorsIndex, supportVectors)
40 |                for idx, (alphas, w, b, supportVectorsIndex, supportVectors, iterCount) in enumerate(results)]
41 | errorRates = [(np.multiply(prediction, yVal).A < 0).sum() /
42 |               float(mVal) for prediction in predictions]
43 | minIdx = np.argmin(errorRates)
44 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = results[minIdx]
45 | delta, C = deltaCPairs[minIdx]
46 | 
47 | # Plot the data points; support vectors are drawn in red
48 | x1Min = X[:, 0].min()
49 | x1Max = X[:, 0].max()
50 | x2Min = X[:, 1].min()
51 | x2Max = X[:, 1].max()
52 | plt.title(r'C=%.2f, $\delta$=%.2f, error=%.2f' % (C, delta, errorRates[minIdx]))
53 | plt.xlabel('X1')
54 | plt.ylabel('X2')
55 | plt.xlim(x1Min, x1Max)
56 | plt.ylim(x2Min, x2Max)
57 | 
58 | for i in range(m):
59 |     x = X[i].A[0]
60 |     if y[i] == 1:
61 |         color = 'black'
62 |         if i in supportVectorsIndex:
63 |             color = 'red'
64 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
65 |     else:
66 |         color = 'green'
67 |         if i in supportVectorsIndex:
68 |             color = 'red'
69 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
70 | 
71 | 
72 | # Plot the decision boundary of the selected model
73 | xx1, xx2 = np.meshgrid(
74 |     np.linspace(x1Min, x1Max, 100),
75 |     np.linspace(x2Min, x2Max, 100)
76 | )
77 | _, _, predict = models[minIdx]
78 | predictX = np.mat(np.c_[xx1.ravel(), xx2.ravel()])
79 | predictions = predict(predictX, alphas, b, supportVectorsIndex, supportVectors)
80 | predictions = predictions.reshape(xx1.shape)
81 | plt.contour(xx1, xx2, predictions, [0.5], linewidths=5)
82 | plt.show()
--------------------------------------------------------------------------------
/svm/test_non_linear.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_non_linear.py
3 | import smo
4 | import numpy as np
5 | from scipy.io import loadmat
6 | import matplotlib.pyplot as plt
7 | 
8 | data = loadmat('data/ex6data2.mat')
9 | 
10 | X = np.mat(data['X'])
11 | y = np.mat(data['y'], dtype=np.float)
12 | y[y==0] = -1
13 | 
14 | m, n = X.shape
15 | C = 1.0
16 | tol = 1e-3
17 | maxIter = 5
18 | kernel = smo.rbfKernel(0.1)
19 | 
20 | trainSimple, train, predict = smo.getSmo(X, y, C, tol, maxIter, kernel=kernel)
21 | alphas, w, b, supportVectorsIndex, supportVectors, iterCount = train()
22 | print supportVectorsIndex
23 | print len(supportVectorsIndex)
24 | print 'iterCount:%d' % iterCount
25 | 
26 | predictions = predict(X, alphas, b, supportVectorsIndex, supportVectors)
27 | errorCount = (np.multiply(predictions, y).A < 0).sum()
28 | print errorCount
29 | print 'error rate: %.2f' % (float(errorCount) / m)
30 | 
31 | # Plot the data points; support vectors are drawn in red
32 | x1Min = X[:, 0].min()
33 | x1Max = X[:, 0].max()
34 | x2Min = X[:, 1].min()
35 | x2Max = X[:, 1].max()
36 | plt.title('C=%.1f' % C)
37 | plt.xlabel('X1')
38 | plt.ylabel('X2')
39 | plt.xlim(x1Min, x1Max)
40 | plt.ylim(x2Min, x2Max)
41 | 
42 | for i in range(m):
43 |     x = X[i].A[0]
44 |     if y[i] == 1:
45 |         color = 'black'
46 |         if i in supportVectorsIndex:
47 |             color = 'red'
48 |         plt.scatter(x[0], x[1], marker='*', color=color, s=50)
49 |     else:
50 |         color = 'green'
51 |         if i in supportVectorsIndex:
52 |             color = 'red'
53 |         plt.scatter(x[0], x[1], marker='o', color=color, s=50)
54 | 
55 | 
56 | # Plot the decision boundary on a 100x100 grid
57 | xx1, xx2 = np.meshgrid(
58 |     np.linspace(x1Min, x1Max, 100),
59 |     np.linspace(x2Min, x2Max, 100)
60 | )
61 | predictX = np.mat(np.c_[xx1.ravel(), xx2.ravel()])
62 | predictions = predict(predictX, alphas, b, supportVectorsIndex, supportVectors)
63 | predictions = predictions.reshape(xx1.shape)
64 | plt.contour(xx1, xx2, predictions, [0.5], linewidths=5)
65 | plt.show()
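The scripts above all count misclassifications with the same trick: with labels in {-1, +1}, a prediction is wrong exactly when prediction x label is negative, so the error rate is the fraction of negative entries in the elementwise product. A tiny self-contained illustration with invented numbers:

    import numpy as np

    predictions = np.mat([[1.0], [-1.0], [1.0], [-1.0]])
    labels = np.mat([[1.0], [-1.0], [-1.0], [-1.0]])  # the third entry is a mistake
    errorCount = (np.multiply(predictions, labels).A < 0).sum()
    print 'error rate: %.2f' % (float(errorCount) / labels.shape[0])  # 0.25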
--------------------------------------------------------------------------------
/svm/test_spam.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | # svm/test_spam.py
3 | import spam
4 | import numpy as np
5 | from scipy.io import loadmat
6 | from sklearn.svm import SVC
7 | 
8 | # Spam classifier
9 | data = loadmat('data/spamTrain.mat')
10 | X = np.mat(data['X'])
11 | y = data['y']
12 | m, n = X.shape
13 | C = 0.1
14 | tol = 1e-3
15 | 
16 | # Train the classifier on the training set
17 | clf = SVC(C=C, kernel='linear', tol=tol)
18 | clf.fit(X, y.ravel())
19 | predictions = np.mat([clf.predict(X[i, :]) for i in range(m)])
20 | accuracy = 100 * np.mean(predictions == y)
21 | print 'Training set accuracy: %0.2f %%' % accuracy
22 | 
23 | # Evaluate the trained classifier on the held-out test set
24 | data = loadmat('data/spamTest.mat')
25 | XTest = np.mat(data['Xtest'])
26 | yTest = data['ytest']
27 | mTest, _ = XTest.shape
28 | 
29 | predictions = np.mat([clf.predict(XTest[i, :]) for i in range(mTest)])
30 | accuracy = 100 * np.mean(predictions == yTest)
31 | print 'Test set accuracy: %0.2f %%' % accuracy
32 | 
33 | # Words most indicative of spam (those given the largest weights by the model)
34 | weights = clf.coef_.flatten()
35 | top = 15
36 | predictors = spam.getTopPredictors(weights, top)
37 | print '\nTop %d predictors of spam:' % top
38 | for word, weight in predictors:
39 |     print '%-15s (%f)' % (word, weight)
40 | 
41 | # Classify raw example emails
42 | def genExample(f):
43 |     email = open(f).read()
44 |     indices = spam.processEmail(email)
45 |     features = spam.extractFeatures(indices)
46 |     return features
47 | 
48 | files = [
49 |     'data/emailSample1.txt',
50 |     'data/emailSample2.txt',
51 |     'data/spamSample1.txt',
52 |     'data/spamSample2.txt'
53 | ]
54 | 
55 | emails = np.mat([genExample(f) for f in files], dtype=np.uint8)
56 | labels = np.array([[0, 0, 1, 1]]).reshape(-1, 1)
57 | predictions = np.mat([clf.predict(emails[i, :]) for i in range(len(files))])
58 | accuracy = 100 * np.mean(predictions == labels)
59 | print '\nTest set accuracy for own datasets: %0.2f %%' % accuracy
--------------------------------------------------------------------------------
/svm/vocab.txt:
--------------------------------------------------------------------------------
1 | 1 aa 2 | 2 ab 3 | 3 abil 4 | 4 abl 5 | 5 about 6 | 6 abov 7 | 7 absolut 8 | 8 abus 9 | 9 ac 10 | 10 accept 11 | 11 access 12 | 12 accord 13 | 13 account 14 | 14 achiev 15 | 15 acquir 16 | 16 across 17 | 17 act 18 | 18 action 19 | 19 activ 20 | 20 actual 21 | 21 ad 22 | 22 adam 23 | 23 add 24 | 24 addit 25 | 25 address 26 | 26 administr 27 | 27 adult 28 | 28 advanc 29 | 29 advantag 30 | 30 advertis 31 | 31 advic 32 | 32 advis 33 | 33 ae 34 | 34 af 35 | 35 affect 36 | 36 affili 37 | 37 afford 38 | 38 africa 39 | 39 after 40 | 40 ag 41 | 41 again 42 | 42 against 43 | 43 agenc 44 | 44 agent 45 | 45 ago 46 | 46 agre 47 | 47 agreement 48 | 48 aid 49 | 49 air 50 | 50 al 51 | 51 alb 52 | 52 align 53 | 53 all 54 | 54 allow 55 | 55 almost 56 | 56 alon 57 | 57 along 58 | 58 alreadi 59 | 59 alsa 60 | 60 also 61 | 61 altern 62 | 62 although 63 | 63 alwai 64 | 64 am 65 | 65 amaz 66 | 66 america 67 | 67 american 68 | 68 among 69 | 69 amount 70 | 70 amp 71 | 71 an 72 | 72 analysi 73 | 73 analyst 74 | 74 and 75 | 75 ani 76 | 76 anim 77 | 77 announc 78 | 78 annual 79 | 79 annuiti 80 | 80 anoth 81 | 81 answer 82 | 82 anti 83 | 83 anumb 84 | 84 anybodi 85 | 85 anymor 86 | 86 anyon 87 | 87 anyth 88 | 88 anywai 89 | 89 anywher 90 | 90 aol 91 | 91 ap 92 | 92 apolog 93 | 93 app 94 | 94 appar 95 | 95 appear 96 | 96 appl 97 | 97 appli 98 | 98 applic 99 | 99 appreci 100 | 100 approach 101 | 101 approv 102 | 102 apt 103 | 103 ar 104 | 104 archiv 105 | 105 area 106 | 106 aren 107 | 107 argument 108 | 108 arial 109 | 109 arm 110 | 110 around 111 | 111 arrai 112 | 112 arriv 113 | 113 art 114 | 114 articl 115 | 115 artist 116 | 116 as 117 | 117 ascii 118 | 118 ask 119 | 119 asset 120 | 120 assist 121 | 121 associ 122 | 122 assum 123 | 123 assur 124 | 124 at 125 | 125 atol 126 | 126 attach 127 | 127 attack 128 | 128 attempt 129 | 129 attent 130 | 130 attornei 131 | 131 attract 132 | 132 audio 133 | 133 aug 134 | 134 august 135 | 135 author 136 | 136 auto 137 | 137 autom 138 | 138 automat 139 | 139 avail 140 | 140 averag 141 | 141 avoid 142 | 142 awai 143 | 143 awar 144 | 144 award 145 | 145 ba 146 | 146 babi 147 | 147 back 148 | 148 background 149 | 149 backup 150 | 150 bad 151 | 151 balanc 152 | 152 ban 153 | 153 bank 154 | 154 bar 155 | 155 base 156 | 156 basenumb 157 | 157 basi 158 | 158 basic 159 | 159 bb 160 | 160 bc 161 | 161 bd 162 | 162 be 163 | 163 beat 164 | 164 beberg 165 | 165 becaus 166 | 166 becom 167 | 167 been 168 | 168 befor
169 | 169 begin 170 | 170 behalf 171 | 171 behavior 172 | 172 behind 173 | 173 believ 174 | 174 below 175 | 175 benefit 176 | 176 best 177 | 177 beta 178 | 178 better 179 | 179 between 180 | 180 bf 181 | 181 big 182 | 182 bill 183 | 183 billion 184 | 184 bin 185 | 185 binari 186 | 186 bit 187 | 187 black 188 | 188 blank 189 | 189 block 190 | 190 blog 191 | 191 blood 192 | 192 blue 193 | 193 bnumber 194 | 194 board 195 | 195 bodi 196 | 196 boi 197 | 197 bonu 198 | 198 book 199 | 199 boot 200 | 200 border 201 | 201 boss 202 | 202 boston 203 | 203 botan 204 | 204 both 205 | 205 bottl 206 | 206 bottom 207 | 207 boundari 208 | 208 box 209 | 209 brain 210 | 210 brand 211 | 211 break 212 | 212 brian 213 | 213 bring 214 | 214 broadcast 215 | 215 broker 216 | 216 browser 217 | 217 bug 218 | 218 bui 219 | 219 build 220 | 220 built 221 | 221 bulk 222 | 222 burn 223 | 223 bush 224 | 224 busi 225 | 225 but 226 | 226 button 227 | 227 by 228 | 228 byte 229 | 229 ca 230 | 230 cabl 231 | 231 cach 232 | 232 calcul 233 | 233 california 234 | 234 call 235 | 235 came 236 | 236 camera 237 | 237 campaign 238 | 238 can 239 | 239 canada 240 | 240 cannot 241 | 241 canon 242 | 242 capabl 243 | 243 capillari 244 | 244 capit 245 | 245 car 246 | 246 card 247 | 247 care 248 | 248 career 249 | 249 carri 250 | 250 cartridg 251 | 251 case 252 | 252 cash 253 | 253 cat 254 | 254 catch 255 | 255 categori 256 | 256 caus 257 | 257 cb 258 | 258 cc 259 | 259 cd 260 | 260 ce 261 | 261 cell 262 | 262 cent 263 | 263 center 264 | 264 central 265 | 265 centuri 266 | 266 ceo 267 | 267 certain 268 | 268 certainli 269 | 269 cf 270 | 270 challeng 271 | 271 chanc 272 | 272 chang 273 | 273 channel 274 | 274 char 275 | 275 charact 276 | 276 charg 277 | 277 charset 278 | 278 chat 279 | 279 cheap 280 | 280 check 281 | 281 cheer 282 | 282 chief 283 | 283 children 284 | 284 china 285 | 285 chip 286 | 286 choic 287 | 287 choos 288 | 288 chri 289 | 289 citi 290 | 290 citizen 291 | 291 civil 292 | 292 claim 293 | 293 class 294 | 294 classifi 295 | 295 clean 296 | 296 clear 297 | 297 clearli 298 | 298 click 299 | 299 client 300 | 300 close 301 | 301 clue 302 | 302 cnet 303 | 303 cnumber 304 | 304 co 305 | 305 code 306 | 306 collect 307 | 307 colleg 308 | 308 color 309 | 309 com 310 | 310 combin 311 | 311 come 312 | 312 comfort 313 | 313 command 314 | 314 comment 315 | 315 commentari 316 | 316 commerci 317 | 317 commiss 318 | 318 commit 319 | 319 common 320 | 320 commun 321 | 321 compani 322 | 322 compar 323 | 323 comparison 324 | 324 compat 325 | 325 compet 326 | 326 competit 327 | 327 compil 328 | 328 complet 329 | 329 comprehens 330 | 330 comput 331 | 331 concentr 332 | 332 concept 333 | 333 concern 334 | 334 condit 335 | 335 conf 336 | 336 confer 337 | 337 confid 338 | 338 confidenti 339 | 339 config 340 | 340 configur 341 | 341 confirm 342 | 342 conflict 343 | 343 confus 344 | 344 congress 345 | 345 connect 346 | 346 consid 347 | 347 consolid 348 | 348 constitut 349 | 349 construct 350 | 350 consult 351 | 351 consum 352 | 352 contact 353 | 353 contain 354 | 354 content 355 | 355 continu 356 | 356 contract 357 | 357 contribut 358 | 358 control 359 | 359 conveni 360 | 360 convers 361 | 361 convert 362 | 362 cool 363 | 363 cooper 364 | 364 copi 365 | 365 copyright 366 | 366 core 367 | 367 corpor 368 | 368 correct 369 | 369 correspond 370 | 370 cost 371 | 371 could 372 | 372 couldn 373 | 373 count 374 | 374 countri 375 | 375 coupl 376 | 376 cours 377 | 377 court 378 | 378 cover 379 | 379 coverag 380 | 380 crash 381 | 381 creat 382 | 382 creativ 383 
| 383 credit 384 | 384 critic 385 | 385 cross 386 | 386 cultur 387 | 387 current 388 | 388 custom 389 | 389 cut 390 | 390 cv 391 | 391 da 392 | 392 dagga 393 | 393 dai 394 | 394 daili 395 | 395 dan 396 | 396 danger 397 | 397 dark 398 | 398 data 399 | 399 databas 400 | 400 datapow 401 | 401 date 402 | 402 dave 403 | 403 david 404 | 404 dc 405 | 405 de 406 | 406 dead 407 | 407 deal 408 | 408 dear 409 | 409 death 410 | 410 debt 411 | 411 decad 412 | 412 decid 413 | 413 decis 414 | 414 declar 415 | 415 declin 416 | 416 decor 417 | 417 default 418 | 418 defend 419 | 419 defens 420 | 420 defin 421 | 421 definit 422 | 422 degre 423 | 423 delai 424 | 424 delet 425 | 425 deliv 426 | 426 deliveri 427 | 427 dell 428 | 428 demand 429 | 429 democrat 430 | 430 depart 431 | 431 depend 432 | 432 deposit 433 | 433 describ 434 | 434 descript 435 | 435 deserv 436 | 436 design 437 | 437 desir 438 | 438 desktop 439 | 439 despit 440 | 440 detail 441 | 441 detect 442 | 442 determin 443 | 443 dev 444 | 444 devel 445 | 445 develop 446 | 446 devic 447 | 447 di 448 | 448 dial 449 | 449 did 450 | 450 didn 451 | 451 diet 452 | 452 differ 453 | 453 difficult 454 | 454 digit 455 | 455 direct 456 | 456 directli 457 | 457 director 458 | 458 directori 459 | 459 disabl 460 | 460 discount 461 | 461 discov 462 | 462 discoveri 463 | 463 discuss 464 | 464 disk 465 | 465 displai 466 | 466 disposit 467 | 467 distanc 468 | 468 distribut 469 | 469 dn 470 | 470 dnumber 471 | 471 do 472 | 472 doc 473 | 473 document 474 | 474 doe 475 | 475 doer 476 | 476 doesn 477 | 477 dollar 478 | 478 dollarac 479 | 479 dollarnumb 480 | 480 domain 481 | 481 don 482 | 482 done 483 | 483 dont 484 | 484 doubl 485 | 485 doubt 486 | 486 down 487 | 487 download 488 | 488 dr 489 | 489 draw 490 | 490 dream 491 | 491 drive 492 | 492 driver 493 | 493 drop 494 | 494 drug 495 | 495 due 496 | 496 dure 497 | 497 dvd 498 | 498 dw 499 | 499 dynam 500 | 500 ea 501 | 501 each 502 | 502 earli 503 | 503 earlier 504 | 504 earn 505 | 505 earth 506 | 506 easi 507 | 507 easier 508 | 508 easili 509 | 509 eat 510 | 510 eb 511 | 511 ebai 512 | 512 ec 513 | 513 echo 514 | 514 econom 515 | 515 economi 516 | 516 ed 517 | 517 edg 518 | 518 edit 519 | 519 editor 520 | 520 educ 521 | 521 eff 522 | 522 effect 523 | 523 effici 524 | 524 effort 525 | 525 either 526 | 526 el 527 | 527 electron 528 | 528 elimin 529 | 529 els 530 | 530 email 531 | 531 emailaddr 532 | 532 emerg 533 | 533 empir 534 | 534 employ 535 | 535 employe 536 | 536 en 537 | 537 enabl 538 | 538 encod 539 | 539 encourag 540 | 540 end 541 | 541 enemi 542 | 542 enenkio 543 | 543 energi 544 | 544 engin 545 | 545 english 546 | 546 enhanc 547 | 547 enjoi 548 | 548 enough 549 | 549 ensur 550 | 550 enter 551 | 551 enterpris 552 | 552 entertain 553 | 553 entir 554 | 554 entri 555 | 555 enumb 556 | 556 environ 557 | 557 equal 558 | 558 equip 559 | 559 equival 560 | 560 error 561 | 561 especi 562 | 562 essenti 563 | 563 establish 564 | 564 estat 565 | 565 estim 566 | 566 et 567 | 567 etc 568 | 568 euro 569 | 569 europ 570 | 570 european 571 | 571 even 572 | 572 event 573 | 573 eventu 574 | 574 ever 575 | 575 everi 576 | 576 everyon 577 | 577 everyth 578 | 578 evid 579 | 579 evil 580 | 580 exactli 581 | 581 exampl 582 | 582 excel 583 | 583 except 584 | 584 exchang 585 | 585 excit 586 | 586 exclus 587 | 587 execut 588 | 588 exercis 589 | 589 exist 590 | 590 exmh 591 | 591 expand 592 | 592 expect 593 | 593 expens 594 | 594 experi 595 | 595 expert 596 | 596 expir 597 | 597 explain 598 | 598 explor 599 | 599 express 600 | 600 extend 
601 | 601 extens 602 | 602 extra 603 | 603 extract 604 | 604 extrem 605 | 605 ey 606 | 606 fa 607 | 607 face 608 | 608 fact 609 | 609 factor 610 | 610 fail 611 | 611 fair 612 | 612 fall 613 | 613 fals 614 | 614 famili 615 | 615 faq 616 | 616 far 617 | 617 fast 618 | 618 faster 619 | 619 fastest 620 | 620 fat 621 | 621 father 622 | 622 favorit 623 | 623 fax 624 | 624 fb 625 | 625 fd 626 | 626 featur 627 | 627 feder 628 | 628 fee 629 | 629 feed 630 | 630 feedback 631 | 631 feel 632 | 632 femal 633 | 633 few 634 | 634 ffffff 635 | 635 ffnumber 636 | 636 field 637 | 637 fight 638 | 638 figur 639 | 639 file 640 | 640 fill 641 | 641 film 642 | 642 filter 643 | 643 final 644 | 644 financ 645 | 645 financi 646 | 646 find 647 | 647 fine 648 | 648 finish 649 | 649 fire 650 | 650 firewal 651 | 651 firm 652 | 652 first 653 | 653 fit 654 | 654 five 655 | 655 fix 656 | 656 flag 657 | 657 flash 658 | 658 flow 659 | 659 fnumber 660 | 660 focu 661 | 661 folder 662 | 662 folk 663 | 663 follow 664 | 664 font 665 | 665 food 666 | 666 for 667 | 667 forc 668 | 668 foreign 669 | 669 forev 670 | 670 forget 671 | 671 fork 672 | 672 form 673 | 673 format 674 | 674 former 675 | 675 fortun 676 | 676 forward 677 | 677 found 678 | 678 foundat 679 | 679 four 680 | 680 franc 681 | 681 free 682 | 682 freedom 683 | 683 french 684 | 684 freshrpm 685 | 685 fri 686 | 686 fridai 687 | 687 friend 688 | 688 from 689 | 689 front 690 | 690 ftoc 691 | 691 ftp 692 | 692 full 693 | 693 fulli 694 | 694 fun 695 | 695 function 696 | 696 fund 697 | 697 further 698 | 698 futur 699 | 699 ga 700 | 700 gain 701 | 701 game 702 | 702 gari 703 | 703 garrigu 704 | 704 gave 705 | 705 gcc 706 | 706 geek 707 | 707 gener 708 | 708 get 709 | 709 gif 710 | 710 gift 711 | 711 girl 712 | 712 give 713 | 713 given 714 | 714 global 715 | 715 gnome 716 | 716 gnu 717 | 717 gnupg 718 | 718 go 719 | 719 goal 720 | 720 god 721 | 721 goe 722 | 722 gold 723 | 723 gone 724 | 724 good 725 | 725 googl 726 | 726 got 727 | 727 govern 728 | 728 gpl 729 | 729 grand 730 | 730 grant 731 | 731 graphic 732 | 732 great 733 | 733 greater 734 | 734 ground 735 | 735 group 736 | 736 grow 737 | 737 growth 738 | 738 gt 739 | 739 guarante 740 | 740 guess 741 | 741 gui 742 | 742 guid 743 | 743 ha 744 | 744 hack 745 | 745 had 746 | 746 half 747 | 747 ham 748 | 748 hand 749 | 749 handl 750 | 750 happen 751 | 751 happi 752 | 752 hard 753 | 753 hardwar 754 | 754 hat 755 | 755 hate 756 | 756 have 757 | 757 haven 758 | 758 he 759 | 759 head 760 | 760 header 761 | 761 headlin 762 | 762 health 763 | 763 hear 764 | 764 heard 765 | 765 heart 766 | 766 heaven 767 | 767 hei 768 | 768 height 769 | 769 held 770 | 770 hello 771 | 771 help 772 | 772 helvetica 773 | 773 her 774 | 774 herba 775 | 775 here 776 | 776 hermio 777 | 777 hettinga 778 | 778 hi 779 | 779 high 780 | 780 higher 781 | 781 highli 782 | 782 highlight 783 | 783 him 784 | 784 histori 785 | 785 hit 786 | 786 hold 787 | 787 home 788 | 788 honor 789 | 789 hope 790 | 790 host 791 | 791 hot 792 | 792 hour 793 | 793 hous 794 | 794 how 795 | 795 howev 796 | 796 hp 797 | 797 html 798 | 798 http 799 | 799 httpaddr 800 | 800 huge 801 | 801 human 802 | 802 hundr 803 | 803 ibm 804 | 804 id 805 | 805 idea 806 | 806 ident 807 | 807 identifi 808 | 808 idnumb 809 | 809 ie 810 | 810 if 811 | 811 ignor 812 | 812 ii 813 | 813 iii 814 | 814 iiiiiiihnumberjnumberhnumberjnumberhnumb 815 | 815 illeg 816 | 816 im 817 | 817 imag 818 | 818 imagin 819 | 819 immedi 820 | 820 impact 821 | 821 implement 822 | 822 import 823 | 823 impress 824 | 824 improv 825 | 
825 in 826 | 826 inc 827 | 827 includ 828 | 828 incom 829 | 829 increas 830 | 830 incred 831 | 831 inde 832 | 832 independ 833 | 833 index 834 | 834 india 835 | 835 indian 836 | 836 indic 837 | 837 individu 838 | 838 industri 839 | 839 info 840 | 840 inform 841 | 841 initi 842 | 842 inlin 843 | 843 innov 844 | 844 input 845 | 845 insert 846 | 846 insid 847 | 847 instal 848 | 848 instanc 849 | 849 instant 850 | 850 instead 851 | 851 institut 852 | 852 instruct 853 | 853 insur 854 | 854 int 855 | 855 integr 856 | 856 intel 857 | 857 intellig 858 | 858 intend 859 | 859 interact 860 | 860 interest 861 | 861 interfac 862 | 862 intern 863 | 863 internet 864 | 864 interview 865 | 865 into 866 | 866 intro 867 | 867 introduc 868 | 868 inumb 869 | 869 invest 870 | 870 investig 871 | 871 investor 872 | 872 invok 873 | 873 involv 874 | 874 ip 875 | 875 ireland 876 | 876 irish 877 | 877 is 878 | 878 island 879 | 879 isn 880 | 880 iso 881 | 881 isp 882 | 882 issu 883 | 883 it 884 | 884 item 885 | 885 itself 886 | 886 jabber 887 | 887 jame 888 | 888 java 889 | 889 jim 890 | 890 jnumberiiiiiiihepihepihf 891 | 891 job 892 | 892 joe 893 | 893 john 894 | 894 join 895 | 895 journal 896 | 896 judg 897 | 897 judgment 898 | 898 jul 899 | 899 juli 900 | 900 jump 901 | 901 june 902 | 902 just 903 | 903 justin 904 | 904 keep 905 | 905 kei 906 | 906 kept 907 | 907 kernel 908 | 908 kevin 909 | 909 keyboard 910 | 910 kid 911 | 911 kill 912 | 912 kind 913 | 913 king 914 | 914 kingdom 915 | 915 knew 916 | 916 know 917 | 917 knowledg 918 | 918 known 919 | 919 la 920 | 920 lack 921 | 921 land 922 | 922 languag 923 | 923 laptop 924 | 924 larg 925 | 925 larger 926 | 926 largest 927 | 927 laser 928 | 928 last 929 | 929 late 930 | 930 later 931 | 931 latest 932 | 932 launch 933 | 933 law 934 | 934 lawrenc 935 | 935 le 936 | 936 lead 937 | 937 leader 938 | 938 learn 939 | 939 least 940 | 940 leav 941 | 941 left 942 | 942 legal 943 | 943 lender 944 | 944 length 945 | 945 less 946 | 946 lesson 947 | 947 let 948 | 948 letter 949 | 949 level 950 | 950 lib 951 | 951 librari 952 | 952 licens 953 | 953 life 954 | 954 lifetim 955 | 955 light 956 | 956 like 957 | 957 limit 958 | 958 line 959 | 959 link 960 | 960 linux 961 | 961 list 962 | 962 listen 963 | 963 littl 964 | 964 live 965 | 965 ll 966 | 966 lo 967 | 967 load 968 | 968 loan 969 | 969 local 970 | 970 locat 971 | 971 lock 972 | 972 lockergnom 973 | 973 log 974 | 974 long 975 | 975 longer 976 | 976 look 977 | 977 lose 978 | 978 loss 979 | 979 lost 980 | 980 lot 981 | 981 love 982 | 982 low 983 | 983 lower 984 | 984 lowest 985 | 985 lt 986 | 986 ma 987 | 987 mac 988 | 988 machin 989 | 989 made 990 | 990 magazin 991 | 991 mai 992 | 992 mail 993 | 993 mailer 994 | 994 main 995 | 995 maintain 996 | 996 major 997 | 997 make 998 | 998 maker 999 | 999 male 1000 | 1000 man 1001 | 1001 manag 1002 | 1002 mani 1003 | 1003 manual 1004 | 1004 manufactur 1005 | 1005 map 1006 | 1006 march 1007 | 1007 margin 1008 | 1008 mark 1009 | 1009 market 1010 | 1010 marshal 1011 | 1011 mass 1012 | 1012 master 1013 | 1013 match 1014 | 1014 materi 1015 | 1015 matter 1016 | 1016 matthia 1017 | 1017 mayb 1018 | 1018 me 1019 | 1019 mean 1020 | 1020 measur 1021 | 1021 mechan 1022 | 1022 media 1023 | 1023 medic 1024 | 1024 meet 1025 | 1025 member 1026 | 1026 membership 1027 | 1027 memori 1028 | 1028 men 1029 | 1029 mention 1030 | 1030 menu 1031 | 1031 merchant 1032 | 1032 messag 1033 | 1033 method 1034 | 1034 mh 1035 | 1035 michael 1036 | 1036 microsoft 1037 | 1037 middl 1038 | 1038 might 1039 | 1039 mike 1040 
| 1040 mile 1041 | 1041 militari 1042 | 1042 million 1043 | 1043 mime 1044 | 1044 mind 1045 | 1045 mine 1046 | 1046 mini 1047 | 1047 minimum 1048 | 1048 minut 1049 | 1049 miss 1050 | 1050 mistak 1051 | 1051 mobil 1052 | 1052 mode 1053 | 1053 model 1054 | 1054 modem 1055 | 1055 modifi 1056 | 1056 modul 1057 | 1057 moment 1058 | 1058 mon 1059 | 1059 mondai 1060 | 1060 monei 1061 | 1061 monitor 1062 | 1062 month 1063 | 1063 monthli 1064 | 1064 more 1065 | 1065 morn 1066 | 1066 mortgag 1067 | 1067 most 1068 | 1068 mostli 1069 | 1069 mother 1070 | 1070 motiv 1071 | 1071 move 1072 | 1072 movi 1073 | 1073 mpnumber 1074 | 1074 mr 1075 | 1075 ms 1076 | 1076 msg 1077 | 1077 much 1078 | 1078 multi 1079 | 1079 multipart 1080 | 1080 multipl 1081 | 1081 murphi 1082 | 1082 music 1083 | 1083 must 1084 | 1084 my 1085 | 1085 myself 1086 | 1086 name 1087 | 1087 nation 1088 | 1088 natur 1089 | 1089 nbsp 1090 | 1090 near 1091 | 1091 nearli 1092 | 1092 necessari 1093 | 1093 need 1094 | 1094 neg 1095 | 1095 net 1096 | 1096 netscap 1097 | 1097 network 1098 | 1098 never 1099 | 1099 new 1100 | 1100 newslett 1101 | 1101 next 1102 | 1102 nextpart 1103 | 1103 nice 1104 | 1104 nigeria 1105 | 1105 night 1106 | 1106 no 1107 | 1107 nobodi 1108 | 1108 non 1109 | 1109 none 1110 | 1110 nor 1111 | 1111 normal 1112 | 1112 north 1113 | 1113 not 1114 | 1114 note 1115 | 1115 noth 1116 | 1116 notic 1117 | 1117 now 1118 | 1118 nt 1119 | 1119 null 1120 | 1120 number 1121 | 1121 numbera 1122 | 1122 numberam 1123 | 1123 numberanumb 1124 | 1124 numberb 1125 | 1125 numberbit 1126 | 1126 numberc 1127 | 1127 numbercb 1128 | 1128 numbercbr 1129 | 1129 numbercfont 1130 | 1130 numbercli 1131 | 1131 numbercnumb 1132 | 1132 numbercp 1133 | 1133 numberctd 1134 | 1134 numberd 1135 | 1135 numberdari 1136 | 1136 numberdnumb 1137 | 1137 numberenumb 1138 | 1138 numberf 1139 | 1139 numberfb 1140 | 1140 numberff 1141 | 1141 numberffont 1142 | 1142 numberfp 1143 | 1143 numberftd 1144 | 1144 numberk 1145 | 1145 numberm 1146 | 1146 numbermb 1147 | 1147 numberp 1148 | 1148 numberpd 1149 | 1149 numberpm 1150 | 1150 numberpx 1151 | 1151 numberst 1152 | 1152 numberth 1153 | 1153 numbertnumb 1154 | 1154 numberx 1155 | 1155 object 1156 | 1156 oblig 1157 | 1157 obtain 1158 | 1158 obvious 1159 | 1159 occur 1160 | 1160 oct 1161 | 1161 octob 1162 | 1162 of 1163 | 1163 off 1164 | 1164 offer 1165 | 1165 offic 1166 | 1166 offici 1167 | 1167 often 1168 | 1168 oh 1169 | 1169 ok 1170 | 1170 old 1171 | 1171 on 1172 | 1172 onc 1173 | 1173 onli 1174 | 1174 onlin 1175 | 1175 open 1176 | 1176 oper 1177 | 1177 opinion 1178 | 1178 opportun 1179 | 1179 opt 1180 | 1180 optim 1181 | 1181 option 1182 | 1182 or 1183 | 1183 order 1184 | 1184 org 1185 | 1185 organ 1186 | 1186 origin 1187 | 1187 os 1188 | 1188 osdn 1189 | 1189 other 1190 | 1190 otherwis 1191 | 1191 our 1192 | 1192 out 1193 | 1193 outlook 1194 | 1194 output 1195 | 1195 outsid 1196 | 1196 over 1197 | 1197 own 1198 | 1198 owner 1199 | 1199 oz 1200 | 1200 pacif 1201 | 1201 pack 1202 | 1202 packag 1203 | 1203 page 1204 | 1204 pai 1205 | 1205 paid 1206 | 1206 pain 1207 | 1207 palm 1208 | 1208 panel 1209 | 1209 paper 1210 | 1210 paragraph 1211 | 1211 parent 1212 | 1212 part 1213 | 1213 parti 1214 | 1214 particip 1215 | 1215 particular 1216 | 1216 particularli 1217 | 1217 partit 1218 | 1218 partner 1219 | 1219 pass 1220 | 1220 password 1221 | 1221 past 1222 | 1222 patch 1223 | 1223 patent 1224 | 1224 path 1225 | 1225 pattern 1226 | 1226 paul 1227 | 1227 payment 1228 | 1228 pc 1229 | 1229 peac 1230 | 1230 peopl 1231 | 1231 per 
1232 | 1232 percent 1233 | 1233 percentag 1234 | 1234 perfect 1235 | 1235 perfectli 1236 | 1236 perform 1237 | 1237 perhap 1238 | 1238 period 1239 | 1239 perl 1240 | 1240 perman 1241 | 1241 permiss 1242 | 1242 person 1243 | 1243 pgp 1244 | 1244 phone 1245 | 1245 photo 1246 | 1246 php 1247 | 1247 phrase 1248 | 1248 physic 1249 | 1249 pick 1250 | 1250 pictur 1251 | 1251 piec 1252 | 1252 piiiiiiii 1253 | 1253 pipe 1254 | 1254 pjnumber 1255 | 1255 place 1256 | 1256 plai 1257 | 1257 plain 1258 | 1258 plan 1259 | 1259 planet 1260 | 1260 plant 1261 | 1261 planta 1262 | 1262 platform 1263 | 1263 player 1264 | 1264 pleas 1265 | 1265 plu 1266 | 1266 plug 1267 | 1267 pm 1268 | 1268 pocket 1269 | 1269 point 1270 | 1270 polic 1271 | 1271 polici 1272 | 1272 polit 1273 | 1273 poor 1274 | 1274 pop 1275 | 1275 popul 1276 | 1276 popular 1277 | 1277 port 1278 | 1278 posit 1279 | 1279 possibl 1280 | 1280 post 1281 | 1281 potenti 1282 | 1282 pound 1283 | 1283 powel 1284 | 1284 power 1285 | 1285 powershot 1286 | 1286 practic 1287 | 1287 pre 1288 | 1288 predict 1289 | 1289 prefer 1290 | 1290 premium 1291 | 1291 prepar 1292 | 1292 present 1293 | 1293 presid 1294 | 1294 press 1295 | 1295 pretti 1296 | 1296 prevent 1297 | 1297 previou 1298 | 1298 previous 1299 | 1299 price 1300 | 1300 principl 1301 | 1301 print 1302 | 1302 printabl 1303 | 1303 printer 1304 | 1304 privaci 1305 | 1305 privat 1306 | 1306 prize 1307 | 1307 pro 1308 | 1308 probabl 1309 | 1309 problem 1310 | 1310 procedur 1311 | 1311 process 1312 | 1312 processor 1313 | 1313 procmail 1314 | 1314 produc 1315 | 1315 product 1316 | 1316 profession 1317 | 1317 profil 1318 | 1318 profit 1319 | 1319 program 1320 | 1320 programm 1321 | 1321 progress 1322 | 1322 project 1323 | 1323 promis 1324 | 1324 promot 1325 | 1325 prompt 1326 | 1326 properti 1327 | 1327 propos 1328 | 1328 proprietari 1329 | 1329 prospect 1330 | 1330 protect 1331 | 1331 protocol 1332 | 1332 prove 1333 | 1333 proven 1334 | 1334 provid 1335 | 1335 proxi 1336 | 1336 pub 1337 | 1337 public 1338 | 1338 publish 1339 | 1339 pudg 1340 | 1340 pull 1341 | 1341 purchas 1342 | 1342 purpos 1343 | 1343 put 1344 | 1344 python 1345 | 1345 qnumber 1346 | 1346 qualifi 1347 | 1347 qualiti 1348 | 1348 quarter 1349 | 1349 question 1350 | 1350 quick 1351 | 1351 quickli 1352 | 1352 quit 1353 | 1353 quot 1354 | 1354 radio 1355 | 1355 ragga 1356 | 1356 rais 1357 | 1357 random 1358 | 1358 rang 1359 | 1359 rate 1360 | 1360 rather 1361 | 1361 ratio 1362 | 1362 razor 1363 | 1363 razornumb 1364 | 1364 re 1365 | 1365 reach 1366 | 1366 read 1367 | 1367 reader 1368 | 1368 readi 1369 | 1369 real 1370 | 1370 realiz 1371 | 1371 realli 1372 | 1372 reason 1373 | 1373 receiv 1374 | 1374 recent 1375 | 1375 recipi 1376 | 1376 recommend 1377 | 1377 record 1378 | 1378 red 1379 | 1379 redhat 1380 | 1380 reduc 1381 | 1381 refer 1382 | 1382 refin 1383 | 1383 reg 1384 | 1384 regard 1385 | 1385 region 1386 | 1386 regist 1387 | 1387 regul 1388 | 1388 regular 1389 | 1389 rel 1390 | 1390 relat 1391 | 1391 relationship 1392 | 1392 releas 1393 | 1393 relev 1394 | 1394 reliabl 1395 | 1395 remain 1396 | 1396 rememb 1397 | 1397 remot 1398 | 1398 remov 1399 | 1399 replac 1400 | 1400 repli 1401 | 1401 report 1402 | 1402 repositori 1403 | 1403 repres 1404 | 1404 republ 1405 | 1405 request 1406 | 1406 requir 1407 | 1407 research 1408 | 1408 reserv 1409 | 1409 resid 1410 | 1410 resourc 1411 | 1411 respect 1412 | 1412 respond 1413 | 1413 respons 1414 | 1414 rest 1415 | 1415 result 1416 | 1416 retail 1417 | 1417 return 1418 | 1418 reveal 1419 | 1419 
revenu 1420 | 1420 revers 1421 | 1421 review 1422 | 1422 revok 1423 | 1423 rh 1424 | 1424 rich 1425 | 1425 right 1426 | 1426 risk 1427 | 1427 road 1428 | 1428 robert 1429 | 1429 rock 1430 | 1430 role 1431 | 1431 roll 1432 | 1432 rom 1433 | 1433 roman 1434 | 1434 room 1435 | 1435 root 1436 | 1436 round 1437 | 1437 rpm 1438 | 1438 rss 1439 | 1439 rule 1440 | 1440 run 1441 | 1441 sa 1442 | 1442 safe 1443 | 1443 sai 1444 | 1444 said 1445 | 1445 sale 1446 | 1446 same 1447 | 1447 sampl 1448 | 1448 san 1449 | 1449 saou 1450 | 1450 sat 1451 | 1451 satellit 1452 | 1452 save 1453 | 1453 saw 1454 | 1454 scan 1455 | 1455 schedul 1456 | 1456 school 1457 | 1457 scienc 1458 | 1458 score 1459 | 1459 screen 1460 | 1460 script 1461 | 1461 se 1462 | 1462 search 1463 | 1463 season 1464 | 1464 second 1465 | 1465 secret 1466 | 1466 section 1467 | 1467 secur 1468 | 1468 see 1469 | 1469 seed 1470 | 1470 seek 1471 | 1471 seem 1472 | 1472 seen 1473 | 1473 select 1474 | 1474 self 1475 | 1475 sell 1476 | 1476 seminar 1477 | 1477 send 1478 | 1478 sender 1479 | 1479 sendmail 1480 | 1480 senior 1481 | 1481 sens 1482 | 1482 sensit 1483 | 1483 sent 1484 | 1484 sep 1485 | 1485 separ 1486 | 1486 septemb 1487 | 1487 sequenc 1488 | 1488 seri 1489 | 1489 serif 1490 | 1490 seriou 1491 | 1491 serv 1492 | 1492 server 1493 | 1493 servic 1494 | 1494 set 1495 | 1495 setup 1496 | 1496 seven 1497 | 1497 seventh 1498 | 1498 sever 1499 | 1499 sex 1500 | 1500 sexual 1501 | 1501 sf 1502 | 1502 shape 1503 | 1503 share 1504 | 1504 she 1505 | 1505 shell 1506 | 1506 ship 1507 | 1507 shop 1508 | 1508 short 1509 | 1509 shot 1510 | 1510 should 1511 | 1511 show 1512 | 1512 side 1513 | 1513 sign 1514 | 1514 signatur 1515 | 1515 signific 1516 | 1516 similar 1517 | 1517 simpl 1518 | 1518 simpli 1519 | 1519 sinc 1520 | 1520 sincer 1521 | 1521 singl 1522 | 1522 sit 1523 | 1523 site 1524 | 1524 situat 1525 | 1525 six 1526 | 1526 size 1527 | 1527 skeptic 1528 | 1528 skill 1529 | 1529 skin 1530 | 1530 skip 1531 | 1531 sleep 1532 | 1532 slow 1533 | 1533 small 1534 | 1534 smart 1535 | 1535 smoke 1536 | 1536 smtp 1537 | 1537 snumber 1538 | 1538 so 1539 | 1539 social 1540 | 1540 societi 1541 | 1541 softwar 1542 | 1542 sold 1543 | 1543 solut 1544 | 1544 solv 1545 | 1545 some 1546 | 1546 someon 1547 | 1547 someth 1548 | 1548 sometim 1549 | 1549 son 1550 | 1550 song 1551 | 1551 soni 1552 | 1552 soon 1553 | 1553 sorri 1554 | 1554 sort 1555 | 1555 sound 1556 | 1556 sourc 1557 | 1557 south 1558 | 1558 space 1559 | 1559 spain 1560 | 1560 spam 1561 | 1561 spamassassin 1562 | 1562 spamd 1563 | 1563 spammer 1564 | 1564 speak 1565 | 1565 spec 1566 | 1566 special 1567 | 1567 specif 1568 | 1568 specifi 1569 | 1569 speech 1570 | 1570 speed 1571 | 1571 spend 1572 | 1572 sponsor 1573 | 1573 sport 1574 | 1574 spot 1575 | 1575 src 1576 | 1576 ssh 1577 | 1577 st 1578 | 1578 stabl 1579 | 1579 staff 1580 | 1580 stai 1581 | 1581 stand 1582 | 1582 standard 1583 | 1583 star 1584 | 1584 start 1585 | 1585 state 1586 | 1586 statement 1587 | 1587 statu 1588 | 1588 step 1589 | 1589 steve 1590 | 1590 still 1591 | 1591 stock 1592 | 1592 stop 1593 | 1593 storag 1594 | 1594 store 1595 | 1595 stori 1596 | 1596 strategi 1597 | 1597 stream 1598 | 1598 street 1599 | 1599 string 1600 | 1600 strip 1601 | 1601 strong 1602 | 1602 structur 1603 | 1603 studi 1604 | 1604 stuff 1605 | 1605 stupid 1606 | 1606 style 1607 | 1607 subject 1608 | 1608 submit 1609 | 1609 subscrib 1610 | 1610 subscript 1611 | 1611 substanti 1612 | 1612 success 1613 | 1613 such 1614 | 1614 suffer 1615 | 1615 suggest 1616 | 1616 
suit 1617 | 1617 sum 1618 | 1618 summari 1619 | 1619 summer 1620 | 1620 sun 1621 | 1621 super 1622 | 1622 suppli 1623 | 1623 support 1624 | 1624 suppos 1625 | 1625 sure 1626 | 1626 surpris 1627 | 1627 suse 1628 | 1628 suspect 1629 | 1629 sweet 1630 | 1630 switch 1631 | 1631 system 1632 | 1632 tab 1633 | 1633 tabl 1634 | 1634 tablet 1635 | 1635 tag 1636 | 1636 take 1637 | 1637 taken 1638 | 1638 talk 1639 | 1639 tape 1640 | 1640 target 1641 | 1641 task 1642 | 1642 tax 1643 | 1643 teach 1644 | 1644 team 1645 | 1645 tech 1646 | 1646 technic 1647 | 1647 techniqu 1648 | 1648 technolog 1649 | 1649 tel 1650 | 1650 telecom 1651 | 1651 telephon 1652 | 1652 tell 1653 | 1653 temperatur 1654 | 1654 templ 1655 | 1655 ten 1656 | 1656 term 1657 | 1657 termin 1658 | 1658 terror 1659 | 1659 terrorist 1660 | 1660 test 1661 | 1661 texa 1662 | 1662 text 1663 | 1663 than 1664 | 1664 thank 1665 | 1665 that 1666 | 1666 the 1667 | 1667 thei 1668 | 1668 their 1669 | 1669 them 1670 | 1670 themselv 1671 | 1671 then 1672 | 1672 theori 1673 | 1673 there 1674 | 1674 therefor 1675 | 1675 these 1676 | 1676 thi 1677 | 1677 thing 1678 | 1678 think 1679 | 1679 thinkgeek 1680 | 1680 third 1681 | 1681 those 1682 | 1682 though 1683 | 1683 thought 1684 | 1684 thousand 1685 | 1685 thread 1686 | 1686 threat 1687 | 1687 three 1688 | 1688 through 1689 | 1689 thu 1690 | 1690 thursdai 1691 | 1691 ti 1692 | 1692 ticket 1693 | 1693 tim 1694 | 1694 time 1695 | 1695 tip 1696 | 1696 tire 1697 | 1697 titl 1698 | 1698 tm 1699 | 1699 to 1700 | 1700 todai 1701 | 1701 togeth 1702 | 1702 token 1703 | 1703 told 1704 | 1704 toll 1705 | 1705 tom 1706 | 1706 toner 1707 | 1707 toni 1708 | 1708 too 1709 | 1709 took 1710 | 1710 tool 1711 | 1711 top 1712 | 1712 topic 1713 | 1713 total 1714 | 1714 touch 1715 | 1715 toward 1716 | 1716 track 1717 | 1717 trade 1718 | 1718 tradit 1719 | 1719 traffic 1720 | 1720 train 1721 | 1721 transact 1722 | 1722 transfer 1723 | 1723 travel 1724 | 1724 treat 1725 | 1725 tree 1726 | 1726 tri 1727 | 1727 trial 1728 | 1728 trick 1729 | 1729 trip 1730 | 1730 troubl 1731 | 1731 true 1732 | 1732 truli 1733 | 1733 trust 1734 | 1734 truth 1735 | 1735 try 1736 | 1736 tue 1737 | 1737 tuesdai 1738 | 1738 turn 1739 | 1739 tv 1740 | 1740 two 1741 | 1741 type 1742 | 1742 uk 1743 | 1743 ultim 1744 | 1744 un 1745 | 1745 under 1746 | 1746 understand 1747 | 1747 unfortun 1748 | 1748 uniqu 1749 | 1749 unison 1750 | 1750 unit 1751 | 1751 univers 1752 | 1752 unix 1753 | 1753 unless 1754 | 1754 unlik 1755 | 1755 unlimit 1756 | 1756 unseen 1757 | 1757 unsolicit 1758 | 1758 unsubscrib 1759 | 1759 until 1760 | 1760 up 1761 | 1761 updat 1762 | 1762 upgrad 1763 | 1763 upon 1764 | 1764 urgent 1765 | 1765 url 1766 | 1766 us 1767 | 1767 usa 1768 | 1768 usag 1769 | 1769 usb 1770 | 1770 usd 1771 | 1771 usdollarnumb 1772 | 1772 useless 1773 | 1773 user 1774 | 1774 usr 1775 | 1775 usual 1776 | 1776 util 1777 | 1777 vacat 1778 | 1778 valid 1779 | 1779 valu 1780 | 1780 valuabl 1781 | 1781 var 1782 | 1782 variabl 1783 | 1783 varieti 1784 | 1784 variou 1785 | 1785 ve 1786 | 1786 vendor 1787 | 1787 ventur 1788 | 1788 veri 1789 | 1789 verifi 1790 | 1790 version 1791 | 1791 via 1792 | 1792 video 1793 | 1793 view 1794 | 1794 virtual 1795 | 1795 visa 1796 | 1796 visit 1797 | 1797 visual 1798 | 1798 vnumber 1799 | 1799 voic 1800 | 1800 vote 1801 | 1801 vs 1802 | 1802 vulner 1803 | 1803 wa 1804 | 1804 wai 1805 | 1805 wait 1806 | 1806 wake 1807 | 1807 walk 1808 | 1808 wall 1809 | 1809 want 1810 | 1810 war 1811 | 1811 warm 1812 | 1812 warn 1813 | 1813 warranti 1814 | 
1814 washington 1815 | 1815 wasn 1816 | 1816 wast 1817 | 1817 watch 1818 | 1818 water 1819 | 1819 we 1820 | 1820 wealth 1821 | 1821 weapon 1822 | 1822 web 1823 | 1823 weblog 1824 | 1824 websit 1825 | 1825 wed 1826 | 1826 wednesdai 1827 | 1827 week 1828 | 1828 weekli 1829 | 1829 weight 1830 | 1830 welcom 1831 | 1831 well 1832 | 1832 went 1833 | 1833 were 1834 | 1834 west 1835 | 1835 what 1836 | 1836 whatev 1837 | 1837 when 1838 | 1838 where 1839 | 1839 whether 1840 | 1840 which 1841 | 1841 while 1842 | 1842 white 1843 | 1843 whitelist 1844 | 1844 who 1845 | 1845 whole 1846 | 1846 whose 1847 | 1847 why 1848 | 1848 wi 1849 | 1849 wide 1850 | 1850 width 1851 | 1851 wife 1852 | 1852 will 1853 | 1853 william 1854 | 1854 win 1855 | 1855 window 1856 | 1856 wing 1857 | 1857 winner 1858 | 1858 wireless 1859 | 1859 wish 1860 | 1860 with 1861 | 1861 within 1862 | 1862 without 1863 | 1863 wnumberp 1864 | 1864 woman 1865 | 1865 women 1866 | 1866 won 1867 | 1867 wonder 1868 | 1868 word 1869 | 1869 work 1870 | 1870 worker 1871 | 1871 world 1872 | 1872 worldwid 1873 | 1873 worri 1874 | 1874 worst 1875 | 1875 worth 1876 | 1876 would 1877 | 1877 wouldn 1878 | 1878 write 1879 | 1879 written 1880 | 1880 wrong 1881 | 1881 wrote 1882 | 1882 www 1883 | 1883 ximian 1884 | 1884 xml 1885 | 1885 xp 1886 | 1886 yahoo 1887 | 1887 ye 1888 | 1888 yeah 1889 | 1889 year 1890 | 1890 yesterdai 1891 | 1891 yet 1892 | 1892 york 1893 | 1893 you 1894 | 1894 young 1895 | 1895 your 1896 | 1896 yourself 1897 | 1897 zdnet 1898 | 1898 zero 1899 | 1899 zip 1900 | --------------------------------------------------------------------------------