├── .gitignore
├── README.md
├── ch02 - kNN
│   └── kNN.py
├── ch03 - DicisionTree
│   └── dicisiontree.py
├── ch04 - NaiveBayes
│   ├── bayes.py
│   └── email
├── ch05 - LogisticRegression
│   └── logisticRegression.py
├── ch06 - svm
│   └── svmMLiA.py
├── ch07 - AdaBoosting
│   └── adaboost.py
├── ch08 - LinearRegression
│   └── regression.py
├── ch09 - RegressionTree
│   └── regressionTrees.py
├── ch11 - Apriori
│   └── apriori.py
└── ch12 - FP-growth
    └── fpGrowth.py

/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # Created by .ignore support plugin (hsz.mobi)
62 |
63 | .idea/
64 |
65 | machinelearninginaction/
66 |
67 | ch02 - kNN/testDigits/
68 | ch02 - kNN/trainingDigits/
69 | ch04 - NaiveBayes/email
70 | ch06 - svm/digits
71 |
72 | # test sets
73 | *.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Machine Learning in Action
2 | ==========================
3 |
4 | This is the source code to go with "Machine Learning in Action"
5 | by Peter Harrington, published by Manning Inc.
6 | The official page for this book can be found here: http://manning.com/pharrington/
7 |
8 | All the code examples were written for Python 2.6 and should also run without problems on 2.7. NumPy is required for most examples. If you have trouble running any of the examples, let us know on the forum for this book: http://www.manning-sandbox.com/forum.jspa?forumID=728.
9 |
10 | If you want to run these on some other version of Python, say 3.0 or IronPython, feel free to fork the code.
11 |
12 | ### Download page at 图灵社区 (ituring.com.cn)
13 | [图灵社区:图书:机器学习实战](http://www.ituring.com.cn/book/1021)
14 |
15 | ### About this repo
16 | This repo contains my own rewrite of, and notes on, the official source code.
17 |
18 | ## Chapter overview
19 | ### Part 1 -- Classification
20 | * Chapter 2 - k-nearest neighbors (kNN)
21 | * Chapter 3 - Decision trees
22 | * Chapter 4 - Naive Bayes classifier
23 | * Chapter 5 - Logistic regression
24 | * Chapter 6 - Support vector machines (SVM)
25 | * Chapter 7 - AdaBoost meta-algorithm
26 |
27 | ### Part 2 -- Regression
28 | * Chapter 8 - Linear regression, locally weighted linear regression, and shrinkage methods
29 | * Chapter 9 - Tree-based regression
30 |
31 | ### Part 3 -- Unsupervised learning
32 | * Chapter 10 - Clustering methods
33 | * Chapter 11 - Association analysis with the Apriori algorithm
34 | * Chapter 12 - Association analysis with FP-growth
35 |
--------------------------------------------------------------------------------
/ch02 - kNN/kNN.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding=utf-8
3 |
4 | """
5 | k-近邻算法
6 | ===
7 | 存在训练样本集, 且样本集中每个数据都存在标签, 即已知样本集中每一组数据与所属分类的对应关系.
8 | 9 | 当输入没有标签的新数据后, 将新数据的每个特征与样本集中数据对应的特征进行比较, 10 | 算法提取样本集中特征最相似的k组数据(最近邻)的分类标签, (一般k<20), 11 | 取k个最相似的数据中出现次数最多的分类, 作为新数据的分类. 12 | """ 13 | 14 | from __future__ import print_function 15 | 16 | import os 17 | import operator 18 | 19 | import numpy 20 | import matplotlib 21 | import matplotlib.pyplot as plt 22 | 23 | """ 获取数据源的函数 """ 24 | 25 | 26 | def GetFakeDataset(): 27 | """创建数据集及其分类 28 | 29 | Returns 30 | ------- 31 | numpy.array, labels : 数据集, 数据集元素对应的标签 32 | """ 33 | groups = numpy.array([ 34 | [1.0, 1.1], 35 | [1.0, 1.0], 36 | [0.0, 0.0], 37 | [0.0, 0.1], 38 | ]) 39 | labels = ['A', 'A', 'B', 'B'] 40 | return groups, labels 41 | 42 | 43 | def GetFileDataset(filename): 44 | """把文本中的数据转换为数据集, labels返回 45 | 46 | Parameters 47 | ---------- 48 | filename : string 49 | 文本名 50 | 51 | Returns 52 | ------- 53 | numpy.array, list : 文本中的数据矩阵, 数据对应的标签列表 54 | """ 55 | with open(filename) as infile: 56 | lines = infile.readlines() 57 | numberOflines = len(lines) 58 | dataset = numpy.zeros((numberOflines, 3)) 59 | dataLabels = [] 60 | for index, line in enumerate(lines): 61 | listFromLine = line.strip().split() 62 | dataset[index,:] = listFromLine[0:3] 63 | dataLabels.append(int(listFromLine[-1])) 64 | return dataset, dataLabels 65 | 66 | """ kNN 分类器 """ 67 | 68 | 69 | class KNNModel(object): 70 | """ kNN分类器 """ 71 | 72 | def __init__(self, dataset, labels): 73 | self.dataset = dataset 74 | self.labels = labels 75 | 76 | def predict(self, inX, k): 77 | if k <= 0: 78 | raise ValueError('K > 0') 79 | 80 | m, n = self.dataset.shape 81 | # 利用矩阵运算, 每个dataset的分量都减去inX 82 | diffMat = numpy.tile(inX, (m, 1)) - self.dataset 83 | # 计算欧式距离 sqrt(sum()) 84 | distances = ((diffMat**2).sum(axis=1))**0.5 85 | # 对数据从小到大次序排列,确定前k个距离最小元素所在的主要分类 86 | sortedDistInd = distances.argsort() 87 | classCount = {} 88 | for i in range(k): 89 | voteIlabel = self.labels[sortedDistInd[i]] 90 | classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 91 | # 返回最相近的类 92 | sortedClassCount = sorted( 93 | classCount.items(), key=operator.itemgetter(1), reverse=True 94 | ) 95 | return sortedClassCount[0][0] 96 | 97 | 98 | class KNNModelWithNormalize(object): 99 | """ 带归一化的kNN分类器 """ 100 | 101 | def __init__(self, dataset, labels): 102 | self.normDataset, self.ranges, self.minVals = self.normalize(dataset) 103 | self.labels = labels 104 | 105 | def normalize(self, dataset): 106 | """ 对dataset进行归一化处理, 使得输入的特征权重一致 """ 107 | minVals = dataset.min(0) # 获取每一列的最小值 108 | maxVals = dataset.max(0) # 获取每一列的最大值 109 | ranges = maxVals - minVals # 每一列的范围 110 | m, n = dataset.shape 111 | # 归一化 (Xi - Xmin) / (Xmax - Xmin) 112 | normDataset = (dataset - numpy.tile(minVals, (m, 1))) / numpy.tile(ranges, (m, 1)) 113 | return normDataset, ranges, minVals 114 | 115 | def predict(self, inX, k): 116 | if k <= 0: 117 | raise ValueError('K > 0') 118 | 119 | # 先对输入特征进行归一化处理 120 | inX = (inX - self.minVals) / self.ranges 121 | 122 | datasetSize = self.normDataset.shape[0] 123 | # 利用矩阵运算, 每个 dataset 的分量都减去inX 124 | diffMat = numpy.tile(inX, (self.normDataset.shape[0],1)) - self.normDataset 125 | # 计算欧式距离 sqrt(sum()) 126 | distances = ((diffMat**2).sum(axis=1))**0.5 127 | # 对数据从小到大次序排列,确定前k个距离最小元素所在的主要分类 128 | sortedDistInd = distances.argsort() 129 | classCount={} 130 | for i in range(k): 131 | voteIlabel = self.labels[sortedDistInd[i]] 132 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 133 | # 返回最相近的类 134 | sortedClassCount = sorted( 135 | classCount.items(), key=operator.itemgetter(1), reverse=True 136 | ) 137 | return sortedClassCount[0][0] 
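# --------------------------------------------------------------------------
# (Added usage sketch -- not part of the original file.) A minimal, hedged
# example of how the two classifiers above could be exercised; it assumes the
# helper functions defined in this module and the dating dataset file
# `datingTestSet2.txt` that is referenced further below.
#
#     groups, labels = GetFakeDataset()
#     model = KNNModel(groups, labels)
#     print(model.predict([0.9, 1.0], k=3))          # expected: 'A'
#
#     dataset, labels = GetFileDataset('datingTestSet2.txt')
#     model = KNNModelWithNormalize(dataset, labels)
#     print(model.predict([40000, 8.0, 0.5], k=3))   # one of the labels 1/2/3
# --------------------------------------------------------------------------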
138 | 139 | @classmethod 140 | def test(cls, testfile, k=3, ratio=0.10): 141 | dataset, labels = GetFileDataset(testfile) 142 | m, n = dataset.shape 143 | numTestVectors = int(m * ratio) 144 | numError = 0 145 | 146 | model = cls(dataset[numTestVectors:m, :], labels[numTestVectors:m]) 147 | for i in range(numTestVectors): 148 | result = model.predict(dataset[i, :], k) 149 | if result != labels[i]: 150 | numError += 1 151 | print('× Predict/Real {0}/{1}'.format(result, labels[i])) 152 | else: 153 | print('√ Predict/Real {0}/{1}'.format(result, labels[i])) 154 | print('Total error rate: {0:.1%}'.format(1.0*numError / numTestVectors)) 155 | 156 | 157 | def TestClassifyPerson(dataset_filename): 158 | result2str = { 159 | 1: '完全不感兴趣', 160 | 2: '可能喜欢', 161 | 3: '很有可能喜欢', 162 | } 163 | print('请输入该人的相关信息:') 164 | percentageTimeOfPlayGames = float( 165 | input('消耗在玩游戏上的时间百分比?\n: ') 166 | ) 167 | flyMiles = float( 168 | input('每年搭乘飞机的飞行里程数?\n: ') 169 | ) 170 | iceCream = float( 171 | input('每周消费的冰淇淋公升数?\n: ') 172 | ) 173 | 174 | dataset, labels = GetFileDataset(dataset_filename) 175 | DrawPlot(dataset, labels) 176 | model = KNNModelWithNormalize(dataset, labels) 177 | inVector = numpy.array([flyMiles, percentageTimeOfPlayGames, iceCream]) 178 | classifierResult = model.predict(inVector, k=3) 179 | print( 180 | '预测你对这个人:', result2str[classifierResult] 181 | ) 182 | 183 | 184 | """ 使用 Matplotlib绘制散点图 """ 185 | 186 | 187 | def DrawPlot(dataset, labels): 188 | """绘制散点图 189 | 190 | Parameters 191 | ---------- 192 | dataset : numpy.array 193 | 数据集 194 | labels : list of int 195 | 标签值 196 | """ 197 | fig = plt.figure() 198 | ax = fig.add_subplot(111) 199 | _ = ax.scatter( 200 | dataset[:, 1], dataset[:, 2], 201 | s=15.0*numpy.array(labels), # 大小 202 | c=15.0*numpy.array(labels) # 颜色 203 | ) 204 | plt.show() 205 | 206 | """ 手写识别系统 """ 207 | 208 | 209 | def VectorDebugPrint(vector): 210 | for i in range(32): 211 | print(''.join( 212 | list(map( 213 | lambda x: str(int(x)), 214 | vector[i*32:(i+1)*32] 215 | )) 216 | )) 217 | 218 | 219 | def TranslateImg2Vector(filename): 220 | """ 把'图像文件'转换为1024维的向量 """ 221 | vector = numpy.zeros((1, 1024)) 222 | with open(filename, 'r') as infile: 223 | for lineno, line in enumerate(infile): 224 | for rowno in range(32): 225 | vector[0, 32*lineno+rowno] = int(line[rowno]) 226 | return vector 227 | 228 | 229 | def GetDigitsDatasetFromDir(dirname): 230 | """从文件夹中获取数据集, labels 231 | 232 | Parameters 233 | ---------- 234 | dirname 文件夹名称 235 | 236 | Returns 237 | ------- 238 | numpy.array, labels : 数据集, 数据集元素对应的标签 239 | """ 240 | filenames = os.listdir(dirname) 241 | 242 | labels = [None] * len(filenames) 243 | dataset = numpy.zeros((len(filenames), 1024)) 244 | 245 | for i, filename in enumerate(filenames): 246 | fileclass = filename.split('.')[0].split('_')[0] 247 | filepath = os.path.join(dirname, filename) 248 | dataset[i, :], labels[i] = TranslateImg2Vector(filepath), fileclass 249 | return dataset, labels 250 | 251 | 252 | def TestHandwritingNumber(trainDir, testDir, k=3): 253 | dataset, labels = GetDigitsDatasetFromDir(trainDir) 254 | model = KNNModel(dataset, labels) 255 | 256 | dataset, labels = GetDigitsDatasetFromDir(testDir) 257 | numError = 0 258 | numTestVectors = len(labels) 259 | for testVec, label in zip(dataset, labels): 260 | result = model.predict(testVec, k) 261 | if result != label: 262 | numError += 1 263 | print('× Predict/Real {0}/{1}'.format(result, label)) 264 | else: 265 | print('√ Predict/Real {0}/{1}'.format(result, label)) 266 | print('Total error 
rate: {0:.1%}'.format(1.0*numError / numTestVectors)) 267 | 268 | 269 | if __name__ == '__main__': 270 | dataset, labels = GetFakeDataset() 271 | model = KNNModel(dataset, labels) 272 | inX = [0, 0] 273 | print('{} should be {}'.format(inX, model.predict(inX, k=3))) 274 | 275 | # TestClassifyPerson('datingTestSet2.txt') 276 | 277 | TestHandwritingNumber('trainingDigits', 'testDigits', k=3) 278 | -------------------------------------------------------------------------------- /ch03 - DicisionTree/dicisiontree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | 决策树 6 | === 7 | 决策树的数据形式非常容易理解, 决策树很多任务都是为了数据中所蕴含的知识信息, 8 | 因此决策树可以使用不熟悉的数据集合, 并从中提取出一系列规则, 机器学习算法最终使用机器从数据集中创造的规则. 9 | 专家系统中经常使用决策树. 10 | """ 11 | 12 | from __future__ import print_function 13 | 14 | import math 15 | import operator 16 | import pickle 17 | from collections import defaultdict 18 | 19 | import numpy as np 20 | 21 | import logging 22 | logging.basicConfig( 23 | level=logging.DEBUG, 24 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 25 | ) 26 | 27 | 28 | class Dataset(object): 29 | """ 对数据集以及相关操作的封装 """ 30 | 31 | def __init__(self, rawDataset): 32 | self.rawDataset = np.array(rawDataset) 33 | 34 | @property 35 | def shape(self): 36 | return self.rawDataset.shape 37 | 38 | @property 39 | def classList(self): 40 | return self.rawDataset[:, -1].tolist() 41 | 42 | @property 43 | def shannonEntropy(self): 44 | """获取数据集的香农熵 45 | 熵越大代表混乱程度越高, 即混合的数据越多 46 | 47 | Returns 48 | ------- 49 | float : 数据集的香农熵 50 | """ 51 | # 统计每个 label 出现的次数 52 | labelCounts = defaultdict(int) 53 | for featVec in self.rawDataset: 54 | label = featVec[-1] 55 | labelCounts[label] += 1 56 | # 计算熵 57 | # H = - ∑(n, i=1) ( p(xi) * log(2)(p(xi)) ) 58 | entropy = 0.0 59 | numEntries = len(self.rawDataset) 60 | for label in labelCounts: 61 | probability = 1.0*labelCounts[label] / numEntries 62 | entropy -= probability * math.log(probability, 2) # 底数为2 63 | return entropy 64 | 65 | def split(self, axis): 66 | """ 对数据集按照 axis 指定的特征进行划分 67 | 68 | Parameters 69 | ---------- 70 | axis : int 71 | 指定用于进行划分的特征 72 | 73 | Returns 74 | ------- 75 | 按照指定特征划分后的 (值, 子数据集) 对 76 | """ 77 | subDatasets = defaultdict(list) 78 | for featureVector in self.rawDataset: 79 | value = featureVector[axis] 80 | subFeatureVector = ( 81 | featureVector[:axis].tolist() 82 | + featureVector[axis+1:].tolist() 83 | ) # 去除已经用于划分的特征 84 | subDatasets[value].append(subFeatureVector) 85 | return ( 86 | list(subDatasets.keys()), 87 | list(map(self.__class__, subDatasets.values())), 88 | ) 89 | 90 | def ChooseBestSplitFeature(self): 91 | """通过遍历数据集, 计算香农熵, 选择最好的特征进行数据划分 92 | 93 | Returns 94 | ------- 95 | int : 对数据熵增益最大的划分特征的index 96 | """ 97 | m, n = self.shape 98 | numFeatures = n - 1 99 | baseEntropy = self.shannonEntropy # 当前整个数据集的熵 100 | best = { 101 | 'gain': 0.0, # 记录最好的信息增益 102 | 'feature': -1, # 记录最好的特征index 103 | } 104 | # 按照不同的特征遍历进行划分 105 | for featureIndex in range(numFeatures): 106 | _labels, subDatasets = self.split(featureIndex) 107 | # 计算按照此特征进行划分后的熵 108 | newEntropy = 0.0 109 | for subDataset in subDatasets: 110 | sub_m, _sub_n = subDataset.shape 111 | probability = 1.0*sub_m / m 112 | newEntropy += probability * subDataset.shannonEntropy 113 | # 计算信息增益, 更新最好的特征 114 | infoGain = baseEntropy - newEntropy 115 | if infoGain > best['gain']: 116 | best['gain'] = infoGain 117 | best['feature'] = featureIndex 118 | return best['feature'] 119 | 120 | 121 | def 
GetFakeDataset(): 122 | dataset = [ 123 | [1, 1, 'yes'], 124 | [1, 1, 'yes'], 125 | [1, 0, 'no'], 126 | [0, 1, 'no'], 127 | [0, 1, 'no'], 128 | ] 129 | labels = ['no surfacing', 'flippers'] 130 | # change to discrete values 131 | return Dataset(dataset), labels 132 | 133 | 134 | class DicisionTree(object): 135 | """ 决策树 """ 136 | 137 | def __init__(self, dataset, labels): 138 | if dataset and labels: 139 | self.labels = labels 140 | self.tree = self.BuildTree(Dataset(dataset), self.labels) 141 | 142 | def SaveToFile(self, filename): 143 | with open(filename, 'w') as outfile: 144 | pickle.dump(self, outfile) 145 | 146 | @staticmethod 147 | def LoadFromFile(filename): 148 | with open(filename, 'r') as infile: 149 | tree = pickle.load(infile) 150 | return tree 151 | 152 | @staticmethod 153 | def GetMajorityClass(classList): 154 | classCount = defaultdict(int) 155 | for vote in classList: 156 | classCount[vote] += 1 157 | sortedClassCount = sorted( 158 | classCount.items(), 159 | key=operator.itemgetter(1), 160 | reverse=True 161 | ) 162 | return sortedClassCount[0][0] 163 | 164 | def BuildTree(self, dataset, labels): 165 | labels = labels[:] # 复制防止破坏原来的 labels 列表 166 | 167 | classList = dataset.classList 168 | # 当子集中所有项都为同一 label , 直接返回 169 | if classList.count(classList[0]) == len(classList): 170 | return classList[0] 171 | 172 | # 当所有 feature 都用完, 返回出现次数最多的 173 | _m, n = dataset.shape 174 | if n == 1: 175 | return self.GetMajorityClass(classList) 176 | 177 | # 选择信息增益最大的进行划分 178 | bestFeatureIndex = dataset.ChooseBestSplitFeature() 179 | bestFeatureLabel = labels[bestFeatureIndex] 180 | del(labels[bestFeatureIndex]) 181 | logging.info('Spliting by Feature {0}({1})'.format( 182 | bestFeatureLabel, 183 | bestFeatureIndex 184 | )) 185 | 186 | dicisionTree = { 187 | bestFeatureLabel: {}, 188 | } 189 | 190 | # 对特征下每个值进行递归划分 191 | subLabels, subDatasets = dataset.split(bestFeatureIndex) 192 | logging.info('labels:{0} for Feature {1}'.format(subLabels, bestFeatureLabel)) 193 | for subLabel, subDataset in zip(subLabels, subDatasets): 194 | logging.info('Building subtree of value `{0}`'.format(subLabel)) 195 | dicisionTree[bestFeatureLabel][subLabel] = self.BuildTree( 196 | subDataset, 197 | labels 198 | ) 199 | logging.info('Subtree `{0}` built'.format(subLabel)) 200 | return dicisionTree 201 | 202 | def predict(self, inputVector): 203 | return self.GetClassOfVector(self.tree, self.labels, inputVector) 204 | 205 | def GetClassOfVector(self, dicisionTree, featureLabels, inputVector): 206 | featureLabel = dicisionTree.keys()[0] 207 | subDicisionTree = dicisionTree[featureLabel] 208 | featureIndex = featureLabels.index(featureLabel) 209 | 210 | downKey = inputVector[featureIndex] 211 | downNode = subDicisionTree[downKey] 212 | 213 | if isinstance(downNode, dict): 214 | # 递归在子树中查找所属类别 215 | classLabel = self.GetClassOfVector( 216 | downNode, featureLabels, 217 | inputVector 218 | ) 219 | else: 220 | classLabel = downNode 221 | return classLabel 222 | 223 | @property 224 | def depth(self): 225 | return self.GetTreeDepth(self.tree) 226 | 227 | @classmethod 228 | def GetTreeDepth(cls, tree): 229 | max_depth = 0 230 | featureLabel = tree.keys()[0] 231 | subDicisionTree = tree[featureLabel] 232 | for featureValue in subDicisionTree: 233 | if isinstance(subDicisionTree[featureValue], dict): 234 | depth = 1 + cls.GetTreeDepth(subDicisionTree[featureValue]) 235 | else: 236 | depth = 1 237 | 238 | max_depth = max(depth, max_depth) 239 | return max_depth 240 | 241 | @property 242 | def num_leaves(self): 243 | 
return self.GetNumLeaves(self.tree) 244 | 245 | @classmethod 246 | def GetNumLeaves(cls, tree): 247 | num = 0 248 | featureLabel = tree.keys()[0] 249 | subDicisionTree = tree[featureLabel] 250 | for featureValue in subDicisionTree: 251 | if isinstance(subDicisionTree[featureValue], dict): 252 | num += cls.GetNumLeaves(subDicisionTree[featureValue]) 253 | else: 254 | num += 1 255 | return num 256 | 257 | @property 258 | def feature_label(self): 259 | return self.tree.keys()[0] 260 | 261 | def GetSubTree(self, feature_value): 262 | tree = self.__class__(None, None) 263 | tree.tree = self.tree[self.feature_label][feature_value] 264 | return tree 265 | 266 | @classmethod 267 | def GetRetrieveTree(cls, index): 268 | trees = ( 269 | {'no surfacing': { 270 | 0: 'no', 271 | 1: {'flippers': 272 | {0: 'no', 1: 'yes'}} 273 | }}, 274 | {'no surfacing': { 275 | 0: 'no', 276 | 1: {'flippers': 277 | {0: {'head': 278 | {0: 'no', 1: 'yes'}}, 279 | 1:'no'} 280 | }}}, 281 | ) 282 | tree = cls(None, None) 283 | tree.tree = trees[index] 284 | return tree 285 | 286 | 287 | def LoadLensesData(filename): 288 | with open(filename) as infile: 289 | lensesDataset = [] 290 | for line in infile: 291 | trainVector = line.strip().split('\t') 292 | lensesDataset.append(trainVector) 293 | lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate', ] 294 | lenseTree = DicisionTree(lensesDataset, lensesLabels) 295 | return lenseTree 296 | 297 | 298 | """ 绘制树形图 """ 299 | import matplotlib.pyplot as plt 300 | 301 | 302 | class DicisionTreePlotter(object): 303 | 304 | DECISION_NODE = { 305 | 'boxstyle': 'sawtooth', 306 | 'fc': '0.8', 307 | } 308 | LEAF_NODE = { 309 | 'boxstyle': 'round4', 310 | 'fc': '0.8', 311 | } 312 | ARROW_ARGS = { 313 | 'arrowstyle': '<-', 314 | } 315 | 316 | def __init__(self, tree): 317 | fig = plt.figure(1, facecolor='white') 318 | fig.clf() 319 | self.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[]) 320 | self.width = 1.0*tree.num_leaves 321 | self.depth = 1.0*tree.depth 322 | self.offset = { 323 | 'x': -0.5/self.width, 324 | 'y': 1.0 325 | } 326 | self.plot_tree(tree, (0.5, 1.0), '') 327 | plt.show() 328 | 329 | def plot_mid_text(self, text, centerPoint, parentPoint): 330 | xMid = (parentPoint[0] - centerPoint[0]) / 2.0 + centerPoint[0] 331 | yMid = (parentPoint[1] - centerPoint[1]) / 2.0 + centerPoint[1] 332 | self.ax1.text(xMid, yMid, text) 333 | 334 | def plot_node(self, text, centerPoint, parentPoint, node_type): 335 | self.ax1.annotate( 336 | text, 337 | xy=parentPoint, xycoords='axes fraction', 338 | xytext=centerPoint, textcoords='axes fraction', 339 | va='center', ha='center', 340 | bbox=node_type, arrowprops=DicisionTreePlotter.ARROW_ARGS 341 | ) 342 | 343 | def plot_tree(self, tree, parentPoint, text): 344 | num_leaves = tree.num_leaves 345 | featureLabel = tree.feature_label 346 | centerPoint = ( 347 | self.offset['x'] + (1.0 + num_leaves) / 2.0 / self.width, 348 | self.offset['y'] 349 | ) 350 | self.plot_mid_text(text, centerPoint, parentPoint) 351 | self.plot_node( 352 | featureLabel, 353 | centerPoint, parentPoint, 354 | DicisionTreePlotter.DECISION_NODE 355 | ) 356 | subDicisionTree = tree.tree[featureLabel] 357 | self.offset['y'] -= 1.0/self.depth 358 | for featureValue in subDicisionTree: 359 | if isinstance(subDicisionTree[featureValue], dict): 360 | self.plot_tree( 361 | tree.GetSubTree(featureValue), 362 | centerPoint, 363 | str(featureValue) 364 | ) 365 | else: 366 | self.offset['x'] += 1.0 / self.width 367 | self.plot_node( 368 | subDicisionTree[featureValue], 369 | 
(self.offset['x'], self.offset['y']), 370 | centerPoint, 371 | DicisionTreePlotter.LEAF_NODE 372 | ) 373 | self.plot_mid_text( 374 | str(featureValue), 375 | (self.offset['x'], self.offset['y']), 376 | centerPoint 377 | ) 378 | self.offset['y'] += 1.0 / self.depth 379 | 380 | 381 | if __name__ == '__main__': 382 | tree = LoadLensesData('lenses.txt') 383 | print(tree.depth) 384 | t = DicisionTree.GetRetrieveTree(0) 385 | print(t.depth, t.num_leaves) 386 | plotter = DicisionTreePlotter(t) 387 | -------------------------------------------------------------------------------- /ch04 - NaiveBayes/bayes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | 朴素贝叶斯 6 | === 7 | 朴素贝叶斯是贝叶斯决策理论的一部分. 8 | ## 贝叶斯决策 9 | * 核心思想 -> 选择具有最高概率的决策 10 | 11 | ## 朴素贝叶斯分类器 12 | 朴素贝叶斯分类器是用于文档分类的常用算法 13 | * 把每个次的出现或者不出现作为一个特征 14 | * 假设特征之间相互独立, 即一个单词出现的可能性和其他相邻单词没有关系 15 | * 每个特征同等重要 16 | 17 | """ 18 | 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | from numpy import random 23 | import logging 24 | logging.basicConfig( 25 | level=logging.DEBUG, 26 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 27 | ) 28 | 29 | 30 | def getFakeDataset(): 31 | posts = [ 32 | ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 33 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 34 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 35 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 36 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 37 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 38 | ] 39 | classes = [0, 1, 0, 1, 0, 1] # 1表示为侮辱性句子, 0为普通句子 40 | return posts, classes 41 | 42 | 43 | def getVocabulary(dataset): 44 | """创建一个包含所有在文档中出现的不重复词的词典 45 | 46 | Parameters 47 | ---------- 48 | dataset : list of documents 49 | 50 | Returns 51 | ------- 52 | list : 词典 53 | """ 54 | vocabulary = set([]) 55 | for document in dataset: 56 | vocabulary |= set(document) # 取并集 57 | return list(vocabulary) 58 | 59 | 60 | def getSetOfWords2Vec(vocabulary, inputSet): 61 | """`词集模型` 62 | 词汇表中的单词在输入文档中是否出现 63 | 64 | Parameters 65 | ---------- 66 | vocabulary : 词典 67 | inputSet : 文档 68 | 69 | Returns 70 | ------- 71 | list : 文档向量, 0/1表示词汇表中的单词在输入文档中是否出现 72 | """ 73 | appearVector = [0]*len(vocabulary) 74 | for word in inputSet: 75 | if word in vocabulary: 76 | appearVector[vocabulary.index(word)] = 1 77 | return appearVector 78 | 79 | 80 | def getBagOfWords2Vec(vocabulary, inputSet): 81 | """`词袋模型` 82 | 词汇表中的单词在输入文档中出现的次数 83 | 84 | Parameters 85 | ---------- 86 | vocabulary : 词典 87 | inputSet : 文档 88 | 89 | Returns 90 | ------- 91 | list : 文档向量, 每一维表示词汇表中的单词在输入文档中出现的次数 92 | """ 93 | appearCountVector = [0]*len(vocabulary) 94 | for word in inputSet: 95 | if word in vocabulary: 96 | appearCountVector[vocabulary.index(word)] += 1 97 | return appearCountVector 98 | 99 | 100 | class NaiveBayesModel(object): 101 | """ 朴素贝叶斯模型 """ 102 | 103 | def __init__(self, matrix, categories): 104 | self.trainMatrix = np.array(matrix) 105 | self.trainCategory = np.array(categories) 106 | 107 | # m 为有多少个样例, n 为每个样例的词向量长度 108 | m, n = self.trainMatrix.shape 109 | # 样例中为 class1 的概率 110 | self.pClass1 = 1.0*sum(self.trainCategory) / m 111 | # 防止概率相乘为 0, 把所有词出现次数初始化为 1, 总词数初始化为 2 112 | wordsCountVector = { 113 | 'class0': np.ones(n), # 属于 Class0 的各个词数 114 | 'class1': np.ones(n), # 属于 Class1 的各个词数 115 | } 116 | for rowno in range(m): 117 | if self.trainCategory[rowno] == 0: 
118 | wordsCountVector['class0'] += self.trainMatrix[rowno] 119 | else: 120 | wordsCountVector['class1'] += self.trainMatrix[rowno] 121 | # 防止太多小的浮点数相乘导致下溢出, 对乘积取自然对数 122 | self.pWordsVector = { 123 | 'class0': np.log( 124 | wordsCountVector['class0'] 125 | / (1 + wordsCountVector['class0'].sum()) 126 | ), 127 | 'class1': np.log( 128 | wordsCountVector['class1'] 129 | / (1 + wordsCountVector['class1'].sum()) 130 | ), 131 | } 132 | 133 | def predict(self, inputVector): 134 | inputVector = np.array(inputVector) 135 | p0 = ( 136 | sum(inputVector * self.pWordsVector['class0']) 137 | + np.log(1.0 - self.pClass1) 138 | ) 139 | p1 = ( 140 | sum(inputVector * self.pWordsVector['class1']) 141 | + np.log(self.pClass1) 142 | ) 143 | # print('{:.3f}/{:.3f} for Class {}/{}'.format( 144 | # np.exp(p0)*100, np.exp(p1)*100, 0, 1 145 | # )) 146 | if p1 > p0: 147 | return 1 148 | else: 149 | return 0 150 | 151 | 152 | def testingNaiveBayes(): 153 | postsToken, postsClass = getFakeDataset() 154 | vocabulary = getVocabulary(postsToken) 155 | trainMatrix = [ 156 | getSetOfWords2Vec(vocabulary, post) for post in postsToken 157 | ] 158 | model = NaiveBayesModel(trainMatrix, postsClass) 159 | 160 | testEntry = ['love', 'my', 'dalmation'] 161 | testPost = getSetOfWords2Vec(vocabulary, testEntry) 162 | print(testEntry, 'classified as: ', model.predict(testPost)) 163 | 164 | testEntry = ['stupid', 'garbage'] 165 | testPost = getSetOfWords2Vec(vocabulary, testEntry) 166 | print(testEntry, 'classified as: ', model.predict(testPost)) 167 | 168 | """ 使用朴素贝叶斯对电子邮件进行分类 """ 169 | 170 | 171 | def getContentTokens(content): 172 | """ 简单切分英语文本 """ 173 | import re 174 | tokens = re.split(r'\W*', content) 175 | return [token.lower() for token in tokens if len(token) > 2] 176 | 177 | 178 | def testNaiveBayesToSpamEmail(): 179 | """ 使用朴素贝叶斯进行电子邮件分类的测试 """ 180 | emails = [] 181 | emails_class = [] 182 | 183 | for i in range(1, 26): 184 | # 垃圾邮件样本 185 | words = getContentTokens(open('email/spam/%d.txt' % i).read()) 186 | emails.append(words) 187 | emails_class.append(1) 188 | # 正常邮件样本 189 | words = getContentTokens(open('email/ham/%d.txt' % i).read()) 190 | emails.append(words) 191 | emails_class.append(0) 192 | 193 | # `留存交叉验证` -- 随机选择数据一部分作为训练集, 剩余部分作为测试集 194 | # 生成测试集, 训练集 195 | random_order = random.permutation(50) 196 | testIndexs, trainIndexs = random_order[:10], random_order[10:] 197 | 198 | # 生成词典 199 | vocabulary = getVocabulary(emails) 200 | # 训练朴素贝叶斯分类器 201 | trainMatrix = [] 202 | trainCategories = [] 203 | for docIndex in trainIndexs: 204 | trainMatrix.append( 205 | getBagOfWords2Vec(vocabulary, emails[docIndex]) # 使用词袋模型 206 | ) 207 | trainCategories.append(emails_class[docIndex]) 208 | logging.info('Train dataset is ready.') 209 | model = NaiveBayesModel(trainMatrix, trainCategories) 210 | logging.info('NaiveBayes model is trained.') 211 | 212 | # 进行分类测试 213 | errorCount = 0 214 | for docIndex in testIndexs: 215 | wordVector = getBagOfWords2Vec(vocabulary, emails[docIndex]) 216 | result = model.predict(wordVector) 217 | if result != emails_class[docIndex]: 218 | errorCount += 1 219 | logging.warning('classification error. 
Predict/Actual: {}/{}\n{}'.format( 220 | result, 221 | emails_class[docIndex], 222 | ' '.join(emails[docIndex]) 223 | )) 224 | logging.info('the error rate is: {:.2%}'.format(1.0*errorCount/len(testIndexs))) 225 | 226 | """ 使用朴素贝叶斯分类器从个人广告中获取区域倾向 """ 227 | 228 | 229 | def calcMostFreq(vocabulary, fullText, topN): 230 | import operator 231 | wordFrequence = {} 232 | for word in vocabulary: 233 | wordFrequence[word] = fullText.count(word) 234 | sortedFrequence = sorted( 235 | wordFrequence.items(), 236 | key=lambda x: x[1], 237 | reverse=True 238 | ) 239 | return sortedFrequence[:topN] 240 | 241 | 242 | def getLocalWords(feed1, feed0): 243 | summaries = [] 244 | summaries_class = [] 245 | fullText = [] 246 | minLen = min( 247 | len(feed1['entries']), 248 | len(feed0['entries']) 249 | ) 250 | for i in range(minLen): 251 | # 第一个feed, 例子中为New York 252 | wordList = getContentTokens(feed1['entries'][i]['summary']) 253 | summaries.append(wordList) 254 | fullText.extend(wordList) 255 | summaries_class.append(1) 256 | # 第二个feed 257 | wordList = getContentTokens(feed0['entries'][i]['summary']) 258 | summaries.append(wordList) 259 | fullText.extend(wordList) 260 | summaries_class.append(0) 261 | vocabulary = getVocabulary(summaries) 262 | 263 | # `停用词表` -- 语言中作为冗余/结构辅助性内容的词语表 264 | # 多语言停用词表例子 www.ranks.nl/resources/stopwords.html 265 | # 去除出现次数最多的N个词 266 | topN = 30 267 | topNWords = calcMostFreq(vocabulary, fullText, topN) 268 | for word, _count in topNWords: 269 | if word in vocabulary: 270 | vocabulary.remove(word) 271 | 272 | # 生成测试集, 训练集 273 | random_order = random.permutation(2*minLen) 274 | testIndexs, trainIndexs = random_order[:20], random_order[20:] 275 | 276 | # 训练朴素贝叶斯分类器 277 | trainMatrix = [] 278 | trainCategories = [] 279 | for docIndex in trainIndexs: 280 | trainMatrix.append(getBagOfWords2Vec(vocabulary, summaries[docIndex])) 281 | trainCategories.append(summaries_class[docIndex]) 282 | model = NaiveBayesModel(trainMatrix, trainCategories) 283 | 284 | # 进行分类测试 285 | errorCount = 0 286 | for docIndex in testIndexs: 287 | wordVector = getBagOfWords2Vec(vocabulary, summaries[docIndex]) 288 | result = model.predict(wordVector) 289 | if result != summaries_class[docIndex]: 290 | errorCount += 1 291 | logging.warning('[classification error] Predict/Actual: {}/{}\n{}'.format( 292 | result, 293 | summaries_class[docIndex], 294 | ' '.join(summaries[docIndex]) 295 | )) 296 | logging.info('[error rate] {:.2%}'.format(1.0*errorCount/len(testIndexs))) 297 | return vocabulary, model.pWordsVector 298 | 299 | 300 | def getTopWords(ny, sf): 301 | vocabulary, pWordsVector = getLocalWords(ny, sf) 302 | top = {'NY': [], 'SF': [], } 303 | 304 | THRESHOLD = -6.0 305 | for i in range(len(pWordsVector['class0'])): 306 | if pWordsVector['class0'][i] > THRESHOLD: 307 | top['NY'].append((vocabulary[i], pWordsVector['class0'][i])) 308 | if pWordsVector['class1'][i] > THRESHOLD: 309 | top['SF'].append((vocabulary[i], pWordsVector['class1'][i])) 310 | import pprint 311 | sortedWords = { 312 | 'SF': list(map( 313 | lambda x: x[0], 314 | sorted(top['SF'], key=lambda pair: pair[1], reverse=True) 315 | )), 316 | 'NY': list(map( 317 | lambda x: x[0], 318 | sorted(top['NY'], key=lambda pair: pair[1], reverse=True) 319 | )), 320 | } 321 | print('=====>> SF <<=====') 322 | pprint.pprint(sortedWords['SF']) 323 | print('=====>> NY <<=====') 324 | pprint.pprint(sortedWords['NY']) 325 | 326 | 327 | if __name__ == '__main__': 328 | testingNaiveBayes() 329 | testNaiveBayesToSpamEmail() 330 | 331 | import feedparser 332 | ny = 
feedparser.parse('http://newyork.craigslist.org/search/stp?format=rss') 333 | sf = feedparser.parse('http://sfbay.craigslist.org/search/stp?format=rss') 334 | getTopWords(ny, sf) 335 | -------------------------------------------------------------------------------- /ch04 - NaiveBayes/email: -------------------------------------------------------------------------------- 1 | ../machinelearninginaction/Ch04/email -------------------------------------------------------------------------------- /ch05 - LogisticRegression/logisticRegression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | Logistic回归 6 | === 7 | 根据现有数据对分类边界线建立回归公式, 以此进行分类. 8 | "回归"来源于最佳拟合, 训练分类器即使用最优化算法寻找最佳拟合参数. 9 | """ 10 | 11 | from __future__ import print_function 12 | 13 | import numpy 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def getDataset(filename='testSet.txt'): 18 | dataset = [] 19 | labels = [] 20 | with open(filename) as infile: 21 | for line in infile: 22 | datas = line.strip().split() 23 | dataset.append([ 24 | 1.0, 25 | float(datas[0]), 26 | float(datas[1]), 27 | ]) 28 | labels.append(int(datas[2])) 29 | return numpy.array(dataset), labels 30 | 31 | 32 | def sigmoid(inX): 33 | """ 海维赛德阶跃函数 """ 34 | return 1.0 / (1 + numpy.exp(-inX)) 35 | 36 | """ 37 | 梯度上升算法 38 | 沿着函数f的梯度方向, 寻找f的最大值 39 | 因为梯度算子总是指向函数值增长最快的方向 40 | """ 41 | 42 | 43 | def getGradientAsecent(dataset, labels): 44 | """使用梯度上升算法计算最佳回归系数 45 | 46 | Parameters 47 | ---------- 48 | dataset 49 | labels 50 | 51 | Returns 52 | ------- 53 | list of floats : 回归系数 54 | """ 55 | dataset = numpy.mat(dataset) 56 | labels = numpy.mat(labels).T 57 | m, n = dataset.shape 58 | alpha = 0.001 # 向目标移动的步长 59 | numCycles = 500 # 迭代次数 60 | weights = numpy.ones((n, 1)) 61 | # 计算真实类别与预测类别的差值, 按照该差值的方向调整回归系数 62 | # FIXME: 这里使用了大量的矩阵运算, 导致计算效率低下 63 | for k in range(numCycles): 64 | h = sigmoid(dataset * weights) # 矩阵相乘 65 | error = labels - h # 向量相减 66 | weights += alpha * dataset.T * error # 矩阵相乘 67 | return weights.T[0] 68 | 69 | 70 | def getStochasticGradientAsecent_0(dataset, labels): 71 | """使用随机梯度上升算法计算最佳回归系数 72 | 一次只用一个样本点来更新回归系数, 能对数据进行增量更新, 是一个"在线学习"算法 73 | 但因为数据集可能不是线性可分, 在迭代的时候可能导致回归系数抖动, 收敛速度慢 74 | 75 | Parameters 76 | ---------- 77 | dataset 78 | labels 79 | 80 | Returns 81 | ------- 82 | list of floats : 回归系数 83 | """ 84 | m, n = dataset.shape 85 | alpha = 0.01 86 | weights = numpy.ones(n) 87 | for i in range(m): 88 | h = sigmoid(sum(dataset[i]*weights)) 89 | error = labels[i] - h 90 | weights += alpha * error * dataset[i] 91 | return weights 92 | 93 | 94 | def getStochasticGradientAsecent_1( 95 | dataset, labels, numIter=150): 96 | """使用改进的随机梯度上升算法计算最佳回归系数 97 | 步长alpha每次都会调整 98 | 通过随机选取样本来更新回归系数, 减少周期型抖动, 增加收敛速度 99 | 100 | Parameters 101 | ---------- 102 | dataset 103 | labels 104 | numIter : int default 150 105 | 迭代次数 106 | 107 | Returns 108 | ------- 109 | list of floats : 回归系数 110 | """ 111 | m, n = dataset.shape 112 | weights = numpy.ones(n) 113 | for j in range(numIter): 114 | dataIndex = range(m) 115 | for i in range(m): 116 | # 步长每次迭代都会减少 1/(j+i) 117 | # j 为迭代次数, i 为样本点的下标 118 | alpha = 4/(1.0+j+i) + 0.0001 # 常数使得 alpha 永远不会减少到 0 119 | # 通过随机选择来更新回归系数 120 | randIndex = int(numpy.random.uniform(0, len(dataIndex))) 121 | h = sigmoid(sum(dataset[randIndex]*weights)) 122 | error = labels[randIndex] - h 123 | weights += alpha * error * dataset[randIndex] 124 | del(dataIndex[randIndex]) 125 | return weights 126 | 127 | 128 | def plotBestFit(dataset, labels, 
weights): 129 | """绘制数据分界线 130 | 131 | Parameters 132 | ---------- 133 | weights : list of floats 134 | 系数 135 | 136 | """ 137 | m, _n = dataset.shape 138 | # 收集绘制的数据 139 | cord = { 140 | '1': { 141 | 'x': [], 142 | 'y': [], 143 | }, 144 | '2': { 145 | 'x': [], 146 | 'y': [], 147 | }, 148 | } 149 | for i in range(m): 150 | if labels[i] == 1: 151 | cord['1']['x'].append(dataset[i, 1]) 152 | cord['1']['y'].append(dataset[i, 2]) 153 | else: 154 | cord['2']['x'].append(dataset[i, 1]) 155 | cord['2']['y'].append(dataset[i, 2]) 156 | # 绘制图形 157 | figure = plt.figure() 158 | subplot = figure.add_subplot(111) 159 | # 绘制散点 160 | subplot.scatter( 161 | cord['1']['x'], cord['1']['y'], 162 | s=30, c='red', marker='s' 163 | ) 164 | subplot.scatter( 165 | cord['2']['x'], cord['2']['y'], 166 | s=30, c='green' 167 | ) 168 | # 绘制直线 169 | x = numpy.arange(-3.0, 3.0, 0.1) 170 | y = (-weights[0] - weights[1] * x)/ weights[2] 171 | subplot.plot(x, y) 172 | # 标签 173 | plt.xlabel('X1') 174 | plt.ylabel('X2') 175 | plt.show() 176 | 177 | """ 利用logistic回归来进行分类 -- 从疝气病症状预测病马的死亡率 """ 178 | 179 | 180 | def predict(inX, weights): 181 | probability = sigmoid(sum(numpy.array(inX)*weights)) 182 | if probability > 0.5: 183 | return 1 184 | else: 185 | return 0 186 | 187 | 188 | def loadDatasetFromFile(filename): 189 | dataset = [] 190 | labels = [] 191 | with open(filename) as infile: 192 | for line in infile: 193 | datas = line.strip().split('\t') 194 | row = list(map(lambda x: float(x), datas[:21])) 195 | dataset.append(row) 196 | labels.append(float(datas[21])) 197 | return numpy.array(dataset), numpy.array(labels) 198 | 199 | 200 | def testColicPredict(num_iter=1000): 201 | """在马的疝气病数据上训练 logistic 回归模型 202 | 203 | Parameters 204 | ---------- 205 | num_iter 206 | 207 | Returns 208 | ------- 209 | 210 | """ 211 | # 训练模型 212 | train = {} 213 | train['dataset'], train['labels'] = loadDatasetFromFile( 214 | 'horseColicTraining.txt' 215 | ) 216 | train['weights'] = getStochasticGradientAsecent_1( 217 | train['dataset'], 218 | train['labels'], 219 | numIter=num_iter 220 | ) 221 | # 测试 222 | errorCount = 0 223 | test = {} 224 | test['dataset'], test['labels'] = loadDatasetFromFile( 225 | 'horseColicTest.txt' 226 | ) 227 | m, _n = test['dataset'].shape 228 | for rowno, row in enumerate(test['dataset']): 229 | if predict(row, train['weights']) != test['labels'][rowno]: 230 | errorCount += 1 231 | errorRate = 1.0*errorCount / m 232 | print("Error rate: {:.4f}".format(errorRate)) 233 | return errorRate 234 | 235 | 236 | def multiTestColicPredict(numTests=10): 237 | errorSum = 0.0 238 | # 多次运行结果可能不同, 因为使用随机选取的向量来更新回归系数 239 | for k in range(numTests): 240 | errorSum += testColicPredict() 241 | print('after %d iterations the average error rate is: %f' 242 | % (numTests, errorSum/float(numTests)) 243 | ) 244 | 245 | if __name__ == '__main__': 246 | dataset, labels = getDataset() 247 | weights = { 248 | 0: getGradientAsecent(dataset, labels), 249 | 1: getStochasticGradientAsecent_0(dataset, labels), 250 | 2: getStochasticGradientAsecent_1(dataset, labels), 251 | } 252 | # plotBestFit(dataset, labels, weights[0]) 253 | # plotBestFit(dataset, labels, weights[1]) 254 | # plotBestFit(dataset, labels, weights[2]) 255 | 256 | multiTestColicPredict(10) 257 | -------------------------------------------------------------------------------- /ch06 - svm/svmMLiA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | SVM - 支持向量机 6 | === 7 | 介绍的是SVM的其中一种实现 
-- 序列最小化(SMO, Sequential Minimal Optimization)算法 8 | `分隔超平面` -- 将数据集分隔开来的超平面, 也就是分类的决策边界. 9 | `间隔` -- 找到离分隔超平面最近的点, 确保他们离分隔面的距离尽可能远, 这其中点到分隔面的距离就是间隔. 10 | 我们希望间隔尽可能地大, 以保证分类器尽可能健壮 11 | `支持向量` -- 离分隔超平面最近的那些点 12 | """ 13 | 14 | from __future__ import print_function 15 | 16 | import logging 17 | 18 | import numpy 19 | 20 | logging.basicConfig( 21 | # level=logging.DEBUG, 22 | level=logging.INFO, 23 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 24 | ) 25 | 26 | 27 | def load_dataset(filename): 28 | dataset = [] 29 | labels = [] 30 | with open(filename) as infile: 31 | for line in infile: 32 | datas = line.strip().split('\t') 33 | dataset.append([float(datas[0]), float(datas[1])]) 34 | labels.append(float(datas[2])) 35 | return dataset, labels 36 | 37 | 38 | def random_select_j(i, m): 39 | """ 返回任一 [0, m) 之间且不等于 i 的数 """ 40 | j = i 41 | while j == i: 42 | j = int(numpy.random.uniform(0, m)) 43 | return j 44 | 45 | 46 | def adjust_alpha(aj, upper_bound, lower_bound): 47 | if aj > upper_bound: 48 | aj = upper_bound 49 | if lower_bound > aj: 50 | aj = lower_bound 51 | return aj 52 | 53 | 54 | def estimate(alphas, labels, dataset, index, b): 55 | fx = float( 56 | numpy.multiply(alphas, labels).T 57 | * (dataset*dataset[index, :].T) 58 | ) + b 59 | e = fx - float(labels[index]) 60 | return e 61 | 62 | 63 | def smo_simple(dataset, labels, constant, toler, max_iter): 64 | """ 65 | Platt的SMO算法简化版. 66 | = = = = 67 | 每次循环中选择两个alpha进行优化处理.一旦找到一堆合适的alpha, 68 | 那么就增大其中一个同时减少另外一个. 69 | * 两个alpha必须在间隔边界之外 70 | * 两个alpha还没有进行过区间化处理或者不在边界上 71 | 72 | Parameters 73 | ---------- 74 | dataset 75 | 数据集 76 | labels 77 | 类型标签 78 | constant 79 | 常数, 用于控制"最大化间隔"和"保证大部分点的函数间隔小于1.0" 80 | toler 81 | 容错率 82 | max_iter 83 | 最大循环次数 84 | 85 | Returns 86 | ------- 87 | 88 | """ 89 | dataset = numpy.mat(dataset) 90 | labels = numpy.mat(labels).T 91 | b = 0 92 | m, n = dataset.shape 93 | # 初始化alpha向量 94 | alphas = numpy.mat(numpy.zeros((m, 1))) 95 | num_iter = 0 96 | while num_iter < max_iter: 97 | # 对数据集中每个数据向量 98 | num_alpha_pairs_changed = False # alpha 是否已经优化 99 | for i in range(m): 100 | # 计算 alpha[i] 的预测值, 估算其是否可以被优化 101 | Ei = estimate(alphas, labels, dataset, i, b) 102 | # 测试正/负间隔距离, alpha值, 是否满足KKT条件 103 | if not ((labels[i] * Ei < -toler and alphas[i] < constant) 104 | or (labels[i] * Ei > toler and alphas[i] > 0)): 105 | logging.debug('alpha[{0}]不需要调整.'.format(i)) 106 | continue 107 | 108 | # 选择第二个 alpha[j] 109 | j = random_select_j(i, m) 110 | # alpha[j] 的预测值 111 | Ej = estimate(alphas, labels, dataset, j, b) 112 | 113 | # 保存旧值以便与调整后比较 114 | alphaI_old = alphas[i].copy() 115 | alphaJ_old = alphas[j].copy() 116 | 117 | # 计算 lower_bound/upper_bound, 调整 alpha[j] 至 (0, C) 之间 118 | if labels[i] != labels[j]: 119 | lower_bound = max(0, alphas[j] - alphas[i]) 120 | upper_bound = min(constant, constant + alphas[j] - alphas[i]) 121 | else: 122 | lower_bound = max(0, alphas[j] + alphas[i] - constant) 123 | upper_bound = min(constant, alphas[j] + alphas[i]) 124 | if lower_bound == upper_bound: 125 | logging.debug('lower_bound == upper_bound == {0}'.format(lower_bound)) 126 | continue 127 | 128 | # 计算 alpha[j] 的最优修改量 129 | delta = ( 130 | 2.0 * dataset[i, :] * dataset[j, :].T 131 | - dataset[i, :] * dataset[i, :].T 132 | - dataset[j, :] * dataset[j, :].T 133 | ) 134 | # 如果 delta==0, 则需要退出for循环的当前迭代过程. 
135 | # 简化版中不处理这种少量出现的特殊情况 136 | if delta >= 0: 137 | logging.warning('{0}(delta) >= 0'.format(delta)) 138 | continue 139 | 140 | # 计算新的 alpha[j] 141 | alphas[j] -= labels[j] * (Ei - Ej) / delta 142 | alphas[j] = adjust_alpha(alphas[j], upper_bound, lower_bound) 143 | # 若 alpha[j] 的改变量太少, 不采用 144 | delta_j = abs(alphas[j] - alphaJ_old) 145 | if delta_j < 0.00001: 146 | logging.debug('j 变化量太少, 不采用. ({0})'.format(delta_j)) 147 | continue 148 | 149 | # 对 alpha[i] 做 alpha[j] 同样大小, 方向相反的改变 150 | alphas[i] += labels[j] * labels[i] * (alphaJ_old - alphas[j]) 151 | 152 | # 给两个 alpha 值设置常量 b 153 | b1 = ( 154 | b - Ei 155 | - labels[i] * (alphas[i] - alphaI_old) * dataset[i, :] * dataset[i, :].T 156 | - labels[j] * (alphas[j] - alphaJ_old) * dataset[i, :] * dataset[j, :].T 157 | ) 158 | b2 = ( 159 | b - Ej 160 | - labels[i] * (alphas[i] - alphaI_old) * dataset[i, :] * dataset[j, :].T 161 | - labels[j] * (alphas[j] - alphaJ_old) * dataset[j, :] * dataset[j, :].T 162 | ) 163 | if 0 < alphas[i] < constant: 164 | b = b1 165 | elif 0 < alphas[j] < constant: 166 | b = b2 167 | else: 168 | b = (b1 + b2) / 2.0 169 | 170 | num_alpha_pairs_changed = True 171 | logging.debug('numIter: {:d} i:{:d}, pairs changed {}'.format( 172 | num_iter, i, num_alpha_pairs_changed 173 | )) 174 | if num_alpha_pairs_changed == 0: 175 | num_iter += 1 176 | else: 177 | num_iter = 0 178 | logging.debug('iteration number: {0}'.format(num_iter)) 179 | return b, alphas 180 | 181 | 182 | def kernelTrans(X, A, kernel_info): 183 | """calc the kernel or transform data to a higher dimensional space 184 | `核函数` -- 185 | 186 | Parameters 187 | ---------- 188 | X 189 | A 190 | kernel_info : tuple 191 | 包含核函数信息的元组 192 | 193 | Returns 194 | ------- 195 | 196 | """ 197 | m, n = numpy.shape(X) 198 | K = numpy.mat(numpy.zeros((m, 1))) 199 | if kernel_info[0] == 'lin': 200 | K = X * A.T # linear kernel 201 | elif kernel_info[0] == 'rbf': # radial bias function 202 | for j in range(m): 203 | deltaRow = X[j, :] - A 204 | K[j] = deltaRow*deltaRow.T 205 | # divide in NumPy is element-wise not matrix like Matlab 206 | K = numpy.exp(K / (-1 * kernel_info[1] ** 2)) 207 | else: 208 | raise NameError('未定义的核函数') 209 | return K 210 | 211 | 212 | class Options(object): 213 | def __init__(self, dataset, labels, constant, toler, kernel_info): 214 | self.X = dataset 215 | self.labels = labels 216 | self.constant = constant 217 | self.toler = toler 218 | self.m, self.n = dataset.shape 219 | self.alphas = numpy.mat(numpy.zeros((self.m, 1))) 220 | self.b = 0 221 | # eCache第一列表示该cache值是否有效 222 | self.eCache = numpy.mat(numpy.zeros((self.m, 2))) 223 | self.K = numpy.mat(numpy.zeros((self.m, self.m))) 224 | for i in range(self.m): 225 | self.K[:, i] = kernelTrans(self.X, self.X[i, :], kernel_info) 226 | 227 | def updateEk(self, k): 228 | Ek = self.calc_estimate(k) 229 | self.eCache[k] = [1, Ek] 230 | 231 | def calc_estimate(self, index): 232 | fx = float( 233 | numpy.multiply(self.alphas, self.labels).T * self.K[:, index] 234 | + self.b 235 | ) 236 | e = fx - float(self.labels[index]) 237 | return e 238 | 239 | def select_j(self, i, Ei): 240 | maxK = -1 241 | max_deltaE = 0 242 | Ej = 0 243 | self.eCache[i] = [1, Ei] # 设置第i个eCache缓存值 244 | validECaches = numpy.nonzero(self.eCache[:, 0].A)[0] 245 | if len(validECaches) > 1: 246 | # 在有效的缓存值中寻找deltaE最大的 247 | for k in validECaches: 248 | if k == i: 249 | continue 250 | Ek = self.calc_estimate(k) 251 | deltaE = abs(Ei - Ek) 252 | if deltaE > max_deltaE: 253 | maxK = k 254 | max_deltaE = deltaE 255 | Ej = Ek 256 | return 
maxK, Ej 257 | else: 258 | # 没有任何有效的eCache缓存值 (如第一轮中) 259 | j = random_select_j(i, self.m) 260 | Ej = self.calc_estimate(j) 261 | return j, Ej 262 | 263 | 264 | def inner_loop(i, options): 265 | # 计算 alpha[i] 的预测值, 估算其是否可以被优化 266 | Ei = options.calc_estimate(i) 267 | # 测试正/负间隔距离, alpha值, 是否满足KKT条件 268 | if not (((options.labels[i] * Ei < -options.toler) and (options.alphas[i] < options.constant)) 269 | or ((options.labels[i] * Ei > options.toler) and (options.alphas[i] > 0))): 270 | logging.debug('alpha[{0}]不需要调整.'.format(i)) 271 | return 0 272 | 273 | # 选择第二个 alpha[j], 并计算 alpha[j] 的预测值 274 | j, Ej = options.select_j(i, Ei) 275 | 276 | # 保存旧值以便与调整后比较 277 | alphaI_old = options.alphas[i].copy() 278 | alphaJ_old = options.alphas[j].copy() 279 | 280 | # 计算 lower_bound/upper_bound, 调整 alpha[j] 至 (0, C) 之间 281 | if options.labels[i] != options.labels[j]: 282 | lower_bound = max(0, options.alphas[j] - options.alphas[i]) 283 | upper_bound = min( 284 | options.constant, 285 | options.constant + options.alphas[j] - options.alphas[i] 286 | ) 287 | else: 288 | lower_bound = max(0, options.alphas[j] + options.alphas[i] - options.constant) 289 | upper_bound = min(options.constant, options.alphas[j] + options.alphas[i]) 290 | if lower_bound == upper_bound: 291 | logging.debug('lower_bound == upper_bound == {0}'.format(lower_bound)) 292 | return 0 293 | 294 | # 计算 alpha[j] 的最优修改量 295 | delta = 2.0 * options.K[i, j] - options.K[i, i] - options.K[j, j] 296 | if delta >= 0: 297 | logging.warning('{0}(delta) >= 0'.format(delta)) 298 | return 0 299 | 300 | # 计算新的 alpha[j] 301 | options.alphas[j] -= options.labels[j] * (Ei - Ej) / delta 302 | options.alphas[j] = adjust_alpha(options.alphas[j], upper_bound, lower_bound) 303 | options.updateEk(j) # 更新缓存中Ej的值 304 | # 若 alpha[j] 的改变量太少, 不采用 305 | delta_j = abs(options.alphas[j] - alphaJ_old) 306 | if delta_j < 0.00001: 307 | logging.debug('j 变化量太少, 不采用. 
({0})'.format(delta_j)) 308 | return 0 309 | 310 | # 对 alpha[i] 做 alpha[j] 同样大小, 方向相反的改变 311 | options.alphas[i] += options.labels[j] * options.labels[i] * (alphaJ_old - options.alphas[j]) 312 | options.updateEk(i) # 更新缓存中Ei的值 313 | # 给两个 alpha 值设置常量 b 314 | b1 = ( 315 | options.b - Ei 316 | - options.labels[i] * (options.alphas[i] - alphaI_old) * options.K[i, i] 317 | - options.labels[j] * (options.alphas[j] - alphaJ_old) * options.K[i, j] 318 | ) 319 | b2 = ( 320 | options.b - Ej 321 | - options.labels[i] * (options.alphas[i] - alphaI_old) * options.K[i, j] 322 | - options.labels[j] * (options.alphas[j] - alphaJ_old) * options.K[j, j] 323 | ) 324 | if 0 < options.alphas[i] < options.constant: 325 | options.b = b1 326 | elif 0 < options.alphas[j] < options.constant: 327 | options.b = b2 328 | else: 329 | options.b = (b1 + b2) / 2.0 330 | return 1 331 | 332 | 333 | def smoP(dataset, labels, constant, toler, max_iter, kernel_info=('lin', 0)): 334 | options = Options( 335 | numpy.mat(dataset), 336 | numpy.mat(labels).T, 337 | constant, toler, kernel_info 338 | ) 339 | num_iter = 0 340 | scan_entire_set = True 341 | num_alpha_pairs_changed = 0 342 | while (num_iter < max_iter) and ((num_alpha_pairs_changed > 0) or scan_entire_set): 343 | num_alpha_pairs_changed = 0 344 | 345 | if scan_entire_set: 346 | # 遍历alpha, 使用 `inner_loop` 选择 alpha-j, 并在可能是对其进行优化 347 | for i in range(options.m): 348 | num_alpha_pairs_changed += inner_loop(i, options) 349 | logging.debug('scanning : num_iter({}) i({}) pairs changed({})'.format( 350 | num_iter, i, num_alpha_pairs_changed 351 | )) 352 | num_iter += 1 353 | else: 354 | # 遍历所有非边界(不在边界0或C上)的 alpha 355 | non_bound_indexs = numpy.nonzero( 356 | (options.alphas.A > 0) * (options.alphas.A < constant) 357 | )[0] 358 | for i in non_bound_indexs: 359 | num_alpha_pairs_changed += inner_loop(i, options) 360 | logging.debug('non-bound : num_iter({}) i({}) pairs changed({})'.format( 361 | num_iter, i, num_alpha_pairs_changed 362 | )) 363 | num_iter += 1 364 | 365 | if scan_entire_set: 366 | scan_entire_set = False 367 | elif num_alpha_pairs_changed == 0: 368 | scan_entire_set = True 369 | logging.debug('iteration number: {}'.format(num_iter)) 370 | return options.b, options.alphas 371 | 372 | 373 | def get_weights(alphas, dataset, labels): 374 | dataset = numpy.mat(dataset) 375 | labels = numpy.mat(labels).T 376 | m, n = dataset.shape 377 | w = numpy.zeros((n, 1)) 378 | for i in range(m): 379 | w += numpy.multiply(alphas[i] * labels[i], dataset[i, :].T) 380 | return w 381 | 382 | 383 | def test_rbf(k1=1.3): 384 | import pprint 385 | dataset, labels = load_dataset('testSetRBF.txt') 386 | b, alphas = smoP(dataset, labels, 200, 0.0001, 10000, ('rbf', k1)) # C=200 important 387 | 388 | dataset = numpy.mat(dataset) 389 | labels = numpy.mat(labels).T 390 | support_vectors_index = tuple(numpy.nonzero(alphas.A > 0))[0] 391 | support_vectors = dataset[support_vectors_index] 392 | support_vectors_label = labels[support_vectors_index] 393 | m, _n = support_vectors.shape 394 | logging.info('支持向量 ({})个:'.format(m)) 395 | logging.info(pprint.pformat(zip( 396 | support_vectors.tolist(), support_vectors_label.A1.tolist() 397 | ))) 398 | 399 | m, _n = dataset.shape 400 | errorCount = 0 401 | for i in range(m): 402 | # 利用 核函数 && 支持向量 进行分类. 
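        # (Added note) The prediction below evaluates the SVM decision function
        # using only the support vectors (the points with alpha > 0):
        #     f(x) = sum_i( alpha_i * y_i * K(x_i, x) ) + b,  class = sign(f(x))
        # `kernelEval` holds K(x_i, x) for every support vector x_i, so the
        # matrix product on the following lines is exactly this weighted sum.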
403 | kernelEval = kernelTrans(support_vectors, dataset[i, :], ('rbf', k1)) 404 | predict = ( 405 | kernelEval.T 406 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 407 | + b 408 | ) 409 | if numpy.sign(predict) != numpy.sign(labels[i]): 410 | errorCount += 1 411 | logging.info('训练集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 412 | 413 | # 使用训练出来的SVM来对测试集进行分类, 检查错误率 414 | dataset, labels = load_dataset('testSetRBF2.txt') 415 | dataset = numpy.mat(dataset) 416 | labels = numpy.mat(labels).T 417 | m, _n = dataset.shape 418 | errorCount = 0 419 | for i in range(m): 420 | kernelEval = kernelTrans(support_vectors, dataset[i, :], ('rbf', k1)) 421 | predict = ( 422 | kernelEval.T 423 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 424 | + b 425 | ) 426 | if numpy.sign(predict) != numpy.sign(labels[i]): 427 | errorCount += 1 428 | logging.info('测试集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 429 | 430 | """ 使用SVM来进行手写数字识别 """ 431 | 432 | 433 | def img2vector(filename): 434 | vector = numpy.zeros((1, 1024)) 435 | with open(filename) as infile: 436 | for lineno, line in enumerate(infile): 437 | for rowno in range(32): 438 | vector[0, 32 * lineno + rowno] = int(line[rowno]) 439 | return vector 440 | 441 | 442 | def load_images(dir_name): 443 | import os 444 | files = os.listdir(dir_name) 445 | labels = [] 446 | dataset = numpy.zeros((len(files), 1024)) 447 | for i, filename in enumerate(files): 448 | name = os.path.splitext(filename)[0] 449 | class_num = int(name.split('_')[0]) 450 | if class_num == 9: 451 | labels.append(-1) 452 | elif class_num == 1: 453 | labels.append(1) 454 | else: 455 | raise ValueError('本分类器为二分类器, 不支持除1/9外的数字') 456 | dataset[i, :] = img2vector('%s/%s' % (dir_name, filename)) 457 | return dataset, labels 458 | 459 | 460 | def test_digits(kernel_info=('rbf', 10)): 461 | dataset, labels = load_images('digits/trainingDigits') 462 | b, alphas = smoP(dataset, labels, 200, 0.0001, 10000, kernel_info) 463 | 464 | dataset = numpy.mat(dataset) 465 | labels = numpy.mat(labels).T 466 | support_vectors_index = tuple(numpy.nonzero(alphas.A > 0))[0] 467 | support_vectors = dataset[support_vectors_index] 468 | support_vectors_label = labels[support_vectors_index] 469 | m, _n = support_vectors.shape 470 | import pprint 471 | logging.info('支持向量 ({})个:'.format(m)) 472 | # logging.info(pprint.pformat(zip( 473 | # support_vectors.tolist(), support_vectors_label.A1.tolist() 474 | # ))) 475 | 476 | m, n = dataset.shape 477 | errorCount = 0 478 | for i in range(m): 479 | kernelEval = kernelTrans(support_vectors, dataset[i, :], kernel_info) 480 | predict = ( 481 | kernelEval.T 482 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 483 | + b 484 | ) 485 | if numpy.sign(predict) != numpy.sign(labels[i]): 486 | errorCount += 1 487 | logging.info('训练集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 488 | 489 | dataset, labels = load_images('digits/testDigits') 490 | dataset = numpy.mat(dataset) 491 | labels = numpy.mat(labels).T 492 | errorCount = 0 493 | m, n = dataset.shape 494 | for i in range(m): 495 | kernelEval = kernelTrans(support_vectors, dataset[i, :], kernel_info) 496 | predict = ( 497 | kernelEval.T 498 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 499 | + b 500 | ) 501 | if numpy.sign(predict) != numpy.sign(labels[i]): 502 | errorCount += 1 503 | logging.info('测试集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 504 | 505 | """ main 函数 """ 506 | 507 | 508 | def main(): 509 | # import pprint 510 | # dataset, 
labels = load_dataset('testSet.txt') 511 | # length = len(labels) 512 | # b, alphas = smo_simple(dataset, labels, 0.6, 0.001, 40) 513 | # logging.info('支持向量:') 514 | # logging.info(pprint.pformat( 515 | # [(dataset[i], labels[i]) for i in range(length) if alphas[i] > 0] 516 | # )) 517 | 518 | # 使用核函数的SVM 519 | test_rbf(k1=1.3) 520 | 521 | # 手写数字识别 522 | test_digits() 523 | 524 | if __name__ == '__main__': 525 | main() 526 | 527 | 528 | '''#######******************************** 529 | Non-Kernel VErsions below 530 | '''#######******************************** 531 | 532 | class optStructK: 533 | def __init__(self,dataMatIn, classLabels, C, toler): # Initialize the structure with the parameters 534 | self.X = dataMatIn 535 | self.labels = classLabels 536 | self.C = C 537 | self.tol = toler 538 | self.m = shape(dataMatIn)[0] 539 | self.alphas = mat(zeros((self.m,1))) 540 | self.b = 0 541 | self.eCache = mat(zeros((self.m,2))) #first column is valid flag 542 | 543 | def calcEkK(oS, k): 544 | fXk = float(multiply(oS.alphas,oS.labels).T*(oS.X*oS.X[k,:].T)) + oS.b 545 | Ek = fXk - float(oS.labels[k]) 546 | return Ek 547 | 548 | def selectJK(i, oS, Ei): #this is the second choice -heurstic, and calcs Ej 549 | maxK = -1; maxDeltaE = 0; Ej = 0 550 | oS.eCache[i] = [1,Ei] #set valid #choose the alpha that gives the maximum delta E 551 | validEcacheList = nonzero(oS.eCache[:,0].A)[0] 552 | if (len(validEcacheList)) > 1: 553 | for k in validEcacheList: #loop through valid Ecache values and find the one that maximizes delta E 554 | if k == i: continue #don't calc for i, waste of time 555 | Ek = calc_estimate(oS, k) 556 | deltaE = abs(Ei - Ek) 557 | if (deltaE > maxDeltaE): 558 | maxK = k; maxDeltaE = deltaE; Ej = Ek 559 | return maxK, Ej 560 | else: #in this case (first time around) we don't have any valid eCache values 561 | j = random_select_j(i, oS.m) 562 | Ej = calc_estimate(oS, j) 563 | return j, Ej 564 | 565 | def updateEkK(oS, k):#after any alpha has changed update the new value in the cache 566 | Ek = calc_estimate(oS, k) 567 | oS.eCache[k] = [1,Ek] 568 | 569 | def innerLK(i, oS): 570 | Ei = calc_estimate(oS, i) 571 | if ((oS.labels[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labels[i]*Ei > oS.tol) and (oS.alphas[i] > 0)): 572 | j,Ej = select_j(i, oS, Ei) #this has been changed from selectJrand 573 | alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy(); 574 | if (oS.labels[i] != oS.labels[j]): 575 | L = max(0, oS.alphas[j] - oS.alphas[i]) 576 | H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 577 | else: 578 | L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) 579 | H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 580 | if L==H: 581 | print("L==H") 582 | return 0 583 | eta = 2.0 * oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T 584 | if eta >= 0: 585 | print("eta>=0") 586 | return 0 587 | oS.alphas[j] -= oS.labels[j]*(Ei - Ej)/eta 588 | oS.alphas[j] = AdjustAlpha(oS.alphas[j],H,L) 589 | updateEk(oS, j) #added this for the Ecache 590 | if (abs(oS.alphas[j] - alphaJold) < 0.00001): 591 | print("j not moving enough") 592 | return 0 593 | oS.alphas[i] += oS.labels[j]*oS.labels[i]*(alphaJold - oS.alphas[j])#update i by the same amount as j 594 | updateEk(oS, i) #added this for the Ecache #the update is in the oppostie direction 595 | b1 = oS.b - Ei- oS.labels[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labels[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T 596 | b2 = oS.b - Ej- oS.labels[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - 
oS.labels[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T 597 | if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1 598 | elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2 599 | else: oS.b = (b1 + b2)/2.0 600 | return 1 601 | else: return 0 602 | 603 | def smoPK(dataMatIn, classLabels, C, toler, maxIter): #full Platt SMO 604 | oS = Options(mat(dataMatIn), mat(classLabels).transpose(), C, toler) 605 | iter = 0 606 | entireSet = True; isAlphaPairsChanged = 0 607 | while (iter < maxIter) and ((isAlphaPairsChanged > 0) or (entireSet)): 608 | isAlphaPairsChanged = 0 609 | if entireSet: #go over all 610 | for i in range(oS.m): 611 | isAlphaPairsChanged += innerL(i,oS) 612 | print("fullSet, iter: %d i:%d, pairs changed %d" % (iter,i,isAlphaPairsChanged)) 613 | iter += 1 614 | else:#go over non-bound (railed) alphas 615 | nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] 616 | for i in nonBoundIs: 617 | isAlphaPairsChanged += innerL(i,oS) 618 | print("non-bound, iter: %d i:%d, pairs changed %d" % (iter,i,isAlphaPairsChanged)) 619 | iter += 1 620 | if entireSet: entireSet = False #toggle entire set loop 621 | elif (isAlphaPairsChanged == 0): entireSet = True 622 | print("iteration number: %d" % iter) 623 | return oS.b,oS.alphas 624 | -------------------------------------------------------------------------------- /ch07 - AdaBoosting/adaboost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | AdaBoost -- Adaptive boosting 6 | === 7 | 通过串行训练多个分类器, 每一个分类器根据已训练出来的分类器的性能来进行训练, 8 | 每个新的分类器集中关注被已有分类器错分的那些数据来获得新的分类器. 9 | 最终把所有分类器的结果加权求和. 10 | """ 11 | import logging 12 | 13 | import numpy 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | # level=logging.INFO, 18 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 19 | ) 20 | TRACE = logging.DEBUG - 1 21 | 22 | 23 | def load_fake_dataset(): 24 | dataset = numpy.matrix([ 25 | [1.0, 2.1], 26 | [2.0, 1.1], 27 | [1.3, 1.0], 28 | [1.0, 1.0], 29 | [2.0, 1.0], 30 | ]) 31 | labels = [1.0, 1.0, -1.0, -1.0, 1.0] 32 | return dataset, labels 33 | 34 | 35 | def load_dataset_from_file(filename): 36 | dataset = [] 37 | labels = [] 38 | num_features = None 39 | with open(filename) as infile: 40 | for line in infile: 41 | line = line.strip().split('\t') 42 | if num_features is None: 43 | num_features = len(line) 44 | dataset.append(list(map(float, line[:-1]))) 45 | labels.append(float(line[-1])) 46 | return dataset, labels 47 | 48 | 49 | class DicisionStump(object): 50 | def __init__(self, dataset): 51 | self.dataset = dataset 52 | 53 | def predict(self, dimension, threshold_val, inequal): 54 | m, _n = self.dataset.shape 55 | predict = numpy.ones((m, 1)) 56 | if inequal == 'lt': 57 | predict[self.dataset[:, dimension] <= threshold_val] = -1.0 58 | elif inequal == 'gt': 59 | predict[self.dataset[:, dimension] > threshold_val] = -1.0 60 | return predict 61 | 62 | 63 | class AdaBoostDicisionStump(object): 64 | def __init__(self, dataset, labels, max_iter=40): 65 | self.dataset = numpy.mat(dataset) 66 | self.labels = numpy.mat(labels).T 67 | self.m, self.n = self.dataset.shape 68 | self.train(max_iter=max_iter) 69 | 70 | def build_stump(self, D): 71 | stump = DicisionStump(self.dataset) 72 | num_steps = 10.0 # 在特征的可能值上通过递增步长遍历的次数 73 | best_stump_info = {} # 记录对于给定权重向量D, 最佳的单层决策树 74 | best_predict_values = numpy.mat(numpy.zeros((self.m, 1))) 75 | min_error = 0x3f3f3f3f # init error sum, to +infinity 76 | # 遍历所有特征 77 | for i 
in range(self.n): 78 | # 计算遍历该特征的步长 79 | feature_min = self.dataset[:, i].min() 80 | feature_max = self.dataset[:, i].max() 81 | step = (feature_max - feature_min) / num_steps 82 | # 对于该特征, 遍历所有可能的值 83 | for j in range(-1, int(num_steps) + 1): # loop over all range in current dimension 84 | for inequal in ['lt', 'gt']: # 在 >/< 之间进行切换 85 | threshold_val = feature_min + float(j) * step 86 | predicted_values = stump.predict(i, threshold_val, inequal) 87 | # 记录预测值与实际分类不同 88 | errors = numpy.mat(numpy.ones((self.m, 1))) 89 | errors[predicted_values == self.labels] = 0 90 | # 计算在给定权重下的总错误权重 91 | weighted_errors = D.T * errors 92 | logging.log(TRACE, '[Split] dimension {:d}, threshold {:.2f} threshold inequal: {:s}'.format( 93 | i, threshold_val, inequal 94 | )) 95 | logging.log(TRACE, '[Split] Weighted errors is {:.3f}'.format(weighted_errors[0, 0])) 96 | # 根据总错误权重来更新最好的单层决策树信息 97 | if weighted_errors < min_error: 98 | min_error = weighted_errors 99 | best_predict_values = predicted_values.copy() 100 | best_stump_info['dimension'] = i 101 | best_stump_info['threshold'] = threshold_val 102 | best_stump_info['inequal'] = inequal 103 | return best_stump_info, min_error, best_predict_values 104 | 105 | def train(self, max_iter): 106 | weak_classifiers = [] 107 | D = numpy.mat(numpy.ones((self.m, 1)) / self.m) 108 | aggregated_predict = numpy.mat(numpy.zeros((self.m, 1))) 109 | for i in range(max_iter): 110 | stump_info, error, predict = self.build_stump(D) 111 | logging.debug('D: {}'.format(D.T)) 112 | # 计算本次单层决策树输出结果的权重, `max(error, 1e-16)` 保证不会出现除0错误 113 | alpha = float(0.5 * numpy.log((1.0 - error) / max(error, 1e-16))) 114 | stump_info['alpha'] = alpha 115 | weak_classifiers.append(stump_info) # store Stump Params in Array 116 | logging.debug('predict: {}'.format(predict.T)) 117 | # 更新权重D 118 | exponent = numpy.multiply(-1 * alpha * self.labels, predict) 119 | D = numpy.multiply(D, numpy.exp(exponent)) 120 | D = D / D.sum() # 保证 D 各维度总和为 1 121 | # 计算应用所有分类器后的分类结果 122 | aggregated_predict += alpha * predict 123 | logging.debug('aggregated predict: {}'.format(aggregated_predict.T)) 124 | aggregated_errors = numpy.multiply( 125 | numpy.sign(aggregated_predict) != self.labels, 126 | numpy.ones((self.m, 1)) 127 | ) 128 | errorRate = aggregated_errors.sum() / self.m 129 | logging.info('Total error: {}'.format(errorRate)) 130 | if errorRate == 0.0: 131 | break 132 | self.classifiers = weak_classifiers 133 | self.aggregated_predict = aggregated_predict 134 | 135 | def predict(self, dataset): 136 | dataset = numpy.mat(dataset) 137 | stump = DicisionStump(dataset) 138 | m, _n = dataset.shape 139 | aggregated_estimate = numpy.mat(numpy.zeros((m, 1))) 140 | for classifier in self.classifiers: 141 | logging.info('Applying stumb: {}'.format(classifier)) 142 | estimate = stump.predict( 143 | classifier['dimension'], 144 | classifier['threshold'], 145 | classifier['inequal'] 146 | ) 147 | aggregated_estimate += classifier['alpha'] * estimate 148 | logging.info(aggregated_estimate) 149 | return numpy.sign(aggregated_estimate) 150 | 151 | 152 | def plotROCCurve(predStrengths, labels): 153 | """ 154 | ROC曲线(Receiver Operating Characteristic curve) 155 | ROC曲线给出当阈值变化时假阳率和真阳率的变化情况 156 | """ 157 | import matplotlib.pyplot as plt 158 | cursor = (1.0, 1.0) # 绘制光标的位置 159 | ySum = 0.0 # variable to calculate AUC 160 | numPositiveClass = sum(numpy.array(labels) == 1.0) # 正例的数目 161 | step = { 162 | 'x': 1.0 / numPositiveClass, 163 | 'y': 1.0 / (len(labels) - numPositiveClass), 164 | } 165 | sortedIndicies = 
predStrengths.A1.argsort() # get sorted index, it's reverse 166 | fig = plt.figure() 167 | fig.clf() 168 | ax = plt.subplot(111) 169 | # loop through all the values, drawing a line segment at each point 170 | for index in sortedIndicies: 171 | if labels[index] == 1.0: 172 | deltaX = 0 173 | deltaY = step['x'] 174 | else: 175 | deltaX = step['y'] 176 | deltaY = 0 177 | ySum += cursor[1] 178 | # draw line from cursor to (cursor[0]-deltaX, cursor[1]-deltaY) 179 | logging.debug('Drawing line from {} -> {}'.format( 180 | cursor, (cursor[0]-deltaX, cursor[1]-deltaY) 181 | )) 182 | ax.plot( 183 | [cursor[0], cursor[0]-deltaX], 184 | [cursor[1], cursor[1]-deltaY], 185 | c='b' 186 | ) 187 | cursor = (cursor[0] - deltaX, cursor[1] - deltaY) 188 | ax.plot([0, 1], [0, 1], 'b--') 189 | 190 | plt.xlabel('False positive rate') 191 | plt.ylabel('True positive rate') 192 | plt.title('ROC curve for AdaBoost horse colic detection system') 193 | ax.axis([0, 1, 0, 1]) 194 | plt.show() 195 | logging.info('曲线下面积AUC(Area Under the Curve): {}'.format(ySum * step['y'])) 196 | 197 | 198 | def main(): 199 | import pprint 200 | dataset, labels = load_fake_dataset() 201 | # D = numpy.mat(numpy.ones((5, 1)) / 5) 202 | # build_stump(dataset, labels, D) 203 | model = AdaBoostDicisionStump(dataset, labels) 204 | logging.info('Classifiers: {}'.format(pprint.pformat(model.classifiers))) 205 | logging.info('结果对比 (预测/真实):\n{}'.format(zip( 206 | model.predict(dataset).A1.tolist(), 207 | labels 208 | ))) 209 | 210 | plotROCCurve(model.aggregated_predict, labels) 211 | 212 | dataset, labels = load_dataset_from_file('horseColicTraining2.txt') 213 | model = AdaBoostDicisionStump(dataset, labels) 214 | plotROCCurve(model.aggregated_predict, labels) 215 | 216 | 217 | if __name__ == '__main__': 218 | main() 219 | 220 | -------------------------------------------------------------------------------- /ch08 - LinearRegression/regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | import logging 5 | 6 | import numpy 7 | 8 | logging.basicConfig( 9 | level=logging.DEBUG, 10 | # level=logging.INFO, 11 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 12 | ) 13 | TRACE = logging.DEBUG - 1 14 | 15 | 16 | def load_dataset_from_file(filename): 17 | dataset = [] 18 | labels = [] 19 | num_features = None 20 | with open(filename) as infile: 21 | for line in infile: 22 | line = line.strip().split('\t') 23 | if num_features is None: 24 | num_features = len(line) 25 | dataset.append(list(map(float, line[:-1]))) 26 | labels.append(float(line[-1])) 27 | return dataset, labels 28 | 29 | 30 | def standarRegress(xArray, yArray): 31 | """使用普通最小二乘法求回归系数""" 32 | xMatrix = numpy.mat(xArray) 33 | yMatrix = numpy.mat(yArray).T 34 | xTx = xMatrix.T * xMatrix 35 | if numpy.linalg.det(xTx) == 0.0: 36 | logging.error('奇异矩阵无法求逆') 37 | return 38 | w = xTx.I * (xMatrix.T * yMatrix) 39 | # 或下面这个 40 | # w = numpy.linalg.solve(xTx, xMatrix.T * yMatrix) 41 | return w.A1 42 | 43 | 44 | def lwlrRegress(testPoint, xArray, yArray, k=1.0): 45 | """局部加权线性回归(LWLR - Locally Weighted Linear Regression) 46 | 给待预测点附近的每个点赋予一定的权重. 47 | LWLR使用"核"来对附近的点赋予更高的权重, 最常用的是高斯核. 
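    高斯核给第 j 个训练样本的权重为 w(j, j) = exp(-||x_j - x||^2 / (2 * k^2)),
    其中 x 为待预测点 testPoint, k 越小则权重随距离衰减越快 (对应下方 weights[j, j] 的计算).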
48 | === 49 | 50 | """ 51 | xMatrix = numpy.mat(xArray) 52 | yMatrix = numpy.mat(yArray).T 53 | m, _n = xMatrix.shape 54 | # 利用高斯核初始化权重矩阵 55 | weights = numpy.mat(numpy.eye(m)) 56 | for j in range(m): 57 | diffMat = testPoint - xMatrix[j, :] 58 | weights[j, j] = numpy.exp(diffMat * diffMat.T / (-2.0 * k**2)) 59 | xTx = xMatrix.T * (weights * xMatrix) 60 | if numpy.linalg.det(xTx) == 0.0: 61 | logging.error('奇异矩阵无法求逆') 62 | return 63 | ws = xTx.I * (xMatrix.T * (weights * yMatrix)) 64 | return testPoint * ws 65 | 66 | 67 | def lwlrTest(testArray, xArray, yArray, k=1.0): 68 | """ 69 | 对于所有的测试点, 使用LWLR局部加权线性回归来计算预测值 70 | """ 71 | m, _n = numpy.array(testArray).shape 72 | yHat = numpy.zeros(m) 73 | for i in range(m): 74 | yHat[i] = lwlrRegress(testArray[i], xArray, yArray, k) 75 | return yHat 76 | 77 | 78 | def rssError(yArray, yHatArr): 79 | """计算预测误差""" 80 | yArray = numpy.array(yArray) 81 | yHatArr = numpy.array(yHatArr) 82 | return ((yArray - yHatArr)**2).sum() 83 | 84 | """缩减方法 -- 岭回归, 前向足部回归, lasso法""" 85 | 86 | 87 | def ridgeRegress(xMatrix, yMatrix, lam=0.2): 88 | xTx = xMatrix.T * xMatrix 89 | _m, n = numpy.shape(xMatrix) 90 | denom = xTx + (numpy.eye(n) * lam) 91 | if numpy.linalg.det(denom) == 0.0: 92 | logging.error('奇异矩阵无法求逆') 93 | return 94 | ws = denom.I * (xMatrix.T * yMatrix) 95 | return ws 96 | 97 | 98 | def ridgeTest(xArray, yArray): 99 | xMatrix = numpy.mat(xArray) 100 | yMatrix = numpy.mat(yArray).T 101 | # 标准化Y 102 | yMean = numpy.mean(yMatrix, 0) 103 | yMatrix = yMatrix - yMean # to eliminate X0 take numpy.mean off of Y 104 | # 标准化X的每一维 105 | xMeans = numpy.mean(xMatrix, 0) # calc numpy.mean then subtract it off 106 | xVar = numpy.var(xMatrix, 0) # calc variance of Xi then divide by it 107 | xMatrix = (xMatrix - xMeans) / xVar 108 | 109 | numTestPts = 30 110 | _m, n = xMatrix.shape 111 | wMatrix = numpy.zeros((numTestPts, n)) 112 | for i in range(numTestPts): 113 | ws = ridgeRegress(xMatrix, yMatrix, numpy.exp(i - 10)) 114 | wMatrix[i, :] = ws.T 115 | return wMatrix 116 | 117 | 118 | def main(): 119 | Xs, Ys = load_dataset_from_file('ex0.txt') 120 | logging.info('原始数据\n{0}'.format([(x, y) for x, y in zip(Xs, Ys)])) 121 | 122 | w = standarRegress(Xs, Ys) 123 | logging.info('最小二乘法回归系数: {0}'.format(w)) 124 | logging.info('预测序列\n{0}'.format([ 125 | (x, y) for x, y in map( 126 | lambda x: (x, float(numpy.mat(x) * numpy.mat(w).T)), 127 | Xs 128 | ) 129 | ])) 130 | 131 | # k = 0.003 132 | k = 0.01 133 | # k = 0.1 134 | yHat = lwlrTest(Xs, Xs, Ys, k=k) 135 | logging.info('LWLR预测序列, 系数 k={0}\n{1}'.format(k, [ 136 | (x, y) for x, y in zip(Xs, yHat) 137 | ])) 138 | 139 | ''' 140 | # 绘制图看拟合效果 141 | xMatrix = numpy.mat(Xs) 142 | sorted_index = xMatrix[:, 1].argsort(axis=0) 143 | xSort = xMatrix[sorted_index][:, 0, :] 144 | import matplotlib.pyplot as plt 145 | figure = plt.figure() 146 | ax = figure.add_subplot(111) 147 | ax.plot(xSort[:, 1], yHat[sorted_index]) # 拟合曲线 148 | ax.scatter( 149 | xMatrix[:, 1].A1, numpy.mat(Ys).T.A1, 150 | s=2, c='red' 151 | ) # 原始数据 152 | plt.show() 153 | ''' 154 | 155 | abaloneXs, abalineYs = load_dataset_from_file('abalone.txt') 156 | w = ridgeTest(abaloneXs, abalineYs) 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | 162 | 163 | def regularize(xMatrix):#regularize by columns 164 | inMat = xMatrix.copy() 165 | inMeans = numpy.mean(inMat,0) #calc numpy.mean then subtract it off 166 | inVar = numpy.var(inMat,0) #calc variance of Xi then divide by it 167 | inMat = (inMat - inMeans)/inVar 168 | return inMat 169 | 170 | def 
stageWise(xArray,yArray,eps=0.01,numIt=100): 171 | xMatrix = numpy.mat(xArray); yMatrix=numpy.mat(yArray).T 172 | yMean = numpy.mean(yMatrix,0) 173 | yMatrix = yMatrix - yMean #can also regularize ys but will get smaller coef 174 | xMatrix = regularize(xMatrix) 175 | m,n=numpy.shape(xMatrix) 176 | #returnMat = numpy.zeros((numIt,n)) #testing code remove 177 | ws = numpy.zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy() 178 | for i in range(numIt): 179 | print ws.T 180 | lowestError = numpy.inf; 181 | for j in range(n): 182 | for sign in [-1,1]: 183 | wsTest = ws.copy() 184 | wsTest[j] += eps*sign 185 | yTest = xMatrix*wsTest 186 | rssE = rssError(yMatrix.A,yTest.A) 187 | if rssE < lowestError: 188 | lowestError = rssE 189 | wsMax = wsTest 190 | ws = wsMax.copy() 191 | #returnMat[i,:]=ws.T 192 | #return returnMat 193 | 194 | #def scrapePage(inFile,outFile,yr,numPce,origPrc): 195 | # from BeautifulSoup import BeautifulSoup 196 | # fr = open(inFile); fw=open(outFile,'a') #a is append mode writing 197 | # soup = BeautifulSoup(fr.read()) 198 | # i=1 199 | # currentRow = soup.findAll('table', r="%d" % i) 200 | # while(len(currentRow)!=0): 201 | # title = currentRow[0].findAll('a')[1].text 202 | # lwrTitle = title.lower() 203 | # if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 204 | # newFlag = 1.0 205 | # else: 206 | # newFlag = 0.0 207 | # soldUnicde = currentRow[0].findAll('td')[3].findAll('span') 208 | # if len(soldUnicde)==0: 209 | # print "item #%d did not sell" % i 210 | # else: 211 | # soldPrice = currentRow[0].findAll('td')[4] 212 | # priceStr = soldPrice.text 213 | # priceStr = priceStr.replace('$','') #strips out $ 214 | # priceStr = priceStr.replace(',','') #strips out , 215 | # if len(soldPrice)>1: 216 | # priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping 217 | # print "%s\t%d\t%s" % (priceStr,newFlag,title) 218 | # fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr)) 219 | # i += 1 220 | # currentRow = soup.findAll('table', r="%d" % i) 221 | # fw.close() 222 | 223 | from time import sleep 224 | import json 225 | import urllib2 226 | def searchForSet(retX, retY, setNum, yr, numPce, origPrc): 227 | sleep(10) 228 | myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY' 229 | searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum) 230 | pg = urllib2.urlopen(searchURL) 231 | retDict = json.loads(pg.read()) 232 | for i in range(len(retDict['items'])): 233 | try: 234 | currItem = retDict['items'][i] 235 | if currItem['product']['condition'] == 'new': 236 | newFlag = 1 237 | else: newFlag = 0 238 | listOfInv = currItem['product']['inventories'] 239 | for item in listOfInv: 240 | sellingPrice = item['price'] 241 | if sellingPrice > origPrc * 0.5: 242 | print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice) 243 | retX.append([yr, numPce, newFlag, origPrc]) 244 | retY.append(sellingPrice) 245 | except: print 'problem with item %d' % i 246 | 247 | def setDataCollect(retX, retY): 248 | searchForSet(retX, retY, 8288, 2006, 800, 49.99) 249 | searchForSet(retX, retY, 10030, 2002, 3096, 269.99) 250 | searchForSet(retX, retY, 10179, 2007, 5195, 499.99) 251 | searchForSet(retX, retY, 10181, 2007, 3428, 199.99) 252 | searchForSet(retX, retY, 10189, 2008, 5922, 299.99) 253 | searchForSet(retX, retY, 10196, 2009, 3263, 249.99) 254 | 255 | def crossValidation(xArray,yArray,numVal=10): 256 | m = len(yArray) 257 | indexList = range(m) 258 | errorMat = 
numpy.zeros((numVal,30))#create error numpy.mat 30columns numVal rows 259 | for i in range(numVal): 260 | trainX=[]; trainY=[] 261 | testX = []; testY = [] 262 | numpy.random.shuffle(indexList) 263 | for j in range(m):#create training set based on first 90% of values in indexList 264 | if j < m*0.9: 265 | trainX.append(xArray[indexList[j]]) 266 | trainY.append(yArray[indexList[j]]) 267 | else: 268 | testX.append(xArray[indexList[j]]) 269 | testY.append(yArray[indexList[j]]) 270 | wMat = ridgeTest(trainX,trainY) #get 30 weight vectors from ridge 271 | for k in range(30):#loop over all of the ridge estimates 272 | matTestX = numpy.mat(testX); matTrainX=numpy.mat(trainX) 273 | meanTrain = numpy.mean(matTrainX,0) 274 | varTrain = numpy.var(matTrainX,0) 275 | matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params 276 | yEst = matTestX * numpy.mat(wMat[k,:]).T + numpy.mean(trainY)#test ridge results and store 277 | errorMat[i,k]=rssError(yEst.T.A,numpy.array(testY)) 278 | #print errorMat[i,k] 279 | meanErrors = numpy.mean(errorMat,0)#calc avg performance of the different ridge weight vectors 280 | minMean = float(min(meanErrors)) 281 | bestWeights = wMat[numpy.nonzero(meanErrors==minMean)] 282 | #can unregularize to get model 283 | #when we regularized we wrote Xreg = (x-meanX)/numpy.var(x) 284 | #we can now write in terms of x not Xreg: x*w/numpy.var(x) - meanX/numpy.var(x) +meanY 285 | xMatrix = numpy.mat(xArray); yMatrix=numpy.mat(yArray).T 286 | meanX = numpy.mean(xMatrix,0); varX = numpy.var(xMatrix,0) 287 | unReg = bestWeights/varX 288 | print "the best model from Ridge Regression is:\n",unReg 289 | print "with constant term: ",-1*sum(numpy.multiply(meanX,unReg)) + numpy.mean(yMatrix) -------------------------------------------------------------------------------- /ch09 - RegressionTree/regressionTrees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import copy 7 | import logging 8 | 9 | import numpy 10 | 11 | TRACE = logging.DEBUG - 1 12 | logging.basicConfig( 13 | level=logging.DEBUG, 14 | # level=TRACE, 15 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 16 | ) 17 | 18 | 19 | def load_dataset_from_file(filename): 20 | dataset = [] 21 | with open(filename) as infile: 22 | for line in infile: 23 | line = line.strip().split('\t') 24 | dataset.append(list(map(float, line))) 25 | return dataset 26 | 27 | 28 | def linear_solve(dataset): 29 | """求线性回归参数""" 30 | m, n = numpy.shape(dataset) 31 | X = numpy.mat(numpy.ones((m, n))) 32 | X[:, 1:n] = dataset[:, 0:n - 1] 33 | Y = dataset[:, -1] 34 | xTx = X.T * X 35 | if numpy.linalg.det(xTx) == 0.0: 36 | raise Exception('This matrix is singular, cannot do inverse,\n' 37 | 'try increasing the second value of ops') 38 | ws = xTx.I * (X.T * Y) 39 | return ws, X, Y 40 | 41 | 42 | TYPE_VALUE = 0 43 | TYPE_MODEL = 1 44 | 45 | 46 | class Dataset(object): 47 | 48 | def __init__(self, dataset): 49 | self.rawDataset = numpy.mat(dataset) 50 | 51 | @property 52 | def shape(self): 53 | return self.rawDataset.shape 54 | 55 | @property 56 | def leaf_val(self): 57 | """因变量均值""" 58 | return numpy.mean(self.rawDataset[:, -1]) 59 | 60 | @property 61 | def leaf_error(self): 62 | """因变量均方差和""" 63 | m, _n = self.rawDataset.shape 64 | return m * numpy.var(self.rawDataset[:, -1]) 65 | 66 | @property 67 | def leaf_model_weights(self): 68 | ws, _X, _Y = linear_solve(self.rawDataset) 69 | return 
ws.A1.tolist() 70 | 71 | @property 72 | def leaf_model_error(self): 73 | ws, X, Y = linear_solve(self.rawDataset) 74 | yHat = X * ws 75 | return sum(numpy.power(Y - yHat, 2)) 76 | 77 | def split(self, feature, value): 78 | row_indexs = numpy.nonzero(self.rawDataset[:, feature] > value)[0] 79 | m0 = self.rawDataset[row_indexs, :] 80 | row_indexs = numpy.nonzero(self.rawDataset[:, feature] <= value)[0] 81 | m1 = self.rawDataset[row_indexs, :] 82 | return Dataset(m0), Dataset(m1) 83 | 84 | def choose_best_split(self, tree_type=TYPE_VALUE, total_s=1.0, total_n=4): 85 | """ 86 | 87 | Parameters 88 | ---------- 89 | tree_type : int 90 | TYPE_VALUE 普通回归树 91 | TYPE_MODEL 模型回归树 92 | total_s : float 93 | 分裂叶节点时, 数据集方差和下降值最小值 94 | total_n : int 95 | 叶节点中最少包含的样本数 96 | 97 | Returns 98 | ------- 99 | (int, float) : 对数据集划分的最好特征的index, 划分值 100 | """ 101 | # 如果所有值都相等, 生成一个叶节点 102 | if len(set(self.rawDataset[:, -1].T.A1)) == 1: 103 | if tree_type == TYPE_VALUE: 104 | return None, self.leaf_val 105 | elif tree_type == TYPE_MODEL: 106 | return None, self.leaf_model_weights 107 | 108 | _m, n = self.rawDataset.shape 109 | best_info = { 110 | 's': numpy.inf, 111 | 'index': 0, 112 | 'value': 0, 113 | } 114 | for feature_index in range(n - 1): 115 | values = set(self.rawDataset[:, feature_index].A1) 116 | for split_val in values: 117 | d0, d1 = self.split(feature_index, split_val) 118 | # 如果切分出来的数据集很小, 跳过? 119 | if d0.shape[0] < total_n or d1.shape[0] < total_n: 120 | continue 121 | if tree_type == TYPE_VALUE: 122 | new_s = d0.leaf_error + d1.leaf_error 123 | elif tree_type == TYPE_MODEL: 124 | new_s = d0.leaf_model_error + d1.leaf_model_error 125 | if new_s < best_info['s']: 126 | best_info['s'] = new_s 127 | best_info['index'] = feature_index 128 | best_info['value'] = split_val 129 | 130 | # 如果误差减少不大, 则生成一个叶节点 131 | if tree_type == TYPE_VALUE: 132 | origin_error = self.leaf_error 133 | elif tree_type == TYPE_MODEL: 134 | origin_error = self.leaf_model_error 135 | if origin_error - best_info['s'] < total_s: 136 | if tree_type == TYPE_VALUE: 137 | return None, self.leaf_val 138 | elif tree_type == TYPE_MODEL: 139 | return None, self.leaf_model_weights 140 | 141 | # 如果切分出来的数据集很小, 则生成一个叶节点 142 | d0, d1 = self.split(best_info['index'], best_info['value']) 143 | if d0.shape[0] < total_n or d1.shape[0] < total_n: 144 | if tree_type == TYPE_VALUE: 145 | return None, self.leaf_val 146 | elif tree_type == TYPE_MODEL: 147 | return None, self.leaf_model_weights 148 | 149 | return best_info['index'], best_info['value'] 150 | 151 | 152 | class RegressionTree(object): 153 | """回归树 -- 普通回归树/模型回归树 154 | 普通回归树 - 把相近的一群点作为一个模拟点 155 | 模型回归树 - 把'模式类似'的一群点化为一个线性函数的回归系数 156 | """ 157 | 158 | def __init__(self, dataset, tree_type=TYPE_VALUE, total_s=1.0, total_n=4): 159 | self.tree_type = tree_type 160 | self.dataset = Dataset(dataset) 161 | self.tree = self.__build_tree(self.dataset, tree_type, total_s, total_n) 162 | 163 | @classmethod 164 | def __build_tree(cls, dataset, tree_type, total_s, total_n): 165 | feature_index, value = dataset.choose_best_split(tree_type, total_s, total_n) 166 | if feature_index is None: 167 | return value 168 | 169 | d0, d1 = dataset.split(feature_index, value) 170 | tree = { 171 | 'index': feature_index, 172 | 'value': value, 173 | 'left': cls.__build_tree(d0, tree_type, total_s, total_n), 174 | 'right': cls.__build_tree(d1, tree_type, total_s, total_n), 175 | } 176 | return tree 177 | 178 | @staticmethod 179 | def is_tree(node): 180 | return isinstance(node, dict) 181 | 182 | @classmethod 183 | def 
mean(cls, tree): 184 | if cls.is_tree(tree['right']): 185 | tree['right'] = cls.mean(tree['right']) 186 | if cls.is_tree(tree['left']): 187 | tree['left'] = cls.mean(tree['left']) 188 | return (tree['left'] + tree['right']) / 2.0 189 | 190 | def prune(self, test_dataset): 191 | assert self.tree_type == TYPE_VALUE 192 | return self.__do_prune(copy.deepcopy(self.tree), Dataset(test_dataset)) 193 | 194 | @classmethod 195 | def __do_prune(cls, tree, test_dataset): 196 | m, _n = test_dataset.shape 197 | if m == 0: 198 | return cls.mean(tree) 199 | 200 | if cls.is_tree(tree['right']) or cls.is_tree(tree['left']): 201 | d0, d1 = test_dataset.split(tree['index'], tree['value']) 202 | if cls.is_tree(tree['left']): 203 | tree['left'] = cls.__do_prune(tree['left'], d0) 204 | if cls.is_tree(tree['right']): 205 | tree['right'] = cls.__do_prune(tree['right'], d1) 206 | 207 | if cls.is_tree(tree['left']) or cls.is_tree(tree['right']): 208 | return tree 209 | else: 210 | # 如果两个子节点都已经不是树, 则对子节点尝试合并 211 | # 比较合并前后的误差, 如果误差能得到提升则进行合并 212 | d0, d1 = test_dataset.split(tree['index'], tree['value']) 213 | errorNoMerge = sum(numpy.power( 214 | d0.rawDataset[:, -1] - tree['left'], 215 | 2 216 | )) + sum(numpy.power( 217 | d1.rawDataset[:, -1] - tree['right'], 218 | 2 219 | )) 220 | 221 | tree_mean = (tree['left'] + tree['right']) / 2.0 222 | errorMerge = sum(numpy.power(test_dataset.rawDataset[:, -1], 2)) 223 | 224 | if errorMerge < errorNoMerge: 225 | logging.debug('merging...') 226 | return tree_mean 227 | else: 228 | return tree 229 | 230 | @staticmethod 231 | def eval_value(model, in_dataset): 232 | return float(model) 233 | 234 | @staticmethod 235 | def eval_model(model, in_dataset): 236 | m, n = in_dataset.shape 237 | X = numpy.mat(numpy.ones((1, n + 1))) 238 | X[0, 1:n+1] = in_dataset.rawDataset 239 | return float(X * numpy.mat(model).T) 240 | 241 | def predict(self, test_dataset): 242 | m, n = test_dataset.shape 243 | yHat = numpy.mat(numpy.zeros((m, 1))) 244 | for i in range(m): 245 | eval_func = None 246 | if self.tree_type == TYPE_VALUE: 247 | eval_func = self.eval_value 248 | elif self.tree_type == TYPE_MODEL: 249 | eval_func = self.eval_model 250 | yHat[i, 0] = self.__do_predict( 251 | self.tree, 252 | Dataset(test_dataset[i]), 253 | eval_func 254 | ) 255 | logging.log(TRACE, '{} -> {}'.format(test_dataset[i, 0], yHat[i, 0])) 256 | return yHat 257 | 258 | def __do_predict(self, tree, test_dataset, eval_func): 259 | if not self.is_tree(tree): 260 | return eval_func(tree, test_dataset) 261 | 262 | if test_dataset.rawDataset[tree['index']] > tree['value']: 263 | logging.log(TRACE, '{0} > {1} : go left'.format( 264 | test_dataset.rawDataset[tree['index']], 265 | tree['value'] 266 | )) 267 | return self.__do_predict(tree['left'], test_dataset, eval_func) 268 | else: 269 | logging.log(TRACE, '{0} <= {1} : go right'.format( 270 | test_dataset.rawDataset[tree['index']], 271 | tree['value'] 272 | )) 273 | return self.__do_predict(tree['right'], test_dataset, eval_func) 274 | 275 | 276 | def main(): 277 | import pprint 278 | """ 279 | filename = 'ex00.txt' 280 | dataset = load_dataset_from_file(filename) 281 | tree = RegressionTree(dataset) 282 | logging.info('`{0}` -> 回归树:\n{1}'.format( 283 | filename, 284 | pprint.pformat(tree.tree) 285 | )) 286 | 287 | filename = 'ex0.txt' 288 | dataset = load_dataset_from_file(filename) 289 | tree = RegressionTree(dataset) 290 | logging.info('`{0}` -> 回归树:\n{1}'.format( 291 | filename, 292 | pprint.pformat(tree.tree) 293 | )) 294 | 295 | filename = 'ex2.txt' 296 | dataset 
= load_dataset_from_file(filename) 297 | tree = RegressionTree(dataset) 298 | logging.info('`{0}` -> 回归树:\n{1}'.format( 299 | filename, 300 | pprint.pformat(tree.tree) 301 | )) 302 | filename = 'ex2test.txt' 303 | test_dataset = load_dataset_from_file(filename) 304 | pruned_tree = tree.prune(test_dataset) 305 | logging.info('利用`{0}`进行后剪支 -> 回归树:\n{1}'.format( 306 | filename, 307 | pprint.pformat(pruned_tree) 308 | )) 309 | 310 | filename = 'exp2.txt' 311 | dataset = load_dataset_from_file(filename) 312 | tree = RegressionTree(dataset, TYPE_MODEL, 1, 10) 313 | logging.info('`{0}` -> 模型回归树:\n{1}'.format( 314 | filename, 315 | pprint.pformat(tree.tree) 316 | )) 317 | """ 318 | # 回归树/模型树拟合效果对比 319 | train_filename = 'bikeSpeedVsIq_train.txt' 320 | train_dataset = load_dataset_from_file(train_filename) 321 | test_filename = 'bikeSpeedVsIq_test.txt' 322 | test_dataset = numpy.mat(load_dataset_from_file(test_filename)) 323 | 324 | regular_regression_tree = RegressionTree(train_dataset, TYPE_VALUE, 1, 20) 325 | logging.info('`{0}` -> 回归树:\n{1}'.format( 326 | train_filename, 327 | pprint.pformat(regular_regression_tree.tree) 328 | )) 329 | yHat = regular_regression_tree.predict(test_dataset[:, 0]) 330 | logging.info('{0}'.format( 331 | numpy.corrcoef(yHat, test_dataset[:, 1], rowvar=0)[0, 1] 332 | )) 333 | 334 | model_regression_tree = RegressionTree(train_dataset, TYPE_MODEL, 1, 20) 335 | logging.info('`{0}` -> 模型回归树:\n{1}'.format( 336 | train_filename, 337 | pprint.pformat(model_regression_tree.tree) 338 | )) 339 | yHat = model_regression_tree.predict(test_dataset[:, 0]) 340 | logging.info('{0}'.format( 341 | numpy.corrcoef(yHat, test_dataset[:, 1], rowvar=0)[0, 1] 342 | )) 343 | 344 | 345 | if __name__ == '__main__': 346 | main() 347 | -------------------------------------------------------------------------------- /ch11 - Apriori/apriori.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import logging 7 | 8 | from numpy import * 9 | 10 | TRACE = logging.DEBUG - 1 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | # level=TRACE, 14 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 15 | ) 16 | 17 | 18 | def load_fake_dataset(): 19 | return [ 20 | [1, 3, 4], 21 | [2, 3, 5], 22 | [1, 2, 3, 5], 23 | [2, 5], 24 | ] 25 | 26 | 27 | def drop_unsupported_candidate_set( 28 | dataset, candidate_sets_k, min_support_degree): 29 | """ 30 | '支持度': 数据集中包含该项集的记录所占的比例 31 | 32 | Parameters 33 | ---------- 34 | dataset : 35 | 数据集 36 | candidate_sets_k : 37 | 候选项集集合 38 | min_support_degree : 39 | 最小支持度 40 | 41 | Returns 42 | ------- 43 | result, support_degrees : 44 | 支持度 >= min_support_degree的频繁项集, 频繁项集的支持度 45 | """ 46 | candidate_set_count = {} 47 | for transaction in dataset: 48 | for candidate_set in candidate_sets_k: 49 | if candidate_set.issubset(transaction): 50 | num = candidate_set_count.get(candidate_set, 0) 51 | candidate_set_count[candidate_set] = num + 1 52 | result = [] 53 | support_degrees = {} 54 | for candidate_set in candidate_set_count: 55 | # 计算每一项的支持度 56 | support = 1.0 * candidate_set_count[candidate_set] / len(dataset) 57 | if support >= min_support_degree: 58 | result.insert(0, candidate_set) 59 | support_degrees[candidate_set] = support 60 | return result, support_degrees 61 | 62 | 63 | def generate_candidate_sets_k(original_sets, k): 64 | if k == 1: 65 | """构建大小为1的所有候选项集合""" 66 | c1 = set([]) 67 | for transaction in original_sets: 68 
| for item in transaction: 69 | c1.add(item) 70 | return list(map(frozenset, 71 | sorted(list(map( 72 | lambda item: [item, ], c1))))) 73 | 74 | for one_set in original_sets: 75 | assert len(one_set) == k - 1 76 | candidate_sets_k = [] 77 | for i in range(len(original_sets)): 78 | for j in range(i + 1, len(original_sets)): 79 | # 如果两个集合的前 k-2 个元素相同, 则将它们合并为一个大小为 k 的集合 80 | # 原因见书的 P208, 第二段 81 | if sorted(list(original_sets[i])[:k - 2]) \ 82 | == sorted(list(original_sets[j])[:k - 2]): 83 | candidate_sets_k.append(original_sets[i] | original_sets[j]) 84 | return candidate_sets_k 85 | 86 | 87 | def apriori(raw_dataset, min_support_degree=0.5): 88 | candidate_sets = generate_candidate_sets_k(raw_dataset, 1) 89 | dataset = list(map(set, raw_dataset)) 90 | frequent_items, all_support_degree = drop_unsupported_candidate_set( 91 | dataset, candidate_sets, min_support_degree 92 | ) 93 | all_frequent_items = [frequent_items, ] 94 | k = 2 95 | while len(all_frequent_items[k - 2]) > 0: 96 | candidate_sets = generate_candidate_sets_k(all_frequent_items[k - 2], k) 97 | frequent_items, support_degrees = drop_unsupported_candidate_set( 98 | dataset, candidate_sets, min_support_degree 99 | ) 100 | all_support_degree.update(support_degrees) 101 | all_frequent_items.append(frequent_items) 102 | k += 1 103 | return all_frequent_items, all_support_degree 104 | 105 | 106 | class Rule(object): 107 | def __init__(self, conditions, consequence, confidence_degree): 108 | self.conditions = list(conditions) 109 | self.consequence = list(consequence) 110 | self.confidence_degree = confidence_degree 111 | 112 | def __str__(self): 113 | return '{0} --> {1}, confidence: {2}'.format( 114 | self.conditions, 115 | self.consequence, 116 | self.confidence_degree 117 | ) 118 | 119 | @classmethod 120 | def generate_rules( 121 | cls, frequent_sets, support_degrees, 122 | min_confidence_degree=0.7): 123 | rules = [] 124 | # only get the sets with two or more items 125 | for i in range(1, len(frequent_sets)): 126 | for frequent_set in frequent_sets[i]: 127 | H1 = [frozenset([item]) for item in frequent_set] 128 | if i == 1: 129 | legal_rules = cls.__rules_from_confidence_degree( 130 | frequent_set, H1, support_degrees, 131 | min_confidence_degree 132 | ) 133 | else: 134 | legal_rules = cls.__rules_from_consequences( 135 | frequent_set, H1, support_degrees, 136 | min_confidence_degree 137 | ) 138 | rules.extend(legal_rules) 139 | return rules 140 | 141 | @classmethod 142 | def __rules_from_confidence_degree( 143 | cls, frequent_set, consequences, support_degrees, 144 | min_confidence_degree): 145 | """计算置信度 146 | '置信度': P -> H 的置信度为 support(P ∪ H) / support(P) 147 | 'P ∪ H': P 与 H 的并集 148 | 149 | Parameters 150 | ---------- 151 | frequent_set 152 | consequences 153 | support_degrees 154 | min_confidence_degree 155 | 156 | Returns 157 | ------- 158 | 159 | """ 160 | legal_rules = [] 161 | for consequence in consequences: 162 | conditions = frequent_set - consequence 163 | # 计算置信度 164 | confidence = support_degrees[frequent_set] / support_degrees[conditions] 165 | if confidence >= min_confidence_degree: 166 | rule = Rule( 167 | conditions, consequence, 168 | confidence 169 | ) 170 | legal_rules.append(rule) 171 | logging.debug(rule) 172 | return legal_rules 173 | 174 | @classmethod 175 | def __rules_from_consequences( 176 | cls, frequent_set, consequences, support_degrees, 177 | min_confidence_degree): 178 | # try further merging 179 | if len(frequent_set) <= len(consequences[0]) + 1: 180 | return None 181 | 182 | # create Hm+1 new 
candidates 183 | Hmp1 = generate_candidate_sets_k(consequences, len(consequences[0]) + 1) 184 | legal_rules = cls.__rules_from_confidence_degree( 185 | frequent_set, Hmp1, support_degrees, 186 | min_confidence_degree 187 | ) 188 | legal_consequence = list(map( 189 | lambda rule: rule.consequence, 190 | legal_rules 191 | )) 192 | # need at least two sets to merge 193 | if len(legal_consequence) > 1: 194 | sub_rules = cls.__rules_from_consequences( 195 | frequent_set, legal_consequence, support_degrees, 196 | min_confidence_degree 197 | ) 198 | if sub_rules is not None: 199 | legal_rules.extend(sub_rules) 200 | return legal_rules 201 | 202 | 203 | def main(): 204 | import pprint 205 | raw_dataset = load_fake_dataset() 206 | frequent_sets, support_degrees = apriori( 207 | raw_dataset, min_support_degree=0.5 208 | ) 209 | logging.info('frequent_sets: {0}'.format( 210 | pprint.pformat(frequent_sets) 211 | )) 212 | logging.info('support_degrees: {0}'.format( 213 | pprint.pformat(support_degrees) 214 | )) 215 | 216 | rules = Rule.generate_rules( 217 | frequent_sets, support_degrees, 218 | min_confidence_degree=0.7 219 | ) 220 | logging.info(rules) 221 | 222 | if __name__ == '__main__': 223 | main() 224 | 225 | 226 | 227 | # from time import sleep 228 | # from votesmart import votesmart 229 | # votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030' 230 | # #votesmart.apikey = 'get your api key first' 231 | # def getActionIds(): 232 | # actionIdList = []; billTitleList = [] 233 | # fr = open('recent20bills.txt') 234 | # for line in fr.readlines(): 235 | # billNum = int(line.split('\t')[0]) 236 | # try: 237 | # billDetail = votesmart.votes.getBill(billNum) #api call 238 | # for action in billDetail.actions: 239 | # if action.level == 'House' and \ 240 | # (action.stage == 'Passage' or action.stage == 'Amendment Vote'): 241 | # actionId = int(action.actionId) 242 | # print('bill: %d has actionId: %d' % (billNum, actionId)) 243 | # actionIdList.append(actionId) 244 | # billTitleList.append(line.strip().split('\t')[1]) 245 | # except: 246 | # print("problem getting bill %d" % billNum) 247 | # sleep(1) #delay to be polite 248 | # return actionIdList, billTitleList 249 | # 250 | # def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints 251 | # itemMeaning = ['Republican', 'Democratic']#list of what each item stands for 252 | # for billTitle in billTitleList:#fill up itemMeaning list 253 | # itemMeaning.append('%s -- Nay' % billTitle) 254 | # itemMeaning.append('%s -- Yea' % billTitle) 255 | # transDict = {}#list of items in each transaction (politician) 256 | # voteCount = 2 257 | # for actionId in actionIdList: 258 | # sleep(3) 259 | # print('getting votes for actionId: %d' % actionId) 260 | # try: 261 | # voteList = votesmart.votes.getBillActionVotes(actionId) 262 | # for vote in voteList: 263 | # if not transDict.has_key(vote.candidateName): 264 | # transDict[vote.candidateName] = [] 265 | # if vote.officeParties == 'Democratic': 266 | # transDict[vote.candidateName].append(1) 267 | # elif vote.officeParties == 'Republican': 268 | # transDict[vote.candidateName].append(0) 269 | # if vote.action == 'Nay': 270 | # transDict[vote.candidateName].append(voteCount) 271 | # elif vote.action == 'Yea': 272 | # transDict[vote.candidateName].append(voteCount + 1) 273 | # except: 274 | # print("problem getting actionId: %d" % actionId) 275 | # voteCount += 2 276 | # return transDict, itemMeaning 277 | 
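# --- editor's sketch (appended; not part of the original apriori.py) ---
# The least obvious step in apriori() is the "first k-2 elements" merge rule
# commented inside generate_candidate_sets_k() (see the book, P208). The tiny
# helper below only illustrates that rule; the name `_demo_candidate_merge`
# is made up for this note and is never called by main().
def _demo_candidate_merge():
    # Three frequent 2-item sets. When building the 3-item candidates, only
    # the pair whose first k-2 = 1 sorted element matches gets merged, so the
    # candidate {0, 1, 2} is generated exactly once instead of three times.
    frequent_2_sets = [frozenset([0, 1]), frozenset([0, 2]), frozenset([1, 2])]
    candidates_3 = generate_candidate_sets_k(frequent_2_sets, 3)
    logging.info(candidates_3)  # expected: [frozenset([0, 1, 2])]
    return candidates_3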
-------------------------------------------------------------------------------- /ch12 - FP-growth/fpGrowth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import logging 7 | 8 | TRACE = logging.DEBUG - 1 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | # level=TRACE, 12 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 13 | ) 14 | 15 | 16 | class Node(object): 17 | def __init__(self, name, num_occur, parent): 18 | self.parent = parent 19 | self.name = name 20 | self.count = num_occur 21 | self.nodeLink = None 22 | self.children = {} 23 | 24 | def inc(self, num_occur): 25 | self.count += num_occur 26 | 27 | def display(self, depth=1): 28 | print(' ' * depth, self.name, ' ', self.count) 29 | for child in self.children.values(): 30 | child.display(depth + 1) 31 | 32 | def __str__(self): 33 | return self.name 34 | 35 | def __repr__(self): 36 | return str(self) 37 | 38 | 39 | class TableItem(object): 40 | def __init__(self, count, head): 41 | self.count = count 42 | self.head = head 43 | 44 | def __str__(self): 45 | return '({0}, {1})'.format(self.count, self.head) 46 | 47 | def __repr__(self): 48 | return str(self) 49 | 50 | def __cmp__(self, other): 51 | if self.count != other.count: 52 | return cmp(self.count, other.count) 53 | else: 54 | return cmp(self.head, other.head) 55 | 56 | 57 | class FrequentPatternTree(object): 58 | def __init__(self, dataset, min_support_degree=1): 59 | self.min_support_degree = min_support_degree 60 | self.table = {} 61 | # 对每个元素出现次数进行计数 62 | for transaction in dataset: 63 | for item in transaction: 64 | self.table[item] = ( 65 | self.table.get(item, 0) + dataset[transaction] 66 | ) 67 | # 删除出现次数少于 min_support_degree 的项 68 | self.table = { 69 | key: value for (key, value) in self.table.items() 70 | if value >= self.min_support_degree 71 | } 72 | frequent_items = set(self.table.keys()) 73 | 74 | # 如果所有项都不频繁, 跳过下面的处理步骤 75 | if len(frequent_items) == 0: 76 | self.root = None 77 | self.table = None 78 | return 79 | 80 | # 扩展 headerTable 以便保存计数值以及指向每种类型第一个元素项的指针 81 | self.table = { 82 | key: TableItem(value, None) for (key, value) in self.table.items() 83 | } 84 | 85 | self.root = Node('Null Set', 1, None) 86 | for transaction, count in dataset.items(): 87 | local_dataset = {} 88 | for item in transaction: # put transaction items in order 89 | if item in frequent_items: 90 | local_dataset[item] = self.table[item].count 91 | if len(local_dataset) > 0: 92 | ordered_items = [v[0] for v in sorted( 93 | local_dataset.items(), key=lambda p: p[1], 94 | reverse=True 95 | )] 96 | # populate tree with ordered freq itemset 97 | self.__update(ordered_items, self.root, count) 98 | 99 | @property 100 | def is_empty(self): 101 | return self.root is None 102 | 103 | def __update(self, items, root, count): 104 | if items[0] in root.children: 105 | # 如果已经在孩子列表中, 增加出现次数 106 | root.children[items[0]].inc(count) 107 | else: 108 | # 把结点添加到当前结点的子节点上 109 | root.children[items[0]] = Node(items[0], count, root) 110 | # 更新 table 111 | if self.table[items[0]].head is None: 112 | self.table[items[0]].head = root.children[items[0]] 113 | else: 114 | temp = self.table[items[0]].head 115 | while temp.nodeLink is not None: 116 | temp = temp.nodeLink 117 | temp.nodeLink = root.children[items[0]] 118 | # call update() with remaining ordered items 119 | if len(items) > 1: 120 | self.__update(items[1:], root.children[items[0]], count) 121 | 122 
| @staticmethod 123 | def find_prefix_paths(element, node): 124 | paths = {} 125 | while node is not None: 126 | leaf = node 127 | prefix = [] 128 | while leaf.parent is not None: 129 | prefix.append(leaf.name) 130 | leaf = leaf.parent 131 | if len(prefix) > 1: 132 | paths[frozenset(prefix[1:])] = node.count 133 | node = node.nodeLink 134 | return paths 135 | 136 | def mine(self, prefix=None): 137 | if prefix is None: 138 | prefix = set([]) 139 | frequent_items = [] 140 | # (sort header table) 141 | items = [ 142 | pair[0] for pair in 143 | sorted(self.table.items(), key=lambda p: p[1]) 144 | ] 145 | for item in items: 146 | new_frequent_set = prefix | {item} 147 | # print('finalFrequent Item: ', new_frequent_set) 148 | frequent_items.append(tuple(new_frequent_set)) 149 | condition_pattern_bases = self.find_prefix_paths( 150 | item, self.table[item].head 151 | ) 152 | # print('condition_pattern_bases :', item, condition_pattern_bases) 153 | # 2. construct cond FP-tree from cond. pattern base 154 | condition_tree = FrequentPatternTree( 155 | condition_pattern_bases, 156 | self.min_support_degree 157 | ) 158 | # print('head from conditional tree: ', condition_table) 159 | if not condition_tree.is_empty: # 3. mine cond. FP-tree 160 | # logging.debug('conditional tree for: {0}'.format(new_frequent_set)) 161 | # condition_tree.display(1) 162 | sub_frequent_items = condition_tree.mine(new_frequent_set) 163 | frequent_items.extend(sub_frequent_items) 164 | return frequent_items 165 | 166 | 167 | def load_fake_dataset(): 168 | dataset = [ 169 | ['r', 'z', 'h', 'j', 'p'], 170 | ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], 171 | ['z'], 172 | ['r', 'x', 'n', 'o', 's'], 173 | ['y', 'r', 'x', 'z', 'q', 't', 'p'], 174 | ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'], 175 | ] 176 | return dataset 177 | 178 | 179 | def main(): 180 | import pprint 181 | dataset = load_fake_dataset() 182 | dataset = {frozenset(transaction): 1 for transaction in dataset} 183 | fp_tree = FrequentPatternTree(dataset, min_support_degree=3) 184 | logging.info(pprint.pformat(fp_tree.table)) 185 | 186 | logging.info(FrequentPatternTree.find_prefix_paths('x', fp_tree.table['x'].head)) 187 | logging.info(FrequentPatternTree.find_prefix_paths('z', fp_tree.table['z'].head)) 188 | logging.info(FrequentPatternTree.find_prefix_paths('r', fp_tree.table['r'].head)) 189 | 190 | frequent_items = fp_tree.mine() 191 | logging.info(pprint.pformat(frequent_items)) 192 | 193 | dataset = [] 194 | with open('kosarak.dat', 'r') as infile: 195 | for line in infile: 196 | dataset.append(line.split()) 197 | dataset = {frozenset(transaction): 1 for transaction in dataset} 198 | logging.debug(len(dataset)) 199 | min_support_degree = 100000 200 | fp_tree = FrequentPatternTree(dataset, min_support_degree) 201 | frequent_items = fp_tree.mine() 202 | logging.info(pprint.pformat(frequent_items)) 203 | 204 | 205 | if __name__ == '__main__': 206 | main() 207 | 208 | 209 | # import twitter 210 | # from time import sleep 211 | # import re 212 | # 213 | # def textParse(bigString): 214 | # urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) 215 | # listOfTokens = re.split(r'\W*', urlsRemoved) 216 | # return [tok.lower() for tok in listOfTokens if len(tok) > 2] 217 | # 218 | # def getLotsOfTweets(searchStr): 219 | # CONSUMER_KEY = '' 220 | # CONSUMER_SECRET = '' 221 | # ACCESS_TOKEN_KEY = '' 222 | # ACCESS_TOKEN_SECRET = '' 223 | # api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, 224 | # 
access_token_key=ACCESS_TOKEN_KEY, 225 | # access_token_secret=ACCESS_TOKEN_SECRET) 226 | # #you can get 1500 results 15 pages * 100 per page 227 | # resultsPages = [] 228 | # for i in range(1,15): 229 | # print("fetching page %d" % i) 230 | # searchResults = api.GetSearch(searchStr, per_page=100, page=i) 231 | # resultsPages.append(searchResults) 232 | # sleep(6) 233 | # return resultsPages 234 | # 235 | # def mineTweets(tweetArr, minSup=5): 236 | # parsedList = [] 237 | # for i in range(14): 238 | # for j in range(100): 239 | # parsedList.append(textParse(tweetArr[i][j].text)) 240 | # initSet = createInitSet(parsedList) 241 | # myFPtree, myHeaderTab = createTree(initSet, minSup) 242 | # myFreqList = [] 243 | # mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) 244 | # return myFreqList 245 | # 246 | --------------------------------------------------------------------------------
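# --- editor's sketch (appended; not part of fpGrowth.py above) ---
# main() builds the FP-tree input with `{frozenset(transaction): 1 for ...}`,
# which silently collapses duplicate transactions into a single count of 1.
# The helper below accumulates counts instead; the name `create_initial_set`
# is made up here (echoing the `createInitSet` referenced in the commented-out
# mineTweets code) and is only a sketch of the same idea.
def create_initial_set(transactions):
    counts = {}
    for transaction in transactions:
        key = frozenset(transaction)
        # Identical transactions add up instead of overwriting each other.
        counts[key] = counts.get(key, 0) + 1
    return counts

# Usage sketch, mirroring main():
#   fp_tree = FrequentPatternTree(create_initial_set(load_fake_dataset()),
#                                 min_support_degree=3)
#   frequent_items = fp_tree.mine()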