├── .gitignore
├── README.md
├── ch02 - kNN
│   └── kNN.py
├── ch03 - DicisionTree
│   └── dicisiontree.py
├── ch04 - NaiveBayes
│   ├── bayes.py
│   └── email
├── ch05 - LogisticRegression
│   └── logisticRegression.py
├── ch06 - svm
│   └── svmMLiA.py
├── ch07 - AdaBoosting
│   └── adaboost.py
├── ch08 - LinearRegression
│   └── regression.py
├── ch09 - RegressionTree
│   └── regressionTrees.py
├── ch11 - Apriori
│   └── apriori.py
└── ch12 - FP-growth
    └── fpGrowth.py

/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # Created by .ignore support plugin (hsz.mobi)
62 |
63 | .idea/
64 |
65 | machinelearninginaction/
66 |
67 | ch02 - kNN/testDigits/
68 | ch02 - kNN/trainingDigits/
69 | ch04 - NaiveBayes/email
70 | ch06 - svm/digits
71 |
72 | # test sets
73 | *.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Machine Learning in Action
2 | ==========================
3 |
4 | This is the source code to go with "Machine Learning in Action"
5 | by Peter Harrington, published by Manning Inc.
6 | The official page for this book can be found here: http://manning.com/pharrington/
7 |
8 | All the code examples were written for Python 2.6 and should also run without problems on 2.7. NumPy is required for most examples. If you have trouble running any of the examples, let us know on the forum for this book: http://www.manning-sandbox.com/forum.jspa?forumID=728.
9 |
10 | If you want to run these on some other version of Python, say 3.0 or IronPython, feel free to fork the code.
11 |
12 | ### Download page at 图灵社区 (ituring.com.cn)
13 | [图灵社区:图书:机器学习实战](http://www.ituring.com.cn/book/1021)
14 |
15 | ### About this repo
16 | This repo contains my own rewrite of, and notes on, the official source code.
17 |
18 | ## Chapter overview
19 | ### Part 1 -- Classification
20 | * Chapter 2 - k-nearest neighbors (kNN)
21 | * Chapter 3 - Decision trees
22 | * Chapter 4 - Naive Bayes classifier
23 | * Chapter 5 - Logistic regression
24 | * Chapter 6 - Support vector machines (SVM)
25 | * Chapter 7 - AdaBoost meta-algorithm
26 |
27 | ### Part 2 -- Regression
28 | * Chapter 8 - Linear regression, locally weighted linear regression, and shrinkage methods
29 | * Chapter 9 - Tree-based regression
30 |
31 | ### Part 3 -- Unsupervised learning
32 | * Chapter 10 - Clustering methods
33 | * Chapter 11 - Association analysis with the Apriori algorithm
34 | * Chapter 12 - Association analysis with FP-growth
35 |
--------------------------------------------------------------------------------
/ch02 - kNN/kNN.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding=utf-8
3 |
4 | """
5 | k-近邻算法
6 | ===
7 | 存在训练样本集, 且样本集中每个数据都存在标签, 即已知样本集中每一组数据与所属分类的对应关系.
8 | 9 | 当输入没有标签的新数据后, 将新数据的每个特征与样本集中数据对应的特征进行比较, 10 | 算法提取样本集中特征最相似的k组数据(最近邻)的分类标签, (一般k<20), 11 | 取k个最相似的数据中出现次数最多的分类, 作为新数据的分类. 12 | """ 13 | 14 | from __future__ import print_function 15 | 16 | import os 17 | import operator 18 | 19 | import numpy 20 | import matplotlib 21 | import matplotlib.pyplot as plt 22 | 23 | """ 获取数据源的函数 """ 24 | 25 | 26 | def GetFakeDataset(): 27 | """创建数据集及其分类 28 | 29 | Returns 30 | ------- 31 | numpy.array, labels : 数据集, 数据集元素对应的标签 32 | """ 33 | groups = numpy.array([ 34 | [1.0, 1.1], 35 | [1.0, 1.0], 36 | [0.0, 0.0], 37 | [0.0, 0.1], 38 | ]) 39 | labels = ['A', 'A', 'B', 'B'] 40 | return groups, labels 41 | 42 | 43 | def GetFileDataset(filename): 44 | """把文本中的数据转换为数据集, labels返回 45 | 46 | Parameters 47 | ---------- 48 | filename : string 49 | 文本名 50 | 51 | Returns 52 | ------- 53 | numpy.array, list : 文本中的数据矩阵, 数据对应的标签列表 54 | """ 55 | with open(filename) as infile: 56 | lines = infile.readlines() 57 | numberOflines = len(lines) 58 | dataset = numpy.zeros((numberOflines, 3)) 59 | dataLabels = [] 60 | for index, line in enumerate(lines): 61 | listFromLine = line.strip().split() 62 | dataset[index,:] = listFromLine[0:3] 63 | dataLabels.append(int(listFromLine[-1])) 64 | return dataset, dataLabels 65 | 66 | """ kNN 分类器 """ 67 | 68 | 69 | class KNNModel(object): 70 | """ kNN分类器 """ 71 | 72 | def __init__(self, dataset, labels): 73 | self.dataset = dataset 74 | self.labels = labels 75 | 76 | def predict(self, inX, k): 77 | if k <= 0: 78 | raise ValueError('K > 0') 79 | 80 | m, n = self.dataset.shape 81 | # 利用矩阵运算, 每个dataset的分量都减去inX 82 | diffMat = numpy.tile(inX, (m, 1)) - self.dataset 83 | # 计算欧式距离 sqrt(sum()) 84 | distances = ((diffMat**2).sum(axis=1))**0.5 85 | # 对数据从小到大次序排列,确定前k个距离最小元素所在的主要分类 86 | sortedDistInd = distances.argsort() 87 | classCount = {} 88 | for i in range(k): 89 | voteIlabel = self.labels[sortedDistInd[i]] 90 | classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 91 | # 返回最相近的类 92 | sortedClassCount = sorted( 93 | classCount.items(), key=operator.itemgetter(1), reverse=True 94 | ) 95 | return sortedClassCount[0][0] 96 | 97 | 98 | class KNNModelWithNormalize(object): 99 | """ 带归一化的kNN分类器 """ 100 | 101 | def __init__(self, dataset, labels): 102 | self.normDataset, self.ranges, self.minVals = self.normalize(dataset) 103 | self.labels = labels 104 | 105 | def normalize(self, dataset): 106 | """ 对dataset进行归一化处理, 使得输入的特征权重一致 """ 107 | minVals = dataset.min(0) # 获取每一列的最小值 108 | maxVals = dataset.max(0) # 获取每一列的最大值 109 | ranges = maxVals - minVals # 每一列的范围 110 | m, n = dataset.shape 111 | # 归一化 (Xi - Xmin) / (Xmax - Xmin) 112 | normDataset = (dataset - numpy.tile(minVals, (m, 1))) / numpy.tile(ranges, (m, 1)) 113 | return normDataset, ranges, minVals 114 | 115 | def predict(self, inX, k): 116 | if k <= 0: 117 | raise ValueError('K > 0') 118 | 119 | # 先对输入特征进行归一化处理 120 | inX = (inX - self.minVals) / self.ranges 121 | 122 | datasetSize = self.normDataset.shape[0] 123 | # 利用矩阵运算, 每个 dataset 的分量都减去inX 124 | diffMat = numpy.tile(inX, (self.normDataset.shape[0],1)) - self.normDataset 125 | # 计算欧式距离 sqrt(sum()) 126 | distances = ((diffMat**2).sum(axis=1))**0.5 127 | # 对数据从小到大次序排列,确定前k个距离最小元素所在的主要分类 128 | sortedDistInd = distances.argsort() 129 | classCount={} 130 | for i in range(k): 131 | voteIlabel = self.labels[sortedDistInd[i]] 132 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 133 | # 返回最相近的类 134 | sortedClassCount = sorted( 135 | classCount.items(), key=operator.itemgetter(1), reverse=True 136 | ) 137 | return sortedClassCount[0][0] 
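# --------------------------------------------------------------------------
# (Added usage sketch -- not part of the original file.) A minimal, hedged
# example of how the two classifiers above could be exercised; it assumes the
# helper functions defined in this module and the dating dataset file
# `datingTestSet2.txt` that is referenced further below.
#
#     groups, labels = GetFakeDataset()
#     model = KNNModel(groups, labels)
#     print(model.predict([0.9, 1.0], k=3))          # expected: 'A'
#
#     dataset, labels = GetFileDataset('datingTestSet2.txt')
#     model = KNNModelWithNormalize(dataset, labels)
#     print(model.predict([40000, 8.0, 0.5], k=3))   # one of the labels 1/2/3
# --------------------------------------------------------------------------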
138 | 139 | @classmethod 140 | def test(cls, testfile, k=3, ratio=0.10): 141 | dataset, labels = GetFileDataset(testfile) 142 | m, n = dataset.shape 143 | numTestVectors = int(m * ratio) 144 | numError = 0 145 | 146 | model = cls(dataset[numTestVectors:m, :], labels[numTestVectors:m]) 147 | for i in range(numTestVectors): 148 | result = model.predict(dataset[i, :], k) 149 | if result != labels[i]: 150 | numError += 1 151 | print('× Predict/Real {0}/{1}'.format(result, labels[i])) 152 | else: 153 | print('√ Predict/Real {0}/{1}'.format(result, labels[i])) 154 | print('Total error rate: {0:.1%}'.format(1.0*numError / numTestVectors)) 155 | 156 | 157 | def TestClassifyPerson(dataset_filename): 158 | result2str = { 159 | 1: '完全不感兴趣', 160 | 2: '可能喜欢', 161 | 3: '很有可能喜欢', 162 | } 163 | print('请输入该人的相关信息:') 164 | percentageTimeOfPlayGames = float( 165 | input('消耗在玩游戏上的时间百分比?\n: ') 166 | ) 167 | flyMiles = float( 168 | input('每年搭乘飞机的飞行里程数?\n: ') 169 | ) 170 | iceCream = float( 171 | input('每周消费的冰淇淋公升数?\n: ') 172 | ) 173 | 174 | dataset, labels = GetFileDataset(dataset_filename) 175 | DrawPlot(dataset, labels) 176 | model = KNNModelWithNormalize(dataset, labels) 177 | inVector = numpy.array([flyMiles, percentageTimeOfPlayGames, iceCream]) 178 | classifierResult = model.predict(inVector, k=3) 179 | print( 180 | '预测你对这个人:', result2str[classifierResult] 181 | ) 182 | 183 | 184 | """ 使用 Matplotlib绘制散点图 """ 185 | 186 | 187 | def DrawPlot(dataset, labels): 188 | """绘制散点图 189 | 190 | Parameters 191 | ---------- 192 | dataset : numpy.array 193 | 数据集 194 | labels : list of int 195 | 标签值 196 | """ 197 | fig = plt.figure() 198 | ax = fig.add_subplot(111) 199 | _ = ax.scatter( 200 | dataset[:, 1], dataset[:, 2], 201 | s=15.0*numpy.array(labels), # 大小 202 | c=15.0*numpy.array(labels) # 颜色 203 | ) 204 | plt.show() 205 | 206 | """ 手写识别系统 """ 207 | 208 | 209 | def VectorDebugPrint(vector): 210 | for i in range(32): 211 | print(''.join( 212 | list(map( 213 | lambda x: str(int(x)), 214 | vector[i*32:(i+1)*32] 215 | )) 216 | )) 217 | 218 | 219 | def TranslateImg2Vector(filename): 220 | """ 把'图像文件'转换为1024维的向量 """ 221 | vector = numpy.zeros((1, 1024)) 222 | with open(filename, 'r') as infile: 223 | for lineno, line in enumerate(infile): 224 | for rowno in range(32): 225 | vector[0, 32*lineno+rowno] = int(line[rowno]) 226 | return vector 227 | 228 | 229 | def GetDigitsDatasetFromDir(dirname): 230 | """从文件夹中获取数据集, labels 231 | 232 | Parameters 233 | ---------- 234 | dirname 文件夹名称 235 | 236 | Returns 237 | ------- 238 | numpy.array, labels : 数据集, 数据集元素对应的标签 239 | """ 240 | filenames = os.listdir(dirname) 241 | 242 | labels = [None] * len(filenames) 243 | dataset = numpy.zeros((len(filenames), 1024)) 244 | 245 | for i, filename in enumerate(filenames): 246 | fileclass = filename.split('.')[0].split('_')[0] 247 | filepath = os.path.join(dirname, filename) 248 | dataset[i, :], labels[i] = TranslateImg2Vector(filepath), fileclass 249 | return dataset, labels 250 | 251 | 252 | def TestHandwritingNumber(trainDir, testDir, k=3): 253 | dataset, labels = GetDigitsDatasetFromDir(trainDir) 254 | model = KNNModel(dataset, labels) 255 | 256 | dataset, labels = GetDigitsDatasetFromDir(testDir) 257 | numError = 0 258 | numTestVectors = len(labels) 259 | for testVec, label in zip(dataset, labels): 260 | result = model.predict(testVec, k) 261 | if result != label: 262 | numError += 1 263 | print('× Predict/Real {0}/{1}'.format(result, label)) 264 | else: 265 | print('√ Predict/Real {0}/{1}'.format(result, label)) 266 | print('Total error 
rate: {0:.1%}'.format(1.0*numError / numTestVectors)) 267 | 268 | 269 | if __name__ == '__main__': 270 | dataset, labels = GetFakeDataset() 271 | model = KNNModel(dataset, labels) 272 | inX = [0, 0] 273 | print('{} should be {}'.format(inX, model.predict(inX, k=3))) 274 | 275 | # TestClassifyPerson('datingTestSet2.txt') 276 | 277 | TestHandwritingNumber('trainingDigits', 'testDigits', k=3) 278 | -------------------------------------------------------------------------------- /ch03 - DicisionTree/dicisiontree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | 决策树 6 | === 7 | 决策树的数据形式非常容易理解, 决策树很多任务都是为了数据中所蕴含的知识信息, 8 | 因此决策树可以使用不熟悉的数据集合, 并从中提取出一系列规则, 机器学习算法最终使用机器从数据集中创造的规则. 9 | 专家系统中经常使用决策树. 10 | """ 11 | 12 | from __future__ import print_function 13 | 14 | import math 15 | import operator 16 | import pickle 17 | from collections import defaultdict 18 | 19 | import numpy as np 20 | 21 | import logging 22 | logging.basicConfig( 23 | level=logging.DEBUG, 24 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 25 | ) 26 | 27 | 28 | class Dataset(object): 29 | """ 对数据集以及相关操作的封装 """ 30 | 31 | def __init__(self, rawDataset): 32 | self.rawDataset = np.array(rawDataset) 33 | 34 | @property 35 | def shape(self): 36 | return self.rawDataset.shape 37 | 38 | @property 39 | def classList(self): 40 | return self.rawDataset[:, -1].tolist() 41 | 42 | @property 43 | def shannonEntropy(self): 44 | """获取数据集的香农熵 45 | 熵越大代表混乱程度越高, 即混合的数据越多 46 | 47 | Returns 48 | ------- 49 | float : 数据集的香农熵 50 | """ 51 | # 统计每个 label 出现的次数 52 | labelCounts = defaultdict(int) 53 | for featVec in self.rawDataset: 54 | label = featVec[-1] 55 | labelCounts[label] += 1 56 | # 计算熵 57 | # H = - ∑(n, i=1) ( p(xi) * log(2)(p(xi)) ) 58 | entropy = 0.0 59 | numEntries = len(self.rawDataset) 60 | for label in labelCounts: 61 | probability = 1.0*labelCounts[label] / numEntries 62 | entropy -= probability * math.log(probability, 2) # 底数为2 63 | return entropy 64 | 65 | def split(self, axis): 66 | """ 对数据集按照 axis 指定的特征进行划分 67 | 68 | Parameters 69 | ---------- 70 | axis : int 71 | 指定用于进行划分的特征 72 | 73 | Returns 74 | ------- 75 | 按照指定特征划分后的 (值, 子数据集) 对 76 | """ 77 | subDatasets = defaultdict(list) 78 | for featureVector in self.rawDataset: 79 | value = featureVector[axis] 80 | subFeatureVector = ( 81 | featureVector[:axis].tolist() 82 | + featureVector[axis+1:].tolist() 83 | ) # 去除已经用于划分的特征 84 | subDatasets[value].append(subFeatureVector) 85 | return ( 86 | list(subDatasets.keys()), 87 | list(map(self.__class__, subDatasets.values())), 88 | ) 89 | 90 | def ChooseBestSplitFeature(self): 91 | """通过遍历数据集, 计算香农熵, 选择最好的特征进行数据划分 92 | 93 | Returns 94 | ------- 95 | int : 对数据熵增益最大的划分特征的index 96 | """ 97 | m, n = self.shape 98 | numFeatures = n - 1 99 | baseEntropy = self.shannonEntropy # 当前整个数据集的熵 100 | best = { 101 | 'gain': 0.0, # 记录最好的信息增益 102 | 'feature': -1, # 记录最好的特征index 103 | } 104 | # 按照不同的特征遍历进行划分 105 | for featureIndex in range(numFeatures): 106 | _labels, subDatasets = self.split(featureIndex) 107 | # 计算按照此特征进行划分后的熵 108 | newEntropy = 0.0 109 | for subDataset in subDatasets: 110 | sub_m, _sub_n = subDataset.shape 111 | probability = 1.0*sub_m / m 112 | newEntropy += probability * subDataset.shannonEntropy 113 | # 计算信息增益, 更新最好的特征 114 | infoGain = baseEntropy - newEntropy 115 | if infoGain > best['gain']: 116 | best['gain'] = infoGain 117 | best['feature'] = featureIndex 118 | return best['feature'] 119 | 120 | 121 | def 
GetFakeDataset(): 122 | dataset = [ 123 | [1, 1, 'yes'], 124 | [1, 1, 'yes'], 125 | [1, 0, 'no'], 126 | [0, 1, 'no'], 127 | [0, 1, 'no'], 128 | ] 129 | labels = ['no surfacing', 'flippers'] 130 | # change to discrete values 131 | return Dataset(dataset), labels 132 | 133 | 134 | class DicisionTree(object): 135 | """ 决策树 """ 136 | 137 | def __init__(self, dataset, labels): 138 | if dataset and labels: 139 | self.labels = labels 140 | self.tree = self.BuildTree(Dataset(dataset), self.labels) 141 | 142 | def SaveToFile(self, filename): 143 | with open(filename, 'w') as outfile: 144 | pickle.dump(self, outfile) 145 | 146 | @staticmethod 147 | def LoadFromFile(filename): 148 | with open(filename, 'r') as infile: 149 | tree = pickle.load(infile) 150 | return tree 151 | 152 | @staticmethod 153 | def GetMajorityClass(classList): 154 | classCount = defaultdict(int) 155 | for vote in classList: 156 | classCount[vote] += 1 157 | sortedClassCount = sorted( 158 | classCount.items(), 159 | key=operator.itemgetter(1), 160 | reverse=True 161 | ) 162 | return sortedClassCount[0][0] 163 | 164 | def BuildTree(self, dataset, labels): 165 | labels = labels[:] # 复制防止破坏原来的 labels 列表 166 | 167 | classList = dataset.classList 168 | # 当子集中所有项都为同一 label , 直接返回 169 | if classList.count(classList[0]) == len(classList): 170 | return classList[0] 171 | 172 | # 当所有 feature 都用完, 返回出现次数最多的 173 | _m, n = dataset.shape 174 | if n == 1: 175 | return self.GetMajorityClass(classList) 176 | 177 | # 选择信息增益最大的进行划分 178 | bestFeatureIndex = dataset.ChooseBestSplitFeature() 179 | bestFeatureLabel = labels[bestFeatureIndex] 180 | del(labels[bestFeatureIndex]) 181 | logging.info('Spliting by Feature {0}({1})'.format( 182 | bestFeatureLabel, 183 | bestFeatureIndex 184 | )) 185 | 186 | dicisionTree = { 187 | bestFeatureLabel: {}, 188 | } 189 | 190 | # 对特征下每个值进行递归划分 191 | subLabels, subDatasets = dataset.split(bestFeatureIndex) 192 | logging.info('labels:{0} for Feature {1}'.format(subLabels, bestFeatureLabel)) 193 | for subLabel, subDataset in zip(subLabels, subDatasets): 194 | logging.info('Building subtree of value `{0}`'.format(subLabel)) 195 | dicisionTree[bestFeatureLabel][subLabel] = self.BuildTree( 196 | subDataset, 197 | labels 198 | ) 199 | logging.info('Subtree `{0}` built'.format(subLabel)) 200 | return dicisionTree 201 | 202 | def predict(self, inputVector): 203 | return self.GetClassOfVector(self.tree, self.labels, inputVector) 204 | 205 | def GetClassOfVector(self, dicisionTree, featureLabels, inputVector): 206 | featureLabel = dicisionTree.keys()[0] 207 | subDicisionTree = dicisionTree[featureLabel] 208 | featureIndex = featureLabels.index(featureLabel) 209 | 210 | downKey = inputVector[featureIndex] 211 | downNode = subDicisionTree[downKey] 212 | 213 | if isinstance(downNode, dict): 214 | # 递归在子树中查找所属类别 215 | classLabel = self.GetClassOfVector( 216 | downNode, featureLabels, 217 | inputVector 218 | ) 219 | else: 220 | classLabel = downNode 221 | return classLabel 222 | 223 | @property 224 | def depth(self): 225 | return self.GetTreeDepth(self.tree) 226 | 227 | @classmethod 228 | def GetTreeDepth(cls, tree): 229 | max_depth = 0 230 | featureLabel = tree.keys()[0] 231 | subDicisionTree = tree[featureLabel] 232 | for featureValue in subDicisionTree: 233 | if isinstance(subDicisionTree[featureValue], dict): 234 | depth = 1 + cls.GetTreeDepth(subDicisionTree[featureValue]) 235 | else: 236 | depth = 1 237 | 238 | max_depth = max(depth, max_depth) 239 | return max_depth 240 | 241 | @property 242 | def num_leaves(self): 243 | 
return self.GetNumLeaves(self.tree) 244 | 245 | @classmethod 246 | def GetNumLeaves(cls, tree): 247 | num = 0 248 | featureLabel = tree.keys()[0] 249 | subDicisionTree = tree[featureLabel] 250 | for featureValue in subDicisionTree: 251 | if isinstance(subDicisionTree[featureValue], dict): 252 | num += cls.GetNumLeaves(subDicisionTree[featureValue]) 253 | else: 254 | num += 1 255 | return num 256 | 257 | @property 258 | def feature_label(self): 259 | return self.tree.keys()[0] 260 | 261 | def GetSubTree(self, feature_value): 262 | tree = self.__class__(None, None) 263 | tree.tree = self.tree[self.feature_label][feature_value] 264 | return tree 265 | 266 | @classmethod 267 | def GetRetrieveTree(cls, index): 268 | trees = ( 269 | {'no surfacing': { 270 | 0: 'no', 271 | 1: {'flippers': 272 | {0: 'no', 1: 'yes'}} 273 | }}, 274 | {'no surfacing': { 275 | 0: 'no', 276 | 1: {'flippers': 277 | {0: {'head': 278 | {0: 'no', 1: 'yes'}}, 279 | 1:'no'} 280 | }}}, 281 | ) 282 | tree = cls(None, None) 283 | tree.tree = trees[index] 284 | return tree 285 | 286 | 287 | def LoadLensesData(filename): 288 | with open(filename) as infile: 289 | lensesDataset = [] 290 | for line in infile: 291 | trainVector = line.strip().split('\t') 292 | lensesDataset.append(trainVector) 293 | lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate', ] 294 | lenseTree = DicisionTree(lensesDataset, lensesLabels) 295 | return lenseTree 296 | 297 | 298 | """ 绘制树形图 """ 299 | import matplotlib.pyplot as plt 300 | 301 | 302 | class DicisionTreePlotter(object): 303 | 304 | DECISION_NODE = { 305 | 'boxstyle': 'sawtooth', 306 | 'fc': '0.8', 307 | } 308 | LEAF_NODE = { 309 | 'boxstyle': 'round4', 310 | 'fc': '0.8', 311 | } 312 | ARROW_ARGS = { 313 | 'arrowstyle': '<-', 314 | } 315 | 316 | def __init__(self, tree): 317 | fig = plt.figure(1, facecolor='white') 318 | fig.clf() 319 | self.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[]) 320 | self.width = 1.0*tree.num_leaves 321 | self.depth = 1.0*tree.depth 322 | self.offset = { 323 | 'x': -0.5/self.width, 324 | 'y': 1.0 325 | } 326 | self.plot_tree(tree, (0.5, 1.0), '') 327 | plt.show() 328 | 329 | def plot_mid_text(self, text, centerPoint, parentPoint): 330 | xMid = (parentPoint[0] - centerPoint[0]) / 2.0 + centerPoint[0] 331 | yMid = (parentPoint[1] - centerPoint[1]) / 2.0 + centerPoint[1] 332 | self.ax1.text(xMid, yMid, text) 333 | 334 | def plot_node(self, text, centerPoint, parentPoint, node_type): 335 | self.ax1.annotate( 336 | text, 337 | xy=parentPoint, xycoords='axes fraction', 338 | xytext=centerPoint, textcoords='axes fraction', 339 | va='center', ha='center', 340 | bbox=node_type, arrowprops=DicisionTreePlotter.ARROW_ARGS 341 | ) 342 | 343 | def plot_tree(self, tree, parentPoint, text): 344 | num_leaves = tree.num_leaves 345 | featureLabel = tree.feature_label 346 | centerPoint = ( 347 | self.offset['x'] + (1.0 + num_leaves) / 2.0 / self.width, 348 | self.offset['y'] 349 | ) 350 | self.plot_mid_text(text, centerPoint, parentPoint) 351 | self.plot_node( 352 | featureLabel, 353 | centerPoint, parentPoint, 354 | DicisionTreePlotter.DECISION_NODE 355 | ) 356 | subDicisionTree = tree.tree[featureLabel] 357 | self.offset['y'] -= 1.0/self.depth 358 | for featureValue in subDicisionTree: 359 | if isinstance(subDicisionTree[featureValue], dict): 360 | self.plot_tree( 361 | tree.GetSubTree(featureValue), 362 | centerPoint, 363 | str(featureValue) 364 | ) 365 | else: 366 | self.offset['x'] += 1.0 / self.width 367 | self.plot_node( 368 | subDicisionTree[featureValue], 369 | 
(self.offset['x'], self.offset['y']), 370 | centerPoint, 371 | DicisionTreePlotter.LEAF_NODE 372 | ) 373 | self.plot_mid_text( 374 | str(featureValue), 375 | (self.offset['x'], self.offset['y']), 376 | centerPoint 377 | ) 378 | self.offset['y'] += 1.0 / self.depth 379 | 380 | 381 | if __name__ == '__main__': 382 | tree = LoadLensesData('lenses.txt') 383 | print(tree.depth) 384 | t = DicisionTree.GetRetrieveTree(0) 385 | print(t.depth, t.num_leaves) 386 | plotter = DicisionTreePlotter(t) 387 | -------------------------------------------------------------------------------- /ch04 - NaiveBayes/bayes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | 朴素贝叶斯 6 | === 7 | 朴素贝叶斯是贝叶斯决策理论的一部分. 8 | ## 贝叶斯决策 9 | * 核心思想 -> 选择具有最高概率的决策 10 | 11 | ## 朴素贝叶斯分类器 12 | 朴素贝叶斯分类器是用于文档分类的常用算法 13 | * 把每个次的出现或者不出现作为一个特征 14 | * 假设特征之间相互独立, 即一个单词出现的可能性和其他相邻单词没有关系 15 | * 每个特征同等重要 16 | 17 | """ 18 | 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | from numpy import random 23 | import logging 24 | logging.basicConfig( 25 | level=logging.DEBUG, 26 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 27 | ) 28 | 29 | 30 | def getFakeDataset(): 31 | posts = [ 32 | ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 33 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 34 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 35 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 36 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 37 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 38 | ] 39 | classes = [0, 1, 0, 1, 0, 1] # 1表示为侮辱性句子, 0为普通句子 40 | return posts, classes 41 | 42 | 43 | def getVocabulary(dataset): 44 | """创建一个包含所有在文档中出现的不重复词的词典 45 | 46 | Parameters 47 | ---------- 48 | dataset : list of documents 49 | 50 | Returns 51 | ------- 52 | list : 词典 53 | """ 54 | vocabulary = set([]) 55 | for document in dataset: 56 | vocabulary |= set(document) # 取并集 57 | return list(vocabulary) 58 | 59 | 60 | def getSetOfWords2Vec(vocabulary, inputSet): 61 | """`词集模型` 62 | 词汇表中的单词在输入文档中是否出现 63 | 64 | Parameters 65 | ---------- 66 | vocabulary : 词典 67 | inputSet : 文档 68 | 69 | Returns 70 | ------- 71 | list : 文档向量, 0/1表示词汇表中的单词在输入文档中是否出现 72 | """ 73 | appearVector = [0]*len(vocabulary) 74 | for word in inputSet: 75 | if word in vocabulary: 76 | appearVector[vocabulary.index(word)] = 1 77 | return appearVector 78 | 79 | 80 | def getBagOfWords2Vec(vocabulary, inputSet): 81 | """`词袋模型` 82 | 词汇表中的单词在输入文档中出现的次数 83 | 84 | Parameters 85 | ---------- 86 | vocabulary : 词典 87 | inputSet : 文档 88 | 89 | Returns 90 | ------- 91 | list : 文档向量, 每一维表示词汇表中的单词在输入文档中出现的次数 92 | """ 93 | appearCountVector = [0]*len(vocabulary) 94 | for word in inputSet: 95 | if word in vocabulary: 96 | appearCountVector[vocabulary.index(word)] += 1 97 | return appearCountVector 98 | 99 | 100 | class NaiveBayesModel(object): 101 | """ 朴素贝叶斯模型 """ 102 | 103 | def __init__(self, matrix, categories): 104 | self.trainMatrix = np.array(matrix) 105 | self.trainCategory = np.array(categories) 106 | 107 | # m 为有多少个样例, n 为每个样例的词向量长度 108 | m, n = self.trainMatrix.shape 109 | # 样例中为 class1 的概率 110 | self.pClass1 = 1.0*sum(self.trainCategory) / m 111 | # 防止概率相乘为 0, 把所有词出现次数初始化为 1, 总词数初始化为 2 112 | wordsCountVector = { 113 | 'class0': np.ones(n), # 属于 Class0 的各个词数 114 | 'class1': np.ones(n), # 属于 Class1 的各个词数 115 | } 116 | for rowno in range(m): 117 | if self.trainCategory[rowno] == 0: 
118 | wordsCountVector['class0'] += self.trainMatrix[rowno] 119 | else: 120 | wordsCountVector['class1'] += self.trainMatrix[rowno] 121 | # 防止太多小的浮点数相乘导致下溢出, 对乘积取自然对数 122 | self.pWordsVector = { 123 | 'class0': np.log( 124 | wordsCountVector['class0'] 125 | / (1 + wordsCountVector['class0'].sum()) 126 | ), 127 | 'class1': np.log( 128 | wordsCountVector['class1'] 129 | / (1 + wordsCountVector['class1'].sum()) 130 | ), 131 | } 132 | 133 | def predict(self, inputVector): 134 | inputVector = np.array(inputVector) 135 | p0 = ( 136 | sum(inputVector * self.pWordsVector['class0']) 137 | + np.log(1.0 - self.pClass1) 138 | ) 139 | p1 = ( 140 | sum(inputVector * self.pWordsVector['class1']) 141 | + np.log(self.pClass1) 142 | ) 143 | # print('{:.3f}/{:.3f} for Class {}/{}'.format( 144 | # np.exp(p0)*100, np.exp(p1)*100, 0, 1 145 | # )) 146 | if p1 > p0: 147 | return 1 148 | else: 149 | return 0 150 | 151 | 152 | def testingNaiveBayes(): 153 | postsToken, postsClass = getFakeDataset() 154 | vocabulary = getVocabulary(postsToken) 155 | trainMatrix = [ 156 | getSetOfWords2Vec(vocabulary, post) for post in postsToken 157 | ] 158 | model = NaiveBayesModel(trainMatrix, postsClass) 159 | 160 | testEntry = ['love', 'my', 'dalmation'] 161 | testPost = getSetOfWords2Vec(vocabulary, testEntry) 162 | print(testEntry, 'classified as: ', model.predict(testPost)) 163 | 164 | testEntry = ['stupid', 'garbage'] 165 | testPost = getSetOfWords2Vec(vocabulary, testEntry) 166 | print(testEntry, 'classified as: ', model.predict(testPost)) 167 | 168 | """ 使用朴素贝叶斯对电子邮件进行分类 """ 169 | 170 | 171 | def getContentTokens(content): 172 | """ 简单切分英语文本 """ 173 | import re 174 | tokens = re.split(r'\W*', content) 175 | return [token.lower() for token in tokens if len(token) > 2] 176 | 177 | 178 | def testNaiveBayesToSpamEmail(): 179 | """ 使用朴素贝叶斯进行电子邮件分类的测试 """ 180 | emails = [] 181 | emails_class = [] 182 | 183 | for i in range(1, 26): 184 | # 垃圾邮件样本 185 | words = getContentTokens(open('email/spam/%d.txt' % i).read()) 186 | emails.append(words) 187 | emails_class.append(1) 188 | # 正常邮件样本 189 | words = getContentTokens(open('email/ham/%d.txt' % i).read()) 190 | emails.append(words) 191 | emails_class.append(0) 192 | 193 | # `留存交叉验证` -- 随机选择数据一部分作为训练集, 剩余部分作为测试集 194 | # 生成测试集, 训练集 195 | random_order = random.permutation(50) 196 | testIndexs, trainIndexs = random_order[:10], random_order[10:] 197 | 198 | # 生成词典 199 | vocabulary = getVocabulary(emails) 200 | # 训练朴素贝叶斯分类器 201 | trainMatrix = [] 202 | trainCategories = [] 203 | for docIndex in trainIndexs: 204 | trainMatrix.append( 205 | getBagOfWords2Vec(vocabulary, emails[docIndex]) # 使用词袋模型 206 | ) 207 | trainCategories.append(emails_class[docIndex]) 208 | logging.info('Train dataset is ready.') 209 | model = NaiveBayesModel(trainMatrix, trainCategories) 210 | logging.info('NaiveBayes model is trained.') 211 | 212 | # 进行分类测试 213 | errorCount = 0 214 | for docIndex in testIndexs: 215 | wordVector = getBagOfWords2Vec(vocabulary, emails[docIndex]) 216 | result = model.predict(wordVector) 217 | if result != emails_class[docIndex]: 218 | errorCount += 1 219 | logging.warning('classification error. 
Predict/Actual: {}/{}\n{}'.format( 220 | result, 221 | emails_class[docIndex], 222 | ' '.join(emails[docIndex]) 223 | )) 224 | logging.info('the error rate is: {:.2%}'.format(1.0*errorCount/len(testIndexs))) 225 | 226 | """ 使用朴素贝叶斯分类器从个人广告中获取区域倾向 """ 227 | 228 | 229 | def calcMostFreq(vocabulary, fullText, topN): 230 | import operator 231 | wordFrequence = {} 232 | for word in vocabulary: 233 | wordFrequence[word] = fullText.count(word) 234 | sortedFrequence = sorted( 235 | wordFrequence.items(), 236 | key=lambda x: x[1], 237 | reverse=True 238 | ) 239 | return sortedFrequence[:topN] 240 | 241 | 242 | def getLocalWords(feed1, feed0): 243 | summaries = [] 244 | summaries_class = [] 245 | fullText = [] 246 | minLen = min( 247 | len(feed1['entries']), 248 | len(feed0['entries']) 249 | ) 250 | for i in range(minLen): 251 | # 第一个feed, 例子中为New York 252 | wordList = getContentTokens(feed1['entries'][i]['summary']) 253 | summaries.append(wordList) 254 | fullText.extend(wordList) 255 | summaries_class.append(1) 256 | # 第二个feed 257 | wordList = getContentTokens(feed0['entries'][i]['summary']) 258 | summaries.append(wordList) 259 | fullText.extend(wordList) 260 | summaries_class.append(0) 261 | vocabulary = getVocabulary(summaries) 262 | 263 | # `停用词表` -- 语言中作为冗余/结构辅助性内容的词语表 264 | # 多语言停用词表例子 www.ranks.nl/resources/stopwords.html 265 | # 去除出现次数最多的N个词 266 | topN = 30 267 | topNWords = calcMostFreq(vocabulary, fullText, topN) 268 | for word, _count in topNWords: 269 | if word in vocabulary: 270 | vocabulary.remove(word) 271 | 272 | # 生成测试集, 训练集 273 | random_order = random.permutation(2*minLen) 274 | testIndexs, trainIndexs = random_order[:20], random_order[20:] 275 | 276 | # 训练朴素贝叶斯分类器 277 | trainMatrix = [] 278 | trainCategories = [] 279 | for docIndex in trainIndexs: 280 | trainMatrix.append(getBagOfWords2Vec(vocabulary, summaries[docIndex])) 281 | trainCategories.append(summaries_class[docIndex]) 282 | model = NaiveBayesModel(trainMatrix, trainCategories) 283 | 284 | # 进行分类测试 285 | errorCount = 0 286 | for docIndex in testIndexs: 287 | wordVector = getBagOfWords2Vec(vocabulary, summaries[docIndex]) 288 | result = model.predict(wordVector) 289 | if result != summaries_class[docIndex]: 290 | errorCount += 1 291 | logging.warning('[classification error] Predict/Actual: {}/{}\n{}'.format( 292 | result, 293 | summaries_class[docIndex], 294 | ' '.join(summaries[docIndex]) 295 | )) 296 | logging.info('[error rate] {:.2%}'.format(1.0*errorCount/len(testIndexs))) 297 | return vocabulary, model.pWordsVector 298 | 299 | 300 | def getTopWords(ny, sf): 301 | vocabulary, pWordsVector = getLocalWords(ny, sf) 302 | top = {'NY': [], 'SF': [], } 303 | 304 | THRESHOLD = -6.0 305 | for i in range(len(pWordsVector['class0'])): 306 | if pWordsVector['class0'][i] > THRESHOLD: 307 | top['NY'].append((vocabulary[i], pWordsVector['class0'][i])) 308 | if pWordsVector['class1'][i] > THRESHOLD: 309 | top['SF'].append((vocabulary[i], pWordsVector['class1'][i])) 310 | import pprint 311 | sortedWords = { 312 | 'SF': list(map( 313 | lambda x: x[0], 314 | sorted(top['SF'], key=lambda pair: pair[1], reverse=True) 315 | )), 316 | 'NY': list(map( 317 | lambda x: x[0], 318 | sorted(top['NY'], key=lambda pair: pair[1], reverse=True) 319 | )), 320 | } 321 | print('=====>> SF <<=====') 322 | pprint.pprint(sortedWords['SF']) 323 | print('=====>> NY <<=====') 324 | pprint.pprint(sortedWords['NY']) 325 | 326 | 327 | if __name__ == '__main__': 328 | testingNaiveBayes() 329 | testNaiveBayesToSpamEmail() 330 | 331 | import feedparser 332 | ny = 
feedparser.parse('http://newyork.craigslist.org/search/stp?format=rss') 333 | sf = feedparser.parse('http://sfbay.craigslist.org/search/stp?format=rss') 334 | getTopWords(ny, sf) 335 | -------------------------------------------------------------------------------- /ch04 - NaiveBayes/email: -------------------------------------------------------------------------------- 1 | ../machinelearninginaction/Ch04/email -------------------------------------------------------------------------------- /ch05 - LogisticRegression/logisticRegression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | Logistic回归 6 | === 7 | 根据现有数据对分类边界线建立回归公式, 以此进行分类. 8 | "回归"来源于最佳拟合, 训练分类器即使用最优化算法寻找最佳拟合参数. 9 | """ 10 | 11 | from __future__ import print_function 12 | 13 | import numpy 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def getDataset(filename='testSet.txt'): 18 | dataset = [] 19 | labels = [] 20 | with open(filename) as infile: 21 | for line in infile: 22 | datas = line.strip().split() 23 | dataset.append([ 24 | 1.0, 25 | float(datas[0]), 26 | float(datas[1]), 27 | ]) 28 | labels.append(int(datas[2])) 29 | return numpy.array(dataset), labels 30 | 31 | 32 | def sigmoid(inX): 33 | """ 海维赛德阶跃函数 """ 34 | return 1.0 / (1 + numpy.exp(-inX)) 35 | 36 | """ 37 | 梯度上升算法 38 | 沿着函数f的梯度方向, 寻找f的最大值 39 | 因为梯度算子总是指向函数值增长最快的方向 40 | """ 41 | 42 | 43 | def getGradientAsecent(dataset, labels): 44 | """使用梯度上升算法计算最佳回归系数 45 | 46 | Parameters 47 | ---------- 48 | dataset 49 | labels 50 | 51 | Returns 52 | ------- 53 | list of floats : 回归系数 54 | """ 55 | dataset = numpy.mat(dataset) 56 | labels = numpy.mat(labels).T 57 | m, n = dataset.shape 58 | alpha = 0.001 # 向目标移动的步长 59 | numCycles = 500 # 迭代次数 60 | weights = numpy.ones((n, 1)) 61 | # 计算真实类别与预测类别的差值, 按照该差值的方向调整回归系数 62 | # FIXME: 这里使用了大量的矩阵运算, 导致计算效率低下 63 | for k in range(numCycles): 64 | h = sigmoid(dataset * weights) # 矩阵相乘 65 | error = labels - h # 向量相减 66 | weights += alpha * dataset.T * error # 矩阵相乘 67 | return weights.T[0] 68 | 69 | 70 | def getStochasticGradientAsecent_0(dataset, labels): 71 | """使用随机梯度上升算法计算最佳回归系数 72 | 一次只用一个样本点来更新回归系数, 能对数据进行增量更新, 是一个"在线学习"算法 73 | 但因为数据集可能不是线性可分, 在迭代的时候可能导致回归系数抖动, 收敛速度慢 74 | 75 | Parameters 76 | ---------- 77 | dataset 78 | labels 79 | 80 | Returns 81 | ------- 82 | list of floats : 回归系数 83 | """ 84 | m, n = dataset.shape 85 | alpha = 0.01 86 | weights = numpy.ones(n) 87 | for i in range(m): 88 | h = sigmoid(sum(dataset[i]*weights)) 89 | error = labels[i] - h 90 | weights += alpha * error * dataset[i] 91 | return weights 92 | 93 | 94 | def getStochasticGradientAsecent_1( 95 | dataset, labels, numIter=150): 96 | """使用改进的随机梯度上升算法计算最佳回归系数 97 | 步长alpha每次都会调整 98 | 通过随机选取样本来更新回归系数, 减少周期型抖动, 增加收敛速度 99 | 100 | Parameters 101 | ---------- 102 | dataset 103 | labels 104 | numIter : int default 150 105 | 迭代次数 106 | 107 | Returns 108 | ------- 109 | list of floats : 回归系数 110 | """ 111 | m, n = dataset.shape 112 | weights = numpy.ones(n) 113 | for j in range(numIter): 114 | dataIndex = range(m) 115 | for i in range(m): 116 | # 步长每次迭代都会减少 1/(j+i) 117 | # j 为迭代次数, i 为样本点的下标 118 | alpha = 4/(1.0+j+i) + 0.0001 # 常数使得 alpha 永远不会减少到 0 119 | # 通过随机选择来更新回归系数 120 | randIndex = int(numpy.random.uniform(0, len(dataIndex))) 121 | h = sigmoid(sum(dataset[randIndex]*weights)) 122 | error = labels[randIndex] - h 123 | weights += alpha * error * dataset[randIndex] 124 | del(dataIndex[randIndex]) 125 | return weights 126 | 127 | 128 | def plotBestFit(dataset, labels, 
weights): 129 | """绘制数据分界线 130 | 131 | Parameters 132 | ---------- 133 | weights : list of floats 134 | 系数 135 | 136 | """ 137 | m, _n = dataset.shape 138 | # 收集绘制的数据 139 | cord = { 140 | '1': { 141 | 'x': [], 142 | 'y': [], 143 | }, 144 | '2': { 145 | 'x': [], 146 | 'y': [], 147 | }, 148 | } 149 | for i in range(m): 150 | if labels[i] == 1: 151 | cord['1']['x'].append(dataset[i, 1]) 152 | cord['1']['y'].append(dataset[i, 2]) 153 | else: 154 | cord['2']['x'].append(dataset[i, 1]) 155 | cord['2']['y'].append(dataset[i, 2]) 156 | # 绘制图形 157 | figure = plt.figure() 158 | subplot = figure.add_subplot(111) 159 | # 绘制散点 160 | subplot.scatter( 161 | cord['1']['x'], cord['1']['y'], 162 | s=30, c='red', marker='s' 163 | ) 164 | subplot.scatter( 165 | cord['2']['x'], cord['2']['y'], 166 | s=30, c='green' 167 | ) 168 | # 绘制直线 169 | x = numpy.arange(-3.0, 3.0, 0.1) 170 | y = (-weights[0] - weights[1] * x)/ weights[2] 171 | subplot.plot(x, y) 172 | # 标签 173 | plt.xlabel('X1') 174 | plt.ylabel('X2') 175 | plt.show() 176 | 177 | """ 利用logistic回归来进行分类 -- 从疝气病症状预测病马的死亡率 """ 178 | 179 | 180 | def predict(inX, weights): 181 | probability = sigmoid(sum(numpy.array(inX)*weights)) 182 | if probability > 0.5: 183 | return 1 184 | else: 185 | return 0 186 | 187 | 188 | def loadDatasetFromFile(filename): 189 | dataset = [] 190 | labels = [] 191 | with open(filename) as infile: 192 | for line in infile: 193 | datas = line.strip().split('\t') 194 | row = list(map(lambda x: float(x), datas[:21])) 195 | dataset.append(row) 196 | labels.append(float(datas[21])) 197 | return numpy.array(dataset), numpy.array(labels) 198 | 199 | 200 | def testColicPredict(num_iter=1000): 201 | """在马的疝气病数据上训练 logistic 回归模型 202 | 203 | Parameters 204 | ---------- 205 | num_iter 206 | 207 | Returns 208 | ------- 209 | 210 | """ 211 | # 训练模型 212 | train = {} 213 | train['dataset'], train['labels'] = loadDatasetFromFile( 214 | 'horseColicTraining.txt' 215 | ) 216 | train['weights'] = getStochasticGradientAsecent_1( 217 | train['dataset'], 218 | train['labels'], 219 | numIter=num_iter 220 | ) 221 | # 测试 222 | errorCount = 0 223 | test = {} 224 | test['dataset'], test['labels'] = loadDatasetFromFile( 225 | 'horseColicTest.txt' 226 | ) 227 | m, _n = test['dataset'].shape 228 | for rowno, row in enumerate(test['dataset']): 229 | if predict(row, train['weights']) != test['labels'][rowno]: 230 | errorCount += 1 231 | errorRate = 1.0*errorCount / m 232 | print("Error rate: {:.4f}".format(errorRate)) 233 | return errorRate 234 | 235 | 236 | def multiTestColicPredict(numTests=10): 237 | errorSum = 0.0 238 | # 多次运行结果可能不同, 因为使用随机选取的向量来更新回归系数 239 | for k in range(numTests): 240 | errorSum += testColicPredict() 241 | print('after %d iterations the average error rate is: %f' 242 | % (numTests, errorSum/float(numTests)) 243 | ) 244 | 245 | if __name__ == '__main__': 246 | dataset, labels = getDataset() 247 | weights = { 248 | 0: getGradientAsecent(dataset, labels), 249 | 1: getStochasticGradientAsecent_0(dataset, labels), 250 | 2: getStochasticGradientAsecent_1(dataset, labels), 251 | } 252 | # plotBestFit(dataset, labels, weights[0]) 253 | # plotBestFit(dataset, labels, weights[1]) 254 | # plotBestFit(dataset, labels, weights[2]) 255 | 256 | multiTestColicPredict(10) 257 | -------------------------------------------------------------------------------- /ch06 - svm/svmMLiA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | SVM - 支持向量机 6 | === 7 | 介绍的是SVM的其中一种实现 
-- 序列最小化(SMO, Sequential Minimal Optimization)算法 8 | `分隔超平面` -- 将数据集分隔开来的超平面, 也就是分类的决策边界. 9 | `间隔` -- 找到离分隔超平面最近的点, 确保他们离分隔面的距离尽可能远, 这其中点到分隔面的距离就是间隔. 10 | 我们希望间隔尽可能地大, 以保证分类器尽可能健壮 11 | `支持向量` -- 离分隔超平面最近的那些点 12 | """ 13 | 14 | from __future__ import print_function 15 | 16 | import logging 17 | 18 | import numpy 19 | 20 | logging.basicConfig( 21 | # level=logging.DEBUG, 22 | level=logging.INFO, 23 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 24 | ) 25 | 26 | 27 | def load_dataset(filename): 28 | dataset = [] 29 | labels = [] 30 | with open(filename) as infile: 31 | for line in infile: 32 | datas = line.strip().split('\t') 33 | dataset.append([float(datas[0]), float(datas[1])]) 34 | labels.append(float(datas[2])) 35 | return dataset, labels 36 | 37 | 38 | def random_select_j(i, m): 39 | """ 返回任一 [0, m) 之间且不等于 i 的数 """ 40 | j = i 41 | while j == i: 42 | j = int(numpy.random.uniform(0, m)) 43 | return j 44 | 45 | 46 | def adjust_alpha(aj, upper_bound, lower_bound): 47 | if aj > upper_bound: 48 | aj = upper_bound 49 | if lower_bound > aj: 50 | aj = lower_bound 51 | return aj 52 | 53 | 54 | def estimate(alphas, labels, dataset, index, b): 55 | fx = float( 56 | numpy.multiply(alphas, labels).T 57 | * (dataset*dataset[index, :].T) 58 | ) + b 59 | e = fx - float(labels[index]) 60 | return e 61 | 62 | 63 | def smo_simple(dataset, labels, constant, toler, max_iter): 64 | """ 65 | Platt的SMO算法简化版. 66 | = = = = 67 | 每次循环中选择两个alpha进行优化处理.一旦找到一堆合适的alpha, 68 | 那么就增大其中一个同时减少另外一个. 69 | * 两个alpha必须在间隔边界之外 70 | * 两个alpha还没有进行过区间化处理或者不在边界上 71 | 72 | Parameters 73 | ---------- 74 | dataset 75 | 数据集 76 | labels 77 | 类型标签 78 | constant 79 | 常数, 用于控制"最大化间隔"和"保证大部分点的函数间隔小于1.0" 80 | toler 81 | 容错率 82 | max_iter 83 | 最大循环次数 84 | 85 | Returns 86 | ------- 87 | 88 | """ 89 | dataset = numpy.mat(dataset) 90 | labels = numpy.mat(labels).T 91 | b = 0 92 | m, n = dataset.shape 93 | # 初始化alpha向量 94 | alphas = numpy.mat(numpy.zeros((m, 1))) 95 | num_iter = 0 96 | while num_iter < max_iter: 97 | # 对数据集中每个数据向量 98 | num_alpha_pairs_changed = False # alpha 是否已经优化 99 | for i in range(m): 100 | # 计算 alpha[i] 的预测值, 估算其是否可以被优化 101 | Ei = estimate(alphas, labels, dataset, i, b) 102 | # 测试正/负间隔距离, alpha值, 是否满足KKT条件 103 | if not ((labels[i] * Ei < -toler and alphas[i] < constant) 104 | or (labels[i] * Ei > toler and alphas[i] > 0)): 105 | logging.debug('alpha[{0}]不需要调整.'.format(i)) 106 | continue 107 | 108 | # 选择第二个 alpha[j] 109 | j = random_select_j(i, m) 110 | # alpha[j] 的预测值 111 | Ej = estimate(alphas, labels, dataset, j, b) 112 | 113 | # 保存旧值以便与调整后比较 114 | alphaI_old = alphas[i].copy() 115 | alphaJ_old = alphas[j].copy() 116 | 117 | # 计算 lower_bound/upper_bound, 调整 alpha[j] 至 (0, C) 之间 118 | if labels[i] != labels[j]: 119 | lower_bound = max(0, alphas[j] - alphas[i]) 120 | upper_bound = min(constant, constant + alphas[j] - alphas[i]) 121 | else: 122 | lower_bound = max(0, alphas[j] + alphas[i] - constant) 123 | upper_bound = min(constant, alphas[j] + alphas[i]) 124 | if lower_bound == upper_bound: 125 | logging.debug('lower_bound == upper_bound == {0}'.format(lower_bound)) 126 | continue 127 | 128 | # 计算 alpha[j] 的最优修改量 129 | delta = ( 130 | 2.0 * dataset[i, :] * dataset[j, :].T 131 | - dataset[i, :] * dataset[i, :].T 132 | - dataset[j, :] * dataset[j, :].T 133 | ) 134 | # 如果 delta==0, 则需要退出for循环的当前迭代过程. 
135 | # 简化版中不处理这种少量出现的特殊情况 136 | if delta >= 0: 137 | logging.warning('{0}(delta) >= 0'.format(delta)) 138 | continue 139 | 140 | # 计算新的 alpha[j] 141 | alphas[j] -= labels[j] * (Ei - Ej) / delta 142 | alphas[j] = adjust_alpha(alphas[j], upper_bound, lower_bound) 143 | # 若 alpha[j] 的改变量太少, 不采用 144 | delta_j = abs(alphas[j] - alphaJ_old) 145 | if delta_j < 0.00001: 146 | logging.debug('j 变化量太少, 不采用. ({0})'.format(delta_j)) 147 | continue 148 | 149 | # 对 alpha[i] 做 alpha[j] 同样大小, 方向相反的改变 150 | alphas[i] += labels[j] * labels[i] * (alphaJ_old - alphas[j]) 151 | 152 | # 给两个 alpha 值设置常量 b 153 | b1 = ( 154 | b - Ei 155 | - labels[i] * (alphas[i] - alphaI_old) * dataset[i, :] * dataset[i, :].T 156 | - labels[j] * (alphas[j] - alphaJ_old) * dataset[i, :] * dataset[j, :].T 157 | ) 158 | b2 = ( 159 | b - Ej 160 | - labels[i] * (alphas[i] - alphaI_old) * dataset[i, :] * dataset[j, :].T 161 | - labels[j] * (alphas[j] - alphaJ_old) * dataset[j, :] * dataset[j, :].T 162 | ) 163 | if 0 < alphas[i] < constant: 164 | b = b1 165 | elif 0 < alphas[j] < constant: 166 | b = b2 167 | else: 168 | b = (b1 + b2) / 2.0 169 | 170 | num_alpha_pairs_changed = True 171 | logging.debug('numIter: {:d} i:{:d}, pairs changed {}'.format( 172 | num_iter, i, num_alpha_pairs_changed 173 | )) 174 | if num_alpha_pairs_changed == 0: 175 | num_iter += 1 176 | else: 177 | num_iter = 0 178 | logging.debug('iteration number: {0}'.format(num_iter)) 179 | return b, alphas 180 | 181 | 182 | def kernelTrans(X, A, kernel_info): 183 | """calc the kernel or transform data to a higher dimensional space 184 | `核函数` -- 185 | 186 | Parameters 187 | ---------- 188 | X 189 | A 190 | kernel_info : tuple 191 | 包含核函数信息的元组 192 | 193 | Returns 194 | ------- 195 | 196 | """ 197 | m, n = numpy.shape(X) 198 | K = numpy.mat(numpy.zeros((m, 1))) 199 | if kernel_info[0] == 'lin': 200 | K = X * A.T # linear kernel 201 | elif kernel_info[0] == 'rbf': # radial bias function 202 | for j in range(m): 203 | deltaRow = X[j, :] - A 204 | K[j] = deltaRow*deltaRow.T 205 | # divide in NumPy is element-wise not matrix like Matlab 206 | K = numpy.exp(K / (-1 * kernel_info[1] ** 2)) 207 | else: 208 | raise NameError('未定义的核函数') 209 | return K 210 | 211 | 212 | class Options(object): 213 | def __init__(self, dataset, labels, constant, toler, kernel_info): 214 | self.X = dataset 215 | self.labels = labels 216 | self.constant = constant 217 | self.toler = toler 218 | self.m, self.n = dataset.shape 219 | self.alphas = numpy.mat(numpy.zeros((self.m, 1))) 220 | self.b = 0 221 | # eCache第一列表示该cache值是否有效 222 | self.eCache = numpy.mat(numpy.zeros((self.m, 2))) 223 | self.K = numpy.mat(numpy.zeros((self.m, self.m))) 224 | for i in range(self.m): 225 | self.K[:, i] = kernelTrans(self.X, self.X[i, :], kernel_info) 226 | 227 | def updateEk(self, k): 228 | Ek = self.calc_estimate(k) 229 | self.eCache[k] = [1, Ek] 230 | 231 | def calc_estimate(self, index): 232 | fx = float( 233 | numpy.multiply(self.alphas, self.labels).T * self.K[:, index] 234 | + self.b 235 | ) 236 | e = fx - float(self.labels[index]) 237 | return e 238 | 239 | def select_j(self, i, Ei): 240 | maxK = -1 241 | max_deltaE = 0 242 | Ej = 0 243 | self.eCache[i] = [1, Ei] # 设置第i个eCache缓存值 244 | validECaches = numpy.nonzero(self.eCache[:, 0].A)[0] 245 | if len(validECaches) > 1: 246 | # 在有效的缓存值中寻找deltaE最大的 247 | for k in validECaches: 248 | if k == i: 249 | continue 250 | Ek = self.calc_estimate(k) 251 | deltaE = abs(Ei - Ek) 252 | if deltaE > max_deltaE: 253 | maxK = k 254 | max_deltaE = deltaE 255 | Ej = Ek 256 | return 
maxK, Ej 257 | else: 258 | # 没有任何有效的eCache缓存值 (如第一轮中) 259 | j = random_select_j(i, self.m) 260 | Ej = self.calc_estimate(j) 261 | return j, Ej 262 | 263 | 264 | def inner_loop(i, options): 265 | # 计算 alpha[i] 的预测值, 估算其是否可以被优化 266 | Ei = options.calc_estimate(i) 267 | # 测试正/负间隔距离, alpha值, 是否满足KKT条件 268 | if not (((options.labels[i] * Ei < -options.toler) and (options.alphas[i] < options.constant)) 269 | or ((options.labels[i] * Ei > options.toler) and (options.alphas[i] > 0))): 270 | logging.debug('alpha[{0}]不需要调整.'.format(i)) 271 | return 0 272 | 273 | # 选择第二个 alpha[j], 并计算 alpha[j] 的预测值 274 | j, Ej = options.select_j(i, Ei) 275 | 276 | # 保存旧值以便与调整后比较 277 | alphaI_old = options.alphas[i].copy() 278 | alphaJ_old = options.alphas[j].copy() 279 | 280 | # 计算 lower_bound/upper_bound, 调整 alpha[j] 至 (0, C) 之间 281 | if options.labels[i] != options.labels[j]: 282 | lower_bound = max(0, options.alphas[j] - options.alphas[i]) 283 | upper_bound = min( 284 | options.constant, 285 | options.constant + options.alphas[j] - options.alphas[i] 286 | ) 287 | else: 288 | lower_bound = max(0, options.alphas[j] + options.alphas[i] - options.constant) 289 | upper_bound = min(options.constant, options.alphas[j] + options.alphas[i]) 290 | if lower_bound == upper_bound: 291 | logging.debug('lower_bound == upper_bound == {0}'.format(lower_bound)) 292 | return 0 293 | 294 | # 计算 alpha[j] 的最优修改量 295 | delta = 2.0 * options.K[i, j] - options.K[i, i] - options.K[j, j] 296 | if delta >= 0: 297 | logging.warning('{0}(delta) >= 0'.format(delta)) 298 | return 0 299 | 300 | # 计算新的 alpha[j] 301 | options.alphas[j] -= options.labels[j] * (Ei - Ej) / delta 302 | options.alphas[j] = adjust_alpha(options.alphas[j], upper_bound, lower_bound) 303 | options.updateEk(j) # 更新缓存中Ej的值 304 | # 若 alpha[j] 的改变量太少, 不采用 305 | delta_j = abs(options.alphas[j] - alphaJ_old) 306 | if delta_j < 0.00001: 307 | logging.debug('j 变化量太少, 不采用. 
({0})'.format(delta_j)) 308 | return 0 309 | 310 | # 对 alpha[i] 做 alpha[j] 同样大小, 方向相反的改变 311 | options.alphas[i] += options.labels[j] * options.labels[i] * (alphaJ_old - options.alphas[j]) 312 | options.updateEk(i) # 更新缓存中Ei的值 313 | # 给两个 alpha 值设置常量 b 314 | b1 = ( 315 | options.b - Ei 316 | - options.labels[i] * (options.alphas[i] - alphaI_old) * options.K[i, i] 317 | - options.labels[j] * (options.alphas[j] - alphaJ_old) * options.K[i, j] 318 | ) 319 | b2 = ( 320 | options.b - Ej 321 | - options.labels[i] * (options.alphas[i] - alphaI_old) * options.K[i, j] 322 | - options.labels[j] * (options.alphas[j] - alphaJ_old) * options.K[j, j] 323 | ) 324 | if 0 < options.alphas[i] < options.constant: 325 | options.b = b1 326 | elif 0 < options.alphas[j] < options.constant: 327 | options.b = b2 328 | else: 329 | options.b = (b1 + b2) / 2.0 330 | return 1 331 | 332 | 333 | def smoP(dataset, labels, constant, toler, max_iter, kernel_info=('lin', 0)): 334 | options = Options( 335 | numpy.mat(dataset), 336 | numpy.mat(labels).T, 337 | constant, toler, kernel_info 338 | ) 339 | num_iter = 0 340 | scan_entire_set = True 341 | num_alpha_pairs_changed = 0 342 | while (num_iter < max_iter) and ((num_alpha_pairs_changed > 0) or scan_entire_set): 343 | num_alpha_pairs_changed = 0 344 | 345 | if scan_entire_set: 346 | # 遍历alpha, 使用 `inner_loop` 选择 alpha-j, 并在可能是对其进行优化 347 | for i in range(options.m): 348 | num_alpha_pairs_changed += inner_loop(i, options) 349 | logging.debug('scanning : num_iter({}) i({}) pairs changed({})'.format( 350 | num_iter, i, num_alpha_pairs_changed 351 | )) 352 | num_iter += 1 353 | else: 354 | # 遍历所有非边界(不在边界0或C上)的 alpha 355 | non_bound_indexs = numpy.nonzero( 356 | (options.alphas.A > 0) * (options.alphas.A < constant) 357 | )[0] 358 | for i in non_bound_indexs: 359 | num_alpha_pairs_changed += inner_loop(i, options) 360 | logging.debug('non-bound : num_iter({}) i({}) pairs changed({})'.format( 361 | num_iter, i, num_alpha_pairs_changed 362 | )) 363 | num_iter += 1 364 | 365 | if scan_entire_set: 366 | scan_entire_set = False 367 | elif num_alpha_pairs_changed == 0: 368 | scan_entire_set = True 369 | logging.debug('iteration number: {}'.format(num_iter)) 370 | return options.b, options.alphas 371 | 372 | 373 | def get_weights(alphas, dataset, labels): 374 | dataset = numpy.mat(dataset) 375 | labels = numpy.mat(labels).T 376 | m, n = dataset.shape 377 | w = numpy.zeros((n, 1)) 378 | for i in range(m): 379 | w += numpy.multiply(alphas[i] * labels[i], dataset[i, :].T) 380 | return w 381 | 382 | 383 | def test_rbf(k1=1.3): 384 | import pprint 385 | dataset, labels = load_dataset('testSetRBF.txt') 386 | b, alphas = smoP(dataset, labels, 200, 0.0001, 10000, ('rbf', k1)) # C=200 important 387 | 388 | dataset = numpy.mat(dataset) 389 | labels = numpy.mat(labels).T 390 | support_vectors_index = tuple(numpy.nonzero(alphas.A > 0))[0] 391 | support_vectors = dataset[support_vectors_index] 392 | support_vectors_label = labels[support_vectors_index] 393 | m, _n = support_vectors.shape 394 | logging.info('支持向量 ({})个:'.format(m)) 395 | logging.info(pprint.pformat(zip( 396 | support_vectors.tolist(), support_vectors_label.A1.tolist() 397 | ))) 398 | 399 | m, _n = dataset.shape 400 | errorCount = 0 401 | for i in range(m): 402 | # 利用 核函数 && 支持向量 进行分类. 
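        # (Added note) The prediction below evaluates the SVM decision function
        # using only the support vectors (the points with alpha > 0):
        #     f(x) = sum_i( alpha_i * y_i * K(x_i, x) ) + b,  class = sign(f(x))
        # `kernelEval` holds K(x_i, x) for every support vector x_i, so the
        # matrix product on the following lines is exactly this weighted sum.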
403 | kernelEval = kernelTrans(support_vectors, dataset[i, :], ('rbf', k1)) 404 | predict = ( 405 | kernelEval.T 406 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 407 | + b 408 | ) 409 | if numpy.sign(predict) != numpy.sign(labels[i]): 410 | errorCount += 1 411 | logging.info('训练集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 412 | 413 | # 使用训练出来的SVM来对测试集进行分类, 检查错误率 414 | dataset, labels = load_dataset('testSetRBF2.txt') 415 | dataset = numpy.mat(dataset) 416 | labels = numpy.mat(labels).T 417 | m, _n = dataset.shape 418 | errorCount = 0 419 | for i in range(m): 420 | kernelEval = kernelTrans(support_vectors, dataset[i, :], ('rbf', k1)) 421 | predict = ( 422 | kernelEval.T 423 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 424 | + b 425 | ) 426 | if numpy.sign(predict) != numpy.sign(labels[i]): 427 | errorCount += 1 428 | logging.info('测试集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 429 | 430 | """ 使用SVM来进行手写数字识别 """ 431 | 432 | 433 | def img2vector(filename): 434 | vector = numpy.zeros((1, 1024)) 435 | with open(filename) as infile: 436 | for lineno, line in enumerate(infile): 437 | for rowno in range(32): 438 | vector[0, 32 * lineno + rowno] = int(line[rowno]) 439 | return vector 440 | 441 | 442 | def load_images(dir_name): 443 | import os 444 | files = os.listdir(dir_name) 445 | labels = [] 446 | dataset = numpy.zeros((len(files), 1024)) 447 | for i, filename in enumerate(files): 448 | name = os.path.splitext(filename)[0] 449 | class_num = int(name.split('_')[0]) 450 | if class_num == 9: 451 | labels.append(-1) 452 | elif class_num == 1: 453 | labels.append(1) 454 | else: 455 | raise ValueError('本分类器为二分类器, 不支持除1/9外的数字') 456 | dataset[i, :] = img2vector('%s/%s' % (dir_name, filename)) 457 | return dataset, labels 458 | 459 | 460 | def test_digits(kernel_info=('rbf', 10)): 461 | dataset, labels = load_images('digits/trainingDigits') 462 | b, alphas = smoP(dataset, labels, 200, 0.0001, 10000, kernel_info) 463 | 464 | dataset = numpy.mat(dataset) 465 | labels = numpy.mat(labels).T 466 | support_vectors_index = tuple(numpy.nonzero(alphas.A > 0))[0] 467 | support_vectors = dataset[support_vectors_index] 468 | support_vectors_label = labels[support_vectors_index] 469 | m, _n = support_vectors.shape 470 | import pprint 471 | logging.info('支持向量 ({})个:'.format(m)) 472 | # logging.info(pprint.pformat(zip( 473 | # support_vectors.tolist(), support_vectors_label.A1.tolist() 474 | # ))) 475 | 476 | m, n = dataset.shape 477 | errorCount = 0 478 | for i in range(m): 479 | kernelEval = kernelTrans(support_vectors, dataset[i, :], kernel_info) 480 | predict = ( 481 | kernelEval.T 482 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 483 | + b 484 | ) 485 | if numpy.sign(predict) != numpy.sign(labels[i]): 486 | errorCount += 1 487 | logging.info('训练集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 488 | 489 | dataset, labels = load_images('digits/testDigits') 490 | dataset = numpy.mat(dataset) 491 | labels = numpy.mat(labels).T 492 | errorCount = 0 493 | m, n = dataset.shape 494 | for i in range(m): 495 | kernelEval = kernelTrans(support_vectors, dataset[i, :], kernel_info) 496 | predict = ( 497 | kernelEval.T 498 | * numpy.multiply(support_vectors_label, alphas[support_vectors_index]) 499 | + b 500 | ) 501 | if numpy.sign(predict) != numpy.sign(labels[i]): 502 | errorCount += 1 503 | logging.info('测试集上错误率: {:.2%}'.format(1.0 * errorCount / m)) 504 | 505 | """ main 函数 """ 506 | 507 | 508 | def main(): 509 | # import pprint 510 | # dataset, 
labels = load_dataset('testSet.txt') 511 | # length = len(labels) 512 | # b, alphas = smo_simple(dataset, labels, 0.6, 0.001, 40) 513 | # logging.info('支持向量:') 514 | # logging.info(pprint.pformat( 515 | # [(dataset[i], labels[i]) for i in range(length) if alphas[i] > 0] 516 | # )) 517 | 518 | # 使用核函数的SVM 519 | test_rbf(k1=1.3) 520 | 521 | # 手写数字识别 522 | test_digits() 523 | 524 | if __name__ == '__main__': 525 | main() 526 | 527 | 528 | '''#######******************************** 529 | Non-Kernel VErsions below 530 | '''#######******************************** 531 | 532 | class optStructK: 533 | def __init__(self,dataMatIn, classLabels, C, toler): # Initialize the structure with the parameters 534 | self.X = dataMatIn 535 | self.labels = classLabels 536 | self.C = C 537 | self.tol = toler 538 | self.m = shape(dataMatIn)[0] 539 | self.alphas = mat(zeros((self.m,1))) 540 | self.b = 0 541 | self.eCache = mat(zeros((self.m,2))) #first column is valid flag 542 | 543 | def calcEkK(oS, k): 544 | fXk = float(multiply(oS.alphas,oS.labels).T*(oS.X*oS.X[k,:].T)) + oS.b 545 | Ek = fXk - float(oS.labels[k]) 546 | return Ek 547 | 548 | def selectJK(i, oS, Ei): #this is the second choice -heurstic, and calcs Ej 549 | maxK = -1; maxDeltaE = 0; Ej = 0 550 | oS.eCache[i] = [1,Ei] #set valid #choose the alpha that gives the maximum delta E 551 | validEcacheList = nonzero(oS.eCache[:,0].A)[0] 552 | if (len(validEcacheList)) > 1: 553 | for k in validEcacheList: #loop through valid Ecache values and find the one that maximizes delta E 554 | if k == i: continue #don't calc for i, waste of time 555 | Ek = calc_estimate(oS, k) 556 | deltaE = abs(Ei - Ek) 557 | if (deltaE > maxDeltaE): 558 | maxK = k; maxDeltaE = deltaE; Ej = Ek 559 | return maxK, Ej 560 | else: #in this case (first time around) we don't have any valid eCache values 561 | j = random_select_j(i, oS.m) 562 | Ej = calc_estimate(oS, j) 563 | return j, Ej 564 | 565 | def updateEkK(oS, k):#after any alpha has changed update the new value in the cache 566 | Ek = calc_estimate(oS, k) 567 | oS.eCache[k] = [1,Ek] 568 | 569 | def innerLK(i, oS): 570 | Ei = calc_estimate(oS, i) 571 | if ((oS.labels[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labels[i]*Ei > oS.tol) and (oS.alphas[i] > 0)): 572 | j,Ej = select_j(i, oS, Ei) #this has been changed from selectJrand 573 | alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy(); 574 | if (oS.labels[i] != oS.labels[j]): 575 | L = max(0, oS.alphas[j] - oS.alphas[i]) 576 | H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 577 | else: 578 | L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) 579 | H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 580 | if L==H: 581 | print("L==H") 582 | return 0 583 | eta = 2.0 * oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T 584 | if eta >= 0: 585 | print("eta>=0") 586 | return 0 587 | oS.alphas[j] -= oS.labels[j]*(Ei - Ej)/eta 588 | oS.alphas[j] = AdjustAlpha(oS.alphas[j],H,L) 589 | updateEk(oS, j) #added this for the Ecache 590 | if (abs(oS.alphas[j] - alphaJold) < 0.00001): 591 | print("j not moving enough") 592 | return 0 593 | oS.alphas[i] += oS.labels[j]*oS.labels[i]*(alphaJold - oS.alphas[j])#update i by the same amount as j 594 | updateEk(oS, i) #added this for the Ecache #the update is in the oppostie direction 595 | b1 = oS.b - Ei- oS.labels[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labels[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T 596 | b2 = oS.b - Ej- oS.labels[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - 
oS.labels[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T 597 | if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1 598 | elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2 599 | else: oS.b = (b1 + b2)/2.0 600 | return 1 601 | else: return 0 602 | 603 | def smoPK(dataMatIn, classLabels, C, toler, maxIter): #full Platt SMO 604 | oS = Options(mat(dataMatIn), mat(classLabels).transpose(), C, toler) 605 | iter = 0 606 | entireSet = True; isAlphaPairsChanged = 0 607 | while (iter < maxIter) and ((isAlphaPairsChanged > 0) or (entireSet)): 608 | isAlphaPairsChanged = 0 609 | if entireSet: #go over all 610 | for i in range(oS.m): 611 | isAlphaPairsChanged += innerL(i,oS) 612 | print("fullSet, iter: %d i:%d, pairs changed %d" % (iter,i,isAlphaPairsChanged)) 613 | iter += 1 614 | else:#go over non-bound (railed) alphas 615 | nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] 616 | for i in nonBoundIs: 617 | isAlphaPairsChanged += innerL(i,oS) 618 | print("non-bound, iter: %d i:%d, pairs changed %d" % (iter,i,isAlphaPairsChanged)) 619 | iter += 1 620 | if entireSet: entireSet = False #toggle entire set loop 621 | elif (isAlphaPairsChanged == 0): entireSet = True 622 | print("iteration number: %d" % iter) 623 | return oS.b,oS.alphas 624 | -------------------------------------------------------------------------------- /ch07 - AdaBoosting/adaboost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | """ 5 | AdaBoost -- Adaptive boosting 6 | === 7 | 通过串行训练多个分类器, 每一个分类器根据已训练出来的分类器的性能来进行训练, 8 | 每个新的分类器集中关注被已有分类器错分的那些数据来获得新的分类器. 9 | 最终把所有分类器的结果加权求和. 10 | """ 11 | import logging 12 | 13 | import numpy 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | # level=logging.INFO, 18 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 19 | ) 20 | TRACE = logging.DEBUG - 1 21 | 22 | 23 | def load_fake_dataset(): 24 | dataset = numpy.matrix([ 25 | [1.0, 2.1], 26 | [2.0, 1.1], 27 | [1.3, 1.0], 28 | [1.0, 1.0], 29 | [2.0, 1.0], 30 | ]) 31 | labels = [1.0, 1.0, -1.0, -1.0, 1.0] 32 | return dataset, labels 33 | 34 | 35 | def load_dataset_from_file(filename): 36 | dataset = [] 37 | labels = [] 38 | num_features = None 39 | with open(filename) as infile: 40 | for line in infile: 41 | line = line.strip().split('\t') 42 | if num_features is None: 43 | num_features = len(line) 44 | dataset.append(list(map(float, line[:-1]))) 45 | labels.append(float(line[-1])) 46 | return dataset, labels 47 | 48 | 49 | class DicisionStump(object): 50 | def __init__(self, dataset): 51 | self.dataset = dataset 52 | 53 | def predict(self, dimension, threshold_val, inequal): 54 | m, _n = self.dataset.shape 55 | predict = numpy.ones((m, 1)) 56 | if inequal == 'lt': 57 | predict[self.dataset[:, dimension] <= threshold_val] = -1.0 58 | elif inequal == 'gt': 59 | predict[self.dataset[:, dimension] > threshold_val] = -1.0 60 | return predict 61 | 62 | 63 | class AdaBoostDicisionStump(object): 64 | def __init__(self, dataset, labels, max_iter=40): 65 | self.dataset = numpy.mat(dataset) 66 | self.labels = numpy.mat(labels).T 67 | self.m, self.n = self.dataset.shape 68 | self.train(max_iter=max_iter) 69 | 70 | def build_stump(self, D): 71 | stump = DicisionStump(self.dataset) 72 | num_steps = 10.0 # 在特征的可能值上通过递增步长遍历的次数 73 | best_stump_info = {} # 记录对于给定权重向量D, 最佳的单层决策树 74 | best_predict_values = numpy.mat(numpy.zeros((self.m, 1))) 75 | min_error = 0x3f3f3f3f # init error sum, to +infinity 76 | # 遍历所有特征 77 | for i 
in range(self.n): 78 | # 计算遍历该特征的步长 79 | feature_min = self.dataset[:, i].min() 80 | feature_max = self.dataset[:, i].max() 81 | step = (feature_max - feature_min) / num_steps 82 | # 对于该特征, 遍历所有可能的值 83 | for j in range(-1, int(num_steps) + 1): # loop over all range in current dimension 84 | for inequal in ['lt', 'gt']: # 在 >/< 之间进行切换 85 | threshold_val = feature_min + float(j) * step 86 | predicted_values = stump.predict(i, threshold_val, inequal) 87 | # 记录预测值与实际分类不同 88 | errors = numpy.mat(numpy.ones((self.m, 1))) 89 | errors[predicted_values == self.labels] = 0 90 | # 计算在给定权重下的总错误权重 91 | weighted_errors = D.T * errors 92 | logging.log(TRACE, '[Split] dimension {:d}, threshold {:.2f} threshold inequal: {:s}'.format( 93 | i, threshold_val, inequal 94 | )) 95 | logging.log(TRACE, '[Split] Weighted errors is {:.3f}'.format(weighted_errors[0, 0])) 96 | # 根据总错误权重来更新最好的单层决策树信息 97 | if weighted_errors < min_error: 98 | min_error = weighted_errors 99 | best_predict_values = predicted_values.copy() 100 | best_stump_info['dimension'] = i 101 | best_stump_info['threshold'] = threshold_val 102 | best_stump_info['inequal'] = inequal 103 | return best_stump_info, min_error, best_predict_values 104 | 105 | def train(self, max_iter): 106 | weak_classifiers = [] 107 | D = numpy.mat(numpy.ones((self.m, 1)) / self.m) 108 | aggregated_predict = numpy.mat(numpy.zeros((self.m, 1))) 109 | for i in range(max_iter): 110 | stump_info, error, predict = self.build_stump(D) 111 | logging.debug('D: {}'.format(D.T)) 112 | # 计算本次单层决策树输出结果的权重, `max(error, 1e-16)` 保证不会出现除0错误 113 | alpha = float(0.5 * numpy.log((1.0 - error) / max(error, 1e-16))) 114 | stump_info['alpha'] = alpha 115 | weak_classifiers.append(stump_info) # store Stump Params in Array 116 | logging.debug('predict: {}'.format(predict.T)) 117 | # 更新权重D 118 | exponent = numpy.multiply(-1 * alpha * self.labels, predict) 119 | D = numpy.multiply(D, numpy.exp(exponent)) 120 | D = D / D.sum() # 保证 D 各维度总和为 1 121 | # 计算应用所有分类器后的分类结果 122 | aggregated_predict += alpha * predict 123 | logging.debug('aggregated predict: {}'.format(aggregated_predict.T)) 124 | aggregated_errors = numpy.multiply( 125 | numpy.sign(aggregated_predict) != self.labels, 126 | numpy.ones((self.m, 1)) 127 | ) 128 | errorRate = aggregated_errors.sum() / self.m 129 | logging.info('Total error: {}'.format(errorRate)) 130 | if errorRate == 0.0: 131 | break 132 | self.classifiers = weak_classifiers 133 | self.aggregated_predict = aggregated_predict 134 | 135 | def predict(self, dataset): 136 | dataset = numpy.mat(dataset) 137 | stump = DicisionStump(dataset) 138 | m, _n = dataset.shape 139 | aggregated_estimate = numpy.mat(numpy.zeros((m, 1))) 140 | for classifier in self.classifiers: 141 | logging.info('Applying stumb: {}'.format(classifier)) 142 | estimate = stump.predict( 143 | classifier['dimension'], 144 | classifier['threshold'], 145 | classifier['inequal'] 146 | ) 147 | aggregated_estimate += classifier['alpha'] * estimate 148 | logging.info(aggregated_estimate) 149 | return numpy.sign(aggregated_estimate) 150 | 151 | 152 | def plotROCCurve(predStrengths, labels): 153 | """ 154 | ROC曲线(Receiver Operating Characteristic curve) 155 | ROC曲线给出当阈值变化时假阳率和真阳率的变化情况 156 | """ 157 | import matplotlib.pyplot as plt 158 | cursor = (1.0, 1.0) # 绘制光标的位置 159 | ySum = 0.0 # variable to calculate AUC 160 | numPositiveClass = sum(numpy.array(labels) == 1.0) # 正例的数目 161 | step = { 162 | 'x': 1.0 / numPositiveClass, 163 | 'y': 1.0 / (len(labels) - numPositiveClass), 164 | } 165 | sortedIndicies = 
predStrengths.A1.argsort() # get sorted index, it's reverse 166 | fig = plt.figure() 167 | fig.clf() 168 | ax = plt.subplot(111) 169 | # loop through all the values, drawing a line segment at each point 170 | for index in sortedIndicies: 171 | if labels[index] == 1.0: 172 | deltaX = 0 173 | deltaY = step['x'] 174 | else: 175 | deltaX = step['y'] 176 | deltaY = 0 177 | ySum += cursor[1] 178 | # draw line from cursor to (cursor[0]-deltaX, cursor[1]-deltaY) 179 | logging.debug('Drawing line from {} -> {}'.format( 180 | cursor, (cursor[0]-deltaX, cursor[1]-deltaY) 181 | )) 182 | ax.plot( 183 | [cursor[0], cursor[0]-deltaX], 184 | [cursor[1], cursor[1]-deltaY], 185 | c='b' 186 | ) 187 | cursor = (cursor[0] - deltaX, cursor[1] - deltaY) 188 | ax.plot([0, 1], [0, 1], 'b--') 189 | 190 | plt.xlabel('False positive rate') 191 | plt.ylabel('True positive rate') 192 | plt.title('ROC curve for AdaBoost horse colic detection system') 193 | ax.axis([0, 1, 0, 1]) 194 | plt.show() 195 | logging.info('曲线下面积AUC(Area Under the Curve): {}'.format(ySum * step['y'])) 196 | 197 | 198 | def main(): 199 | import pprint 200 | dataset, labels = load_fake_dataset() 201 | # D = numpy.mat(numpy.ones((5, 1)) / 5) 202 | # build_stump(dataset, labels, D) 203 | model = AdaBoostDicisionStump(dataset, labels) 204 | logging.info('Classifiers: {}'.format(pprint.pformat(model.classifiers))) 205 | logging.info('结果对比 (预测/真实):\n{}'.format(zip( 206 | model.predict(dataset).A1.tolist(), 207 | labels 208 | ))) 209 | 210 | plotROCCurve(model.aggregated_predict, labels) 211 | 212 | dataset, labels = load_dataset_from_file('horseColicTraining2.txt') 213 | model = AdaBoostDicisionStump(dataset, labels) 214 | plotROCCurve(model.aggregated_predict, labels) 215 | 216 | 217 | if __name__ == '__main__': 218 | main() 219 | 220 | -------------------------------------------------------------------------------- /ch08 - LinearRegression/regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | import logging 5 | 6 | import numpy 7 | 8 | logging.basicConfig( 9 | level=logging.DEBUG, 10 | # level=logging.INFO, 11 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 12 | ) 13 | TRACE = logging.DEBUG - 1 14 | 15 | 16 | def load_dataset_from_file(filename): 17 | dataset = [] 18 | labels = [] 19 | num_features = None 20 | with open(filename) as infile: 21 | for line in infile: 22 | line = line.strip().split('\t') 23 | if num_features is None: 24 | num_features = len(line) 25 | dataset.append(list(map(float, line[:-1]))) 26 | labels.append(float(line[-1])) 27 | return dataset, labels 28 | 29 | 30 | def standarRegress(xArray, yArray): 31 | """使用普通最小二乘法求回归系数""" 32 | xMatrix = numpy.mat(xArray) 33 | yMatrix = numpy.mat(yArray).T 34 | xTx = xMatrix.T * xMatrix 35 | if numpy.linalg.det(xTx) == 0.0: 36 | logging.error('奇异矩阵无法求逆') 37 | return 38 | w = xTx.I * (xMatrix.T * yMatrix) 39 | # 或下面这个 40 | # w = numpy.linalg.solve(xTx, xMatrix.T * yMatrix) 41 | return w.A1 42 | 43 | 44 | def lwlrRegress(testPoint, xArray, yArray, k=1.0): 45 | """局部加权线性回归(LWLR - Locally Weighted Linear Regression) 46 | 给待预测点附近的每个点赋予一定的权重. 47 | LWLR使用"核"来对附近的点赋予更高的权重, 最常用的是高斯核. 
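    高斯核给第 j 个训练样本的权重为 w(j, j) = exp(-||x_j - x||^2 / (2 * k^2)),
    其中 x 为待预测点 testPoint, k 越小则权重随距离衰减越快 (对应下方 weights[j, j] 的计算).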
48 | === 49 | 50 | """ 51 | xMatrix = numpy.mat(xArray) 52 | yMatrix = numpy.mat(yArray).T 53 | m, _n = xMatrix.shape 54 | # 利用高斯核初始化权重矩阵 55 | weights = numpy.mat(numpy.eye(m)) 56 | for j in range(m): 57 | diffMat = testPoint - xMatrix[j, :] 58 | weights[j, j] = numpy.exp(diffMat * diffMat.T / (-2.0 * k**2)) 59 | xTx = xMatrix.T * (weights * xMatrix) 60 | if numpy.linalg.det(xTx) == 0.0: 61 | logging.error('奇异矩阵无法求逆') 62 | return 63 | ws = xTx.I * (xMatrix.T * (weights * yMatrix)) 64 | return testPoint * ws 65 | 66 | 67 | def lwlrTest(testArray, xArray, yArray, k=1.0): 68 | """ 69 | 对于所有的测试点, 使用LWLR局部加权线性回归来计算预测值 70 | """ 71 | m, _n = numpy.array(testArray).shape 72 | yHat = numpy.zeros(m) 73 | for i in range(m): 74 | yHat[i] = lwlrRegress(testArray[i], xArray, yArray, k) 75 | return yHat 76 | 77 | 78 | def rssError(yArray, yHatArr): 79 | """计算预测误差""" 80 | yArray = numpy.array(yArray) 81 | yHatArr = numpy.array(yHatArr) 82 | return ((yArray - yHatArr)**2).sum() 83 | 84 | """缩减方法 -- 岭回归, 前向足部回归, lasso法""" 85 | 86 | 87 | def ridgeRegress(xMatrix, yMatrix, lam=0.2): 88 | xTx = xMatrix.T * xMatrix 89 | _m, n = numpy.shape(xMatrix) 90 | denom = xTx + (numpy.eye(n) * lam) 91 | if numpy.linalg.det(denom) == 0.0: 92 | logging.error('奇异矩阵无法求逆') 93 | return 94 | ws = denom.I * (xMatrix.T * yMatrix) 95 | return ws 96 | 97 | 98 | def ridgeTest(xArray, yArray): 99 | xMatrix = numpy.mat(xArray) 100 | yMatrix = numpy.mat(yArray).T 101 | # 标准化Y 102 | yMean = numpy.mean(yMatrix, 0) 103 | yMatrix = yMatrix - yMean # to eliminate X0 take numpy.mean off of Y 104 | # 标准化X的每一维 105 | xMeans = numpy.mean(xMatrix, 0) # calc numpy.mean then subtract it off 106 | xVar = numpy.var(xMatrix, 0) # calc variance of Xi then divide by it 107 | xMatrix = (xMatrix - xMeans) / xVar 108 | 109 | numTestPts = 30 110 | _m, n = xMatrix.shape 111 | wMatrix = numpy.zeros((numTestPts, n)) 112 | for i in range(numTestPts): 113 | ws = ridgeRegress(xMatrix, yMatrix, numpy.exp(i - 10)) 114 | wMatrix[i, :] = ws.T 115 | return wMatrix 116 | 117 | 118 | def main(): 119 | Xs, Ys = load_dataset_from_file('ex0.txt') 120 | logging.info('原始数据\n{0}'.format([(x, y) for x, y in zip(Xs, Ys)])) 121 | 122 | w = standarRegress(Xs, Ys) 123 | logging.info('最小二乘法回归系数: {0}'.format(w)) 124 | logging.info('预测序列\n{0}'.format([ 125 | (x, y) for x, y in map( 126 | lambda x: (x, float(numpy.mat(x) * numpy.mat(w).T)), 127 | Xs 128 | ) 129 | ])) 130 | 131 | # k = 0.003 132 | k = 0.01 133 | # k = 0.1 134 | yHat = lwlrTest(Xs, Xs, Ys, k=k) 135 | logging.info('LWLR预测序列, 系数 k={0}\n{1}'.format(k, [ 136 | (x, y) for x, y in zip(Xs, yHat) 137 | ])) 138 | 139 | ''' 140 | # 绘制图看拟合效果 141 | xMatrix = numpy.mat(Xs) 142 | sorted_index = xMatrix[:, 1].argsort(axis=0) 143 | xSort = xMatrix[sorted_index][:, 0, :] 144 | import matplotlib.pyplot as plt 145 | figure = plt.figure() 146 | ax = figure.add_subplot(111) 147 | ax.plot(xSort[:, 1], yHat[sorted_index]) # 拟合曲线 148 | ax.scatter( 149 | xMatrix[:, 1].A1, numpy.mat(Ys).T.A1, 150 | s=2, c='red' 151 | ) # 原始数据 152 | plt.show() 153 | ''' 154 | 155 | abaloneXs, abalineYs = load_dataset_from_file('abalone.txt') 156 | w = ridgeTest(abaloneXs, abalineYs) 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | 162 | 163 | def regularize(xMatrix):#regularize by columns 164 | inMat = xMatrix.copy() 165 | inMeans = numpy.mean(inMat,0) #calc numpy.mean then subtract it off 166 | inVar = numpy.var(inMat,0) #calc variance of Xi then divide by it 167 | inMat = (inMat - inMeans)/inVar 168 | return inMat 169 | 170 | def 
stageWise(xArray,yArray,eps=0.01,numIt=100): 171 | xMatrix = numpy.mat(xArray); yMatrix=numpy.mat(yArray).T 172 | yMean = numpy.mean(yMatrix,0) 173 | yMatrix = yMatrix - yMean #can also regularize ys but will get smaller coef 174 | xMatrix = regularize(xMatrix) 175 | m,n=numpy.shape(xMatrix) 176 | #returnMat = numpy.zeros((numIt,n)) #testing code remove 177 | ws = numpy.zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy() 178 | for i in range(numIt): 179 | print ws.T 180 | lowestError = numpy.inf; 181 | for j in range(n): 182 | for sign in [-1,1]: 183 | wsTest = ws.copy() 184 | wsTest[j] += eps*sign 185 | yTest = xMatrix*wsTest 186 | rssE = rssError(yMatrix.A,yTest.A) 187 | if rssE < lowestError: 188 | lowestError = rssE 189 | wsMax = wsTest 190 | ws = wsMax.copy() 191 | #returnMat[i,:]=ws.T 192 | #return returnMat 193 | 194 | #def scrapePage(inFile,outFile,yr,numPce,origPrc): 195 | # from BeautifulSoup import BeautifulSoup 196 | # fr = open(inFile); fw=open(outFile,'a') #a is append mode writing 197 | # soup = BeautifulSoup(fr.read()) 198 | # i=1 199 | # currentRow = soup.findAll('table', r="%d" % i) 200 | # while(len(currentRow)!=0): 201 | # title = currentRow[0].findAll('a')[1].text 202 | # lwrTitle = title.lower() 203 | # if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 204 | # newFlag = 1.0 205 | # else: 206 | # newFlag = 0.0 207 | # soldUnicde = currentRow[0].findAll('td')[3].findAll('span') 208 | # if len(soldUnicde)==0: 209 | # print "item #%d did not sell" % i 210 | # else: 211 | # soldPrice = currentRow[0].findAll('td')[4] 212 | # priceStr = soldPrice.text 213 | # priceStr = priceStr.replace('$','') #strips out $ 214 | # priceStr = priceStr.replace(',','') #strips out , 215 | # if len(soldPrice)>1: 216 | # priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping 217 | # print "%s\t%d\t%s" % (priceStr,newFlag,title) 218 | # fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr)) 219 | # i += 1 220 | # currentRow = soup.findAll('table', r="%d" % i) 221 | # fw.close() 222 | 223 | from time import sleep 224 | import json 225 | import urllib2 226 | def searchForSet(retX, retY, setNum, yr, numPce, origPrc): 227 | sleep(10) 228 | myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY' 229 | searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum) 230 | pg = urllib2.urlopen(searchURL) 231 | retDict = json.loads(pg.read()) 232 | for i in range(len(retDict['items'])): 233 | try: 234 | currItem = retDict['items'][i] 235 | if currItem['product']['condition'] == 'new': 236 | newFlag = 1 237 | else: newFlag = 0 238 | listOfInv = currItem['product']['inventories'] 239 | for item in listOfInv: 240 | sellingPrice = item['price'] 241 | if sellingPrice > origPrc * 0.5: 242 | print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice) 243 | retX.append([yr, numPce, newFlag, origPrc]) 244 | retY.append(sellingPrice) 245 | except: print 'problem with item %d' % i 246 | 247 | def setDataCollect(retX, retY): 248 | searchForSet(retX, retY, 8288, 2006, 800, 49.99) 249 | searchForSet(retX, retY, 10030, 2002, 3096, 269.99) 250 | searchForSet(retX, retY, 10179, 2007, 5195, 499.99) 251 | searchForSet(retX, retY, 10181, 2007, 3428, 199.99) 252 | searchForSet(retX, retY, 10189, 2008, 5922, 299.99) 253 | searchForSet(retX, retY, 10196, 2009, 3263, 249.99) 254 | 255 | def crossValidation(xArray,yArray,numVal=10): 256 | m = len(yArray) 257 | indexList = range(m) 258 | errorMat = 
numpy.zeros((numVal,30))#create error numpy.mat 30columns numVal rows 259 | for i in range(numVal): 260 | trainX=[]; trainY=[] 261 | testX = []; testY = [] 262 | numpy.random.shuffle(indexList) 263 | for j in range(m):#create training set based on first 90% of values in indexList 264 | if j < m*0.9: 265 | trainX.append(xArray[indexList[j]]) 266 | trainY.append(yArray[indexList[j]]) 267 | else: 268 | testX.append(xArray[indexList[j]]) 269 | testY.append(yArray[indexList[j]]) 270 | wMat = ridgeTest(trainX,trainY) #get 30 weight vectors from ridge 271 | for k in range(30):#loop over all of the ridge estimates 272 | matTestX = numpy.mat(testX); matTrainX=numpy.mat(trainX) 273 | meanTrain = numpy.mean(matTrainX,0) 274 | varTrain = numpy.var(matTrainX,0) 275 | matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params 276 | yEst = matTestX * numpy.mat(wMat[k,:]).T + numpy.mean(trainY)#test ridge results and store 277 | errorMat[i,k]=rssError(yEst.T.A,numpy.array(testY)) 278 | #print errorMat[i,k] 279 | meanErrors = numpy.mean(errorMat,0)#calc avg performance of the different ridge weight vectors 280 | minMean = float(min(meanErrors)) 281 | bestWeights = wMat[numpy.nonzero(meanErrors==minMean)] 282 | #can unregularize to get model 283 | #when we regularized we wrote Xreg = (x-meanX)/numpy.var(x) 284 | #we can now write in terms of x not Xreg: x*w/numpy.var(x) - meanX/numpy.var(x) +meanY 285 | xMatrix = numpy.mat(xArray); yMatrix=numpy.mat(yArray).T 286 | meanX = numpy.mean(xMatrix,0); varX = numpy.var(xMatrix,0) 287 | unReg = bestWeights/varX 288 | print "the best model from Ridge Regression is:\n",unReg 289 | print "with constant term: ",-1*sum(numpy.multiply(meanX,unReg)) + numpy.mean(yMatrix) -------------------------------------------------------------------------------- /ch09 - RegressionTree/regressionTrees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import copy 7 | import logging 8 | 9 | import numpy 10 | 11 | TRACE = logging.DEBUG - 1 12 | logging.basicConfig( 13 | level=logging.DEBUG, 14 | # level=TRACE, 15 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 16 | ) 17 | 18 | 19 | def load_dataset_from_file(filename): 20 | dataset = [] 21 | with open(filename) as infile: 22 | for line in infile: 23 | line = line.strip().split('\t') 24 | dataset.append(list(map(float, line))) 25 | return dataset 26 | 27 | 28 | def linear_solve(dataset): 29 | """求线性回归参数""" 30 | m, n = numpy.shape(dataset) 31 | X = numpy.mat(numpy.ones((m, n))) 32 | X[:, 1:n] = dataset[:, 0:n - 1] 33 | Y = dataset[:, -1] 34 | xTx = X.T * X 35 | if numpy.linalg.det(xTx) == 0.0: 36 | raise Exception('This matrix is singular, cannot do inverse,\n' 37 | 'try increasing the second value of ops') 38 | ws = xTx.I * (X.T * Y) 39 | return ws, X, Y 40 | 41 | 42 | TYPE_VALUE = 0 43 | TYPE_MODEL = 1 44 | 45 | 46 | class Dataset(object): 47 | 48 | def __init__(self, dataset): 49 | self.rawDataset = numpy.mat(dataset) 50 | 51 | @property 52 | def shape(self): 53 | return self.rawDataset.shape 54 | 55 | @property 56 | def leaf_val(self): 57 | """因变量均值""" 58 | return numpy.mean(self.rawDataset[:, -1]) 59 | 60 | @property 61 | def leaf_error(self): 62 | """因变量均方差和""" 63 | m, _n = self.rawDataset.shape 64 | return m * numpy.var(self.rawDataset[:, -1]) 65 | 66 | @property 67 | def leaf_model_weights(self): 68 | ws, _X, _Y = linear_solve(self.rawDataset) 69 | return 
ws.A1.tolist() 70 | 71 | @property 72 | def leaf_model_error(self): 73 | ws, X, Y = linear_solve(self.rawDataset) 74 | yHat = X * ws 75 | return sum(numpy.power(Y - yHat, 2)) 76 | 77 | def split(self, feature, value): 78 | row_indexs = numpy.nonzero(self.rawDataset[:, feature] > value)[0] 79 | m0 = self.rawDataset[row_indexs, :] 80 | row_indexs = numpy.nonzero(self.rawDataset[:, feature] <= value)[0] 81 | m1 = self.rawDataset[row_indexs, :] 82 | return Dataset(m0), Dataset(m1) 83 | 84 | def choose_best_split(self, tree_type=TYPE_VALUE, total_s=1.0, total_n=4): 85 | """ 86 | 87 | Parameters 88 | ---------- 89 | tree_type : int 90 | TYPE_VALUE 普通回归树 91 | TYPE_MODEL 模型回归树 92 | total_s : float 93 | 分裂叶节点时, 数据集方差和下降值最小值 94 | total_n : int 95 | 叶节点中最少包含的样本数 96 | 97 | Returns 98 | ------- 99 | (int, float) : 对数据集划分的最好特征的index, 划分值 100 | """ 101 | # 如果所有值都相等, 生成一个叶节点 102 | if len(set(self.rawDataset[:, -1].T.A1)) == 1: 103 | if tree_type == TYPE_VALUE: 104 | return None, self.leaf_val 105 | elif tree_type == TYPE_MODEL: 106 | return None, self.leaf_model_weights 107 | 108 | _m, n = self.rawDataset.shape 109 | best_info = { 110 | 's': numpy.inf, 111 | 'index': 0, 112 | 'value': 0, 113 | } 114 | for feature_index in range(n - 1): 115 | values = set(self.rawDataset[:, feature_index].A1) 116 | for split_val in values: 117 | d0, d1 = self.split(feature_index, split_val) 118 | # 如果切分出来的数据集很小, 跳过? 119 | if d0.shape[0] < total_n or d1.shape[0] < total_n: 120 | continue 121 | if tree_type == TYPE_VALUE: 122 | new_s = d0.leaf_error + d1.leaf_error 123 | elif tree_type == TYPE_MODEL: 124 | new_s = d0.leaf_model_error + d1.leaf_model_error 125 | if new_s < best_info['s']: 126 | best_info['s'] = new_s 127 | best_info['index'] = feature_index 128 | best_info['value'] = split_val 129 | 130 | # 如果误差减少不大, 则生成一个叶节点 131 | if tree_type == TYPE_VALUE: 132 | origin_error = self.leaf_error 133 | elif tree_type == TYPE_MODEL: 134 | origin_error = self.leaf_model_error 135 | if origin_error - best_info['s'] < total_s: 136 | if tree_type == TYPE_VALUE: 137 | return None, self.leaf_val 138 | elif tree_type == TYPE_MODEL: 139 | return None, self.leaf_model_weights 140 | 141 | # 如果切分出来的数据集很小, 则生成一个叶节点 142 | d0, d1 = self.split(best_info['index'], best_info['value']) 143 | if d0.shape[0] < total_n or d1.shape[0] < total_n: 144 | if tree_type == TYPE_VALUE: 145 | return None, self.leaf_val 146 | elif tree_type == TYPE_MODEL: 147 | return None, self.leaf_model_weights 148 | 149 | return best_info['index'], best_info['value'] 150 | 151 | 152 | class RegressionTree(object): 153 | """回归树 -- 普通回归树/模型回归树 154 | 普通回归树 - 把相近的一群点作为一个模拟点 155 | 模型回归树 - 把'模式类似'的一群点化为一个线性函数的回归系数 156 | """ 157 | 158 | def __init__(self, dataset, tree_type=TYPE_VALUE, total_s=1.0, total_n=4): 159 | self.tree_type = tree_type 160 | self.dataset = Dataset(dataset) 161 | self.tree = self.__build_tree(self.dataset, tree_type, total_s, total_n) 162 | 163 | @classmethod 164 | def __build_tree(cls, dataset, tree_type, total_s, total_n): 165 | feature_index, value = dataset.choose_best_split(tree_type, total_s, total_n) 166 | if feature_index is None: 167 | return value 168 | 169 | d0, d1 = dataset.split(feature_index, value) 170 | tree = { 171 | 'index': feature_index, 172 | 'value': value, 173 | 'left': cls.__build_tree(d0, tree_type, total_s, total_n), 174 | 'right': cls.__build_tree(d1, tree_type, total_s, total_n), 175 | } 176 | return tree 177 | 178 | @staticmethod 179 | def is_tree(node): 180 | return isinstance(node, dict) 181 | 182 | @classmethod 183 | def 
mean(cls, tree): 184 | if cls.is_tree(tree['right']): 185 | tree['right'] = cls.mean(tree['right']) 186 | if cls.is_tree(tree['left']): 187 | tree['left'] = cls.mean(tree['left']) 188 | return (tree['left'] + tree['right']) / 2.0 189 | 190 | def prune(self, test_dataset): 191 | assert self.tree_type == TYPE_VALUE 192 | return self.__do_prune(copy.deepcopy(self.tree), Dataset(test_dataset)) 193 | 194 | @classmethod 195 | def __do_prune(cls, tree, test_dataset): 196 | m, _n = test_dataset.shape 197 | if m == 0: 198 | return cls.mean(tree) 199 | 200 | if cls.is_tree(tree['right']) or cls.is_tree(tree['left']): 201 | d0, d1 = test_dataset.split(tree['index'], tree['value']) 202 | if cls.is_tree(tree['left']): 203 | tree['left'] = cls.__do_prune(tree['left'], d0) 204 | if cls.is_tree(tree['right']): 205 | tree['right'] = cls.__do_prune(tree['right'], d1) 206 | 207 | if cls.is_tree(tree['left']) or cls.is_tree(tree['right']): 208 | return tree 209 | else: 210 | # 如果两个子节点都已经不是树, 则对子节点尝试合并 211 | # 比较合并前后的误差, 如果误差能得到提升则进行合并 212 | d0, d1 = test_dataset.split(tree['index'], tree['value']) 213 | errorNoMerge = sum(numpy.power( 214 | d0.rawDataset[:, -1] - tree['left'], 215 | 2 216 | )) + sum(numpy.power( 217 | d1.rawDataset[:, -1] - tree['right'], 218 | 2 219 | )) 220 | 221 | tree_mean = (tree['left'] + tree['right']) / 2.0 222 | errorMerge = sum(numpy.power(test_dataset.rawDataset[:, -1], 2)) 223 | 224 | if errorMerge < errorNoMerge: 225 | logging.debug('merging...') 226 | return tree_mean 227 | else: 228 | return tree 229 | 230 | @staticmethod 231 | def eval_value(model, in_dataset): 232 | return float(model) 233 | 234 | @staticmethod 235 | def eval_model(model, in_dataset): 236 | m, n = in_dataset.shape 237 | X = numpy.mat(numpy.ones((1, n + 1))) 238 | X[0, 1:n+1] = in_dataset.rawDataset 239 | return float(X * numpy.mat(model).T) 240 | 241 | def predict(self, test_dataset): 242 | m, n = test_dataset.shape 243 | yHat = numpy.mat(numpy.zeros((m, 1))) 244 | for i in range(m): 245 | eval_func = None 246 | if self.tree_type == TYPE_VALUE: 247 | eval_func = self.eval_value 248 | elif self.tree_type == TYPE_MODEL: 249 | eval_func = self.eval_model 250 | yHat[i, 0] = self.__do_predict( 251 | self.tree, 252 | Dataset(test_dataset[i]), 253 | eval_func 254 | ) 255 | logging.log(TRACE, '{} -> {}'.format(test_dataset[i, 0], yHat[i, 0])) 256 | return yHat 257 | 258 | def __do_predict(self, tree, test_dataset, eval_func): 259 | if not self.is_tree(tree): 260 | return eval_func(tree, test_dataset) 261 | 262 | if test_dataset.rawDataset[tree['index']] > tree['value']: 263 | logging.log(TRACE, '{0} > {1} : go left'.format( 264 | test_dataset.rawDataset[tree['index']], 265 | tree['value'] 266 | )) 267 | return self.__do_predict(tree['left'], test_dataset, eval_func) 268 | else: 269 | logging.log(TRACE, '{0} <= {1} : go right'.format( 270 | test_dataset.rawDataset[tree['index']], 271 | tree['value'] 272 | )) 273 | return self.__do_predict(tree['right'], test_dataset, eval_func) 274 | 275 | 276 | def main(): 277 | import pprint 278 | """ 279 | filename = 'ex00.txt' 280 | dataset = load_dataset_from_file(filename) 281 | tree = RegressionTree(dataset) 282 | logging.info('`{0}` -> 回归树:\n{1}'.format( 283 | filename, 284 | pprint.pformat(tree.tree) 285 | )) 286 | 287 | filename = 'ex0.txt' 288 | dataset = load_dataset_from_file(filename) 289 | tree = RegressionTree(dataset) 290 | logging.info('`{0}` -> 回归树:\n{1}'.format( 291 | filename, 292 | pprint.pformat(tree.tree) 293 | )) 294 | 295 | filename = 'ex2.txt' 296 | dataset 
= load_dataset_from_file(filename) 297 | tree = RegressionTree(dataset) 298 | logging.info('`{0}` -> 回归树:\n{1}'.format( 299 | filename, 300 | pprint.pformat(tree.tree) 301 | )) 302 | filename = 'ex2test.txt' 303 | test_dataset = load_dataset_from_file(filename) 304 | pruned_tree = tree.prune(test_dataset) 305 | logging.info('利用`{0}`进行后剪支 -> 回归树:\n{1}'.format( 306 | filename, 307 | pprint.pformat(pruned_tree) 308 | )) 309 | 310 | filename = 'exp2.txt' 311 | dataset = load_dataset_from_file(filename) 312 | tree = RegressionTree(dataset, TYPE_MODEL, 1, 10) 313 | logging.info('`{0}` -> 模型回归树:\n{1}'.format( 314 | filename, 315 | pprint.pformat(tree.tree) 316 | )) 317 | """ 318 | # 回归树/模型树拟合效果对比 319 | train_filename = 'bikeSpeedVsIq_train.txt' 320 | train_dataset = load_dataset_from_file(train_filename) 321 | test_filename = 'bikeSpeedVsIq_test.txt' 322 | test_dataset = numpy.mat(load_dataset_from_file(test_filename)) 323 | 324 | regular_regression_tree = RegressionTree(train_dataset, TYPE_VALUE, 1, 20) 325 | logging.info('`{0}` -> 回归树:\n{1}'.format( 326 | train_filename, 327 | pprint.pformat(regular_regression_tree.tree) 328 | )) 329 | yHat = regular_regression_tree.predict(test_dataset[:, 0]) 330 | logging.info('{0}'.format( 331 | numpy.corrcoef(yHat, test_dataset[:, 1], rowvar=0)[0, 1] 332 | )) 333 | 334 | model_regression_tree = RegressionTree(train_dataset, TYPE_MODEL, 1, 20) 335 | logging.info('`{0}` -> 模型回归树:\n{1}'.format( 336 | train_filename, 337 | pprint.pformat(model_regression_tree.tree) 338 | )) 339 | yHat = model_regression_tree.predict(test_dataset[:, 0]) 340 | logging.info('{0}'.format( 341 | numpy.corrcoef(yHat, test_dataset[:, 1], rowvar=0)[0, 1] 342 | )) 343 | 344 | 345 | if __name__ == '__main__': 346 | main() 347 | -------------------------------------------------------------------------------- /ch11 - Apriori/apriori.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import logging 7 | 8 | from numpy import * 9 | 10 | TRACE = logging.DEBUG - 1 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | # level=TRACE, 14 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 15 | ) 16 | 17 | 18 | def load_fake_dataset(): 19 | return [ 20 | [1, 3, 4], 21 | [2, 3, 5], 22 | [1, 2, 3, 5], 23 | [2, 5], 24 | ] 25 | 26 | 27 | def drop_unsupported_candidate_set( 28 | dataset, candidate_sets_k, min_support_degree): 29 | """ 30 | '支持度': 数据集中包含该项集的记录所占的比例 31 | 32 | Parameters 33 | ---------- 34 | dataset : 35 | 数据集 36 | candidate_sets_k : 37 | 候选项集集合 38 | min_support_degree : 39 | 最小支持度 40 | 41 | Returns 42 | ------- 43 | result, support_degrees : 44 | 支持度 >= min_support_degree的频繁项集, 频繁项集的支持度 45 | """ 46 | candidate_set_count = {} 47 | for transaction in dataset: 48 | for candidate_set in candidate_sets_k: 49 | if candidate_set.issubset(transaction): 50 | num = candidate_set_count.get(candidate_set, 0) 51 | candidate_set_count[candidate_set] = num + 1 52 | result = [] 53 | support_degrees = {} 54 | for candidate_set in candidate_set_count: 55 | # 计算每一项的支持度 56 | support = 1.0 * candidate_set_count[candidate_set] / len(dataset) 57 | if support >= min_support_degree: 58 | result.insert(0, candidate_set) 59 | support_degrees[candidate_set] = support 60 | return result, support_degrees 61 | 62 | 63 | def generate_candidate_sets_k(original_sets, k): 64 | if k == 1: 65 | """构建大小为1的所有候选项集合""" 66 | c1 = set([]) 67 | for transaction in original_sets: 68 
| for item in transaction: 69 | c1.add(item) 70 | return list(map(frozenset, 71 | sorted(list(map( 72 | lambda item: [item, ], c1))))) 73 | 74 | for one_set in original_sets: 75 | assert len(one_set) == k - 1 76 | candidate_sets_k = [] 77 | for i in range(len(original_sets)): 78 | for j in range(i + 1, len(original_sets)): 79 | # 如果两个集合的前 k-2 个元素相同, 则将它们合并为一个大小为 k 的集合 80 | # 原因见书的 P208, 第二段 81 | if sorted(list(original_sets[i])[:k - 2]) \ 82 | == sorted(list(original_sets[j])[:k - 2]): 83 | candidate_sets_k.append(original_sets[i] | original_sets[j]) 84 | return candidate_sets_k 85 | 86 | 87 | def apriori(raw_dataset, min_support_degree=0.5): 88 | candidate_sets = generate_candidate_sets_k(raw_dataset, 1) 89 | dataset = list(map(set, raw_dataset)) 90 | frequent_items, all_support_degree = drop_unsupported_candidate_set( 91 | dataset, candidate_sets, min_support_degree 92 | ) 93 | all_frequent_items = [frequent_items, ] 94 | k = 2 95 | while len(all_frequent_items[k - 2]) > 0: 96 | candidate_sets = generate_candidate_sets_k(all_frequent_items[k - 2], k) 97 | frequent_items, support_degrees = drop_unsupported_candidate_set( 98 | dataset, candidate_sets, min_support_degree 99 | ) 100 | all_support_degree.update(support_degrees) 101 | all_frequent_items.append(frequent_items) 102 | k += 1 103 | return all_frequent_items, all_support_degree 104 | 105 | 106 | class Rule(object): 107 | def __init__(self, conditions, consequence, confidence_degree): 108 | self.conditions = list(conditions) 109 | self.consequence = list(consequence) 110 | self.confidence_degree = confidence_degree 111 | 112 | def __str__(self): 113 | return '{0} --> {1}, confidence: {2}'.format( 114 | self.conditions, 115 | self.consequence, 116 | self.confidence_degree 117 | ) 118 | 119 | @classmethod 120 | def generate_rules( 121 | cls, frequent_sets, support_degrees, 122 | min_confidence_degree=0.7): 123 | rules = [] 124 | # only get the sets with two or more items 125 | for i in range(1, len(frequent_sets)): 126 | for frequent_set in frequent_sets[i]: 127 | H1 = [frozenset([item]) for item in frequent_set] 128 | if i == 1: 129 | legal_rules = cls.__rules_from_confidence_degree( 130 | frequent_set, H1, support_degrees, 131 | min_confidence_degree 132 | ) 133 | else: 134 | legal_rules = cls.__rules_from_consequences( 135 | frequent_set, H1, support_degrees, 136 | min_confidence_degree 137 | ) 138 | rules.extend(legal_rules) 139 | return rules 140 | 141 | @classmethod 142 | def __rules_from_confidence_degree( 143 | cls, frequent_set, consequences, support_degrees, 144 | min_confidence_degree): 145 | """计算置信度 146 | '置信度': P -> H 的置信度为 support(P ∪ H) / support(P) 147 | 'P ∪ H': P 与 H 的并集 148 | 149 | Parameters 150 | ---------- 151 | frequent_set 152 | consequences 153 | support_degrees 154 | min_confidence_degree 155 | 156 | Returns 157 | ------- 158 | 159 | """ 160 | legal_rules = [] 161 | for consequence in consequences: 162 | conditions = frequent_set - consequence 163 | # 计算置信度 164 | confidence = support_degrees[frequent_set] / support_degrees[conditions] 165 | if confidence >= min_confidence_degree: 166 | rule = Rule( 167 | conditions, consequence, 168 | confidence 169 | ) 170 | legal_rules.append(rule) 171 | logging.debug(rule) 172 | return legal_rules 173 | 174 | @classmethod 175 | def __rules_from_consequences( 176 | cls, frequent_set, consequences, support_degrees, 177 | min_confidence_degree): 178 | # try further merging 179 | if len(frequent_set) <= len(consequences[0]) + 1: 180 | return None 181 | 182 | # create Hm+1 new 
candidates 183 | Hmp1 = generate_candidate_sets_k(consequences, len(consequences[0]) + 1) 184 | legal_rules = cls.__rules_from_confidence_degree( 185 | frequent_set, Hmp1, support_degrees, 186 | min_confidence_degree 187 | ) 188 | legal_consequence = list(map( 189 | lambda rule: rule.consequence, 190 | legal_rules 191 | )) 192 | # need at least two sets to merge 193 | if len(legal_consequence) > 1: 194 | sub_rules = cls.__rules_from_consequences( 195 | frequent_set, legal_consequence, support_degrees, 196 | min_confidence_degree 197 | ) 198 | if sub_rules is not None: 199 | legal_rules.extend(sub_rules) 200 | return legal_rules 201 | 202 | 203 | def main(): 204 | import pprint 205 | raw_dataset = load_fake_dataset() 206 | frequent_sets, support_degrees = apriori( 207 | raw_dataset, min_support_degree=0.5 208 | ) 209 | logging.info('frequent_sets: {0}'.format( 210 | pprint.pformat(frequent_sets) 211 | )) 212 | logging.info('support_degrees: {0}'.format( 213 | pprint.pformat(support_degrees) 214 | )) 215 | 216 | rules = Rule.generate_rules( 217 | frequent_sets, support_degrees, 218 | min_confidence_degree=0.7 219 | ) 220 | logging.info(rules) 221 | 222 | if __name__ == '__main__': 223 | main() 224 | 225 | 226 | 227 | # from time import sleep 228 | # from votesmart import votesmart 229 | # votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030' 230 | # #votesmart.apikey = 'get your api key first' 231 | # def getActionIds(): 232 | # actionIdList = []; billTitleList = [] 233 | # fr = open('recent20bills.txt') 234 | # for line in fr.readlines(): 235 | # billNum = int(line.split('\t')[0]) 236 | # try: 237 | # billDetail = votesmart.votes.getBill(billNum) #api call 238 | # for action in billDetail.actions: 239 | # if action.level == 'House' and \ 240 | # (action.stage == 'Passage' or action.stage == 'Amendment Vote'): 241 | # actionId = int(action.actionId) 242 | # print('bill: %d has actionId: %d' % (billNum, actionId)) 243 | # actionIdList.append(actionId) 244 | # billTitleList.append(line.strip().split('\t')[1]) 245 | # except: 246 | # print("problem getting bill %d" % billNum) 247 | # sleep(1) #delay to be polite 248 | # return actionIdList, billTitleList 249 | # 250 | # def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints 251 | # itemMeaning = ['Republican', 'Democratic']#list of what each item stands for 252 | # for billTitle in billTitleList:#fill up itemMeaning list 253 | # itemMeaning.append('%s -- Nay' % billTitle) 254 | # itemMeaning.append('%s -- Yea' % billTitle) 255 | # transDict = {}#list of items in each transaction (politician) 256 | # voteCount = 2 257 | # for actionId in actionIdList: 258 | # sleep(3) 259 | # print('getting votes for actionId: %d' % actionId) 260 | # try: 261 | # voteList = votesmart.votes.getBillActionVotes(actionId) 262 | # for vote in voteList: 263 | # if not transDict.has_key(vote.candidateName): 264 | # transDict[vote.candidateName] = [] 265 | # if vote.officeParties == 'Democratic': 266 | # transDict[vote.candidateName].append(1) 267 | # elif vote.officeParties == 'Republican': 268 | # transDict[vote.candidateName].append(0) 269 | # if vote.action == 'Nay': 270 | # transDict[vote.candidateName].append(voteCount) 271 | # elif vote.action == 'Yea': 272 | # transDict[vote.candidateName].append(voteCount + 1) 273 | # except: 274 | # print("problem getting actionId: %d" % actionId) 275 | # voteCount += 2 276 | # return transDict, itemMeaning 277 | 
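# --- editor's sketch (appended; not part of the original apriori.py) ---
# The least obvious step in apriori() is the "first k-2 elements" merge rule
# commented inside generate_candidate_sets_k() (see the book, P208). The tiny
# helper below only illustrates that rule; the name `_demo_candidate_merge`
# is made up for this note and is never called by main().
def _demo_candidate_merge():
    # Three frequent 2-item sets. When building the 3-item candidates, only
    # the pair whose first k-2 = 1 sorted element matches gets merged, so the
    # candidate {0, 1, 2} is generated exactly once instead of three times.
    frequent_2_sets = [frozenset([0, 1]), frozenset([0, 2]), frozenset([1, 2])]
    candidates_3 = generate_candidate_sets_k(frequent_2_sets, 3)
    logging.info(candidates_3)  # expected: [frozenset([0, 1, 2])]
    return candidates_3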
-------------------------------------------------------------------------------- /ch12 - FP-growth/fpGrowth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import logging 7 | 8 | TRACE = logging.DEBUG - 1 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | # level=TRACE, 12 | format='[%(levelname)s %(module)s line:%(lineno)d] %(message)s', 13 | ) 14 | 15 | 16 | class Node(object): 17 | def __init__(self, name, num_occur, parent): 18 | self.parent = parent 19 | self.name = name 20 | self.count = num_occur 21 | self.nodeLink = None 22 | self.children = {} 23 | 24 | def inc(self, num_occur): 25 | self.count += num_occur 26 | 27 | def display(self, depth=1): 28 | print(' ' * depth, self.name, ' ', self.count) 29 | for child in self.children.values(): 30 | child.display(depth + 1) 31 | 32 | def __str__(self): 33 | return self.name 34 | 35 | def __repr__(self): 36 | return str(self) 37 | 38 | 39 | class TableItem(object): 40 | def __init__(self, count, head): 41 | self.count = count 42 | self.head = head 43 | 44 | def __str__(self): 45 | return '({0}, {1})'.format(self.count, self.head) 46 | 47 | def __repr__(self): 48 | return str(self) 49 | 50 | def __cmp__(self, other): 51 | if self.count != other.count: 52 | return cmp(self.count, other.count) 53 | else: 54 | return cmp(self.head, other.head) 55 | 56 | 57 | class FrequentPatternTree(object): 58 | def __init__(self, dataset, min_support_degree=1): 59 | self.min_support_degree = min_support_degree 60 | self.table = {} 61 | # 对每个元素出现次数进行计数 62 | for transaction in dataset: 63 | for item in transaction: 64 | self.table[item] = ( 65 | self.table.get(item, 0) + dataset[transaction] 66 | ) 67 | # 删除出现次数少于 min_support_degree 的项 68 | self.table = { 69 | key: value for (key, value) in self.table.items() 70 | if value >= self.min_support_degree 71 | } 72 | frequent_items = set(self.table.keys()) 73 | 74 | # 如果所有项都不频繁, 跳过下面的处理步骤 75 | if len(frequent_items) == 0: 76 | self.root = None 77 | self.table = None 78 | return 79 | 80 | # 扩展 headerTable 以便保存计数值以及指向每种类型第一个元素项的指针 81 | self.table = { 82 | key: TableItem(value, None) for (key, value) in self.table.items() 83 | } 84 | 85 | self.root = Node('Null Set', 1, None) 86 | for transaction, count in dataset.items(): 87 | local_dataset = {} 88 | for item in transaction: # put transaction items in order 89 | if item in frequent_items: 90 | local_dataset[item] = self.table[item].count 91 | if len(local_dataset) > 0: 92 | ordered_items = [v[0] for v in sorted( 93 | local_dataset.items(), key=lambda p: p[1], 94 | reverse=True 95 | )] 96 | # populate tree with ordered freq itemset 97 | self.__update(ordered_items, self.root, count) 98 | 99 | @property 100 | def is_empty(self): 101 | return self.root is None 102 | 103 | def __update(self, items, root, count): 104 | if items[0] in root.children: 105 | # 如果已经在孩子列表中, 增加出现次数 106 | root.children[items[0]].inc(count) 107 | else: 108 | # 把结点添加到当前结点的子节点上 109 | root.children[items[0]] = Node(items[0], count, root) 110 | # 更新 table 111 | if self.table[items[0]].head is None: 112 | self.table[items[0]].head = root.children[items[0]] 113 | else: 114 | temp = self.table[items[0]].head 115 | while temp.nodeLink is not None: 116 | temp = temp.nodeLink 117 | temp.nodeLink = root.children[items[0]] 118 | # call update() with remaining ordered items 119 | if len(items) > 1: 120 | self.__update(items[1:], root.children[items[0]], count) 121 | 122 
| @staticmethod 123 | def find_prefix_paths(element, node): 124 | paths = {} 125 | while node is not None: 126 | leaf = node 127 | prefix = [] 128 | while leaf.parent is not None: 129 | prefix.append(leaf.name) 130 | leaf = leaf.parent 131 | if len(prefix) > 1: 132 | paths[frozenset(prefix[1:])] = node.count 133 | node = node.nodeLink 134 | return paths 135 | 136 | def mine(self, prefix=None): 137 | if prefix is None: 138 | prefix = set([]) 139 | frequent_items = [] 140 | # (sort header table) 141 | items = [ 142 | pair[0] for pair in 143 | sorted(self.table.items(), key=lambda p: p[1]) 144 | ] 145 | for item in items: 146 | new_frequent_set = prefix | {item} 147 | # print('finalFrequent Item: ', new_frequent_set) 148 | frequent_items.append(tuple(new_frequent_set)) 149 | condition_pattern_bases = self.find_prefix_paths( 150 | item, self.table[item].head 151 | ) 152 | # print('condition_pattern_bases :', item, condition_pattern_bases) 153 | # 2. construct cond FP-tree from cond. pattern base 154 | condition_tree = FrequentPatternTree( 155 | condition_pattern_bases, 156 | self.min_support_degree 157 | ) 158 | # print('head from conditional tree: ', condition_table) 159 | if not condition_tree.is_empty: # 3. mine cond. FP-tree 160 | # logging.debug('conditional tree for: {0}'.format(new_frequent_set)) 161 | # condition_tree.display(1) 162 | sub_frequent_items = condition_tree.mine(new_frequent_set) 163 | frequent_items.extend(sub_frequent_items) 164 | return frequent_items 165 | 166 | 167 | def load_fake_dataset(): 168 | dataset = [ 169 | ['r', 'z', 'h', 'j', 'p'], 170 | ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], 171 | ['z'], 172 | ['r', 'x', 'n', 'o', 's'], 173 | ['y', 'r', 'x', 'z', 'q', 't', 'p'], 174 | ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'], 175 | ] 176 | return dataset 177 | 178 | 179 | def main(): 180 | import pprint 181 | dataset = load_fake_dataset() 182 | dataset = {frozenset(transaction): 1 for transaction in dataset} 183 | fp_tree = FrequentPatternTree(dataset, min_support_degree=3) 184 | logging.info(pprint.pformat(fp_tree.table)) 185 | 186 | logging.info(FrequentPatternTree.find_prefix_paths('x', fp_tree.table['x'].head)) 187 | logging.info(FrequentPatternTree.find_prefix_paths('z', fp_tree.table['z'].head)) 188 | logging.info(FrequentPatternTree.find_prefix_paths('r', fp_tree.table['r'].head)) 189 | 190 | frequent_items = fp_tree.mine() 191 | logging.info(pprint.pformat(frequent_items)) 192 | 193 | dataset = [] 194 | with open('kosarak.dat', 'r') as infile: 195 | for line in infile: 196 | dataset.append(line.split()) 197 | dataset = {frozenset(transaction): 1 for transaction in dataset} 198 | logging.debug(len(dataset)) 199 | min_support_degree = 100000 200 | fp_tree = FrequentPatternTree(dataset, min_support_degree) 201 | frequent_items = fp_tree.mine() 202 | logging.info(pprint.pformat(frequent_items)) 203 | 204 | 205 | if __name__ == '__main__': 206 | main() 207 | 208 | 209 | # import twitter 210 | # from time import sleep 211 | # import re 212 | # 213 | # def textParse(bigString): 214 | # urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) 215 | # listOfTokens = re.split(r'\W*', urlsRemoved) 216 | # return [tok.lower() for tok in listOfTokens if len(tok) > 2] 217 | # 218 | # def getLotsOfTweets(searchStr): 219 | # CONSUMER_KEY = '' 220 | # CONSUMER_SECRET = '' 221 | # ACCESS_TOKEN_KEY = '' 222 | # ACCESS_TOKEN_SECRET = '' 223 | # api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, 224 | # 
access_token_key=ACCESS_TOKEN_KEY, 225 | # access_token_secret=ACCESS_TOKEN_SECRET) 226 | # #you can get 1500 results 15 pages * 100 per page 227 | # resultsPages = [] 228 | # for i in range(1,15): 229 | # print("fetching page %d" % i) 230 | # searchResults = api.GetSearch(searchStr, per_page=100, page=i) 231 | # resultsPages.append(searchResults) 232 | # sleep(6) 233 | # return resultsPages 234 | # 235 | # def mineTweets(tweetArr, minSup=5): 236 | # parsedList = [] 237 | # for i in range(14): 238 | # for j in range(100): 239 | # parsedList.append(textParse(tweetArr[i][j].text)) 240 | # initSet = createInitSet(parsedList) 241 | # myFPtree, myHeaderTab = createTree(initSet, minSup) 242 | # myFreqList = [] 243 | # mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) 244 | # return myFreqList 245 | # 246 | --------------------------------------------------------------------------------
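# --- editor's sketch (appended; not part of fpGrowth.py above) ---
# main() builds the FP-tree input with `{frozenset(transaction): 1 for ...}`,
# which silently collapses duplicate transactions into a single count of 1.
# The helper below accumulates counts instead; the name `create_initial_set`
# is made up here (echoing the `createInitSet` referenced in the commented-out
# mineTweets code) and is only a sketch of the same idea.
def create_initial_set(transactions):
    counts = {}
    for transaction in transactions:
        key = frozenset(transaction)
        # Identical transactions add up instead of overwriting each other.
        counts[key] = counts.get(key, 0) + 1
    return counts

# Usage sketch, mirroring main():
#   fp_tree = FrequentPatternTree(create_initial_set(load_fake_dataset()),
#                                 min_support_degree=3)
#   frequent_items = fp_tree.mine()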