├── README.md ├── ch1 ├── 1.py └── README.md ├── ch10 ├── Portland.png ├── README.md ├── kMeans.py ├── matplotlib │ ├── README.md │ ├── kMeans.py │ └── 对地图上的点进行聚类.py ├── places.txt ├── portlandClubs.txt ├── screenshot │ ├── Map.png │ ├── Portland.png │ ├── README.md │ ├── dataSet2.png │ ├── k-均值聚类的结果示意图.png │ ├── 二分k-均值算法.png │ └── 数据集.png ├── testSet.txt ├── testSet2.txt └── 对地图上的点进行聚类.py ├── ch11 ├── Apriori.py ├── README.md ├── bills20DataSet.txt ├── lawAssnRules.txt ├── meaning20.txt ├── mushroom.dat ├── recent100bills.txt ├── recent20bills.txt └── votesmart.py ├── ch12 ├── FP-growth.py ├── README.md ├── Twitter.py └── kosarak.zip ├── ch13 ├── README.md ├── iris.data.txt ├── matplotlib │ ├── README.md │ ├── 数据集.py │ ├── 方差百分比.py │ ├── 降维.py │ └── 降维2.py ├── pca.py ├── screenshot │ ├── README.md │ ├── 数据集.png │ ├── 方差百分比.png │ ├── 降维.png │ └── 降维2.png ├── secom.data ├── testSet.txt └── testSet3.txt ├── ch14 ├── 0_5.txt ├── README.md └── SVD.py ├── ch15 ├── README.md ├── err.txt ├── inputFile.txt ├── junk.txt ├── kickStart.txt ├── mrMean.py ├── mrMeanMapper.py ├── mrMeanReducer.py ├── mrSVM.py ├── mrSVMkickStart.py ├── myfile.txt ├── myout.txt ├── pegasos.py ├── proximalSVM.py ├── py27dbg.py ├── svmDat2.txt ├── svmDat26 ├── svmDat27 ├── svmData.txt ├── testSet.txt ├── testSet200.txt └── wc.py ├── ch2 ├── KNN.txt ├── KNN(classify).py ├── KNN(datingTestSet).py ├── KNN(dating完整版).py ├── KNN(handwriting).py ├── README.md ├── creatDist.py ├── datingTestSet.txt ├── datingTraningSet.txt ├── testDigits.zip ├── testSet.txt └── trainingDigits.zip ├── ch3 ├── README.md ├── TheTree.txt ├── calcShannonEnt.py ├── chooseBestFeatureToSplit.py ├── createTree.py ├── lenses.txt ├── lensesTree.txt ├── plotTree.py └── screenshot │ ├── TheTree.png │ └── lensesTree.png ├── ch4 ├── README.md ├── advertisement.py ├── classifyNB && testingNB.py ├── email.py ├── ham.zip ├── matplotlib │ ├── README.md │ ├── math_matplotlib.py │ ├── matplotlib.py │ └── srceenshot │ │ ├── math_matplotlib.png │ │ └── matplotlib.png ├── spam.zip └── trainNB0.py ├── ch5 ├── README.md ├── matplotlib │ ├── sigmoid.py │ └── 梯度上升.py ├── screenshot │ ├── Logistic回归最佳拟合直线.png │ ├── sigmoid.png │ ├── 改进随机梯度上升.png │ ├── 梯度上升.png │ └── 随机梯度上升.png ├── testSet.txt ├── 使用梯度上升找最佳拟合直线.py ├── 改进随机梯度上升.py └── 随机梯度上升.py ├── ch6 ├── README.md ├── digits.zip ├── matplotlib │ ├── 4个线性不可分的数据集效果图.py │ ├── README.md │ ├── 完整版SMO效果图.py │ ├── 核方法中的非线性可分数据效果图.py │ └── 简化版SMO处理小数据集效果图.py ├── screenshot │ ├── 4个线性不可分的数据集.png │ ├── README.md │ ├── 完整版SMO.png │ ├── 核方法中的非线性可分数据.png │ └── 简化版SMO效果图.png ├── testSet.txt ├── testSetRBF.txt ├── testSetRBF2.txt ├── 完整版Platt SMO.py └── 简化版SMO处理小数据集.py ├── ch7 ├── Adaboost.py ├── README.md ├── horseColicTest2.txt ├── horseColicTraining2.txt ├── matplotlib │ └── 单层决策树测试数据.py └── screenshot │ ├── README.md │ ├── ROC曲线.png │ └── 单层决策树测试数据.png ├── ch8 ├── README.md ├── abalone.txt ├── ex0.txt ├── ex1.txt ├── matplotlib │ ├── 前向逐步回归.py │ ├── 局部加权线性回归.py │ ├── 岭回归.py │ └── 线性回归找到最佳拟合曲线.py ├── screenshot │ ├── README.md │ ├── 前向逐步回归.png │ ├── 局部加权线性回归(k=0.003).png │ ├── 局部加权线性回归(k=0.01).png │ ├── 局部加权线性回归(k=1.0).png │ ├── 岭回归.png │ ├── 数据分布.png │ └── 线性回归找到最佳拟合曲线.png ├── 前向逐步回归.py ├── 局部加权线性回归.py ├── 岭回归.py └── 线性回归找到最佳拟合曲线.py └── ch9 ├── CRAT算法用于回归.py ├── README.md ├── bikeSpeedVsIq_test.txt ├── bikeSpeedVsIq_train.txt ├── ex0.txt ├── ex00.txt ├── ex2.txt ├── ex2test.txt ├── exp.txt ├── exp2.txt ├── expTest.txt ├── matplotlib ├── 基于CART算法构建回归树的简单数据集.py ├── 放大100倍.py ├── 测试模型树构建函数的测试数据.py └── 用于测试回归树的分段常数数据集.py ├── regTrees.py ├── 
screenshot ├── README.md ├── 基于CART算法构建回归树的简单数据集.png ├── 放大100倍.png ├── 测试模型树构建函数的测试数据.png ├── 用于测试回归树的分段常数数据集.png └── 骑自行车速度.png ├── sine.txt └── treeExplore.py /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-in-Action 2 | Practices & Code from Machine-Learning-in-Action. 3 | 4 | 《机器学习实战》的笔记与代码。 5 | 6 | ## schedule 7 | ### PART 1 CLASSIFICATION 8 | 1. [x] chapter01 - Machine learning basics (机器学习基础) 9 | 2. [x] chapter02 - Classifying with k-Nearest Neighbors (k-近邻) 10 | 3. [x] chapter03 - Splitting datasets one feature at a time: decision trees (决策树) 11 | 4. [x] chapter04 - Classifying with probability theory: naive Bayes (朴素贝叶斯) 12 | 5. [x] chapter05 - Logistic regression (Logistic回归) 13 | 6. [x] chapter06 - Support vector machines (支持向量机) 14 | 7. [x] chapter07 - Improving classification with the AdaBoost meta-algorithm (利用AdaBoost元算法提高分类性能) 15 | 16 | ### PART 2 FORECASTING NUMERIC VALUES WITH REGRESSION 17 | 8. [x] chapter08 - Predicting numeric values: regression (预测数值型数据:回归) 18 | 9. [x] chapter09 - Tree-based regression (树回归) 19 | 20 | ### PART 3 UNSUPERVISED LEARNING 21 | 10. [x] chapter10 - Grouping unlabeled items using k-means clustering (利用K-均值聚类算法对未标注数据分组) 22 | 11. [x] chapter11 - Association analysis with the Apriori algorithm (使用Apriori算法进行关联分析) 23 | 12. [x] chapter12 - Efficiently finding frequent itemsets with FP-growth (使用FP-growth算法来高效发现频繁项集) 24 | 25 | ### PART 4 ADDITIONAL TOOLS 26 | 13. [x] chapter13 - Using principal component analysis to simplify data (利用PCA来简化数据) 27 | 14. [x] chapter14 - Simplifying data with the singular value decomposition (利用SVD简化数据) 28 | 15. [x] chapter15 - Big data and MapReduce (大数据与MapReduce) 29 | -------------------------------------------------------------------------------- /ch1/1.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | 4 | from random import * 5 | from numpy import * 6 | 7 | arr = random.rand(4,4) 8 | 9 | print arr 10 | 11 | randMat = mat(random.rand(4,4)) 12 | print randMat.I #逆矩阵 13 | 14 | print randMat*randMat.I #单位矩阵 15 | 16 | 17 | -------------------------------------------------------------------------------- /ch1/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Machine learning basics 3 | ## 学这本书之前,请把线性代数,统计学,python,numpy库,matplotlib等这些前置技能学好!!!不然将会学得很艰难!!! 
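As a quick sanity check that NumPy is set up, the block below builds a random 4×4 matrix, inverts it, and multiplies the two back together to get (approximately) the identity matrix. The repo's version uses Python 2 `print` statements; a Python 3 rendering of the same warm-up, added here purely for illustration, is:

```python
# Python 3 version of the NumPy warm-up below (illustrative; the repo's code targets Python 2).
import numpy as np

arr = np.random.rand(4, 4)             # 4x4 matrix of uniform random numbers
print(arr)

rand_mat = np.mat(np.random.rand(4, 4))
print(rand_mat.I)                       # inverse matrix
print(rand_mat * rand_mat.I)            # ~ identity matrix, up to floating-point error
```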
4 | ### None 5 | 6 | ``` 7 | from random import * 8 | from numpy import * 9 | 10 | arr = random.rand(4,4) 11 | 12 | print arr 13 | 14 | randMat = mat(random.rand(4,4)) 15 | print randMat.I #逆矩阵 16 | 17 | print randMat*randMat.I #单位矩阵 18 | ``` 19 | -------------------------------------------------------------------------------- /ch10/Portland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/Portland.png -------------------------------------------------------------------------------- /ch10/README.md: -------------------------------------------------------------------------------- 1 | # Ch10 - 利用K-均值聚类算法对未标注数据分组(Grouping unlabeled items using k-means clustering) 2 | 3 | #### 以前我们学的都是监督学习算法,现在我们开始学习无监督学习算法。 4 | #### 所谓无监督学习是指事先并不知道要寻找的内容,即没有目标变量。 5 | #### K-means是聚类算法,聚类是将相似的样本分到同一个簇中,类似全自动分类,根据簇内的对象越相似,聚类的效果就越好。K-means是可以发现k个不同的簇,而且每个簇的中心采用簇中所含值的均值计算而成。 聚类与分类的最大不同在于, 分类的目标事先已知,聚类的标签事先不知道。 6 | #### K-均值算法的伪代码如下: 7 | ### 创建k个点作为起始质心(通常随机选择) 8 | ``` 9 | 当任意一个点的簇分配结果发生改变时: 10 | 对数据集中的每个点: 11 | 对每个质心: 12 | 计算质心与数据点之间的距离 13 | 将数据点分配到距离其最近的簇 14 | 对每一个簇,计算簇中所有点的均值并将均值作为质心。 15 | ``` 16 | #### K-均值聚类算法接收4个参数,两个必要参数为数据集和k的值,另外两个为距离计算函数和初始化函数(可修改)。算法采用计算质心-分配-重新计算质心反复迭代的方式,直到所有点的分配结果不再改变。设置flag为clusterChange=True。 17 | 18 | #### 聚类算法中,k的值是由用户初始定义的,如何才能判断k值定义是否合适,就需要用误差来评价聚类效果的好坏,误差是各个点与其所属类别质心的距离决定的。K-均值聚类的方法效果较差的原因是会收敛到局部最小值,而且全局最小。一种评价聚类效果的方法是SSE(Sum of Squared Error)误差平方和的方法,取平方的结果是使得远离中心的点变得更加突出。 一种降低SSE的方法是增加簇的个数,即提高k值,但是违背了聚类的目标,聚类的目标是在不改变簇数目的前提下提高簇的质量。可选的改进的方法是对生成的簇进行后处理,将最大SSE值的簇划分成两个(K=2的K-均值算法),然后再进行相邻的簇合并。具体方法有两种:1、合并最近的两个质心(合并使得SSE增幅最小的两个质心)2、遍历簇 合并两个然后计算SSE的值,找到使得SSE最小的情况。 19 | 20 | ## 测试K-means的数据集 21 | ![数据集.png](screenshot/数据集.png) 22 | 23 | ## 运行k-均值聚类的结果示意图 24 | ![k-均值聚类的结果示意图](screenshot/k-均值聚类的结果示意图.png) 25 | 26 | #### 但是K-means的聚类效果比较差,因为很容易收敛到局部最小值,而非全局最小值。所以我们要用新的方法(二分K-means)去改进K-means。 27 | 28 | ## 二分K-均值算法 29 | #### 二分K-均值类似后处理的切分思想,初始状态所有数据点属于一个大簇,之后每次选择一个簇切分成两个簇,这个切分满足使SSE值最大程度降低,直到簇数目达到k。另一种思路是每次选择SSE值最大的一个簇进行切分。 满足使SSE值最大程度降低伪代码如下: 30 | #### 将所有点看成一个簇。 31 | ``` 32 | 当簇数目小于k时 33 | 对于每一个簇: 34 | 计算总误差 35 | 在给定的簇上面进行K-均值聚类(k=2) 36 | 计算将该簇一分为二后的总误差 37 | 选择使得误差最小的那个簇进行划分操作 38 | ``` 39 | ## 测试二分K-means的数据集: 40 | ![dataSet2.png](screenshot/dataSet2.png) 41 | 42 | ## 运行二分K-means的效果图: 43 | ![二分k-均值算法.png](screenshot/二分k-均值算法.png) 44 | 45 | # 示例:对地图上的点聚类 46 | ## 原地图: 47 | ![Portland.png](screenshot/Portland.png) 48 | 49 | ## 经过二分K-means算法处理后的图:(簇为5)(簇的数目可以手动改变) 50 | ![Map.png](screenshot/Map.png) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /ch10/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib 2 | -------------------------------------------------------------------------------- /ch10/places.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 45.486502 -122.788346 2 | Hotties 10140 SW Canyon Rd. 
Beaverton, OR 45.493150 -122.781021 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 45.498187 -122.766147 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 45.485943 -122.800311 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 45.508203 -122.781853 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 45.493398 -122.779628 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 45.430319 -122.376304 8 | 505 Club 505 Burnside Rd Gresham, OR 45.507621 -122.425553 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 45.399070 -122.618893 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 45.427072 -122.634159 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 45.462173 -122.638846 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 45.485396 -122.646587 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 45.464826 -122.699212 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 45.646831 -122.842918 15 | Cabaret II 17544 Stark St Portland, OR 45.519142 -122.482480 16 | Cabaret Lounge 503 W Burnside Portland, OR 45.523094 -122.675528 17 | Carnaval 330 SW 3rd Avenue Portland, OR 45.520682 -122.674206 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 45.543016 -122.720828 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 45.562715 -122.593078 20 | Club 205 9939 Stark St Portland, OR 45.519052 -122.561510 21 | Club Rouge 403 SW Stark Portland, OR 45.520561 -122.675605 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 45.584124 -122.682725 23 | Devil's Point 5305 SE Foster Rd Portland, OR 45.495365 -122.608366 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 45.497750 -122.524073 25 | Dream on Saloon 15920 Stark St Portland, OR 45.519142 -122.499672 26 | DV8 5003 Powell Blvd Portland, OR 45.497498 -122.611177 27 | Exotica 240 Columbia Blvd Portland, OR 45.583048 -122.668350 28 | Frolics 8845 Sandy Blvd Portland, OR 45.555384 -122.571475 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 45.554263 -122.574167 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 45.547229 -122.578746 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 45.484823 -122.589208 32 | Glimmers 3532 Powell Blvd Portland, OR 45.496918 -122.627920 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 45.520714 -122.674189 34 | Heat 12131 SE Holgate Blvd. Portland, OR 45.489637 -122.538196 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 45.548651 -122.578730 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 45.497765 -122.523985 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 45.555811 -122.600881 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 45.526306 -122.538833 39 | Landing Strip 6210 Columbia Blvd Portland, OR 45.595042 -122.728825 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 45.501585 -122.659310 41 | Lure 11051 Barbur Blvd Portland, OR 45.445233 -122.732606 42 | Magic Garden 217 4th Ave Portland, OR 45.524692 -122.674466 43 | Mary's Club 129 Broadway Portland, OR 45.535101 -122.667390 44 | Montego's 15826 SE Division Portland, OR 45.504448 -122.500034 45 | Mr. Peeps 709 122nd Ave Portland, OR 45.527863 -122.537726 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 45.532426 -122.628865 47 | Mystic 9950 SE Stark St. 
Portland, OR 45.519037 -122.561283 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 45.540098 -122.641114 49 | Oh Zone 6218 Columbia Blvd Portland, OR 45.595069 -122.728961 50 | Pallas Club 13639 Powell Blvd Portland, OR 45.497990 -122.522849 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 45.549288 -122.586505 52 | Private Pleasures 10931 53rd Ave Portland, OR 45.446442 -122.731034 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 45.547337 -122.578744 54 | Riverside Corral 545 Tacoma St Portland, OR 45.464338 -122.660285 55 | Rooster's 605 Columbia Blvd Portland, OR 45.583693 -122.672462 56 | Rose City Strip 3620 35th Pl Portland, OR 45.496601 -122.627688 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 45.497091 -122.634581 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 45.517225 -122.656367 59 | Secret Rendezvous 12503 Division St Portland, OR 45.504087 -122.534481 60 | Shimmers 7944 Foster Rd Portland, OR 45.483836 -122.581608 61 | Soobie's 333 SE 122nd Ave Portland, OR 45.520162 -122.537787 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 45.523370 -122.672388 63 | Sugar Shack 6732 Killingsworth St Portland, OR 45.562699 -122.593048 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 45.512220 -122.655527 65 | Tommy's Too 10335 Foster Rd Portland, OR 45.476721 -122.557005 66 | Union Jacks 938 Burnside St Portland, OR 45.522902 -122.656249 67 | Video Visions 6723 Killingsworth St Portland, OR 45.562715 -122.593078 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 45.425788 -122.765754 69 | Jiggles 7455 SW Nyberg St Tualatin, OR 45.382682 -122.753932 70 | -------------------------------------------------------------------------------- /ch10/portlandClubs.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 2 | Hotties 10140 SW Canyon Rd. Beaverton, OR 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 8 | 505 Club 505 Burnside Rd Gresham, OR 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 15 | Cabaret II 17544 Stark St Portland, OR 16 | Cabaret Lounge 503 W Burnside Portland, OR 17 | Carnaval 330 SW 3rd Avenue Portland, OR 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 20 | Club 205 9939 Stark St Portland, OR 21 | Club Rouge 403 SW Stark Portland, OR 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 23 | Devil's Point 5305 SE Foster Rd Portland, OR 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 25 | Dream on Saloon 15920 Stark St Portland, OR 26 | DV8 5003 Powell Blvd Portland, OR 27 | Exotica 240 Columbia Blvd Portland, OR 28 | Frolics 8845 Sandy Blvd Portland, OR 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 32 | Glimmers 3532 Powell Blvd Portland, OR 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 34 | Heat 12131 SE Holgate Blvd. 
Portland, OR 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 39 | Landing Strip 6210 Columbia Blvd Portland, OR 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 41 | Lure 11051 Barbur Blvd Portland, OR 42 | Magic Garden 217 4th Ave Portland, OR 43 | Mary's Club 129 Broadway Portland, OR 44 | Montego's 15826 SE Division Portland, OR 45 | Mr. Peeps 709 122nd Ave Portland, OR 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 47 | Mystic 9950 SE Stark St. Portland, OR 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 49 | Oh Zone 6218 Columbia Blvd Portland, OR 50 | Pallas Club 13639 Powell Blvd Portland, OR 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 52 | Private Pleasures 10931 53rd Ave Portland, OR 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 54 | Riverside Corral 545 Tacoma St Portland, OR 55 | Rooster's 605 Columbia Blvd Portland, OR 56 | Rose City Strip 3620 35th Pl Portland, OR 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 59 | Secret Rendezvous 12503 Division St Portland, OR 60 | Shimmers 7944 Foster Rd Portland, OR 61 | Soobie's 333 SE 122nd Ave Portland, OR 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 63 | Sugar Shack 6732 Killingsworth St Portland, OR 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 65 | Tommy's Too 10335 Foster Rd Portland, OR 66 | Union Jacks 938 Burnside St Portland, OR 67 | Video Visions 6723 Killingsworth St Portland, OR 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 69 | Jiggles 7455 SW Nyberg St Tualatin, OR -------------------------------------------------------------------------------- /ch10/screenshot/Map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/Map.png -------------------------------------------------------------------------------- /ch10/screenshot/Portland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/Portland.png -------------------------------------------------------------------------------- /ch10/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch10/screenshot/dataSet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/dataSet2.png -------------------------------------------------------------------------------- /ch10/screenshot/k-均值聚类的结果示意图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/k-均值聚类的结果示意图.png -------------------------------------------------------------------------------- /ch10/screenshot/二分k-均值算法.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/二分k-均值算法.png -------------------------------------------------------------------------------- /ch10/screenshot/数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/数据集.png -------------------------------------------------------------------------------- /ch10/testSet.txt: -------------------------------------------------------------------------------- 1 | 1.658985 4.285136 2 | -3.453687 3.424321 3 | 4.838138 -1.151539 4 | -5.379713 -3.362104 5 | 0.972564 2.924086 6 | -3.567919 1.531611 7 | 0.450614 -3.302219 8 | -3.487105 -1.724432 9 | 2.668759 1.594842 10 | -3.156485 3.191137 11 | 3.165506 -3.999838 12 | -2.786837 -3.099354 13 | 4.208187 2.984927 14 | -2.123337 2.943366 15 | 0.704199 -0.479481 16 | -0.392370 -3.963704 17 | 2.831667 1.574018 18 | -0.790153 3.343144 19 | 2.943496 -3.357075 20 | -3.195883 -2.283926 21 | 2.336445 2.875106 22 | -1.786345 2.554248 23 | 2.190101 -1.906020 24 | -3.403367 -2.778288 25 | 1.778124 3.880832 26 | -1.688346 2.230267 27 | 2.592976 -2.054368 28 | -4.007257 -3.207066 29 | 2.257734 3.387564 30 | -2.679011 0.785119 31 | 0.939512 -4.023563 32 | -3.674424 -2.261084 33 | 2.046259 2.735279 34 | -3.189470 1.780269 35 | 4.372646 -0.822248 36 | -2.579316 -3.497576 37 | 1.889034 5.190400 38 | -0.798747 2.185588 39 | 2.836520 -2.658556 40 | -3.837877 -3.253815 41 | 2.096701 3.886007 42 | -2.709034 2.923887 43 | 3.367037 -3.184789 44 | -2.121479 -4.232586 45 | 2.329546 3.179764 46 | -3.284816 3.273099 47 | 3.091414 -3.815232 48 | -3.762093 -2.432191 49 | 3.542056 2.778832 50 | -1.736822 4.241041 51 | 2.127073 -2.983680 52 | -4.323818 -3.938116 53 | 3.792121 5.135768 54 | -4.786473 3.358547 55 | 2.624081 -3.260715 56 | -4.009299 -2.978115 57 | 2.493525 1.963710 58 | -2.513661 2.642162 59 | 1.864375 -3.176309 60 | -3.171184 -3.572452 61 | 2.894220 2.489128 62 | -2.562539 2.884438 63 | 3.491078 -3.947487 64 | -2.565729 -2.012114 65 | 3.332948 3.983102 66 | -1.616805 3.573188 67 | 2.280615 -2.559444 68 | -2.651229 -3.103198 69 | 2.321395 3.154987 70 | -1.685703 2.939697 71 | 3.031012 -3.620252 72 | -4.599622 -2.185829 73 | 4.196223 1.126677 74 | -2.133863 3.093686 75 | 4.668892 -2.562705 76 | -2.793241 -2.149706 77 | 2.884105 3.043438 78 | -2.967647 2.848696 79 | 4.479332 -1.764772 80 | -4.905566 -2.911070 81 | -------------------------------------------------------------------------------- /ch10/testSet2.txt: -------------------------------------------------------------------------------- 1 | 3.275154 2.957587 2 | -3.344465 2.603513 3 | 0.355083 -3.376585 4 | 1.852435 3.547351 5 | -2.078973 2.552013 6 | -0.993756 -0.884433 7 | 2.682252 4.007573 8 | -3.087776 2.878713 9 | -1.565978 -1.256985 10 | 2.441611 0.444826 11 | -0.659487 3.111284 12 | -0.459601 -2.618005 13 | 2.177680 2.387793 14 | -2.920969 2.917485 15 | -0.028814 -4.168078 16 | 3.625746 2.119041 17 | -3.912363 1.325108 18 | -0.551694 -2.814223 19 | 2.855808 3.483301 20 | -3.594448 2.856651 21 | 0.421993 -2.372646 22 | 1.650821 3.407572 23 | -2.082902 3.384412 24 | -0.718809 -2.492514 25 | 4.513623 3.841029 26 | -4.822011 4.607049 27 | -0.656297 -1.449872 28 | 1.919901 4.439368 29 | -3.287749 3.918836 30 | -1.576936 -2.977622 31 | 3.598143 1.975970 32 | -3.977329 4.900932 33 | -1.791080 -2.184517 34 | 3.914654 3.559303 35 | 
-1.910108 4.166946 36 | -1.226597 -3.317889 37 | 1.148946 3.345138 38 | -2.113864 3.548172 39 | 0.845762 -3.589788 40 | 2.629062 3.535831 41 | -1.640717 2.990517 42 | -1.881012 -2.485405 43 | 4.606999 3.510312 44 | -4.366462 4.023316 45 | 0.765015 -3.001270 46 | 3.121904 2.173988 47 | -4.025139 4.652310 48 | -0.559558 -3.840539 49 | 4.376754 4.863579 50 | -1.874308 4.032237 51 | -0.089337 -3.026809 52 | 3.997787 2.518662 53 | -3.082978 2.884822 54 | 0.845235 -3.454465 55 | 1.327224 3.358778 56 | -2.889949 3.596178 57 | -0.966018 -2.839827 58 | 2.960769 3.079555 59 | -3.275518 1.577068 60 | 0.639276 -3.412840 61 | -------------------------------------------------------------------------------- /ch11/Apriori.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | 6 | def loadDataSet(): 7 | return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] 8 | 9 | # C1 是大小为1的所有候选项集的集合 10 | def createC1(dataSet): 11 | C1 = [] # 存储不重复的项值 12 | for transaction in dataSet: 13 | for item in transaction: 14 | if not [item] in C1: 15 | C1.append([item]) #store all the item unrepeatly 16 | 17 | C1.sort() 18 | #return map(frozenset, C1)#frozen set, user can't change it. 19 | return list(map(frozenset, C1)) 20 | 21 | #该函数用于从 C1 生成 L1 22 | def scanD(D,Ck,minSupport): 23 | #参数:数据集、候选项集列表 Ck以及感兴趣项集的最小支持度 minSupport 24 | ssCnt={} 25 | for tid in D:#遍历数据集 26 | for can in Ck:#遍历候选项 27 | if can.issubset(tid):#判断候选项中是否含数据集的各项 28 | #if not ssCnt.has_key(can): # python3 can not support 29 | if not can in ssCnt: 30 | ssCnt[can]=1 #不含设为1 31 | else: ssCnt[can]+=1#有则计数加1 32 | numItems=float(len(D))#数据集大小 33 | retList = []#L1初始化 34 | supportData = {}#记录候选项中各个数据的支持度 35 | for key in ssCnt: 36 | support = ssCnt[key]/numItems#计算支持度 37 | if support >= minSupport: 38 | retList.insert(0,key)#满足条件加入L1中 39 | supportData[key] = support 40 | return retList, supportData 41 | 42 | #total apriori 43 | def aprioriGen(Lk, k): #组合,向上合并 44 | #creates Ck 参数:频繁项集列表 Lk 与项集元素个数 k 45 | retList = [] 46 | lenLk = len(Lk) 47 | for i in range(lenLk): 48 | for j in range(i+1, lenLk): #两两组合遍历 49 | L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] 50 | L1.sort(); L2.sort() 51 | if L1==L2: #若两个集合的前k-2个项相同时,则将两个集合合并 52 | retList.append(Lk[i] | Lk[j]) #set union 53 | return retList 54 | 55 | #apriori 56 | def apriori(dataSet, minSupport = 0.5): 57 | C1 = createC1(dataSet) 58 | D = list(map(set, dataSet)) #python3 59 | L1, supportData = scanD(D, C1, minSupport)#单项最小支持度判断 0.5,生成L1 60 | L = [L1] 61 | k = 2 62 | while (len(L[k-2]) > 0):#创建包含更大项集的更大列表,直到下一个大的项集为空 63 | Ck = aprioriGen(L[k-2], k)#Ck 64 | Lk, supK = scanD(D, Ck, minSupport)#get Lk 65 | supportData.update(supK) 66 | L.append(Lk) 67 | k += 1 68 | return L, supportData 69 | 70 | #生成关联规则 71 | def generateRules(L, supportData, minConf=0.7): 72 | #频繁项集列表、包含那些频繁项集支持数据的字典、最小可信度阈值 73 | bigRuleList = [] #存储所有的关联规则 74 | for i in range(1, len(L)): #只获取有两个或者更多集合的项目,从1,即第二个元素开始,L[0]是单个元素的 75 | # 两个及以上的才可能有关联一说,单个元素的项集不存在关联问题 76 | for freqSet in L[i]: 77 | H1 = [frozenset([item]) for item in freqSet] 78 | #该函数遍历L中的每一个频繁项集并对每个频繁项集创建只包含单个元素集合的列表H1 79 | if (i > 1): 80 | #如果频繁项集元素数目超过2,那么会考虑对它做进一步的合并 81 | rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) 82 | else:#第一层时,后件数为1 83 | calcConf(freqSet, H1, supportData, bigRuleList, minConf)# 调用函数2 84 | return bigRuleList 85 | 86 | #生成候选规则集合:计算规则的可信度以及找到满足最小可信度要求的规则 87 | def calcConf(freqSet, H, supportData, brl, minConf=0.7): 88 | #针对项集中只有两个元素时,计算可信度 89 | prunedH = []#返回一个满足最小可信度要求的规则列表 
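# (added note) For each candidate consequent, confidence(freqSet - conseq -> conseq)
# = support(freqSet) / support(freqSet - conseq); the loop below computes this value
# and keeps only the consequents whose confidence clears minConf.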
90 | for conseq in H:#后件,遍历 H中的所有项集并计算它们的可信度值 91 | conf = supportData[freqSet]/supportData[freqSet-conseq] #可信度计算,结合支持度数据 92 | if conf >= minConf: 93 | print (freqSet-conseq,'-->',conseq,"conf:",(conf)) # k可信度值 94 | #如果某条规则满足最小可信度值,那么将这些规则输出到屏幕显示 95 | brl.append((freqSet-conseq, conseq, conf))#添加到规则里,brl 是前面通过检查的 bigRuleList 96 | prunedH.append(conseq)#同样需要放入列表到后面检查 97 | return prunedH 98 | 99 | #合并 100 | def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): 101 | #参数:一个是频繁项集,另一个是可以出现在规则右部的元素列表 H 102 | m = len(H[0]) 103 | if (len(freqSet) > (m + 1)): #频繁项集元素数目大于单个集合的元素数 104 | Hmp1 = aprioriGen(H, m+1)#存在不同顺序、元素相同的集合,合并具有相同部分的集合 105 | Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)#计算可信度 106 | if (len(Hmp1) > 1): 107 | #满足最小可信度要求的规则列表多于1,则递归来判断是否可以进一步组合这些规则 108 | rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf) 109 | 110 | if __name__ == "__main__" : 111 | dataSet = loadDataSet() 112 | # print dataSet 113 | L, supportData = apriori(dataSet) 114 | print L # L 包含满足最小支持度为0.5的频繁项集列表 115 | # print supportData 116 | rules = generateRules(L,supportData,minConf=0.5) 117 | print rules -------------------------------------------------------------------------------- /ch11/README.md: -------------------------------------------------------------------------------- 1 | # Ch11 - 使用Apriori算法进行关联分析(Association analysis with the Apriori algorithm) 2 | 3 | #### 从大规模数据集中寻找物品间的隐含关系被称作关联分析(association analysis)或者关联规则学习(association rule learning)。 关联分析是一种在大规模数据集中寻找有趣关系的任务。这些关系可以有两种形式:频繁项集或者关联规则。 4 | 5 | #### 频繁项集(frequent item sets)是经常出现在一块的物品的集合。 6 | #### 关联规则(association rules)暗示两种物品之间可能存在很强的关系。 7 | 8 | #### 一个项集的支持度(support):被定义为数据集中包含该项集的记录所占的比例。支持度是针对项集来说的,因此可以定义一个最小支持度,而只保留满足最小支持度的项集。 9 | 10 | #### 可信度或置信度(confidence):是针对一条诸如{尿布} ➞ {葡萄酒}的关联规则来定义的。这 条规则的可信度被定义为“支持度({尿布, 葡萄酒})/支持度({尿布})”。从图11-1中可以看到,由 于{尿布, 葡萄酒}的支持度为3/5,尿布的支持度为4/5,所以“尿布 ➞ 葡萄酒”的可信度为3/4=0.75。 这意味着对于包含“尿布”的所有记录,我们的规则对其中75%的记录都适用。(类似条件概率)。 11 | 12 | ## Apriori 原理: 13 | #### Apriori原理可以帮我们减少可能感兴趣的项集。Apriori原理是说如果某个项集是频繁的,那么它的所有子集也是频繁的。这个原理直观上并没有什么帮助,但是如果反过来看就有用了,也就是说如果一个项集是非频繁集那么它的所有超集也是非频繁的。 14 | 15 | 16 | #### 关联分析的目标包括两项:发现频繁项集和发现关联规则。首先需要找到频繁 项集,然后才能获得关联规则。 17 | #### Apriori算法是发现频繁项集的一种方法。该算法首先会生成所有单个物品的项集列表。接着扫描交易记录来查看哪些项集满足最小支持度要求,那些不满足最小支持度的集合会被去掉。然后,对剩下来的集合进行组合以生成包含两个元素的项集。接下来,再重新扫描交易记录,去掉不满足最小支持度的项集。该过程重复进行直到所有项集都被去掉。 18 | 19 | 20 | ## 整个Apriori算法的伪代码如下: 21 | ``` 22 | 当集合中项的个数大于0时: 23 | 构建一个k个项组成的候选项集的列表 24 | 检查数据以确认每个项集都是频繁的 25 | 保留频繁项集并构建k+1项组成的候选项集的列表(向上合并) 26 | ``` 27 | 28 | #### 函数 aprioriGen() 的输入参数为频繁项集列表 Lk 与项集元素个数 k ,输出为 Ck 。举例来说,该函数以{0}、{1}、{2}作为输入,会生成{0,1}、{0,2}以及{1,2}。要完成这一点,首先创建一个空列表,然后计算 Lk 中的元素数目。通过循环来比较 Lk 中的每一个元素与其他元素,紧接着,取列表中的两个集合进行比较。如果这两个集合的前面 k-2 个元素都相等,那么就将这两个集合合成一个大小为 k 的集合 。这里使用集合的并操作来完成。 29 | 30 | ## 从频繁项集中挖掘关联规则 31 | 32 | ### 如果某条规则并不满足最小可信度要求,那么该规则的所有子集也不会满足最小可信度要求。可以利用关联规则的这条性质属性来减少需要测试的规则数目。 33 | 34 | 35 | ## 小总结: 36 | 37 | #### 关联分析是用于发现大数据集中元素间有趣关系的一个工具集,可以采用两种方式来量化这些有趣的关系。 38 | #### 第一种方式是使用频繁项集,它会给出经常在一起出现的元素项。 39 | #### 第二种方式是关联规则,每条关联规则意味着元素项之间的“如果……那么”关系。 40 | #### Apriori的方法简化了计算量,可以在合理的时间范围内找到频繁项集。 41 | #### Apriori原理是说如果一个元素项是不频繁的,那么那些包含该元素的超集也是不频繁的。 42 | #### Apriori算法从单元素项集开始,通过组合满足最小支持度要求的项集来形成更大的集合。支持度用来度量一个集合在原始数据中出现的频率。 43 | #### 每次增加频繁项集的大小,Apriori算法都会重新扫描整个数据集。当数据集很大时,这会显著降低频繁项集发现的速度。 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /ch11/meaning20.txt: -------------------------------------------------------------------------------- 1 | (lp0 2 | S'Republican' 3 | p1 4 | aS'Democratic' 5 | p2 6 | aS'Prohibiting 
Federal Funding of National Public Radio -- Nay' 7 | p3 8 | aS'Prohibiting Federal Funding of National Public Radio -- Yea' 9 | p4 10 | aS'Removing Troops from Afghanistan -- Nay' 11 | p5 12 | aS'Removing Troops from Afghanistan -- Yea' 13 | p6 14 | aS'Terminating the Home Affordable Modification Program -- Nay' 15 | p7 16 | aS'Terminating the Home Affordable Modification Program -- Yea' 17 | p8 18 | aS'Repealing the Health Care Bill -- Nay' 19 | p9 20 | aS'Repealing the Health Care Bill -- Yea' 21 | p10 22 | aS'Science and Technology Funding -- Nay' 23 | p11 24 | aS'Science and Technology Funding -- Yea' 25 | p12 26 | aS'"Whistleblower Protection" for Offshore Oil Workers -- Nay' 27 | p13 28 | aS'"Whistleblower Protection" for Offshore Oil Workers -- Yea' 29 | p14 30 | aS'Repealing "Don\'t Ask, Don\'t Tell" After Military Review and Certification -- Nay' 31 | p15 32 | aS'Repealing "Don\'t Ask, Don\'t Tell" After Military Review and Certification -- Yea' 33 | p16 34 | aS'Unemployment Benefits Extension -- Nay' 35 | p17 36 | aS'Unemployment Benefits Extension -- Yea' 37 | p18 38 | aS'Unemployment Benefits Extension -- Nay' 39 | p19 40 | aS'Unemployment Benefits Extension -- Yea' 41 | p20 42 | aS'Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase -- Nay' 43 | p21 44 | aS'Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase -- Yea' 45 | p22 46 | aS'Prohibiting Use of Federal Funds For Planned Parenthood -- Nay' 47 | p23 48 | aS'Prohibiting Use of Federal Funds For Planned Parenthood -- Yea' 49 | p24 50 | aS'Reducing Federal Funding of the US Institute of Peace -- Nay' 51 | p25 52 | aS'Reducing Federal Funding of the US Institute of Peace -- Yea' 53 | p26 54 | aS'Prohibiting the Use of Federal Funds for NASCAR Sponsorships -- Nay' 55 | p27 56 | aS'Prohibiting the Use of Federal Funds for NASCAR Sponsorships -- Yea' 57 | p28 58 | aS'Mine Safety Act -- Nay' 59 | p29 60 | aS'Mine Safety Act -- Yea' 61 | p30 62 | a. 
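A hypothetical end-to-end sketch for this chapter's voting example (not a listing from the repo): in practice `transDict` and `itemMeaning` come from `getTransList()` in votesmart.py below; the two-politician toy data and the 0.3 / 0.95 thresholds here are fabricated so the snippet runs stand-alone, and the print is kept Python-2-compatible because Apriori.py's `__main__` uses Python 2 print statements.

```python
# Illustrative sketch: mine association rules from voting transactions shaped like the
# ones votesmart.getTransList() builds. Toy data and thresholds are placeholders.
from Apriori import apriori, generateRules

itemMeaning = ['Republican', 'Democratic', 'Bill A -- Nay', 'Bill A -- Yea']
transDict = {'Rep. X': [0, 2], 'Rep. Y': [1, 3]}     # party + vote item numbers

dataSet = [transDict[name] for name in transDict]    # one transaction per politician
L, suppData = apriori(dataSet, minSupport=0.3)       # frequent itemsets
rules = generateRules(L, suppData, minConf=0.95)     # high-confidence rules
for antecedent, consequent, conf in rules:
    print('%s --> %s  conf: %.3f' % ([itemMeaning[i] for i in antecedent],
                                     [itemMeaning[i] for i in consequent], conf))
```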
-------------------------------------------------------------------------------- /ch11/recent20bills.txt: -------------------------------------------------------------------------------- 1 | 12939 Prohibiting Federal Funding of National Public Radio 2 | 12940 Removing Troops from Afghanistan 3 | 12830 Prioritizing Payment of Public Debt 4 | 12857 Calling for a Balanced Budget Constitutional Amendment 5 | 12988 Terminating the Home Affordable Modification Program 6 | 12040 Repealing Business Transaction Reporting Requirements 7 | 12465 Repealing the Health Care Bill 8 | 11451 Science and Technology Funding 9 | 11364 Credit Default Swap Regulations 10 | 11820 "Whistleblower Protection" for Offshore Oil Workers 11 | 12452 Treaty with Russia to Reduce and Limit Offensive Arms 12 | 11318 Derivatives Regulation Modifications 13 | 11414 Repealing "Don't Ask, Don't Tell" After Military Review and Certification 14 | 11719 Unemployment Benefits Extension 15 | 11205 Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase 16 | 12747 Prohibiting Use of Federal Funds For Planned Parenthood 17 | 12792 Reducing Federal Funding of the US Institute of Peace 18 | 12827 Prohibiting the Use of Federal Funds for NASCAR Sponsorships 19 | 12445 Mine Safety Act 20 | 12049 2010-2011 Defense Authorizations -------------------------------------------------------------------------------- /ch11/votesmart.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from time import sleep 4 | from votesmart import votesmart 5 | 6 | votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030' 7 | 8 | 9 | # votesmart.apikey = 'get your api key first' 10 | def getActionIds(): 11 | actionIdList = []; 12 | billTitleList = [] 13 | fr = open('recent20bills.txt') 14 | for line in fr.readlines(): 15 | billNum = int(line.split('\t')[0]) 16 | try: 17 | billDetail = votesmart.votes.getBill(billNum) # api call 18 | for action in billDetail.actions: 19 | if action.level == 'House' and \ 20 | (action.stage == 'Passage' or action.stage == 'Amendment Vote'): 21 | actionId = int(action.actionId) 22 | print 'bill: %d has actionId: %d' % (billNum, actionId) 23 | actionIdList.append(actionId) 24 | billTitleList.append(line.strip().split('\t')[1]) 25 | except: 26 | print "problem getting bill %d" % billNum 27 | sleep(1) # delay to be polite 28 | return actionIdList, billTitleList 29 | 30 | 31 | def getTransList(actionIdList, billTitleList): # this will return a list of lists containing ints 32 | itemMeaning = ['Republican', 'Democratic'] # list of what each item stands for 33 | for billTitle in billTitleList: # fill up itemMeaning list 34 | itemMeaning.append('%s -- Nay' % billTitle) 35 | itemMeaning.append('%s -- Yea' % billTitle) 36 | transDict = {} # list of items in each transaction (politician) 37 | voteCount = 2 38 | for actionId in actionIdList: 39 | sleep(3) 40 | print 'getting votes for actionId: %d' % actionId 41 | try: 42 | voteList = votesmart.votes.getBillActionVotes(actionId) 43 | for vote in voteList: 44 | if not transDict.has_key(vote.candidateName): 45 | transDict[vote.candidateName] = [] 46 | if vote.officeParties == 'Democratic': 47 | transDict[vote.candidateName].append(1) 48 | elif vote.officeParties == 'Republican': 49 | transDict[vote.candidateName].append(0) 50 | if vote.action == 'Nay': 51 | transDict[vote.candidateName].append(voteCount) 52 | elif vote.action == 'Yea': 53 | transDict[vote.candidateName].append(voteCount + 1) 54 | except: 55 | print "problem getting 
actionId: %d" % actionId 56 | voteCount += 2 57 | return transDict, itemMeaning -------------------------------------------------------------------------------- /ch12/README.md: -------------------------------------------------------------------------------- 1 | # Ch12 - 使用FP-growth算法来高效发现频繁项集(Efficiently finding frequent itemsets with FP-growth) 2 | 3 | #### (Frequent Pattern)FP-growth算法是一种高效发现频繁集的方法。它比Apriori算法要快,这里的任务是将数据集存储在一个特定的称作FP树的结构之后发现频繁项集或者频繁项对,这种做法使得算法的执行速度要快于Apriori两个数量级以上。 4 | 5 | #### FP-growth算法只需要对数据库进行两次扫描(第一次遍历:统计各个数据的频繁度(统计出现频率),第二次遍历:只考虑那些频繁元素),而Apriori算法对于每个潜在的频繁项集都会扫描数据集判定给定模式是否频繁,因此FP-growth算法的速度要比Apriori算法快。它发现频繁项集的基本过程如下: 6 | ``` 7 | (1) 构建FP树 8 | (2) 从FP树中挖掘频繁项集 9 | ``` 10 | #### FP-growth算法将数据存储在一种称为FP树的紧凑数据结构中。FP代表频繁模式(Frequent Pattern)。一棵FP树看上去与计算机科学中的其他树结构类似,但是它通过链接(link)来连接相似元素,被连起来的元素项可以看成一个链表。 11 | 12 | #### 在创建真正的频繁集FP树之前,需要对数据进行过滤(不符合频繁要求)和排序(按照频繁度排序)。利用头指针表,可以快速访问FP树中一个给定类型的所有元素。 13 | 14 | #### 同搜索树不同的是,一个元素项可以在一棵FP树中出现多次。FP树会存储项集的出现频率,而每个项集会以路径的方式存储在树中。存在相似元素的集合会共享树的一部分。只有当集合之间完全不同时,树才会分叉。 树节点上给出集合中的单个元素及其在序列中的出现次数,路径会给出该序列的出现次数。 15 | 16 | ### 从FP树中抽取频繁项集的三个基本步骤如下: 17 | ``` 18 | (1) 从FP树中获得条件模式基; 19 | (2) 利用条件模式基,构建一个条件FP树; 20 | (3) 迭代重复步骤(1)步骤(2),直到树包含一个元素项为止。 21 | ``` 22 | 23 | ## 条件模式基 24 | #### 条件模式基是以所查找元素项为结尾的路径集合,每一条路径其实都是一条前缀路径。 抽取出来每个频繁项集的前缀路径之后,用条件模式基构造条件FP树(t的条件FP树)。简单来说,对于最简单的1个元素的,去掉不满足支持度的元素,选出长度1的频繁项集,然后把给出的元素序列,构造FP树,然后对于每个长度为1的频繁项集,找出条件模式基(前缀路径),然后根据前缀路径,去掉不满足支持度的元素,满足支持度的元素,加上原来长度1的频繁项集,就构成了长度2的频繁项集,以此类推。这个代码递归实现即可。 25 | 26 | ## 示例:从新闻网站点击流中挖掘 27 | #### 在上传的源码中,有一个kosarak.dat文件,它包含将近100万条记录 。该文件中的每一行包含某个用户浏览过的新闻报道。一些用户只看过一篇报道,而有些用户看过2498篇报道,其中用户和报道被编码成整数。 28 | ``` 29 | # 需要创建一个空列表来保存这些频繁项集 30 | myFreqList = [] 31 | 32 | # 执行该代码,构建树以及扫描100万行只需要几秒钟 33 | mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList) 34 | 35 | # 看下有多少新闻报道或报道集合曾经被10万或者更多的人浏览过: 36 | print len(myFreqList)# 查询结果为9人 37 | 38 | # 看具体为哪9项: 39 | print myFreqList 40 | ``` 41 | ## 总结: 42 | #### growth算法是一种用于发现数据集中频繁模式的有效方法。FP-growth算法利用Apriori原则,速度更快。在FP-growth算法中,数据集存储在一个称为FP树的结构中。FP树构建完成后,可以通过查找元素项的条件基及构建条件FP树来发现频繁项集。该过程不断以更多元素作为条件重复进行,直到FP树只包含一个元素为止。 43 | #### 可以使用FP-growth算法在多种文本文档中查找频繁单词。频繁项集生成还有其他一些应用,比如购物交易,医学诊断,大气研究等。 44 | 45 | -------------------------------------------------------------------------------- /ch12/Twitter.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import twitter 4 | from time import sleep 5 | import re 6 | 7 | def textParse(bigString): 8 | urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) 9 | listOfTokens = re.split(r'\W*', urlsRemoved) 10 | return [tok.lower() for tok in listOfTokens if len(tok) > 2] 11 | 12 | def getLotsOfTweets(searchStr): 13 | CONSUMER_KEY = '' 14 | CONSUMER_SECRET = '' 15 | ACCESS_TOKEN_KEY = '' 16 | ACCESS_TOKEN_SECRET = '' 17 | api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, 18 | access_token_key=ACCESS_TOKEN_KEY, 19 | access_token_secret=ACCESS_TOKEN_SECRET) 20 | #you can get 1500 results 15 pages * 100 per page 21 | resultsPages = [] 22 | for i in range(1,15): 23 | print "fetching page %d" % i 24 | searchResults = api.GetSearch(searchStr, per_page=100, page=i) 25 | resultsPages.append(searchResults) 26 | sleep(6) 27 | return resultsPages 28 | 29 | def mineTweets(tweetArr, minSup=5): 30 | parsedList = [] 31 | for i in range(14): 32 | for j in range(100): 33 | parsedList.append(textParse(tweetArr[i][j].text)) 34 | initSet = createInitSet(parsedList) 35 | myFPtree, myHeaderTab = 
createTree(initSet, minSup) 36 | myFreqList = [] 37 | mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) 38 | return myFreqList 39 | -------------------------------------------------------------------------------- /ch12/kosarak.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch12/kosarak.zip -------------------------------------------------------------------------------- /ch13/README.md: -------------------------------------------------------------------------------- 1 | # Ch13 - 利用PCA来简化数据(Using principal component analysis to simplify data) 2 | 3 | #### 大量的数据往往拥有超出显示能力的更多特征,但数据显示并非大规模特征下的唯一难题,对数据进行简化也很重要,它可以: 4 | ``` 5 | 使得数据集更易使用。 6 | 降低很多算法的计算开销。 7 | 去除噪声。 8 | 使得结果易懂。 9 | ``` 10 | ## 第一种降维的方法称为主成分分析(Principal Component Analysis,PCA)。 11 | #### 在PCA中,数据从原来的坐标系转换到了新的坐标系,新坐标系的选择是由数据本身决定的。第一个新坐标轴选择的是原始数据中方差最大的方向,第二个新坐标轴的选择和第一个坐标轴正交且具有最大方差的方向。该过程一直重复,重复次数为原始数据中特征的数目。我们会发现,大部分方差都包含在最前面的几个新坐标轴中。因此,我们可以忽略余下的坐标轴,即对数据进行了降维处理。 12 | 13 | ## 另外一种降维技术是因子分析(Factor Analysis)。 14 | #### 在因子分析中,我们假设在观察数据的生成中有一些观察不到的隐变量(latent variable)。假设观察数据是这些隐变量和某些噪声的线性组合。那么隐变量的数据可能比观察数据的数目少,也就是说通过找到隐变量就可以实现数据的降维。 15 | 16 | ## 还有一种降维技术就是独立成分分析(Independent Component Analysis,ICA)。 17 | #### ICA假设数据是从 N 个数据源生成的,这一点和因子分析有些类似。假设数据为多个数据源的混合观察结果,这些数据源之间在统计上是相互独立的,而在PCA中只假设数据是不相关的。同因子分析一样,如果数据源的数目少于观察数据的数目,则可以实现降维过程。 18 | 19 | ## PCA伪代码: 20 | ``` 21 | 去除平均值 22 | 计算协方差矩阵 23 | 计算协方差矩阵的特征值和特征向量 24 | 将特征值从大到小排序 25 | 保留最上面的N个特征向量 26 | 将数据转换到上述N个特征向量构建的新空间中 27 | ``` 28 | ## 数据集 29 | ![数据集.png](screenshot/数据集.png) 30 | 31 | ## 降维(一) 32 | ![降维.png](screenshot/降维.png) 33 | 34 | #### 上图可以看出,将2维的数据降到1维的直观图。直线是一维。 35 | 36 | ## 降维(二) 37 | ![降维2.png](screenshot/降维2.png) 38 | 39 | ## 示例:利用 PCA 对半导体制造数据降维 40 | ### 总方差的百分比 41 | ![方差百分比.png](screenshot/方差百分比.png) 42 | #### 前六个主成分覆盖了数据96.8%的方差,而前20个主成分覆盖了99.3% 的方差。这就表明了,如果保留前6个而去除后584个主成分,我们就可以实现大概100∶1的压缩比。另外,由于舍弃了噪声的主成分,将后面的主成分去除便使得数据更加干净。 43 | 44 | ## 总结: 45 | #### 降维技术使得数据变得更易使用,并且它们往往能够去除数据中的噪声,使得其他机器学习任务更加精确。降维往往作为预处理步骤,在数据应用到其他算法之前清洗数据。有很多技术可以用于数据降维,在这些技术中,独立成分分析、因子分析和主成分分析比较流行,其中又以主成分分析应用最广泛。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /ch13/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib 2 | -------------------------------------------------------------------------------- /ch13/matplotlib/数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | n = 1000 #number of points to create 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet.txt','w') 15 | for i in range(n): 16 | [r0,r1] = random.standard_normal(2) 17 | fFlyer = r0 + 9.0 18 | tats = 1.0*r1 + fFlyer + 0 19 | xcord0.append(fFlyer) 20 | ycord0.append(tats) 21 | fw.write("%f\t%f\n" % (fFlyer, tats)) 22 | 23 | fw.close() 24 | fig = plt.figure() 25 | ax = fig.add_subplot(111) 26 | ax.scatter(xcord0,ycord0, marker='^', s=90) 27 | plt.xlabel('hours of direct sunlight') 28 | plt.ylabel('liters of water') 29 | plt.show() 30 | -------------------------------------------------------------------------------- /ch13/matplotlib/方差百分比.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from 
numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | dataMat = pca.replaceNanWithMean() 9 | 10 | #below is a quick hack copied from pca.pca() 11 | meanVals = mean(dataMat, axis=0) 12 | meanRemoved = dataMat - meanVals #remove mean 13 | covMat = cov(meanRemoved, rowvar=0) 14 | eigVals,eigVects = linalg.eig(mat(covMat)) 15 | eigValInd = argsort(eigVals) #sort, sort goes smallest to largest 16 | eigValInd = eigValInd[::-1]#reverse 17 | sortedEigVals = eigVals[eigValInd] 18 | total = sum(sortedEigVals) 19 | varPercentage = sortedEigVals/total*100 20 | 21 | fig = plt.figure() 22 | ax = fig.add_subplot(111) 23 | ax.plot(range(1, 21), varPercentage[:20], marker='^') 24 | plt.xlabel('Principal Component Number') 25 | plt.ylabel('Percentage of Variance') 26 | plt.show() -------------------------------------------------------------------------------- /ch13/matplotlib/降维.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | dataMat = pca.loadDataSet('testSet.txt') 9 | lowDMat, reconMat = pca.pca(dataMat, 1) 10 | # lowDMat, reconMat = pca.pca(dataMat,2) #保留原来的2维数据,画图后可看出,数据样本是重合的 11 | 12 | fig = plt.figure() 13 | ax = fig.add_subplot(111) 14 | ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0],marker='^',s=90) 15 | ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o',s=50,c='red') 16 | #由两维降为1维数据,降维后为一条红色直线,该方向是样本方差最大的方向,即样本离散程度最大的方向,该方向,将原来的2维数据融合为1维上 17 | plt.show() 18 | -------------------------------------------------------------------------------- /ch13/matplotlib/降维2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | n = 1000 #number of points to create 9 | xcord0 = []; ycord0 = [] 10 | xcord1 = []; ycord1 = [] 11 | xcord2 = []; ycord2 = [] 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet3.txt','w') 15 | for i in range(n): 16 | groupNum = int(3*random.uniform()) 17 | [r0,r1] = random.standard_normal(2) 18 | if groupNum == 0: 19 | x = r0 + 16.0 20 | y = 1.0*r1 + x 21 | xcord0.append(x) 22 | ycord0.append(y) 23 | elif groupNum == 1: 24 | x = r0 + 8.0 25 | y = 1.0*r1 + x 26 | xcord1.append(x) 27 | ycord1.append(y) 28 | elif groupNum == 2: 29 | x = r0 + 0.0 30 | y = 1.0*r1 + x 31 | xcord2.append(x) 32 | ycord2.append(y) 33 | fw.write("%f\t%f\t%d\n" % (x, y, groupNum)) 34 | 35 | fw.close() 36 | fig = plt.figure() 37 | ax = fig.add_subplot(211) 38 | ax.scatter(xcord0,ycord0, marker='^', s=90) 39 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 40 | ax.scatter(xcord2,ycord2, marker='v', s=50, c='yellow') 41 | 42 | ax = fig.add_subplot(212) 43 | 44 | myDat = pca.loadDataSet('testSet3.txt') 45 | 46 | lowDDat,reconDat = pca.pca(myDat[:,0:2],1) 47 | 48 | label0Mat = lowDDat[nonzero(myDat[:,2]==0)[0],:2][0] #get the items with label 0 49 | label1Mat = lowDDat[nonzero(myDat[:,2]==1)[0],:2][0] #get the items with label 1 50 | label2Mat = lowDDat[nonzero(myDat[:,2]==2)[0],:2][0] #get the items with label 2 51 | 52 | #ax.scatter(label0Mat[:,0],label0Mat[:,1], marker='^', s=90) 53 | #ax.scatter(label1Mat[:,0],label1Mat[:,1], marker='o', s=50, c='red') 54 | #ax.scatter(label2Mat[:,0],label2Mat[:,1], marker='v', s=50, c='yellow') 55 | 56 | ax.scatter(label0Mat[:,0].flatten().A[0], 
zeros(shape(label0Mat)[0]), marker='^', s=90) 57 | ax.scatter(label1Mat[:,0].flatten().A[0], zeros(shape(label1Mat)[0]), marker='o', s=50, c='red') 58 | ax.scatter(label2Mat[:,0].flatten().A[0], zeros(shape(label2Mat)[0]), marker='v', s=50, c='yellow') 59 | plt.show() -------------------------------------------------------------------------------- /ch13/pca.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(fileName,delim='\t'): 6 | fr=open(fileName) 7 | #使用两个list来构建矩阵 8 | stringArr=[line.strip().split(delim) for line in fr.readlines()] 9 | datArr=[list(map(float,line)) for line in stringArr] 10 | return mat(datArr) 11 | 12 | # PCA算法 13 | def pca(dataMat,topNfeat=9999999): #topNfeat为可选参数,记录特征值个数 14 | meanVals=mean(dataMat,axis=0) #求均值 15 | meanRemoved=dataMat-meanVals #归一化数据 16 | covMat=cov(meanRemoved,rowvar=0) #求协方差 17 | eigVals,eigVects=linalg.eig(mat(covMat)) #计算特征值和特征向量 18 | eigValInd=argsort(eigVals) #对特征值进行排序,默认从小到大 19 | eigValInd=eigValInd[:-(topNfeat+1):-1] #逆序取得特征值最大的元素 20 | redEigVects=eigVects[:,eigValInd] #用特征向量构成矩阵 21 | lowDDataMat=meanRemoved*redEigVects #用归一化后的各个数据与特征矩阵相乘,映射到新的空间 22 | reconMat=(lowDDataMat*redEigVects.T)+meanVals #还原原始数据 23 | return lowDDataMat,reconMat 24 | 25 | def replaceNanWithMean(): #均值代替那些样本中的缺失值 26 | datMat = loadDataSet('secom.data', ' ') 27 | numFeat = shape(datMat)[1] 28 | for i in range(numFeat): 29 | meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number) # .A表示把矩阵转化为数组array 30 | #nonzero(~isnan(datMat[:,i].A))[0] 返回非0元素所在行的索引; 31 | #>>> nonzero([True,False,True]) 32 | # (array([0, 2]),) 第0个和第3个元素非0 33 | #~isnan()返回Ture or False 34 | datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean 35 | return datMat 36 | 37 | if __name__=="__main__": 38 | 39 | dataMat = loadDataSet("testSet.txt") 40 | print shape(dataMat) 41 | lowDmat,reconMat = pca(dataMat,1) 42 | print shape(lowDmat) # 变成一维矩阵 43 | 44 | dataMat = replaceNanWithMean() 45 | # 去除均值 46 | meanVals = mean(dataMat,axis = 0) 47 | meanRemoved = dataMat - meanVals 48 | # 计算协方差 49 | covMat = cov(meanRemoved,rowvar = 0) 50 | # 对矩阵进行特征值分析 51 | eigVals,eigVects = linalg.eig(mat(covMat)) 52 | 53 | #观察特征值结果 54 | print eigVals -------------------------------------------------------------------------------- /ch13/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch13/screenshot/数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/数据集.png -------------------------------------------------------------------------------- /ch13/screenshot/方差百分比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/方差百分比.png -------------------------------------------------------------------------------- /ch13/screenshot/降维.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/降维.png 
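One detail the ch13 README leaves implicit is how a figure like "the first six components cover 96.8% of the variance" translates into a choice of `topNfeat` for `pca.pca()`. A small helper, added here purely as an illustration (not part of the original repo):

```python
# Illustrative helper: pick how many principal components to keep so that a chosen
# fraction of the total variance is retained, given the eigenvalues from linalg.eig.
import numpy as np

def choose_num_components(eig_vals, var_threshold=0.95):
    vals = np.sort(np.real(np.asarray(eig_vals)))[::-1]   # eigenvalues, largest first
    cum_ratio = np.cumsum(vals) / np.sum(vals)             # cumulative variance fraction
    return int(np.searchsorted(cum_ratio, var_threshold) + 1)

# With the eigenvalues computed in pca.py's __main__ on the secom data, a threshold
# of about 0.968 should come back as roughly 6 components, matching the README.
```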
-------------------------------------------------------------------------------- /ch13/screenshot/降维2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/降维2.png -------------------------------------------------------------------------------- /ch14/0_5.txt: -------------------------------------------------------------------------------- 1 | 00000000000000110000000000000000 2 | 00000000000011111100000000000000 3 | 00000000000111111110000000000000 4 | 00000000001111111111000000000000 5 | 00000000111111111111100000000000 6 | 00000001111111111111110000000000 7 | 00000000111111111111111000000000 8 | 00000000111111100001111100000000 9 | 00000001111111000001111100000000 10 | 00000011111100000000111100000000 11 | 00000011111100000000111110000000 12 | 00000011111100000000011110000000 13 | 00000011111100000000011110000000 14 | 00000001111110000000001111000000 15 | 00000011111110000000001111000000 16 | 00000011111100000000001111000000 17 | 00000001111100000000001111000000 18 | 00000011111100000000001111000000 19 | 00000001111100000000001111000000 20 | 00000001111100000000011111000000 21 | 00000000111110000000001111100000 22 | 00000000111110000000001111100000 23 | 00000000111110000000001111100000 24 | 00000000111110000000011111000000 25 | 00000000111110000000111111000000 26 | 00000000111111000001111110000000 27 | 00000000011111111111111110000000 28 | 00000000001111111111111110000000 29 | 00000000001111111111111110000000 30 | 00000000000111111111111000000000 31 | 00000000000011111111110000000000 32 | 00000000000000111111000000000000 33 | -------------------------------------------------------------------------------- /ch14/README.md: -------------------------------------------------------------------------------- 1 | # Ch14 - 利用SVD简化数据(Simplifying data with the singular value decomposition) 2 | 3 | #### 利用SVD实现,我们能够用小得多的数据集来表示原始数据集。这样做,实际上是去除了噪声和冗余信息。简而言之,SVD是一种从大量数据中提取主要关键数据的方法。 4 | 5 | ## 应用场景 6 | ### 1.隐性语义索引 7 | ### 2.推荐系统 8 | 9 | #### SVD是矩阵分解的一种类型,而矩阵分解是将数据矩阵分解为多个独立部分的过程。 10 | 11 | ## 矩阵分解 12 | #### 很多情况下,数据中的一小段携带了数据集中的大部分信息,其他信息则要么是噪声,要么就是毫不相关的信息。 在线性代数中还有很多矩阵分解技术。矩阵分解可以将原始矩阵表示成新的易于处理的形式,这种新形式是两个或多个矩阵的乘积。 不同的矩阵分解技术具有不同的性质,其中有些更适合于某个应用,有些则更适合于其他应用。最常见的一种矩阵分解技术就是SVD。 13 | 14 | #### 矩阵 Σ ,该矩阵只有对角元素,其他元素均为0。另一个惯例就是,Σ 的对角元素是从大到小排列的。这些对角元素称为奇异值(Singular Value),它们对应了原始数据集矩阵 Data 的奇异值。奇异值和特征值是有关系的。 15 | 16 | #### 科学和工程中,一直存在这样一个普遍事实:在某个奇异值的数目( r 个)之后,其他的奇异值都置为0。这就意味着数据集中仅有 r 个重要特征,而其余特征则都是噪声或冗余特征。 17 | 18 | ## 基于协同过滤的推荐引擎 19 | #### 协同过滤(collaborative filtering )是通过将用户和其他用户的数据进行对比来实现推荐的,唯一所需要的数学方法就是相似度的计算。 20 | 21 | ### 这里介绍了几种相似度计算,欧式距离,皮尔逊相关系数( Pearson correlation),余弦相似度 (cosine similarity)。 22 | 23 | ## 示例:餐馆菜肴推荐引擎 24 | 【略】 25 | 26 | ## 示例:基于 SVD 的图像压缩 27 | 【略】 28 | 29 | ## 总结 30 | #### SVD 是一种强大的降维工具,我们可以利用 SVD 来逼近矩阵并从中提取重要特征。通过保留矩阵 80% ~ 90% 的能量,就可以得到重要的特征并去掉噪声。在大规模数据集上, SVD 的计算和推荐可能是一个很困难的工程问题。通过离线方式来进行SVD 分解和相似度计算,是一种减少冗余计算和推荐所需时间的办法。 31 | -------------------------------------------------------------------------------- /ch15/README.md: -------------------------------------------------------------------------------- 1 | # Ch15 - 大数据与MapReduce(Big data and MapReduce) 2 | 3 | ## 总结: 4 | 5 | ### 当运算需求超出了当前资源的运算能力,可以考虑购买更好的机器,或者租用网络服务并使用MapReduce框架并行执行。另一个情况是,运算需求超出了合理价位下所能购买到的机器的运算能力。其中一个解决方法是将计算转成并行的作业,MapReduce就提供了这种方案的一个具体实施框架。在MapReduce中,作业被分成map阶段和reduce阶段。 6 | 7 | ### 
一个典型的作业流程是先使用map阶段并行处理数据,之后将这些数据在reduce阶段合并。这种多对一的模式很经典,但不是唯一的流程方式。mapper和reducer之间传输数据的形式是key/value对。一般地,map阶段后数据还会按照key值进行排序。Hadoop是一个流行的可行MapReduce作业的java项目,它同时提供非Java作业的运行支持,叫做Hadoop流。 8 | 9 | ### 很多机器学习算法都可以容易地写成MapReduce作业,而某些需要经过重写和创新性的修改,才能在MapReduce上运行。 10 | -------------------------------------------------------------------------------- /ch15/err.txt: -------------------------------------------------------------------------------- 1 | No handlers could be found for logger "mrjob.job" 2 | using configs in c:/Users/Peter\.mrjob.conf 3 | creating tmp directory /scratch/$USER\mrSVM.Peter.20111230.181815.061000 4 | reading from STDIN 5 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 6 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00000 7 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00001' 8 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00001 9 | STDERR: No handlers could be found for logger "mrjob.job" 10 | STDERR: No handlers could be found for logger "mrjob.job" 11 | Counters from step 1: 12 | (no counters found) 13 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper-sorted 14 | > sort '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00000' '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00001' 15 | Piping files into sort for Windows compatibility 16 | > sort 17 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --reducer '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 18 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-reducer_part-00000 19 | STDERR: No handlers could be found for logger "mrjob.job" 20 | Counters from step 1: 21 | (no counters found) 22 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 23 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper_part-00000 24 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00001' 25 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper_part-00001 26 | STDERR: No handlers could be found for logger "mrjob.job" 27 | STDERR: No handlers could be found for logger "mrjob.job" 28 | Counters from step 2: 29 | (no counters found) 30 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper-sorted 31 | Piping files into sort for Windows compatibility 32 | > sort 33 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --reducer '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 34 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-reducer_part-00000 35 | STDERR: No handlers could be found for logger "mrjob.job" 36 | Counters from step 2: 37 | (no counters found) 38 | Moving /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-reducer_part-00000 -> /scratch/$USER\mrSVM.Peter.20111230.181815.061000\output\part-00000 39 | Streaming final output from /scratch/$USER\mrSVM.Peter.20111230.181815.061000\output 40 | removing tmp directory /scratch/$USER\mrSVM.Peter.20111230.181815.061000 41 | -------------------------------------------------------------------------------- /ch15/inputFile.txt: 
-------------------------------------------------------------------------------- 1 | 0.970413 2 | 0.901817 3 | 0.828698 4 | 0.197744 5 | 0.466887 6 | 0.962147 7 | 0.187294 8 | 0.388509 9 | 0.243889 10 | 0.115732 11 | 0.616292 12 | 0.713436 13 | 0.761446 14 | 0.944123 15 | 0.200903 16 | 0.547166 17 | 0.800028 18 | 0.848790 19 | 0.001641 20 | 0.058010 21 | 0.859900 22 | 0.009178 23 | 0.736598 24 | 0.683586 25 | 0.142515 26 | 0.212120 27 | 0.752769 28 | 0.546184 29 | 0.652227 30 | 0.583803 31 | 0.812863 32 | 0.036862 33 | 0.075076 34 | 0.257536 35 | 0.431278 36 | 0.600214 37 | 0.985564 38 | 0.055846 39 | 0.905295 40 | 0.336262 41 | 0.198738 42 | 0.845815 43 | 0.527989 44 | 0.448650 45 | 0.235313 46 | 0.599749 47 | 0.443923 48 | 0.968723 49 | 0.911076 50 | 0.279338 51 | 0.569492 52 | 0.635985 53 | 0.267532 54 | 0.975018 55 | 0.463698 56 | 0.842340 57 | 0.065590 58 | 0.233049 59 | 0.810390 60 | 0.448260 61 | 0.431967 62 | 0.549648 63 | 0.703612 64 | 0.187974 65 | 0.231709 66 | 0.784160 67 | 0.072283 68 | 0.921053 69 | 0.735468 70 | 0.715923 71 | 0.150431 72 | 0.661089 73 | 0.734955 74 | 0.633709 75 | 0.216102 76 | 0.498474 77 | 0.195620 78 | 0.339548 79 | 0.245314 80 | 0.819848 81 | 0.521242 82 | 0.549276 83 | 0.200906 84 | 0.202525 85 | 0.922876 86 | 0.025404 87 | 0.604032 88 | 0.752204 89 | 0.158860 90 | 0.651622 91 | 0.592898 92 | 0.500392 93 | 0.410614 94 | 0.968388 95 | 0.265918 96 | 0.565707 97 | 0.413670 98 | 0.080507 99 | 0.929978 100 | 0.609755 101 | -------------------------------------------------------------------------------- /ch15/junk.txt: -------------------------------------------------------------------------------- 1 | jj I am so sick of TV 2 | ss jar jar got a purse 3 | 22 shit ass 4 | -------------------------------------------------------------------------------- /ch15/kickStart.txt: -------------------------------------------------------------------------------- 1 | ["w", [0.001, 0.001]] 2 | ["x", 79] 3 | ["x", 115] 4 | ["x", 107] 5 | ["x", 109] 6 | ["x", 109] 7 | ["x", 88] 8 | ["x", 56] 9 | ["x", 94] 10 | ["x", 50] 11 | ["x", 86] 12 | ["x", 75] 13 | ["x", 30] 14 | ["x", 20] 15 | ["x", 157] 16 | ["x", 15] 17 | ["x", 19] 18 | ["x", 63] 19 | ["x", 124] 20 | ["x", 132] 21 | ["x", 3] 22 | ["x", 140] 23 | ["x", 139] 24 | ["x", 127] 25 | ["x", 98] 26 | ["x", 30] 27 | ["x", 16] 28 | ["x", 4] 29 | ["x", 2] 30 | ["x", 75] 31 | ["x", 123] 32 | ["x", 42] 33 | ["x", 16] 34 | ["x", 94] 35 | ["x", 163] 36 | ["x", 159] 37 | ["x", 23] 38 | ["x", 16] 39 | ["x", 160] 40 | ["x", 5] 41 | ["x", 42] 42 | ["x", 53] 43 | ["x", 83] 44 | ["x", 46] 45 | ["x", 121] 46 | ["x", 73] 47 | ["x", 123] 48 | ["x", 93] 49 | ["x", 99] 50 | ["x", 106] 51 | ["x", 173] 52 | ["x", 192] 53 | ["x", 132] 54 | ["x", 57] 55 | ["x", 47] 56 | ["x", 164] 57 | ["x", 157] 58 | ["x", 199] 59 | ["x", 62] 60 | ["x", 175] 61 | ["x", 154] 62 | ["x", 110] 63 | ["x", 0] 64 | ["x", 116] 65 | ["x", 49] 66 | ["x", 76] 67 | ["x", 121] 68 | ["x", 178] 69 | ["x", 75] 70 | ["x", 167] 71 | ["x", 41] 72 | ["x", 105] 73 | ["x", 71] 74 | ["x", 5] 75 | ["x", 135] 76 | ["x", 80] 77 | ["x", 116] 78 | ["x", 198] 79 | ["x", 164] 80 | ["x", 105] 81 | ["x", 98] 82 | ["x", 156] 83 | ["x", 72] 84 | ["x", 54] 85 | ["x", 62] 86 | ["x", 57] 87 | ["x", 87] 88 | ["x", 68] 89 | ["x", 163] 90 | ["x", 140] 91 | ["x", 40] 92 | ["x", 70] 93 | ["x", 120] 94 | ["x", 172] 95 | ["x", 71] 96 | ["x", 82] 97 | ["x", 168] 98 | ["x", 42] 99 | ["x", 144] 100 | ["x", 27] 101 | ["x", 36] -------------------------------------------------------------------------------- 
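Before the MapReduce listings that follow: the key arithmetic is that each mapper emits (count, mean, mean-of-squares) for its slice of the input, and those partial results can be merged exactly into a global mean and variance. A stand-alone sketch of that merge (my illustration; mrMeanReducer.py below emits the combined mean and mean-of-squares, and mrMean.py's reducer applies the same variance identity):

```python
# Illustrative sketch: merge per-mapper partial statistics (n, mean, mean of squares)
# into a global mean and variance, mirroring the accumulation in the reducers below.
def combine(partials):
    total_n = sum(n for n, _, _ in partials)
    mean = sum(n * m for n, m, _ in partials) / total_n        # weighted mean
    mean_sq = sum(n * sq for n, _, sq in partials) / total_n   # weighted E[x^2]
    return mean, mean_sq - mean ** 2                           # var = E[x^2] - mean^2

# Two fabricated mapper outputs, e.g. from 100-value input splits:
print(combine([(100, 0.506, 0.339), (100, 0.498, 0.326)]))
```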
/ch15/mrMean.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | class MRmean(MRJob): 6 | def __init__(self, *args, **kwargs): 7 | super(MRmean, self).__init__(*args, **kwargs) 8 | self.inCount = 0 9 | self.inSum = 0 10 | self.inSqSum = 0 11 | 12 | def map(self, key, val): #needs exactly 2 arguments 13 | if False: yield 14 | inVal = float(val) 15 | self.inCount += 1 16 | self.inSum += inVal 17 | self.inSqSum += inVal*inVal 18 | 19 | def map_final(self): 20 | mn = self.inSum/self.inCount 21 | mnSq = self.inSqSum/self.inCount 22 | yield (1, [self.inCount, mn, mnSq]) 23 | 24 | def reduce(self, key, packedValues): 25 | cumVal=0.0; cumSumSq=0.0; cumN=0.0 26 | for valArr in packedValues: #get values from streamed inputs 27 | nj = float(valArr[0]) 28 | cumN += nj 29 | cumVal += nj*float(valArr[1]) 30 | cumSumSq += nj*float(valArr[2]) 31 | mean = cumVal/cumN 32 | var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN 33 | yield (mean, var) #emit mean and var 34 | 35 | def steps(self): 36 | return ([self.mr(mapper=self.map, mapper_final=self.map_final,\ 37 | reducer=self.reduce,)]) 38 | 39 | if __name__ == '__main__': 40 | MRmean.run() 41 | -------------------------------------------------------------------------------- /ch15/mrMeanMapper.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import sys 4 | from numpy import mat, mean, power 5 | 6 | def read_input(file): 7 | for line in file: 8 | yield line.rstrip() 9 | 10 | input = read_input(sys.stdin)#creates a list of input lines 11 | input = [float(line) for line in input] #overwrite with floats 12 | numInputs = len(input) 13 | input = mat(input) 14 | sqInput = power(input,2) 15 | 16 | #output size, mean, mean(square values) 17 | print "%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)) #calc mean of columns 18 | print >> sys.stderr, "report: still alive" 19 | -------------------------------------------------------------------------------- /ch15/mrMeanReducer.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import sys 4 | from numpy import mat, mean, power 5 | 6 | def read_input(file): 7 | for line in file: 8 | yield line.rstrip() 9 | 10 | input = read_input(sys.stdin)#creates a list of input lines 11 | 12 | #split input lines into separate items and store in list of lists 13 | mapperOut = [line.split('\t') for line in input] 14 | 15 | #accumulate total number of samples, overall sum and overall sum sq 16 | cumVal=0.0 17 | cumSumSq=0.0 18 | cumN=0.0 19 | for instance in mapperOut: 20 | nj = float(instance[0]) 21 | cumN += nj 22 | cumVal += nj*float(instance[1]) 23 | cumSumSq += nj*float(instance[2]) 24 | 25 | #calculate means 26 | mean = cumVal/cumN 27 | meanSq = cumSumSq/cumN 28 | 29 | #output size, mean, mean(square values) 30 | print "%d\t%f\t%f" % (cumN, mean, meanSq) 31 | print >> sys.stderr, "report: still alive" -------------------------------------------------------------------------------- /ch15/mrSVM.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | import pickle 6 | from numpy import * 7 | 8 | class MRsvm(MRJob): 9 | DEFAULT_INPUT_PROTOCOL = 'json_value' 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(MRsvm, self).__init__(*args, **kwargs) 13 | self.data = 
pickle.load(open('C:\Users\Peter\machinelearninginaction\Ch15\svmDat27')) 14 | self.w = 0 15 | self.eta = 0.69 16 | self.dataList = [] 17 | self.k = self.options.batchsize 18 | self.numMappers = 1 19 | self.t = 1 #iteration number 20 | 21 | def configure_options(self): 22 | super(MRsvm, self).configure_options() 23 | self.add_passthrough_option( 24 | '--iterations', dest='iterations', default=2, type='int', 25 | help='T: number of iterations to run') 26 | self.add_passthrough_option( 27 | '--batchsize', dest='batchsize', default=100, type='int', 28 | help='k: number of data points in a batch') 29 | 30 | def map(self, mapperId, inVals): #needs exactly 2 arguments 31 | #input: nodeId, ('w', w-vector) OR nodeId, ('x', int) 32 | if False: yield 33 | if inVals[0]=='w': #accumulate W-vector 34 | self.w = inVals[1] 35 | elif inVals[0]=='x': 36 | self.dataList.append(inVals[1])#accumulate data points to calc 37 | elif inVals[0]=='t': self.t = inVals[1] 38 | else: self.eta=inVals #this is for debug, eta not used in map 39 | 40 | def map_fin(self): 41 | labels = self.data[:,-1]; X=self.data[:,0:-1]#reshape data into X and Y 42 | if self.w == 0: self.w = [0.001]*shape(X)[1] #init w on first iteration 43 | for index in self.dataList: 44 | p = mat(self.w)*X[index,:].T #calc p=w*dataSet[key].T 45 | if labels[index]*p < 1.0: 46 | yield (1, ['u', index])#make sure everything has the same key 47 | yield (1, ['w', self.w]) #so it ends up at the same reducer 48 | yield (1, ['t', self.t]) 49 | 50 | def reduce(self, _, packedVals): 51 | for valArr in packedVals: #get values from streamed inputs 52 | if valArr[0]=='u': self.dataList.append(valArr[1]) 53 | elif valArr[0]=='w': self.w = valArr[1] 54 | elif valArr[0]=='t': self.t = valArr[1] 55 | labels = self.data[:,-1]; X=self.data[:,0:-1] 56 | wMat = mat(self.w); wDelta = mat(zeros(len(self.w))) 57 | for index in self.dataList: 58 | wDelta += float(labels[index])*X[index,:] #wDelta += label*dataSet 59 | eta = 1.0/(2.0*self.t) #calc new: eta 60 | #calc new: w = (1.0 - 1/t)*w + (eta/k)*wDelta 61 | wMat = (1.0 - 1.0/self.t)*wMat + (eta/self.k)*wDelta 62 | for mapperNum in range(1,self.numMappers+1): 63 | yield (mapperNum, ['w', wMat.tolist()[0] ]) #emit w 64 | if self.t < self.options.iterations: 65 | yield (mapperNum, ['t', self.t+1])#increment T 66 | for j in range(self.k/self.numMappers):#emit random ints for mappers iid 67 | yield (mapperNum, ['x', random.randint(shape(self.data)[0]) ]) 68 | 69 | def steps(self): 70 | return ([self.mr(mapper=self.map, reducer=self.reduce, 71 | mapper_final=self.map_fin)]*self.options.iterations) 72 | 73 | if __name__ == '__main__': 74 | MRsvm.run() 75 | -------------------------------------------------------------------------------- /ch15/mrSVMkickStart.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.protocol import JSONProtocol 4 | from numpy import * 5 | 6 | fw=open('kickStart2.txt', 'w') 7 | for i in [1]: 8 | for j in range(100): 9 | fw.write('["x", %d]\n' % random.randint(200)) 10 | fw.close() -------------------------------------------------------------------------------- /ch15/pegasos.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(fileName): 6 | dataMat = []; labelMat = [] 7 | fr = open(fileName) 8 | for line in fr.readlines(): 9 | lineArr = line.strip().split('\t') 10 | #dataMat.append([float(lineArr[0]), float(lineArr[1]), 
float(lineArr[2])]) 11 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) 12 | labelMat.append(float(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def seqPegasos(dataSet, labels, lam, T): 16 | m,n = shape(dataSet); w = zeros(n) 17 | for t in range(1, T+1): 18 | i = random.randint(m) 19 | eta = 1.0/(lam*t) 20 | p = predict(w, dataSet[i,:]) 21 | if labels[i]*p < 1: 22 | w = (1.0 - 1/t)*w + eta*labels[i]*dataSet[i,:] 23 | else: 24 | w = (1.0 - 1/t)*w 25 | print w 26 | return w 27 | 28 | def predict(w, x): 29 | return w*x.T 30 | 31 | def batchPegasos(dataSet, labels, lam, T, k): 32 | m,n = shape(dataSet); w = zeros(n); 33 | dataIndex = range(m) 34 | for t in range(1, T+1): 35 | wDelta = mat(zeros(n)) #reset wDelta 36 | eta = 1.0/(lam*t) 37 | random.shuffle(dataIndex) 38 | for j in range(k):#go over training set 39 | i = dataIndex[j] 40 | p = predict(w, dataSet[i,:]) #mapper code 41 | if labels[i]*p < 1: #mapper code 42 | wDelta += labels[i]*dataSet[i,:].A #accumulate changes 43 | w = (1.0 - 1/t)*w + (eta/k)*wDelta #apply changes at each T 44 | return w 45 | 46 | datArr,labelList = loadDataSet('testSet.txt') 47 | datMat = mat(datArr) 48 | #finalWs = seqPegasos(datMat, labelList, 2, 5000) 49 | finalWs = batchPegasos(datMat, labelList, 2, 50, 100) 50 | print finalWs 51 | 52 | import matplotlib 53 | import matplotlib.pyplot as plt 54 | fig = plt.figure() 55 | ax = fig.add_subplot(111) 56 | x1=[]; y1=[]; xm1=[]; ym1=[] 57 | for i in range(len(labelList)): 58 | if labelList[i] == 1.0: 59 | x1.append(datMat[i,0]); y1.append(datMat[i,1]) 60 | else: 61 | xm1.append(datMat[i,0]); ym1.append(datMat[i,1]) 62 | ax.scatter(x1, y1, marker='s', s=90) 63 | ax.scatter(xm1, ym1, marker='o', s=50, c='red') 64 | x = arange(-6.0, 8.0, 0.1) 65 | y = (-finalWs[0,0]*x - 0)/finalWs[0,1] 66 | #y2 = (0.43799*x)/0.12316 67 | y2 = (0.498442*x)/0.092387 #2 iterations 68 | ax.plot(x,y) 69 | ax.plot(x,y2,'g-.') 70 | ax.axis([-6,8,-4,5]) 71 | ax.legend(('50 Iterations', '2 Iterations') ) 72 | plt.show() -------------------------------------------------------------------------------- /ch15/proximalSVM.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import numpy 4 | 5 | def map(key, value): 6 | # input key= class for one training example, e.g. "-1.0" 7 | classes = [float(item) for item in key.split(",")] # e.g. [-1.0] 8 | D = numpy.diag(classes) 9 | 10 | # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0" 11 | featurematrix = [float(item) for item in value.split(",")] 12 | A = numpy.matrix(featurematrix) 13 | 14 | # create matrix E and vector e 15 | e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1)) 16 | E = numpy.matrix(numpy.append(A,-e,axis=1)) 17 | 18 | # create a tuple with the values to be used by reducer 19 | # and encode it with base64 to avoid potential trouble with '\t' and '\n' used 20 | # as default separators in Hadoop Streaming 21 | producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) ) 22 | 23 | # note: a single constant key "producedkey" sends to only one reducer 24 | # somewhat "atypical" due to low degree of parallism on reducer side 25 | print "producedkey\t%s" % (producedvalue) 26 | 27 | def reduce(key, values, mu=0.1): 28 | sumETE = None 29 | sumETDe = None 30 | 31 | # key isn't used, so ignoring it with _ (underscore). 
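# 以下循环把各个 mapper 发来的 (E^T*E, E^T*D*e) 逐个解码并累加;
# 累加完成后按 proximal SVM 的闭式解
#   [omega; gamma] = (I/mu + sum(E^T*E))^(-1) * sum(E^T*D*e)
# 求出分类面参数,其中 omega 为法向量、gamma 为偏置(见函数末尾的注释)。
# (这里假设模块中已 import pickle 和 base64,b64decode / loads 才能使用。)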
32 | for _, value in values: 33 | # unpickle values 34 | ETE, ETDe = pickle.loads(base64.b64decode(value)) 35 | if sumETE == None: 36 | # create the I/mu with correct dimensions 37 | sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu) 38 | sumETE += ETE 39 | 40 | if sumETDe == None: 41 | # create sumETDe with correct dimensions 42 | sumETDe = ETDe 43 | else: 44 | sumETDe += ETDe 45 | 46 | # note: omega = result[:-1] and gamma = result[-1] 47 | # but printing entire vector as output 48 | result = sumETE.I*sumETDe 49 | print "%s\t%s" % (key, str(result.tolist())) 50 | -------------------------------------------------------------------------------- /ch15/py27dbg.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | import pickle 6 | from numpy import * 7 | 8 | class MRsvm(MRJob): 9 | 10 | def map(self, mapperId, inVals): #needs exactly 2 arguments 11 | if False: yield 12 | yield (1, 22) 13 | 14 | def reduce(self, _, packedVals): 15 | yield "fuck ass" 16 | 17 | def steps(self): 18 | return ([self.mr(mapper=self.map, reducer=self.reduce)]) 19 | 20 | if __name__ == '__main__': 21 | MRsvm.run() 22 | -------------------------------------------------------------------------------- /ch15/wc.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | import json 5 | 6 | 7 | class MRWordCountUtility(MRJob): 8 | 9 | def __init__(self, *args, **kwargs): 10 | super(MRWordCountUtility, self).__init__(*args, **kwargs) 11 | self.chars = 0 12 | self.words = 0 13 | self.lines = 0 14 | 15 | def mapper(self, _, line): 16 | if False: 17 | yield # I'm a generator! 18 | 19 | self.chars += len(line) + 1 # +1 for newline 20 | self.words += sum(1 for word in line.split() if word.strip()) 21 | self.lines += 1 22 | 23 | def mapper_final(self): 24 | yield('chars', self.chars) 25 | yield('words', self.words) 26 | yield('lines', self.lines) 27 | 28 | def reducer(self, key, values): 29 | yield(key, sum(values)) 30 | 31 | 32 | if __name__ == '__main__': 33 | MRWordCountUtility.run() 34 | -------------------------------------------------------------------------------- /ch2/KNN.txt: -------------------------------------------------------------------------------- 1 | https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm -------------------------------------------------------------------------------- /ch2/KNN(classify).py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import operator 5 | from os import listdir 6 | 7 | 8 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 9 | #参数k表示用于选择最近邻居的数目 10 | #通过KNN进行分类 11 | def classify0(inX, dataSet, labels, k): 12 | 13 | dataSetSize = dataSet.shape[0] 14 | # 计算欧式距离 15 | diffMat = tile(inX, (dataSetSize,1)) - dataSet 16 | sqDiffMat = diffMat**2 17 | sqDistances = sqDiffMat.sum(axis=1) #行向量分别相加,从而得到新的一个行向量 18 | distances = sqDistances**0.5 19 | 20 | # 对距离进行排序 21 | sortedDistIndicies = distances.argsort() #argsort()根据元素的值从大到小对元素进行排序,返回下标 22 | classCount={} 23 | 24 | for i in range(k): 25 | voteIlabel = labels[sortedDistIndicies[i]] 26 | # 对选取的K个样本所属的类别个数进行统计 27 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 28 | 29 | #逆序,选取出现的类别次数最多的类别 30 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 31 | #返回出现的类别中次数最多的类别 32 | return sortedClassCount[0][0] 33 | 34 | def createDataSet(): 35 | 
group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]) 36 | labels = ['A','A','B','B'] 37 | print classify0([0][0], group,labels,3) # B 38 | # print classify0(([1][0],group,labels,3)) #A 39 | return group, labels 40 | 41 | if __name__ == '__main__': 42 | print createDataSet() 43 | -------------------------------------------------------------------------------- /ch2/KNN(datingTestSet).py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | import operator 5 | from os import listdir 6 | 7 | 8 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 9 | # 参数k表示用于选择最近邻居的数目 10 | 11 | #通过KNN进行分类 12 | def classify0(inX, dataSet, labels, k): 13 | 14 | dataSetSize = dataSet.shape[0] 15 | # 计算欧式距离 16 | diffMat = tile(inX, (dataSetSize,1)) - dataSet 17 | sqDiffMat = diffMat**2 18 | sqDistances = sqDiffMat.sum(axis=1) #行向量分别相加,从而得到新的一个行向量 19 | distances = sqDistances**0.5 20 | 21 | # 对距离进行排序 22 | sortedDistIndicies = distances.argsort() #argsort()根据元素的值从大到小对元素进行排序,返回下标 23 | classCount={} 24 | 25 | for i in range(k): 26 | voteIlabel = labels[sortedDistIndicies[i]] 27 | # 对选取的K个样本所属的类别个数进行统计 28 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 29 | 30 | #逆序,选取出现的类别次数最多的类别 31 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 32 | #返回出现的类别中次数最多的类别 33 | return sortedClassCount[0][0] 34 | 35 | def filematrix(filename): 36 | fr = open(filename) 37 | numberOfLines = len(fr.readlines()) # 文件中的行数 38 | returnMat = zeros((numberOfLines, 3)) # 初始化矩阵 39 | classLabelVector = [] # 初始化labels 40 | fr = open(filename) 41 | index = 0 42 | for line in fr.readlines(): 43 | line = line.strip() 44 | listFromLine = line.split('\t') 45 | returnMat[index, :] = listFromLine[0:3] 46 | classLabelVector.append(int(listFromLine[-1])) 47 | index += 1 48 | return returnMat, classLabelVector 49 | 50 | 51 | def autoNorm(dataSet): # 归一化特征值 52 | 53 | minVals = dataSet.min(0) 54 | maxVals = dataSet.max(0) 55 | ranges = maxVals - minVals 56 | normDataSet = zeros(shape(dataSet)) 57 | m = dataSet.shape[0] 58 | normDataSet = dataSet - tile(minVals, (m, 1)) 59 | normDataSet = normDataSet / tile(ranges, (m, 1)) # 特征值相除 60 | return normDataSet, ranges, minVals 61 | 62 | 63 | def datingClassTest(): 64 | # 注意:一共有1000个数据 65 | hoRatio = 0.10 # 随机选出 10% 的数据, 对于已有的数据,将90%作为训练,剩下10%作为测试 66 | datingDataMat, datingLabels = filematrix('datingTestSet.txt') 67 | normMat, ranges, minVals = autoNorm(datingDataMat) 68 | m = normMat.shape[0] 69 | numTestVecs = int(m * hoRatio) ##测试量 70 | errorCount = 0.0 71 | 72 | for i in range(numTestVecs): 73 | # 前10%行的数据作为测试集,并且对测试集中的每一行都进行预测,对比测试集中实际的label 74 | # 后90%行的数据全部作为训练集,每个测试集样本都要跟90%的训练集计算距离,算出最相似的label 75 | classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3) 76 | print "分类器返回为: %d, 答案为: %d" % (classifierResult, datingLabels[i]) 77 | 78 | if (classifierResult != datingLabels[i]): 79 | errorCount += 1.0 80 | print "error" 81 | 82 | print "测试数据量为:%d" % numTestVecs 83 | print "错误率: %f" % (errorCount / float(numTestVecs)) 84 | print "错误个数:%d" % errorCount 85 | 86 | 87 | if __name__ == '__main__': 88 | # print createDataSet() 89 | datingClassTest() 90 | 91 | -------------------------------------------------------------------------------- /ch2/KNN(handwriting).py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | import numpy as np 5 | import operator 6 | from os import 
listdir 7 | 8 | 9 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 10 | # 参数k表示用于选择最近邻居的数目 11 | 12 | # 通过KNN进行分类 13 | def classify0(inX, dataSet, labels, k): 14 | dataSetSize = dataSet.shape[0] 15 | # 计算欧式距离 16 | diffMat = tile(inX, (dataSetSize, 1)) - dataSet 17 | sqDiffMat = diffMat ** 2 18 | sqDistances = sqDiffMat.sum(axis=1) # 行向量分别相加,从而得到新的一个行向量 19 | distances = sqDistances ** 0.5 20 | 21 | # 对距离进行排序 22 | sortedDistIndicies = distances.argsort() # argsort()根据元素的值从大到小对元素进行排序,返回下标 23 | classCount = {} 24 | 25 | for i in range(k): 26 | voteIlabel = labels[sortedDistIndicies[i]] 27 | # 对选取的K个样本所属的类别个数进行统计 28 | classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 29 | 30 | # 逆序,选取出现的类别次数最多的类别 31 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 32 | # 返回出现的类别中次数最多的类别 33 | return sortedClassCount[0][0] 34 | 35 | 36 | def filematrix(filename): 37 | fr = open(filename) 38 | numberOfLines = len(fr.readlines()) # 文件中的行数 39 | returnMat = zeros((numberOfLines, 3)) # 初始化矩阵 40 | classLabelVector = [] # 初始化labels 41 | fr = open(filename) 42 | index = 0 43 | for line in fr.readlines(): 44 | line = line.strip() 45 | listFromLine = line.split('\t') 46 | returnMat[index, :] = listFromLine[0:3] 47 | classLabelVector.append(int(listFromLine[-1])) 48 | index += 1 49 | return returnMat, classLabelVector 50 | 51 | 52 | def createDataSetFromFile(filename): 53 | # Read lines 54 | file = open(filename) 55 | lines = file.readlines() 56 | file.close() 57 | 58 | # Change lines into array 59 | featureCount = len(lines[0].split()) - 1 60 | group = np.zeros((len(lines), featureCount)) 61 | labels = [] 62 | 63 | for i in range(len(lines)): 64 | lst = lines[i].split() 65 | group[i] = np.array(lst[:-1]) 66 | labels.append(lst[-1]) 67 | 68 | return (group, labels) 69 | 70 | 71 | def autoNorm(dataSet): # 归一化特征值 72 | 73 | minVals = dataSet.min(0) 74 | maxVals = dataSet.max(0) 75 | ranges = maxVals - minVals 76 | normDataSet = zeros(shape(dataSet)) 77 | m = dataSet.shape[0] 78 | normDataSet = dataSet - tile(minVals, (m, 1)) 79 | normDataSet = normDataSet / tile(ranges, (m, 1)) # 特征值相除 80 | return normDataSet, ranges, minVals 81 | 82 | #将图像转换成向量 83 | def img2vector(filename): 84 | 85 | # 创建1 * 1024的Numpy数组 86 | returnVect = zeros((1,1024)) 87 | fr = open(filename) 88 | #文件钱32行 89 | for i in range(32): 90 | lineStr = fr.readline() 91 | # 每行的头32个字符 92 | for j in range(32): 93 | returnVect[0,32*i+j] = int(lineStr[j]) 94 | return returnVect 95 | 96 | def handwritingClassTest(): 97 | 98 | hwLabels = [] 99 | trainingFileList = listdir('trainingDigits') #获取目录内容 100 | m = len(trainingFileList) 101 | 102 | trainingMat = zeros((m,1024)) 103 | 104 | for i in range(m): 105 | #从文件名解析分类数据 106 | fileNameStr = trainingFileList[i] 107 | fileStr = fileNameStr.split('.')[0] 108 | classNumStr = int(fileStr.split('_')[0]) 109 | hwLabels.append(classNumStr) 110 | 111 | trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr) 112 | 113 | #---------------------------------------------------------------# 114 | 115 | testFileList = listdir('testDigits') 116 | errorCount = 0.0 117 | mTest = len(testFileList) 118 | 119 | for i in range(mTest): 120 | 121 | fileNameStr = testFileList[i] 122 | fileStr = fileNameStr.split('.')[0] 123 | classNumStr = int(fileStr.split('_')[0]) 124 | 125 | vectorUnderTest = img2vector('testDigits/%s' % fileNameStr) 126 | 127 | classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 128 | 129 | print "分类器返回: %d, 真正答案是: %d" % (classifierResult, 
classNumStr) 130 | 131 | if (classifierResult != classNumStr): 132 | errorCount += 1.0 133 | print "error" 134 | 135 | print "\n错误个数有: %d" % errorCount 136 | print "\n错误率: %f %%" % (errorCount/float(mTest)*100) 137 | 138 | 139 | if __name__ == '__main__': 140 | 141 | testVector = img2vector("0_0.txt") 142 | print testVector[0,0:31] 143 | 144 | handwritingClassTest() 145 | 146 | -------------------------------------------------------------------------------- /ch2/README.md: -------------------------------------------------------------------------------- 1 | # Ch02 - k-近邻(KNN) 2 | 3 | ##### K-近邻算法: 4 | ##### 优点:精度高,对异常值不敏感,无数据输入假定。 5 | ##### 缺点:计算复杂度高,空间复杂度高。 6 | ##### 适合数据范围:数值型和标称型。 7 | 8 | ##### 通常k是不大于20的整数,最后,选择k个最相似数据中出现次数最多的分类,作为新数据的分类。 9 | 10 | ##### 训练算法不适用于K-近邻算法。 11 | 12 | ##### KNN是通过测量不同特征值之间的距离进行分类。 13 | ##### 它的的思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别。 14 | 15 | ##### KNN算法的思想总结一下: 16 | ##### 就是在训练集中数据和标签已知的情况下,输入测试数据,将测试数据的特征与训练集中对应的特征进行相互比较,找到训练集中与之最为相似的前K个数据,则该测试数据对应的类别就是K个数据中出现次数最多的那个分类,其算法的描述为: 17 | ##### [1] 计算测试数据与各个训练数据之间的距离; 18 | ##### [2] 按照距离的递增关系进行排序; 19 | ##### [3] 选取距离最小的K个点;(k<=20) 20 | ##### [4] 确定前K个点所在类别的出现频率; 21 | ##### [5] 返回前K个点中出现频率最高的类别作为测试数据的预测分类。 22 | 23 | ##### k-近邻算法优缺点 24 | ##### 优点: 25 | ##### [1] 在数据量不是很大时,是作为最简单最有效的算法。 26 | 27 | ##### [2] k-近邻算法是基于实例的学习,使用算法必须有接近实际数据的训练样本数。 28 | 29 | ##### 缺点: 30 | ##### [1] k-近邻算法对每个测试集样本都使用了一次全部的训练集,第一若是训练集大,需要较大的存储空间,这一点倒不是什么问题,现在处理的数据基本上上G,主要是第二点,因为对每个测试集样本都需要使用一次全部的训练集得到最短的k个距离值,那么计算必然非常耗时。 31 | ##### [2] k-近邻算法无法给出任何数据的基础结构信息。无法知晓平均实例样本和典型实例样本具有怎样的特征。 32 | 33 | ##### KNN 与 SVM 的区别是什么? 34 | ##### 一般分类任务主要有两个步骤: 35 | ##### 1.训练; 36 | ##### 2.测试。 37 | 38 | ##### 对于SVM,是先在训练集上训练一个模型,然后用这个模型直接对测试集进行分类。这两个步骤是独立的。 39 | ##### KNN是一种基于实例的学习算法,它不同于贝叶斯、决策树等算法,KNN不需要训练,当有新的实例出现时,直接在训练数据集中找k个最近的实例,把这个新的实例分配给这k个训练实例中实例数最多类。KNN也成为懒惰学习,它不需要训练过程,在类标边界比较整齐的情况下分类的准确率很高。KNN算法需要人为决定K的取值,即找几个最近的实例,k值不同,分类结果的结果也会不同。对于KNN,没有训练过程。只是将训练数据与训练数据进行距离度量来实现分类。 40 | 41 | ##### KNN:原理比较简单,可以需要很少量的样本数据,但一定要足够典型;高纬度情况下会疯掉。 42 | 43 | ##### SVM:适合处理高纬度情况。 44 | 45 | ##### K 值的选取没有一个绝对的标准,但可以想象,K 取太大并不能提高正确率,而且求 K 个最近的邻居是一个O(K*N)复杂度的算法,k 太大,算法效率会更低。 46 | 47 | ##### 虽然说 K 值的选取,会影响结果,有人会认为这个算法不稳定,其实不然,这种影响并不是很大,因为只有这种影响只是在类别边界上产生影响,而在类中心附近的实例影响很小。 48 | -------------------------------------------------------------------------------- /ch2/creatDist.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Rectangle 7 | 8 | 9 | n = 1000 #1000个点 10 | xcord = zeros((n)) 11 | ycord = zeros((n)) 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet.txt','w') 15 | for i in range(n): 16 | 17 | [r0,r1] = random.standard_normal(2) 18 | myClass = random.uniform(0,1) 19 | if (myClass <= 0.16): 20 | fFlyer = random.uniform(22000, 60000) 21 | tats = 3 + 1.6*r1 22 | markers.append(20) 23 | colors.append(2.1) 24 | classLabel = 1 #'didntLike' 25 | print ("%d, %f, 类1") % (fFlyer, tats) 26 | 27 | elif ((myClass > 0.16) and (myClass <= 0.33)): 28 | fFlyer = 6000*r0 + 70000 29 | tats = 10 + 3*r1 + 2*r0 30 | markers.append(20) 31 | colors.append(1.1) 32 | classLabel = 1 #'didntLike' 33 | print ("%d, %f, 类2") % (fFlyer, tats) 34 | elif ((myClass > 0.33) and (myClass <= 0.66)): 35 | fFlyer = 5000*r0 + 10000 36 | tats = 3 + 2.8*r1 37 | markers.append(30) 38 | colors.append(1.1) 39 | classLabel = 2 #'smallDoses' 40 | print ("%d, %f, 类2") % (fFlyer, tats) 41 | 42 | else: 43 | fFlyer = 
10000*r0 + 35000 44 | tats = 10 + 2.0*r1 45 | markers.append(50) 46 | colors.append(0.1) 47 | classLabel = 3 #'largeDoses' 48 | print ("%d, %f, 类3") % (fFlyer, tats) 49 | 50 | if (tats < 0): tats =0 51 | if (fFlyer < 0): fFlyer =0 52 | xcord[i] = fFlyer; 53 | ycord[i]=tats 54 | 55 | fw.write("%d\t%f\t%f\t%d\n" % (fFlyer, tats, random.uniform(0.0, 1.7), classLabel)) 56 | 57 | fw.close() 58 | fig = plt.figure() 59 | ax = fig.add_subplot(111) 60 | ax.scatter(xcord,ycord, c=colors, s=markers) 61 | type1 = ax.scatter([-10], [-10], s=20, c='red') 62 | type2 = ax.scatter([-10], [-15], s=30, c='green') 63 | type3 = ax.scatter([-10], [-20], s=50, c='blue') 64 | ax.legend([type1, type2, type3], ["class 1", "class 2", "class 3"], loc=2) 65 | #ax.axis([-5000,100000,-2,25]) 66 | plt.xlabel('Frequent Flyier Miles Earned Per Year') #横坐标 67 | plt.ylabel('Percentage of Body Covered By Tatoos') #纵坐标 68 | plt.show() 69 | -------------------------------------------------------------------------------- /ch2/testDigits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch2/testDigits.zip -------------------------------------------------------------------------------- /ch2/trainingDigits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch2/trainingDigits.zip -------------------------------------------------------------------------------- /ch3/README.md: -------------------------------------------------------------------------------- 1 | # Ch03 - 决策树(DecisionTree) 2 | 3 | ### 决策树示意图: 4 | 5 | ### example: 6 | ![TheTree](screenshot/TheTree.png) 7 | 8 | ### 使用决策树预测隐形眼镜类型: 9 | ![lensesTree](screenshot/lensesTree.png) 10 | -------------------------------------------------------------------------------- /ch3/TheTree.txt: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'no surfacing' 3 | p1 4 | (dp2 5 | I0 6 | S'no' 7 | p3 8 | sI1 9 | (dp4 10 | S'flippers' 11 | p5 12 | (dp6 13 | I0 14 | g3 15 | sI1 16 | S'yes' 17 | p7 18 | ssss. 
-------------------------------------------------------------------------------- /ch3/calcShannonEnt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createDataSet(): 10 | dataSet = [[1, 1, 'yes'], 11 | [1, 1, 'yes'], 12 | [1, 0, 'no'], 13 | [0, 1, 'no'], 14 | [0, 1, 'no']] 15 | labels = ['no surfacing','flippers'] 16 | return dataSet, labels 17 | 18 | def calcShannonEnt(dataSet): #计算熵 19 | 20 | numEntries = len(dataSet) 21 | labelCounts = {} 22 | for featVec in dataSet: 23 | currentLabel = featVec[-1] 24 | if currentLabel not in labelCounts.keys(): #键值不存在就加入 25 | labelCounts[currentLabel] = 0 26 | labelCounts[currentLabel] += 1 #计数 27 | 28 | shannonEnt = 0.0 29 | for key in labelCounts: 30 | prob = float(labelCounts[key]) / numEntries 31 | shannonEnt -= prob * log(prob, 2) # 以2为底求对数 32 | 33 | return shannonEnt 34 | 35 | 36 | if __name__ =='__main__': 37 | # print createDataSet() 38 | dataSet ,labels = createDataSet() 39 | print calcShannonEnt(dataSet) 40 | -------------------------------------------------------------------------------- /ch3/chooseBestFeatureToSplit.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createDataSet(): 10 | dataSet = [[1, 1, 'yes'], 11 | [1, 1, 'yes'], 12 | [1, 0, 'no'], 13 | [0, 1, 'no'], 14 | [0, 1, 'no']] 15 | labels = ['no surfacing','flippers'] 16 | return dataSet, labels 17 | 18 | def calcShannonEnt(dataSet): #计算熵 19 | 20 | numEntries = len(dataSet) 21 | labelCounts = {} 22 | for featVec in dataSet: 23 | currentLabel = featVec[-1] 24 | if currentLabel not in labelCounts.keys(): #键值不存在就加入 25 | labelCounts[currentLabel] = 0 26 | labelCounts[currentLabel] += 1 #计数 27 | 28 | shannonEnt = 0.0 29 | for key in labelCounts: 30 | prob = float(labelCounts[key]) / numEntries 31 | shannonEnt -= prob * log(prob, 2) # 以2为底求对数 32 | 33 | return shannonEnt 34 | 35 | #按照给定特征划分数据集 36 | def splitDataSet(dataSet,axis,value):#传递参数:待划分的数据集,划分数据集的特征(第axis个特征),特征的返回值 37 | 38 | retDataSet = [] #创建新的list对象 39 | 40 | for featVec in dataSet: #抽取 41 | if featVec[axis] == value: 42 | reducedFeatVec = featVec[:axis] 43 | reducedFeatVec.extend(featVec[axis + 1:]) 44 | # print featVec 45 | retDataSet.append(reducedFeatVec) 46 | 47 | return retDataSet 48 | 49 | #选择最好的数据集划分方式 50 | def chooseBestFeatureToSplit(dataSet): 51 | 52 | numFeatures = len(dataSet[0]) - 1 53 | baseEntropy = calcShannonEnt(dataSet) #整个数据集的原始香农熵 54 | bestInfoGain = 0.0 55 | bestFeature = -1 56 | for i in range(numFeatures): #遍历全部特征 57 | featList = [example[i] for example in dataSet]#创建一个新的list对象 58 | uniqueVals = set(featList) #容器set 59 | newEntropy = 0.0 60 | # 遍历当前特征中的所有唯一属性值,对每个唯一属性值划分一次数据集,计算数据集的新熵值,并对所有唯一特征值得到的熵求和。 61 | for value in uniqueVals: #计算每一种划分方式的信息熵 62 | 63 | subDataSet = splitDataSet(dataSet, i, value) 64 | # 计算概率:特征值划分出子集概率 65 | prob = len(subDataSet)/float(len(dataSet)) 66 | #因为我们在根据一个特征计算香农熵的时候,该特征的分类值是相同,这个特征这个分类的香农熵为0, 67 | # 即当我们的分类只有一类是香农熵是0,而分类越多,香农熵会越大 68 | #所以计算新的香农熵的时候使用的是子集 69 | newEntropy += prob * calcShannonEnt(subDataSet) #计算新的熵 70 | 71 | infoGain = baseEntropy - newEntropy 72 | 73 | if (infoGain > bestInfoGain): #计算最好的信息增量 74 | bestInfoGain = infoGain 75 | bestFeature = i 76 | return bestFeature #返回一个整数,返回最好的axis 
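# 用 createDataSet() 的 5 条样本手工验算一遍上面的流程(对数以 2 为底):
# 整体熵 = -(2/5)*log(2/5) - (3/5)*log(3/5) ≈ 0.971
# 按第 0 个特征划分:子集 {yes,yes,no} 熵约 0.918,子集 {no,no} 熵为 0,
#   新熵 = 3/5*0.918 + 2/5*0 ≈ 0.551,信息增益 ≈ 0.420
# 按第 1 个特征划分:新熵 = 4/5*1.0 + 1/5*0 = 0.8,信息增益 ≈ 0.171
# 因此最好的划分特征是第 0 个,与下面 main 中打印的结果 0 一致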
77 | 78 | if __name__ =='__main__': 79 | 80 | # print createDataSet() 81 | dataSet ,labels = createDataSet() 82 | print calcShannonEnt(dataSet) 83 | 84 | print splitDataSet(dataSet,0,1) 85 | print splitDataSet(dataSet,1,1) 86 | print chooseBestFeatureToSplit(dataSet) # 0:The best axis 87 | -------------------------------------------------------------------------------- /ch3/lenses.txt: -------------------------------------------------------------------------------- 1 | young myope no reduced no lenses 2 | young myope no normal soft 3 | young myope yes reduced no lenses 4 | young myope yes normal hard 5 | young hyper no reduced no lenses 6 | young hyper no normal soft 7 | young hyper yes reduced no lenses 8 | young hyper yes normal hard 9 | pre myope no reduced no lenses 10 | pre myope no normal soft 11 | pre myope yes reduced no lenses 12 | pre myope yes normal hard 13 | pre hyper no reduced no lenses 14 | pre hyper no normal soft 15 | pre hyper yes reduced no lenses 16 | pre hyper yes normal no lenses 17 | presbyopic myope no reduced no lenses 18 | presbyopic myope no normal no lenses 19 | presbyopic myope yes reduced no lenses 20 | presbyopic myope yes normal hard 21 | presbyopic hyper no reduced no lenses 22 | presbyopic hyper no normal soft 23 | presbyopic hyper yes reduced no lenses 24 | presbyopic hyper yes normal no lenses 25 | -------------------------------------------------------------------------------- /ch3/lensesTree.txt: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'tearRate' 3 | p1 4 | (dp2 5 | S'reduced' 6 | p3 7 | S'no lenses' 8 | p4 9 | sS'normal' 10 | p5 11 | (dp6 12 | S'astigmatic' 13 | p7 14 | (dp8 15 | S'yes' 16 | p9 17 | (dp10 18 | S'prescript' 19 | p11 20 | (dp12 21 | S'hyper' 22 | p13 23 | (dp14 24 | S'age' 25 | p15 26 | (dp16 27 | S'pre' 28 | p17 29 | S'no lenses' 30 | p18 31 | sS'presbyopic' 32 | p19 33 | S'no lenses' 34 | p20 35 | sS'young' 36 | p21 37 | S'hard' 38 | p22 39 | sssS'myope' 40 | p23 41 | S'hard' 42 | p24 43 | sssS'no' 44 | p25 45 | (dp26 46 | g15 47 | (dp27 48 | S'pre' 49 | p28 50 | S'soft' 51 | p29 52 | sS'presbyopic' 53 | p30 54 | (dp31 55 | g11 56 | (dp32 57 | S'hyper' 58 | p33 59 | S'soft' 60 | p34 61 | sS'myope' 62 | p35 63 | S'no lenses' 64 | p36 65 | sssS'young' 66 | p37 67 | S'soft' 68 | p38 69 | ssssss. 
-------------------------------------------------------------------------------- /ch3/plotTree.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | # 定义文本框和箭头格式 6 | decisionNode = dict(boxstyle="sawtooth", fc="0.8") 7 | leafNode = dict(boxstyle="round4", fc="0.8") 8 | arrow_args = dict(arrowstyle="<-") 9 | 10 | # 获取叶节点的数目 11 | def getNumLeafs(myTree): 12 | numLeafs = 0 13 | firstStr = myTree.keys()[0] 14 | secondDict = myTree[firstStr] 15 | for key in secondDict.keys(): 16 | if type(secondDict[key]).__name__=='dict': 17 | numLeafs += getNumLeafs(secondDict[key]) 18 | else: numLeafs +=1 19 | return numLeafs 20 | 21 | # 获取树的层数 22 | def getTreeDepth(myTree): 23 | maxDepth = 0 24 | firstStr = myTree.keys()[0] 25 | secondDict = myTree[firstStr] 26 | for key in secondDict.keys(): 27 | if type(secondDict[key]).__name__=='dict': 28 | thisDepth = 1 + getTreeDepth(secondDict[key]) 29 | else: 30 | thisDepth = 1 31 | 32 | if thisDepth > maxDepth: maxDepth = thisDepth 33 | return maxDepth 34 | 35 | # 绘制带箭头的注解 36 | def plotNode(nodeTxt, centerPt, parentPt, nodeType): 37 | createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', #annotate可以在数据图形上添加文本注释 38 | xytext=centerPt, textcoords='axes fraction', 39 | va="center", ha="center", bbox=nodeType, arrowprops=arrow_args ) 40 | 41 | #更新createPlot代码以得到整棵树 42 | #计算父节点和子节点的中间位置 43 | def plotMidText(cntrPt, parentPt, txtString): #在父子节点间填充文本信息 44 | xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0] 45 | yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1] 46 | createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30) 47 | 48 | # 使用文本注解绘制树节点 49 | def plotTree(myTree, parentPt, nodeTxt): 50 | 51 | #计算宽与高 52 | numLeafs = getNumLeafs(myTree) 53 | depth = getTreeDepth(myTree) 54 | 55 | firstStr = myTree.keys()[0] 56 | cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff) 57 | 58 | #标志子节点属性值 59 | plotMidText(cntrPt, parentPt, nodeTxt) 60 | plotNode(firstStr, cntrPt, parentPt, decisionNode) 61 | 62 | secondDict = myTree[firstStr] 63 | 64 | plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD # 减少 y 偏移 65 | 66 | for key in secondDict.keys(): 67 | if type(secondDict[key]).__name__=='dict': 68 | plotTree(secondDict[key],cntrPt,str(key)) 69 | else: 70 | plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW 71 | plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode) 72 | plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key)) 73 | plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD 74 | 75 | def createPlot(inTree): 76 | fig = plt.figure(1, facecolor='white') 77 | fig.clf() 78 | axprops = dict(xticks=[], yticks=[]) 79 | createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) 80 | #createPlot.ax1 = plt.subplot(111, frameon=False) 81 | plotTree.totalW = float(getNumLeafs(inTree)) 82 | plotTree.totalD = float(getTreeDepth(inTree)) 83 | plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0; 84 | plotTree(inTree, (0.5,1.0), '') 85 | plt.show() 86 | 87 | 88 | # 决策树的读取 89 | def grabTree(filename): # 并在需要的时候将其读取出来 90 | import pickle 91 | fr = open(filename) 92 | return pickle.load(fr) 93 | 94 | if __name__ == '__main__': 95 | #TheTree = grabTree("TheTree.txt") 96 | TheTree = grabTree("lensesTree.txt") 97 | print TheTree 98 | createPlot(TheTree) 99 | 100 | -------------------------------------------------------------------------------- /ch3/screenshot/TheTree.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch3/screenshot/TheTree.png -------------------------------------------------------------------------------- /ch3/screenshot/lensesTree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch3/screenshot/lensesTree.png -------------------------------------------------------------------------------- /ch4/README.md: -------------------------------------------------------------------------------- 1 | # Ch04 - 朴素贝叶斯(Naive Bayes) 2 | 3 | ## 朴素贝叶斯概述 4 | 5 | 朴素贝叶斯是一种简单但是非常强大的线性分类器。它在垃圾邮件分类,疾病诊断中都取得了很大的成功。它只所以称为朴素,是因为它假设特征之间是相互独立的,但是在现实生活中,这种假设基本上是不成立的。那么即使是在假设不成立的条件下,它依然表现的很好,尤其是在小规模样本的情况下。但是,如果每个特征之间有很强的关联性和非线性的分类问题会导致朴素贝叶斯模型有很差的分类效果。 6 | 7 | 朴素贝叶斯分类器通过求出使得概率 ![\\inline P\(X|W\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X|W%29) 最大化的类别 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X),以确定特征向量 ![\\inline W = \(w_1, w_2, w_3, \\dots\)](http://latex.codecogs.com/png.latex?%5Cinline%20W%20%3D%20%28w_1%2C%20w_2%2C%20w_3%2C%20%5Cdots%29) 最有可能属于的类别。 8 | 9 | 根据条件概率公式,![\\inline P\(X|W\) = \\frac{P\(W|X\) \\times P\(X\)}{P\(W\)}](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X|W%29%20%3D%20%5Cfrac{P%28W|X%29%20%5Ctimes%20P%28X%29}{P%28W%29})。![\\inline P\(X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X%29) 可以视为一个先验概率,用类别 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X) 在样本中的频率近似算出。![\\inline P\(W\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W%29) 虽然很难计算,但它是一个与 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X) 无关的常数,而我们只需要找到使得概率最大化的 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X),只要比较大小,并不需要精确算出这个概率,所以可以无视这个值。 10 | 11 | 问题就在于如何计算 ![\\inline P\(W|X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W|X%29),这里就是朴素贝叶斯分类器的“朴素”体现出来的地方。朴素贝叶斯分类器做了一个强假设,认为 ![\\inline W](http://latex.codecogs.com/png.latex?%5Cinline%20W) 里的每个特征都是互相独立的,即 ![\\inline P\(W|X\) = P\(w_1|X\) \\times P\(w_2|X\) \\times P\(w_3|X\)\\dots](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W|X%29%20%3D%20P%28w_1|X%29%20%5Ctimes%20P%28w_2|X%29%20%5Ctimes%20P%28w_3|X%29%5Cdots),这就方便了我们的概率计算。 12 | 13 | 为了计算某一个特征的概率 ![\\inline P\(w|X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28w|X%29),如果 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 的取值是离散的,直接使用古典概型计算即可;如果 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 的取值是连续的,可以假设 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 服从正态分布。 14 | 15 | 太多的小概率乘起来,可能会因为结果太小导致下溢或者得到不正确的答案。解决方法是:可以将概率取对数拟然(log-likelihood),这样乘法就变成了加法,取值虽然不相同,但也不影响最终答案。 16 | 17 | ## 朴素贝叶斯背后的数学原理 18 | 19 | ### 后验概率(Posterior Probabilities) 20 | ### 条件概率(Conditional Probabilities) 21 | ### 先验概率(Prior Probabilities) 22 | ### 现象概率(Evidence Probabilities) 23 | 24 | ## 应用: 25 | 贝叶斯模型在很多方面都有应用,熟知的领域就有垃圾邮件识别、文本的模糊匹配、欺诈判别、商品推荐等等。通过贝叶斯模型的阐述,应该有这样的一种体会:分析模型并不取决于多么复杂的数学公式,多么高级的软件工具,多么高深的算法组合;它们的原理往往是通俗易懂的,实现起来也没有多高的门槛。比如贝叶斯模型,用Excel的单元格和加减乘除的符号就能实现。所以,不要觉得数据分析建模有多遥远,其实就在你手边。 26 | -------------------------------------------------------------------------------- /ch4/advertisement.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | 
import operator 7 | from os import listdir 8 | import feedparser 9 | 10 | def createVocabList(dataSet): 11 | vocabSet = set([]) # 创建一个空集 12 | for document in dataSet: 13 | vocabSet = vocabSet | set(document) # 合并两个集合 14 | return list(vocabSet) 15 | 16 | 17 | # 朴素贝叶斯分类器训练函数 18 | def trainNB0(trainMatrix,trainCategory): 19 | numTrainDocs = len(trainMatrix) 20 | numWords = len(trainMatrix[0]) 21 | pAbusive = sum(trainCategory)/float(numTrainDocs) 22 | 23 | # 初始化概率 24 | p0Num = ones(numWords) 25 | p1Num = ones(numWords) 26 | p0Denom = 2.0 27 | p1Denom = 2.0 28 | for i in range(numTrainDocs): 29 | if trainCategory[i] == 1: 30 | # 向量相加 31 | p1Num += trainMatrix[i] 32 | p1Denom += sum(trainMatrix[i]) 33 | elif trainCategory[i] == 0: 34 | p0Num += trainMatrix[i] 35 | p0Denom += sum(trainMatrix[i]) 36 | 37 | # 对每个元素做除法 38 | p1Vect = p1Num / p1Denom 39 | p0Vect = p0Num / p0Denom 40 | # p1Vect = log(p1Num/p1Denom) # change to log 41 | # p0Vect = log(p0Num/p0Denom) # change to log 42 | 43 | return p0Vect,p1Vect,pAbusive 44 | 45 | # 朴素贝叶斯分类函数 46 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 47 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 48 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 49 | if p1 > p0: 50 | return 1 51 | else: 52 | return 0 53 | 54 | def textParse(bigString): 55 | import re 56 | listofTokens = re.split(r'\w*',bigString) 57 | # 过滤掉长度小于3的字符串 58 | return [tok.lower() for tok in listofTokens if len(tok) > 2] # 过滤掉长度小于三的字符串 59 | 60 | # 朴素贝叶斯词袋模型 61 | def bagOfWords2VecMN(vocabList, inputSet): 62 | returnVec = [0]*len(vocabList) 63 | for word in inputSet: 64 | if word in vocabList: 65 | returnVec[vocabList.index(word)] += 1 66 | return returnVec 67 | 68 | 69 | def calcMostFreq(vocabList,fullText): 70 | # 计算出现的频率 71 | freqDict = {} 72 | for token in vocabList: 73 | freqDict[token]=fullText.count(token) 74 | sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True) 75 | return sortedFreq[:30] 76 | 77 | def localWords(feed1,feed0): 78 | 79 | docList=[]; classList = []; fullText =[] 80 | minLen = min(len(feed1['entries']),len(feed0['entries'])) 81 | for i in range(minLen): 82 | wordList = textParse(feed1['entries'][i]['summary']) 83 | docList.append(wordList) 84 | fullText.extend(wordList) 85 | classList.append(1) #NY is class 1 86 | wordList = textParse(feed0['entries'][i]['summary']) 87 | docList.append(wordList) 88 | fullText.extend(wordList) 89 | classList.append(0) 90 | vocabList = createVocabList(docList) # create vocabulary 91 | top30Words = calcMostFreq(vocabList,fullText) # remove top 30 words 92 | for pairW in top30Words: 93 | if pairW[0] in vocabList: vocabList.remove(pairW[0]) 94 | trainingSet = range(2*minLen); testSet=[] # create test set 95 | for i in range(20): 96 | randIndex = int(random.uniform(0,len(trainingSet))) 97 | testSet.append(trainingSet[randIndex]) 98 | del(trainingSet[randIndex]) 99 | trainMat=[]; trainClasses = [] 100 | for docIndex in trainingSet: # train the classifier (get probs) trainNB0 101 | trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) 102 | trainClasses.append(classList[docIndex]) 103 | p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) 104 | errorCount = 0 105 | for docIndex in testSet: # classify the remaining items 106 | wordVector = bagOfWords2VecMN(vocabList, docList[docIndex]) 107 | if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]: 108 | errorCount += 1 109 | print 'the error rate is: ',float(errorCount)/len(testSet) 110 | return vocabList,p0V,p1V 111 
| 112 | 113 | if __name__ =='__main__': 114 | ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss') 115 | sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss') 116 | vocabList,pSF,pNY = localWords(ny,sf) 117 | 118 | -------------------------------------------------------------------------------- /ch4/classifyNB && testingNB.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | 10 | def loadDataSet(): 11 | postingList = [['my','dog','has','flea','problem','help','please',], 12 | ['maybe','not','take','him','to','dog','park','stupid'], 13 | ['my','dalmation','is','so','cute','I','love','him'], 14 | ['stop','posting','stupid','worthless','garbage'], 15 | ['mr','licks','ate','my','steak','how','to','stop','him'], 16 | ['quit','buying','worthless','dog','food','stdpid']] 17 | classVec = [0,1,0,1,0,1] # 1 代表侮辱性文字 0代表正常言论 18 | return postingList,classVec 19 | 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) # 创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) # 合并两个集合 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList, inputSet): 28 | returnVec = [0]*len(vocabList) # 创建一个其中的全部元素都为 0 的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print " %s不在字典集内!" % word 34 | return returnVec 35 | 36 | # 朴素贝叶斯分类器训练函数 37 | def trainNB0(trainMatrix,trainCategory): 38 | numTrainDocs = len(trainMatrix) 39 | numWords = len(trainMatrix[0]) 40 | pAbusive = sum(trainCategory)/float(numTrainDocs) 41 | 42 | # 初始化概率 43 | p0Num = ones(numWords) 44 | p1Num = ones(numWords) 45 | p0Denom = 2.0 46 | p1Denom = 2.0 47 | for i in range(numTrainDocs): 48 | if trainCategory[i] == 1: 49 | # 向量相加 50 | p1Num += trainMatrix[i] 51 | p1Denom += sum(trainMatrix[i]) 52 | elif trainCategory[i] == 0: 53 | p0Num += trainMatrix[i] 54 | p0Denom += sum(trainMatrix[i]) 55 | 56 | # 对每个元素做除法 57 | p1Vect = p1Num / p1Denom 58 | p0Vect = p0Num / p0Denom 59 | # p1Vect = log(p1Num/p1Denom) # change to log 60 | # p0Vect = log(p0Num/p0Denom) # change to log 61 | 62 | return p0Vect,p1Vect,pAbusive 63 | 64 | # 朴素贝叶斯分类函数 65 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 66 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 67 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 68 | if p1 > p0: 69 | return 1 70 | else: 71 | return 0 72 | 73 | def testingNB(): 74 | listOPosts,listClasses = loadDataSet() 75 | myVocabList = createVocabList(listOPosts) 76 | trainMat=[] 77 | 78 | for postinDoc in listOPosts: 79 | trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) 80 | 81 | p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) 82 | 83 | testEntry = ['love', 'my', 'dalmation'] 84 | thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) 85 | 86 | print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb) 87 | 88 | testEntry = ['stupid', 'garbage'] 89 | thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) 90 | print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb) 91 | 92 | 93 | if __name__ =='__main__': 94 | listPosts,listClass = loadDataSet() 95 | myVocaburaryList = createVocabList(listPosts) 96 | # print sorted(myVocaburaryList) 97 | # print setOfWords2Vec(myVocaburaryList,listPosts[0]) 98 | trainMat = [] 99 | for postinDoc in listPosts: 100 | 
trainMat.append(setOfWords2Vec(myVocaburaryList,postinDoc)) 101 | p0V,p1V,pAb = trainNB0(trainMat,listClass) 102 | #print p0V,p1V 103 | print pAb 104 | testingNB() 105 | -------------------------------------------------------------------------------- /ch4/email.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createVocabList(dataSet): 10 | vocabSet = set([]) # 创建一个空集 11 | for document in dataSet: 12 | vocabSet = vocabSet | set(document) # 合并两个集合 13 | return list(vocabSet) 14 | 15 | 16 | # 朴素贝叶斯分类器训练函数 17 | def trainNB0(trainMatrix, trainCategory): 18 | numTrainDocs = len(trainMatrix) 19 | numWords = len(trainMatrix[0]) 20 | pAbusive = sum(trainCategory) / float(numTrainDocs) 21 | 22 | # 初始化概率 23 | p0Num = ones(numWords) 24 | p1Num = ones(numWords) 25 | p0Denom = 2.0 26 | p1Denom = 2.0 27 | for i in range(numTrainDocs): 28 | if trainCategory[i] == 1: 29 | # 向量相加 30 | p1Num += trainMatrix[i] 31 | p1Denom += sum(trainMatrix[i]) 32 | elif trainCategory[i] == 0: 33 | p0Num += trainMatrix[i] 34 | p0Denom += sum(trainMatrix[i]) 35 | 36 | # 对每个元素做除法 37 | p1Vect = p1Num / p1Denom 38 | p0Vect = p0Num / p0Denom 39 | # p1Vect = log(p1Num/p1Denom) # change to log 40 | # p0Vect = log(p0Num/p0Denom) # change to log 41 | 42 | return p0Vect, p1Vect, pAbusive 43 | 44 | 45 | # 朴素贝叶斯分类函数 46 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 47 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) # 元素相乘 48 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 49 | if p1 > p0: 50 | return 1 51 | else: 52 | return 0 53 | 54 | def textParse(bigString): 55 | import re 56 | listofTokens = re.split(r'\w*', bigString) 57 | # 过滤掉长度小于3的字符串 58 | return [tok.lower() for tok in listofTokens if len(tok) > 2] # 过滤掉长度小于三的字符串 59 | 60 | 61 | # 朴素贝叶斯词袋模型 62 | def bagOfWords2VecMN(vocabList, inputSet): 63 | returnVec = [0] * len(vocabList) 64 | for word in inputSet: 65 | if word in vocabList: 66 | returnVec[vocabList.index(word)] += 1 67 | return returnVec 68 | 69 | 70 | # 文本解析及完整的垃圾邮件测试函数 71 | def spamTest(): 72 | docList = [] 73 | classList = [] 74 | fullText = [] 75 | 76 | # 导入并解析文本文件 77 | for i in range(1, 26): 78 | wordList = textParse(open('email/spam/%d.txt' % i).read()) 79 | docList.append(wordList) 80 | fullText.extend(wordList) 81 | classList.append(1) 82 | 83 | wordList = textParse(open('email/ham/%d.txt' % i).read()) 84 | docList.append(wordList) 85 | fullText.extend(wordList) 86 | classList.append(0) 87 | 88 | vocabList = createVocabList(docList) 89 | trainingSet = range(50) 90 | testSet = [] 91 | 92 | # 随机构建训练集 93 | for i in range(10): 94 | randIndex = int(random.uniform(0, len(trainingSet))) 95 | testSet.append(trainingSet[randIndex]) 96 | del (trainingSet[randIndex]) 97 | 98 | trainMat = [] 99 | trainClasses = [] 100 | 101 | # calculate the probability 102 | for docIndex in trainingSet: 103 | trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) 104 | trainClasses.append(classList[docIndex]) 105 | p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses)) 106 | errorCount = 0 107 | 108 | # 对测试集分类 109 | for docIndex in testSet: 110 | wordVector = bagOfWords2VecMN(vocabList, docList[docIndex]) 111 | if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: 112 | errorCount += 1 113 | print "classification error", docList[docIndex] 114 | print 'the error rate is: ', float(errorCount) / len(testSet) 
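# 上面的做法是简单的留存交叉验证(hold-out cross validation):
# 从 50 封邮件中随机抽出 10 封作测试集,其余 40 封用于训练;
# 由于测试集是随机抽取的,每次运行的错误率会有波动,
# 多次调用 spamTest() 取平均错误率更能反映分类器的真实水平。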
115 | return vocabList, fullText 116 | 117 | 118 | if __name__ == '__main__': 119 | 120 | print spamTest() 121 | -------------------------------------------------------------------------------- /ch4/ham.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/ham.zip -------------------------------------------------------------------------------- /ch4/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # 效果图 2 | # math_matplotlib.py 3 | 4 | ![math_matplotlib](srceenshot/math_matplotlib.png) 5 | 6 | # matplotlib.py 7 | 8 | ![matplotlib](srceenshot/matplotlib.png) 9 | -------------------------------------------------------------------------------- /ch4/matplotlib/math_matplotlib.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | t = arange(0.0, 0.5, 0.01) 8 | #s = sin(2*t) 9 | s = sin(2*pi*t) 10 | logS = log(s) 11 | 12 | fig = plt.figure() 13 | ax = fig.add_subplot(211) 14 | ax.plot(t,s) # f(x) = sin(2*pi*t) 15 | ax.set_ylabel('f(x)') 16 | ax.set_xlabel('x') 17 | 18 | ax = fig.add_subplot(212) 19 | ax.plot(t,logS) # f(x) =log(s) 20 | ax.set_ylabel('ln(f(x))') 21 | ax.set_xlabel('x') 22 | plt.show() -------------------------------------------------------------------------------- /ch4/matplotlib/matplotlib.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | 5 | n = 1000 # number of points to create 6 | xcord0 = [] 7 | ycord0 = [] 8 | xcord1 = [] 9 | ycord1 = [] 10 | markers =[] 11 | colors =[] 12 | fw = open('testSet.txt','w') 13 | for i in range(n): 14 | [r0,r1] = random.standard_normal(2) 15 | myClass = random.uniform(0,1) 16 | if (myClass <= 0.5): 17 | fFlyer = r0 + 9.0 18 | tats = 1.0*r1 + fFlyer - 9.0 19 | xcord0.append(fFlyer) 20 | ycord0.append(tats) 21 | else: 22 | fFlyer = r0 + 2.0 23 | tats = r1+fFlyer - 2.0 24 | xcord1.append(fFlyer) 25 | ycord1.append(tats) 26 | #fw.write("%f\t%f\t%d\n" % (fFlyer, tats, classLabel)) 27 | 28 | fw.close() 29 | fig = plt.figure() 30 | ax = fig.add_subplot(111) 31 | #ax.scatter(xcord,ycord, c=colors, s=markers) 32 | ax.scatter(xcord0,ycord0, marker='^', s=90,c='blue') 33 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 34 | plt.plot([0,1], label='going up') 35 | plt.show() -------------------------------------------------------------------------------- /ch4/matplotlib/srceenshot/math_matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/matplotlib/srceenshot/math_matplotlib.png -------------------------------------------------------------------------------- /ch4/matplotlib/srceenshot/matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/matplotlib/srceenshot/matplotlib.png -------------------------------------------------------------------------------- /ch4/spam.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/spam.zip -------------------------------------------------------------------------------- /ch4/trainNB0.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | 10 | def loadDataSet(): 11 | postingList = [['my','dog','has','flea','problem','help','please',], 12 | ['maybe','not','take','him','to','dog','park','stupid'], 13 | ['my','dalmation','is','so','cute','I','love','him'], 14 | ['stop','posting','stupid','worthless','garbage'], 15 | ['mr','licks','ate','my','steak','how','to','stop','him'], 16 | ['quit','buying','worthless','dog','food','stdpid']] 17 | classVec = [0,1,0,1,0,1] # 1 代表侮辱性文字 0代表正常言论 18 | return postingList,classVec 19 | 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) # 创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) # 合并两个集合 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList, inputSet): 28 | returnVec = [0]*len(vocabList) # 创建一个其中的全部元素都为 0 的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print " %s不在字典集内!" % word 34 | return returnVec 35 | 36 | # 朴素贝叶斯分类器训练函数 37 | def trainNB0(trainMatrix,trainCategory): 38 | numTrainDocs = len(trainMatrix) 39 | numWords = len(trainMatrix[0]) 40 | pAbusive = sum(trainCategory)/float(numTrainDocs) 41 | 42 | # 初始化概率 43 | p0Num = ones(numWords) 44 | p1Num = ones(numWords) 45 | p0Denom = 2.0 46 | p1Denom = 2.0 47 | for i in range(numTrainDocs): 48 | if trainCategory[i] == 1: 49 | # 向量相加 50 | p1Num += trainMatrix[i] 51 | p1Denom += sum(trainMatrix[i]) 52 | elif trainCategory[i] == 0: 53 | p0Num += trainMatrix[i] 54 | p0Denom += sum(trainMatrix[i]) 55 | 56 | # 对每个元素做除法 57 | p1Vect = log(p1Num/p1Denom) # change to log 58 | p0Vect = log(p0Num/p0Denom) # change to log 59 | 60 | return p0Vect,p1Vect,pAbusive 61 | 62 | if __name__ =='__main__': 63 | listPosts,listClass = loadDataSet() 64 | myVocaburaryList = createVocabList(listPosts) 65 | print sorted(myVocaburaryList) 66 | print setOfWords2Vec(myVocaburaryList,listPosts[0]) 67 | -------------------------------------------------------------------------------- /ch5/README.md: -------------------------------------------------------------------------------- 1 | # Ch05 - Logistic回归(Logistic regression) 2 | 3 | ## Logistic回归的主要思想: 4 | #### Logistic回归是本书目前为止首次接触最优化算法。 5 | #### 根据现有数据对分类边界线建立回归公式,以此进行分类。 6 | #### 我们想要的函数应该是能接受所有的输入然后预测出类别,两个类的情况下输出0或者1,这种函数叫做单位阶越函数。然而,这种函数的问题在于: 7 | #### 该函数在跳跃点上从0 瞬间跳跃到1,这个瞬间跳跃过程优势很难处理。但幸好的是,另一个函数也有类似的性质,且数学上更容易处理,这就是Sigmoid函数。 8 | ## 下图是坐标尺度下的Sigmoid函数图。 9 | ![sigmoid](screenshot/sigmoid.png) 10 | 11 | ## 梯度上升法 12 | #### 这个在机器学习的数学上并不难。先说一个具体的函数例子。 13 | ![梯度上升法](screenshot/梯度上升.png) 14 | #### 梯度上升算法到达每个点都会重新估计移动的方向。从P0开始,计算完改点的梯度,函数就会根据梯度移动到下一个点P1,梯度再次被重新计算,并沿新的梯度方向移动到P2。如此循环迭代,直到满足停止条件。迭代的过程中,梯度算子总是保证我们能选取到最佳的移动方向。可以看到,梯度算子总是指向函数值增长最快的方向。这里说的只是移动方向,并不是移动量的大小。 15 | 16 | # 下面是几个有区别的训练算法效果示意图: 17 | 18 | ## Logistic回归最佳拟合直线 19 | ![Logistic回归最佳拟合直线](screenshot/Logistic回归最佳拟合直线.png) 20 | 21 | ## 随机梯度上升 22 | ![随机梯度上升](screenshot/随机梯度上升.png) 23 | 24 | ## 改进随机梯度上升 25 | ![改进随机梯度上升](screenshot/改进随机梯度上升.png) 26 | 27 | -------------------------------------------------------------------------------- /ch5/matplotlib/sigmoid.py: 
-------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import sys 3 | from pylab import * 4 | 5 | t = arange(-60.0, 60.3, 0.1) 6 | s = 1/(1 + exp(-t)) 7 | ax = subplot(211) 8 | ax.plot(t,s) 9 | ax.axis([-5,5,0,1]) 10 | plt.xlabel('x') 11 | plt.ylabel('Sigmoid(x)') 12 | ax = subplot(212) 13 | ax.plot(t,s) 14 | ax.axis([-60,60,0,1]) 15 | plt.xlabel('x') 16 | plt.ylabel('Sigmoid(x)') 17 | show() -------------------------------------------------------------------------------- /ch5/matplotlib/梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | import numpy as np 5 | import matplotlib.cm as cm 6 | import matplotlib.mlab as mlab 7 | import matplotlib.pyplot as plt 8 | 9 | leafNode = dict(boxstyle="round4", fc="0.8") 10 | arrow_args = dict(arrowstyle="<-") 11 | 12 | matplotlib.rcParams['xtick.direction'] = 'out' 13 | matplotlib.rcParams['ytick.direction'] = 'out' 14 | 15 | delta = 0.025 16 | x = np.arange(-2.0, 2.0, delta) 17 | y = np.arange(-2.0, 2.0, delta) 18 | X, Y = np.meshgrid(x, y) 19 | Z1 = -((X-1)**2) 20 | Z2 = -(Y**2) 21 | 22 | #Z1 = mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0) 23 | #Z2 = mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1) 24 | # difference of Gaussians 25 | 26 | Z = 1.0 * (Z2 + Z1)+5.0 27 | 28 | plt.figure() 29 | CS = plt.contour(X, Y, Z) 30 | plt.annotate('', xy=(0.05, 0.05), xycoords='axes fraction', 31 | xytext=(0.2,0.2), textcoords='axes fraction', 32 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 33 | plt.text(-1.9, -1.8, 'P0') 34 | plt.annotate('', xy=(0.2,0.2), xycoords='axes fraction', 35 | xytext=(0.35,0.3), textcoords='axes fraction', 36 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 37 | plt.text(-1.35, -1.23, 'P1') 38 | plt.annotate('', xy=(0.35,0.3), xycoords='axes fraction', 39 | xytext=(0.45,0.35), textcoords='axes fraction', 40 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 41 | plt.text(-0.7, -0.8, 'P2') 42 | plt.text(-0.3, -0.6, 'P3') 43 | plt.clabel(CS, inline=1, fontsize=10) 44 | plt.title("Gradient Ascent") 45 | plt.xlabel('x') 46 | plt.ylabel('y') 47 | plt.show() 48 | -------------------------------------------------------------------------------- /ch5/screenshot/Logistic回归最佳拟合直线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/Logistic回归最佳拟合直线.png -------------------------------------------------------------------------------- /ch5/screenshot/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/sigmoid.png -------------------------------------------------------------------------------- /ch5/screenshot/改进随机梯度上升.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/改进随机梯度上升.png -------------------------------------------------------------------------------- /ch5/screenshot/梯度上升.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/梯度上升.png -------------------------------------------------------------------------------- /ch5/screenshot/随机梯度上升.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/随机梯度上升.png -------------------------------------------------------------------------------- /ch5/testSet.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /ch5/使用梯度上升找最佳拟合直线.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = 
open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose() * diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def plotBestFit(weights): 33 | import matplotlib.pyplot as plt 34 | dataMat,labelMat=loadDataSet() 35 | dataArr = array(dataMat) 36 | n = shape(dataArr)[0] 37 | xcord1 = []; ycord1 = [] 38 | xcord2 = []; ycord2 = [] 39 | for i in range(n): 40 | if int(labelMat[i])== 1: 41 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 42 | else: 43 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 44 | fig = plt.figure() 45 | ax = fig.add_subplot(111) 46 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 47 | ax.scatter(xcord2, ycord2, s=30, c='green') 48 | x = arange(-3.0, 3.0, 0.1) 49 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 50 | ax.plot(x, y) 51 | plt.xlabel('X1') 52 | plt.ylabel('X2') 53 | plt.show() 54 | 55 | if __name__ == "__main__" : 56 | dataMat,labelMat = loadDataSet() 57 | # print dataMat,labelMat 58 | weight = gradAscent(dataMat,labelMat) 59 | plotBestFit(weight) 60 | 61 | 62 | -------------------------------------------------------------------------------- /ch5/改进随机梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose()*diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def stocGradAscent(dataMatrix, classLabels, numIter=150): 33 | m,n = shape(dataMatrix) 34 | weights = ones(n) #initialize to all ones 35 | for j in range(numIter): 36 | dataIndex = range(m) 37 | for i in range(m): 38 | # apha decreases with iteration, does not 39 | alpha = 4/(1.0+j+i)+0.0001 40 | # go to 0 because of the constant 41 | randIndex = int(random.uniform(0,len(dataIndex))) 42 | h = sigmoid(sum(dataMatrix[randIndex]*weights)) 43 | error = classLabels[randIndex] - h 44 | weights = weights + alpha * error * dataMatrix[randIndex] 45 | del(dataIndex[randIndex]) 46 | return weights 47 | 48 | def plotBestFit(weights): 49 | import matplotlib.pyplot as plt 50 | dataMat,labelMat=loadDataSet() 51 | dataArr = array(dataMat) 52 | n = shape(dataArr)[0] 53 | xcord1 = []; ycord1 = [] 54 
| xcord2 = []; ycord2 = [] 55 | for i in range(n): 56 | if int(labelMat[i])== 1: 57 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 58 | else: 59 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 60 | fig = plt.figure() 61 | ax = fig.add_subplot(111) 62 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 63 | ax.scatter(xcord2, ycord2, s=30, c='green') 64 | x = arange(-3.0, 3.0, 0.1) 65 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 66 | ax.plot(x, y) 67 | plt.xlabel('X1') 68 | plt.ylabel('X2') 69 | plt.show() 70 | 71 | if __name__ == "__main__" : 72 | dataMat,labelMat = loadDataSet() 73 | # print dataMat,labelMat 74 | weight = stocGradAscent(array(dataMat),labelMat) 75 | plotBestFit(weight) 76 | 77 | 78 | -------------------------------------------------------------------------------- /ch5/随机梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose()*diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def stocGradAscent(dataMatrix, classLabels): 33 | m,n = shape(dataMatrix) 34 | alpha = 0.01 35 | weights = ones(n) #initialize to all ones 36 | for i in range(m): 37 | h = sigmoid(sum(dataMatrix[i]*weights)) 38 | error = classLabels[i] - h 39 | weights = weights + alpha * error * dataMatrix[i] 40 | return weights 41 | 42 | def plotBestFit(weights): 43 | import matplotlib.pyplot as plt 44 | dataMat,labelMat=loadDataSet() 45 | dataArr = array(dataMat) 46 | n = shape(dataArr)[0] 47 | xcord1 = []; ycord1 = [] 48 | xcord2 = []; ycord2 = [] 49 | for i in range(n): 50 | if int(labelMat[i])== 1: 51 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 52 | else: 53 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 54 | fig = plt.figure() 55 | ax = fig.add_subplot(111) 56 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 57 | ax.scatter(xcord2, ycord2, s=30, c='green') 58 | x = arange(-3.0, 3.0, 0.1) 59 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 60 | ax.plot(x, y) 61 | plt.xlabel('X1') 62 | plt.ylabel('X2') 63 | plt.show() 64 | 65 | if __name__ == "__main__" : 66 | dataMat,labelMat = loadDataSet() 67 | # print dataMat,labelMat 68 | weight = stocGradAscent(array(dataMat),labelMat) 69 | plotBestFit(weight) 70 | 71 | 72 | -------------------------------------------------------------------------------- /ch6/README.md: -------------------------------------------------------------------------------- 1 | # Ch06 - 支持向量机(Support vector machines) 2 | 3 | #### 支持向量机是这本书里面最难的一个算法了,其中的原理需要的数学知识最多,当然SVM也是用的非常多的分类器。 4 | #### SVM有很多实现,但这章只关注序列最小优化(SMO)算法。同时,介绍一种称为核函数的方式将SVM扩展到更多的数据集上。在Logitic回归中的介绍的数据集中,它们都是可以在途中画出一条直线将两组数据点分开。这组数据又叫做线性可分数据。但是我们的数据不能用一条直线分开时呢?比如下面这些数据点。 5 | ## 4个线性不可分的数据集示意图 6 | 
![4个线性不可分的数据集](screenshot/4个线性不可分的数据集.png) 7 | 8 | #### SMO算法的目标是求出一系列alpha和b,一旦求出这些alpha,就很容易计算出权重向量w并得到分割超平面。 9 | #### SMO算法原理:每次循环中选择两个alpha进行优化处理。一旦找到一对合适alpha,那就增大其中一个同时减少另一个。“合适”是指两个alpha必须要符合一定的条件,条件之一就是这两个alpha必须要在间隔边界之外,而其第二个条件则是这两个alpha还没有进行过区间化处理或者不在边界上。 10 | ## 简化版SMO效果图 11 | ![简化版SMO效果图](screenshot/简化版SMO效果图.png) 12 | #### 上图是数据集上运行简化版SMO后得到的结果,包括画圈的支持向量与分隔超平面。 13 | 14 | ## 完整版SMO效果图(优化速度) 15 | ![完整版SMO](screenshot/完整版SMO.png) 16 | #### 上图是数据集上运行简化版SMO后得到的结果,包括画圈的支持向量与分隔超平面。和简化版的稍微不同。 17 | 18 | 19 | ## 核方法中的非线性可分数据效果图 20 | ![核方法中的非线性可分数据](screenshot/核方法中的非线性可分数据.png) 21 | 22 | #### 如果线性不可分的话,我们需要把原始空间的数据映射到一个高维空间,就可以做到线性可分,这里就需要用到核函数。利用核函数可以将数据映射到高维空间,然后进行线性可分。形象的例子:https://www.zhihu.com/question/21094489 23 | 24 | ## 拉格朗日乘子法与KKT条件(自行了解) 25 | 26 | ## 基于SVM的数字识别(自行写代码)(和KNN类似) 27 | #### 进行数字识别时,采用SVM要比KNN好。 28 | 29 | -------------------------------------------------------------------------------- /ch6/digits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/digits.zip -------------------------------------------------------------------------------- /ch6/matplotlib/4个线性不可分的数据集效果图.py: -------------------------------------------------------------------------------- 1 | #coding utf-8 2 | 3 | from numpy import 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | xcord0 = []; ycord0 = [] 8 | xcord1 = []; ycord1 = [] 9 | markers =[] 10 | colors =[] 11 | fr = open('testSet.txt') # this file was generated by 2normalGen.py 12 | for line in fr.readlines() 13 | lineSplit = line.strip().split('t') 14 | xPt = float(lineSplit[0]) 15 | yPt = float(lineSplit[1]) 16 | label = int(lineSplit[2]) 17 | if (label == 0) 18 | xcord0.append(xPt) 19 | ycord0.append(yPt) 20 | else 21 | xcord1.append(xPt) 22 | ycord1.append(yPt) 23 | 24 | fr.close() 25 | fig = plt.figure() 26 | ax = fig.add_subplot(221) 27 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 28 | for i in range(300) 29 | [x,y] = random.uniform(0,1,2) 30 | if ((x 0.5) and (y 0.5)) or ((x 0.5) and (y 0.5)) 31 | xcord0.append(x); ycord0.append(y) 32 | else 33 | xcord1.append(x); ycord1.append(y) 34 | ax.scatter(xcord0,ycord0, marker='s', s=90) 35 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 36 | plt.title('A') 37 | ax = fig.add_subplot(222) 38 | xcord0 = random.standard_normal(150); ycord0 = random.standard_normal(150) 39 | xcord1 = random.standard_normal(150)+2.0; ycord1 = random.standard_normal(150)+2.0 40 | ax.scatter(xcord0,ycord0, marker='s', s=90) 41 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 42 | plt.title('B') 43 | ax = fig.add_subplot(223) 44 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 45 | for i in range(300) 46 | [x,y] = random.uniform(0,1,2) 47 | if (x 0.5) 48 | xcord0.append(xcos(2.0piy)); ycord0.append(xsin(2.0piy)) 49 | else 50 | xcord1.append(xcos(2.0piy)); ycord1.append(xsin(2.0piy)) 51 | 52 | ax.scatter(xcord0,ycord0, marker='s', s=90) 53 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 54 | 55 | plt.title('C') 56 | ax = fig.add_subplot(224) 57 | xcord1 = zeros(150); ycord1 = zeros(150) 58 | xcord0 = random.uniform(-3,3,350); ycord0 = random.uniform(-3,3,350); 59 | 60 | xcord1[050] = 0.3random.standard_normal(50)+2.0; ycord1[050] = 0.3random.standard_normal(50)+2.0 61 | 62 | xcord1[50100] = 0.3random.standard_normal(50)-2.0; ycord1[50100] = 0.3random.standard_normal(50)-3.0 63 | 64 | xcord1[100150] = 
0.3random.standard_normal(50)+1.0; ycord1[100150] = 0.3random.standard_normal(50) 65 | 66 | ax.scatter(xcord0,ycord0, marker='s', s=90) 67 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 68 | plt.title('D') 69 | plt.show() -------------------------------------------------------------------------------- /ch6/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib code 2 | -------------------------------------------------------------------------------- /ch6/matplotlib/完整版SMO效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Circle 7 | 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fr = open('testSet.txt')#this file was generated by 2normalGen.py 15 | for line in fr.readlines(): 16 | lineSplit = line.strip().split('\t') 17 | xPt = float(lineSplit[0]) 18 | yPt = float(lineSplit[1]) 19 | label = int(lineSplit[2]) 20 | if (label == -1): 21 | xcord0.append(xPt) 22 | ycord0.append(yPt) 23 | else: 24 | xcord1.append(xPt) 25 | ycord1.append(yPt) 26 | 27 | fr.close() 28 | fig = plt.figure() 29 | ax = fig.add_subplot(111) 30 | ax.scatter(xcord0,ycord0, marker='s', s=90) 31 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 32 | plt.title('Support Vectors Circled') 33 | 34 | circle = Circle((4.6581910000000004, 3.507396), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 35 | ax.add_patch(circle) 36 | circle = Circle((3.4570959999999999, -0.082215999999999997), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 37 | ax.add_patch(circle) 38 | circle = Circle((6.0805730000000002, 0.41888599999999998), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 39 | ax.add_patch(circle) 40 | circle = Circle((2.911290000000001, -1.590919999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 41 | ax.add_patch(circle) 42 | circle = Circle((5.310480000000001, -2.386369999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 43 | ax.add_patch(circle) 44 | circle = Circle((8.245097000000001, 1.515159999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 45 | ax.add_patch(circle) 46 | #plt.plot([2.3,8.5], [-6,6]) #seperating hyperplane 47 | b = -3.75567; w0=0.8065; w1=-0.2761 48 | x = arange(-2.0, 12.0, 0.1) 49 | y = (-w0*x - b)/w1 50 | ax.plot(x,y) 51 | ax.axis([-2,12,-8,6]) 52 | plt.show() -------------------------------------------------------------------------------- /ch6/matplotlib/核方法中的非线性可分数据效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 8 | fw = open('testSetRBF2.txt', 'w') # input data 9 | 10 | fig = plt.figure() 11 | ax = fig.add_subplot(111) 12 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 13 | for i in range(100): 14 | [x,y] = random.uniform(0,1,2) 15 | xpt=x*cos(2.0*pi*y); ypt = x*sin(2.0*pi*y) 16 | if (x > 0.5): 17 | xcord0.append(xpt); ycord0.append(ypt) 18 | label = -1.0 19 | else: 20 | xcord1.append(xpt); ycord1.append(ypt) 21 | label = 1.0 22 | fw.write('%f\t%f\t%f\n' % (xpt, ypt, label)) 23 | ax.scatter(xcord0,ycord0, marker='s', s=90) 24 | 
ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 25 | plt.title('Non-linearly Separable Data for Kernel Method') 26 | plt.show() 27 | fw.close() -------------------------------------------------------------------------------- /ch6/matplotlib/简化版SMO处理小数据集效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Circle 7 | 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fr = open('testSet.txt')#this file was generated by 2normalGen.py 15 | for line in fr.readlines(): 16 | lineSplit = line.strip().split('\t') 17 | xPt = float(lineSplit[0]) 18 | yPt = float(lineSplit[1]) 19 | label = int(lineSplit[2]) 20 | if (label == -1): 21 | xcord0.append(xPt) 22 | ycord0.append(yPt) 23 | else: 24 | xcord1.append(xPt) 25 | ycord1.append(yPt) 26 | 27 | fr.close() 28 | fig = plt.figure() 29 | ax = fig.add_subplot(111) 30 | ax.scatter(xcord0,ycord0, marker='s', s=90) 31 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 32 | plt.title('Support Vectors Circled') 33 | circle = Circle((4.6581910000000004, 3.507396), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 34 | ax.add_patch(circle) 35 | circle = Circle((3.4570959999999999, -0.082215999999999997), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 36 | ax.add_patch(circle) 37 | circle = Circle((6.0805730000000002, 0.41888599999999998), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 38 | ax.add_patch(circle) 39 | #plt.plot([2.3,8.5], [-6,6]) #seperating hyperplane 40 | b = -3.75567; w0=0.8065; w1=-0.2761 41 | x = arange(-2.0, 12.0, 0.1) 42 | y = (-w0*x - b)/w1 43 | ax.plot(x,y) 44 | ax.axis([-2,12,-8,6]) 45 | plt.show() -------------------------------------------------------------------------------- /ch6/screenshot/4个线性不可分的数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/4个线性不可分的数据集.png -------------------------------------------------------------------------------- /ch6/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch6/screenshot/完整版SMO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/完整版SMO.png -------------------------------------------------------------------------------- /ch6/screenshot/核方法中的非线性可分数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/核方法中的非线性可分数据.png -------------------------------------------------------------------------------- /ch6/screenshot/简化版SMO效果图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/简化版SMO效果图.png -------------------------------------------------------------------------------- /ch6/testSet.txt: 
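A note on the kernel-method discussion in the ch6 README above: the script just shown writes out data that is separable only by radius, and passing such data through a radial-basis (Gaussian) kernel is what makes it linearly separable in an implicit higher-dimensional space. Below is a minimal sketch of that kernel, for illustration only; the parameter name sigma and its default value are assumptions, and the scaling convention used in the book's own SVM code may differ.

```python
# Minimal sketch of a radial-basis (Gaussian) kernel:
# k(x, z) = exp(-||x - z||^2 / (2 * sigma**2)).  Illustrative only.
import numpy as np

def rbf_kernel(x, z, sigma=1.0):
    diff = np.asarray(x, dtype=float) - np.asarray(z, dtype=float)
    return np.exp(-diff.dot(diff) / (2.0 * sigma ** 2))
```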
-------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 -0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /ch6/testSetRBF.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 
-1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 -0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | -0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 -1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | -0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /ch6/testSetRBF2.txt: -------------------------------------------------------------------------------- 1 | 0.676771 -0.486687 -1.000000 2 | 0.008473 0.186070 1.000000 3 | -0.727789 0.594062 -1.000000 4 | 0.112367 0.287852 1.000000 5 | 0.383633 -0.038068 1.000000 6 | -0.927138 -0.032633 -1.000000 7 | -0.842803 -0.423115 -1.000000 8 | -0.003677 -0.367338 1.000000 9 | 0.443211 -0.698469 -1.000000 10 | -0.473835 0.005233 1.000000 11 | 0.616741 0.590841 -1.000000 12 | 0.557463 -0.373461 -1.000000 13 | -0.498535 -0.223231 -1.000000 14 | -0.246744 0.276413 1.000000 15 | -0.761980 -0.244188 -1.000000 16 | 0.641594 -0.479861 -1.000000 17 | -0.659140 0.529830 -1.000000 18 | -0.054873 -0.238900 1.000000 19 | -0.089644 -0.244683 1.000000 20 | -0.431576 -0.481538 -1.000000 21 | -0.099535 0.728679 -1.000000 22 | 
-0.188428 0.156443 1.000000 23 | 0.267051 0.318101 1.000000 24 | 0.222114 -0.528887 -1.000000 25 | 0.030369 0.113317 1.000000 26 | 0.392321 0.026089 1.000000 27 | 0.298871 -0.915427 -1.000000 28 | -0.034581 -0.133887 1.000000 29 | 0.405956 0.206980 1.000000 30 | 0.144902 -0.605762 -1.000000 31 | 0.274362 -0.401338 1.000000 32 | 0.397998 -0.780144 -1.000000 33 | 0.037863 0.155137 1.000000 34 | -0.010363 -0.004170 1.000000 35 | 0.506519 0.486619 -1.000000 36 | 0.000082 -0.020625 1.000000 37 | 0.057761 -0.155140 1.000000 38 | 0.027748 -0.553763 -1.000000 39 | -0.413363 -0.746830 -1.000000 40 | 0.081500 -0.014264 1.000000 41 | 0.047137 -0.491271 1.000000 42 | -0.267459 0.024770 1.000000 43 | -0.148288 -0.532471 -1.000000 44 | -0.225559 -0.201622 1.000000 45 | 0.772360 -0.518986 -1.000000 46 | -0.440670 0.688739 -1.000000 47 | 0.329064 -0.095349 1.000000 48 | 0.970170 -0.010671 -1.000000 49 | -0.689447 -0.318722 -1.000000 50 | -0.465493 -0.227468 -1.000000 51 | -0.049370 0.405711 1.000000 52 | -0.166117 0.274807 1.000000 53 | 0.054483 0.012643 1.000000 54 | 0.021389 0.076125 1.000000 55 | -0.104404 -0.914042 -1.000000 56 | 0.294487 0.440886 -1.000000 57 | 0.107915 -0.493703 -1.000000 58 | 0.076311 0.438860 1.000000 59 | 0.370593 -0.728737 -1.000000 60 | 0.409890 0.306851 -1.000000 61 | 0.285445 0.474399 -1.000000 62 | -0.870134 -0.161685 -1.000000 63 | -0.654144 -0.675129 -1.000000 64 | 0.285278 -0.767310 -1.000000 65 | 0.049548 -0.000907 1.000000 66 | 0.030014 -0.093265 1.000000 67 | -0.128859 0.278865 1.000000 68 | 0.307463 0.085667 1.000000 69 | 0.023440 0.298638 1.000000 70 | 0.053920 0.235344 1.000000 71 | 0.059675 0.533339 -1.000000 72 | 0.817125 0.016536 -1.000000 73 | -0.108771 0.477254 1.000000 74 | -0.118106 0.017284 1.000000 75 | 0.288339 0.195457 1.000000 76 | 0.567309 -0.200203 -1.000000 77 | -0.202446 0.409387 1.000000 78 | -0.330769 -0.240797 1.000000 79 | -0.422377 0.480683 -1.000000 80 | -0.295269 0.326017 1.000000 81 | 0.261132 0.046478 1.000000 82 | -0.492244 -0.319998 -1.000000 83 | -0.384419 0.099170 1.000000 84 | 0.101882 -0.781145 -1.000000 85 | 0.234592 -0.383446 1.000000 86 | -0.020478 -0.901833 -1.000000 87 | 0.328449 0.186633 1.000000 88 | -0.150059 -0.409158 1.000000 89 | -0.155876 -0.843413 -1.000000 90 | -0.098134 -0.136786 1.000000 91 | 0.110575 -0.197205 1.000000 92 | 0.219021 0.054347 1.000000 93 | 0.030152 0.251682 1.000000 94 | 0.033447 -0.122824 1.000000 95 | -0.686225 -0.020779 -1.000000 96 | -0.911211 -0.262011 -1.000000 97 | 0.572557 0.377526 -1.000000 98 | -0.073647 -0.519163 -1.000000 99 | -0.281830 -0.797236 -1.000000 100 | -0.555263 0.126232 -1.000000 101 | -------------------------------------------------------------------------------- /ch6/简化版SMO处理小数据集.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import numpy as np 5 | from time import sleep 6 | 7 | def loadDataSet(filename): #读入数据 8 | dataMat = [] ; labelMat = [] #创建两个数组 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = line.strip().split('\t') #对当前行进行去回车,空格操作 12 | dataMat.append([float(lineArr[0]),float(lineArr[1])]) #将两个特征加入dataMat 13 | labelMat.append((float(lineArr[2])))#将标签加入labelMat 14 | return dataMat,labelMat 15 | 16 | def selectJrand(i,m):#用于在区间内选择一个整数,i为alpha的下标,m为alpha的个数 17 | j = i 18 | while(j==i):#只要函数值不等于输入值i就会随机,因为要满足 ∑alpha(i)*label(i)=0,同时改变两个alpha 19 | j = int(random.uniform(0,m)) 20 | return j 21 | 22 | def clipAlpha(aj,H,L):#用来调整大于H或小于L的alpha值 23 | if aj>H: 24 | aj = 
H 25 | if L > aj: 26 | aj = L 27 | return aj 28 | 29 | # 简化版SMO 30 | # 这本数最大的一个函数 31 | # 输入参数:数据集,类别标签,常数C,容错率,取消前最大的循环次数 32 | def smoSimple(dataMatIn,classLabels,C,toler,maxIter):#数据集,类别标签,常熟C,容错率,退出前的最大循环次数 33 | dataMatrix = mat(dataMatIn) ; #转换成numpy矩阵 34 | labelMat = mat(classLabels).transpose() #转换成numpy矩阵 35 | b = 0 ; m,n = shape(dataMatrix) #求出行列 36 | alphas = mat(zeros((m,1)))#讲alpha都初始化为0 37 | iter = 0#没有任何alpha改变下的遍历数据集的次数 38 | while (iter < maxIter) : #当迭代次数小于最大迭代次数 39 | alphaPairsChanged = 0 #用来记录alpha是否被优化 40 | for i in range(m): #对m行数据进行处理 41 | fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b #预测的类别 42 | Ei = fXi - float(labelMat[i]) #误差Ei 43 | #如果误差很大,就可以基于该组数据所对应的alpha进行优化 44 | if ((labelMat[i]*Ei < -toler )and (alphas[i] < C )) or ((labelMat[i]*Ei > toler ) and alphas[i]>0 ) : 45 | #在if语句,测试正间隔和负间隔,同时检查alpha值,保证其不能等于0或C 46 | j = selectJrand(i,m) #随机第二个alpha 47 | fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b 48 | Ej = fXj - float(labelMat[j]) 49 | alphaIold = alphas[i].copy() 50 | alphaJold = alphas[j].copy() #把两个alpha赋值,这样的好处是不改变原有alphas的值 51 | if(labelMat[i] != labelMat[j]):#如果标签向量不相等,保证alpha再0~C之间 52 | L = max(0,alphas[j]+alphas[i]) 53 | H = min(C,C+alphas[j]-alphas[i]) 54 | else: 55 | L = max(0,alphas[j]+alphas[i] - C) 56 | H = min(C,alphas[j]+alphas[i]) 57 | if L == H : print("L==H") ; continue 58 | eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - \ 59 | dataMatrix[j,:]*dataMatrix[j,:].T #是alpha[j]的最优修改量 60 | if eta >= 0 : print "eta>=0";continue 61 | alphas[j] -= labelMat[j]*(Ei - Ej) / eta 62 | alphas[j] = clipAlpha(alphas[j],H,L) #调整alpha的大小 63 | if(abs(alphas[j]-alphaJold) < 0.00001) : print "j not moving enough " ; continue#检查alpha[j] 64 | alphas[i]+=labelMat[i]*labelMat[j]*(alphaJold-alphas[j]) #对i进行修改,修改量与j相同,但方向相反 65 | b1 = b - Ei - labelMat[i]*(alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[i,:].T- \ 66 | labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T 67 | b2 = b - Ej - labelMat[i]*(alphas[i] - alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - \ 68 | labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T 69 | if (0alphas[i]) : b = b1 70 | elif (0alphas[j]) : b = b2 71 | else: b = (b1 + b2 ) / 2.0 72 | alphaPairsChanged+=1 73 | print "iter : %d i:%d , pairs changed %d " % (iter , i , alphaPairsChanged) 74 | if (alphaPairsChanged==0) : iter +=1 75 | else:iter = 0 76 | print "ietration number %d " % iter 77 | return b,alphas 78 | 79 | 80 | if __name__ == "__main__": 81 | dataMat,labelMat = loadDataSet("testSet.txt") 82 | # print labelMat 83 | print smoSimple(dataMat,labelMat,0.6,0.001,40) 84 | -------------------------------------------------------------------------------- /ch7/README.md: -------------------------------------------------------------------------------- 1 | # Ch07 - 利用AdaBoost元算法提高分类性能(Improving classification with the AdaBoost meta-algorithm) 2 | 3 | #### Adaboost是一种迭代算法,其核心思想是针对同一个训练集训练不同的分类器(弱分类器),然后把这些弱分类器集合起来,构成一个更强的最终分类器(强分类器)。 4 | #### 本章用单层决策树作为弱学习器构造了 Adaboost分类器。 实际上,Adaboost函数可以应用于任意分类器,只要该分类器能够处理加权数据即可。 5 | 6 | ## 本章使用的单层决策树测试数据 7 | ![单层决策树测试数据](screenshot/单层决策树测试数据.png) 8 | #### 元算法:算法最后的评估,不是靠一个模型给出的结果,而是综合考虑多个模型结果,来得出最后的结果。 9 | 10 | #### bagging:基于数据随机重抽样的分类器构建方法。 11 | #### boosting:关注被已有分类器错分的那些数据来获得新的分类器。 12 | 13 | ## AdaBoost: 14 | #### 
自适应boosting。运行过程:对训练数据中的每个样本,先赋予其一个权重,这些权重开始都相等,先在一个弱分类器上计算错误率,然后在统一数据集上再次训练弱分类器,第二次训练时,将会重新调整每个样本的权重,第一次分对的样本的权重将会降低,分错的样本权重会提高。同时,AdaBoost为每个分类器都分配了一个权重α,这些权重是基于错误率算出来的。具体数学计算公式看书。 15 | 16 | #### AdaBoost算法会这样一直迭代,直到训练出错误率为 0 或者到达迭代次数为止。 17 | 18 | ## AdaBoost的过拟合问题 19 | #### 多个分类器组合可能会进一步凸显出单分类器的不足。当分类器数目越来越多,训练错误率肯定是越来越小,但是测试错误率却是先减后增,这时就是发生了过拟合。 20 | 21 | ## 其他分类性能度量指标 22 | #### 包括正确率(precise),召回率(recall),以及ROC曲线等。 23 | ## ROC曲线 24 | ![ROC曲线](screenshot/ROC曲线.png) 25 | -------------------------------------------------------------------------------- /ch7/matplotlib/单层决策树测试数据.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | 9 | # def loadDataSet(fileName): 10 | # dataMat = []; labelMat = [] 11 | # fr = open(fileName) 12 | # for line in fr.readlines(): 13 | # lineArr = line.strip().split('\t') 14 | # dataMat.append([float(lineArr[0]), float(lineArr[1])]) 15 | # labelMat.append(float(lineArr[2])) 16 | # return dataMat,labelMat 17 | 18 | 19 | if __name__ == "__main__" : 20 | # datMat,classLabels = loadDataSet("horseColicTraining2.txt") 21 | # datMat = matrix(datMat) 22 | # print datMat 23 | # print "size=%d" % len(datMat) 24 | # print datMat[1,0] 25 | # print classLabels 26 | # print "size=%d" % len(classLabels) 27 | datMat = matrix([[ 1. , 2.1], 28 | [ 1.5, 1.6], 29 | [ 1.3, 1. ], 30 | [ 1. , 1. ], 31 | [ 2. , 1. ]]) 32 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 33 | xcord0 = [] 34 | ycord0 = [] 35 | xcord1 = [] 36 | ycord1 = [] 37 | markers = [] 38 | colors = [] 39 | 40 | for i in range(len(classLabels)): 41 | if classLabels[i] == 1.0: 42 | xcord1.append(datMat[i, 0]), ycord1.append(datMat[i, 1]) 43 | else: 44 | xcord0.append(datMat[i, 0]), ycord0.append(datMat[i, 1]) 45 | 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111) 48 | ax.scatter(xcord0, ycord0, marker='s', s=90) 49 | ax.scatter(xcord1, ycord1, marker='o', s=50, c='red') 50 | plt.title('decision stump test data') 51 | plt.show() 52 | -------------------------------------------------------------------------------- /ch7/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch7/screenshot/ROC曲线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch7/screenshot/ROC曲线.png -------------------------------------------------------------------------------- /ch7/screenshot/单层决策树测试数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch7/screenshot/单层决策树测试数据.png -------------------------------------------------------------------------------- /ch8/README.md: -------------------------------------------------------------------------------- 1 | # Ch08 - 预测数值型数据:回归(Predicting numeric values: regression) 2 | 3 | ### 本章讲的是线性回归,就是找出一系列的回归系数,和Logistic回归很像,但是去掉了sigmoid函数。说到回归,一般指的是线性回归。 4 | 5 | ### 假如回归系数放在向量w中,这个w可以用普通最小二乘法(OLS),通过使用numpy库的几个函数即可(注意:如果没有检查行列式是否为0就计算矩阵的逆,就会出现错误,使用linalg函数即可)。比如给我们一些数据(ex0.txt),下图是它们的散点图。 6 | ![数据分布](screenshot/数据分布.png) 7 | 8 | ### 通过标准回归函数和数据导入函数,就是使用最小二乘法。得到下图的最佳拟合直线,其实也不是“最佳”,因为这种线性回归会出现欠拟合的情况。 9 | 
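Before the best-fit-line figure below, here is a minimal numpy sketch of the ordinary-least-squares computation the paragraph above describes, w = (X^T X)^{-1} X^T y guarded by a determinant check; it is for illustration only and mirrors the repository's own standRegress in 线性回归找到最佳拟合曲线.py.

```python
# Minimal sketch of the OLS normal equations (illustrative; mirrors standRegress).
from numpy import mat, linalg

def ols(xArr, yArr):
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:        # singular matrix, the inverse does not exist
        raise ValueError("xTx is singular, cannot do inverse")
    return xTx.I * (xMat.T * yMat)    # regression coefficients w
```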
![线性回归找到最佳拟合曲线](screenshot/线性回归找到最佳拟合曲线.png) 10 | 11 | ## 局部加权线性回归 12 | ### 线性回归中会出现欠拟合的情况,因为它求的是具有最小均方误差的无偏估计。欠拟合可不会有最好的预测效果。面对这种情况,因此,我们需要在估计中引入一些偏差, 从而降低误差。其中一个方法就是局部加权线性回归 (LWLR)。 13 | ### LWLR通常使用核来对附近的点赋予更高的权重,最常用的是高斯核。k是高斯核对应的权重中的一个参数。下面3个图分别是k=1.0,k=0.01,k=0.003三种不同取值下的效果图。 14 | 15 | ### k = 1.0 (和OLS差不多)(欠拟合) 16 | ![局部加权线性回归(k=1.0)](screenshot/局部加权线性回归(k=1.0).png) 17 | 18 | ### k = 0.01 (理想状态,可以挖掘数据的潜在规律) 19 | ![局部加权线性回归(k=0.01)](screenshot/局部加权线性回归(k=0.01).png) 20 | 21 | ### k= 0.003 (考虑了太多噪声)(过拟合) 22 | ![局部加权线性回归(k=0.003)](screenshot/局部加权线性回归(k=0.003).png) 23 | 24 | ## 岭回归 25 | ### 这里提到了一种在统计学中叫缩减的技术。 26 | ### 使用岭回归和缩减奇数之前,需要对特征做标准化处理。 27 | ### 岭回归使用不同的λ系数时的回归系数变化图。 28 | ![岭回归](screenshot/岭回归.png) 29 | ### 该图绘出了回归系数(纵坐标)与 log(lambda)(横坐标)的关系。在最左边,即lambda最小时,可以得到所有系数的原始值(与线性回归一致),而在右边,系数全部缩减成0。在中间部分的某值将可以取得最好的预测效果。但为找到最佳参数值,还要进行交叉验证。然后自己看一下lasso。 30 | 31 | ## 前向逐步回归 32 | ### 前向逐步回归算法可以得到与lasso差不多的结果,但更简单。 33 | ### 这是一种贪心算法,每一步都尽可能的减少误差,一开始所有权重都设为1,然后每一步所做的决策就是对某个权重增加或减少一个很小的值。 需要设置步数和步长。逐步回归的好处在于能帮助人们理解现有模型并且做出改进,可以找出重要的特征,然后去收集重要的特征的数据。 34 | ### 使用0.005的epsilon值并迭代1000次后的结果如下: 35 | ![前向逐步回归](screenshot/前向逐步回归.png) 36 | 37 | ## 权衡偏差与方差 38 | ### 误差=偏差+测量误差+噪声,随着模型复杂度增加,训练集的误差降低,但是测试集的误差会先减后增,表明从高偏差过渡到了高方差模型,这里权衡两者是很重要的。 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ch8/matplotlib/前向逐步回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # 加载数据集 5 | def loadDataSet(filename): 6 | numFeat = len(open(filename).readline().split("\t")) - 1 7 | dataMat = [] 8 | labelMat = [] 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split("\t") 13 | for i in range(numFeat): 14 | lineArr.append(float(curLine[i])) 15 | 16 | dataMat.append(lineArr) 17 | labelMat.append(float(curLine[-1])) 18 | 19 | return dataMat, labelMat 20 | 21 | # 计算最佳拟合曲线 22 | def standRegress(xArr, yArr): 23 | xMat = mat(xArr); 24 | yMat = mat(yArr).T # .T代表转置矩阵 25 | xTx = xMat.T * xMat 26 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 27 | print "This matrix is singular , cannot do inverse" 28 | return 29 | ws = xTx.I * (xMat.T * yMat) 30 | return ws 31 | 32 | #==========前向逐步回归============ 33 | 34 | #计算平方误差 35 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 36 | return ((yArr-yHatArr)**2).sum() 37 | 38 | #数据标准化处理 39 | def regularize(xMat):#regularize by columns 40 | inMat = xMat.copy() 41 | inMeans = mean(inMat,0) #calc mean then subtract it off 42 | inVar = var(inMat,0) #calc variance of Xi then divide by it 43 | inMat = (inMat - inMeans)/inVar 44 | return inMat 45 | 46 | 47 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 48 | xMat = mat(xArr); yMat=mat(yArr).T 49 | yMean = mean(yMat,0) 50 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 51 | xMat = regularize(xMat) 52 | m,n=shape(xMat) 53 | returnMat = zeros((numIt,n)) #testing code remove 54 | ws = zeros((n,1)); 55 | wsTest = ws.copy(); 56 | wsMax = ws.copy() 57 | for i in range(numIt): #could change this to while loop 58 | #print ws.T 59 | lowestError = inf; 60 | for j in range(n): 61 | for sign in [-1,1]: 62 | wsTest = ws.copy() 63 | wsTest[j] += eps*sign 64 | yTest = xMat*wsTest 65 | rssE = rssError(yMat.A,yTest.A) 66 | if rssE < lowestError: 67 | lowestError = rssE 68 | wsMax = wsTest 69 | ws = wsMax.copy() 70 | returnMat[i,:]=ws.T 71 | return returnMat 72 | 73 | 74 | xArr,yArr = loadDataSet('abalone.txt') 75 | 76 | 
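# (Added note, not in the original script.) stageWise returns a numIt-by-n matrix:
# row i holds the coefficient vector after greedy step i, for example
#   weightsHistory = stageWise(xArr, yArr, eps=0.01, numIt=200)
# Plotting that matrix, as showRidge() does at the bottom of this file, traces how
# each coefficient evolves as the forward-stagewise steps accumulate.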
# 把这些结果与最小二乘法进行比较,后者的结果可以通过如下代码: 77 | 78 | xMat = mat(xArr) 79 | yMat = mat(yArr).T 80 | xMat = regularize(xMat) 81 | yM = mean(yMat,0) 82 | yMat = yMat - yM 83 | weights = standRegress(xMat, yMat.T) 84 | print weights.T 85 | 86 | # print stageWise(xArr, yArr, 0.01, 200) 87 | mat = stageWise(xArr,yArr,0.005,1000) # 使用0.005的epsilon 迭代 1000次 88 | 89 | def showRidge(): 90 | import matplotlib.pyplot as plt 91 | fig = plt.figure() 92 | ax = fig.add_subplot(111) 93 | ax.plot(mat) 94 | plt.show() 95 | showRidge() 96 | 97 | 98 | -------------------------------------------------------------------------------- /ch8/matplotlib/局部加权线性回归.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/matplotlib/局部加权线性回归.py -------------------------------------------------------------------------------- /ch8/matplotlib/岭回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #==================岭回归================== 6 | 7 | # 加载数据集 8 | def loadDataSet(filename): 9 | numFeat = len(open(filename).readline().split("\t")) - 1 10 | dataMat = [] 11 | labelMat = [] 12 | fr = open(filename) 13 | for line in fr.readlines(): 14 | lineArr = [] 15 | curLine = line.strip().split("\t") 16 | for i in range(numFeat): 17 | lineArr.append(float(curLine[i])) 18 | 19 | dataMat.append(lineArr) 20 | labelMat.append(float(curLine[-1])) 21 | 22 | return dataMat, labelMat 23 | 24 | #用于计算回归系数 25 | def ridgeRegres(xMat,yMat,lam=0.2): 26 | xTx = xMat.T * xMat 27 | denom = xTx + eye(shape(xMat)[1]) * lam 28 | if linalg.det(denom)==0.0: 29 | print "This matrix is singular, cannot do inverse" 30 | return 31 | ws = denom.I * (xMat.T * yMat) 32 | return ws # 回归参数 33 | 34 | #用于在一组lambda上做测试 35 | def ridgeTest(xArr,yArr): 36 | xMat = mat(xArr); yMat = mat(yArr).T 37 | yMean = mean(yMat,0) 38 | #数据标准化 39 | yMat = yMat - yMean 40 | xMeans = mean(xMat,0) 41 | xVar = var(xMat,0) 42 | xMat = (xMat - xMeans)/xVar 43 | 44 | numTestPts = 30 45 | wMat = zeros((numTestPts, shape(xMat)[1])) 46 | for i in range(numTestPts): 47 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 48 | wMat[i,:]=ws.T 49 | return wMat 50 | 51 | abX,abY = loadDataSet('abalone.txt') 52 | ridgeWeights = ridgeTest(abX,abY) 53 | # print ridgeWeights 54 | 55 | def showRidge(): 56 | import matplotlib.pyplot as plt 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111) 59 | ax.plot(ridgeWeights) 60 | plt.show() 61 | 62 | showRidge() 63 | -------------------------------------------------------------------------------- /ch8/matplotlib/线性回归找到最佳拟合曲线.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # ===========用线性回归找到最佳拟合曲线=========== 5 | # 加载数据集 6 | def loadDataSet(filename): 7 | numFeat = len(open(filename).readline().split("\t")) - 1 8 | dataMat = [] 9 | labelMat = [] 10 | fr = open(filename) 11 | for line in fr.readlines(): 12 | lineArr = [] 13 | curLine = line.strip().split("\t") 14 | for i in range(numFeat): 15 | lineArr.append(float(curLine[i])) 16 | 17 | dataMat.append(lineArr) 18 | labelMat.append(float(curLine[-1])) 19 | 20 | return dataMat, labelMat 21 | 22 | 23 | # 计算最佳拟合曲线 24 | def standRegress(xArr, yArr): 25 | xMat = mat(xArr); 26 | yMat = mat(yArr).T # .T代表转置矩阵 27 | xTx = xMat.T * xMat 28 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 29 | print "This matrix is 
singular , cannot do inverse" 30 | return 31 | ws = xTx.I * (xMat.T * yMat) 32 | return ws 33 | 34 | 35 | # 测试上边的函数 36 | xArr, yArr = loadDataSet("ex0.txt") 37 | # xArr, yArr = loadDataSet("ex1.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):\n", ws # ws 存放的就是回归系数 40 | 41 | def show(): 42 | import matplotlib.pyplot as plt 43 | xMat = mat(xArr); 44 | yMat = mat(yArr) 45 | yHat = xMat * ws 46 | fig = plt.figure() # 创建绘图对象 47 | ax = fig.add_subplot(111) # 111表示将画布划分为1行2列选择使用从上到下第一块 48 | # scatter绘制散点图 49 | ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0]) 50 | # 复制,排序 51 | xCopy = xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | # plot画线 55 | ax.plot(xCopy[:, 1], yHat) 56 | plt.show() 57 | 58 | 59 | show() 60 | 61 | yHat = mat(xArr) * ws 62 | # yHat = xMat * ws 63 | # 利用numpy库提供的corrcoef来计算预测值和真实值得相关性 64 | print "相关性:\n", corrcoef(yHat.T, mat(yArr)) -------------------------------------------------------------------------------- /ch8/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch8/screenshot/前向逐步回归.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/前向逐步回归.png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=0.003).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=0.003).png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=0.01).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=0.01).png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=1.0).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=1.0).png -------------------------------------------------------------------------------- /ch8/screenshot/岭回归.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/岭回归.png -------------------------------------------------------------------------------- /ch8/screenshot/数据分布.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/数据分布.png -------------------------------------------------------------------------------- /ch8/screenshot/线性回归找到最佳拟合曲线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/线性回归找到最佳拟合曲线.png -------------------------------------------------------------------------------- /ch8/前向逐步回归.py: 
-------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # 加载数据集 5 | def loadDataSet(filename): 6 | numFeat = len(open(filename).readline().split("\t")) - 1 7 | dataMat = [] 8 | labelMat = [] 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split("\t") 13 | for i in range(numFeat): 14 | lineArr.append(float(curLine[i])) 15 | 16 | dataMat.append(lineArr) 17 | labelMat.append(float(curLine[-1])) 18 | 19 | return dataMat, labelMat 20 | 21 | # 计算最佳拟合曲线 22 | def standRegress(xArr, yArr): 23 | xMat = mat(xArr); 24 | yMat = mat(yArr).T # .T代表转置矩阵 25 | xTx = xMat.T * xMat 26 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 27 | print "This matrix is singular , cannot do inverse" 28 | return 29 | ws = xTx.I * (xMat.T * yMat) 30 | return ws 31 | 32 | #==========前向逐步回归============ 33 | 34 | #计算平方误差 35 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 36 | return ((yArr-yHatArr)**2).sum() 37 | 38 | #数据标准化处理 39 | def regularize(xMat):#regularize by columns 40 | inMat = xMat.copy() 41 | inMeans = mean(inMat,0) #calc mean then subtract it off 42 | inVar = var(inMat,0) #calc variance of Xi then divide by it 43 | inMat = (inMat - inMeans)/inVar 44 | return inMat 45 | 46 | 47 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 48 | xMat = mat(xArr); yMat=mat(yArr).T 49 | yMean = mean(yMat,0) 50 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 51 | xMat = regularize(xMat) 52 | m,n=shape(xMat) 53 | returnMat = zeros((numIt,n)) #testing code remove 54 | ws = zeros((n,1)); 55 | wsTest = ws.copy(); 56 | wsMax = ws.copy() 57 | for i in range(numIt): #could change this to while loop 58 | #print ws.T 59 | lowestError = inf; 60 | for j in range(n): 61 | for sign in [-1,1]: 62 | wsTest = ws.copy() 63 | wsTest[j] += eps*sign 64 | yTest = xMat*wsTest 65 | rssE = rssError(yMat.A,yTest.A) 66 | if rssE < lowestError: 67 | lowestError = rssE 68 | wsMax = wsTest 69 | ws = wsMax.copy() 70 | returnMat[i,:]=ws.T 71 | return returnMat 72 | 73 | 74 | xArr,yArr = loadDataSet('abalone.txt') 75 | 76 | # 把这些结果与最小二乘法进行比较,后者的结果可以通过如下代码: 77 | 78 | xMat = mat(xArr) 79 | yMat = mat(yArr).T 80 | xMat = regularize(xMat) 81 | yM = mean(yMat,0) 82 | yMat = yMat - yM 83 | weights = standRegress(xMat, yMat.T) 84 | print weights.T 85 | 86 | # print stageWise(xArr, yArr, 0.01, 200) 87 | mat = stageWise(xArr,yArr,0.005,1000) # 使用0.005的epsilon 迭代 1000次 88 | 89 | def showRidge(): 90 | import matplotlib.pyplot as plt 91 | fig = plt.figure() 92 | ax = fig.add_subplot(111) 93 | ax.plot(mat) 94 | plt.show() 95 | showRidge() 96 | 97 | 98 | -------------------------------------------------------------------------------- /ch8/局部加权线性回归.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/局部加权线性回归.py -------------------------------------------------------------------------------- /ch8/岭回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #==================岭回归================== 6 | 7 | # 加载数据集 8 | def loadDataSet(filename): 9 | numFeat = len(open(filename).readline().split("\t")) - 1 10 | dataMat = [] 11 | labelMat = [] 12 | fr = open(filename) 13 | for line in fr.readlines(): 14 | lineArr = [] 15 | curLine = line.strip().split("\t") 16 | for i in 
range(numFeat): 17 | lineArr.append(float(curLine[i])) 18 | 19 | dataMat.append(lineArr) 20 | labelMat.append(float(curLine[-1])) 21 | 22 | return dataMat, labelMat 23 | 24 | #用于计算回归系数 25 | def ridgeRegres(xMat,yMat,lam=0.2): 26 | xTx = xMat.T * xMat 27 | denom = xTx + eye(shape(xMat)[1]) * lam 28 | if linalg.det(denom)==0.0: 29 | print "This matrix is singular, cannot do inverse" 30 | return 31 | ws = denom.I * (xMat.T * yMat) 32 | return ws # 回归参数 33 | 34 | #用于在一组lambda上做测试 35 | def ridgeTest(xArr,yArr): 36 | xMat = mat(xArr); yMat = mat(yArr).T 37 | yMean = mean(yMat,0) 38 | #数据标准化 39 | yMat = yMat - yMean 40 | xMeans = mean(xMat,0) 41 | xVar = var(xMat,0) 42 | xMat = (xMat - xMeans)/xVar 43 | 44 | numTestPts = 30 45 | wMat = zeros((numTestPts, shape(xMat)[1])) 46 | for i in range(numTestPts): 47 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 48 | wMat[i,:]=ws.T 49 | return wMat 50 | 51 | abX,abY = loadDataSet('abalone.txt') 52 | ridgeWeights = ridgeTest(abX,abY) 53 | # print ridgeWeights 54 | 55 | def showRidge(): 56 | import matplotlib.pyplot as plt 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111) 59 | ax.plot(ridgeWeights) 60 | plt.show() 61 | 62 | showRidge() -------------------------------------------------------------------------------- /ch8/线性回归找到最佳拟合曲线.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # ===========用线性回归找到最佳拟合曲线=========== 5 | # 加载数据集 6 | def loadDataSet(filename): 7 | numFeat = len(open(filename).readline().split("\t")) - 1 8 | dataMat = [] 9 | labelMat = [] 10 | fr = open(filename) 11 | for line in fr.readlines(): 12 | lineArr = [] 13 | curLine = line.strip().split("\t") 14 | for i in range(numFeat): 15 | lineArr.append(float(curLine[i])) 16 | 17 | dataMat.append(lineArr) 18 | labelMat.append(float(curLine[-1])) 19 | 20 | return dataMat, labelMat 21 | 22 | 23 | # 计算最佳拟合曲线 24 | def standRegress(xArr, yArr): 25 | xMat = mat(xArr); 26 | yMat = mat(yArr).T # .T代表转置矩阵 27 | xTx = xMat.T * xMat 28 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 29 | print "This matrix is singular , cannot do inverse" 30 | return 31 | ws = xTx.I * (xMat.T * yMat) 32 | return ws 33 | 34 | 35 | # 测试上边的函数 36 | xArr, yArr = loadDataSet("ex0.txt") 37 | # xArr, yArr = loadDataSet("ex1.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):\n", ws # ws 存放的就是回归系数 40 | 41 | def show(): 42 | import matplotlib.pyplot as plt 43 | xMat = mat(xArr); 44 | yMat = mat(yArr) 45 | yHat = xMat * ws 46 | fig = plt.figure() # 创建绘图对象 47 | ax = fig.add_subplot(111) # 111表示将画布划分为1行2列选择使用从上到下第一块 48 | # scatter绘制散点图 49 | ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0]) 50 | # 复制,排序 51 | xCopy = xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | # plot画线 55 | ax.plot(xCopy[:, 1], yHat) 56 | plt.show() 57 | 58 | 59 | show() 60 | 61 | yHat = mat(xArr) * ws 62 | # yHat = xMat * ws 63 | # 利用numpy库提供的corrcoef来计算预测值和真实值得相关性 64 | print "相关性:\n", corrcoef(yHat.T, mat(yArr)) -------------------------------------------------------------------------------- /ch9/CRAT算法用于回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #解析文本数据 6 | def loadDataSet(filename): 7 | dataMat=[] 8 | fr=open(filename) 9 | for line in fr.readlines(): 10 | curLine=line.strip().split('\t') 11 | #将每行数据映射为浮点数 12 | fltLine=map(float,curLine) 13 | dataMat.append(fltLine) 14 | return dataMat 15 | 16 | #拆分数据集函数,二元拆分法 17 | #@dataSet:待拆分的数据集 18 
/ch9/CRAT算法用于回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | # parse a tab-delimited text file 6 | def loadDataSet(filename): 7 | dataMat=[] 8 | fr=open(filename) 9 | for line in fr.readlines(): 10 | curLine=line.strip().split('\t') 11 | # map every field of the line to a float 12 | fltLine=map(float,curLine) 13 | dataMat.append(fltLine) 14 | return dataMat 15 | 16 | # binary-split function for the data set 17 | #@dataSet: the data set to be split 18 | #@feature: index of the feature to split on 19 | #@value: the feature value used as the split point 20 | def binSplitDataSet(dataSet, feature, value): 21 | 22 | # boolean filtering: samples whose value of the split feature is greater than value go into mat0 23 | # mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :][0] # typo in the book 24 | mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :] 25 | # samples whose value of the split feature is not greater than value go into mat1 26 | # mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :][0] # typo in the book 27 | mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :] 28 | return mat0, mat1 29 | 30 | # splitting helpers for regression trees 31 | 32 | # leaf-node generation function 33 | def regLeaf(dataSet): 34 | # the mean of the last column of the data set is returned as the leaf value 35 | return mean(dataSet[:,-1]) 36 | 37 | # error-calculation function 38 | def regErr(dataSet): 39 | # variance of the last column times the number of samples gives the total squared error 40 | return var(dataSet[:,-1])*shape(dataSet)[0] 41 | 42 | 43 | def linearSolve(dataSet): #helper function used in two places 44 | m,n = shape(dataSet) 45 | X = mat(ones((m,n))); Y = mat(ones((m,1)))#create a copy of data with 1 in 0th position 46 | X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y 47 | xTx = X.T*X 48 | if linalg.det(xTx) == 0.0: 49 | raise NameError('This matrix is singular, cannot do inverse,\n\ 50 | try increasing the second value of ops') 51 | ws = xTx.I * (X.T * Y) 52 | return ws,X,Y 53 | 54 | def modelLeaf(dataSet):#create linear model and return coefficients 55 | ws,X,Y = linearSolve(dataSet) 56 | return ws 57 | 58 | def modelErr(dataSet): 59 | ws,X,Y = linearSolve(dataSet) 60 | yHat = X * ws 61 | return sum(power(Y - yHat,2)) 62 | 63 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)): 64 | tolS = ops[0]; tolN = ops[1] 65 | # all values in the last column of the data set are identical 66 | if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #exit cond 1 67 | # return None as the best feature and the mean of the last column as the leaf value 68 | return None, leafType(dataSet) 69 | 70 | m,n = shape(dataSet) 71 | # error of the data set before any split 72 | S = errType(dataSet) 73 | bestS = inf; bestIndex = 0; bestValue = 0 74 | # iterate over every feature except the last column (the target variable) 75 | for featIndex in range(n-1): 76 | # iterate over every distinct value of this feature 77 | for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]): 78 | #for splitVal in set(dataSet[:,featIndex]): # typo in the book 79 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 80 | if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue 81 | newS = errType(mat0) + errType(mat1) 82 | if newS < bestS: 83 | bestIndex = featIndex 84 | bestValue = splitVal 85 | bestS = newS 86 | # if splitting reduces the error by less than tolS, do not split 87 | if (S - bestS) < tolS: 88 | return None, leafType(dataSet) #exit cond 2 89 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 90 | if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): #exit cond 3 91 | return None, leafType(dataSet) 92 | # return the best splitting feature and its best split value 93 | return bestIndex,bestValue 94 | 95 | # tree-building function 96 | #@dataSet: the data set 97 | #@leafType: leaf type, 1 regression tree: leaves are constant values 2 model tree: leaves are linear models 98 | #@errType: error type, 1 regression error: total variance = variance * number of samples 2 model error: sum of squared prediction errors (y-yHat) 99 | #@ops: user-specified parameters 100 | def createTree(dataSet,leafType = regLeaf,errType = regErr,ops=(1,4)): 101 | 102 | # choose the best splitting feature and split value 103 | feat,val=chooseBestSplit(dataSet,leafType,errType,ops) 104 | # if no feature can be split on (None), return the leaf value directly 105 | if feat is None: return val 106 | # the tree is stored as a dictionary 107 | retTree={} 108 | # the first entry of the tree dictionary is the best splitting feature 109 | retTree['spInd']=feat 110 | # the second entry is the best split value of that feature 111 | retTree['spVal']=val 112 | # binary-split the data set on that feature index and value, returning the two subsets 113 | lSet,rSet=binSplitDataSet(dataSet,feat,val) 114 | # the third entry is the left branch, built recursively from the lSet subset 115 | retTree['left']=createTree(lSet,leafType,errType,ops) 116 | # the fourth entry is the right branch, built recursively from the rSet subset 117 | retTree['right']=createTree(rSet,leafType,errType,ops) 118 | # return the finished tree dictionary 119 | return retTree 120 | 121 | if __name__ == "__main__" : 122 | myDat = loadDataSet("ex00.txt") 123 | myMat = mat(myDat) 124 | print createTree(myMat) 125 | # {'spInd': 0, 'spVal': 0.48813, 'right': -0.044650285714285719, 'left': 1.0180967672413792} 126 | myDat1 = loadDataSet("ex0.txt") 127 | MyMat1 = mat(myDat1) 128 | print createTree(MyMat1) 129 | #{'spInd': 1, 'spVal': 0.39435, 'right': {'spInd': 1, 'spVal': 0.197834, 'right': -0.023838155555555553, 'left': 1.0289583666666666}, 130 | # 'left': {'spInd': 1, 'spVal': 0.582002, 'right': 1.980035071428571, 'left': {'spInd': 1, 'spVal': 0.797583, 'right': 2.9836209534883724, 'left': 3.9871631999999999}}} 131 | 132 | --------------------------------------------------------------------------------
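The README that follows describes post-pruning (后剪枝), but no prune routine appears in CRAT算法用于回归.py above. The following is a minimal sketch of that procedure in the same style, assuming the binSplitDataSet, createTree and loadDataSet defined above; the names isTree, getMean and prune are illustrative and may differ from what the repository's regTrees.py uses:

```python
# Post-pruning sketch: walk the tree built on training data and merge pairs of
# leaves whenever the merge lowers the squared error on held-out test data.
from numpy import shape, power, sum

def isTree(obj):
    # a subtree is stored as a dict; a leaf is a plain value
    return type(obj).__name__ == 'dict'

def getMean(tree):
    # collapse a subtree into the mean of its leaf values
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):  tree['left']  = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    # no test data falls into this branch: collapse it
    if shape(testData)[0] == 0:
        return getMean(tree)
    if isTree(tree['left']) or isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):  tree['left']  = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # both children are leaves: merge them if that lowers the test error
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            return treeMean          # collapse the two leaves into their mean
        return tree
    return tree

# Example usage with the ch9 data files included below:
#   myTree = createTree(mat(loadDataSet('ex2.txt')), ops=(0, 1))   # deliberately overgrown tree
#   pruned = prune(myTree, mat(loadDataSet('ex2test.txt')))
```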
/ch9/README.md: -------------------------------------------------------------------------------- 1 | # Ch09 - 树回归(Tree-based regression) 2 | #### This chapter introduces a new tree algorithm, CART (Classification And Regression Trees), which can be used for both classification and regression. It also covers tree pruning, whose purpose is to keep the tree from overfitting. 3 | #### Chapter 3 introduced greedy decision trees built with ID3: at each step the current best feature is chosen to split the data, and the split uses every possible value of that feature, so once a split is made that feature is never used again. This approach splits too aggressively and cannot handle continuous features directly; continuous data must be discretized first, which destroys the intrinsic nature of continuous variables. 4 | #### Binary splitting is another tree-building strategy: each split cuts the data set into two halves, putting samples that satisfy the split condition into the left subtree and the rest into the right subtree. CART (Classification And Regression Trees) uses binary splits to handle continuous variables, and replaces Shannon entropy with total variance when judging how well the model fits. 5 | 6 | ### Note: the code given in the book appears to contain a few small errors. 7 | 8 | ## Plots of the data points in the two split data sets ex0 and ex00: 9 | ### ex00.txt: 10 | ![基于CART算法构建回归树的简单数据集](screenshot/基于CART算法构建回归树的简单数据集.png) 11 | ### ex0.txt 12 | ![用于测试回归树的分段常数数据集](screenshot/用于测试回归树的分段常数数据集.png) 13 | 14 | # Tree pruning 15 | #### If a tree has too many nodes, the model may overfit the data. Reducing the complexity of a decision tree to avoid overfitting is called pruning. There are two pruning approaches: pre-pruning and post-pruning. 16 | 17 | ## Pre-pruning 18 | #### The three early-exit conditions in chooseBestSplit are the "pre-pruning" operations. Pre-pruning limits the size of a node: if a candidate split would produce a node smaller than the limit, the split is not made; likewise, if the drop in total variance before and after the split is too small, the split is not made either. The two parameters are hard to tune, however: they are very sensitive to the magnitude of the data, and a careless choice easily leads to overfitting. 19 | ## A new data set obtained by scaling the y-axis of the ex00.txt data by a factor of 100 20 | ![放大100倍](screenshot/放大100倍.png) 21 | 22 | ## Post-pruning 23 | #### Post-pruning requires splitting the data into a training set and a test set. A regression tree is first built on the training set; post-pruning then recursively walks the tree structure: if both children of a node are leaves and merging them would lower the test error, the node is collapsed, i.e. its value is set to the mean of its left and right children, and this is repeated level by level. In short, post-pruning validates against held-out data: first choose parameters that make the tree deliberately large, then work from the leaves upward, merging two leaves whenever the merge achieves a lower test error. 24 | 25 | #### In general, using pre-pruning and post-pruning together works best. 26 | 27 | ## Model trees 28 | ### When modelling data with a tree, the leaves do not have to be constants; they can also be piecewise linear functions. A regression tree stores constant values at its leaves, while a model tree stores linear models. Leaves that hold linear functions are clearly easier to interpret than plain constant-valued leaves, which is one advantage of model trees over regression trees. 29 | 30 | ## The piecewise linear data used to test the model-tree building code: 31 | ![测试模型树构建函数的测试数据](screenshot/测试模型树构建函数的测试数据.png) 32 | ### Using a model tree only requires swapping in different leaf-generation and error functions; everything else works just like the regression tree. 33 | 34 | ## Correlation coefficient 35 | #### numpy's corrcoef(yHat, y, rowvar = 0) is used here to compute the correlation coefficient R between predictions and targets (R^2 is its square); the closer it is to 1, the better. yHat is the predicted value and y is the actual value of the target variable. 36 | 37 | ## Scatter plot of the data used to compare the tree-regression models with an ordinary linear regression model. 38 | ![骑自行车速度](screenshot/骑自行车速度.png) 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ch9/ex00.txt: -------------------------------------------------------------------------------- 1 | 0.036098 0.155096 2 | 0.993349 1.077553 3 | 0.530897 0.893462 4 | 0.712386 0.564858 5 | 0.343554 -0.371700 6 | 0.098016 -0.332760 7 | 0.691115 0.834391 8 | 0.091358 0.099935 9 | 0.727098 1.000567 10 | 0.951949 0.945255 11 | 0.768596 0.760219 12 | 0.541314 0.893748 13 | 0.146366 0.034283 14 | 0.673195 0.915077 15 | 0.183510 0.184843 16 | 0.339563 0.206783 17 | 0.517921 1.493586 18 | 0.703755 1.101678 19 | 0.008307 0.069976 20 | 0.243909 -0.029467 21 | 0.306964 -0.177321 22 | 0.036492 0.408155 23 | 0.295511 0.002882 24 | 0.837522 1.229373 25 | 0.202054 -0.087744 26 | 0.919384 1.029889 27 | 0.377201 -0.243550 28 | 0.814825 1.095206 29 | 0.611270 0.982036 30 | 0.072243 -0.420983 31 | 0.410230 0.331722 32 | 0.869077 1.114825 33 | 0.620599 1.334421 34 | 0.101149
0.068834 35 | 0.820802 1.325907 36 | 0.520044 0.961983 37 | 0.488130 -0.097791 38 | 0.819823 0.835264 39 | 0.975022 0.673579 40 | 0.953112 1.064690 41 | 0.475976 -0.163707 42 | 0.273147 -0.455219 43 | 0.804586 0.924033 44 | 0.074795 -0.349692 45 | 0.625336 0.623696 46 | 0.656218 0.958506 47 | 0.834078 1.010580 48 | 0.781930 1.074488 49 | 0.009849 0.056594 50 | 0.302217 -0.148650 51 | 0.678287 0.907727 52 | 0.180506 0.103676 53 | 0.193641 -0.327589 54 | 0.343479 0.175264 55 | 0.145809 0.136979 56 | 0.996757 1.035533 57 | 0.590210 1.336661 58 | 0.238070 -0.358459 59 | 0.561362 1.070529 60 | 0.377597 0.088505 61 | 0.099142 0.025280 62 | 0.539558 1.053846 63 | 0.790240 0.533214 64 | 0.242204 0.209359 65 | 0.152324 0.132858 66 | 0.252649 -0.055613 67 | 0.895930 1.077275 68 | 0.133300 -0.223143 69 | 0.559763 1.253151 70 | 0.643665 1.024241 71 | 0.877241 0.797005 72 | 0.613765 1.621091 73 | 0.645762 1.026886 74 | 0.651376 1.315384 75 | 0.697718 1.212434 76 | 0.742527 1.087056 77 | 0.901056 1.055900 78 | 0.362314 -0.556464 79 | 0.948268 0.631862 80 | 0.000234 0.060903 81 | 0.750078 0.906291 82 | 0.325412 -0.219245 83 | 0.726828 1.017112 84 | 0.348013 0.048939 85 | 0.458121 -0.061456 86 | 0.280738 -0.228880 87 | 0.567704 0.969058 88 | 0.750918 0.748104 89 | 0.575805 0.899090 90 | 0.507940 1.107265 91 | 0.071769 -0.110946 92 | 0.553520 1.391273 93 | 0.401152 -0.121640 94 | 0.406649 -0.366317 95 | 0.652121 1.004346 96 | 0.347837 -0.153405 97 | 0.081931 -0.269756 98 | 0.821648 1.280895 99 | 0.048014 0.064496 100 | 0.130962 0.184241 101 | 0.773422 1.125943 102 | 0.789625 0.552614 103 | 0.096994 0.227167 104 | 0.625791 1.244731 105 | 0.589575 1.185812 106 | 0.323181 0.180811 107 | 0.822443 1.086648 108 | 0.360323 -0.204830 109 | 0.950153 1.022906 110 | 0.527505 0.879560 111 | 0.860049 0.717490 112 | 0.007044 0.094150 113 | 0.438367 0.034014 114 | 0.574573 1.066130 115 | 0.536689 0.867284 116 | 0.782167 0.886049 117 | 0.989888 0.744207 118 | 0.761474 1.058262 119 | 0.985425 1.227946 120 | 0.132543 -0.329372 121 | 0.346986 -0.150389 122 | 0.768784 0.899705 123 | 0.848921 1.170959 124 | 0.449280 0.069098 125 | 0.066172 0.052439 126 | 0.813719 0.706601 127 | 0.661923 0.767040 128 | 0.529491 1.022206 129 | 0.846455 0.720030 130 | 0.448656 0.026974 131 | 0.795072 0.965721 132 | 0.118156 -0.077409 133 | 0.084248 -0.019547 134 | 0.845815 0.952617 135 | 0.576946 1.234129 136 | 0.772083 1.299018 137 | 0.696648 0.845423 138 | 0.595012 1.213435 139 | 0.648675 1.287407 140 | 0.897094 1.240209 141 | 0.552990 1.036158 142 | 0.332982 0.210084 143 | 0.065615 -0.306970 144 | 0.278661 0.253628 145 | 0.773168 1.140917 146 | 0.203693 -0.064036 147 | 0.355688 -0.119399 148 | 0.988852 1.069062 149 | 0.518735 1.037179 150 | 0.514563 1.156648 151 | 0.976414 0.862911 152 | 0.919074 1.123413 153 | 0.697777 0.827805 154 | 0.928097 0.883225 155 | 0.900272 0.996871 156 | 0.344102 -0.061539 157 | 0.148049 0.204298 158 | 0.130052 -0.026167 159 | 0.302001 0.317135 160 | 0.337100 0.026332 161 | 0.314924 -0.001952 162 | 0.269681 -0.165971 163 | 0.196005 -0.048847 164 | 0.129061 0.305107 165 | 0.936783 1.026258 166 | 0.305540 -0.115991 167 | 0.683921 1.414382 168 | 0.622398 0.766330 169 | 0.902532 0.861601 170 | 0.712503 0.933490 171 | 0.590062 0.705531 172 | 0.723120 1.307248 173 | 0.188218 0.113685 174 | 0.643601 0.782552 175 | 0.520207 1.209557 176 | 0.233115 -0.348147 177 | 0.465625 -0.152940 178 | 0.884512 1.117833 179 | 0.663200 0.701634 180 | 0.268857 0.073447 181 | 0.729234 0.931956 182 | 0.429664 -0.188659 183 | 0.737189 1.200781 
184 | 0.378595 -0.296094 185 | 0.930173 1.035645 186 | 0.774301 0.836763 187 | 0.273940 -0.085713 188 | 0.824442 1.082153 189 | 0.626011 0.840544 190 | 0.679390 1.307217 191 | 0.578252 0.921885 192 | 0.785541 1.165296 193 | 0.597409 0.974770 194 | 0.014083 -0.132525 195 | 0.663870 1.187129 196 | 0.552381 1.369630 197 | 0.683886 0.999985 198 | 0.210334 -0.006899 199 | 0.604529 1.212685 200 | 0.250744 0.046297 201 | -------------------------------------------------------------------------------- /ch9/ex2.txt: -------------------------------------------------------------------------------- 1 | 0.228628 -2.266273 2 | 0.965969 112.386764 3 | 0.342761 -31.584855 4 | 0.901444 87.300625 5 | 0.585413 125.295113 6 | 0.334900 18.976650 7 | 0.769043 64.041941 8 | 0.297107 -1.798377 9 | 0.901421 100.133819 10 | 0.176523 0.946348 11 | 0.710234 108.553919 12 | 0.981980 86.399637 13 | 0.085873 -10.137104 14 | 0.537834 90.995536 15 | 0.806158 62.877698 16 | 0.708890 135.416767 17 | 0.787755 118.642009 18 | 0.463241 17.171057 19 | 0.300318 -18.051318 20 | 0.815215 118.319942 21 | 0.139880 7.336784 22 | 0.068373 -15.160836 23 | 0.457563 -34.044555 24 | 0.665652 105.547997 25 | 0.084661 -24.132226 26 | 0.954711 100.935789 27 | 0.953902 130.926480 28 | 0.487381 27.729263 29 | 0.759504 81.106762 30 | 0.454312 -20.360067 31 | 0.295993 -14.988279 32 | 0.156067 7.557349 33 | 0.428582 15.224266 34 | 0.847219 76.240984 35 | 0.499171 11.924204 36 | 0.203993 -22.379119 37 | 0.548539 83.114502 38 | 0.790312 110.159730 39 | 0.937766 119.949824 40 | 0.218321 1.410768 41 | 0.223200 15.501642 42 | 0.896683 107.001620 43 | 0.582311 82.589328 44 | 0.698920 92.470636 45 | 0.823848 59.342323 46 | 0.385021 24.816941 47 | 0.061219 6.695567 48 | 0.841547 115.669032 49 | 0.763328 115.199195 50 | 0.934853 115.753994 51 | 0.222271 -9.255852 52 | 0.217214 -3.958752 53 | 0.706961 106.180427 54 | 0.888426 94.896354 55 | 0.549814 137.267576 56 | 0.107960 -1.293195 57 | 0.085111 37.820659 58 | 0.388789 21.578007 59 | 0.467383 -9.712925 60 | 0.623909 87.181863 61 | 0.373501 -8.228297 62 | 0.513332 101.075609 63 | 0.350725 -40.086564 64 | 0.716211 103.345308 65 | 0.731636 73.912028 66 | 0.273863 -9.457556 67 | 0.211633 -8.332207 68 | 0.944221 100.120253 69 | 0.053764 -13.731698 70 | 0.126833 22.891675 71 | 0.952833 100.649591 72 | 0.391609 3.001104 73 | 0.560301 82.903945 74 | 0.124723 -1.402796 75 | 0.465680 -23.777531 76 | 0.699873 115.586605 77 | 0.164134 -27.405211 78 | 0.455761 9.841938 79 | 0.508542 96.403373 80 | 0.138619 -29.087463 81 | 0.335182 2.768225 82 | 0.908629 118.513475 83 | 0.546601 96.319043 84 | 0.378965 13.583555 85 | 0.968621 98.648346 86 | 0.637999 91.656617 87 | 0.350065 -1.319852 88 | 0.632691 93.645293 89 | 0.936524 65.548418 90 | 0.310956 -49.939516 91 | 0.437652 19.745224 92 | 0.166765 -14.740059 93 | 0.571214 114.872056 94 | 0.952377 73.520802 95 | 0.665329 121.980607 96 | 0.258070 -20.425137 97 | 0.912161 85.005351 98 | 0.777582 100.838446 99 | 0.642707 82.500766 100 | 0.885676 108.045948 101 | 0.080061 2.229873 102 | 0.039914 11.220099 103 | 0.958512 135.837013 104 | 0.377383 5.241196 105 | 0.661073 115.687524 106 | 0.454375 3.043912 107 | 0.412516 -26.419289 108 | 0.854970 89.209930 109 | 0.698472 120.521925 110 | 0.465561 30.051931 111 | 0.328890 39.783113 112 | 0.309133 8.814725 113 | 0.418943 44.161493 114 | 0.553797 120.857321 115 | 0.799873 91.368473 116 | 0.811363 112.981216 117 | 0.785574 107.024467 118 | 0.949198 105.752508 119 | 0.666452 120.014736 120 | 0.652462 112.715799 121 | 0.290749 -14.391613 
122 | 0.508548 93.292829 123 | 0.680486 110.367074 124 | 0.356790 -19.526539 125 | 0.199903 -3.372472 126 | 0.264926 5.280579 127 | 0.166431 -6.512506 128 | 0.370042 -32.124495 129 | 0.628061 117.628346 130 | 0.228473 19.425158 131 | 0.044737 3.855393 132 | 0.193282 18.208423 133 | 0.519150 116.176162 134 | 0.351478 -0.461116 135 | 0.872199 111.552716 136 | 0.115150 13.795828 137 | 0.324274 -13.189243 138 | 0.446196 -5.108172 139 | 0.613004 168.180746 140 | 0.533511 129.766743 141 | 0.740859 93.773929 142 | 0.667851 92.449664 143 | 0.900699 109.188248 144 | 0.599142 130.378529 145 | 0.232802 1.222318 146 | 0.838587 134.089674 147 | 0.284794 35.623746 148 | 0.130626 -39.524461 149 | 0.642373 140.613941 150 | 0.786865 100.598825 151 | 0.403228 -1.729244 152 | 0.883615 95.348184 153 | 0.910975 106.814667 154 | 0.819722 70.054508 155 | 0.798198 76.853728 156 | 0.606417 93.521396 157 | 0.108801 -16.106164 158 | 0.318309 -27.605424 159 | 0.856421 107.166848 160 | 0.842940 95.893131 161 | 0.618868 76.917665 162 | 0.531944 124.795495 163 | 0.028546 -8.377094 164 | 0.915263 96.717610 165 | 0.925782 92.074619 166 | 0.624827 105.970743 167 | 0.331364 -1.290825 168 | 0.341700 -23.547711 169 | 0.342155 -16.930416 170 | 0.729397 110.902830 171 | 0.640515 82.713621 172 | 0.228751 -30.812912 173 | 0.948822 69.318649 174 | 0.706390 105.062147 175 | 0.079632 29.420068 176 | 0.451087 -28.724685 177 | 0.833026 76.723835 178 | 0.589806 98.674874 179 | 0.426711 -21.594268 180 | 0.872883 95.887712 181 | 0.866451 94.402102 182 | 0.960398 123.559747 183 | 0.483803 5.224234 184 | 0.811602 99.841379 185 | 0.757527 63.549854 186 | 0.569327 108.435392 187 | 0.841625 60.552308 188 | 0.264639 2.557923 189 | 0.202161 -1.983889 190 | 0.055862 -3.131497 191 | 0.543843 98.362010 192 | 0.689099 112.378209 193 | 0.956951 82.016541 194 | 0.382037 -29.007783 195 | 0.131833 22.478291 196 | 0.156273 0.225886 197 | 0.000256 9.668106 198 | 0.892999 82.436686 199 | 0.206207 -12.619036 200 | 0.487537 5.149336 201 | -------------------------------------------------------------------------------- /ch9/ex2test.txt: -------------------------------------------------------------------------------- 1 | 0.421862 10.830241 2 | 0.105349 -2.241611 3 | 0.155196 21.872976 4 | 0.161152 2.015418 5 | 0.382632 -38.778979 6 | 0.017710 20.109113 7 | 0.129656 15.266887 8 | 0.613926 111.900063 9 | 0.409277 1.874731 10 | 0.807556 111.223754 11 | 0.593722 133.835486 12 | 0.953239 110.465070 13 | 0.257402 15.332899 14 | 0.645385 93.983054 15 | 0.563460 93.645277 16 | 0.408338 -30.719878 17 | 0.874394 91.873505 18 | 0.263805 -0.192752 19 | 0.411198 10.751118 20 | 0.449884 9.211901 21 | 0.646315 113.533660 22 | 0.673718 125.135638 23 | 0.805148 113.300462 24 | 0.759327 72.668572 25 | 0.519172 82.131698 26 | 0.741031 106.777146 27 | 0.030937 9.859127 28 | 0.268848 -34.137955 29 | 0.474901 -11.201301 30 | 0.588266 120.501998 31 | 0.893936 142.826476 32 | 0.870990 105.751746 33 | 0.430763 39.146258 34 | 0.057665 15.371897 35 | 0.100076 9.131761 36 | 0.980716 116.145896 37 | 0.235289 -13.691224 38 | 0.228098 16.089151 39 | 0.622248 99.345551 40 | 0.401467 -1.694383 41 | 0.960334 110.795415 42 | 0.031214 -5.330042 43 | 0.504228 96.003525 44 | 0.779660 75.921582 45 | 0.504496 101.341462 46 | 0.850974 96.293064 47 | 0.701119 102.333839 48 | 0.191551 5.072326 49 | 0.667116 92.310019 50 | 0.555584 80.367129 51 | 0.680006 132.965442 52 | 0.393899 38.605283 53 | 0.048940 -9.861871 54 | 0.963282 115.407485 55 | 0.655496 104.269918 56 | 0.576463 141.127267 57 | 0.675708 
96.227996 58 | 0.853457 114.252288 59 | 0.003933 -12.182861 60 | 0.549512 97.927224 61 | 0.218967 -4.712462 62 | 0.659972 120.950439 63 | 0.008256 8.026816 64 | 0.099500 -14.318434 65 | 0.352215 -3.747546 66 | 0.874926 89.247356 67 | 0.635084 99.496059 68 | 0.039641 14.147109 69 | 0.665111 103.298719 70 | 0.156583 -2.540703 71 | 0.648843 119.333019 72 | 0.893237 95.209585 73 | 0.128807 5.558479 74 | 0.137438 5.567685 75 | 0.630538 98.462792 76 | 0.296084 -41.799438 77 | 0.632099 84.895098 78 | 0.987681 106.726447 79 | 0.744909 111.279705 80 | 0.862030 104.581156 81 | 0.080649 -7.679985 82 | 0.831277 59.053356 83 | 0.198716 26.878801 84 | 0.860932 90.632930 85 | 0.883250 92.759595 86 | 0.818003 110.272219 87 | 0.949216 115.200237 88 | 0.460078 -35.957981 89 | 0.561077 93.545761 90 | 0.863767 114.125786 91 | 0.476891 -29.774060 92 | 0.537826 81.587922 93 | 0.686224 110.911198 94 | 0.982327 119.114523 95 | 0.944453 92.033481 96 | 0.078227 30.216873 97 | 0.782937 92.588646 98 | 0.465886 2.222139 99 | 0.885024 90.247890 100 | 0.186077 7.144415 101 | 0.915828 84.010074 102 | 0.796649 115.572156 103 | 0.127821 28.933688 104 | 0.433429 6.782575 105 | 0.946796 108.574116 106 | 0.386915 -17.404601 107 | 0.561192 92.142700 108 | 0.182490 10.764616 109 | 0.878792 95.289476 110 | 0.381342 -6.177464 111 | 0.358474 -11.731754 112 | 0.270647 13.793201 113 | 0.488904 -17.641832 114 | 0.106773 5.684757 115 | 0.270112 4.335675 116 | 0.754985 75.860433 117 | 0.585174 111.640154 118 | 0.458821 12.029692 119 | 0.218017 -26.234872 120 | 0.583887 99.413850 121 | 0.923626 107.802298 122 | 0.833620 104.179678 123 | 0.870691 93.132591 124 | 0.249896 -8.618404 125 | 0.748230 109.160652 126 | 0.019365 34.048884 127 | 0.837588 101.239275 128 | 0.529251 115.514729 129 | 0.742898 67.038771 130 | 0.522034 64.160799 131 | 0.498982 3.983061 132 | 0.479439 24.355908 133 | 0.314834 -14.256200 134 | 0.753251 85.017092 135 | 0.479362 -17.480446 136 | 0.950593 99.072784 137 | 0.718623 58.080256 138 | 0.218720 -19.605593 139 | 0.664113 94.437159 140 | 0.942900 131.725134 141 | 0.314226 18.904871 142 | 0.284509 11.779346 143 | 0.004962 -14.624176 144 | 0.224087 -50.547649 145 | 0.974331 112.822725 146 | 0.894610 112.863995 147 | 0.167350 0.073380 148 | 0.753644 105.024456 149 | 0.632241 108.625812 150 | 0.314189 -6.090797 151 | 0.965527 87.418343 152 | 0.820919 94.610538 153 | 0.144107 -4.748387 154 | 0.072556 -5.682008 155 | 0.002447 29.685714 156 | 0.851007 79.632376 157 | 0.458024 -12.326026 158 | 0.627503 139.458881 159 | 0.422259 -29.827405 160 | 0.714659 63.480271 161 | 0.672320 93.608554 162 | 0.498592 37.112975 163 | 0.698906 96.282845 164 | 0.861441 99.699230 165 | 0.112425 -12.419909 166 | 0.164784 5.244704 167 | 0.481531 -18.070497 168 | 0.375482 1.779411 169 | 0.089325 -14.216755 170 | 0.036609 -6.264372 171 | 0.945004 54.723563 172 | 0.136608 14.970936 173 | 0.292285 -41.723711 174 | 0.029195 -0.660279 175 | 0.998307 100.124230 176 | 0.303928 -5.492264 177 | 0.957863 117.824392 178 | 0.815089 113.377704 179 | 0.466399 -10.249874 180 | 0.876693 115.617275 181 | 0.536121 102.997087 182 | 0.373984 -37.359936 183 | 0.565162 74.967476 184 | 0.085412 -21.449563 185 | 0.686411 64.859620 186 | 0.908752 107.983366 187 | 0.982829 98.005424 188 | 0.052766 -42.139502 189 | 0.777552 91.899340 190 | 0.374316 -3.522501 191 | 0.060231 10.008227 192 | 0.526225 87.317722 193 | 0.583872 67.104433 194 | 0.238276 10.615159 195 | 0.678747 60.624273 196 | 0.067649 15.947398 197 | 0.530182 105.030933 198 | 0.869389 104.969996 199 | 0.698410 
75.460417 200 | 0.549430 82.558068 201 | -------------------------------------------------------------------------------- /ch9/exp.txt: -------------------------------------------------------------------------------- 1 | 0.529582 100.737303 2 | 0.985730 103.106872 3 | 0.797869 99.666151 4 | 0.393473 -1.773056 5 | 0.272568 -1.170222 6 | 0.758825 96.752440 7 | 0.218359 2.337347 8 | 0.926357 98.343231 9 | 0.726881 99.633009 10 | 0.805311 102.253834 11 | 0.208632 0.493174 12 | 0.184921 -2.231071 13 | 0.660135 100.139355 14 | 0.871875 96.637420 15 | 0.657182 100.345442 16 | 0.942481 97.751546 17 | 0.427843 -1.380170 18 | 0.845958 98.195303 19 | 0.878696 99.380485 20 | 0.582034 100.971036 21 | 0.118114 2.397033 22 | 0.144718 1.304535 23 | 0.576046 101.624714 24 | 0.750305 97.601324 25 | 0.518281 100.093634 26 | 0.260793 -1.361888 27 | 0.390245 -2.973759 28 | 0.963020 98.877859 29 | 0.880661 97.631997 30 | 0.291780 -1.638124 31 | 0.192903 -2.221257 32 | 0.461442 -1.074725 33 | 0.821171 99.372052 34 | 0.144557 2.589464 35 | 0.379346 0.991090 36 | 0.383822 1.832389 37 | 0.055406 -1.870700 38 | 0.084308 -0.611701 39 | 0.719578 100.087948 40 | 0.417471 -0.510292 41 | 0.477894 -3.426525 42 | 0.871228 100.307522 43 | 0.113074 -1.011079 44 | 0.409434 -0.616173 45 | 0.967141 96.551856 46 | 0.938254 97.052196 47 | 0.079989 2.083496 48 | 0.150207 1.285491 49 | 0.417339 -0.462985 50 | 0.038787 -2.237234 51 | 0.954657 102.111432 52 | 0.844894 98.350138 53 | 0.106770 -0.998182 54 | 0.247831 2.483594 55 | 0.108687 -0.920229 56 | 0.758165 98.079399 57 | 0.199978 -3.490410 58 | 0.600602 99.850119 59 | 0.026466 1.342825 60 | 0.141239 -0.949858 61 | 0.181437 -2.223725 62 | 0.352656 2.251362 63 | 0.803371 99.647157 64 | 0.677303 100.414859 65 | 0.561674 99.133372 66 | 0.497533 -3.764935 67 | 0.523327 98.452850 68 | 0.507075 103.807755 69 | 0.791978 99.414598 70 | 0.956890 95.977239 71 | 0.487927 1.199149 72 | 0.788795 100.012047 73 | 0.554283 98.522458 74 | 0.814361 97.642150 75 | 0.788940 97.399942 76 | 0.515845 102.240479 77 | 0.758538 97.461917 78 | 0.041824 -3.294141 79 | 0.341352 1.246559 80 | 0.194801 -2.285278 81 | 0.805528 99.023113 82 | 0.435762 0.361749 83 | 0.941615 100.746547 84 | 0.478234 0.791146 85 | 0.057445 -4.266792 86 | 0.510079 98.845273 87 | 0.209900 -0.861890 88 | 0.902668 101.429190 89 | 0.456602 -2.856392 90 | 0.997595 99.828241 91 | 0.048240 -0.268920 92 | 0.319531 0.896696 93 | 0.264929 -1.000487 94 | 0.432727 -4.630489 95 | 0.419828 1.260534 96 | 0.667056 99.456518 97 | 0.488173 1.574322 98 | 0.746300 100.563503 99 | 0.528660 100.736739 100 | 0.624185 99.562872 101 | 0.169411 1.809929 102 | 0.011025 4.132846 103 | 0.974164 98.706049 104 | 0.267957 0.297803 105 | 0.726093 99.381040 106 | 0.465163 -2.344545 107 | 0.993698 101.507792 108 | 0.816513 99.903496 109 | 0.398756 0.378060 110 | 0.054974 -0.588770 111 | 0.857067 100.322945 112 | 0.362328 2.551786 113 | 0.316961 -0.528283 114 | 0.167881 -0.376517 115 | 0.393776 3.658204 116 | 0.739991 100.426554 117 | 0.457949 0.857428 118 | 0.060635 2.484776 119 | 0.942634 101.254420 120 | 0.553691 102.467820 121 | 0.394694 -0.248353 122 | 0.714625 99.650556 123 | 0.273503 1.111820 124 | 0.471886 -5.665559 125 | 0.746476 98.720163 126 | 0.140209 0.471820 127 | 0.024197 -2.854251 128 | 0.521287 99.703915 129 | 0.672280 100.463227 130 | 0.380342 -0.785713 131 | 0.956380 99.482209 132 | 0.455254 1.613841 133 | 0.647551 101.591193 134 | 0.682498 98.267734 135 | 0.054839 -2.286019 136 | 0.716849 100.614510 137 | 0.217732 -2.161633 138 | 0.918885 
100.260067 139 | 0.576026 101.719788 140 | 0.868511 100.669152 141 | 0.661135 97.637969 142 | 0.166334 1.374014 143 | 0.106850 -3.658050 144 | 0.768242 104.193841 145 | 0.240916 -0.368100 146 | 0.124957 2.821672 147 | 0.984335 98.571444 148 | 0.908524 101.777344 149 | 0.861217 98.656403 150 | 0.944295 100.154508 151 | 0.527278 101.052710 152 | 0.717072 100.788373 153 | 0.130227 0.115694 154 | 0.494734 -1.220681 155 | 0.498733 0.961514 156 | 0.519411 101.331622 157 | 0.712409 104.891067 158 | 0.933858 98.180299 159 | 0.266051 0.398961 160 | 0.153690 -0.657128 161 | 0.209181 1.486816 162 | 0.942699 102.187578 163 | 0.766799 100.213348 164 | 0.862578 101.816969 165 | 0.223266 2.854445 166 | 0.611394 103.428497 167 | 0.996212 98.494158 168 | 0.724945 99.098450 169 | 0.399346 0.879259 170 | 0.750510 98.729864 171 | 0.446060 0.639843 172 | 0.999913 101.502887 173 | 0.111561 3.256383 174 | 0.094755 0.170475 175 | 0.366547 0.488994 176 | 0.179924 -0.871567 177 | 0.969023 99.982789 178 | 0.941420 100.416754 179 | 0.656851 98.520940 180 | 0.983166 99.546591 181 | 0.167843 0.033922 182 | 0.316245 2.171137 183 | 0.817118 102.849575 184 | 0.173642 1.209173 185 | 0.411030 2.022640 186 | 0.265041 2.216470 187 | 0.779660 98.475428 188 | 0.059354 -0.929568 189 | 0.722092 97.974003 190 | 0.511958 101.924447 191 | 0.371938 -0.640602 192 | 0.851009 97.873330 193 | 0.375918 -5.308115 194 | 0.797332 99.763778 195 | 0.107749 -3.770092 196 | 0.156937 -0.876724 197 | 0.960447 99.597097 198 | 0.413434 2.408090 199 | 0.644257 100.453125 200 | 0.119332 -0.495588 201 | -------------------------------------------------------------------------------- /ch9/exp2.txt: -------------------------------------------------------------------------------- 1 | 0.070670 3.470829 2 | 0.534076 6.377132 3 | 0.747221 8.949407 4 | 0.668970 8.034081 5 | 0.586082 6.997721 6 | 0.764962 9.318110 7 | 0.658125 7.880333 8 | 0.346734 4.213359 9 | 0.313967 3.762496 10 | 0.601418 7.188805 11 | 0.404396 4.893403 12 | 0.154345 3.683175 13 | 0.984061 11.712928 14 | 0.597514 7.146694 15 | 0.005144 3.333150 16 | 0.142295 3.743681 17 | 0.280007 3.737376 18 | 0.542008 6.494275 19 | 0.466781 5.532255 20 | 0.706970 8.476718 21 | 0.191038 3.673921 22 | 0.756591 9.176722 23 | 0.912879 10.850358 24 | 0.524701 6.067444 25 | 0.306090 3.681148 26 | 0.429009 5.032168 27 | 0.695091 8.209058 28 | 0.984495 11.909595 29 | 0.702748 8.298454 30 | 0.551771 6.715210 31 | 0.272894 3.983313 32 | 0.014611 3.559081 33 | 0.699852 8.417306 34 | 0.309710 3.739053 35 | 0.444877 5.219649 36 | 0.717509 8.483072 37 | 0.576550 6.894860 38 | 0.284200 3.792626 39 | 0.675922 8.067282 40 | 0.304401 3.671373 41 | 0.233675 3.795962 42 | 0.453779 5.477533 43 | 0.900938 10.701447 44 | 0.502418 6.046703 45 | 0.781843 9.254690 46 | 0.226271 3.546938 47 | 0.619535 7.703312 48 | 0.519998 6.202835 49 | 0.399447 4.934647 50 | 0.785298 9.497564 51 | 0.010767 3.565835 52 | 0.696399 8.307487 53 | 0.524366 6.266060 54 | 0.396583 4.611390 55 | 0.059988 3.484805 56 | 0.946702 11.263118 57 | 0.417559 4.895128 58 | 0.609194 7.239316 59 | 0.730687 8.858371 60 | 0.586694 7.061601 61 | 0.829567 9.937968 62 | 0.964229 11.521595 63 | 0.276813 3.756406 64 | 0.987041 11.947913 65 | 0.876107 10.440538 66 | 0.747582 8.942278 67 | 0.117348 3.567821 68 | 0.188617 3.976420 69 | 0.416655 4.928907 70 | 0.192995 3.978365 71 | 0.244888 3.777018 72 | 0.806349 9.685831 73 | 0.417555 4.990148 74 | 0.233805 3.740022 75 | 0.357325 4.325355 76 | 0.190201 3.638493 77 | 0.705127 8.432886 78 | 0.336599 3.868493 79 | 0.473786 
5.871813 80 | 0.384794 4.830712 81 | 0.502217 6.117244 82 | 0.788220 9.454959 83 | 0.478773 5.681631 84 | 0.064296 3.642040 85 | 0.332143 3.886628 86 | 0.618869 7.312725 87 | 0.854981 10.306697 88 | 0.570000 6.764615 89 | 0.512739 6.166836 90 | 0.112285 3.545863 91 | 0.723700 8.526944 92 | 0.192256 3.661033 93 | 0.181268 3.678579 94 | 0.196731 3.916622 95 | 0.510342 6.026652 96 | 0.263713 3.723018 97 | 0.141105 3.529595 98 | 0.150262 3.552314 99 | 0.824724 9.973690 100 | 0.588088 6.893128 101 | 0.411291 4.856380 102 | 0.763717 9.199101 103 | 0.212118 3.740024 104 | 0.264587 3.742917 105 | 0.973524 11.683243 106 | 0.250670 3.679117 107 | 0.823460 9.743861 108 | 0.253752 3.781488 109 | 0.838332 10.172180 110 | 0.501156 6.113263 111 | 0.097275 3.472367 112 | 0.667199 7.948868 113 | 0.487320 6.022060 114 | 0.654640 7.809457 115 | 0.906907 10.775188 116 | 0.821941 9.936140 117 | 0.859396 10.428255 118 | 0.078696 3.490510 119 | 0.938092 11.252471 120 | 0.998868 11.863062 121 | 0.025501 3.515624 122 | 0.451806 5.441171 123 | 0.883872 10.498912 124 | 0.583567 6.912334 125 | 0.823688 10.003723 126 | 0.891032 10.818109 127 | 0.879259 10.639263 128 | 0.163007 3.662715 129 | 0.344263 4.169705 130 | 0.796083 9.422591 131 | 0.903683 10.978834 132 | 0.050129 3.575105 133 | 0.605553 7.306014 134 | 0.628951 7.556742 135 | 0.877052 10.444055 136 | 0.829402 9.856432 137 | 0.121422 3.638276 138 | 0.721517 8.663569 139 | 0.066532 3.673471 140 | 0.996587 11.782002 141 | 0.653384 7.804568 142 | 0.739494 8.817809 143 | 0.640341 7.636812 144 | 0.337828 3.971613 145 | 0.220512 3.713645 146 | 0.368815 4.381696 147 | 0.782509 9.349428 148 | 0.645825 7.790882 149 | 0.277391 3.834258 150 | 0.092569 3.643274 151 | 0.284320 3.609353 152 | 0.344465 4.023259 153 | 0.182523 3.749195 154 | 0.385001 4.426970 155 | 0.747609 8.966676 156 | 0.188907 3.711018 157 | 0.806244 9.610438 158 | 0.014211 3.517818 159 | 0.574813 7.040672 160 | 0.714500 8.525624 161 | 0.538982 6.393940 162 | 0.384638 4.649362 163 | 0.915586 10.936577 164 | 0.883513 10.441493 165 | 0.804148 9.742851 166 | 0.466011 5.833439 167 | 0.800574 9.638874 168 | 0.654980 8.028558 169 | 0.348564 4.064616 170 | 0.978595 11.720218 171 | 0.915906 10.833902 172 | 0.285477 3.818961 173 | 0.988631 11.684010 174 | 0.531069 6.305005 175 | 0.181658 3.806995 176 | 0.039657 3.356861 177 | 0.893344 10.776799 178 | 0.355214 4.263666 179 | 0.783508 9.475445 180 | 0.039768 3.429691 181 | 0.546308 6.472749 182 | 0.786882 9.398951 183 | 0.168282 3.564189 184 | 0.374900 4.399040 185 | 0.737767 8.888536 186 | 0.059849 3.431537 187 | 0.861891 10.246888 188 | 0.597578 7.112627 189 | 0.126050 3.611641 190 | 0.074795 3.609222 191 | 0.634401 7.627416 192 | 0.831633 9.926548 193 | 0.019095 3.470285 194 | 0.396533 4.773104 195 | 0.794973 9.492009 196 | 0.889088 10.420003 197 | 0.003174 3.587139 198 | 0.176767 3.554071 199 | 0.943730 11.227731 200 | 0.758564 8.885337 201 | -------------------------------------------------------------------------------- /ch9/expTest.txt: -------------------------------------------------------------------------------- 1 | 0.042621 0.705087 2 | 0.140649 1.676077 3 | 0.729711 98.287450 4 | 0.420368 0.893020 5 | 0.055112 -1.784342 6 | 0.335700 -2.039774 7 | 0.480745 -1.165972 8 | 0.039408 -2.453546 9 | 0.713000 99.181124 10 | 0.437107 2.288551 11 | 0.553328 99.909260 12 | 0.146352 -3.900741 13 | 0.753615 97.640436 14 | 0.739062 100.411664 15 | 0.391077 0.380562 16 | 0.887119 102.018433 17 | 0.090234 -1.872570 18 | 0.870459 97.253294 19 | 0.174066 0.716029 20 | 
0.698476 96.591450 21 | 0.463064 0.197371 22 | 0.201708 -3.424533 23 | 0.335499 -2.823621 24 | 0.873611 101.105294 25 | 0.315239 1.893852 26 | 0.258688 -0.604888 27 | 0.331030 2.185822 28 | 0.938692 98.758321 29 | 0.390971 5.619469 30 | 0.946373 101.358201 31 | 0.841116 100.136301 32 | 0.652268 101.167615 33 | 0.488903 1.912745 34 | 0.076776 0.631315 35 | 0.078587 -0.173226 36 | 0.690439 103.351735 37 | 0.992771 99.322329 38 | 0.357646 -1.662827 39 | 0.996224 100.969483 40 | 0.431983 -2.332204 41 | 0.084956 -0.153660 42 | 0.416978 -3.185275 43 | 0.483920 -0.400342 44 | 0.351282 -0.212100 45 | 0.696687 100.399345 46 | 0.610816 100.447063 47 | 0.876386 97.717446 48 | 0.290065 -1.402790 49 | 0.561540 97.719979 50 | 0.521387 102.671802 51 | 0.124250 -1.447424 52 | 0.760795 100.973153 53 | 0.813137 98.418078 54 | 0.322203 -0.210448 55 | 0.222080 -2.382876 56 | 0.012078 0.145758 57 | 0.215864 -1.753234 58 | 0.286381 -0.029690 59 | 0.504148 100.382630 60 | 0.853875 97.561672 61 | 0.077604 1.836922 62 | 0.533825 100.804076 63 | 0.197164 -1.982653 64 | 0.915268 96.773211 65 | 0.637298 98.012823 66 | 0.222793 0.879413 67 | 0.403267 1.696757 68 | 0.365798 -1.228388 69 | 0.470756 -3.196883 70 | 0.007890 -0.725592 71 | 0.348122 3.658900 72 | 0.816112 102.003904 73 | 0.752076 101.766783 74 | 0.722139 99.311245 75 | 0.050637 -0.053007 76 | 0.794114 96.183380 77 | 0.416684 -2.133790 78 | 0.019078 -2.772976 79 | 0.875982 99.771033 80 | 0.393920 -0.334077 81 | 0.240991 -1.351481 82 | 0.975677 98.774986 83 | 0.790547 99.321853 84 | 0.437987 -1.925655 85 | 0.164944 1.045779 86 | 0.197404 0.812910 87 | 0.679754 101.643453 88 | 0.579659 101.453164 89 | 0.022060 -0.116585 90 | 0.181261 -2.269127 91 | 0.223999 -2.179047 92 | 0.409925 1.365931 93 | 0.360634 -4.286442 94 | 0.164986 -0.749713 95 | 0.583409 99.378572 96 | 0.741431 102.861904 97 | 0.494034 -1.145858 98 | 0.411789 2.687350 99 | 0.940651 102.052953 100 | 0.680743 99.299124 101 | 0.453674 -3.107414 102 | 0.164892 1.666987 103 | 0.778335 99.863542 104 | 0.336990 0.938736 105 | 0.501560 101.008483 106 | 0.855588 101.125709 107 | 0.654224 100.980805 108 | 0.653707 98.019920 109 | 0.588863 96.945577 110 | 0.385631 3.146359 111 | 0.050457 -0.106757 112 | 0.822597 100.607049 113 | 0.208452 -0.460245 114 | 0.040589 0.069251 115 | 0.731871 104.981635 116 | 0.427191 -3.934995 117 | 0.623521 97.676660 118 | 0.203501 -0.529907 119 | 0.181543 0.705354 120 | 0.289069 1.085134 121 | 0.652419 98.896461 122 | 0.111964 3.514297 123 | 0.277014 2.301090 124 | 0.497381 -1.877630 125 | 0.994973 98.092916 126 | 0.084255 3.147329 127 | 0.084836 -2.263086 128 | 0.629725 103.448042 129 | 0.741841 99.908137 130 | 0.788823 99.790969 131 | 0.063125 -2.847334 132 | 0.413608 -2.245895 133 | 0.527976 101.466569 134 | 0.596276 101.079191 135 | 0.845748 100.308275 136 | 0.976452 100.197745 137 | 0.475051 2.563985 138 | 0.694542 99.125422 139 | 0.390583 -1.652652 140 | 0.580233 99.861938 141 | 0.622445 97.933787 142 | 0.744950 102.392552 143 | 0.414662 -1.727387 144 | 0.648774 101.371751 145 | 0.013468 -1.718182 146 | 0.781245 98.393098 147 | 0.871697 103.241025 148 | 0.198555 0.407556 149 | 0.427669 -1.826682 150 | 0.281457 0.137682 151 | 0.837984 98.909162 152 | 0.424066 1.060564 153 | 0.837252 100.688719 154 | 0.369463 1.061182 155 | 0.034532 -0.423989 156 | 0.481137 -0.008675 157 | 0.156752 -0.713391 158 | 0.661411 99.255937 159 | 0.176114 -0.302831 160 | 0.478959 -0.367422 161 | 0.874168 97.783253 162 | 0.167500 -0.829583 163 | 0.864995 99.961025 164 | 0.915850 99.090509 165 | 
0.717802 100.059025 166 | 0.497465 -2.379605 167 | 0.351879 -1.832181 168 | 0.600021 99.967671 169 | 0.653842 100.114605 170 | 0.235046 -0.002983 171 | 0.608262 99.428381 172 | 0.979362 95.533709 173 | 0.178479 -0.697517 174 | 0.770679 99.313631 175 | 0.605045 101.927861 176 | 0.342313 -1.473575 177 | 0.927246 101.401583 178 | 0.623712 100.875627 179 | 0.764501 97.575820 180 | 0.670568 101.465970 181 | 0.799404 100.978750 182 | 0.999679 95.883283 183 | 0.341203 -2.047895 184 | 0.640206 98.109133 185 | 0.898167 100.648327 186 | 0.538279 97.178557 187 | 0.804254 102.052744 188 | 0.641926 99.911401 189 | 0.248823 -1.025944 190 | 0.830591 100.349505 191 | 0.468414 -2.691770 192 | 0.492944 0.405210 193 | 0.309762 1.995071 194 | 0.951799 99.978873 195 | 0.935369 104.094296 196 | 0.336673 -4.239911 197 | 0.872527 102.585224 198 | 0.837085 103.322194 199 | 0.525039 99.419610 200 | 0.504804 102.986424 201 | -------------------------------------------------------------------------------- /ch9/matplotlib/基于CART算法构建回归树的简单数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex00.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/放大100倍.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex2.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/测试模型树构建函数的测试数据.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("exp2.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/用于测试回归树的分段常数数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | 
dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex0.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,1],myMat[:,2],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch9/screenshot/基于CART算法构建回归树的简单数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/基于CART算法构建回归树的简单数据集.png -------------------------------------------------------------------------------- /ch9/screenshot/放大100倍.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/放大100倍.png -------------------------------------------------------------------------------- /ch9/screenshot/测试模型树构建函数的测试数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/测试模型树构建函数的测试数据.png -------------------------------------------------------------------------------- /ch9/screenshot/用于测试回归树的分段常数数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/用于测试回归树的分段常数数据集.png -------------------------------------------------------------------------------- /ch9/screenshot/骑自行车速度.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/骑自行车速度.png -------------------------------------------------------------------------------- /ch9/sine.txt: -------------------------------------------------------------------------------- 1 | 0.190350 0.878049 2 | 0.306657 -0.109413 3 | 0.017568 0.030917 4 | 0.122328 0.951109 5 | 0.076274 0.774632 6 | 0.614127 -0.250042 7 | 0.220722 0.807741 8 | 0.089430 0.840491 9 | 0.278817 0.342210 10 | 0.520287 -0.950301 11 | 0.726976 0.852224 12 | 0.180485 1.141859 13 | 0.801524 1.012061 14 | 0.474273 -1.311226 15 | 0.345116 -0.319911 16 | 0.981951 -0.374203 17 | 0.127349 1.039361 18 | 0.757120 1.040152 19 | 0.345419 -0.429760 20 | 0.314532 -0.075762 21 | 0.250828 0.657169 22 | 0.431255 -0.905443 23 | 0.386669 -0.508875 24 | 0.143794 0.844105 25 | 0.470839 -0.951757 26 | 0.093065 0.785034 27 | 0.205377 0.715400 28 | 0.083329 0.853025 29 | 0.243475 0.699252 30 | 0.062389 0.567589 31 | 0.764116 0.834931 32 | 0.018287 0.199875 33 | 0.973603 -0.359748 34 | 0.458826 -1.113178 35 | 0.511200 -1.082561 36 | 0.712587 0.615108 37 | 0.464745 -0.835752 38 | 0.984328 -0.332495 39 | 0.414291 -0.808822 40 | 0.799551 1.072052 41 | 0.499037 -0.924499 42 | 0.966757 -0.191643 43 | 0.756594 0.991844 44 | 0.444938 -0.969528 45 | 0.410167 -0.773426 46 | 0.532335 -0.631770 47 | 0.343909 -0.313313 48 | 0.854302 0.719307 49 | 0.846882 0.916509 50 | 0.740758 1.009525 51 | 0.150668 0.832433 52 | 0.177606 0.893017 53 | 0.445289 -0.898242 
54 | 0.734653 0.787282 55 | 0.559488 -0.663482 56 | 0.232311 0.499122 57 | 0.934435 -0.121533 58 | 0.219089 0.823206 59 | 0.636525 0.053113 60 | 0.307605 0.027500 61 | 0.713198 0.693978 62 | 0.116343 1.242458 63 | 0.680737 0.368910 64 | 0.484730 -0.891940 65 | 0.929408 0.234913 66 | 0.008507 0.103505 67 | 0.872161 0.816191 68 | 0.755530 0.985723 69 | 0.620671 0.026417 70 | 0.472260 -0.967451 71 | 0.257488 0.630100 72 | 0.130654 1.025693 73 | 0.512333 -0.884296 74 | 0.747710 0.849468 75 | 0.669948 0.413745 76 | 0.644856 0.253455 77 | 0.894206 0.482933 78 | 0.820471 0.899981 79 | 0.790796 0.922645 80 | 0.010729 0.032106 81 | 0.846777 0.768675 82 | 0.349175 -0.322929 83 | 0.453662 -0.957712 84 | 0.624017 -0.169913 85 | 0.211074 0.869840 86 | 0.062555 0.607180 87 | 0.739709 0.859793 88 | 0.985896 -0.433632 89 | 0.782088 0.976380 90 | 0.642561 0.147023 91 | 0.779007 0.913765 92 | 0.185631 1.021408 93 | 0.525250 -0.706217 94 | 0.236802 0.564723 95 | 0.440958 -0.993781 96 | 0.397580 -0.708189 97 | 0.823146 0.860086 98 | 0.370173 -0.649231 99 | 0.791675 1.162927 100 | 0.456647 -0.956843 101 | 0.113350 0.850107 102 | 0.351074 -0.306095 103 | 0.182684 0.825728 104 | 0.914034 0.305636 105 | 0.751486 0.898875 106 | 0.216572 0.974637 107 | 0.013273 0.062439 108 | 0.469726 -1.226188 109 | 0.060676 0.599451 110 | 0.776310 0.902315 111 | 0.061648 0.464446 112 | 0.714077 0.947507 113 | 0.559264 -0.715111 114 | 0.121876 0.791703 115 | 0.330586 -0.165819 116 | 0.662909 0.379236 117 | 0.785142 0.967030 118 | 0.161352 0.979553 119 | 0.985215 -0.317699 120 | 0.457734 -0.890725 121 | 0.171574 0.963749 122 | 0.334277 -0.266228 123 | 0.501065 -0.910313 124 | 0.988736 -0.476222 125 | 0.659242 0.218365 126 | 0.359861 -0.338734 127 | 0.790434 0.843387 128 | 0.462458 -0.911647 129 | 0.823012 0.813427 130 | 0.594668 -0.603016 131 | 0.498207 -0.878847 132 | 0.574882 -0.419598 133 | 0.570048 -0.442087 134 | 0.331570 -0.347567 135 | 0.195407 0.822284 136 | 0.814327 0.974355 137 | 0.641925 0.073217 138 | 0.238778 0.657767 139 | 0.400138 -0.715598 140 | 0.670479 0.469662 141 | 0.069076 0.680958 142 | 0.294373 0.145767 143 | 0.025628 0.179822 144 | 0.697772 0.506253 145 | 0.729626 0.786519 146 | 0.293071 0.259997 147 | 0.531802 -1.095833 148 | 0.487338 -1.034481 149 | 0.215780 0.933506 150 | 0.625818 0.103845 151 | 0.179389 0.892237 152 | 0.192552 0.915516 153 | 0.671661 0.330361 154 | 0.952391 -0.060263 155 | 0.795133 0.945157 156 | 0.950494 -0.071855 157 | 0.194894 1.000860 158 | 0.351460 -0.227946 159 | 0.863456 0.648456 160 | 0.945221 -0.045667 161 | 0.779840 0.979954 162 | 0.996606 -0.450501 163 | 0.632184 -0.036506 164 | 0.790898 0.994890 165 | 0.022503 0.386394 166 | 0.318983 -0.152749 167 | 0.369633 -0.423960 168 | 0.157300 0.962858 169 | 0.153223 0.882873 170 | 0.360068 -0.653742 171 | 0.433917 -0.872498 172 | 0.133461 0.879002 173 | 0.757252 1.123667 174 | 0.309391 -0.102064 175 | 0.195586 0.925339 176 | 0.240259 0.689117 177 | 0.340591 -0.455040 178 | 0.243436 0.415760 179 | 0.612755 -0.180844 180 | 0.089407 0.723702 181 | 0.469695 -0.987859 182 | 0.943560 -0.097303 183 | 0.177241 0.918082 184 | 0.317756 -0.222902 185 | 0.515337 -0.733668 186 | 0.344773 -0.256893 187 | 0.537029 -0.797272 188 | 0.626878 0.048719 189 | 0.208940 0.836531 190 | 0.470697 -1.080283 191 | 0.054448 0.624676 192 | 0.109230 0.816921 193 | 0.158325 1.044485 194 | 0.976650 -0.309060 195 | 0.643441 0.267336 196 | 0.215841 1.018817 197 | 0.905337 0.409871 198 | 0.154354 0.920009 199 | 0.947922 -0.112378 200 | 0.201391 0.768894 201 | 
-------------------------------------------------------------------------------- /ch9/treeExplore.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import regTrees 3 | from numpy import * 4 | import matplotlib 5 | from Tkinter import * 6 | 7 | matplotlib.use('TkAgg') # use the TkAgg backend 8 | # hook TkAgg up to matplotlib 9 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 10 | from matplotlib.figure import Figure 11 | 12 | 13 | def reDraw(tolS, tolN): 14 | reDraw.f.clf() # clear the previous figure 15 | reDraw.a = reDraw.f.add_subplot(111)# add a fresh subplot 16 | if chkBtnVar.get():# is the Model Tree checkbox ticked? 17 | if tolN < 2: tolN = 2 18 | myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,regTrees.modelErr, (tolS, tolN)) 19 | yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval) 20 | else: 21 | myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN)) 22 | yHat = regTrees.createForeCast(myTree, reDraw.testDat) 23 | reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5) # plot the true values 24 | reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0) # plot the predictions 25 | reDraw.canvas.show() 26 | 27 | 28 | def getInputs():# read the parameters typed into the entry widgets 29 | try:# tolN is expected to be an integer 30 | tolN = int(tolNentry.get()) 31 | except:# on bad input, clear the field and fall back to the default 32 | tolN = 10 33 | print("enter Integer for tolN") 34 | tolNentry.delete(0, END) 35 | tolNentry.insert(0, '10') 36 | try:# tolS is expected to be a float 37 | tolS = float(tolSentry.get()) 38 | except: 39 | tolS = 1.0 40 | print("enter Float for tolS") 41 | tolSentry.delete(0, END) 42 | tolSentry.insert(0, '1.0') 43 | return tolN, tolS 44 | 45 | 46 | def drawNewTree(): 47 | tolN, tolS = getInputs() # get the parameters from the input boxes 48 | reDraw(tolS, tolN) # redraw the plot 49 | 50 | root = Tk() 51 | 52 | reDraw.f = Figure(figsize=(5, 4), dpi=100) # create the drawing canvas 53 | reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root) 54 | reDraw.canvas.show() 55 | reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3) 56 | 57 | Label(root, text="tolN").grid(row=1, column=0) 58 | tolNentry = Entry(root) 59 | tolNentry.grid(row=1, column=1) 60 | tolNentry.insert(0, '10') 61 | Label(root, text="tolS").grid(row=2, column=0) 62 | tolSentry = Entry(root) 63 | tolSentry.grid(row=2, column=1) 64 | tolSentry.insert(0, '1.0') 65 | Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3) 66 | chkBtnVar = IntVar() 67 | chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar) 68 | chkBtn.grid(row=3, column=0, columnspan=2) 69 | 70 | reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt')) 71 | reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01) 72 | reDraw(1.0, 10) 73 | 74 | root.mainloop() --------------------------------------------------------------------------------
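treeExplore.py imports createForeCast and modelTreeEval from regTrees, which is not reproduced in this section. The following is a rough sketch of what such forecasting helpers look like for the tree dictionaries built by createTree above, assuming one feature row per prediction; the actual code in ch9/regTrees.py may differ in detail:

```python
# Forecasting sketch: evaluate a regression tree or model tree on new inputs.
from numpy import mat, ones, zeros, shape

def isTree(obj):
    return type(obj).__name__ == 'dict'

def regTreeEval(model, inDat):
    # regression-tree leaf: the stored constant is the prediction
    return float(model)

def modelTreeEval(model, inDat):
    # model-tree leaf: evaluate the stored linear model on the input row
    n = shape(inDat)[1]
    X = mat(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat            # prepend the constant term, as in linearSolve
    return float(X * model)

def treeForeCast(tree, inData, modelEval=regTreeEval):
    # walk the tree for a single input row until a leaf is reached
    if not isTree(tree):
        return modelEval(tree, inData)
    if inData[0, tree['spInd']] > tree['spVal']:
        branch = tree['left']        # createTree sends values > spVal to the left branch
    else:
        branch = tree['right']
    if isTree(branch):
        return treeForeCast(branch, inData, modelEval)
    return modelEval(branch, inData)

def createForeCast(tree, testData, modelEval=regTreeEval):
    # predict every row of testData and return the predictions as a column vector
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat
```

With helpers of this shape, the corrcoef comparison described in the ch9 README can be run on held-out data to compare a regression tree, a model tree and plain linear regression.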