├── README.md ├── ch1 ├── 1.py └── README.md ├── ch10 ├── Portland.png ├── README.md ├── kMeans.py ├── matplotlib │ ├── README.md │ ├── kMeans.py │ └── 对地图上的点进行聚类.py ├── places.txt ├── portlandClubs.txt ├── screenshot │ ├── Map.png │ ├── Portland.png │ ├── README.md │ ├── dataSet2.png │ ├── k-均值聚类的结果示意图.png │ ├── 二分k-均值算法.png │ └── 数据集.png ├── testSet.txt ├── testSet2.txt └── 对地图上的点进行聚类.py ├── ch11 ├── Apriori.py ├── README.md ├── bills20DataSet.txt ├── lawAssnRules.txt ├── meaning20.txt ├── mushroom.dat ├── recent100bills.txt ├── recent20bills.txt └── votesmart.py ├── ch12 ├── FP-growth.py ├── README.md ├── Twitter.py └── kosarak.zip ├── ch13 ├── README.md ├── iris.data.txt ├── matplotlib │ ├── README.md │ ├── 数据集.py │ ├── 方差百分比.py │ ├── 降维.py │ └── 降维2.py ├── pca.py ├── screenshot │ ├── README.md │ ├── 数据集.png │ ├── 方差百分比.png │ ├── 降维.png │ └── 降维2.png ├── secom.data ├── testSet.txt └── testSet3.txt ├── ch14 ├── 0_5.txt ├── README.md └── SVD.py ├── ch15 ├── README.md ├── err.txt ├── inputFile.txt ├── junk.txt ├── kickStart.txt ├── mrMean.py ├── mrMeanMapper.py ├── mrMeanReducer.py ├── mrSVM.py ├── mrSVMkickStart.py ├── myfile.txt ├── myout.txt ├── pegasos.py ├── proximalSVM.py ├── py27dbg.py ├── svmDat2.txt ├── svmDat26 ├── svmDat27 ├── svmData.txt ├── testSet.txt ├── testSet200.txt └── wc.py ├── ch2 ├── KNN.txt ├── KNN(classify).py ├── KNN(datingTestSet).py ├── KNN(dating完整版).py ├── KNN(handwriting).py ├── README.md ├── creatDist.py ├── datingTestSet.txt ├── datingTraningSet.txt ├── testDigits.zip ├── testSet.txt └── trainingDigits.zip ├── ch3 ├── README.md ├── TheTree.txt ├── calcShannonEnt.py ├── chooseBestFeatureToSplit.py ├── createTree.py ├── lenses.txt ├── lensesTree.txt ├── plotTree.py └── screenshot │ ├── TheTree.png │ └── lensesTree.png ├── ch4 ├── README.md ├── advertisement.py ├── classifyNB && testingNB.py ├── email.py ├── ham.zip ├── matplotlib │ ├── README.md │ ├── math_matplotlib.py │ ├── matplotlib.py │ └── srceenshot │ │ ├── math_matplotlib.png │ │ └── matplotlib.png ├── spam.zip └── trainNB0.py ├── ch5 ├── README.md ├── matplotlib │ ├── sigmoid.py │ └── 梯度上升.py ├── screenshot │ ├── Logistic回归最佳拟合直线.png │ ├── sigmoid.png │ ├── 改进随机梯度上升.png │ ├── 梯度上升.png │ └── 随机梯度上升.png ├── testSet.txt ├── 使用梯度上升找最佳拟合直线.py ├── 改进随机梯度上升.py └── 随机梯度上升.py ├── ch6 ├── README.md ├── digits.zip ├── matplotlib │ ├── 4个线性不可分的数据集效果图.py │ ├── README.md │ ├── 完整版SMO效果图.py │ ├── 核方法中的非线性可分数据效果图.py │ └── 简化版SMO处理小数据集效果图.py ├── screenshot │ ├── 4个线性不可分的数据集.png │ ├── README.md │ ├── 完整版SMO.png │ ├── 核方法中的非线性可分数据.png │ └── 简化版SMO效果图.png ├── testSet.txt ├── testSetRBF.txt ├── testSetRBF2.txt ├── 完整版Platt SMO.py └── 简化版SMO处理小数据集.py ├── ch7 ├── Adaboost.py ├── README.md ├── horseColicTest2.txt ├── horseColicTraining2.txt ├── matplotlib │ └── 单层决策树测试数据.py └── screenshot │ ├── README.md │ ├── ROC曲线.png │ └── 单层决策树测试数据.png ├── ch8 ├── README.md ├── abalone.txt ├── ex0.txt ├── ex1.txt ├── matplotlib │ ├── 前向逐步回归.py │ ├── 局部加权线性回归.py │ ├── 岭回归.py │ └── 线性回归找到最佳拟合曲线.py ├── screenshot │ ├── README.md │ ├── 前向逐步回归.png │ ├── 局部加权线性回归(k=0.003).png │ ├── 局部加权线性回归(k=0.01).png │ ├── 局部加权线性回归(k=1.0).png │ ├── 岭回归.png │ ├── 数据分布.png │ └── 线性回归找到最佳拟合曲线.png ├── 前向逐步回归.py ├── 局部加权线性回归.py ├── 岭回归.py └── 线性回归找到最佳拟合曲线.py └── ch9 ├── CRAT算法用于回归.py ├── README.md ├── bikeSpeedVsIq_test.txt ├── bikeSpeedVsIq_train.txt ├── ex0.txt ├── ex00.txt ├── ex2.txt ├── ex2test.txt ├── exp.txt ├── exp2.txt ├── expTest.txt ├── matplotlib ├── 基于CART算法构建回归树的简单数据集.py ├── 放大100倍.py ├── 测试模型树构建函数的测试数据.py └── 用于测试回归树的分段常数数据集.py ├── regTrees.py ├── 
screenshot ├── README.md ├── 基于CART算法构建回归树的简单数据集.png ├── 放大100倍.png ├── 测试模型树构建函数的测试数据.png ├── 用于测试回归树的分段常数数据集.png └── 骑自行车速度.png ├── sine.txt └── treeExplore.py /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-in-Action 2 | Practices & Code from Machine-Learning-in-Action. 3 | 4 | 《机器学习实战》的笔记与代码。 5 | 6 | ## schedule 7 | ### PART 1 CLASSIFICATION 8 | 1. [x] chapter01 - Machine learning basics (机器学习基础) 9 | 2. [x] chapter02 - Classifying with k-Nearest Neighbors (k-近邻) 10 | 3. [x] chapter03 - Splitting datasets one feature at a time: decision trees (决策树) 11 | 4. [x] chapter04 - Classifying with probability theory: naive Bayes (朴素贝叶斯) 12 | 5. [x] chapter05 - Logistic regression (Logistic回归) 13 | 6. [x] chapter06 - Support vector machines (支持向量机) 14 | 7. [x] chapter07 - Improving classification with the AdaBoost meta-algorithm (利用AdaBoost元算法提高分类性能) 15 | 16 | ### PART 2 FORECASTING NUMERIC VALUES WITH REGRESSION 17 | 8. [x] chapter08 - Predicting numeric values: regression (预测数值型数据:回归) 18 | 9. [x] chapter09 - Tree-based regression (树回归) 19 | 20 | ### PART 3 UNSUPERVISED LEARNING 21 | 10. [x] chapter10 - Grouping unlabeled items using k-means clustering (利用K-均值聚类算法对未标注数据分组) 22 | 11. [x] chapter11 - Association analysis with the Apriori algorithm (使用Apriori算法进行关联分析) 23 | 12. [x] chapter12 - Efficiently finding frequent itemsets with FP-growth (使用FP-growth算法来高效发现频繁项集) 24 | 25 | ### PART 4 ADDITIONAL TOOLS 26 | 13. [x] chapter13 - Using principal component analysis to simplify data (利用PCA来简化数据) 27 | 14. [x] chapter14 - Simplifying data with the singular value decomposition (利用SVD简化数据) 28 | 15. [x] chapter15 - Big data and MapReduce (大数据与MapReduce) 29 | -------------------------------------------------------------------------------- /ch1/1.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | 4 | from random import * 5 | from numpy import * 6 | 7 | arr = random.rand(4,4) 8 | 9 | print arr 10 | 11 | randMat = mat(random.rand(4,4)) 12 | print randMat.I #逆矩阵 13 | 14 | print randMat*randMat.I #单位矩阵 15 | 16 | 17 | -------------------------------------------------------------------------------- /ch1/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Machine learning basics 3 | ## 学这本书之前,请把线性代数,统计学,python,numpy库,matplotlib等这些前置技能学好!!!不然将会学得很艰难!!! 
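As a quick sanity check that NumPy is set up, the block below builds a random 4×4 matrix, inverts it, and multiplies the two back together to get (approximately) the identity matrix. The repo's version uses Python 2 `print` statements; a Python 3 rendering of the same warm-up, added here purely for illustration, is:

```python
# Python 3 version of the NumPy warm-up below (illustrative; the repo's code targets Python 2).
import numpy as np

arr = np.random.rand(4, 4)             # 4x4 matrix of uniform random numbers
print(arr)

rand_mat = np.mat(np.random.rand(4, 4))
print(rand_mat.I)                       # inverse matrix
print(rand_mat * rand_mat.I)            # ~ identity matrix, up to floating-point error
```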
4 | ### None 5 | 6 | ``` 7 | from random import * 8 | from numpy import * 9 | 10 | arr = random.rand(4,4) 11 | 12 | print arr 13 | 14 | randMat = mat(random.rand(4,4)) 15 | print randMat.I #逆矩阵 16 | 17 | print randMat*randMat.I #单位矩阵 18 | ``` 19 | -------------------------------------------------------------------------------- /ch10/Portland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/Portland.png -------------------------------------------------------------------------------- /ch10/README.md: -------------------------------------------------------------------------------- 1 | # Ch10 - 利用K-均值聚类算法对未标注数据分组(Grouping unlabeled items using k-means clustering) 2 | 3 | #### 以前我们学的都是监督学习算法,现在我们开始学习无监督学习算法。 4 | #### 所谓无监督学习是指事先并不知道要寻找的内容,即没有目标变量。 5 | #### K-means是聚类算法,聚类是将相似的样本分到同一个簇中,类似全自动分类,根据簇内的对象越相似,聚类的效果就越好。K-means是可以发现k个不同的簇,而且每个簇的中心采用簇中所含值的均值计算而成。 聚类与分类的最大不同在于, 分类的目标事先已知,聚类的标签事先不知道。 6 | #### K-均值算法的伪代码如下: 7 | ### 创建k个点作为起始质心(通常随机选择) 8 | ``` 9 | 当任意一个点的簇分配结果发生改变时: 10 | 对数据集中的每个点: 11 | 对每个质心: 12 | 计算质心与数据点之间的距离 13 | 将数据点分配到距离其最近的簇 14 | 对每一个簇,计算簇中所有点的均值并将均值作为质心。 15 | ``` 16 | #### K-均值聚类算法接收4个参数,两个必要参数为数据集和k的值,另外两个为距离计算函数和初始化函数(可修改)。算法采用计算质心-分配-重新计算质心反复迭代的方式,直到所有点的分配结果不再改变。设置flag为clusterChange=True。 17 | 18 | #### 聚类算法中,k的值是由用户初始定义的,如何才能判断k值定义是否合适,就需要用误差来评价聚类效果的好坏,误差是各个点与其所属类别质心的距离决定的。K-均值聚类的方法效果较差的原因是会收敛到局部最小值,而且全局最小。一种评价聚类效果的方法是SSE(Sum of Squared Error)误差平方和的方法,取平方的结果是使得远离中心的点变得更加突出。 一种降低SSE的方法是增加簇的个数,即提高k值,但是违背了聚类的目标,聚类的目标是在不改变簇数目的前提下提高簇的质量。可选的改进的方法是对生成的簇进行后处理,将最大SSE值的簇划分成两个(K=2的K-均值算法),然后再进行相邻的簇合并。具体方法有两种:1、合并最近的两个质心(合并使得SSE增幅最小的两个质心)2、遍历簇 合并两个然后计算SSE的值,找到使得SSE最小的情况。 19 | 20 | ## 测试K-means的数据集 21 | ![数据集.png](screenshot/数据集.png) 22 | 23 | ## 运行k-均值聚类的结果示意图 24 | ![k-均值聚类的结果示意图](screenshot/k-均值聚类的结果示意图.png) 25 | 26 | #### 但是K-means的聚类效果比较差,因为很容易收敛到局部最小值,而非全局最小值。所以我们要用新的方法(二分K-means)去改进K-means。 27 | 28 | ## 二分K-均值算法 29 | #### 二分K-均值类似后处理的切分思想,初始状态所有数据点属于一个大簇,之后每次选择一个簇切分成两个簇,这个切分满足使SSE值最大程度降低,直到簇数目达到k。另一种思路是每次选择SSE值最大的一个簇进行切分。 满足使SSE值最大程度降低伪代码如下: 30 | #### 将所有点看成一个簇。 31 | ``` 32 | 当簇数目小于k时 33 | 对于每一个簇: 34 | 计算总误差 35 | 在给定的簇上面进行K-均值聚类(k=2) 36 | 计算将该簇一分为二后的总误差 37 | 选择使得误差最小的那个簇进行划分操作 38 | ``` 39 | ## 测试二分K-means的数据集: 40 | ![dataSet2.png](screenshot/dataSet2.png) 41 | 42 | ## 运行二分K-means的效果图: 43 | ![二分k-均值算法.png](screenshot/二分k-均值算法.png) 44 | 45 | # 示例:对地图上的点聚类 46 | ## 原地图: 47 | ![Portland.png](screenshot/Portland.png) 48 | 49 | ## 经过二分K-means算法处理后的图:(簇为5)(簇的数目可以手动改变) 50 | ![Map.png](screenshot/Map.png) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /ch10/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib 2 | -------------------------------------------------------------------------------- /ch10/places.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 45.486502 -122.788346 2 | Hotties 10140 SW Canyon Rd. 
Beaverton, OR 45.493150 -122.781021 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 45.498187 -122.766147 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 45.485943 -122.800311 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 45.508203 -122.781853 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 45.493398 -122.779628 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 45.430319 -122.376304 8 | 505 Club 505 Burnside Rd Gresham, OR 45.507621 -122.425553 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 45.399070 -122.618893 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 45.427072 -122.634159 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 45.462173 -122.638846 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 45.485396 -122.646587 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 45.464826 -122.699212 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 45.646831 -122.842918 15 | Cabaret II 17544 Stark St Portland, OR 45.519142 -122.482480 16 | Cabaret Lounge 503 W Burnside Portland, OR 45.523094 -122.675528 17 | Carnaval 330 SW 3rd Avenue Portland, OR 45.520682 -122.674206 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 45.543016 -122.720828 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 45.562715 -122.593078 20 | Club 205 9939 Stark St Portland, OR 45.519052 -122.561510 21 | Club Rouge 403 SW Stark Portland, OR 45.520561 -122.675605 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 45.584124 -122.682725 23 | Devil's Point 5305 SE Foster Rd Portland, OR 45.495365 -122.608366 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 45.497750 -122.524073 25 | Dream on Saloon 15920 Stark St Portland, OR 45.519142 -122.499672 26 | DV8 5003 Powell Blvd Portland, OR 45.497498 -122.611177 27 | Exotica 240 Columbia Blvd Portland, OR 45.583048 -122.668350 28 | Frolics 8845 Sandy Blvd Portland, OR 45.555384 -122.571475 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 45.554263 -122.574167 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 45.547229 -122.578746 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 45.484823 -122.589208 32 | Glimmers 3532 Powell Blvd Portland, OR 45.496918 -122.627920 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 45.520714 -122.674189 34 | Heat 12131 SE Holgate Blvd. Portland, OR 45.489637 -122.538196 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 45.548651 -122.578730 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 45.497765 -122.523985 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 45.555811 -122.600881 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 45.526306 -122.538833 39 | Landing Strip 6210 Columbia Blvd Portland, OR 45.595042 -122.728825 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 45.501585 -122.659310 41 | Lure 11051 Barbur Blvd Portland, OR 45.445233 -122.732606 42 | Magic Garden 217 4th Ave Portland, OR 45.524692 -122.674466 43 | Mary's Club 129 Broadway Portland, OR 45.535101 -122.667390 44 | Montego's 15826 SE Division Portland, OR 45.504448 -122.500034 45 | Mr. Peeps 709 122nd Ave Portland, OR 45.527863 -122.537726 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 45.532426 -122.628865 47 | Mystic 9950 SE Stark St. 
Portland, OR 45.519037 -122.561283 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 45.540098 -122.641114 49 | Oh Zone 6218 Columbia Blvd Portland, OR 45.595069 -122.728961 50 | Pallas Club 13639 Powell Blvd Portland, OR 45.497990 -122.522849 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 45.549288 -122.586505 52 | Private Pleasures 10931 53rd Ave Portland, OR 45.446442 -122.731034 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 45.547337 -122.578744 54 | Riverside Corral 545 Tacoma St Portland, OR 45.464338 -122.660285 55 | Rooster's 605 Columbia Blvd Portland, OR 45.583693 -122.672462 56 | Rose City Strip 3620 35th Pl Portland, OR 45.496601 -122.627688 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 45.497091 -122.634581 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 45.517225 -122.656367 59 | Secret Rendezvous 12503 Division St Portland, OR 45.504087 -122.534481 60 | Shimmers 7944 Foster Rd Portland, OR 45.483836 -122.581608 61 | Soobie's 333 SE 122nd Ave Portland, OR 45.520162 -122.537787 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 45.523370 -122.672388 63 | Sugar Shack 6732 Killingsworth St Portland, OR 45.562699 -122.593048 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 45.512220 -122.655527 65 | Tommy's Too 10335 Foster Rd Portland, OR 45.476721 -122.557005 66 | Union Jacks 938 Burnside St Portland, OR 45.522902 -122.656249 67 | Video Visions 6723 Killingsworth St Portland, OR 45.562715 -122.593078 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 45.425788 -122.765754 69 | Jiggles 7455 SW Nyberg St Tualatin, OR 45.382682 -122.753932 70 | -------------------------------------------------------------------------------- /ch10/portlandClubs.txt: -------------------------------------------------------------------------------- 1 | Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 2 | Hotties 10140 SW Canyon Rd. Beaverton, OR 3 | Pussycats 8666a SW Canyon Road Beaverton, OR 4 | Stars Cabaret 4570 Lombard Ave Beaverton, OR 5 | Sunset Strip 10205 SW Park Way Beaverton, OR 6 | Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 7 | Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 8 | 505 Club 505 Burnside Rd Gresham, OR 9 | Dolphin 17180 McLoughlin Blvd Milwaukie, OR 10 | Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 11 | Acropolis 8325 McLoughlin Blvd Portland, OR 12 | Blush 5145 SE McLoughlin Blvd Portland, OR 13 | Boom Boom Room 8345 Barbur Blvd Portland, OR 14 | Bottoms Up 16900 Saint Helens Rd Portland, OR 15 | Cabaret II 17544 Stark St Portland, OR 16 | Cabaret Lounge 503 W Burnside Portland, OR 17 | Carnaval 330 SW 3rd Avenue Portland, OR 18 | Casa Diablo 2839 NW St. Helens Road Portland, OR 19 | Chantilly Lace 6723 Killingsworth St Portland, OR 20 | Club 205 9939 Stark St Portland, OR 21 | Club Rouge 403 SW Stark Portland, OR 22 | Dancin' Bare 8440 Interstate Ave Portland, OR 23 | Devil's Point 5305 SE Foster Rd Portland, OR 24 | Double Dribble 13550 Southeast Powell Boulevard Portland, OR 25 | Dream on Saloon 15920 Stark St Portland, OR 26 | DV8 5003 Powell Blvd Portland, OR 27 | Exotica 240 Columbia Blvd Portland, OR 28 | Frolics 8845 Sandy Blvd Portland, OR 29 | G-Spot Airport 8654 Sandy Blvd Portland, OR 30 | G-Spot Northeast 3400 NE 82nd Ave Portland, OR 31 | G-Spot Southeast 5241 SE 72nd Ave Portland, OR 32 | Glimmers 3532 Powell Blvd Portland, OR 33 | Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 34 | Heat 12131 SE Holgate Blvd. 
Portland, OR 35 | Honeysuckle's Lingerie 3520 82nd Ave Portland, OR 36 | Hush Playhouse 13560 Powell Blvd Portland, OR 37 | JD's Bar & Grill 4523 NE 60th Ave Portland, OR 38 | Jody's Bar And Grill 12035 Glisan St Portland, OR 39 | Landing Strip 6210 Columbia Blvd Portland, OR 40 | Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 41 | Lure 11051 Barbur Blvd Portland, OR 42 | Magic Garden 217 4th Ave Portland, OR 43 | Mary's Club 129 Broadway Portland, OR 44 | Montego's 15826 SE Division Portland, OR 45 | Mr. Peeps 709 122nd Ave Portland, OR 46 | Mynt Gentlemen's Club 3390 NE Sandy Blvd Portland, OR 47 | Mystic 9950 SE Stark St. Portland, OR 48 | Nicolai Street Clubhouse 2460 24th Ave Portland, OR 49 | Oh Zone 6218 Columbia Blvd Portland, OR 50 | Pallas Club 13639 Powell Blvd Portland, OR 51 | Pirates Cove 7427 Sandy Blvd Portland, OR 52 | Private Pleasures 10931 53rd Ave Portland, OR 53 | Pussycats 3414 Northeast 82nd Avenue Portland, OR 54 | Riverside Corral 545 Tacoma St Portland, OR 55 | Rooster's 605 Columbia Blvd Portland, OR 56 | Rose City Strip 3620 35th Pl Portland, OR 57 | Safari Show Club 3000 SE Powell Blvd Portland, OR 58 | Sassy's Bar & Grill 927 Morrison St Portland, OR 59 | Secret Rendezvous 12503 Division St Portland, OR 60 | Shimmers 7944 Foster Rd Portland, OR 61 | Soobie's 333 SE 122nd Ave Portland, OR 62 | Spyce Gentleman's Club 33 NW 2nd Ave Portland, OR 63 | Sugar Shack 6732 Killingsworth St Portland, OR 64 | The Hawthorne Strip 1008 Hawthorne Blvd Portland, OR 65 | Tommy's Too 10335 Foster Rd Portland, OR 66 | Union Jacks 938 Burnside St Portland, OR 67 | Video Visions 6723 Killingsworth St Portland, OR 68 | Stars Cabaret Bridgeport 17939 SW McEwan Rd Tigard, OR 69 | Jiggles 7455 SW Nyberg St Tualatin, OR -------------------------------------------------------------------------------- /ch10/screenshot/Map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/Map.png -------------------------------------------------------------------------------- /ch10/screenshot/Portland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/Portland.png -------------------------------------------------------------------------------- /ch10/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch10/screenshot/dataSet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/dataSet2.png -------------------------------------------------------------------------------- /ch10/screenshot/k-均值聚类的结果示意图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/k-均值聚类的结果示意图.png -------------------------------------------------------------------------------- /ch10/screenshot/二分k-均值算法.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/二分k-均值算法.png -------------------------------------------------------------------------------- /ch10/screenshot/数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch10/screenshot/数据集.png -------------------------------------------------------------------------------- /ch10/testSet.txt: -------------------------------------------------------------------------------- 1 | 1.658985 4.285136 2 | -3.453687 3.424321 3 | 4.838138 -1.151539 4 | -5.379713 -3.362104 5 | 0.972564 2.924086 6 | -3.567919 1.531611 7 | 0.450614 -3.302219 8 | -3.487105 -1.724432 9 | 2.668759 1.594842 10 | -3.156485 3.191137 11 | 3.165506 -3.999838 12 | -2.786837 -3.099354 13 | 4.208187 2.984927 14 | -2.123337 2.943366 15 | 0.704199 -0.479481 16 | -0.392370 -3.963704 17 | 2.831667 1.574018 18 | -0.790153 3.343144 19 | 2.943496 -3.357075 20 | -3.195883 -2.283926 21 | 2.336445 2.875106 22 | -1.786345 2.554248 23 | 2.190101 -1.906020 24 | -3.403367 -2.778288 25 | 1.778124 3.880832 26 | -1.688346 2.230267 27 | 2.592976 -2.054368 28 | -4.007257 -3.207066 29 | 2.257734 3.387564 30 | -2.679011 0.785119 31 | 0.939512 -4.023563 32 | -3.674424 -2.261084 33 | 2.046259 2.735279 34 | -3.189470 1.780269 35 | 4.372646 -0.822248 36 | -2.579316 -3.497576 37 | 1.889034 5.190400 38 | -0.798747 2.185588 39 | 2.836520 -2.658556 40 | -3.837877 -3.253815 41 | 2.096701 3.886007 42 | -2.709034 2.923887 43 | 3.367037 -3.184789 44 | -2.121479 -4.232586 45 | 2.329546 3.179764 46 | -3.284816 3.273099 47 | 3.091414 -3.815232 48 | -3.762093 -2.432191 49 | 3.542056 2.778832 50 | -1.736822 4.241041 51 | 2.127073 -2.983680 52 | -4.323818 -3.938116 53 | 3.792121 5.135768 54 | -4.786473 3.358547 55 | 2.624081 -3.260715 56 | -4.009299 -2.978115 57 | 2.493525 1.963710 58 | -2.513661 2.642162 59 | 1.864375 -3.176309 60 | -3.171184 -3.572452 61 | 2.894220 2.489128 62 | -2.562539 2.884438 63 | 3.491078 -3.947487 64 | -2.565729 -2.012114 65 | 3.332948 3.983102 66 | -1.616805 3.573188 67 | 2.280615 -2.559444 68 | -2.651229 -3.103198 69 | 2.321395 3.154987 70 | -1.685703 2.939697 71 | 3.031012 -3.620252 72 | -4.599622 -2.185829 73 | 4.196223 1.126677 74 | -2.133863 3.093686 75 | 4.668892 -2.562705 76 | -2.793241 -2.149706 77 | 2.884105 3.043438 78 | -2.967647 2.848696 79 | 4.479332 -1.764772 80 | -4.905566 -2.911070 81 | -------------------------------------------------------------------------------- /ch10/testSet2.txt: -------------------------------------------------------------------------------- 1 | 3.275154 2.957587 2 | -3.344465 2.603513 3 | 0.355083 -3.376585 4 | 1.852435 3.547351 5 | -2.078973 2.552013 6 | -0.993756 -0.884433 7 | 2.682252 4.007573 8 | -3.087776 2.878713 9 | -1.565978 -1.256985 10 | 2.441611 0.444826 11 | -0.659487 3.111284 12 | -0.459601 -2.618005 13 | 2.177680 2.387793 14 | -2.920969 2.917485 15 | -0.028814 -4.168078 16 | 3.625746 2.119041 17 | -3.912363 1.325108 18 | -0.551694 -2.814223 19 | 2.855808 3.483301 20 | -3.594448 2.856651 21 | 0.421993 -2.372646 22 | 1.650821 3.407572 23 | -2.082902 3.384412 24 | -0.718809 -2.492514 25 | 4.513623 3.841029 26 | -4.822011 4.607049 27 | -0.656297 -1.449872 28 | 1.919901 4.439368 29 | -3.287749 3.918836 30 | -1.576936 -2.977622 31 | 3.598143 1.975970 32 | -3.977329 4.900932 33 | -1.791080 -2.184517 34 | 3.914654 3.559303 35 | 
-1.910108 4.166946 36 | -1.226597 -3.317889 37 | 1.148946 3.345138 38 | -2.113864 3.548172 39 | 0.845762 -3.589788 40 | 2.629062 3.535831 41 | -1.640717 2.990517 42 | -1.881012 -2.485405 43 | 4.606999 3.510312 44 | -4.366462 4.023316 45 | 0.765015 -3.001270 46 | 3.121904 2.173988 47 | -4.025139 4.652310 48 | -0.559558 -3.840539 49 | 4.376754 4.863579 50 | -1.874308 4.032237 51 | -0.089337 -3.026809 52 | 3.997787 2.518662 53 | -3.082978 2.884822 54 | 0.845235 -3.454465 55 | 1.327224 3.358778 56 | -2.889949 3.596178 57 | -0.966018 -2.839827 58 | 2.960769 3.079555 59 | -3.275518 1.577068 60 | 0.639276 -3.412840 61 | -------------------------------------------------------------------------------- /ch11/Apriori.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | 6 | def loadDataSet(): 7 | return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] 8 | 9 | # C1 是大小为1的所有候选项集的集合 10 | def createC1(dataSet): 11 | C1 = [] # 存储不重复的项值 12 | for transaction in dataSet: 13 | for item in transaction: 14 | if not [item] in C1: 15 | C1.append([item]) #store all the item unrepeatly 16 | 17 | C1.sort() 18 | #return map(frozenset, C1)#frozen set, user can't change it. 19 | return list(map(frozenset, C1)) 20 | 21 | #该函数用于从 C1 生成 L1 22 | def scanD(D,Ck,minSupport): 23 | #参数:数据集、候选项集列表 Ck以及感兴趣项集的最小支持度 minSupport 24 | ssCnt={} 25 | for tid in D:#遍历数据集 26 | for can in Ck:#遍历候选项 27 | if can.issubset(tid):#判断候选项中是否含数据集的各项 28 | #if not ssCnt.has_key(can): # python3 can not support 29 | if not can in ssCnt: 30 | ssCnt[can]=1 #不含设为1 31 | else: ssCnt[can]+=1#有则计数加1 32 | numItems=float(len(D))#数据集大小 33 | retList = []#L1初始化 34 | supportData = {}#记录候选项中各个数据的支持度 35 | for key in ssCnt: 36 | support = ssCnt[key]/numItems#计算支持度 37 | if support >= minSupport: 38 | retList.insert(0,key)#满足条件加入L1中 39 | supportData[key] = support 40 | return retList, supportData 41 | 42 | #total apriori 43 | def aprioriGen(Lk, k): #组合,向上合并 44 | #creates Ck 参数:频繁项集列表 Lk 与项集元素个数 k 45 | retList = [] 46 | lenLk = len(Lk) 47 | for i in range(lenLk): 48 | for j in range(i+1, lenLk): #两两组合遍历 49 | L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] 50 | L1.sort(); L2.sort() 51 | if L1==L2: #若两个集合的前k-2个项相同时,则将两个集合合并 52 | retList.append(Lk[i] | Lk[j]) #set union 53 | return retList 54 | 55 | #apriori 56 | def apriori(dataSet, minSupport = 0.5): 57 | C1 = createC1(dataSet) 58 | D = list(map(set, dataSet)) #python3 59 | L1, supportData = scanD(D, C1, minSupport)#单项最小支持度判断 0.5,生成L1 60 | L = [L1] 61 | k = 2 62 | while (len(L[k-2]) > 0):#创建包含更大项集的更大列表,直到下一个大的项集为空 63 | Ck = aprioriGen(L[k-2], k)#Ck 64 | Lk, supK = scanD(D, Ck, minSupport)#get Lk 65 | supportData.update(supK) 66 | L.append(Lk) 67 | k += 1 68 | return L, supportData 69 | 70 | #生成关联规则 71 | def generateRules(L, supportData, minConf=0.7): 72 | #频繁项集列表、包含那些频繁项集支持数据的字典、最小可信度阈值 73 | bigRuleList = [] #存储所有的关联规则 74 | for i in range(1, len(L)): #只获取有两个或者更多集合的项目,从1,即第二个元素开始,L[0]是单个元素的 75 | # 两个及以上的才可能有关联一说,单个元素的项集不存在关联问题 76 | for freqSet in L[i]: 77 | H1 = [frozenset([item]) for item in freqSet] 78 | #该函数遍历L中的每一个频繁项集并对每个频繁项集创建只包含单个元素集合的列表H1 79 | if (i > 1): 80 | #如果频繁项集元素数目超过2,那么会考虑对它做进一步的合并 81 | rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) 82 | else:#第一层时,后件数为1 83 | calcConf(freqSet, H1, supportData, bigRuleList, minConf)# 调用函数2 84 | return bigRuleList 85 | 86 | #生成候选规则集合:计算规则的可信度以及找到满足最小可信度要求的规则 87 | def calcConf(freqSet, H, supportData, brl, minConf=0.7): 88 | #针对项集中只有两个元素时,计算可信度 89 | prunedH = []#返回一个满足最小可信度要求的规则列表 
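# (added note) For each candidate consequent, confidence(freqSet - conseq -> conseq)
# = support(freqSet) / support(freqSet - conseq); the loop below computes this value
# and keeps only the consequents whose confidence clears minConf.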
90 | for conseq in H:#后件,遍历 H中的所有项集并计算它们的可信度值 91 | conf = supportData[freqSet]/supportData[freqSet-conseq] #可信度计算,结合支持度数据 92 | if conf >= minConf: 93 | print (freqSet-conseq,'-->',conseq,"conf:",(conf)) # k可信度值 94 | #如果某条规则满足最小可信度值,那么将这些规则输出到屏幕显示 95 | brl.append((freqSet-conseq, conseq, conf))#添加到规则里,brl 是前面通过检查的 bigRuleList 96 | prunedH.append(conseq)#同样需要放入列表到后面检查 97 | return prunedH 98 | 99 | #合并 100 | def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): 101 | #参数:一个是频繁项集,另一个是可以出现在规则右部的元素列表 H 102 | m = len(H[0]) 103 | if (len(freqSet) > (m + 1)): #频繁项集元素数目大于单个集合的元素数 104 | Hmp1 = aprioriGen(H, m+1)#存在不同顺序、元素相同的集合,合并具有相同部分的集合 105 | Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)#计算可信度 106 | if (len(Hmp1) > 1): 107 | #满足最小可信度要求的规则列表多于1,则递归来判断是否可以进一步组合这些规则 108 | rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf) 109 | 110 | if __name__ == "__main__" : 111 | dataSet = loadDataSet() 112 | # print dataSet 113 | L, supportData = apriori(dataSet) 114 | print L # L 包含满足最小支持度为0.5的频繁项集列表 115 | # print supportData 116 | rules = generateRules(L,supportData,minConf=0.5) 117 | print rules -------------------------------------------------------------------------------- /ch11/README.md: -------------------------------------------------------------------------------- 1 | # Ch11 - 使用Apriori算法进行关联分析(Association analysis with the Apriori algorithm) 2 | 3 | #### 从大规模数据集中寻找物品间的隐含关系被称作关联分析(association analysis)或者关联规则学习(association rule learning)。 关联分析是一种在大规模数据集中寻找有趣关系的任务。这些关系可以有两种形式:频繁项集或者关联规则。 4 | 5 | #### 频繁项集(frequent item sets)是经常出现在一块的物品的集合。 6 | #### 关联规则(association rules)暗示两种物品之间可能存在很强的关系。 7 | 8 | #### 一个项集的支持度(support):被定义为数据集中包含该项集的记录所占的比例。支持度是针对项集来说的,因此可以定义一个最小支持度,而只保留满足最小支持度的项集。 9 | 10 | #### 可信度或置信度(confidence):是针对一条诸如{尿布} ➞ {葡萄酒}的关联规则来定义的。这 条规则的可信度被定义为“支持度({尿布, 葡萄酒})/支持度({尿布})”。从图11-1中可以看到,由 于{尿布, 葡萄酒}的支持度为3/5,尿布的支持度为4/5,所以“尿布 ➞ 葡萄酒”的可信度为3/4=0.75。 这意味着对于包含“尿布”的所有记录,我们的规则对其中75%的记录都适用。(类似条件概率)。 11 | 12 | ## Apriori 原理: 13 | #### Apriori原理可以帮我们减少可能感兴趣的项集。Apriori原理是说如果某个项集是频繁的,那么它的所有子集也是频繁的。这个原理直观上并没有什么帮助,但是如果反过来看就有用了,也就是说如果一个项集是非频繁集那么它的所有超集也是非频繁的。 14 | 15 | 16 | #### 关联分析的目标包括两项:发现频繁项集和发现关联规则。首先需要找到频繁 项集,然后才能获得关联规则。 17 | #### Apriori算法是发现频繁项集的一种方法。该算法首先会生成所有单个物品的项集列表。接着扫描交易记录来查看哪些项集满足最小支持度要求,那些不满足最小支持度的集合会被去掉。然后,对剩下来的集合进行组合以生成包含两个元素的项集。接下来,再重新扫描交易记录,去掉不满足最小支持度的项集。该过程重复进行直到所有项集都被去掉。 18 | 19 | 20 | ## 整个Apriori算法的伪代码如下: 21 | ``` 22 | 当集合中项的个数大于0时: 23 | 构建一个k个项组成的候选项集的列表 24 | 检查数据以确认每个项集都是频繁的 25 | 保留频繁项集并构建k+1项组成的候选项集的列表(向上合并) 26 | ``` 27 | 28 | #### 函数 aprioriGen() 的输入参数为频繁项集列表 Lk 与项集元素个数 k ,输出为 Ck 。举例来说,该函数以{0}、{1}、{2}作为输入,会生成{0,1}、{0,2}以及{1,2}。要完成这一点,首先创建一个空列表,然后计算 Lk 中的元素数目。通过循环来比较 Lk 中的每一个元素与其他元素,紧接着,取列表中的两个集合进行比较。如果这两个集合的前面 k-2 个元素都相等,那么就将这两个集合合成一个大小为 k 的集合 。这里使用集合的并操作来完成。 29 | 30 | ## 从频繁项集中挖掘关联规则 31 | 32 | ### 如果某条规则并不满足最小可信度要求,那么该规则的所有子集也不会满足最小可信度要求。可以利用关联规则的这条性质属性来减少需要测试的规则数目。 33 | 34 | 35 | ## 小总结: 36 | 37 | #### 关联分析是用于发现大数据集中元素间有趣关系的一个工具集,可以采用两种方式来量化这些有趣的关系。 38 | #### 第一种方式是使用频繁项集,它会给出经常在一起出现的元素项。 39 | #### 第二种方式是关联规则,每条关联规则意味着元素项之间的“如果……那么”关系。 40 | #### Apriori的方法简化了计算量,可以在合理的时间范围内找到频繁项集。 41 | #### Apriori原理是说如果一个元素项是不频繁的,那么那些包含该元素的超集也是不频繁的。 42 | #### Apriori算法从单元素项集开始,通过组合满足最小支持度要求的项集来形成更大的集合。支持度用来度量一个集合在原始数据中出现的频率。 43 | #### 每次增加频繁项集的大小,Apriori算法都会重新扫描整个数据集。当数据集很大时,这会显著降低频繁项集发现的速度。 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /ch11/meaning20.txt: -------------------------------------------------------------------------------- 1 | (lp0 2 | S'Republican' 3 | p1 4 | aS'Democratic' 5 | p2 6 | aS'Prohibiting 
Federal Funding of National Public Radio -- Nay' 7 | p3 8 | aS'Prohibiting Federal Funding of National Public Radio -- Yea' 9 | p4 10 | aS'Removing Troops from Afghanistan -- Nay' 11 | p5 12 | aS'Removing Troops from Afghanistan -- Yea' 13 | p6 14 | aS'Terminating the Home Affordable Modification Program -- Nay' 15 | p7 16 | aS'Terminating the Home Affordable Modification Program -- Yea' 17 | p8 18 | aS'Repealing the Health Care Bill -- Nay' 19 | p9 20 | aS'Repealing the Health Care Bill -- Yea' 21 | p10 22 | aS'Science and Technology Funding -- Nay' 23 | p11 24 | aS'Science and Technology Funding -- Yea' 25 | p12 26 | aS'"Whistleblower Protection" for Offshore Oil Workers -- Nay' 27 | p13 28 | aS'"Whistleblower Protection" for Offshore Oil Workers -- Yea' 29 | p14 30 | aS'Repealing "Don\'t Ask, Don\'t Tell" After Military Review and Certification -- Nay' 31 | p15 32 | aS'Repealing "Don\'t Ask, Don\'t Tell" After Military Review and Certification -- Yea' 33 | p16 34 | aS'Unemployment Benefits Extension -- Nay' 35 | p17 36 | aS'Unemployment Benefits Extension -- Yea' 37 | p18 38 | aS'Unemployment Benefits Extension -- Nay' 39 | p19 40 | aS'Unemployment Benefits Extension -- Yea' 41 | p20 42 | aS'Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase -- Nay' 43 | p21 44 | aS'Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase -- Yea' 45 | p22 46 | aS'Prohibiting Use of Federal Funds For Planned Parenthood -- Nay' 47 | p23 48 | aS'Prohibiting Use of Federal Funds For Planned Parenthood -- Yea' 49 | p24 50 | aS'Reducing Federal Funding of the US Institute of Peace -- Nay' 51 | p25 52 | aS'Reducing Federal Funding of the US Institute of Peace -- Yea' 53 | p26 54 | aS'Prohibiting the Use of Federal Funds for NASCAR Sponsorships -- Nay' 55 | p27 56 | aS'Prohibiting the Use of Federal Funds for NASCAR Sponsorships -- Yea' 57 | p28 58 | aS'Mine Safety Act -- Nay' 59 | p29 60 | aS'Mine Safety Act -- Yea' 61 | p30 62 | a. 
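A hypothetical end-to-end sketch for this chapter's voting example (not a listing from the repo): in practice `transDict` and `itemMeaning` come from `getTransList()` in votesmart.py below; the two-politician toy data and the 0.3 / 0.95 thresholds here are fabricated so the snippet runs stand-alone, and the print is kept Python-2-compatible because Apriori.py's `__main__` uses Python 2 print statements.

```python
# Illustrative sketch: mine association rules from voting transactions shaped like the
# ones votesmart.getTransList() builds. Toy data and thresholds are placeholders.
from Apriori import apriori, generateRules

itemMeaning = ['Republican', 'Democratic', 'Bill A -- Nay', 'Bill A -- Yea']
transDict = {'Rep. X': [0, 2], 'Rep. Y': [1, 3]}     # party + vote item numbers

dataSet = [transDict[name] for name in transDict]    # one transaction per politician
L, suppData = apriori(dataSet, minSupport=0.3)       # frequent itemsets
rules = generateRules(L, suppData, minConf=0.95)     # high-confidence rules
for antecedent, consequent, conf in rules:
    print('%s --> %s  conf: %.3f' % ([itemMeaning[i] for i in antecedent],
                                     [itemMeaning[i] for i in consequent], conf))
```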
-------------------------------------------------------------------------------- /ch11/recent20bills.txt: -------------------------------------------------------------------------------- 1 | 12939 Prohibiting Federal Funding of National Public Radio 2 | 12940 Removing Troops from Afghanistan 3 | 12830 Prioritizing Payment of Public Debt 4 | 12857 Calling for a Balanced Budget Constitutional Amendment 5 | 12988 Terminating the Home Affordable Modification Program 6 | 12040 Repealing Business Transaction Reporting Requirements 7 | 12465 Repealing the Health Care Bill 8 | 11451 Science and Technology Funding 9 | 11364 Credit Default Swap Regulations 10 | 11820 "Whistleblower Protection" for Offshore Oil Workers 11 | 12452 Treaty with Russia to Reduce and Limit Offensive Arms 12 | 11318 Derivatives Regulation Modifications 13 | 11414 Repealing "Don't Ask, Don't Tell" After Military Review and Certification 14 | 11719 Unemployment Benefits Extension 15 | 11205 Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase 16 | 12747 Prohibiting Use of Federal Funds For Planned Parenthood 17 | 12792 Reducing Federal Funding of the US Institute of Peace 18 | 12827 Prohibiting the Use of Federal Funds for NASCAR Sponsorships 19 | 12445 Mine Safety Act 20 | 12049 2010-2011 Defense Authorizations -------------------------------------------------------------------------------- /ch11/votesmart.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from time import sleep 4 | from votesmart import votesmart 5 | 6 | votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030' 7 | 8 | 9 | # votesmart.apikey = 'get your api key first' 10 | def getActionIds(): 11 | actionIdList = []; 12 | billTitleList = [] 13 | fr = open('recent20bills.txt') 14 | for line in fr.readlines(): 15 | billNum = int(line.split('\t')[0]) 16 | try: 17 | billDetail = votesmart.votes.getBill(billNum) # api call 18 | for action in billDetail.actions: 19 | if action.level == 'House' and \ 20 | (action.stage == 'Passage' or action.stage == 'Amendment Vote'): 21 | actionId = int(action.actionId) 22 | print 'bill: %d has actionId: %d' % (billNum, actionId) 23 | actionIdList.append(actionId) 24 | billTitleList.append(line.strip().split('\t')[1]) 25 | except: 26 | print "problem getting bill %d" % billNum 27 | sleep(1) # delay to be polite 28 | return actionIdList, billTitleList 29 | 30 | 31 | def getTransList(actionIdList, billTitleList): # this will return a list of lists containing ints 32 | itemMeaning = ['Republican', 'Democratic'] # list of what each item stands for 33 | for billTitle in billTitleList: # fill up itemMeaning list 34 | itemMeaning.append('%s -- Nay' % billTitle) 35 | itemMeaning.append('%s -- Yea' % billTitle) 36 | transDict = {} # list of items in each transaction (politician) 37 | voteCount = 2 38 | for actionId in actionIdList: 39 | sleep(3) 40 | print 'getting votes for actionId: %d' % actionId 41 | try: 42 | voteList = votesmart.votes.getBillActionVotes(actionId) 43 | for vote in voteList: 44 | if not transDict.has_key(vote.candidateName): 45 | transDict[vote.candidateName] = [] 46 | if vote.officeParties == 'Democratic': 47 | transDict[vote.candidateName].append(1) 48 | elif vote.officeParties == 'Republican': 49 | transDict[vote.candidateName].append(0) 50 | if vote.action == 'Nay': 51 | transDict[vote.candidateName].append(voteCount) 52 | elif vote.action == 'Yea': 53 | transDict[vote.candidateName].append(voteCount + 1) 54 | except: 55 | print "problem getting 
actionId: %d" % actionId 56 | voteCount += 2 57 | return transDict, itemMeaning -------------------------------------------------------------------------------- /ch12/README.md: -------------------------------------------------------------------------------- 1 | # Ch12 - 使用FP-growth算法来高效发现频繁项集(Efficiently finding frequent itemsets with FP-growth) 2 | 3 | #### (Frequent Pattern)FP-growth算法是一种高效发现频繁集的方法。它比Apriori算法要快,这里的任务是将数据集存储在一个特定的称作FP树的结构之后发现频繁项集或者频繁项对,这种做法使得算法的执行速度要快于Apriori两个数量级以上。 4 | 5 | #### FP-growth算法只需要对数据库进行两次扫描(第一次遍历:统计各个数据的频繁度(统计出现频率),第二次遍历:只考虑那些频繁元素),而Apriori算法对于每个潜在的频繁项集都会扫描数据集判定给定模式是否频繁,因此FP-growth算法的速度要比Apriori算法快。它发现频繁项集的基本过程如下: 6 | ``` 7 | (1) 构建FP树 8 | (2) 从FP树中挖掘频繁项集 9 | ``` 10 | #### FP-growth算法将数据存储在一种称为FP树的紧凑数据结构中。FP代表频繁模式(Frequent Pattern)。一棵FP树看上去与计算机科学中的其他树结构类似,但是它通过链接(link)来连接相似元素,被连起来的元素项可以看成一个链表。 11 | 12 | #### 在创建真正的频繁集FP树之前,需要对数据进行过滤(不符合频繁要求)和排序(按照频繁度排序)。利用头指针表,可以快速访问FP树中一个给定类型的所有元素。 13 | 14 | #### 同搜索树不同的是,一个元素项可以在一棵FP树中出现多次。FP树会存储项集的出现频率,而每个项集会以路径的方式存储在树中。存在相似元素的集合会共享树的一部分。只有当集合之间完全不同时,树才会分叉。 树节点上给出集合中的单个元素及其在序列中的出现次数,路径会给出该序列的出现次数。 15 | 16 | ### 从FP树中抽取频繁项集的三个基本步骤如下: 17 | ``` 18 | (1) 从FP树中获得条件模式基; 19 | (2) 利用条件模式基,构建一个条件FP树; 20 | (3) 迭代重复步骤(1)步骤(2),直到树包含一个元素项为止。 21 | ``` 22 | 23 | ## 条件模式基 24 | #### 条件模式基是以所查找元素项为结尾的路径集合,每一条路径其实都是一条前缀路径。 抽取出来每个频繁项集的前缀路径之后,用条件模式基构造条件FP树(t的条件FP树)。简单来说,对于最简单的1个元素的,去掉不满足支持度的元素,选出长度1的频繁项集,然后把给出的元素序列,构造FP树,然后对于每个长度为1的频繁项集,找出条件模式基(前缀路径),然后根据前缀路径,去掉不满足支持度的元素,满足支持度的元素,加上原来长度1的频繁项集,就构成了长度2的频繁项集,以此类推。这个代码递归实现即可。 25 | 26 | ## 示例:从新闻网站点击流中挖掘 27 | #### 在上传的源码中,有一个kosarak.dat文件,它包含将近100万条记录 。该文件中的每一行包含某个用户浏览过的新闻报道。一些用户只看过一篇报道,而有些用户看过2498篇报道,其中用户和报道被编码成整数。 28 | ``` 29 | # 需要创建一个空列表来保存这些频繁项集 30 | myFreqList = [] 31 | 32 | # 执行该代码,构建树以及扫描100万行只需要几秒钟 33 | mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList) 34 | 35 | # 看下有多少新闻报道或报道集合曾经被10万或者更多的人浏览过: 36 | print len(myFreqList)# 查询结果为9人 37 | 38 | # 看具体为哪9项: 39 | print myFreqList 40 | ``` 41 | ## 总结: 42 | #### growth算法是一种用于发现数据集中频繁模式的有效方法。FP-growth算法利用Apriori原则,速度更快。在FP-growth算法中,数据集存储在一个称为FP树的结构中。FP树构建完成后,可以通过查找元素项的条件基及构建条件FP树来发现频繁项集。该过程不断以更多元素作为条件重复进行,直到FP树只包含一个元素为止。 43 | #### 可以使用FP-growth算法在多种文本文档中查找频繁单词。频繁项集生成还有其他一些应用,比如购物交易,医学诊断,大气研究等。 44 | 45 | -------------------------------------------------------------------------------- /ch12/Twitter.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import twitter 4 | from time import sleep 5 | import re 6 | 7 | def textParse(bigString): 8 | urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) 9 | listOfTokens = re.split(r'\W*', urlsRemoved) 10 | return [tok.lower() for tok in listOfTokens if len(tok) > 2] 11 | 12 | def getLotsOfTweets(searchStr): 13 | CONSUMER_KEY = '' 14 | CONSUMER_SECRET = '' 15 | ACCESS_TOKEN_KEY = '' 16 | ACCESS_TOKEN_SECRET = '' 17 | api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, 18 | access_token_key=ACCESS_TOKEN_KEY, 19 | access_token_secret=ACCESS_TOKEN_SECRET) 20 | #you can get 1500 results 15 pages * 100 per page 21 | resultsPages = [] 22 | for i in range(1,15): 23 | print "fetching page %d" % i 24 | searchResults = api.GetSearch(searchStr, per_page=100, page=i) 25 | resultsPages.append(searchResults) 26 | sleep(6) 27 | return resultsPages 28 | 29 | def mineTweets(tweetArr, minSup=5): 30 | parsedList = [] 31 | for i in range(14): 32 | for j in range(100): 33 | parsedList.append(textParse(tweetArr[i][j].text)) 34 | initSet = createInitSet(parsedList) 35 | myFPtree, myHeaderTab = 
createTree(initSet, minSup) 36 | myFreqList = [] 37 | mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) 38 | return myFreqList 39 | -------------------------------------------------------------------------------- /ch12/kosarak.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch12/kosarak.zip -------------------------------------------------------------------------------- /ch13/README.md: -------------------------------------------------------------------------------- 1 | # Ch13 - 利用PCA来简化数据(Using principal component analysis to simplify data) 2 | 3 | #### 大量的数据往往拥有超出显示能力的更多特征,但数据显示并非大规模特征下的唯一难题,对数据进行简化也很重要,它可以: 4 | ``` 5 | 使得数据集更易使用。 6 | 降低很多算法的计算开销。 7 | 去除噪声。 8 | 使得结果易懂。 9 | ``` 10 | ## 第一种降维的方法称为主成分分析(Principal Component Analysis,PCA)。 11 | #### 在PCA中,数据从原来的坐标系转换到了新的坐标系,新坐标系的选择是由数据本身决定的。第一个新坐标轴选择的是原始数据中方差最大的方向,第二个新坐标轴的选择和第一个坐标轴正交且具有最大方差的方向。该过程一直重复,重复次数为原始数据中特征的数目。我们会发现,大部分方差都包含在最前面的几个新坐标轴中。因此,我们可以忽略余下的坐标轴,即对数据进行了降维处理。 12 | 13 | ## 另外一种降维技术是因子分析(Factor Analysis)。 14 | #### 在因子分析中,我们假设在观察数据的生成中有一些观察不到的隐变量(latent variable)。假设观察数据是这些隐变量和某些噪声的线性组合。那么隐变量的数据可能比观察数据的数目少,也就是说通过找到隐变量就可以实现数据的降维。 15 | 16 | ## 还有一种降维技术就是独立成分分析(Independent Component Analysis,ICA)。 17 | #### ICA假设数据是从 N 个数据源生成的,这一点和因子分析有些类似。假设数据为多个数据源的混合观察结果,这些数据源之间在统计上是相互独立的,而在PCA中只假设数据是不相关的。同因子分析一样,如果数据源的数目少于观察数据的数目,则可以实现降维过程。 18 | 19 | ## PCA伪代码: 20 | ``` 21 | 去除平均值 22 | 计算协方差矩阵 23 | 计算协方差矩阵的特征值和特征向量 24 | 将特征值从大到小排序 25 | 保留最上面的N个特征向量 26 | 将数据转换到上述N个特征向量构建的新空间中 27 | ``` 28 | ## 数据集 29 | ![数据集.png](screenshot/数据集.png) 30 | 31 | ## 降维(一) 32 | ![降维.png](screenshot/降维.png) 33 | 34 | #### 上图可以看出,将2维的数据降到1维的直观图。直线是一维。 35 | 36 | ## 降维(二) 37 | ![降维2.png](screenshot/降维2.png) 38 | 39 | ## 示例:利用 PCA 对半导体制造数据降维 40 | ### 总方差的百分比 41 | ![方差百分比.png](screenshot/方差百分比.png) 42 | #### 前六个主成分覆盖了数据96.8%的方差,而前20个主成分覆盖了99.3% 的方差。这就表明了,如果保留前6个而去除后584个主成分,我们就可以实现大概100∶1的压缩比。另外,由于舍弃了噪声的主成分,将后面的主成分去除便使得数据更加干净。 43 | 44 | ## 总结: 45 | #### 降维技术使得数据变得更易使用,并且它们往往能够去除数据中的噪声,使得其他机器学习任务更加精确。降维往往作为预处理步骤,在数据应用到其他算法之前清洗数据。有很多技术可以用于数据降维,在这些技术中,独立成分分析、因子分析和主成分分析比较流行,其中又以主成分分析应用最广泛。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /ch13/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib 2 | -------------------------------------------------------------------------------- /ch13/matplotlib/数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | n = 1000 #number of points to create 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet.txt','w') 15 | for i in range(n): 16 | [r0,r1] = random.standard_normal(2) 17 | fFlyer = r0 + 9.0 18 | tats = 1.0*r1 + fFlyer + 0 19 | xcord0.append(fFlyer) 20 | ycord0.append(tats) 21 | fw.write("%f\t%f\n" % (fFlyer, tats)) 22 | 23 | fw.close() 24 | fig = plt.figure() 25 | ax = fig.add_subplot(111) 26 | ax.scatter(xcord0,ycord0, marker='^', s=90) 27 | plt.xlabel('hours of direct sunlight') 28 | plt.ylabel('liters of water') 29 | plt.show() 30 | -------------------------------------------------------------------------------- /ch13/matplotlib/方差百分比.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from 
numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | dataMat = pca.replaceNanWithMean() 9 | 10 | #below is a quick hack copied from pca.pca() 11 | meanVals = mean(dataMat, axis=0) 12 | meanRemoved = dataMat - meanVals #remove mean 13 | covMat = cov(meanRemoved, rowvar=0) 14 | eigVals,eigVects = linalg.eig(mat(covMat)) 15 | eigValInd = argsort(eigVals) #sort, sort goes smallest to largest 16 | eigValInd = eigValInd[::-1]#reverse 17 | sortedEigVals = eigVals[eigValInd] 18 | total = sum(sortedEigVals) 19 | varPercentage = sortedEigVals/total*100 20 | 21 | fig = plt.figure() 22 | ax = fig.add_subplot(111) 23 | ax.plot(range(1, 21), varPercentage[:20], marker='^') 24 | plt.xlabel('Principal Component Number') 25 | plt.ylabel('Percentage of Variance') 26 | plt.show() -------------------------------------------------------------------------------- /ch13/matplotlib/降维.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | dataMat = pca.loadDataSet('testSet.txt') 9 | lowDMat, reconMat = pca.pca(dataMat, 1) 10 | # lowDMat, reconMat = pca.pca(dataMat,2) #保留原来的2维数据,画图后可看出,数据样本是重合的 11 | 12 | fig = plt.figure() 13 | ax = fig.add_subplot(111) 14 | ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0],marker='^',s=90) 15 | ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o',s=50,c='red') 16 | #由两维降为1维数据,降维后为一条红色直线,该方向是样本方差最大的方向,即样本离散程度最大的方向,该方向,将原来的2维数据融合为1维上 17 | plt.show() 18 | -------------------------------------------------------------------------------- /ch13/matplotlib/降维2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | import pca 7 | 8 | n = 1000 #number of points to create 9 | xcord0 = []; ycord0 = [] 10 | xcord1 = []; ycord1 = [] 11 | xcord2 = []; ycord2 = [] 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet3.txt','w') 15 | for i in range(n): 16 | groupNum = int(3*random.uniform()) 17 | [r0,r1] = random.standard_normal(2) 18 | if groupNum == 0: 19 | x = r0 + 16.0 20 | y = 1.0*r1 + x 21 | xcord0.append(x) 22 | ycord0.append(y) 23 | elif groupNum == 1: 24 | x = r0 + 8.0 25 | y = 1.0*r1 + x 26 | xcord1.append(x) 27 | ycord1.append(y) 28 | elif groupNum == 2: 29 | x = r0 + 0.0 30 | y = 1.0*r1 + x 31 | xcord2.append(x) 32 | ycord2.append(y) 33 | fw.write("%f\t%f\t%d\n" % (x, y, groupNum)) 34 | 35 | fw.close() 36 | fig = plt.figure() 37 | ax = fig.add_subplot(211) 38 | ax.scatter(xcord0,ycord0, marker='^', s=90) 39 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 40 | ax.scatter(xcord2,ycord2, marker='v', s=50, c='yellow') 41 | 42 | ax = fig.add_subplot(212) 43 | 44 | myDat = pca.loadDataSet('testSet3.txt') 45 | 46 | lowDDat,reconDat = pca.pca(myDat[:,0:2],1) 47 | 48 | label0Mat = lowDDat[nonzero(myDat[:,2]==0)[0],:2][0] #get the items with label 0 49 | label1Mat = lowDDat[nonzero(myDat[:,2]==1)[0],:2][0] #get the items with label 1 50 | label2Mat = lowDDat[nonzero(myDat[:,2]==2)[0],:2][0] #get the items with label 2 51 | 52 | #ax.scatter(label0Mat[:,0],label0Mat[:,1], marker='^', s=90) 53 | #ax.scatter(label1Mat[:,0],label1Mat[:,1], marker='o', s=50, c='red') 54 | #ax.scatter(label2Mat[:,0],label2Mat[:,1], marker='v', s=50, c='yellow') 55 | 56 | ax.scatter(label0Mat[:,0].flatten().A[0], 
zeros(shape(label0Mat)[0]), marker='^', s=90) 57 | ax.scatter(label1Mat[:,0].flatten().A[0], zeros(shape(label1Mat)[0]), marker='o', s=50, c='red') 58 | ax.scatter(label2Mat[:,0].flatten().A[0], zeros(shape(label2Mat)[0]), marker='v', s=50, c='yellow') 59 | plt.show() -------------------------------------------------------------------------------- /ch13/pca.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(fileName,delim='\t'): 6 | fr=open(fileName) 7 | #使用两个list来构建矩阵 8 | stringArr=[line.strip().split(delim) for line in fr.readlines()] 9 | datArr=[list(map(float,line)) for line in stringArr] 10 | return mat(datArr) 11 | 12 | # PCA算法 13 | def pca(dataMat,topNfeat=9999999): #topNfeat为可选参数,记录特征值个数 14 | meanVals=mean(dataMat,axis=0) #求均值 15 | meanRemoved=dataMat-meanVals #归一化数据 16 | covMat=cov(meanRemoved,rowvar=0) #求协方差 17 | eigVals,eigVects=linalg.eig(mat(covMat)) #计算特征值和特征向量 18 | eigValInd=argsort(eigVals) #对特征值进行排序,默认从小到大 19 | eigValInd=eigValInd[:-(topNfeat+1):-1] #逆序取得特征值最大的元素 20 | redEigVects=eigVects[:,eigValInd] #用特征向量构成矩阵 21 | lowDDataMat=meanRemoved*redEigVects #用归一化后的各个数据与特征矩阵相乘,映射到新的空间 22 | reconMat=(lowDDataMat*redEigVects.T)+meanVals #还原原始数据 23 | return lowDDataMat,reconMat 24 | 25 | def replaceNanWithMean(): #均值代替那些样本中的缺失值 26 | datMat = loadDataSet('secom.data', ' ') 27 | numFeat = shape(datMat)[1] 28 | for i in range(numFeat): 29 | meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number) # .A表示把矩阵转化为数组array 30 | #nonzero(~isnan(datMat[:,i].A))[0] 返回非0元素所在行的索引; 31 | #>>> nonzero([True,False,True]) 32 | # (array([0, 2]),) 第0个和第3个元素非0 33 | #~isnan()返回Ture or False 34 | datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean 35 | return datMat 36 | 37 | if __name__=="__main__": 38 | 39 | dataMat = loadDataSet("testSet.txt") 40 | print shape(dataMat) 41 | lowDmat,reconMat = pca(dataMat,1) 42 | print shape(lowDmat) # 变成一维矩阵 43 | 44 | dataMat = replaceNanWithMean() 45 | # 去除均值 46 | meanVals = mean(dataMat,axis = 0) 47 | meanRemoved = dataMat - meanVals 48 | # 计算协方差 49 | covMat = cov(meanRemoved,rowvar = 0) 50 | # 对矩阵进行特征值分析 51 | eigVals,eigVects = linalg.eig(mat(covMat)) 52 | 53 | #观察特征值结果 54 | print eigVals -------------------------------------------------------------------------------- /ch13/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch13/screenshot/数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/数据集.png -------------------------------------------------------------------------------- /ch13/screenshot/方差百分比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/方差百分比.png -------------------------------------------------------------------------------- /ch13/screenshot/降维.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/降维.png 
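One detail the ch13 README leaves implicit is how a figure like "the first six components cover 96.8% of the variance" translates into a choice of `topNfeat` for `pca.pca()`. A small helper, added here purely as an illustration (not part of the original repo):

```python
# Illustrative helper: pick how many principal components to keep so that a chosen
# fraction of the total variance is retained, given the eigenvalues from linalg.eig.
import numpy as np

def choose_num_components(eig_vals, var_threshold=0.95):
    vals = np.sort(np.real(np.asarray(eig_vals)))[::-1]   # eigenvalues, largest first
    cum_ratio = np.cumsum(vals) / np.sum(vals)             # cumulative variance fraction
    return int(np.searchsorted(cum_ratio, var_threshold) + 1)

# With the eigenvalues computed in pca.py's __main__ on the secom data, a threshold
# of about 0.968 should come back as roughly 6 components, matching the README.
```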
-------------------------------------------------------------------------------- /ch13/screenshot/降维2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch13/screenshot/降维2.png -------------------------------------------------------------------------------- /ch14/0_5.txt: -------------------------------------------------------------------------------- 1 | 00000000000000110000000000000000 2 | 00000000000011111100000000000000 3 | 00000000000111111110000000000000 4 | 00000000001111111111000000000000 5 | 00000000111111111111100000000000 6 | 00000001111111111111110000000000 7 | 00000000111111111111111000000000 8 | 00000000111111100001111100000000 9 | 00000001111111000001111100000000 10 | 00000011111100000000111100000000 11 | 00000011111100000000111110000000 12 | 00000011111100000000011110000000 13 | 00000011111100000000011110000000 14 | 00000001111110000000001111000000 15 | 00000011111110000000001111000000 16 | 00000011111100000000001111000000 17 | 00000001111100000000001111000000 18 | 00000011111100000000001111000000 19 | 00000001111100000000001111000000 20 | 00000001111100000000011111000000 21 | 00000000111110000000001111100000 22 | 00000000111110000000001111100000 23 | 00000000111110000000001111100000 24 | 00000000111110000000011111000000 25 | 00000000111110000000111111000000 26 | 00000000111111000001111110000000 27 | 00000000011111111111111110000000 28 | 00000000001111111111111110000000 29 | 00000000001111111111111110000000 30 | 00000000000111111111111000000000 31 | 00000000000011111111110000000000 32 | 00000000000000111111000000000000 33 | -------------------------------------------------------------------------------- /ch14/README.md: -------------------------------------------------------------------------------- 1 | # Ch14 - 利用SVD简化数据(Simplifying data with the singular value decomposition) 2 | 3 | #### 利用SVD实现,我们能够用小得多的数据集来表示原始数据集。这样做,实际上是去除了噪声和冗余信息。简而言之,SVD是一种从大量数据中提取主要关键数据的方法。 4 | 5 | ## 应用场景 6 | ### 1.隐性语义索引 7 | ### 2.推荐系统 8 | 9 | #### SVD是矩阵分解的一种类型,而矩阵分解是将数据矩阵分解为多个独立部分的过程。 10 | 11 | ## 矩阵分解 12 | #### 很多情况下,数据中的一小段携带了数据集中的大部分信息,其他信息则要么是噪声,要么就是毫不相关的信息。 在线性代数中还有很多矩阵分解技术。矩阵分解可以将原始矩阵表示成新的易于处理的形式,这种新形式是两个或多个矩阵的乘积。 不同的矩阵分解技术具有不同的性质,其中有些更适合于某个应用,有些则更适合于其他应用。最常见的一种矩阵分解技术就是SVD。 13 | 14 | #### 矩阵 Σ ,该矩阵只有对角元素,其他元素均为0。另一个惯例就是,Σ 的对角元素是从大到小排列的。这些对角元素称为奇异值(Singular Value),它们对应了原始数据集矩阵 Data 的奇异值。奇异值和特征值是有关系的。 15 | 16 | #### 科学和工程中,一直存在这样一个普遍事实:在某个奇异值的数目( r 个)之后,其他的奇异值都置为0。这就意味着数据集中仅有 r 个重要特征,而其余特征则都是噪声或冗余特征。 17 | 18 | ## 基于协同过滤的推荐引擎 19 | #### 协同过滤(collaborative filtering )是通过将用户和其他用户的数据进行对比来实现推荐的,唯一所需要的数学方法就是相似度的计算。 20 | 21 | ### 这里介绍了几种相似度计算,欧式距离,皮尔逊相关系数( Pearson correlation),余弦相似度 (cosine similarity)。 22 | 23 | ## 示例:餐馆菜肴推荐引擎 24 | 【略】 25 | 26 | ## 示例:基于 SVD 的图像压缩 27 | 【略】 28 | 29 | ## 总结 30 | #### SVD 是一种强大的降维工具,我们可以利用 SVD 来逼近矩阵并从中提取重要特征。通过保留矩阵 80% ~ 90% 的能量,就可以得到重要的特征并去掉噪声。在大规模数据集上, SVD 的计算和推荐可能是一个很困难的工程问题。通过离线方式来进行SVD 分解和相似度计算,是一种减少冗余计算和推荐所需时间的办法。 31 | -------------------------------------------------------------------------------- /ch15/README.md: -------------------------------------------------------------------------------- 1 | # Ch15 - 大数据与MapReduce(Big data and MapReduce) 2 | 3 | ## 总结: 4 | 5 | ### 当运算需求超出了当前资源的运算能力,可以考虑购买更好的机器,或者租用网络服务并使用MapReduce框架并行执行。另一个情况是,运算需求超出了合理价位下所能购买到的机器的运算能力。其中一个解决方法是将计算转成并行的作业,MapReduce就提供了这种方案的一个具体实施框架。在MapReduce中,作业被分成map阶段和reduce阶段。 6 | 7 | ### 
一个典型的作业流程是先使用map阶段并行处理数据,之后将这些数据在reduce阶段合并。这种多对一的模式很经典,但不是唯一的流程方式。mapper和reducer之间传输数据的形式是key/value对。一般地,map阶段后数据还会按照key值进行排序。Hadoop是一个流行的可行MapReduce作业的java项目,它同时提供非Java作业的运行支持,叫做Hadoop流。 8 | 9 | ### 很多机器学习算法都可以容易地写成MapReduce作业,而某些需要经过重写和创新性的修改,才能在MapReduce上运行。 10 | -------------------------------------------------------------------------------- /ch15/err.txt: -------------------------------------------------------------------------------- 1 | No handlers could be found for logger "mrjob.job" 2 | using configs in c:/Users/Peter\.mrjob.conf 3 | creating tmp directory /scratch/$USER\mrSVM.Peter.20111230.181815.061000 4 | reading from STDIN 5 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 6 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00000 7 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00001' 8 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00001 9 | STDERR: No handlers could be found for logger "mrjob.job" 10 | STDERR: No handlers could be found for logger "mrjob.job" 11 | Counters from step 1: 12 | (no counters found) 13 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper-sorted 14 | > sort '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00000' '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-mapper_part-00001' 15 | Piping files into sort for Windows compatibility 16 | > sort 17 | > 'c:\Python27\python.exe' mrSVM.py --step-num=0 --reducer '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 18 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-0-reducer_part-00000 19 | STDERR: No handlers could be found for logger "mrjob.job" 20 | Counters from step 1: 21 | (no counters found) 22 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 23 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper_part-00000 24 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --mapper '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00001' 25 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper_part-00001 26 | STDERR: No handlers could be found for logger "mrjob.job" 27 | STDERR: No handlers could be found for logger "mrjob.job" 28 | Counters from step 2: 29 | (no counters found) 30 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-mapper-sorted 31 | Piping files into sort for Windows compatibility 32 | > sort 33 | > 'c:\Python27\python.exe' mrSVM.py --step-num=1 --reducer '/scratch/$USER\mrSVM.Peter.20111230.181815.061000\input_part-00000' 34 | writing to /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-reducer_part-00000 35 | STDERR: No handlers could be found for logger "mrjob.job" 36 | Counters from step 2: 37 | (no counters found) 38 | Moving /scratch/$USER\mrSVM.Peter.20111230.181815.061000\step-1-reducer_part-00000 -> /scratch/$USER\mrSVM.Peter.20111230.181815.061000\output\part-00000 39 | Streaming final output from /scratch/$USER\mrSVM.Peter.20111230.181815.061000\output 40 | removing tmp directory /scratch/$USER\mrSVM.Peter.20111230.181815.061000 41 | -------------------------------------------------------------------------------- /ch15/inputFile.txt: 
-------------------------------------------------------------------------------- 1 | 0.970413 2 | 0.901817 3 | 0.828698 4 | 0.197744 5 | 0.466887 6 | 0.962147 7 | 0.187294 8 | 0.388509 9 | 0.243889 10 | 0.115732 11 | 0.616292 12 | 0.713436 13 | 0.761446 14 | 0.944123 15 | 0.200903 16 | 0.547166 17 | 0.800028 18 | 0.848790 19 | 0.001641 20 | 0.058010 21 | 0.859900 22 | 0.009178 23 | 0.736598 24 | 0.683586 25 | 0.142515 26 | 0.212120 27 | 0.752769 28 | 0.546184 29 | 0.652227 30 | 0.583803 31 | 0.812863 32 | 0.036862 33 | 0.075076 34 | 0.257536 35 | 0.431278 36 | 0.600214 37 | 0.985564 38 | 0.055846 39 | 0.905295 40 | 0.336262 41 | 0.198738 42 | 0.845815 43 | 0.527989 44 | 0.448650 45 | 0.235313 46 | 0.599749 47 | 0.443923 48 | 0.968723 49 | 0.911076 50 | 0.279338 51 | 0.569492 52 | 0.635985 53 | 0.267532 54 | 0.975018 55 | 0.463698 56 | 0.842340 57 | 0.065590 58 | 0.233049 59 | 0.810390 60 | 0.448260 61 | 0.431967 62 | 0.549648 63 | 0.703612 64 | 0.187974 65 | 0.231709 66 | 0.784160 67 | 0.072283 68 | 0.921053 69 | 0.735468 70 | 0.715923 71 | 0.150431 72 | 0.661089 73 | 0.734955 74 | 0.633709 75 | 0.216102 76 | 0.498474 77 | 0.195620 78 | 0.339548 79 | 0.245314 80 | 0.819848 81 | 0.521242 82 | 0.549276 83 | 0.200906 84 | 0.202525 85 | 0.922876 86 | 0.025404 87 | 0.604032 88 | 0.752204 89 | 0.158860 90 | 0.651622 91 | 0.592898 92 | 0.500392 93 | 0.410614 94 | 0.968388 95 | 0.265918 96 | 0.565707 97 | 0.413670 98 | 0.080507 99 | 0.929978 100 | 0.609755 101 | -------------------------------------------------------------------------------- /ch15/junk.txt: -------------------------------------------------------------------------------- 1 | jj I am so sick of TV 2 | ss jar jar got a purse 3 | 22 shit ass 4 | -------------------------------------------------------------------------------- /ch15/kickStart.txt: -------------------------------------------------------------------------------- 1 | ["w", [0.001, 0.001]] 2 | ["x", 79] 3 | ["x", 115] 4 | ["x", 107] 5 | ["x", 109] 6 | ["x", 109] 7 | ["x", 88] 8 | ["x", 56] 9 | ["x", 94] 10 | ["x", 50] 11 | ["x", 86] 12 | ["x", 75] 13 | ["x", 30] 14 | ["x", 20] 15 | ["x", 157] 16 | ["x", 15] 17 | ["x", 19] 18 | ["x", 63] 19 | ["x", 124] 20 | ["x", 132] 21 | ["x", 3] 22 | ["x", 140] 23 | ["x", 139] 24 | ["x", 127] 25 | ["x", 98] 26 | ["x", 30] 27 | ["x", 16] 28 | ["x", 4] 29 | ["x", 2] 30 | ["x", 75] 31 | ["x", 123] 32 | ["x", 42] 33 | ["x", 16] 34 | ["x", 94] 35 | ["x", 163] 36 | ["x", 159] 37 | ["x", 23] 38 | ["x", 16] 39 | ["x", 160] 40 | ["x", 5] 41 | ["x", 42] 42 | ["x", 53] 43 | ["x", 83] 44 | ["x", 46] 45 | ["x", 121] 46 | ["x", 73] 47 | ["x", 123] 48 | ["x", 93] 49 | ["x", 99] 50 | ["x", 106] 51 | ["x", 173] 52 | ["x", 192] 53 | ["x", 132] 54 | ["x", 57] 55 | ["x", 47] 56 | ["x", 164] 57 | ["x", 157] 58 | ["x", 199] 59 | ["x", 62] 60 | ["x", 175] 61 | ["x", 154] 62 | ["x", 110] 63 | ["x", 0] 64 | ["x", 116] 65 | ["x", 49] 66 | ["x", 76] 67 | ["x", 121] 68 | ["x", 178] 69 | ["x", 75] 70 | ["x", 167] 71 | ["x", 41] 72 | ["x", 105] 73 | ["x", 71] 74 | ["x", 5] 75 | ["x", 135] 76 | ["x", 80] 77 | ["x", 116] 78 | ["x", 198] 79 | ["x", 164] 80 | ["x", 105] 81 | ["x", 98] 82 | ["x", 156] 83 | ["x", 72] 84 | ["x", 54] 85 | ["x", 62] 86 | ["x", 57] 87 | ["x", 87] 88 | ["x", 68] 89 | ["x", 163] 90 | ["x", 140] 91 | ["x", 40] 92 | ["x", 70] 93 | ["x", 120] 94 | ["x", 172] 95 | ["x", 71] 96 | ["x", 82] 97 | ["x", 168] 98 | ["x", 42] 99 | ["x", 144] 100 | ["x", 27] 101 | ["x", 36] -------------------------------------------------------------------------------- 
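Before the MapReduce listings that follow: the key arithmetic is that each mapper emits (count, mean, mean-of-squares) for its slice of the input, and those partial results can be merged exactly into a global mean and variance. A stand-alone sketch of that merge (my illustration; mrMeanReducer.py below emits the combined mean and mean-of-squares, and mrMean.py's reducer applies the same variance identity):

```python
# Illustrative sketch: merge per-mapper partial statistics (n, mean, mean of squares)
# into a global mean and variance, mirroring the accumulation in the reducers below.
def combine(partials):
    total_n = sum(n for n, _, _ in partials)
    mean = sum(n * m for n, m, _ in partials) / total_n        # weighted mean
    mean_sq = sum(n * sq for n, _, sq in partials) / total_n   # weighted E[x^2]
    return mean, mean_sq - mean ** 2                           # var = E[x^2] - mean^2

# Two fabricated mapper outputs, e.g. from 100-value input splits:
print(combine([(100, 0.506, 0.339), (100, 0.498, 0.326)]))
```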
/ch15/mrMean.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | class MRmean(MRJob): 6 | def __init__(self, *args, **kwargs): 7 | super(MRmean, self).__init__(*args, **kwargs) 8 | self.inCount = 0 9 | self.inSum = 0 10 | self.inSqSum = 0 11 | 12 | def map(self, key, val): #needs exactly 2 arguments 13 | if False: yield 14 | inVal = float(val) 15 | self.inCount += 1 16 | self.inSum += inVal 17 | self.inSqSum += inVal*inVal 18 | 19 | def map_final(self): 20 | mn = self.inSum/self.inCount 21 | mnSq = self.inSqSum/self.inCount 22 | yield (1, [self.inCount, mn, mnSq]) 23 | 24 | def reduce(self, key, packedValues): 25 | cumVal=0.0; cumSumSq=0.0; cumN=0.0 26 | for valArr in packedValues: #get values from streamed inputs 27 | nj = float(valArr[0]) 28 | cumN += nj 29 | cumVal += nj*float(valArr[1]) 30 | cumSumSq += nj*float(valArr[2]) 31 | mean = cumVal/cumN 32 | var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN 33 | yield (mean, var) #emit mean and var 34 | 35 | def steps(self): 36 | return ([self.mr(mapper=self.map, mapper_final=self.map_final,\ 37 | reducer=self.reduce,)]) 38 | 39 | if __name__ == '__main__': 40 | MRmean.run() 41 | -------------------------------------------------------------------------------- /ch15/mrMeanMapper.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import sys 4 | from numpy import mat, mean, power 5 | 6 | def read_input(file): 7 | for line in file: 8 | yield line.rstrip() 9 | 10 | input = read_input(sys.stdin)#creates a list of input lines 11 | input = [float(line) for line in input] #overwrite with floats 12 | numInputs = len(input) 13 | input = mat(input) 14 | sqInput = power(input,2) 15 | 16 | #output size, mean, mean(square values) 17 | print "%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)) #calc mean of columns 18 | print >> sys.stderr, "report: still alive" 19 | -------------------------------------------------------------------------------- /ch15/mrMeanReducer.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import sys 4 | from numpy import mat, mean, power 5 | 6 | def read_input(file): 7 | for line in file: 8 | yield line.rstrip() 9 | 10 | input = read_input(sys.stdin)#creates a list of input lines 11 | 12 | #split input lines into separate items and store in list of lists 13 | mapperOut = [line.split('\t') for line in input] 14 | 15 | #accumulate total number of samples, overall sum and overall sum sq 16 | cumVal=0.0 17 | cumSumSq=0.0 18 | cumN=0.0 19 | for instance in mapperOut: 20 | nj = float(instance[0]) 21 | cumN += nj 22 | cumVal += nj*float(instance[1]) 23 | cumSumSq += nj*float(instance[2]) 24 | 25 | #calculate means 26 | mean = cumVal/cumN 27 | meanSq = cumSumSq/cumN 28 | 29 | #output size, mean, mean(square values) 30 | print "%d\t%f\t%f" % (cumN, mean, meanSq) 31 | print >> sys.stderr, "report: still alive" -------------------------------------------------------------------------------- /ch15/mrSVM.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | import pickle 6 | from numpy import * 7 | 8 | class MRsvm(MRJob): 9 | DEFAULT_INPUT_PROTOCOL = 'json_value' 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(MRsvm, self).__init__(*args, **kwargs) 13 | self.data = 
pickle.load(open('C:\Users\Peter\machinelearninginaction\Ch15\svmDat27')) 14 | self.w = 0 15 | self.eta = 0.69 16 | self.dataList = [] 17 | self.k = self.options.batchsize 18 | self.numMappers = 1 19 | self.t = 1 #iteration number 20 | 21 | def configure_options(self): 22 | super(MRsvm, self).configure_options() 23 | self.add_passthrough_option( 24 | '--iterations', dest='iterations', default=2, type='int', 25 | help='T: number of iterations to run') 26 | self.add_passthrough_option( 27 | '--batchsize', dest='batchsize', default=100, type='int', 28 | help='k: number of data points in a batch') 29 | 30 | def map(self, mapperId, inVals): #needs exactly 2 arguments 31 | #input: nodeId, ('w', w-vector) OR nodeId, ('x', int) 32 | if False: yield 33 | if inVals[0]=='w': #accumulate W-vector 34 | self.w = inVals[1] 35 | elif inVals[0]=='x': 36 | self.dataList.append(inVals[1])#accumulate data points to calc 37 | elif inVals[0]=='t': self.t = inVals[1] 38 | else: self.eta=inVals #this is for debug, eta not used in map 39 | 40 | def map_fin(self): 41 | labels = self.data[:,-1]; X=self.data[:,0:-1]#reshape data into X and Y 42 | if self.w == 0: self.w = [0.001]*shape(X)[1] #init w on first iteration 43 | for index in self.dataList: 44 | p = mat(self.w)*X[index,:].T #calc p=w*dataSet[key].T 45 | if labels[index]*p < 1.0: 46 | yield (1, ['u', index])#make sure everything has the same key 47 | yield (1, ['w', self.w]) #so it ends up at the same reducer 48 | yield (1, ['t', self.t]) 49 | 50 | def reduce(self, _, packedVals): 51 | for valArr in packedVals: #get values from streamed inputs 52 | if valArr[0]=='u': self.dataList.append(valArr[1]) 53 | elif valArr[0]=='w': self.w = valArr[1] 54 | elif valArr[0]=='t': self.t = valArr[1] 55 | labels = self.data[:,-1]; X=self.data[:,0:-1] 56 | wMat = mat(self.w); wDelta = mat(zeros(len(self.w))) 57 | for index in self.dataList: 58 | wDelta += float(labels[index])*X[index,:] #wDelta += label*dataSet 59 | eta = 1.0/(2.0*self.t) #calc new: eta 60 | #calc new: w = (1.0 - 1/t)*w + (eta/k)*wDelta 61 | wMat = (1.0 - 1.0/self.t)*wMat + (eta/self.k)*wDelta 62 | for mapperNum in range(1,self.numMappers+1): 63 | yield (mapperNum, ['w', wMat.tolist()[0] ]) #emit w 64 | if self.t < self.options.iterations: 65 | yield (mapperNum, ['t', self.t+1])#increment T 66 | for j in range(self.k/self.numMappers):#emit random ints for mappers iid 67 | yield (mapperNum, ['x', random.randint(shape(self.data)[0]) ]) 68 | 69 | def steps(self): 70 | return ([self.mr(mapper=self.map, reducer=self.reduce, 71 | mapper_final=self.map_fin)]*self.options.iterations) 72 | 73 | if __name__ == '__main__': 74 | MRsvm.run() 75 | -------------------------------------------------------------------------------- /ch15/mrSVMkickStart.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.protocol import JSONProtocol 4 | from numpy import * 5 | 6 | fw=open('kickStart2.txt', 'w') 7 | for i in [1]: 8 | for j in range(100): 9 | fw.write('["x", %d]\n' % random.randint(200)) 10 | fw.close() -------------------------------------------------------------------------------- /ch15/pegasos.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(fileName): 6 | dataMat = []; labelMat = [] 7 | fr = open(fileName) 8 | for line in fr.readlines(): 9 | lineArr = line.strip().split('\t') 10 | #dataMat.append([float(lineArr[0]), float(lineArr[1]), 
float(lineArr[2])]) 11 | dataMat.append([float(lineArr[0]), float(lineArr[1])]) 12 | labelMat.append(float(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def seqPegasos(dataSet, labels, lam, T): 16 | m,n = shape(dataSet); w = zeros(n) 17 | for t in range(1, T+1): 18 | i = random.randint(m) 19 | eta = 1.0/(lam*t) 20 | p = predict(w, dataSet[i,:]) 21 | if labels[i]*p < 1: 22 | w = (1.0 - 1/t)*w + eta*labels[i]*dataSet[i,:] 23 | else: 24 | w = (1.0 - 1/t)*w 25 | print w 26 | return w 27 | 28 | def predict(w, x): 29 | return w*x.T 30 | 31 | def batchPegasos(dataSet, labels, lam, T, k): 32 | m,n = shape(dataSet); w = zeros(n); 33 | dataIndex = range(m) 34 | for t in range(1, T+1): 35 | wDelta = mat(zeros(n)) #reset wDelta 36 | eta = 1.0/(lam*t) 37 | random.shuffle(dataIndex) 38 | for j in range(k):#go over training set 39 | i = dataIndex[j] 40 | p = predict(w, dataSet[i,:]) #mapper code 41 | if labels[i]*p < 1: #mapper code 42 | wDelta += labels[i]*dataSet[i,:].A #accumulate changes 43 | w = (1.0 - 1/t)*w + (eta/k)*wDelta #apply changes at each T 44 | return w 45 | 46 | datArr,labelList = loadDataSet('testSet.txt') 47 | datMat = mat(datArr) 48 | #finalWs = seqPegasos(datMat, labelList, 2, 5000) 49 | finalWs = batchPegasos(datMat, labelList, 2, 50, 100) 50 | print finalWs 51 | 52 | import matplotlib 53 | import matplotlib.pyplot as plt 54 | fig = plt.figure() 55 | ax = fig.add_subplot(111) 56 | x1=[]; y1=[]; xm1=[]; ym1=[] 57 | for i in range(len(labelList)): 58 | if labelList[i] == 1.0: 59 | x1.append(datMat[i,0]); y1.append(datMat[i,1]) 60 | else: 61 | xm1.append(datMat[i,0]); ym1.append(datMat[i,1]) 62 | ax.scatter(x1, y1, marker='s', s=90) 63 | ax.scatter(xm1, ym1, marker='o', s=50, c='red') 64 | x = arange(-6.0, 8.0, 0.1) 65 | y = (-finalWs[0,0]*x - 0)/finalWs[0,1] 66 | #y2 = (0.43799*x)/0.12316 67 | y2 = (0.498442*x)/0.092387 #2 iterations 68 | ax.plot(x,y) 69 | ax.plot(x,y2,'g-.') 70 | ax.axis([-6,8,-4,5]) 71 | ax.legend(('50 Iterations', '2 Iterations') ) 72 | plt.show() -------------------------------------------------------------------------------- /ch15/proximalSVM.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import numpy 4 | 5 | def map(key, value): 6 | # input key= class for one training example, e.g. "-1.0" 7 | classes = [float(item) for item in key.split(",")] # e.g. [-1.0] 8 | D = numpy.diag(classes) 9 | 10 | # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0" 11 | featurematrix = [float(item) for item in value.split(",")] 12 | A = numpy.matrix(featurematrix) 13 | 14 | # create matrix E and vector e 15 | e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1)) 16 | E = numpy.matrix(numpy.append(A,-e,axis=1)) 17 | 18 | # create a tuple with the values to be used by reducer 19 | # and encode it with base64 to avoid potential trouble with '\t' and '\n' used 20 | # as default separators in Hadoop Streaming 21 | producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) ) 22 | 23 | # note: a single constant key "producedkey" sends to only one reducer 24 | # somewhat "atypical" due to low degree of parallism on reducer side 25 | print "producedkey\t%s" % (producedvalue) 26 | 27 | def reduce(key, values, mu=0.1): 28 | sumETE = None 29 | sumETDe = None 30 | 31 | # key isn't used, so ignoring it with _ (underscore). 
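# 以下循环把各个 mapper 发来的 (E^T*E, E^T*D*e) 逐个解码并累加;
# 累加完成后按 proximal SVM 的闭式解
#   [omega; gamma] = (I/mu + sum(E^T*E))^(-1) * sum(E^T*D*e)
# 求出分类面参数,其中 omega 为法向量、gamma 为偏置(见函数末尾的注释)。
# (这里假设模块中已 import pickle 和 base64,b64decode / loads 才能使用。)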
32 | for _, value in values: 33 | # unpickle values 34 | ETE, ETDe = pickle.loads(base64.b64decode(value)) 35 | if sumETE == None: 36 | # create the I/mu with correct dimensions 37 | sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu) 38 | sumETE += ETE 39 | 40 | if sumETDe == None: 41 | # create sumETDe with correct dimensions 42 | sumETDe = ETDe 43 | else: 44 | sumETDe += ETDe 45 | 46 | # note: omega = result[:-1] and gamma = result[-1] 47 | # but printing entire vector as output 48 | result = sumETE.I*sumETDe 49 | print "%s\t%s" % (key, str(result.tolist())) 50 | -------------------------------------------------------------------------------- /ch15/py27dbg.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | 5 | import pickle 6 | from numpy import * 7 | 8 | class MRsvm(MRJob): 9 | 10 | def map(self, mapperId, inVals): #needs exactly 2 arguments 11 | if False: yield 12 | yield (1, 22) 13 | 14 | def reduce(self, _, packedVals): 15 | yield "fuck ass" 16 | 17 | def steps(self): 18 | return ([self.mr(mapper=self.map, reducer=self.reduce)]) 19 | 20 | if __name__ == '__main__': 21 | MRsvm.run() 22 | -------------------------------------------------------------------------------- /ch15/wc.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from mrjob.job import MRJob 4 | import json 5 | 6 | 7 | class MRWordCountUtility(MRJob): 8 | 9 | def __init__(self, *args, **kwargs): 10 | super(MRWordCountUtility, self).__init__(*args, **kwargs) 11 | self.chars = 0 12 | self.words = 0 13 | self.lines = 0 14 | 15 | def mapper(self, _, line): 16 | if False: 17 | yield # I'm a generator! 18 | 19 | self.chars += len(line) + 1 # +1 for newline 20 | self.words += sum(1 for word in line.split() if word.strip()) 21 | self.lines += 1 22 | 23 | def mapper_final(self): 24 | yield('chars', self.chars) 25 | yield('words', self.words) 26 | yield('lines', self.lines) 27 | 28 | def reducer(self, key, values): 29 | yield(key, sum(values)) 30 | 31 | 32 | if __name__ == '__main__': 33 | MRWordCountUtility.run() 34 | -------------------------------------------------------------------------------- /ch2/KNN.txt: -------------------------------------------------------------------------------- 1 | https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm -------------------------------------------------------------------------------- /ch2/KNN(classify).py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import operator 5 | from os import listdir 6 | 7 | 8 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 9 | #参数k表示用于选择最近邻居的数目 10 | #通过KNN进行分类 11 | def classify0(inX, dataSet, labels, k): 12 | 13 | dataSetSize = dataSet.shape[0] 14 | # 计算欧式距离 15 | diffMat = tile(inX, (dataSetSize,1)) - dataSet 16 | sqDiffMat = diffMat**2 17 | sqDistances = sqDiffMat.sum(axis=1) #行向量分别相加,从而得到新的一个行向量 18 | distances = sqDistances**0.5 19 | 20 | # 对距离进行排序 21 | sortedDistIndicies = distances.argsort() #argsort()根据元素的值从大到小对元素进行排序,返回下标 22 | classCount={} 23 | 24 | for i in range(k): 25 | voteIlabel = labels[sortedDistIndicies[i]] 26 | # 对选取的K个样本所属的类别个数进行统计 27 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 28 | 29 | #逆序,选取出现的类别次数最多的类别 30 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 31 | #返回出现的类别中次数最多的类别 32 | return sortedClassCount[0][0] 33 | 34 | def createDataSet(): 35 | 
group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]) 36 | labels = ['A','A','B','B'] 37 | print classify0([0][0], group,labels,3) # B 38 | # print classify0(([1][0],group,labels,3)) #A 39 | return group, labels 40 | 41 | if __name__ == '__main__': 42 | print createDataSet() 43 | -------------------------------------------------------------------------------- /ch2/KNN(datingTestSet).py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | import operator 5 | from os import listdir 6 | 7 | 8 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 9 | # 参数k表示用于选择最近邻居的数目 10 | 11 | #通过KNN进行分类 12 | def classify0(inX, dataSet, labels, k): 13 | 14 | dataSetSize = dataSet.shape[0] 15 | # 计算欧式距离 16 | diffMat = tile(inX, (dataSetSize,1)) - dataSet 17 | sqDiffMat = diffMat**2 18 | sqDistances = sqDiffMat.sum(axis=1) #行向量分别相加,从而得到新的一个行向量 19 | distances = sqDistances**0.5 20 | 21 | # 对距离进行排序 22 | sortedDistIndicies = distances.argsort() #argsort()根据元素的值从大到小对元素进行排序,返回下标 23 | classCount={} 24 | 25 | for i in range(k): 26 | voteIlabel = labels[sortedDistIndicies[i]] 27 | # 对选取的K个样本所属的类别个数进行统计 28 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 29 | 30 | #逆序,选取出现的类别次数最多的类别 31 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 32 | #返回出现的类别中次数最多的类别 33 | return sortedClassCount[0][0] 34 | 35 | def filematrix(filename): 36 | fr = open(filename) 37 | numberOfLines = len(fr.readlines()) # 文件中的行数 38 | returnMat = zeros((numberOfLines, 3)) # 初始化矩阵 39 | classLabelVector = [] # 初始化labels 40 | fr = open(filename) 41 | index = 0 42 | for line in fr.readlines(): 43 | line = line.strip() 44 | listFromLine = line.split('\t') 45 | returnMat[index, :] = listFromLine[0:3] 46 | classLabelVector.append(int(listFromLine[-1])) 47 | index += 1 48 | return returnMat, classLabelVector 49 | 50 | 51 | def autoNorm(dataSet): # 归一化特征值 52 | 53 | minVals = dataSet.min(0) 54 | maxVals = dataSet.max(0) 55 | ranges = maxVals - minVals 56 | normDataSet = zeros(shape(dataSet)) 57 | m = dataSet.shape[0] 58 | normDataSet = dataSet - tile(minVals, (m, 1)) 59 | normDataSet = normDataSet / tile(ranges, (m, 1)) # 特征值相除 60 | return normDataSet, ranges, minVals 61 | 62 | 63 | def datingClassTest(): 64 | # 注意:一共有1000个数据 65 | hoRatio = 0.10 # 随机选出 10% 的数据, 对于已有的数据,将90%作为训练,剩下10%作为测试 66 | datingDataMat, datingLabels = filematrix('datingTestSet.txt') 67 | normMat, ranges, minVals = autoNorm(datingDataMat) 68 | m = normMat.shape[0] 69 | numTestVecs = int(m * hoRatio) ##测试量 70 | errorCount = 0.0 71 | 72 | for i in range(numTestVecs): 73 | # 前10%行的数据作为测试集,并且对测试集中的每一行都进行预测,对比测试集中实际的label 74 | # 后90%行的数据全部作为训练集,每个测试集样本都要跟90%的训练集计算距离,算出最相似的label 75 | classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3) 76 | print "分类器返回为: %d, 答案为: %d" % (classifierResult, datingLabels[i]) 77 | 78 | if (classifierResult != datingLabels[i]): 79 | errorCount += 1.0 80 | print "error" 81 | 82 | print "测试数据量为:%d" % numTestVecs 83 | print "错误率: %f" % (errorCount / float(numTestVecs)) 84 | print "错误个数:%d" % errorCount 85 | 86 | 87 | if __name__ == '__main__': 88 | # print createDataSet() 89 | datingClassTest() 90 | 91 | -------------------------------------------------------------------------------- /ch2/KNN(handwriting).py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | import numpy as np 5 | import operator 6 | from os import 
listdir 7 | 8 | 9 | # 用于分类的输入向量inx,输入的训练样本集是dataSet,标签向量是labels 10 | # 参数k表示用于选择最近邻居的数目 11 | 12 | # 通过KNN进行分类 13 | def classify0(inX, dataSet, labels, k): 14 | dataSetSize = dataSet.shape[0] 15 | # 计算欧式距离 16 | diffMat = tile(inX, (dataSetSize, 1)) - dataSet 17 | sqDiffMat = diffMat ** 2 18 | sqDistances = sqDiffMat.sum(axis=1) # 行向量分别相加,从而得到新的一个行向量 19 | distances = sqDistances ** 0.5 20 | 21 | # 对距离进行排序 22 | sortedDistIndicies = distances.argsort() # argsort()根据元素的值从大到小对元素进行排序,返回下标 23 | classCount = {} 24 | 25 | for i in range(k): 26 | voteIlabel = labels[sortedDistIndicies[i]] 27 | # 对选取的K个样本所属的类别个数进行统计 28 | classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 29 | 30 | # 逆序,选取出现的类别次数最多的类别 31 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) 32 | # 返回出现的类别中次数最多的类别 33 | return sortedClassCount[0][0] 34 | 35 | 36 | def filematrix(filename): 37 | fr = open(filename) 38 | numberOfLines = len(fr.readlines()) # 文件中的行数 39 | returnMat = zeros((numberOfLines, 3)) # 初始化矩阵 40 | classLabelVector = [] # 初始化labels 41 | fr = open(filename) 42 | index = 0 43 | for line in fr.readlines(): 44 | line = line.strip() 45 | listFromLine = line.split('\t') 46 | returnMat[index, :] = listFromLine[0:3] 47 | classLabelVector.append(int(listFromLine[-1])) 48 | index += 1 49 | return returnMat, classLabelVector 50 | 51 | 52 | def createDataSetFromFile(filename): 53 | # Read lines 54 | file = open(filename) 55 | lines = file.readlines() 56 | file.close() 57 | 58 | # Change lines into array 59 | featureCount = len(lines[0].split()) - 1 60 | group = np.zeros((len(lines), featureCount)) 61 | labels = [] 62 | 63 | for i in range(len(lines)): 64 | lst = lines[i].split() 65 | group[i] = np.array(lst[:-1]) 66 | labels.append(lst[-1]) 67 | 68 | return (group, labels) 69 | 70 | 71 | def autoNorm(dataSet): # 归一化特征值 72 | 73 | minVals = dataSet.min(0) 74 | maxVals = dataSet.max(0) 75 | ranges = maxVals - minVals 76 | normDataSet = zeros(shape(dataSet)) 77 | m = dataSet.shape[0] 78 | normDataSet = dataSet - tile(minVals, (m, 1)) 79 | normDataSet = normDataSet / tile(ranges, (m, 1)) # 特征值相除 80 | return normDataSet, ranges, minVals 81 | 82 | #将图像转换成向量 83 | def img2vector(filename): 84 | 85 | # 创建1 * 1024的Numpy数组 86 | returnVect = zeros((1,1024)) 87 | fr = open(filename) 88 | #文件钱32行 89 | for i in range(32): 90 | lineStr = fr.readline() 91 | # 每行的头32个字符 92 | for j in range(32): 93 | returnVect[0,32*i+j] = int(lineStr[j]) 94 | return returnVect 95 | 96 | def handwritingClassTest(): 97 | 98 | hwLabels = [] 99 | trainingFileList = listdir('trainingDigits') #获取目录内容 100 | m = len(trainingFileList) 101 | 102 | trainingMat = zeros((m,1024)) 103 | 104 | for i in range(m): 105 | #从文件名解析分类数据 106 | fileNameStr = trainingFileList[i] 107 | fileStr = fileNameStr.split('.')[0] 108 | classNumStr = int(fileStr.split('_')[0]) 109 | hwLabels.append(classNumStr) 110 | 111 | trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr) 112 | 113 | #---------------------------------------------------------------# 114 | 115 | testFileList = listdir('testDigits') 116 | errorCount = 0.0 117 | mTest = len(testFileList) 118 | 119 | for i in range(mTest): 120 | 121 | fileNameStr = testFileList[i] 122 | fileStr = fileNameStr.split('.')[0] 123 | classNumStr = int(fileStr.split('_')[0]) 124 | 125 | vectorUnderTest = img2vector('testDigits/%s' % fileNameStr) 126 | 127 | classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 128 | 129 | print "分类器返回: %d, 真正答案是: %d" % (classifierResult, 
classNumStr) 130 | 131 | if (classifierResult != classNumStr): 132 | errorCount += 1.0 133 | print "error" 134 | 135 | print "\n错误个数有: %d" % errorCount 136 | print "\n错误率: %f %%" % (errorCount/float(mTest)*100) 137 | 138 | 139 | if __name__ == '__main__': 140 | 141 | testVector = img2vector("0_0.txt") 142 | print testVector[0,0:31] 143 | 144 | handwritingClassTest() 145 | 146 | -------------------------------------------------------------------------------- /ch2/README.md: -------------------------------------------------------------------------------- 1 | # Ch02 - k-近邻(KNN) 2 | 3 | ##### K-近邻算法: 4 | ##### 优点:精度高,对异常值不敏感,无数据输入假定。 5 | ##### 缺点:计算复杂度高,空间复杂度高。 6 | ##### 适合数据范围:数值型和标称型。 7 | 8 | ##### 通常k是不大于20的整数,最后,选择k个最相似数据中出现次数最多的分类,作为新数据的分类。 9 | 10 | ##### 训练算法不适用于K-近邻算法。 11 | 12 | ##### KNN是通过测量不同特征值之间的距离进行分类。 13 | ##### 它的的思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别。 14 | 15 | ##### KNN算法的思想总结一下: 16 | ##### 就是在训练集中数据和标签已知的情况下,输入测试数据,将测试数据的特征与训练集中对应的特征进行相互比较,找到训练集中与之最为相似的前K个数据,则该测试数据对应的类别就是K个数据中出现次数最多的那个分类,其算法的描述为: 17 | ##### [1] 计算测试数据与各个训练数据之间的距离; 18 | ##### [2] 按照距离的递增关系进行排序; 19 | ##### [3] 选取距离最小的K个点;(k<=20) 20 | ##### [4] 确定前K个点所在类别的出现频率; 21 | ##### [5] 返回前K个点中出现频率最高的类别作为测试数据的预测分类。 22 | 23 | ##### k-近邻算法优缺点 24 | ##### 优点: 25 | ##### [1] 在数据量不是很大时,是作为最简单最有效的算法。 26 | 27 | ##### [2] k-近邻算法是基于实例的学习,使用算法必须有接近实际数据的训练样本数。 28 | 29 | ##### 缺点: 30 | ##### [1] k-近邻算法对每个测试集样本都使用了一次全部的训练集,第一若是训练集大,需要较大的存储空间,这一点倒不是什么问题,现在处理的数据基本上上G,主要是第二点,因为对每个测试集样本都需要使用一次全部的训练集得到最短的k个距离值,那么计算必然非常耗时。 31 | ##### [2] k-近邻算法无法给出任何数据的基础结构信息。无法知晓平均实例样本和典型实例样本具有怎样的特征。 32 | 33 | ##### KNN 与 SVM 的区别是什么? 34 | ##### 一般分类任务主要有两个步骤: 35 | ##### 1.训练; 36 | ##### 2.测试。 37 | 38 | ##### 对于SVM,是先在训练集上训练一个模型,然后用这个模型直接对测试集进行分类。这两个步骤是独立的。 39 | ##### KNN是一种基于实例的学习算法,它不同于贝叶斯、决策树等算法,KNN不需要训练,当有新的实例出现时,直接在训练数据集中找k个最近的实例,把这个新的实例分配给这k个训练实例中实例数最多类。KNN也成为懒惰学习,它不需要训练过程,在类标边界比较整齐的情况下分类的准确率很高。KNN算法需要人为决定K的取值,即找几个最近的实例,k值不同,分类结果的结果也会不同。对于KNN,没有训练过程。只是将训练数据与训练数据进行距离度量来实现分类。 40 | 41 | ##### KNN:原理比较简单,可以需要很少量的样本数据,但一定要足够典型;高纬度情况下会疯掉。 42 | 43 | ##### SVM:适合处理高纬度情况。 44 | 45 | ##### K 值的选取没有一个绝对的标准,但可以想象,K 取太大并不能提高正确率,而且求 K 个最近的邻居是一个O(K*N)复杂度的算法,k 太大,算法效率会更低。 46 | 47 | ##### 虽然说 K 值的选取,会影响结果,有人会认为这个算法不稳定,其实不然,这种影响并不是很大,因为只有这种影响只是在类别边界上产生影响,而在类中心附近的实例影响很小。 48 | -------------------------------------------------------------------------------- /ch2/creatDist.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Rectangle 7 | 8 | 9 | n = 1000 #1000个点 10 | xcord = zeros((n)) 11 | ycord = zeros((n)) 12 | markers =[] 13 | colors =[] 14 | fw = open('testSet.txt','w') 15 | for i in range(n): 16 | 17 | [r0,r1] = random.standard_normal(2) 18 | myClass = random.uniform(0,1) 19 | if (myClass <= 0.16): 20 | fFlyer = random.uniform(22000, 60000) 21 | tats = 3 + 1.6*r1 22 | markers.append(20) 23 | colors.append(2.1) 24 | classLabel = 1 #'didntLike' 25 | print ("%d, %f, 类1") % (fFlyer, tats) 26 | 27 | elif ((myClass > 0.16) and (myClass <= 0.33)): 28 | fFlyer = 6000*r0 + 70000 29 | tats = 10 + 3*r1 + 2*r0 30 | markers.append(20) 31 | colors.append(1.1) 32 | classLabel = 1 #'didntLike' 33 | print ("%d, %f, 类2") % (fFlyer, tats) 34 | elif ((myClass > 0.33) and (myClass <= 0.66)): 35 | fFlyer = 5000*r0 + 10000 36 | tats = 3 + 2.8*r1 37 | markers.append(30) 38 | colors.append(1.1) 39 | classLabel = 2 #'smallDoses' 40 | print ("%d, %f, 类2") % (fFlyer, tats) 41 | 42 | else: 43 | fFlyer = 
10000*r0 + 35000 44 | tats = 10 + 2.0*r1 45 | markers.append(50) 46 | colors.append(0.1) 47 | classLabel = 3 #'largeDoses' 48 | print ("%d, %f, 类3") % (fFlyer, tats) 49 | 50 | if (tats < 0): tats =0 51 | if (fFlyer < 0): fFlyer =0 52 | xcord[i] = fFlyer; 53 | ycord[i]=tats 54 | 55 | fw.write("%d\t%f\t%f\t%d\n" % (fFlyer, tats, random.uniform(0.0, 1.7), classLabel)) 56 | 57 | fw.close() 58 | fig = plt.figure() 59 | ax = fig.add_subplot(111) 60 | ax.scatter(xcord,ycord, c=colors, s=markers) 61 | type1 = ax.scatter([-10], [-10], s=20, c='red') 62 | type2 = ax.scatter([-10], [-15], s=30, c='green') 63 | type3 = ax.scatter([-10], [-20], s=50, c='blue') 64 | ax.legend([type1, type2, type3], ["class 1", "class 2", "class 3"], loc=2) 65 | #ax.axis([-5000,100000,-2,25]) 66 | plt.xlabel('Frequent Flyier Miles Earned Per Year') #横坐标 67 | plt.ylabel('Percentage of Body Covered By Tatoos') #纵坐标 68 | plt.show() 69 | -------------------------------------------------------------------------------- /ch2/testDigits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch2/testDigits.zip -------------------------------------------------------------------------------- /ch2/trainingDigits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch2/trainingDigits.zip -------------------------------------------------------------------------------- /ch3/README.md: -------------------------------------------------------------------------------- 1 | # Ch03 - 决策树(DecisionTree) 2 | 3 | ### 决策树示意图: 4 | 5 | ### example: 6 | ![TheTree](screenshot/TheTree.png) 7 | 8 | ### 使用决策树预测隐形眼镜类型: 9 | ![lensesTree](screenshot/lensesTree.png) 10 | -------------------------------------------------------------------------------- /ch3/TheTree.txt: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'no surfacing' 3 | p1 4 | (dp2 5 | I0 6 | S'no' 7 | p3 8 | sI1 9 | (dp4 10 | S'flippers' 11 | p5 12 | (dp6 13 | I0 14 | g3 15 | sI1 16 | S'yes' 17 | p7 18 | ssss. 
-------------------------------------------------------------------------------- /ch3/calcShannonEnt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createDataSet(): 10 | dataSet = [[1, 1, 'yes'], 11 | [1, 1, 'yes'], 12 | [1, 0, 'no'], 13 | [0, 1, 'no'], 14 | [0, 1, 'no']] 15 | labels = ['no surfacing','flippers'] 16 | return dataSet, labels 17 | 18 | def calcShannonEnt(dataSet): #计算熵 19 | 20 | numEntries = len(dataSet) 21 | labelCounts = {} 22 | for featVec in dataSet: 23 | currentLabel = featVec[-1] 24 | if currentLabel not in labelCounts.keys(): #键值不存在就加入 25 | labelCounts[currentLabel] = 0 26 | labelCounts[currentLabel] += 1 #计数 27 | 28 | shannonEnt = 0.0 29 | for key in labelCounts: 30 | prob = float(labelCounts[key]) / numEntries 31 | shannonEnt -= prob * log(prob, 2) # 以2为底求对数 32 | 33 | return shannonEnt 34 | 35 | 36 | if __name__ =='__main__': 37 | # print createDataSet() 38 | dataSet ,labels = createDataSet() 39 | print calcShannonEnt(dataSet) 40 | -------------------------------------------------------------------------------- /ch3/chooseBestFeatureToSplit.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createDataSet(): 10 | dataSet = [[1, 1, 'yes'], 11 | [1, 1, 'yes'], 12 | [1, 0, 'no'], 13 | [0, 1, 'no'], 14 | [0, 1, 'no']] 15 | labels = ['no surfacing','flippers'] 16 | return dataSet, labels 17 | 18 | def calcShannonEnt(dataSet): #计算熵 19 | 20 | numEntries = len(dataSet) 21 | labelCounts = {} 22 | for featVec in dataSet: 23 | currentLabel = featVec[-1] 24 | if currentLabel not in labelCounts.keys(): #键值不存在就加入 25 | labelCounts[currentLabel] = 0 26 | labelCounts[currentLabel] += 1 #计数 27 | 28 | shannonEnt = 0.0 29 | for key in labelCounts: 30 | prob = float(labelCounts[key]) / numEntries 31 | shannonEnt -= prob * log(prob, 2) # 以2为底求对数 32 | 33 | return shannonEnt 34 | 35 | #按照给定特征划分数据集 36 | def splitDataSet(dataSet,axis,value):#传递参数:待划分的数据集,划分数据集的特征(第axis个特征),特征的返回值 37 | 38 | retDataSet = [] #创建新的list对象 39 | 40 | for featVec in dataSet: #抽取 41 | if featVec[axis] == value: 42 | reducedFeatVec = featVec[:axis] 43 | reducedFeatVec.extend(featVec[axis + 1:]) 44 | # print featVec 45 | retDataSet.append(reducedFeatVec) 46 | 47 | return retDataSet 48 | 49 | #选择最好的数据集划分方式 50 | def chooseBestFeatureToSplit(dataSet): 51 | 52 | numFeatures = len(dataSet[0]) - 1 53 | baseEntropy = calcShannonEnt(dataSet) #整个数据集的原始香农熵 54 | bestInfoGain = 0.0 55 | bestFeature = -1 56 | for i in range(numFeatures): #遍历全部特征 57 | featList = [example[i] for example in dataSet]#创建一个新的list对象 58 | uniqueVals = set(featList) #容器set 59 | newEntropy = 0.0 60 | # 遍历当前特征中的所有唯一属性值,对每个唯一属性值划分一次数据集,计算数据集的新熵值,并对所有唯一特征值得到的熵求和。 61 | for value in uniqueVals: #计算每一种划分方式的信息熵 62 | 63 | subDataSet = splitDataSet(dataSet, i, value) 64 | # 计算概率:特征值划分出子集概率 65 | prob = len(subDataSet)/float(len(dataSet)) 66 | #因为我们在根据一个特征计算香农熵的时候,该特征的分类值是相同,这个特征这个分类的香农熵为0, 67 | # 即当我们的分类只有一类是香农熵是0,而分类越多,香农熵会越大 68 | #所以计算新的香农熵的时候使用的是子集 69 | newEntropy += prob * calcShannonEnt(subDataSet) #计算新的熵 70 | 71 | infoGain = baseEntropy - newEntropy 72 | 73 | if (infoGain > bestInfoGain): #计算最好的信息增量 74 | bestInfoGain = infoGain 75 | bestFeature = i 76 | return bestFeature #返回一个整数,返回最好的axis 
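# 用 createDataSet() 的 5 条样本手工验算一遍上面的流程(对数以 2 为底):
# 整体熵 = -(2/5)*log(2/5) - (3/5)*log(3/5) ≈ 0.971
# 按第 0 个特征划分:子集 {yes,yes,no} 熵约 0.918,子集 {no,no} 熵为 0,
#   新熵 = 3/5*0.918 + 2/5*0 ≈ 0.551,信息增益 ≈ 0.420
# 按第 1 个特征划分:新熵 = 4/5*1.0 + 1/5*0 = 0.8,信息增益 ≈ 0.171
# 因此最好的划分特征是第 0 个,与下面 main 中打印的结果 0 一致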
77 | 78 | if __name__ =='__main__': 79 | 80 | # print createDataSet() 81 | dataSet ,labels = createDataSet() 82 | print calcShannonEnt(dataSet) 83 | 84 | print splitDataSet(dataSet,0,1) 85 | print splitDataSet(dataSet,1,1) 86 | print chooseBestFeatureToSplit(dataSet) # 0:The best axis 87 | -------------------------------------------------------------------------------- /ch3/lenses.txt: -------------------------------------------------------------------------------- 1 | young myope no reduced no lenses 2 | young myope no normal soft 3 | young myope yes reduced no lenses 4 | young myope yes normal hard 5 | young hyper no reduced no lenses 6 | young hyper no normal soft 7 | young hyper yes reduced no lenses 8 | young hyper yes normal hard 9 | pre myope no reduced no lenses 10 | pre myope no normal soft 11 | pre myope yes reduced no lenses 12 | pre myope yes normal hard 13 | pre hyper no reduced no lenses 14 | pre hyper no normal soft 15 | pre hyper yes reduced no lenses 16 | pre hyper yes normal no lenses 17 | presbyopic myope no reduced no lenses 18 | presbyopic myope no normal no lenses 19 | presbyopic myope yes reduced no lenses 20 | presbyopic myope yes normal hard 21 | presbyopic hyper no reduced no lenses 22 | presbyopic hyper no normal soft 23 | presbyopic hyper yes reduced no lenses 24 | presbyopic hyper yes normal no lenses 25 | -------------------------------------------------------------------------------- /ch3/lensesTree.txt: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'tearRate' 3 | p1 4 | (dp2 5 | S'reduced' 6 | p3 7 | S'no lenses' 8 | p4 9 | sS'normal' 10 | p5 11 | (dp6 12 | S'astigmatic' 13 | p7 14 | (dp8 15 | S'yes' 16 | p9 17 | (dp10 18 | S'prescript' 19 | p11 20 | (dp12 21 | S'hyper' 22 | p13 23 | (dp14 24 | S'age' 25 | p15 26 | (dp16 27 | S'pre' 28 | p17 29 | S'no lenses' 30 | p18 31 | sS'presbyopic' 32 | p19 33 | S'no lenses' 34 | p20 35 | sS'young' 36 | p21 37 | S'hard' 38 | p22 39 | sssS'myope' 40 | p23 41 | S'hard' 42 | p24 43 | sssS'no' 44 | p25 45 | (dp26 46 | g15 47 | (dp27 48 | S'pre' 49 | p28 50 | S'soft' 51 | p29 52 | sS'presbyopic' 53 | p30 54 | (dp31 55 | g11 56 | (dp32 57 | S'hyper' 58 | p33 59 | S'soft' 60 | p34 61 | sS'myope' 62 | p35 63 | S'no lenses' 64 | p36 65 | sssS'young' 66 | p37 67 | S'soft' 68 | p38 69 | ssssss. 
-------------------------------------------------------------------------------- /ch3/plotTree.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | # 定义文本框和箭头格式 6 | decisionNode = dict(boxstyle="sawtooth", fc="0.8") 7 | leafNode = dict(boxstyle="round4", fc="0.8") 8 | arrow_args = dict(arrowstyle="<-") 9 | 10 | # 获取叶节点的数目 11 | def getNumLeafs(myTree): 12 | numLeafs = 0 13 | firstStr = myTree.keys()[0] 14 | secondDict = myTree[firstStr] 15 | for key in secondDict.keys(): 16 | if type(secondDict[key]).__name__=='dict': 17 | numLeafs += getNumLeafs(secondDict[key]) 18 | else: numLeafs +=1 19 | return numLeafs 20 | 21 | # 获取树的层数 22 | def getTreeDepth(myTree): 23 | maxDepth = 0 24 | firstStr = myTree.keys()[0] 25 | secondDict = myTree[firstStr] 26 | for key in secondDict.keys(): 27 | if type(secondDict[key]).__name__=='dict': 28 | thisDepth = 1 + getTreeDepth(secondDict[key]) 29 | else: 30 | thisDepth = 1 31 | 32 | if thisDepth > maxDepth: maxDepth = thisDepth 33 | return maxDepth 34 | 35 | # 绘制带箭头的注解 36 | def plotNode(nodeTxt, centerPt, parentPt, nodeType): 37 | createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', #annotate可以在数据图形上添加文本注释 38 | xytext=centerPt, textcoords='axes fraction', 39 | va="center", ha="center", bbox=nodeType, arrowprops=arrow_args ) 40 | 41 | #更新createPlot代码以得到整棵树 42 | #计算父节点和子节点的中间位置 43 | def plotMidText(cntrPt, parentPt, txtString): #在父子节点间填充文本信息 44 | xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0] 45 | yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1] 46 | createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30) 47 | 48 | # 使用文本注解绘制树节点 49 | def plotTree(myTree, parentPt, nodeTxt): 50 | 51 | #计算宽与高 52 | numLeafs = getNumLeafs(myTree) 53 | depth = getTreeDepth(myTree) 54 | 55 | firstStr = myTree.keys()[0] 56 | cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff) 57 | 58 | #标志子节点属性值 59 | plotMidText(cntrPt, parentPt, nodeTxt) 60 | plotNode(firstStr, cntrPt, parentPt, decisionNode) 61 | 62 | secondDict = myTree[firstStr] 63 | 64 | plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD # 减少 y 偏移 65 | 66 | for key in secondDict.keys(): 67 | if type(secondDict[key]).__name__=='dict': 68 | plotTree(secondDict[key],cntrPt,str(key)) 69 | else: 70 | plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW 71 | plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode) 72 | plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key)) 73 | plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD 74 | 75 | def createPlot(inTree): 76 | fig = plt.figure(1, facecolor='white') 77 | fig.clf() 78 | axprops = dict(xticks=[], yticks=[]) 79 | createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) 80 | #createPlot.ax1 = plt.subplot(111, frameon=False) 81 | plotTree.totalW = float(getNumLeafs(inTree)) 82 | plotTree.totalD = float(getTreeDepth(inTree)) 83 | plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0; 84 | plotTree(inTree, (0.5,1.0), '') 85 | plt.show() 86 | 87 | 88 | # 决策树的读取 89 | def grabTree(filename): # 并在需要的时候将其读取出来 90 | import pickle 91 | fr = open(filename) 92 | return pickle.load(fr) 93 | 94 | if __name__ == '__main__': 95 | #TheTree = grabTree("TheTree.txt") 96 | TheTree = grabTree("lensesTree.txt") 97 | print TheTree 98 | createPlot(TheTree) 99 | 100 | -------------------------------------------------------------------------------- /ch3/screenshot/TheTree.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch3/screenshot/TheTree.png -------------------------------------------------------------------------------- /ch3/screenshot/lensesTree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch3/screenshot/lensesTree.png -------------------------------------------------------------------------------- /ch4/README.md: -------------------------------------------------------------------------------- 1 | # Ch04 - 朴素贝叶斯(Naive Bayes) 2 | 3 | ## 朴素贝叶斯概述 4 | 5 | 朴素贝叶斯是一种简单但是非常强大的线性分类器。它在垃圾邮件分类,疾病诊断中都取得了很大的成功。它只所以称为朴素,是因为它假设特征之间是相互独立的,但是在现实生活中,这种假设基本上是不成立的。那么即使是在假设不成立的条件下,它依然表现的很好,尤其是在小规模样本的情况下。但是,如果每个特征之间有很强的关联性和非线性的分类问题会导致朴素贝叶斯模型有很差的分类效果。 6 | 7 | 朴素贝叶斯分类器通过求出使得概率 ![\\inline P\(X|W\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X|W%29) 最大化的类别 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X),以确定特征向量 ![\\inline W = \(w_1, w_2, w_3, \\dots\)](http://latex.codecogs.com/png.latex?%5Cinline%20W%20%3D%20%28w_1%2C%20w_2%2C%20w_3%2C%20%5Cdots%29) 最有可能属于的类别。 8 | 9 | 根据条件概率公式,![\\inline P\(X|W\) = \\frac{P\(W|X\) \\times P\(X\)}{P\(W\)}](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X|W%29%20%3D%20%5Cfrac{P%28W|X%29%20%5Ctimes%20P%28X%29}{P%28W%29})。![\\inline P\(X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28X%29) 可以视为一个先验概率,用类别 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X) 在样本中的频率近似算出。![\\inline P\(W\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W%29) 虽然很难计算,但它是一个与 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X) 无关的常数,而我们只需要找到使得概率最大化的 ![\\inline X](http://latex.codecogs.com/png.latex?%5Cinline%20X),只要比较大小,并不需要精确算出这个概率,所以可以无视这个值。 10 | 11 | 问题就在于如何计算 ![\\inline P\(W|X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W|X%29),这里就是朴素贝叶斯分类器的“朴素”体现出来的地方。朴素贝叶斯分类器做了一个强假设,认为 ![\\inline W](http://latex.codecogs.com/png.latex?%5Cinline%20W) 里的每个特征都是互相独立的,即 ![\\inline P\(W|X\) = P\(w_1|X\) \\times P\(w_2|X\) \\times P\(w_3|X\)\\dots](http://latex.codecogs.com/png.latex?%5Cinline%20P%28W|X%29%20%3D%20P%28w_1|X%29%20%5Ctimes%20P%28w_2|X%29%20%5Ctimes%20P%28w_3|X%29%5Cdots),这就方便了我们的概率计算。 12 | 13 | 为了计算某一个特征的概率 ![\\inline P\(w|X\)](http://latex.codecogs.com/png.latex?%5Cinline%20P%28w|X%29),如果 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 的取值是离散的,直接使用古典概型计算即可;如果 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 的取值是连续的,可以假设 ![\\inline w](http://latex.codecogs.com/png.latex?%5Cinline%20w) 服从正态分布。 14 | 15 | 太多的小概率乘起来,可能会因为结果太小导致下溢或者得到不正确的答案。解决方法是:可以将概率取对数拟然(log-likelihood),这样乘法就变成了加法,取值虽然不相同,但也不影响最终答案。 16 | 17 | ## 朴素贝叶斯背后的数学原理 18 | 19 | ### 后验概率(Posterior Probabilities) 20 | ### 条件概率(Conditional Probabilities) 21 | ### 先验概率(Prior Probabilities) 22 | ### 现象概率(Evidence Probabilities) 23 | 24 | ## 应用: 25 | 贝叶斯模型在很多方面都有应用,熟知的领域就有垃圾邮件识别、文本的模糊匹配、欺诈判别、商品推荐等等。通过贝叶斯模型的阐述,应该有这样的一种体会:分析模型并不取决于多么复杂的数学公式,多么高级的软件工具,多么高深的算法组合;它们的原理往往是通俗易懂的,实现起来也没有多高的门槛。比如贝叶斯模型,用Excel的单元格和加减乘除的符号就能实现。所以,不要觉得数据分析建模有多遥远,其实就在你手边。 26 | -------------------------------------------------------------------------------- /ch4/advertisement.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | 
import operator 7 | from os import listdir 8 | import feedparser 9 | 10 | def createVocabList(dataSet): 11 | vocabSet = set([]) # 创建一个空集 12 | for document in dataSet: 13 | vocabSet = vocabSet | set(document) # 合并两个集合 14 | return list(vocabSet) 15 | 16 | 17 | # 朴素贝叶斯分类器训练函数 18 | def trainNB0(trainMatrix,trainCategory): 19 | numTrainDocs = len(trainMatrix) 20 | numWords = len(trainMatrix[0]) 21 | pAbusive = sum(trainCategory)/float(numTrainDocs) 22 | 23 | # 初始化概率 24 | p0Num = ones(numWords) 25 | p1Num = ones(numWords) 26 | p0Denom = 2.0 27 | p1Denom = 2.0 28 | for i in range(numTrainDocs): 29 | if trainCategory[i] == 1: 30 | # 向量相加 31 | p1Num += trainMatrix[i] 32 | p1Denom += sum(trainMatrix[i]) 33 | elif trainCategory[i] == 0: 34 | p0Num += trainMatrix[i] 35 | p0Denom += sum(trainMatrix[i]) 36 | 37 | # 对每个元素做除法 38 | p1Vect = p1Num / p1Denom 39 | p0Vect = p0Num / p0Denom 40 | # p1Vect = log(p1Num/p1Denom) # change to log 41 | # p0Vect = log(p0Num/p0Denom) # change to log 42 | 43 | return p0Vect,p1Vect,pAbusive 44 | 45 | # 朴素贝叶斯分类函数 46 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 47 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 48 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 49 | if p1 > p0: 50 | return 1 51 | else: 52 | return 0 53 | 54 | def textParse(bigString): 55 | import re 56 | listofTokens = re.split(r'\w*',bigString) 57 | # 过滤掉长度小于3的字符串 58 | return [tok.lower() for tok in listofTokens if len(tok) > 2] # 过滤掉长度小于三的字符串 59 | 60 | # 朴素贝叶斯词袋模型 61 | def bagOfWords2VecMN(vocabList, inputSet): 62 | returnVec = [0]*len(vocabList) 63 | for word in inputSet: 64 | if word in vocabList: 65 | returnVec[vocabList.index(word)] += 1 66 | return returnVec 67 | 68 | 69 | def calcMostFreq(vocabList,fullText): 70 | # 计算出现的频率 71 | freqDict = {} 72 | for token in vocabList: 73 | freqDict[token]=fullText.count(token) 74 | sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True) 75 | return sortedFreq[:30] 76 | 77 | def localWords(feed1,feed0): 78 | 79 | docList=[]; classList = []; fullText =[] 80 | minLen = min(len(feed1['entries']),len(feed0['entries'])) 81 | for i in range(minLen): 82 | wordList = textParse(feed1['entries'][i]['summary']) 83 | docList.append(wordList) 84 | fullText.extend(wordList) 85 | classList.append(1) #NY is class 1 86 | wordList = textParse(feed0['entries'][i]['summary']) 87 | docList.append(wordList) 88 | fullText.extend(wordList) 89 | classList.append(0) 90 | vocabList = createVocabList(docList) # create vocabulary 91 | top30Words = calcMostFreq(vocabList,fullText) # remove top 30 words 92 | for pairW in top30Words: 93 | if pairW[0] in vocabList: vocabList.remove(pairW[0]) 94 | trainingSet = range(2*minLen); testSet=[] # create test set 95 | for i in range(20): 96 | randIndex = int(random.uniform(0,len(trainingSet))) 97 | testSet.append(trainingSet[randIndex]) 98 | del(trainingSet[randIndex]) 99 | trainMat=[]; trainClasses = [] 100 | for docIndex in trainingSet: # train the classifier (get probs) trainNB0 101 | trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) 102 | trainClasses.append(classList[docIndex]) 103 | p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) 104 | errorCount = 0 105 | for docIndex in testSet: # classify the remaining items 106 | wordVector = bagOfWords2VecMN(vocabList, docList[docIndex]) 107 | if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]: 108 | errorCount += 1 109 | print 'the error rate is: ',float(errorCount)/len(testSet) 110 | return vocabList,p0V,p1V 111 
| 112 | 113 | if __name__ =='__main__': 114 | ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss') 115 | sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss') 116 | vocabList,pSF,pNY = localWords(ny,sf) 117 | 118 | -------------------------------------------------------------------------------- /ch4/classifyNB && testingNB.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | 10 | def loadDataSet(): 11 | postingList = [['my','dog','has','flea','problem','help','please',], 12 | ['maybe','not','take','him','to','dog','park','stupid'], 13 | ['my','dalmation','is','so','cute','I','love','him'], 14 | ['stop','posting','stupid','worthless','garbage'], 15 | ['mr','licks','ate','my','steak','how','to','stop','him'], 16 | ['quit','buying','worthless','dog','food','stdpid']] 17 | classVec = [0,1,0,1,0,1] # 1 代表侮辱性文字 0代表正常言论 18 | return postingList,classVec 19 | 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) # 创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) # 合并两个集合 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList, inputSet): 28 | returnVec = [0]*len(vocabList) # 创建一个其中的全部元素都为 0 的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print " %s不在字典集内!" % word 34 | return returnVec 35 | 36 | # 朴素贝叶斯分类器训练函数 37 | def trainNB0(trainMatrix,trainCategory): 38 | numTrainDocs = len(trainMatrix) 39 | numWords = len(trainMatrix[0]) 40 | pAbusive = sum(trainCategory)/float(numTrainDocs) 41 | 42 | # 初始化概率 43 | p0Num = ones(numWords) 44 | p1Num = ones(numWords) 45 | p0Denom = 2.0 46 | p1Denom = 2.0 47 | for i in range(numTrainDocs): 48 | if trainCategory[i] == 1: 49 | # 向量相加 50 | p1Num += trainMatrix[i] 51 | p1Denom += sum(trainMatrix[i]) 52 | elif trainCategory[i] == 0: 53 | p0Num += trainMatrix[i] 54 | p0Denom += sum(trainMatrix[i]) 55 | 56 | # 对每个元素做除法 57 | p1Vect = p1Num / p1Denom 58 | p0Vect = p0Num / p0Denom 59 | # p1Vect = log(p1Num/p1Denom) # change to log 60 | # p0Vect = log(p0Num/p0Denom) # change to log 61 | 62 | return p0Vect,p1Vect,pAbusive 63 | 64 | # 朴素贝叶斯分类函数 65 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 66 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 67 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 68 | if p1 > p0: 69 | return 1 70 | else: 71 | return 0 72 | 73 | def testingNB(): 74 | listOPosts,listClasses = loadDataSet() 75 | myVocabList = createVocabList(listOPosts) 76 | trainMat=[] 77 | 78 | for postinDoc in listOPosts: 79 | trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) 80 | 81 | p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) 82 | 83 | testEntry = ['love', 'my', 'dalmation'] 84 | thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) 85 | 86 | print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb) 87 | 88 | testEntry = ['stupid', 'garbage'] 89 | thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) 90 | print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb) 91 | 92 | 93 | if __name__ =='__main__': 94 | listPosts,listClass = loadDataSet() 95 | myVocaburaryList = createVocabList(listPosts) 96 | # print sorted(myVocaburaryList) 97 | # print setOfWords2Vec(myVocaburaryList,listPosts[0]) 98 | trainMat = [] 99 | for postinDoc in listPosts: 100 | 
trainMat.append(setOfWords2Vec(myVocaburaryList,postinDoc)) 101 | p0V,p1V,pAb = trainNB0(trainMat,listClass) 102 | #print p0V,p1V 103 | print pAb 104 | testingNB() 105 | -------------------------------------------------------------------------------- /ch4/email.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | def createVocabList(dataSet): 10 | vocabSet = set([]) # 创建一个空集 11 | for document in dataSet: 12 | vocabSet = vocabSet | set(document) # 合并两个集合 13 | return list(vocabSet) 14 | 15 | 16 | # 朴素贝叶斯分类器训练函数 17 | def trainNB0(trainMatrix, trainCategory): 18 | numTrainDocs = len(trainMatrix) 19 | numWords = len(trainMatrix[0]) 20 | pAbusive = sum(trainCategory) / float(numTrainDocs) 21 | 22 | # 初始化概率 23 | p0Num = ones(numWords) 24 | p1Num = ones(numWords) 25 | p0Denom = 2.0 26 | p1Denom = 2.0 27 | for i in range(numTrainDocs): 28 | if trainCategory[i] == 1: 29 | # 向量相加 30 | p1Num += trainMatrix[i] 31 | p1Denom += sum(trainMatrix[i]) 32 | elif trainCategory[i] == 0: 33 | p0Num += trainMatrix[i] 34 | p0Denom += sum(trainMatrix[i]) 35 | 36 | # 对每个元素做除法 37 | p1Vect = p1Num / p1Denom 38 | p0Vect = p0Num / p0Denom 39 | # p1Vect = log(p1Num/p1Denom) # change to log 40 | # p0Vect = log(p0Num/p0Denom) # change to log 41 | 42 | return p0Vect, p1Vect, pAbusive 43 | 44 | 45 | # 朴素贝叶斯分类函数 46 | def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): 47 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) # 元素相乘 48 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 49 | if p1 > p0: 50 | return 1 51 | else: 52 | return 0 53 | 54 | def textParse(bigString): 55 | import re 56 | listofTokens = re.split(r'\w*', bigString) 57 | # 过滤掉长度小于3的字符串 58 | return [tok.lower() for tok in listofTokens if len(tok) > 2] # 过滤掉长度小于三的字符串 59 | 60 | 61 | # 朴素贝叶斯词袋模型 62 | def bagOfWords2VecMN(vocabList, inputSet): 63 | returnVec = [0] * len(vocabList) 64 | for word in inputSet: 65 | if word in vocabList: 66 | returnVec[vocabList.index(word)] += 1 67 | return returnVec 68 | 69 | 70 | # 文本解析及完整的垃圾邮件测试函数 71 | def spamTest(): 72 | docList = [] 73 | classList = [] 74 | fullText = [] 75 | 76 | # 导入并解析文本文件 77 | for i in range(1, 26): 78 | wordList = textParse(open('email/spam/%d.txt' % i).read()) 79 | docList.append(wordList) 80 | fullText.extend(wordList) 81 | classList.append(1) 82 | 83 | wordList = textParse(open('email/ham/%d.txt' % i).read()) 84 | docList.append(wordList) 85 | fullText.extend(wordList) 86 | classList.append(0) 87 | 88 | vocabList = createVocabList(docList) 89 | trainingSet = range(50) 90 | testSet = [] 91 | 92 | # 随机构建训练集 93 | for i in range(10): 94 | randIndex = int(random.uniform(0, len(trainingSet))) 95 | testSet.append(trainingSet[randIndex]) 96 | del (trainingSet[randIndex]) 97 | 98 | trainMat = [] 99 | trainClasses = [] 100 | 101 | # calculate the probability 102 | for docIndex in trainingSet: 103 | trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) 104 | trainClasses.append(classList[docIndex]) 105 | p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses)) 106 | errorCount = 0 107 | 108 | # 对测试集分类 109 | for docIndex in testSet: 110 | wordVector = bagOfWords2VecMN(vocabList, docList[docIndex]) 111 | if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: 112 | errorCount += 1 113 | print "classification error", docList[docIndex] 114 | print 'the error rate is: ', float(errorCount) / len(testSet) 
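# 上面的做法是简单的留存交叉验证(hold-out cross validation):
# 从 50 封邮件中随机抽出 10 封作测试集,其余 40 封用于训练;
# 由于测试集是随机抽取的,每次运行的错误率会有波动,
# 多次调用 spamTest() 取平均错误率更能反映分类器的真实水平。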
115 | return vocabList, fullText 116 | 117 | 118 | if __name__ == '__main__': 119 | 120 | print spamTest() 121 | -------------------------------------------------------------------------------- /ch4/ham.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/ham.zip -------------------------------------------------------------------------------- /ch4/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # 效果图 2 | # math_matplotlib.py 3 | 4 | ![math_matplotlib](srceenshot/math_matplotlib.png) 5 | 6 | # matplotlib.py 7 | 8 | ![matplotlib](srceenshot/matplotlib.png) 9 | -------------------------------------------------------------------------------- /ch4/matplotlib/math_matplotlib.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | t = arange(0.0, 0.5, 0.01) 8 | #s = sin(2*t) 9 | s = sin(2*pi*t) 10 | logS = log(s) 11 | 12 | fig = plt.figure() 13 | ax = fig.add_subplot(211) 14 | ax.plot(t,s) # f(x) = sin(2*pi*t) 15 | ax.set_ylabel('f(x)') 16 | ax.set_xlabel('x') 17 | 18 | ax = fig.add_subplot(212) 19 | ax.plot(t,logS) # f(x) =log(s) 20 | ax.set_ylabel('ln(f(x))') 21 | ax.set_xlabel('x') 22 | plt.show() -------------------------------------------------------------------------------- /ch4/matplotlib/matplotlib.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | 5 | n = 1000 # number of points to create 6 | xcord0 = [] 7 | ycord0 = [] 8 | xcord1 = [] 9 | ycord1 = [] 10 | markers =[] 11 | colors =[] 12 | fw = open('testSet.txt','w') 13 | for i in range(n): 14 | [r0,r1] = random.standard_normal(2) 15 | myClass = random.uniform(0,1) 16 | if (myClass <= 0.5): 17 | fFlyer = r0 + 9.0 18 | tats = 1.0*r1 + fFlyer - 9.0 19 | xcord0.append(fFlyer) 20 | ycord0.append(tats) 21 | else: 22 | fFlyer = r0 + 2.0 23 | tats = r1+fFlyer - 2.0 24 | xcord1.append(fFlyer) 25 | ycord1.append(tats) 26 | #fw.write("%f\t%f\t%d\n" % (fFlyer, tats, classLabel)) 27 | 28 | fw.close() 29 | fig = plt.figure() 30 | ax = fig.add_subplot(111) 31 | #ax.scatter(xcord,ycord, c=colors, s=markers) 32 | ax.scatter(xcord0,ycord0, marker='^', s=90,c='blue') 33 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 34 | plt.plot([0,1], label='going up') 35 | plt.show() -------------------------------------------------------------------------------- /ch4/matplotlib/srceenshot/math_matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/matplotlib/srceenshot/math_matplotlib.png -------------------------------------------------------------------------------- /ch4/matplotlib/srceenshot/matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/matplotlib/srceenshot/matplotlib.png -------------------------------------------------------------------------------- /ch4/spam.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch4/spam.zip -------------------------------------------------------------------------------- /ch4/trainNB0.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from numpy import * 4 | from math import log 5 | import numpy as np 6 | import operator 7 | from os import listdir 8 | 9 | 10 | def loadDataSet(): 11 | postingList = [['my','dog','has','flea','problem','help','please',], 12 | ['maybe','not','take','him','to','dog','park','stupid'], 13 | ['my','dalmation','is','so','cute','I','love','him'], 14 | ['stop','posting','stupid','worthless','garbage'], 15 | ['mr','licks','ate','my','steak','how','to','stop','him'], 16 | ['quit','buying','worthless','dog','food','stdpid']] 17 | classVec = [0,1,0,1,0,1] # 1 代表侮辱性文字 0代表正常言论 18 | return postingList,classVec 19 | 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) # 创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) # 合并两个集合 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList, inputSet): 28 | returnVec = [0]*len(vocabList) # 创建一个其中的全部元素都为 0 的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print " %s不在字典集内!" % word 34 | return returnVec 35 | 36 | # 朴素贝叶斯分类器训练函数 37 | def trainNB0(trainMatrix,trainCategory): 38 | numTrainDocs = len(trainMatrix) 39 | numWords = len(trainMatrix[0]) 40 | pAbusive = sum(trainCategory)/float(numTrainDocs) 41 | 42 | # 初始化概率 43 | p0Num = ones(numWords) 44 | p1Num = ones(numWords) 45 | p0Denom = 2.0 46 | p1Denom = 2.0 47 | for i in range(numTrainDocs): 48 | if trainCategory[i] == 1: 49 | # 向量相加 50 | p1Num += trainMatrix[i] 51 | p1Denom += sum(trainMatrix[i]) 52 | elif trainCategory[i] == 0: 53 | p0Num += trainMatrix[i] 54 | p0Denom += sum(trainMatrix[i]) 55 | 56 | # 对每个元素做除法 57 | p1Vect = log(p1Num/p1Denom) # change to log 58 | p0Vect = log(p0Num/p0Denom) # change to log 59 | 60 | return p0Vect,p1Vect,pAbusive 61 | 62 | if __name__ =='__main__': 63 | listPosts,listClass = loadDataSet() 64 | myVocaburaryList = createVocabList(listPosts) 65 | print sorted(myVocaburaryList) 66 | print setOfWords2Vec(myVocaburaryList,listPosts[0]) 67 | -------------------------------------------------------------------------------- /ch5/README.md: -------------------------------------------------------------------------------- 1 | # Ch05 - Logistic回归(Logistic regression) 2 | 3 | ## Logistic回归的主要思想: 4 | #### Logistic回归是本书目前为止首次接触最优化算法。 5 | #### 根据现有数据对分类边界线建立回归公式,以此进行分类。 6 | #### 我们想要的函数应该是能接受所有的输入然后预测出类别,两个类的情况下输出0或者1,这种函数叫做单位阶越函数。然而,这种函数的问题在于: 7 | #### 该函数在跳跃点上从0 瞬间跳跃到1,这个瞬间跳跃过程优势很难处理。但幸好的是,另一个函数也有类似的性质,且数学上更容易处理,这就是Sigmoid函数。 8 | ## 下图是坐标尺度下的Sigmoid函数图。 9 | ![sigmoid](screenshot/sigmoid.png) 10 | 11 | ## 梯度上升法 12 | #### 这个在机器学习的数学上并不难。先说一个具体的函数例子。 13 | ![梯度上升法](screenshot/梯度上升.png) 14 | #### 梯度上升算法到达每个点都会重新估计移动的方向。从P0开始,计算完改点的梯度,函数就会根据梯度移动到下一个点P1,梯度再次被重新计算,并沿新的梯度方向移动到P2。如此循环迭代,直到满足停止条件。迭代的过程中,梯度算子总是保证我们能选取到最佳的移动方向。可以看到,梯度算子总是指向函数值增长最快的方向。这里说的只是移动方向,并不是移动量的大小。 15 | 16 | # 下面是几个有区别的训练算法效果示意图: 17 | 18 | ## Logistic回归最佳拟合直线 19 | ![Logistic回归最佳拟合直线](screenshot/Logistic回归最佳拟合直线.png) 20 | 21 | ## 随机梯度上升 22 | ![随机梯度上升](screenshot/随机梯度上升.png) 23 | 24 | ## 改进随机梯度上升 25 | ![改进随机梯度上升](screenshot/改进随机梯度上升.png) 26 | 27 | -------------------------------------------------------------------------------- /ch5/matplotlib/sigmoid.py: 
-------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import sys 3 | from pylab import * 4 | 5 | t = arange(-60.0, 60.3, 0.1) 6 | s = 1/(1 + exp(-t)) 7 | ax = subplot(211) 8 | ax.plot(t,s) 9 | ax.axis([-5,5,0,1]) 10 | plt.xlabel('x') 11 | plt.ylabel('Sigmoid(x)') 12 | ax = subplot(212) 13 | ax.plot(t,s) 14 | ax.axis([-60,60,0,1]) 15 | plt.xlabel('x') 16 | plt.ylabel('Sigmoid(x)') 17 | show() -------------------------------------------------------------------------------- /ch5/matplotlib/梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | import numpy as np 5 | import matplotlib.cm as cm 6 | import matplotlib.mlab as mlab 7 | import matplotlib.pyplot as plt 8 | 9 | leafNode = dict(boxstyle="round4", fc="0.8") 10 | arrow_args = dict(arrowstyle="<-") 11 | 12 | matplotlib.rcParams['xtick.direction'] = 'out' 13 | matplotlib.rcParams['ytick.direction'] = 'out' 14 | 15 | delta = 0.025 16 | x = np.arange(-2.0, 2.0, delta) 17 | y = np.arange(-2.0, 2.0, delta) 18 | X, Y = np.meshgrid(x, y) 19 | Z1 = -((X-1)**2) 20 | Z2 = -(Y**2) 21 | 22 | #Z1 = mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0) 23 | #Z2 = mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1) 24 | # difference of Gaussians 25 | 26 | Z = 1.0 * (Z2 + Z1)+5.0 27 | 28 | plt.figure() 29 | CS = plt.contour(X, Y, Z) 30 | plt.annotate('', xy=(0.05, 0.05), xycoords='axes fraction', 31 | xytext=(0.2,0.2), textcoords='axes fraction', 32 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 33 | plt.text(-1.9, -1.8, 'P0') 34 | plt.annotate('', xy=(0.2,0.2), xycoords='axes fraction', 35 | xytext=(0.35,0.3), textcoords='axes fraction', 36 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 37 | plt.text(-1.35, -1.23, 'P1') 38 | plt.annotate('', xy=(0.35,0.3), xycoords='axes fraction', 39 | xytext=(0.45,0.35), textcoords='axes fraction', 40 | va="center", ha="center", bbox=leafNode, arrowprops=arrow_args ) 41 | plt.text(-0.7, -0.8, 'P2') 42 | plt.text(-0.3, -0.6, 'P3') 43 | plt.clabel(CS, inline=1, fontsize=10) 44 | plt.title("Gradient Ascent") 45 | plt.xlabel('x') 46 | plt.ylabel('y') 47 | plt.show() 48 | -------------------------------------------------------------------------------- /ch5/screenshot/Logistic回归最佳拟合直线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/Logistic回归最佳拟合直线.png -------------------------------------------------------------------------------- /ch5/screenshot/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/sigmoid.png -------------------------------------------------------------------------------- /ch5/screenshot/改进随机梯度上升.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/改进随机梯度上升.png -------------------------------------------------------------------------------- /ch5/screenshot/梯度上升.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/梯度上升.png -------------------------------------------------------------------------------- /ch5/screenshot/随机梯度上升.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch5/screenshot/随机梯度上升.png -------------------------------------------------------------------------------- /ch5/testSet.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /ch5/使用梯度上升找最佳拟合直线.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = 
open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose() * diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def plotBestFit(weights): 33 | import matplotlib.pyplot as plt 34 | dataMat,labelMat=loadDataSet() 35 | dataArr = array(dataMat) 36 | n = shape(dataArr)[0] 37 | xcord1 = []; ycord1 = [] 38 | xcord2 = []; ycord2 = [] 39 | for i in range(n): 40 | if int(labelMat[i])== 1: 41 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 42 | else: 43 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 44 | fig = plt.figure() 45 | ax = fig.add_subplot(111) 46 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 47 | ax.scatter(xcord2, ycord2, s=30, c='green') 48 | x = arange(-3.0, 3.0, 0.1) 49 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 50 | ax.plot(x, y) 51 | plt.xlabel('X1') 52 | plt.ylabel('X2') 53 | plt.show() 54 | 55 | if __name__ == "__main__" : 56 | dataMat,labelMat = loadDataSet() 57 | # print dataMat,labelMat 58 | weight = gradAscent(dataMat,labelMat) 59 | plotBestFit(weight) 60 | 61 | 62 | -------------------------------------------------------------------------------- /ch5/改进随机梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose()*diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def stocGradAscent(dataMatrix, classLabels, numIter=150): 33 | m,n = shape(dataMatrix) 34 | weights = ones(n) #initialize to all ones 35 | for j in range(numIter): 36 | dataIndex = range(m) 37 | for i in range(m): 38 | # apha decreases with iteration, does not 39 | alpha = 4/(1.0+j+i)+0.0001 40 | # go to 0 because of the constant 41 | randIndex = int(random.uniform(0,len(dataIndex))) 42 | h = sigmoid(sum(dataMatrix[randIndex]*weights)) 43 | error = classLabels[randIndex] - h 44 | weights = weights + alpha * error * dataMatrix[randIndex] 45 | del(dataIndex[randIndex]) 46 | return weights 47 | 48 | def plotBestFit(weights): 49 | import matplotlib.pyplot as plt 50 | dataMat,labelMat=loadDataSet() 51 | dataArr = array(dataMat) 52 | n = shape(dataArr)[0] 53 | xcord1 = []; ycord1 = [] 54 
| xcord2 = []; ycord2 = [] 55 | for i in range(n): 56 | if int(labelMat[i])== 1: 57 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 58 | else: 59 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 60 | fig = plt.figure() 61 | ax = fig.add_subplot(111) 62 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 63 | ax.scatter(xcord2, ycord2, s=30, c='green') 64 | x = arange(-3.0, 3.0, 0.1) 65 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 66 | ax.plot(x, y) 67 | plt.xlabel('X1') 68 | plt.ylabel('X2') 69 | plt.show() 70 | 71 | if __name__ == "__main__" : 72 | dataMat,labelMat = loadDataSet() 73 | # print dataMat,labelMat 74 | weight = stocGradAscent(array(dataMat),labelMat) 75 | plotBestFit(weight) 76 | 77 | 78 | -------------------------------------------------------------------------------- /ch5/随机梯度上升.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from numpy import * 4 | 5 | def loadDataSet(): 6 | dataMat = [] #list 7 | labelMat = [] #list 8 | fr = open('testSet.txt') 9 | for line in fr.readlines(): 10 | lineArr = line.strip().split() 11 | dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])]) 12 | labelMat.append(int(lineArr[2])) 13 | return dataMat,labelMat 14 | 15 | def sigmoid(z): 16 | return 1.0 / (1 + exp(-z)) 17 | 18 | # Logistic 回归梯度上升优化算法 19 | def gradAscent(datamat,classlabel): 20 | dataMat = mat(datamat) #convert to NumPy matrix 21 | labeMat = mat(classlabel).transpose() #convert to NumPy matrix 22 | m,n = shape(dataMat) 23 | alpha = 0.001 # 向目标移动的步长 24 | maxCycles = 500 # 迭代次数 25 | weight = ones((n,1)) 26 | for k in range(maxCycles): 27 | h = sigmoid(dataMat * weight) 28 | diff = labeMat - h #误差 29 | weight += alpha * dataMat.transpose()*diff #这里不止一次乘积运算 30 | return weight 31 | 32 | def stocGradAscent(dataMatrix, classLabels): 33 | m,n = shape(dataMatrix) 34 | alpha = 0.01 35 | weights = ones(n) #initialize to all ones 36 | for i in range(m): 37 | h = sigmoid(sum(dataMatrix[i]*weights)) 38 | error = classLabels[i] - h 39 | weights = weights + alpha * error * dataMatrix[i] 40 | return weights 41 | 42 | def plotBestFit(weights): 43 | import matplotlib.pyplot as plt 44 | dataMat,labelMat=loadDataSet() 45 | dataArr = array(dataMat) 46 | n = shape(dataArr)[0] 47 | xcord1 = []; ycord1 = [] 48 | xcord2 = []; ycord2 = [] 49 | for i in range(n): 50 | if int(labelMat[i])== 1: 51 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 52 | else: 53 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 54 | fig = plt.figure() 55 | ax = fig.add_subplot(111) 56 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 57 | ax.scatter(xcord2, ycord2, s=30, c='green') 58 | x = arange(-3.0, 3.0, 0.1) 59 | y = (-weights[0]-weights[1]*x)/weights[2] #最佳拟合直线 60 | ax.plot(x, y) 61 | plt.xlabel('X1') 62 | plt.ylabel('X2') 63 | plt.show() 64 | 65 | if __name__ == "__main__" : 66 | dataMat,labelMat = loadDataSet() 67 | # print dataMat,labelMat 68 | weight = stocGradAscent(array(dataMat),labelMat) 69 | plotBestFit(weight) 70 | 71 | 72 | -------------------------------------------------------------------------------- /ch6/README.md: -------------------------------------------------------------------------------- 1 | # Ch06 - 支持向量机(Support vector machines) 2 | 3 | #### 支持向量机是这本书里面最难的一个算法了,其中的原理需要的数学知识最多,当然SVM也是用的非常多的分类器。 4 | #### SVM有很多实现,但这章只关注序列最小优化(SMO)算法。同时,介绍一种称为核函数的方式将SVM扩展到更多的数据集上。在Logitic回归中的介绍的数据集中,它们都是可以在途中画出一条直线将两组数据点分开。这组数据又叫做线性可分数据。但是我们的数据不能用一条直线分开时呢?比如下面这些数据点。 5 | ## 4个线性不可分的数据集示意图 6 | 
![4个线性不可分的数据集](screenshot/4个线性不可分的数据集.png) 7 | 8 | #### SMO算法的目标是求出一系列alpha和b,一旦求出这些alpha,就很容易计算出权重向量w并得到分割超平面。 9 | #### SMO算法原理:每次循环中选择两个alpha进行优化处理。一旦找到一对合适alpha,那就增大其中一个同时减少另一个。“合适”是指两个alpha必须要符合一定的条件,条件之一就是这两个alpha必须要在间隔边界之外,而其第二个条件则是这两个alpha还没有进行过区间化处理或者不在边界上。 10 | ## 简化版SMO效果图 11 | ![简化版SMO效果图](screenshot/简化版SMO效果图.png) 12 | #### 上图是数据集上运行简化版SMO后得到的结果,包括画圈的支持向量与分隔超平面。 13 | 14 | ## 完整版SMO效果图(优化速度) 15 | ![完整版SMO](screenshot/完整版SMO.png) 16 | #### 上图是数据集上运行简化版SMO后得到的结果,包括画圈的支持向量与分隔超平面。和简化版的稍微不同。 17 | 18 | 19 | ## 核方法中的非线性可分数据效果图 20 | ![核方法中的非线性可分数据](screenshot/核方法中的非线性可分数据.png) 21 | 22 | #### 如果线性不可分的话,我们需要把原始空间的数据映射到一个高维空间,就可以做到线性可分,这里就需要用到核函数。利用核函数可以将数据映射到高维空间,然后进行线性可分。形象的例子:https://www.zhihu.com/question/21094489 23 | 24 | ## 拉格朗日乘子法与KKT条件(自行了解) 25 | 26 | ## 基于SVM的数字识别(自行写代码)(和KNN类似) 27 | #### 进行数字识别时,采用SVM要比KNN好。 28 | 29 | -------------------------------------------------------------------------------- /ch6/digits.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/digits.zip -------------------------------------------------------------------------------- /ch6/matplotlib/4个线性不可分的数据集效果图.py: -------------------------------------------------------------------------------- 1 | #coding utf-8 2 | 3 | from numpy import 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | xcord0 = []; ycord0 = [] 8 | xcord1 = []; ycord1 = [] 9 | markers =[] 10 | colors =[] 11 | fr = open('testSet.txt') # this file was generated by 2normalGen.py 12 | for line in fr.readlines() 13 | lineSplit = line.strip().split('t') 14 | xPt = float(lineSplit[0]) 15 | yPt = float(lineSplit[1]) 16 | label = int(lineSplit[2]) 17 | if (label == 0) 18 | xcord0.append(xPt) 19 | ycord0.append(yPt) 20 | else 21 | xcord1.append(xPt) 22 | ycord1.append(yPt) 23 | 24 | fr.close() 25 | fig = plt.figure() 26 | ax = fig.add_subplot(221) 27 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 28 | for i in range(300) 29 | [x,y] = random.uniform(0,1,2) 30 | if ((x 0.5) and (y 0.5)) or ((x 0.5) and (y 0.5)) 31 | xcord0.append(x); ycord0.append(y) 32 | else 33 | xcord1.append(x); ycord1.append(y) 34 | ax.scatter(xcord0,ycord0, marker='s', s=90) 35 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 36 | plt.title('A') 37 | ax = fig.add_subplot(222) 38 | xcord0 = random.standard_normal(150); ycord0 = random.standard_normal(150) 39 | xcord1 = random.standard_normal(150)+2.0; ycord1 = random.standard_normal(150)+2.0 40 | ax.scatter(xcord0,ycord0, marker='s', s=90) 41 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 42 | plt.title('B') 43 | ax = fig.add_subplot(223) 44 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 45 | for i in range(300) 46 | [x,y] = random.uniform(0,1,2) 47 | if (x 0.5) 48 | xcord0.append(xcos(2.0piy)); ycord0.append(xsin(2.0piy)) 49 | else 50 | xcord1.append(xcos(2.0piy)); ycord1.append(xsin(2.0piy)) 51 | 52 | ax.scatter(xcord0,ycord0, marker='s', s=90) 53 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 54 | 55 | plt.title('C') 56 | ax = fig.add_subplot(224) 57 | xcord1 = zeros(150); ycord1 = zeros(150) 58 | xcord0 = random.uniform(-3,3,350); ycord0 = random.uniform(-3,3,350); 59 | 60 | xcord1[050] = 0.3random.standard_normal(50)+2.0; ycord1[050] = 0.3random.standard_normal(50)+2.0 61 | 62 | xcord1[50100] = 0.3random.standard_normal(50)-2.0; ycord1[50100] = 0.3random.standard_normal(50)-3.0 63 | 64 | xcord1[100150] = 
0.3random.standard_normal(50)+1.0; ycord1[100150] = 0.3random.standard_normal(50) 65 | 66 | ax.scatter(xcord0,ycord0, marker='s', s=90) 67 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 68 | plt.title('D') 69 | plt.show() -------------------------------------------------------------------------------- /ch6/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib code 2 | -------------------------------------------------------------------------------- /ch6/matplotlib/完整版SMO效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Circle 7 | 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fr = open('testSet.txt')#this file was generated by 2normalGen.py 15 | for line in fr.readlines(): 16 | lineSplit = line.strip().split('\t') 17 | xPt = float(lineSplit[0]) 18 | yPt = float(lineSplit[1]) 19 | label = int(lineSplit[2]) 20 | if (label == -1): 21 | xcord0.append(xPt) 22 | ycord0.append(yPt) 23 | else: 24 | xcord1.append(xPt) 25 | ycord1.append(yPt) 26 | 27 | fr.close() 28 | fig = plt.figure() 29 | ax = fig.add_subplot(111) 30 | ax.scatter(xcord0,ycord0, marker='s', s=90) 31 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 32 | plt.title('Support Vectors Circled') 33 | 34 | circle = Circle((4.6581910000000004, 3.507396), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 35 | ax.add_patch(circle) 36 | circle = Circle((3.4570959999999999, -0.082215999999999997), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 37 | ax.add_patch(circle) 38 | circle = Circle((6.0805730000000002, 0.41888599999999998), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 39 | ax.add_patch(circle) 40 | circle = Circle((2.911290000000001, -1.590919999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 41 | ax.add_patch(circle) 42 | circle = Circle((5.310480000000001, -2.386369999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 43 | ax.add_patch(circle) 44 | circle = Circle((8.245097000000001, 1.515159999999999), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 45 | ax.add_patch(circle) 46 | #plt.plot([2.3,8.5], [-6,6]) #seperating hyperplane 47 | b = -3.75567; w0=0.8065; w1=-0.2761 48 | x = arange(-2.0, 12.0, 0.1) 49 | y = (-w0*x - b)/w1 50 | ax.plot(x,y) 51 | ax.axis([-2,12,-8,6]) 52 | plt.show() -------------------------------------------------------------------------------- /ch6/matplotlib/核方法中的非线性可分数据效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 8 | fw = open('testSetRBF2.txt', 'w') # input data 9 | 10 | fig = plt.figure() 11 | ax = fig.add_subplot(111) 12 | xcord0 = []; ycord0 = []; xcord1 = []; ycord1 = [] 13 | for i in range(100): 14 | [x,y] = random.uniform(0,1,2) 15 | xpt=x*cos(2.0*pi*y); ypt = x*sin(2.0*pi*y) 16 | if (x > 0.5): 17 | xcord0.append(xpt); ycord0.append(ypt) 18 | label = -1.0 19 | else: 20 | xcord1.append(xpt); ycord1.append(ypt) 21 | label = 1.0 22 | fw.write('%f\t%f\t%f\n' % (xpt, ypt, label)) 23 | ax.scatter(xcord0,ycord0, marker='s', s=90) 24 | 
ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 25 | plt.title('Non-linearly Separable Data for Kernel Method') 26 | plt.show() 27 | fw.close() -------------------------------------------------------------------------------- /ch6/matplotlib/简化版SMO处理小数据集效果图.py: -------------------------------------------------------------------------------- 1 | #coding : utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Circle 7 | 8 | xcord0 = [] 9 | ycord0 = [] 10 | xcord1 = [] 11 | ycord1 = [] 12 | markers =[] 13 | colors =[] 14 | fr = open('testSet.txt')#this file was generated by 2normalGen.py 15 | for line in fr.readlines(): 16 | lineSplit = line.strip().split('\t') 17 | xPt = float(lineSplit[0]) 18 | yPt = float(lineSplit[1]) 19 | label = int(lineSplit[2]) 20 | if (label == -1): 21 | xcord0.append(xPt) 22 | ycord0.append(yPt) 23 | else: 24 | xcord1.append(xPt) 25 | ycord1.append(yPt) 26 | 27 | fr.close() 28 | fig = plt.figure() 29 | ax = fig.add_subplot(111) 30 | ax.scatter(xcord0,ycord0, marker='s', s=90) 31 | ax.scatter(xcord1,ycord1, marker='o', s=50, c='red') 32 | plt.title('Support Vectors Circled') 33 | circle = Circle((4.6581910000000004, 3.507396), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 34 | ax.add_patch(circle) 35 | circle = Circle((3.4570959999999999, -0.082215999999999997), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 36 | ax.add_patch(circle) 37 | circle = Circle((6.0805730000000002, 0.41888599999999998), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5) 38 | ax.add_patch(circle) 39 | #plt.plot([2.3,8.5], [-6,6]) #seperating hyperplane 40 | b = -3.75567; w0=0.8065; w1=-0.2761 41 | x = arange(-2.0, 12.0, 0.1) 42 | y = (-w0*x - b)/w1 43 | ax.plot(x,y) 44 | ax.axis([-2,12,-8,6]) 45 | plt.show() -------------------------------------------------------------------------------- /ch6/screenshot/4个线性不可分的数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/4个线性不可分的数据集.png -------------------------------------------------------------------------------- /ch6/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch6/screenshot/完整版SMO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/完整版SMO.png -------------------------------------------------------------------------------- /ch6/screenshot/核方法中的非线性可分数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/核方法中的非线性可分数据.png -------------------------------------------------------------------------------- /ch6/screenshot/简化版SMO效果图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch6/screenshot/简化版SMO效果图.png -------------------------------------------------------------------------------- /ch6/testSet.txt: 
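A note on the kernel-method discussion in the ch6 README above: the script just shown writes out data that is separable only by radius, and passing such data through a radial-basis (Gaussian) kernel is what makes it linearly separable in an implicit higher-dimensional space. Below is a minimal sketch of that kernel, for illustration only; the parameter name sigma and its default value are assumptions, and the scaling convention used in the book's own SVM code may differ.

```python
# Minimal sketch of a radial-basis (Gaussian) kernel:
# k(x, z) = exp(-||x - z||^2 / (2 * sigma**2)).  Illustrative only.
import numpy as np

def rbf_kernel(x, z, sigma=1.0):
    diff = np.asarray(x, dtype=float) - np.asarray(z, dtype=float)
    return np.exp(-diff.dot(diff) / (2.0 * sigma ** 2))
```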
-------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 -0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /ch6/testSetRBF.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 
-1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 -0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | -0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 -1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | -0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /ch6/testSetRBF2.txt: -------------------------------------------------------------------------------- 1 | 0.676771 -0.486687 -1.000000 2 | 0.008473 0.186070 1.000000 3 | -0.727789 0.594062 -1.000000 4 | 0.112367 0.287852 1.000000 5 | 0.383633 -0.038068 1.000000 6 | -0.927138 -0.032633 -1.000000 7 | -0.842803 -0.423115 -1.000000 8 | -0.003677 -0.367338 1.000000 9 | 0.443211 -0.698469 -1.000000 10 | -0.473835 0.005233 1.000000 11 | 0.616741 0.590841 -1.000000 12 | 0.557463 -0.373461 -1.000000 13 | -0.498535 -0.223231 -1.000000 14 | -0.246744 0.276413 1.000000 15 | -0.761980 -0.244188 -1.000000 16 | 0.641594 -0.479861 -1.000000 17 | -0.659140 0.529830 -1.000000 18 | -0.054873 -0.238900 1.000000 19 | -0.089644 -0.244683 1.000000 20 | -0.431576 -0.481538 -1.000000 21 | -0.099535 0.728679 -1.000000 22 | 
-0.188428 0.156443 1.000000 23 | 0.267051 0.318101 1.000000 24 | 0.222114 -0.528887 -1.000000 25 | 0.030369 0.113317 1.000000 26 | 0.392321 0.026089 1.000000 27 | 0.298871 -0.915427 -1.000000 28 | -0.034581 -0.133887 1.000000 29 | 0.405956 0.206980 1.000000 30 | 0.144902 -0.605762 -1.000000 31 | 0.274362 -0.401338 1.000000 32 | 0.397998 -0.780144 -1.000000 33 | 0.037863 0.155137 1.000000 34 | -0.010363 -0.004170 1.000000 35 | 0.506519 0.486619 -1.000000 36 | 0.000082 -0.020625 1.000000 37 | 0.057761 -0.155140 1.000000 38 | 0.027748 -0.553763 -1.000000 39 | -0.413363 -0.746830 -1.000000 40 | 0.081500 -0.014264 1.000000 41 | 0.047137 -0.491271 1.000000 42 | -0.267459 0.024770 1.000000 43 | -0.148288 -0.532471 -1.000000 44 | -0.225559 -0.201622 1.000000 45 | 0.772360 -0.518986 -1.000000 46 | -0.440670 0.688739 -1.000000 47 | 0.329064 -0.095349 1.000000 48 | 0.970170 -0.010671 -1.000000 49 | -0.689447 -0.318722 -1.000000 50 | -0.465493 -0.227468 -1.000000 51 | -0.049370 0.405711 1.000000 52 | -0.166117 0.274807 1.000000 53 | 0.054483 0.012643 1.000000 54 | 0.021389 0.076125 1.000000 55 | -0.104404 -0.914042 -1.000000 56 | 0.294487 0.440886 -1.000000 57 | 0.107915 -0.493703 -1.000000 58 | 0.076311 0.438860 1.000000 59 | 0.370593 -0.728737 -1.000000 60 | 0.409890 0.306851 -1.000000 61 | 0.285445 0.474399 -1.000000 62 | -0.870134 -0.161685 -1.000000 63 | -0.654144 -0.675129 -1.000000 64 | 0.285278 -0.767310 -1.000000 65 | 0.049548 -0.000907 1.000000 66 | 0.030014 -0.093265 1.000000 67 | -0.128859 0.278865 1.000000 68 | 0.307463 0.085667 1.000000 69 | 0.023440 0.298638 1.000000 70 | 0.053920 0.235344 1.000000 71 | 0.059675 0.533339 -1.000000 72 | 0.817125 0.016536 -1.000000 73 | -0.108771 0.477254 1.000000 74 | -0.118106 0.017284 1.000000 75 | 0.288339 0.195457 1.000000 76 | 0.567309 -0.200203 -1.000000 77 | -0.202446 0.409387 1.000000 78 | -0.330769 -0.240797 1.000000 79 | -0.422377 0.480683 -1.000000 80 | -0.295269 0.326017 1.000000 81 | 0.261132 0.046478 1.000000 82 | -0.492244 -0.319998 -1.000000 83 | -0.384419 0.099170 1.000000 84 | 0.101882 -0.781145 -1.000000 85 | 0.234592 -0.383446 1.000000 86 | -0.020478 -0.901833 -1.000000 87 | 0.328449 0.186633 1.000000 88 | -0.150059 -0.409158 1.000000 89 | -0.155876 -0.843413 -1.000000 90 | -0.098134 -0.136786 1.000000 91 | 0.110575 -0.197205 1.000000 92 | 0.219021 0.054347 1.000000 93 | 0.030152 0.251682 1.000000 94 | 0.033447 -0.122824 1.000000 95 | -0.686225 -0.020779 -1.000000 96 | -0.911211 -0.262011 -1.000000 97 | 0.572557 0.377526 -1.000000 98 | -0.073647 -0.519163 -1.000000 99 | -0.281830 -0.797236 -1.000000 100 | -0.555263 0.126232 -1.000000 101 | -------------------------------------------------------------------------------- /ch6/简化版SMO处理小数据集.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import numpy as np 5 | from time import sleep 6 | 7 | def loadDataSet(filename): #读入数据 8 | dataMat = [] ; labelMat = [] #创建两个数组 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = line.strip().split('\t') #对当前行进行去回车,空格操作 12 | dataMat.append([float(lineArr[0]),float(lineArr[1])]) #将两个特征加入dataMat 13 | labelMat.append((float(lineArr[2])))#将标签加入labelMat 14 | return dataMat,labelMat 15 | 16 | def selectJrand(i,m):#用于在区间内选择一个整数,i为alpha的下标,m为alpha的个数 17 | j = i 18 | while(j==i):#只要函数值不等于输入值i就会随机,因为要满足 ∑alpha(i)*label(i)=0,同时改变两个alpha 19 | j = int(random.uniform(0,m)) 20 | return j 21 | 22 | def clipAlpha(aj,H,L):#用来调整大于H或小于L的alpha值 23 | if aj>H: 24 | aj = 
H 25 | if L > aj: 26 | aj = L 27 | return aj 28 | 29 | # 简化版SMO 30 | # 这本数最大的一个函数 31 | # 输入参数:数据集,类别标签,常数C,容错率,取消前最大的循环次数 32 | def smoSimple(dataMatIn,classLabels,C,toler,maxIter):#数据集,类别标签,常熟C,容错率,退出前的最大循环次数 33 | dataMatrix = mat(dataMatIn) ; #转换成numpy矩阵 34 | labelMat = mat(classLabels).transpose() #转换成numpy矩阵 35 | b = 0 ; m,n = shape(dataMatrix) #求出行列 36 | alphas = mat(zeros((m,1)))#讲alpha都初始化为0 37 | iter = 0#没有任何alpha改变下的遍历数据集的次数 38 | while (iter < maxIter) : #当迭代次数小于最大迭代次数 39 | alphaPairsChanged = 0 #用来记录alpha是否被优化 40 | for i in range(m): #对m行数据进行处理 41 | fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b #预测的类别 42 | Ei = fXi - float(labelMat[i]) #误差Ei 43 | #如果误差很大,就可以基于该组数据所对应的alpha进行优化 44 | if ((labelMat[i]*Ei < -toler )and (alphas[i] < C )) or ((labelMat[i]*Ei > toler ) and alphas[i]>0 ) : 45 | #在if语句,测试正间隔和负间隔,同时检查alpha值,保证其不能等于0或C 46 | j = selectJrand(i,m) #随机第二个alpha 47 | fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b 48 | Ej = fXj - float(labelMat[j]) 49 | alphaIold = alphas[i].copy() 50 | alphaJold = alphas[j].copy() #把两个alpha赋值,这样的好处是不改变原有alphas的值 51 | if(labelMat[i] != labelMat[j]):#如果标签向量不相等,保证alpha再0~C之间 52 | L = max(0,alphas[j]+alphas[i]) 53 | H = min(C,C+alphas[j]-alphas[i]) 54 | else: 55 | L = max(0,alphas[j]+alphas[i] - C) 56 | H = min(C,alphas[j]+alphas[i]) 57 | if L == H : print("L==H") ; continue 58 | eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - \ 59 | dataMatrix[j,:]*dataMatrix[j,:].T #是alpha[j]的最优修改量 60 | if eta >= 0 : print "eta>=0";continue 61 | alphas[j] -= labelMat[j]*(Ei - Ej) / eta 62 | alphas[j] = clipAlpha(alphas[j],H,L) #调整alpha的大小 63 | if(abs(alphas[j]-alphaJold) < 0.00001) : print "j not moving enough " ; continue#检查alpha[j] 64 | alphas[i]+=labelMat[i]*labelMat[j]*(alphaJold-alphas[j]) #对i进行修改,修改量与j相同,但方向相反 65 | b1 = b - Ei - labelMat[i]*(alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[i,:].T- \ 66 | labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T 67 | b2 = b - Ej - labelMat[i]*(alphas[i] - alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - \ 68 | labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T 69 | if (0alphas[i]) : b = b1 70 | elif (0alphas[j]) : b = b2 71 | else: b = (b1 + b2 ) / 2.0 72 | alphaPairsChanged+=1 73 | print "iter : %d i:%d , pairs changed %d " % (iter , i , alphaPairsChanged) 74 | if (alphaPairsChanged==0) : iter +=1 75 | else:iter = 0 76 | print "ietration number %d " % iter 77 | return b,alphas 78 | 79 | 80 | if __name__ == "__main__": 81 | dataMat,labelMat = loadDataSet("testSet.txt") 82 | # print labelMat 83 | print smoSimple(dataMat,labelMat,0.6,0.001,40) 84 | -------------------------------------------------------------------------------- /ch7/README.md: -------------------------------------------------------------------------------- 1 | # Ch07 - 利用AdaBoost元算法提高分类性能(Improving classification with the AdaBoost meta-algorithm) 2 | 3 | #### Adaboost是一种迭代算法,其核心思想是针对同一个训练集训练不同的分类器(弱分类器),然后把这些弱分类器集合起来,构成一个更强的最终分类器(强分类器)。 4 | #### 本章用单层决策树作为弱学习器构造了 Adaboost分类器。 实际上,Adaboost函数可以应用于任意分类器,只要该分类器能够处理加权数据即可。 5 | 6 | ## 本章使用的单层决策树测试数据 7 | ![单层决策树测试数据](screenshot/单层决策树测试数据.png) 8 | #### 元算法:算法最后的评估,不是靠一个模型给出的结果,而是综合考虑多个模型结果,来得出最后的结果。 9 | 10 | #### bagging:基于数据随机重抽样的分类器构建方法。 11 | #### boosting:关注被已有分类器错分的那些数据来获得新的分类器。 12 | 13 | ## AdaBoost: 14 | #### 
自适应boosting。运行过程:对训练数据中的每个样本,先赋予其一个权重,这些权重开始都相等,先在一个弱分类器上计算错误率,然后在统一数据集上再次训练弱分类器,第二次训练时,将会重新调整每个样本的权重,第一次分对的样本的权重将会降低,分错的样本权重会提高。同时,AdaBoost为每个分类器都分配了一个权重α,这些权重是基于错误率算出来的。具体数学计算公式看书。 15 | 16 | #### AdaBoost算法会这样一直迭代,直到训练出错误率为 0 或者到达迭代次数为止。 17 | 18 | ## AdaBoost的过拟合问题 19 | #### 多个分类器组合可能会进一步凸显出单分类器的不足。当分类器数目越来越多,训练错误率肯定是越来越小,但是测试错误率却是先减后增,这时就是发生了过拟合。 20 | 21 | ## 其他分类性能度量指标 22 | #### 包括正确率(precise),召回率(recall),以及ROC曲线等。 23 | ## ROC曲线 24 | ![ROC曲线](screenshot/ROC曲线.png) 25 | -------------------------------------------------------------------------------- /ch7/matplotlib/单层决策树测试数据.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | 9 | # def loadDataSet(fileName): 10 | # dataMat = []; labelMat = [] 11 | # fr = open(fileName) 12 | # for line in fr.readlines(): 13 | # lineArr = line.strip().split('\t') 14 | # dataMat.append([float(lineArr[0]), float(lineArr[1])]) 15 | # labelMat.append(float(lineArr[2])) 16 | # return dataMat,labelMat 17 | 18 | 19 | if __name__ == "__main__" : 20 | # datMat,classLabels = loadDataSet("horseColicTraining2.txt") 21 | # datMat = matrix(datMat) 22 | # print datMat 23 | # print "size=%d" % len(datMat) 24 | # print datMat[1,0] 25 | # print classLabels 26 | # print "size=%d" % len(classLabels) 27 | datMat = matrix([[ 1. , 2.1], 28 | [ 1.5, 1.6], 29 | [ 1.3, 1. ], 30 | [ 1. , 1. ], 31 | [ 2. , 1. ]]) 32 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 33 | xcord0 = [] 34 | ycord0 = [] 35 | xcord1 = [] 36 | ycord1 = [] 37 | markers = [] 38 | colors = [] 39 | 40 | for i in range(len(classLabels)): 41 | if classLabels[i] == 1.0: 42 | xcord1.append(datMat[i, 0]), ycord1.append(datMat[i, 1]) 43 | else: 44 | xcord0.append(datMat[i, 0]), ycord0.append(datMat[i, 1]) 45 | 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111) 48 | ax.scatter(xcord0, ycord0, marker='s', s=90) 49 | ax.scatter(xcord1, ycord1, marker='o', s=50, c='red') 50 | plt.title('decision stump test data') 51 | plt.show() 52 | -------------------------------------------------------------------------------- /ch7/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch7/screenshot/ROC曲线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch7/screenshot/ROC曲线.png -------------------------------------------------------------------------------- /ch7/screenshot/单层决策树测试数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch7/screenshot/单层决策树测试数据.png -------------------------------------------------------------------------------- /ch8/README.md: -------------------------------------------------------------------------------- 1 | # Ch08 - 预测数值型数据:回归(Predicting numeric values: regression) 2 | 3 | ### 本章讲的是线性回归,就是找出一系列的回归系数,和Logistic回归很像,但是去掉了sigmoid函数。说到回归,一般指的是线性回归。 4 | 5 | ### 假如回归系数放在向量w中,这个w可以用普通最小二乘法(OLS),通过使用numpy库的几个函数即可(注意:如果没有检查行列式是否为0就计算矩阵的逆,就会出现错误,使用linalg函数即可)。比如给我们一些数据(ex0.txt),下图是它们的散点图。 6 | ![数据分布](screenshot/数据分布.png) 7 | 8 | ### 通过标准回归函数和数据导入函数,就是使用最小二乘法。得到下图的最佳拟合直线,其实也不是“最佳”,因为这种线性回归会出现欠拟合的情况。 9 | 
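Before the best-fit-line figure below, here is a minimal numpy sketch of the ordinary-least-squares computation the paragraph above describes, w = (X^T X)^{-1} X^T y guarded by a determinant check; it is for illustration only and mirrors the repository's own standRegress in 线性回归找到最佳拟合曲线.py.

```python
# Minimal sketch of the OLS normal equations (illustrative; mirrors standRegress).
from numpy import mat, linalg

def ols(xArr, yArr):
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:        # singular matrix, the inverse does not exist
        raise ValueError("xTx is singular, cannot do inverse")
    return xTx.I * (xMat.T * yMat)    # regression coefficients w
```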
![线性回归找到最佳拟合曲线](screenshot/线性回归找到最佳拟合曲线.png) 10 | 11 | ## 局部加权线性回归 12 | ### 线性回归中会出现欠拟合的情况,因为它求的是具有最小均方误差的无偏估计。欠拟合可不会有最好的预测效果。面对这种情况,因此,我们需要在估计中引入一些偏差, 从而降低误差。其中一个方法就是局部加权线性回归 (LWLR)。 13 | ### LWLR通常使用核来对附近的点赋予更高的权重,最常用的是高斯核。k是高斯核对应的权重中的一个参数。下面3个图分别是k=1.0,k=0.01,k=0.003三种不同取值下的效果图。 14 | 15 | ### k = 1.0 (和OLS差不多)(欠拟合) 16 | ![局部加权线性回归(k=1.0)](screenshot/局部加权线性回归(k=1.0).png) 17 | 18 | ### k = 0.01 (理想状态,可以挖掘数据的潜在规律) 19 | ![局部加权线性回归(k=0.01)](screenshot/局部加权线性回归(k=0.01).png) 20 | 21 | ### k= 0.003 (考虑了太多噪声)(过拟合) 22 | ![局部加权线性回归(k=0.003)](screenshot/局部加权线性回归(k=0.003).png) 23 | 24 | ## 岭回归 25 | ### 这里提到了一种在统计学中叫缩减的技术。 26 | ### 使用岭回归和缩减奇数之前,需要对特征做标准化处理。 27 | ### 岭回归使用不同的λ系数时的回归系数变化图。 28 | ![岭回归](screenshot/岭回归.png) 29 | ### 该图绘出了回归系数(纵坐标)与 log(lambda)(横坐标)的关系。在最左边,即lambda最小时,可以得到所有系数的原始值(与线性回归一致),而在右边,系数全部缩减成0。在中间部分的某值将可以取得最好的预测效果。但为找到最佳参数值,还要进行交叉验证。然后自己看一下lasso。 30 | 31 | ## 前向逐步回归 32 | ### 前向逐步回归算法可以得到与lasso差不多的结果,但更简单。 33 | ### 这是一种贪心算法,每一步都尽可能的减少误差,一开始所有权重都设为1,然后每一步所做的决策就是对某个权重增加或减少一个很小的值。 需要设置步数和步长。逐步回归的好处在于能帮助人们理解现有模型并且做出改进,可以找出重要的特征,然后去收集重要的特征的数据。 34 | ### 使用0.005的epsilon值并迭代1000次后的结果如下: 35 | ![前向逐步回归](screenshot/前向逐步回归.png) 36 | 37 | ## 权衡偏差与方差 38 | ### 误差=偏差+测量误差+噪声,随着模型复杂度增加,训练集的误差降低,但是测试集的误差会先减后增,表明从高偏差过渡到了高方差模型,这里权衡两者是很重要的。 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ch8/matplotlib/前向逐步回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # 加载数据集 5 | def loadDataSet(filename): 6 | numFeat = len(open(filename).readline().split("\t")) - 1 7 | dataMat = [] 8 | labelMat = [] 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split("\t") 13 | for i in range(numFeat): 14 | lineArr.append(float(curLine[i])) 15 | 16 | dataMat.append(lineArr) 17 | labelMat.append(float(curLine[-1])) 18 | 19 | return dataMat, labelMat 20 | 21 | # 计算最佳拟合曲线 22 | def standRegress(xArr, yArr): 23 | xMat = mat(xArr); 24 | yMat = mat(yArr).T # .T代表转置矩阵 25 | xTx = xMat.T * xMat 26 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 27 | print "This matrix is singular , cannot do inverse" 28 | return 29 | ws = xTx.I * (xMat.T * yMat) 30 | return ws 31 | 32 | #==========前向逐步回归============ 33 | 34 | #计算平方误差 35 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 36 | return ((yArr-yHatArr)**2).sum() 37 | 38 | #数据标准化处理 39 | def regularize(xMat):#regularize by columns 40 | inMat = xMat.copy() 41 | inMeans = mean(inMat,0) #calc mean then subtract it off 42 | inVar = var(inMat,0) #calc variance of Xi then divide by it 43 | inMat = (inMat - inMeans)/inVar 44 | return inMat 45 | 46 | 47 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 48 | xMat = mat(xArr); yMat=mat(yArr).T 49 | yMean = mean(yMat,0) 50 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 51 | xMat = regularize(xMat) 52 | m,n=shape(xMat) 53 | returnMat = zeros((numIt,n)) #testing code remove 54 | ws = zeros((n,1)); 55 | wsTest = ws.copy(); 56 | wsMax = ws.copy() 57 | for i in range(numIt): #could change this to while loop 58 | #print ws.T 59 | lowestError = inf; 60 | for j in range(n): 61 | for sign in [-1,1]: 62 | wsTest = ws.copy() 63 | wsTest[j] += eps*sign 64 | yTest = xMat*wsTest 65 | rssE = rssError(yMat.A,yTest.A) 66 | if rssE < lowestError: 67 | lowestError = rssE 68 | wsMax = wsTest 69 | ws = wsMax.copy() 70 | returnMat[i,:]=ws.T 71 | return returnMat 72 | 73 | 74 | xArr,yArr = loadDataSet('abalone.txt') 75 | 76 | 
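# (Added note, not in the original script.) stageWise returns a numIt-by-n matrix:
# row i holds the coefficient vector after greedy step i, for example
#   weightsHistory = stageWise(xArr, yArr, eps=0.01, numIt=200)
# Plotting that matrix, as showRidge() does at the bottom of this file, traces how
# each coefficient evolves as the forward-stagewise steps accumulate.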
# 把这些结果与最小二乘法进行比较,后者的结果可以通过如下代码: 77 | 78 | xMat = mat(xArr) 79 | yMat = mat(yArr).T 80 | xMat = regularize(xMat) 81 | yM = mean(yMat,0) 82 | yMat = yMat - yM 83 | weights = standRegress(xMat, yMat.T) 84 | print weights.T 85 | 86 | # print stageWise(xArr, yArr, 0.01, 200) 87 | mat = stageWise(xArr,yArr,0.005,1000) # 使用0.005的epsilon 迭代 1000次 88 | 89 | def showRidge(): 90 | import matplotlib.pyplot as plt 91 | fig = plt.figure() 92 | ax = fig.add_subplot(111) 93 | ax.plot(mat) 94 | plt.show() 95 | showRidge() 96 | 97 | 98 | -------------------------------------------------------------------------------- /ch8/matplotlib/局部加权线性回归.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/matplotlib/局部加权线性回归.py -------------------------------------------------------------------------------- /ch8/matplotlib/岭回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #==================岭回归================== 6 | 7 | # 加载数据集 8 | def loadDataSet(filename): 9 | numFeat = len(open(filename).readline().split("\t")) - 1 10 | dataMat = [] 11 | labelMat = [] 12 | fr = open(filename) 13 | for line in fr.readlines(): 14 | lineArr = [] 15 | curLine = line.strip().split("\t") 16 | for i in range(numFeat): 17 | lineArr.append(float(curLine[i])) 18 | 19 | dataMat.append(lineArr) 20 | labelMat.append(float(curLine[-1])) 21 | 22 | return dataMat, labelMat 23 | 24 | #用于计算回归系数 25 | def ridgeRegres(xMat,yMat,lam=0.2): 26 | xTx = xMat.T * xMat 27 | denom = xTx + eye(shape(xMat)[1]) * lam 28 | if linalg.det(denom)==0.0: 29 | print "This matrix is singular, cannot do inverse" 30 | return 31 | ws = denom.I * (xMat.T * yMat) 32 | return ws # 回归参数 33 | 34 | #用于在一组lambda上做测试 35 | def ridgeTest(xArr,yArr): 36 | xMat = mat(xArr); yMat = mat(yArr).T 37 | yMean = mean(yMat,0) 38 | #数据标准化 39 | yMat = yMat - yMean 40 | xMeans = mean(xMat,0) 41 | xVar = var(xMat,0) 42 | xMat = (xMat - xMeans)/xVar 43 | 44 | numTestPts = 30 45 | wMat = zeros((numTestPts, shape(xMat)[1])) 46 | for i in range(numTestPts): 47 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 48 | wMat[i,:]=ws.T 49 | return wMat 50 | 51 | abX,abY = loadDataSet('abalone.txt') 52 | ridgeWeights = ridgeTest(abX,abY) 53 | # print ridgeWeights 54 | 55 | def showRidge(): 56 | import matplotlib.pyplot as plt 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111) 59 | ax.plot(ridgeWeights) 60 | plt.show() 61 | 62 | showRidge() 63 | -------------------------------------------------------------------------------- /ch8/matplotlib/线性回归找到最佳拟合曲线.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # ===========用线性回归找到最佳拟合曲线=========== 5 | # 加载数据集 6 | def loadDataSet(filename): 7 | numFeat = len(open(filename).readline().split("\t")) - 1 8 | dataMat = [] 9 | labelMat = [] 10 | fr = open(filename) 11 | for line in fr.readlines(): 12 | lineArr = [] 13 | curLine = line.strip().split("\t") 14 | for i in range(numFeat): 15 | lineArr.append(float(curLine[i])) 16 | 17 | dataMat.append(lineArr) 18 | labelMat.append(float(curLine[-1])) 19 | 20 | return dataMat, labelMat 21 | 22 | 23 | # 计算最佳拟合曲线 24 | def standRegress(xArr, yArr): 25 | xMat = mat(xArr); 26 | yMat = mat(yArr).T # .T代表转置矩阵 27 | xTx = xMat.T * xMat 28 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 29 | print "This matrix is 
singular , cannot do inverse" 30 | return 31 | ws = xTx.I * (xMat.T * yMat) 32 | return ws 33 | 34 | 35 | # 测试上边的函数 36 | xArr, yArr = loadDataSet("ex0.txt") 37 | # xArr, yArr = loadDataSet("ex1.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):\n", ws # ws 存放的就是回归系数 40 | 41 | def show(): 42 | import matplotlib.pyplot as plt 43 | xMat = mat(xArr); 44 | yMat = mat(yArr) 45 | yHat = xMat * ws 46 | fig = plt.figure() # 创建绘图对象 47 | ax = fig.add_subplot(111) # 111表示将画布划分为1行2列选择使用从上到下第一块 48 | # scatter绘制散点图 49 | ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0]) 50 | # 复制,排序 51 | xCopy = xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | # plot画线 55 | ax.plot(xCopy[:, 1], yHat) 56 | plt.show() 57 | 58 | 59 | show() 60 | 61 | yHat = mat(xArr) * ws 62 | # yHat = xMat * ws 63 | # 利用numpy库提供的corrcoef来计算预测值和真实值得相关性 64 | print "相关性:\n", corrcoef(yHat.T, mat(yArr)) -------------------------------------------------------------------------------- /ch8/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch8/screenshot/前向逐步回归.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/前向逐步回归.png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=0.003).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=0.003).png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=0.01).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=0.01).png -------------------------------------------------------------------------------- /ch8/screenshot/局部加权线性回归(k=1.0).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/局部加权线性回归(k=1.0).png -------------------------------------------------------------------------------- /ch8/screenshot/岭回归.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/岭回归.png -------------------------------------------------------------------------------- /ch8/screenshot/数据分布.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/数据分布.png -------------------------------------------------------------------------------- /ch8/screenshot/线性回归找到最佳拟合曲线.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/screenshot/线性回归找到最佳拟合曲线.png -------------------------------------------------------------------------------- /ch8/前向逐步回归.py: 
-------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # 加载数据集 5 | def loadDataSet(filename): 6 | numFeat = len(open(filename).readline().split("\t")) - 1 7 | dataMat = [] 8 | labelMat = [] 9 | fr = open(filename) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split("\t") 13 | for i in range(numFeat): 14 | lineArr.append(float(curLine[i])) 15 | 16 | dataMat.append(lineArr) 17 | labelMat.append(float(curLine[-1])) 18 | 19 | return dataMat, labelMat 20 | 21 | # 计算最佳拟合曲线 22 | def standRegress(xArr, yArr): 23 | xMat = mat(xArr); 24 | yMat = mat(yArr).T # .T代表转置矩阵 25 | xTx = xMat.T * xMat 26 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 27 | print "This matrix is singular , cannot do inverse" 28 | return 29 | ws = xTx.I * (xMat.T * yMat) 30 | return ws 31 | 32 | #==========前向逐步回归============ 33 | 34 | #计算平方误差 35 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 36 | return ((yArr-yHatArr)**2).sum() 37 | 38 | #数据标准化处理 39 | def regularize(xMat):#regularize by columns 40 | inMat = xMat.copy() 41 | inMeans = mean(inMat,0) #calc mean then subtract it off 42 | inVar = var(inMat,0) #calc variance of Xi then divide by it 43 | inMat = (inMat - inMeans)/inVar 44 | return inMat 45 | 46 | 47 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 48 | xMat = mat(xArr); yMat=mat(yArr).T 49 | yMean = mean(yMat,0) 50 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 51 | xMat = regularize(xMat) 52 | m,n=shape(xMat) 53 | returnMat = zeros((numIt,n)) #testing code remove 54 | ws = zeros((n,1)); 55 | wsTest = ws.copy(); 56 | wsMax = ws.copy() 57 | for i in range(numIt): #could change this to while loop 58 | #print ws.T 59 | lowestError = inf; 60 | for j in range(n): 61 | for sign in [-1,1]: 62 | wsTest = ws.copy() 63 | wsTest[j] += eps*sign 64 | yTest = xMat*wsTest 65 | rssE = rssError(yMat.A,yTest.A) 66 | if rssE < lowestError: 67 | lowestError = rssE 68 | wsMax = wsTest 69 | ws = wsMax.copy() 70 | returnMat[i,:]=ws.T 71 | return returnMat 72 | 73 | 74 | xArr,yArr = loadDataSet('abalone.txt') 75 | 76 | # 把这些结果与最小二乘法进行比较,后者的结果可以通过如下代码: 77 | 78 | xMat = mat(xArr) 79 | yMat = mat(yArr).T 80 | xMat = regularize(xMat) 81 | yM = mean(yMat,0) 82 | yMat = yMat - yM 83 | weights = standRegress(xMat, yMat.T) 84 | print weights.T 85 | 86 | # print stageWise(xArr, yArr, 0.01, 200) 87 | mat = stageWise(xArr,yArr,0.005,1000) # 使用0.005的epsilon 迭代 1000次 88 | 89 | def showRidge(): 90 | import matplotlib.pyplot as plt 91 | fig = plt.figure() 92 | ax = fig.add_subplot(111) 93 | ax.plot(mat) 94 | plt.show() 95 | showRidge() 96 | 97 | 98 | -------------------------------------------------------------------------------- /ch8/局部加权线性回归.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch8/局部加权线性回归.py -------------------------------------------------------------------------------- /ch8/岭回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #==================岭回归================== 6 | 7 | # 加载数据集 8 | def loadDataSet(filename): 9 | numFeat = len(open(filename).readline().split("\t")) - 1 10 | dataMat = [] 11 | labelMat = [] 12 | fr = open(filename) 13 | for line in fr.readlines(): 14 | lineArr = [] 15 | curLine = line.strip().split("\t") 16 | for i in 
range(numFeat): 17 | lineArr.append(float(curLine[i])) 18 | 19 | dataMat.append(lineArr) 20 | labelMat.append(float(curLine[-1])) 21 | 22 | return dataMat, labelMat 23 | 24 | #用于计算回归系数 25 | def ridgeRegres(xMat,yMat,lam=0.2): 26 | xTx = xMat.T * xMat 27 | denom = xTx + eye(shape(xMat)[1]) * lam 28 | if linalg.det(denom)==0.0: 29 | print "This matrix is singular, cannot do inverse" 30 | return 31 | ws = denom.I * (xMat.T * yMat) 32 | return ws # 回归参数 33 | 34 | #用于在一组lambda上做测试 35 | def ridgeTest(xArr,yArr): 36 | xMat = mat(xArr); yMat = mat(yArr).T 37 | yMean = mean(yMat,0) 38 | #数据标准化 39 | yMat = yMat - yMean 40 | xMeans = mean(xMat,0) 41 | xVar = var(xMat,0) 42 | xMat = (xMat - xMeans)/xVar 43 | 44 | numTestPts = 30 45 | wMat = zeros((numTestPts, shape(xMat)[1])) 46 | for i in range(numTestPts): 47 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 48 | wMat[i,:]=ws.T 49 | return wMat 50 | 51 | abX,abY = loadDataSet('abalone.txt') 52 | ridgeWeights = ridgeTest(abX,abY) 53 | # print ridgeWeights 54 | 55 | def showRidge(): 56 | import matplotlib.pyplot as plt 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111) 59 | ax.plot(ridgeWeights) 60 | plt.show() 61 | 62 | showRidge() -------------------------------------------------------------------------------- /ch8/线性回归找到最佳拟合曲线.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from numpy import * 3 | 4 | # ===========用线性回归找到最佳拟合曲线=========== 5 | # 加载数据集 6 | def loadDataSet(filename): 7 | numFeat = len(open(filename).readline().split("\t")) - 1 8 | dataMat = [] 9 | labelMat = [] 10 | fr = open(filename) 11 | for line in fr.readlines(): 12 | lineArr = [] 13 | curLine = line.strip().split("\t") 14 | for i in range(numFeat): 15 | lineArr.append(float(curLine[i])) 16 | 17 | dataMat.append(lineArr) 18 | labelMat.append(float(curLine[-1])) 19 | 20 | return dataMat, labelMat 21 | 22 | 23 | # 计算最佳拟合曲线 24 | def standRegress(xArr, yArr): 25 | xMat = mat(xArr); 26 | yMat = mat(yArr).T # .T代表转置矩阵 27 | xTx = xMat.T * xMat 28 | if linalg.det(xTx) == 0.0: # linalg.det(xTx) 计算行列式的值 29 | print "This matrix is singular , cannot do inverse" 30 | return 31 | ws = xTx.I * (xMat.T * yMat) 32 | return ws 33 | 34 | 35 | # 测试上边的函数 36 | xArr, yArr = loadDataSet("ex0.txt") 37 | # xArr, yArr = loadDataSet("ex1.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):\n", ws # ws 存放的就是回归系数 40 | 41 | def show(): 42 | import matplotlib.pyplot as plt 43 | xMat = mat(xArr); 44 | yMat = mat(yArr) 45 | yHat = xMat * ws 46 | fig = plt.figure() # 创建绘图对象 47 | ax = fig.add_subplot(111) # 111表示将画布划分为1行2列选择使用从上到下第一块 48 | # scatter绘制散点图 49 | ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0]) 50 | # 复制,排序 51 | xCopy = xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | # plot画线 55 | ax.plot(xCopy[:, 1], yHat) 56 | plt.show() 57 | 58 | 59 | show() 60 | 61 | yHat = mat(xArr) * ws 62 | # yHat = xMat * ws 63 | # 利用numpy库提供的corrcoef来计算预测值和真实值得相关性 64 | print "相关性:\n", corrcoef(yHat.T, mat(yArr)) -------------------------------------------------------------------------------- /ch9/CRAT算法用于回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | #解析文本数据 6 | def loadDataSet(filename): 7 | dataMat=[] 8 | fr=open(filename) 9 | for line in fr.readlines(): 10 | curLine=line.strip().split('\t') 11 | #将每行数据映射为浮点数 12 | fltLine=map(float,curLine) 13 | dataMat.append(fltLine) 14 | return dataMat 15 | 16 | #拆分数据集函数,二元拆分法 17 | #@dataSet:待拆分的数据集 18 
/ch9/CRAT算法用于回归.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | from numpy import * 4 | 5 | # parse a tab-delimited text file 6 | def loadDataSet(filename): 7 | dataMat=[] 8 | fr=open(filename) 9 | for line in fr.readlines(): 10 | curLine=line.strip().split('\t') 11 | # map every field of the line to a float 12 | fltLine=map(float,curLine) 13 | dataMat.append(fltLine) 14 | return dataMat 15 | 16 | # binary-split function for the data set 17 | #@dataSet: the data set to be split 18 | #@feature: index of the feature to split on 19 | #@value: the feature value used as the split point 20 | def binSplitDataSet(dataSet, feature, value): 21 | 22 | # boolean filtering: samples whose value of the split feature is greater than value go into mat0 23 | # mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :][0] # typo in the book 24 | mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :] 25 | # samples whose value of the split feature is not greater than value go into mat1 26 | # mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :][0] # typo in the book 27 | mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :] 28 | return mat0, mat1 29 | 30 | # splitting helpers for regression trees 31 | 32 | # leaf-node generation function 33 | def regLeaf(dataSet): 34 | # the mean of the last column of the data set is returned as the leaf value 35 | return mean(dataSet[:,-1]) 36 | 37 | # error-calculation function 38 | def regErr(dataSet): 39 | # variance of the last column times the number of samples gives the total squared error 40 | return var(dataSet[:,-1])*shape(dataSet)[0] 41 | 42 | 43 | def linearSolve(dataSet): #helper function used in two places 44 | m,n = shape(dataSet) 45 | X = mat(ones((m,n))); Y = mat(ones((m,1)))#create a copy of data with 1 in 0th position 46 | X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y 47 | xTx = X.T*X 48 | if linalg.det(xTx) == 0.0: 49 | raise NameError('This matrix is singular, cannot do inverse,\n\ 50 | try increasing the second value of ops') 51 | ws = xTx.I * (X.T * Y) 52 | return ws,X,Y 53 | 54 | def modelLeaf(dataSet):#create linear model and return coefficients 55 | ws,X,Y = linearSolve(dataSet) 56 | return ws 57 | 58 | def modelErr(dataSet): 59 | ws,X,Y = linearSolve(dataSet) 60 | yHat = X * ws 61 | return sum(power(Y - yHat,2)) 62 | 63 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)): 64 | tolS = ops[0]; tolN = ops[1] 65 | # all values in the last column of the data set are identical 66 | if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #exit cond 1 67 | # return None as the best feature and the mean of the last column as the leaf value 68 | return None, leafType(dataSet) 69 | 70 | m,n = shape(dataSet) 71 | # error of the data set before any split 72 | S = errType(dataSet) 73 | bestS = inf; bestIndex = 0; bestValue = 0 74 | # iterate over every feature except the last column (the target variable) 75 | for featIndex in range(n-1): 76 | # iterate over every distinct value of this feature 77 | for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]): 78 | #for splitVal in set(dataSet[:,featIndex]): # typo in the book 79 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 80 | if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue 81 | newS = errType(mat0) + errType(mat1) 82 | if newS < bestS: 83 | bestIndex = featIndex 84 | bestValue = splitVal 85 | bestS = newS 86 | # if splitting reduces the error by less than tolS, do not split 87 | if (S - bestS) < tolS: 88 | return None, leafType(dataSet) #exit cond 2 89 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 90 | if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): #exit cond 3 91 | return None, leafType(dataSet) 92 | # return the best splitting feature and its best split value 93 | return bestIndex,bestValue 94 | 95 | # tree-building function 96 | #@dataSet: the data set 97 | #@leafType: leaf type, 1 regression tree: leaves are constant values 2 model tree: leaves are linear models 98 | #@errType: error type, 1 regression error: total variance = variance * number of samples 2 model error: sum of squared prediction errors (y-yHat) 99 | #@ops: user-specified parameters 100 | def createTree(dataSet,leafType = regLeaf,errType = regErr,ops=(1,4)): 101 | 102 | # choose the best splitting feature and split value 103 | feat,val=chooseBestSplit(dataSet,leafType,errType,ops) 104 | # if no feature can be split on (None), return the leaf value directly 105 | if feat is None: return val 106 | # the tree is stored as a dictionary 107 | retTree={} 108 | # the first entry of the tree dictionary is the best splitting feature 109 | retTree['spInd']=feat 110 | # the second entry is the best split value of that feature 111 | retTree['spVal']=val 112 | # binary-split the data set on that feature index and value, returning the two subsets 113 | lSet,rSet=binSplitDataSet(dataSet,feat,val) 114 | # the third entry is the left branch, built recursively from the lSet subset 115 | retTree['left']=createTree(lSet,leafType,errType,ops) 116 | # the fourth entry is the right branch, built recursively from the rSet subset 117 | retTree['right']=createTree(rSet,leafType,errType,ops) 118 | # return the finished tree dictionary 119 | return retTree 120 | 121 | if __name__ == "__main__" : 122 | myDat = loadDataSet("ex00.txt") 123 | myMat = mat(myDat) 124 | print createTree(myMat) 125 | # {'spInd': 0, 'spVal': 0.48813, 'right': -0.044650285714285719, 'left': 1.0180967672413792} 126 | myDat1 = loadDataSet("ex0.txt") 127 | MyMat1 = mat(myDat1) 128 | print createTree(MyMat1) 129 | #{'spInd': 1, 'spVal': 0.39435, 'right': {'spInd': 1, 'spVal': 0.197834, 'right': -0.023838155555555553, 'left': 1.0289583666666666}, 130 | # 'left': {'spInd': 1, 'spVal': 0.582002, 'right': 1.980035071428571, 'left': {'spInd': 1, 'spVal': 0.797583, 'right': 2.9836209534883724, 'left': 3.9871631999999999}}} 131 | 132 | --------------------------------------------------------------------------------
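The README that follows describes post-pruning (后剪枝), but no prune routine appears in CRAT算法用于回归.py above. The following is a minimal sketch of that procedure in the same style, assuming the binSplitDataSet, createTree and loadDataSet defined above; the names isTree, getMean and prune are illustrative and may differ from what the repository's regTrees.py uses:

```python
# Post-pruning sketch: walk the tree built on training data and merge pairs of
# leaves whenever the merge lowers the squared error on held-out test data.
from numpy import shape, power, sum

def isTree(obj):
    # a subtree is stored as a dict; a leaf is a plain value
    return type(obj).__name__ == 'dict'

def getMean(tree):
    # collapse a subtree into the mean of its leaf values
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):  tree['left']  = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    # no test data falls into this branch: collapse it
    if shape(testData)[0] == 0:
        return getMean(tree)
    if isTree(tree['left']) or isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):  tree['left']  = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # both children are leaves: merge them if that lowers the test error
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            return treeMean          # collapse the two leaves into their mean
        return tree
    return tree

# Example usage with the ch9 data files included below:
#   myTree = createTree(mat(loadDataSet('ex2.txt')), ops=(0, 1))   # deliberately overgrown tree
#   pruned = prune(myTree, mat(loadDataSet('ex2test.txt')))
```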
/ch9/README.md: -------------------------------------------------------------------------------- 1 | # Ch09 - 树回归(Tree-based regression) 2 | #### This chapter introduces a new tree algorithm, CART (Classification And Regression Trees), which can be used for both classification and regression. It also covers tree pruning, whose purpose is to keep the tree from overfitting. 3 | #### Chapter 3 introduced greedy decision trees built with ID3: at each step the current best feature is chosen to split the data, and the split uses every possible value of that feature, so once a split is made that feature is never used again. This approach splits too aggressively and cannot handle continuous features directly; continuous data must be discretized first, which destroys the intrinsic nature of continuous variables. 4 | #### Binary splitting is another tree-building strategy: each split cuts the data set into two halves, putting samples that satisfy the split condition into the left subtree and the rest into the right subtree. CART (Classification And Regression Trees) uses binary splits to handle continuous variables, and replaces Shannon entropy with total variance when judging how well the model fits. 5 | 6 | ### Note: the code given in the book appears to contain a few small errors. 7 | 8 | ## Plots of the data points in the two split data sets ex0 and ex00: 9 | ### ex00.txt: 10 | ![基于CART算法构建回归树的简单数据集](screenshot/基于CART算法构建回归树的简单数据集.png) 11 | ### ex0.txt 12 | ![用于测试回归树的分段常数数据集](screenshot/用于测试回归树的分段常数数据集.png) 13 | 14 | # Tree pruning 15 | #### If a tree has too many nodes, the model may overfit the data. Reducing the complexity of a decision tree to avoid overfitting is called pruning. There are two pruning approaches: pre-pruning and post-pruning. 16 | 17 | ## Pre-pruning 18 | #### The three early-exit conditions in chooseBestSplit are the "pre-pruning" operations. Pre-pruning limits the size of a node: if a candidate split would produce a node smaller than the limit, the split is not made; likewise, if the drop in total variance before and after the split is too small, the split is not made either. The two parameters are hard to tune, however: they are very sensitive to the magnitude of the data, and a careless choice easily leads to overfitting. 19 | ## A new data set obtained by scaling the y-axis of the ex00.txt data by a factor of 100 20 | ![放大100倍](screenshot/放大100倍.png) 21 | 22 | ## Post-pruning 23 | #### Post-pruning requires splitting the data into a training set and a test set. A regression tree is first built on the training set; post-pruning then recursively walks the tree structure: if both children of a node are leaves and merging them would lower the test error, the node is collapsed, i.e. its value is set to the mean of its left and right children, and this is repeated level by level. In short, post-pruning validates against held-out data: first choose parameters that make the tree deliberately large, then work from the leaves upward, merging two leaves whenever the merge achieves a lower test error. 24 | 25 | #### In general, using pre-pruning and post-pruning together works best. 26 | 27 | ## Model trees 28 | ### When modelling data with a tree, the leaves do not have to be constants; they can also be piecewise linear functions. A regression tree stores constant values at its leaves, while a model tree stores linear models. Leaves that hold linear functions are clearly easier to interpret than plain constant-valued leaves, which is one advantage of model trees over regression trees. 29 | 30 | ## The piecewise linear data used to test the model-tree building code: 31 | ![测试模型树构建函数的测试数据](screenshot/测试模型树构建函数的测试数据.png) 32 | ### Using a model tree only requires swapping in different leaf-generation and error functions; everything else works just like the regression tree. 33 | 34 | ## Correlation coefficient 35 | #### numpy's corrcoef(yHat, y, rowvar = 0) is used here to compute the correlation coefficient R between predictions and targets (R^2 is its square); the closer it is to 1, the better. yHat is the predicted value and y is the actual value of the target variable. 36 | 37 | ## Scatter plot of the data used to compare the tree-regression models with an ordinary linear regression model. 38 | ![骑自行车速度](screenshot/骑自行车速度.png) 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ch9/ex00.txt: -------------------------------------------------------------------------------- 1 | 0.036098 0.155096 2 | 0.993349 1.077553 3 | 0.530897 0.893462 4 | 0.712386 0.564858 5 | 0.343554 -0.371700 6 | 0.098016 -0.332760 7 | 0.691115 0.834391 8 | 0.091358 0.099935 9 | 0.727098 1.000567 10 | 0.951949 0.945255 11 | 0.768596 0.760219 12 | 0.541314 0.893748 13 | 0.146366 0.034283 14 | 0.673195 0.915077 15 | 0.183510 0.184843 16 | 0.339563 0.206783 17 | 0.517921 1.493586 18 | 0.703755 1.101678 19 | 0.008307 0.069976 20 | 0.243909 -0.029467 21 | 0.306964 -0.177321 22 | 0.036492 0.408155 23 | 0.295511 0.002882 24 | 0.837522 1.229373 25 | 0.202054 -0.087744 26 | 0.919384 1.029889 27 | 0.377201 -0.243550 28 | 0.814825 1.095206 29 | 0.611270 0.982036 30 | 0.072243 -0.420983 31 | 0.410230 0.331722 32 | 0.869077 1.114825 33 | 0.620599 1.334421 34 | 0.101149
0.068834 35 | 0.820802 1.325907 36 | 0.520044 0.961983 37 | 0.488130 -0.097791 38 | 0.819823 0.835264 39 | 0.975022 0.673579 40 | 0.953112 1.064690 41 | 0.475976 -0.163707 42 | 0.273147 -0.455219 43 | 0.804586 0.924033 44 | 0.074795 -0.349692 45 | 0.625336 0.623696 46 | 0.656218 0.958506 47 | 0.834078 1.010580 48 | 0.781930 1.074488 49 | 0.009849 0.056594 50 | 0.302217 -0.148650 51 | 0.678287 0.907727 52 | 0.180506 0.103676 53 | 0.193641 -0.327589 54 | 0.343479 0.175264 55 | 0.145809 0.136979 56 | 0.996757 1.035533 57 | 0.590210 1.336661 58 | 0.238070 -0.358459 59 | 0.561362 1.070529 60 | 0.377597 0.088505 61 | 0.099142 0.025280 62 | 0.539558 1.053846 63 | 0.790240 0.533214 64 | 0.242204 0.209359 65 | 0.152324 0.132858 66 | 0.252649 -0.055613 67 | 0.895930 1.077275 68 | 0.133300 -0.223143 69 | 0.559763 1.253151 70 | 0.643665 1.024241 71 | 0.877241 0.797005 72 | 0.613765 1.621091 73 | 0.645762 1.026886 74 | 0.651376 1.315384 75 | 0.697718 1.212434 76 | 0.742527 1.087056 77 | 0.901056 1.055900 78 | 0.362314 -0.556464 79 | 0.948268 0.631862 80 | 0.000234 0.060903 81 | 0.750078 0.906291 82 | 0.325412 -0.219245 83 | 0.726828 1.017112 84 | 0.348013 0.048939 85 | 0.458121 -0.061456 86 | 0.280738 -0.228880 87 | 0.567704 0.969058 88 | 0.750918 0.748104 89 | 0.575805 0.899090 90 | 0.507940 1.107265 91 | 0.071769 -0.110946 92 | 0.553520 1.391273 93 | 0.401152 -0.121640 94 | 0.406649 -0.366317 95 | 0.652121 1.004346 96 | 0.347837 -0.153405 97 | 0.081931 -0.269756 98 | 0.821648 1.280895 99 | 0.048014 0.064496 100 | 0.130962 0.184241 101 | 0.773422 1.125943 102 | 0.789625 0.552614 103 | 0.096994 0.227167 104 | 0.625791 1.244731 105 | 0.589575 1.185812 106 | 0.323181 0.180811 107 | 0.822443 1.086648 108 | 0.360323 -0.204830 109 | 0.950153 1.022906 110 | 0.527505 0.879560 111 | 0.860049 0.717490 112 | 0.007044 0.094150 113 | 0.438367 0.034014 114 | 0.574573 1.066130 115 | 0.536689 0.867284 116 | 0.782167 0.886049 117 | 0.989888 0.744207 118 | 0.761474 1.058262 119 | 0.985425 1.227946 120 | 0.132543 -0.329372 121 | 0.346986 -0.150389 122 | 0.768784 0.899705 123 | 0.848921 1.170959 124 | 0.449280 0.069098 125 | 0.066172 0.052439 126 | 0.813719 0.706601 127 | 0.661923 0.767040 128 | 0.529491 1.022206 129 | 0.846455 0.720030 130 | 0.448656 0.026974 131 | 0.795072 0.965721 132 | 0.118156 -0.077409 133 | 0.084248 -0.019547 134 | 0.845815 0.952617 135 | 0.576946 1.234129 136 | 0.772083 1.299018 137 | 0.696648 0.845423 138 | 0.595012 1.213435 139 | 0.648675 1.287407 140 | 0.897094 1.240209 141 | 0.552990 1.036158 142 | 0.332982 0.210084 143 | 0.065615 -0.306970 144 | 0.278661 0.253628 145 | 0.773168 1.140917 146 | 0.203693 -0.064036 147 | 0.355688 -0.119399 148 | 0.988852 1.069062 149 | 0.518735 1.037179 150 | 0.514563 1.156648 151 | 0.976414 0.862911 152 | 0.919074 1.123413 153 | 0.697777 0.827805 154 | 0.928097 0.883225 155 | 0.900272 0.996871 156 | 0.344102 -0.061539 157 | 0.148049 0.204298 158 | 0.130052 -0.026167 159 | 0.302001 0.317135 160 | 0.337100 0.026332 161 | 0.314924 -0.001952 162 | 0.269681 -0.165971 163 | 0.196005 -0.048847 164 | 0.129061 0.305107 165 | 0.936783 1.026258 166 | 0.305540 -0.115991 167 | 0.683921 1.414382 168 | 0.622398 0.766330 169 | 0.902532 0.861601 170 | 0.712503 0.933490 171 | 0.590062 0.705531 172 | 0.723120 1.307248 173 | 0.188218 0.113685 174 | 0.643601 0.782552 175 | 0.520207 1.209557 176 | 0.233115 -0.348147 177 | 0.465625 -0.152940 178 | 0.884512 1.117833 179 | 0.663200 0.701634 180 | 0.268857 0.073447 181 | 0.729234 0.931956 182 | 0.429664 -0.188659 183 | 0.737189 1.200781 
184 | 0.378595 -0.296094 185 | 0.930173 1.035645 186 | 0.774301 0.836763 187 | 0.273940 -0.085713 188 | 0.824442 1.082153 189 | 0.626011 0.840544 190 | 0.679390 1.307217 191 | 0.578252 0.921885 192 | 0.785541 1.165296 193 | 0.597409 0.974770 194 | 0.014083 -0.132525 195 | 0.663870 1.187129 196 | 0.552381 1.369630 197 | 0.683886 0.999985 198 | 0.210334 -0.006899 199 | 0.604529 1.212685 200 | 0.250744 0.046297 201 | -------------------------------------------------------------------------------- /ch9/ex2.txt: -------------------------------------------------------------------------------- 1 | 0.228628 -2.266273 2 | 0.965969 112.386764 3 | 0.342761 -31.584855 4 | 0.901444 87.300625 5 | 0.585413 125.295113 6 | 0.334900 18.976650 7 | 0.769043 64.041941 8 | 0.297107 -1.798377 9 | 0.901421 100.133819 10 | 0.176523 0.946348 11 | 0.710234 108.553919 12 | 0.981980 86.399637 13 | 0.085873 -10.137104 14 | 0.537834 90.995536 15 | 0.806158 62.877698 16 | 0.708890 135.416767 17 | 0.787755 118.642009 18 | 0.463241 17.171057 19 | 0.300318 -18.051318 20 | 0.815215 118.319942 21 | 0.139880 7.336784 22 | 0.068373 -15.160836 23 | 0.457563 -34.044555 24 | 0.665652 105.547997 25 | 0.084661 -24.132226 26 | 0.954711 100.935789 27 | 0.953902 130.926480 28 | 0.487381 27.729263 29 | 0.759504 81.106762 30 | 0.454312 -20.360067 31 | 0.295993 -14.988279 32 | 0.156067 7.557349 33 | 0.428582 15.224266 34 | 0.847219 76.240984 35 | 0.499171 11.924204 36 | 0.203993 -22.379119 37 | 0.548539 83.114502 38 | 0.790312 110.159730 39 | 0.937766 119.949824 40 | 0.218321 1.410768 41 | 0.223200 15.501642 42 | 0.896683 107.001620 43 | 0.582311 82.589328 44 | 0.698920 92.470636 45 | 0.823848 59.342323 46 | 0.385021 24.816941 47 | 0.061219 6.695567 48 | 0.841547 115.669032 49 | 0.763328 115.199195 50 | 0.934853 115.753994 51 | 0.222271 -9.255852 52 | 0.217214 -3.958752 53 | 0.706961 106.180427 54 | 0.888426 94.896354 55 | 0.549814 137.267576 56 | 0.107960 -1.293195 57 | 0.085111 37.820659 58 | 0.388789 21.578007 59 | 0.467383 -9.712925 60 | 0.623909 87.181863 61 | 0.373501 -8.228297 62 | 0.513332 101.075609 63 | 0.350725 -40.086564 64 | 0.716211 103.345308 65 | 0.731636 73.912028 66 | 0.273863 -9.457556 67 | 0.211633 -8.332207 68 | 0.944221 100.120253 69 | 0.053764 -13.731698 70 | 0.126833 22.891675 71 | 0.952833 100.649591 72 | 0.391609 3.001104 73 | 0.560301 82.903945 74 | 0.124723 -1.402796 75 | 0.465680 -23.777531 76 | 0.699873 115.586605 77 | 0.164134 -27.405211 78 | 0.455761 9.841938 79 | 0.508542 96.403373 80 | 0.138619 -29.087463 81 | 0.335182 2.768225 82 | 0.908629 118.513475 83 | 0.546601 96.319043 84 | 0.378965 13.583555 85 | 0.968621 98.648346 86 | 0.637999 91.656617 87 | 0.350065 -1.319852 88 | 0.632691 93.645293 89 | 0.936524 65.548418 90 | 0.310956 -49.939516 91 | 0.437652 19.745224 92 | 0.166765 -14.740059 93 | 0.571214 114.872056 94 | 0.952377 73.520802 95 | 0.665329 121.980607 96 | 0.258070 -20.425137 97 | 0.912161 85.005351 98 | 0.777582 100.838446 99 | 0.642707 82.500766 100 | 0.885676 108.045948 101 | 0.080061 2.229873 102 | 0.039914 11.220099 103 | 0.958512 135.837013 104 | 0.377383 5.241196 105 | 0.661073 115.687524 106 | 0.454375 3.043912 107 | 0.412516 -26.419289 108 | 0.854970 89.209930 109 | 0.698472 120.521925 110 | 0.465561 30.051931 111 | 0.328890 39.783113 112 | 0.309133 8.814725 113 | 0.418943 44.161493 114 | 0.553797 120.857321 115 | 0.799873 91.368473 116 | 0.811363 112.981216 117 | 0.785574 107.024467 118 | 0.949198 105.752508 119 | 0.666452 120.014736 120 | 0.652462 112.715799 121 | 0.290749 -14.391613 
122 | 0.508548 93.292829 123 | 0.680486 110.367074 124 | 0.356790 -19.526539 125 | 0.199903 -3.372472 126 | 0.264926 5.280579 127 | 0.166431 -6.512506 128 | 0.370042 -32.124495 129 | 0.628061 117.628346 130 | 0.228473 19.425158 131 | 0.044737 3.855393 132 | 0.193282 18.208423 133 | 0.519150 116.176162 134 | 0.351478 -0.461116 135 | 0.872199 111.552716 136 | 0.115150 13.795828 137 | 0.324274 -13.189243 138 | 0.446196 -5.108172 139 | 0.613004 168.180746 140 | 0.533511 129.766743 141 | 0.740859 93.773929 142 | 0.667851 92.449664 143 | 0.900699 109.188248 144 | 0.599142 130.378529 145 | 0.232802 1.222318 146 | 0.838587 134.089674 147 | 0.284794 35.623746 148 | 0.130626 -39.524461 149 | 0.642373 140.613941 150 | 0.786865 100.598825 151 | 0.403228 -1.729244 152 | 0.883615 95.348184 153 | 0.910975 106.814667 154 | 0.819722 70.054508 155 | 0.798198 76.853728 156 | 0.606417 93.521396 157 | 0.108801 -16.106164 158 | 0.318309 -27.605424 159 | 0.856421 107.166848 160 | 0.842940 95.893131 161 | 0.618868 76.917665 162 | 0.531944 124.795495 163 | 0.028546 -8.377094 164 | 0.915263 96.717610 165 | 0.925782 92.074619 166 | 0.624827 105.970743 167 | 0.331364 -1.290825 168 | 0.341700 -23.547711 169 | 0.342155 -16.930416 170 | 0.729397 110.902830 171 | 0.640515 82.713621 172 | 0.228751 -30.812912 173 | 0.948822 69.318649 174 | 0.706390 105.062147 175 | 0.079632 29.420068 176 | 0.451087 -28.724685 177 | 0.833026 76.723835 178 | 0.589806 98.674874 179 | 0.426711 -21.594268 180 | 0.872883 95.887712 181 | 0.866451 94.402102 182 | 0.960398 123.559747 183 | 0.483803 5.224234 184 | 0.811602 99.841379 185 | 0.757527 63.549854 186 | 0.569327 108.435392 187 | 0.841625 60.552308 188 | 0.264639 2.557923 189 | 0.202161 -1.983889 190 | 0.055862 -3.131497 191 | 0.543843 98.362010 192 | 0.689099 112.378209 193 | 0.956951 82.016541 194 | 0.382037 -29.007783 195 | 0.131833 22.478291 196 | 0.156273 0.225886 197 | 0.000256 9.668106 198 | 0.892999 82.436686 199 | 0.206207 -12.619036 200 | 0.487537 5.149336 201 | -------------------------------------------------------------------------------- /ch9/ex2test.txt: -------------------------------------------------------------------------------- 1 | 0.421862 10.830241 2 | 0.105349 -2.241611 3 | 0.155196 21.872976 4 | 0.161152 2.015418 5 | 0.382632 -38.778979 6 | 0.017710 20.109113 7 | 0.129656 15.266887 8 | 0.613926 111.900063 9 | 0.409277 1.874731 10 | 0.807556 111.223754 11 | 0.593722 133.835486 12 | 0.953239 110.465070 13 | 0.257402 15.332899 14 | 0.645385 93.983054 15 | 0.563460 93.645277 16 | 0.408338 -30.719878 17 | 0.874394 91.873505 18 | 0.263805 -0.192752 19 | 0.411198 10.751118 20 | 0.449884 9.211901 21 | 0.646315 113.533660 22 | 0.673718 125.135638 23 | 0.805148 113.300462 24 | 0.759327 72.668572 25 | 0.519172 82.131698 26 | 0.741031 106.777146 27 | 0.030937 9.859127 28 | 0.268848 -34.137955 29 | 0.474901 -11.201301 30 | 0.588266 120.501998 31 | 0.893936 142.826476 32 | 0.870990 105.751746 33 | 0.430763 39.146258 34 | 0.057665 15.371897 35 | 0.100076 9.131761 36 | 0.980716 116.145896 37 | 0.235289 -13.691224 38 | 0.228098 16.089151 39 | 0.622248 99.345551 40 | 0.401467 -1.694383 41 | 0.960334 110.795415 42 | 0.031214 -5.330042 43 | 0.504228 96.003525 44 | 0.779660 75.921582 45 | 0.504496 101.341462 46 | 0.850974 96.293064 47 | 0.701119 102.333839 48 | 0.191551 5.072326 49 | 0.667116 92.310019 50 | 0.555584 80.367129 51 | 0.680006 132.965442 52 | 0.393899 38.605283 53 | 0.048940 -9.861871 54 | 0.963282 115.407485 55 | 0.655496 104.269918 56 | 0.576463 141.127267 57 | 0.675708 
96.227996 58 | 0.853457 114.252288 59 | 0.003933 -12.182861 60 | 0.549512 97.927224 61 | 0.218967 -4.712462 62 | 0.659972 120.950439 63 | 0.008256 8.026816 64 | 0.099500 -14.318434 65 | 0.352215 -3.747546 66 | 0.874926 89.247356 67 | 0.635084 99.496059 68 | 0.039641 14.147109 69 | 0.665111 103.298719 70 | 0.156583 -2.540703 71 | 0.648843 119.333019 72 | 0.893237 95.209585 73 | 0.128807 5.558479 74 | 0.137438 5.567685 75 | 0.630538 98.462792 76 | 0.296084 -41.799438 77 | 0.632099 84.895098 78 | 0.987681 106.726447 79 | 0.744909 111.279705 80 | 0.862030 104.581156 81 | 0.080649 -7.679985 82 | 0.831277 59.053356 83 | 0.198716 26.878801 84 | 0.860932 90.632930 85 | 0.883250 92.759595 86 | 0.818003 110.272219 87 | 0.949216 115.200237 88 | 0.460078 -35.957981 89 | 0.561077 93.545761 90 | 0.863767 114.125786 91 | 0.476891 -29.774060 92 | 0.537826 81.587922 93 | 0.686224 110.911198 94 | 0.982327 119.114523 95 | 0.944453 92.033481 96 | 0.078227 30.216873 97 | 0.782937 92.588646 98 | 0.465886 2.222139 99 | 0.885024 90.247890 100 | 0.186077 7.144415 101 | 0.915828 84.010074 102 | 0.796649 115.572156 103 | 0.127821 28.933688 104 | 0.433429 6.782575 105 | 0.946796 108.574116 106 | 0.386915 -17.404601 107 | 0.561192 92.142700 108 | 0.182490 10.764616 109 | 0.878792 95.289476 110 | 0.381342 -6.177464 111 | 0.358474 -11.731754 112 | 0.270647 13.793201 113 | 0.488904 -17.641832 114 | 0.106773 5.684757 115 | 0.270112 4.335675 116 | 0.754985 75.860433 117 | 0.585174 111.640154 118 | 0.458821 12.029692 119 | 0.218017 -26.234872 120 | 0.583887 99.413850 121 | 0.923626 107.802298 122 | 0.833620 104.179678 123 | 0.870691 93.132591 124 | 0.249896 -8.618404 125 | 0.748230 109.160652 126 | 0.019365 34.048884 127 | 0.837588 101.239275 128 | 0.529251 115.514729 129 | 0.742898 67.038771 130 | 0.522034 64.160799 131 | 0.498982 3.983061 132 | 0.479439 24.355908 133 | 0.314834 -14.256200 134 | 0.753251 85.017092 135 | 0.479362 -17.480446 136 | 0.950593 99.072784 137 | 0.718623 58.080256 138 | 0.218720 -19.605593 139 | 0.664113 94.437159 140 | 0.942900 131.725134 141 | 0.314226 18.904871 142 | 0.284509 11.779346 143 | 0.004962 -14.624176 144 | 0.224087 -50.547649 145 | 0.974331 112.822725 146 | 0.894610 112.863995 147 | 0.167350 0.073380 148 | 0.753644 105.024456 149 | 0.632241 108.625812 150 | 0.314189 -6.090797 151 | 0.965527 87.418343 152 | 0.820919 94.610538 153 | 0.144107 -4.748387 154 | 0.072556 -5.682008 155 | 0.002447 29.685714 156 | 0.851007 79.632376 157 | 0.458024 -12.326026 158 | 0.627503 139.458881 159 | 0.422259 -29.827405 160 | 0.714659 63.480271 161 | 0.672320 93.608554 162 | 0.498592 37.112975 163 | 0.698906 96.282845 164 | 0.861441 99.699230 165 | 0.112425 -12.419909 166 | 0.164784 5.244704 167 | 0.481531 -18.070497 168 | 0.375482 1.779411 169 | 0.089325 -14.216755 170 | 0.036609 -6.264372 171 | 0.945004 54.723563 172 | 0.136608 14.970936 173 | 0.292285 -41.723711 174 | 0.029195 -0.660279 175 | 0.998307 100.124230 176 | 0.303928 -5.492264 177 | 0.957863 117.824392 178 | 0.815089 113.377704 179 | 0.466399 -10.249874 180 | 0.876693 115.617275 181 | 0.536121 102.997087 182 | 0.373984 -37.359936 183 | 0.565162 74.967476 184 | 0.085412 -21.449563 185 | 0.686411 64.859620 186 | 0.908752 107.983366 187 | 0.982829 98.005424 188 | 0.052766 -42.139502 189 | 0.777552 91.899340 190 | 0.374316 -3.522501 191 | 0.060231 10.008227 192 | 0.526225 87.317722 193 | 0.583872 67.104433 194 | 0.238276 10.615159 195 | 0.678747 60.624273 196 | 0.067649 15.947398 197 | 0.530182 105.030933 198 | 0.869389 104.969996 199 | 0.698410 
75.460417 200 | 0.549430 82.558068 201 | -------------------------------------------------------------------------------- /ch9/exp.txt: -------------------------------------------------------------------------------- 1 | 0.529582 100.737303 2 | 0.985730 103.106872 3 | 0.797869 99.666151 4 | 0.393473 -1.773056 5 | 0.272568 -1.170222 6 | 0.758825 96.752440 7 | 0.218359 2.337347 8 | 0.926357 98.343231 9 | 0.726881 99.633009 10 | 0.805311 102.253834 11 | 0.208632 0.493174 12 | 0.184921 -2.231071 13 | 0.660135 100.139355 14 | 0.871875 96.637420 15 | 0.657182 100.345442 16 | 0.942481 97.751546 17 | 0.427843 -1.380170 18 | 0.845958 98.195303 19 | 0.878696 99.380485 20 | 0.582034 100.971036 21 | 0.118114 2.397033 22 | 0.144718 1.304535 23 | 0.576046 101.624714 24 | 0.750305 97.601324 25 | 0.518281 100.093634 26 | 0.260793 -1.361888 27 | 0.390245 -2.973759 28 | 0.963020 98.877859 29 | 0.880661 97.631997 30 | 0.291780 -1.638124 31 | 0.192903 -2.221257 32 | 0.461442 -1.074725 33 | 0.821171 99.372052 34 | 0.144557 2.589464 35 | 0.379346 0.991090 36 | 0.383822 1.832389 37 | 0.055406 -1.870700 38 | 0.084308 -0.611701 39 | 0.719578 100.087948 40 | 0.417471 -0.510292 41 | 0.477894 -3.426525 42 | 0.871228 100.307522 43 | 0.113074 -1.011079 44 | 0.409434 -0.616173 45 | 0.967141 96.551856 46 | 0.938254 97.052196 47 | 0.079989 2.083496 48 | 0.150207 1.285491 49 | 0.417339 -0.462985 50 | 0.038787 -2.237234 51 | 0.954657 102.111432 52 | 0.844894 98.350138 53 | 0.106770 -0.998182 54 | 0.247831 2.483594 55 | 0.108687 -0.920229 56 | 0.758165 98.079399 57 | 0.199978 -3.490410 58 | 0.600602 99.850119 59 | 0.026466 1.342825 60 | 0.141239 -0.949858 61 | 0.181437 -2.223725 62 | 0.352656 2.251362 63 | 0.803371 99.647157 64 | 0.677303 100.414859 65 | 0.561674 99.133372 66 | 0.497533 -3.764935 67 | 0.523327 98.452850 68 | 0.507075 103.807755 69 | 0.791978 99.414598 70 | 0.956890 95.977239 71 | 0.487927 1.199149 72 | 0.788795 100.012047 73 | 0.554283 98.522458 74 | 0.814361 97.642150 75 | 0.788940 97.399942 76 | 0.515845 102.240479 77 | 0.758538 97.461917 78 | 0.041824 -3.294141 79 | 0.341352 1.246559 80 | 0.194801 -2.285278 81 | 0.805528 99.023113 82 | 0.435762 0.361749 83 | 0.941615 100.746547 84 | 0.478234 0.791146 85 | 0.057445 -4.266792 86 | 0.510079 98.845273 87 | 0.209900 -0.861890 88 | 0.902668 101.429190 89 | 0.456602 -2.856392 90 | 0.997595 99.828241 91 | 0.048240 -0.268920 92 | 0.319531 0.896696 93 | 0.264929 -1.000487 94 | 0.432727 -4.630489 95 | 0.419828 1.260534 96 | 0.667056 99.456518 97 | 0.488173 1.574322 98 | 0.746300 100.563503 99 | 0.528660 100.736739 100 | 0.624185 99.562872 101 | 0.169411 1.809929 102 | 0.011025 4.132846 103 | 0.974164 98.706049 104 | 0.267957 0.297803 105 | 0.726093 99.381040 106 | 0.465163 -2.344545 107 | 0.993698 101.507792 108 | 0.816513 99.903496 109 | 0.398756 0.378060 110 | 0.054974 -0.588770 111 | 0.857067 100.322945 112 | 0.362328 2.551786 113 | 0.316961 -0.528283 114 | 0.167881 -0.376517 115 | 0.393776 3.658204 116 | 0.739991 100.426554 117 | 0.457949 0.857428 118 | 0.060635 2.484776 119 | 0.942634 101.254420 120 | 0.553691 102.467820 121 | 0.394694 -0.248353 122 | 0.714625 99.650556 123 | 0.273503 1.111820 124 | 0.471886 -5.665559 125 | 0.746476 98.720163 126 | 0.140209 0.471820 127 | 0.024197 -2.854251 128 | 0.521287 99.703915 129 | 0.672280 100.463227 130 | 0.380342 -0.785713 131 | 0.956380 99.482209 132 | 0.455254 1.613841 133 | 0.647551 101.591193 134 | 0.682498 98.267734 135 | 0.054839 -2.286019 136 | 0.716849 100.614510 137 | 0.217732 -2.161633 138 | 0.918885 
100.260067 139 | 0.576026 101.719788 140 | 0.868511 100.669152 141 | 0.661135 97.637969 142 | 0.166334 1.374014 143 | 0.106850 -3.658050 144 | 0.768242 104.193841 145 | 0.240916 -0.368100 146 | 0.124957 2.821672 147 | 0.984335 98.571444 148 | 0.908524 101.777344 149 | 0.861217 98.656403 150 | 0.944295 100.154508 151 | 0.527278 101.052710 152 | 0.717072 100.788373 153 | 0.130227 0.115694 154 | 0.494734 -1.220681 155 | 0.498733 0.961514 156 | 0.519411 101.331622 157 | 0.712409 104.891067 158 | 0.933858 98.180299 159 | 0.266051 0.398961 160 | 0.153690 -0.657128 161 | 0.209181 1.486816 162 | 0.942699 102.187578 163 | 0.766799 100.213348 164 | 0.862578 101.816969 165 | 0.223266 2.854445 166 | 0.611394 103.428497 167 | 0.996212 98.494158 168 | 0.724945 99.098450 169 | 0.399346 0.879259 170 | 0.750510 98.729864 171 | 0.446060 0.639843 172 | 0.999913 101.502887 173 | 0.111561 3.256383 174 | 0.094755 0.170475 175 | 0.366547 0.488994 176 | 0.179924 -0.871567 177 | 0.969023 99.982789 178 | 0.941420 100.416754 179 | 0.656851 98.520940 180 | 0.983166 99.546591 181 | 0.167843 0.033922 182 | 0.316245 2.171137 183 | 0.817118 102.849575 184 | 0.173642 1.209173 185 | 0.411030 2.022640 186 | 0.265041 2.216470 187 | 0.779660 98.475428 188 | 0.059354 -0.929568 189 | 0.722092 97.974003 190 | 0.511958 101.924447 191 | 0.371938 -0.640602 192 | 0.851009 97.873330 193 | 0.375918 -5.308115 194 | 0.797332 99.763778 195 | 0.107749 -3.770092 196 | 0.156937 -0.876724 197 | 0.960447 99.597097 198 | 0.413434 2.408090 199 | 0.644257 100.453125 200 | 0.119332 -0.495588 201 | -------------------------------------------------------------------------------- /ch9/exp2.txt: -------------------------------------------------------------------------------- 1 | 0.070670 3.470829 2 | 0.534076 6.377132 3 | 0.747221 8.949407 4 | 0.668970 8.034081 5 | 0.586082 6.997721 6 | 0.764962 9.318110 7 | 0.658125 7.880333 8 | 0.346734 4.213359 9 | 0.313967 3.762496 10 | 0.601418 7.188805 11 | 0.404396 4.893403 12 | 0.154345 3.683175 13 | 0.984061 11.712928 14 | 0.597514 7.146694 15 | 0.005144 3.333150 16 | 0.142295 3.743681 17 | 0.280007 3.737376 18 | 0.542008 6.494275 19 | 0.466781 5.532255 20 | 0.706970 8.476718 21 | 0.191038 3.673921 22 | 0.756591 9.176722 23 | 0.912879 10.850358 24 | 0.524701 6.067444 25 | 0.306090 3.681148 26 | 0.429009 5.032168 27 | 0.695091 8.209058 28 | 0.984495 11.909595 29 | 0.702748 8.298454 30 | 0.551771 6.715210 31 | 0.272894 3.983313 32 | 0.014611 3.559081 33 | 0.699852 8.417306 34 | 0.309710 3.739053 35 | 0.444877 5.219649 36 | 0.717509 8.483072 37 | 0.576550 6.894860 38 | 0.284200 3.792626 39 | 0.675922 8.067282 40 | 0.304401 3.671373 41 | 0.233675 3.795962 42 | 0.453779 5.477533 43 | 0.900938 10.701447 44 | 0.502418 6.046703 45 | 0.781843 9.254690 46 | 0.226271 3.546938 47 | 0.619535 7.703312 48 | 0.519998 6.202835 49 | 0.399447 4.934647 50 | 0.785298 9.497564 51 | 0.010767 3.565835 52 | 0.696399 8.307487 53 | 0.524366 6.266060 54 | 0.396583 4.611390 55 | 0.059988 3.484805 56 | 0.946702 11.263118 57 | 0.417559 4.895128 58 | 0.609194 7.239316 59 | 0.730687 8.858371 60 | 0.586694 7.061601 61 | 0.829567 9.937968 62 | 0.964229 11.521595 63 | 0.276813 3.756406 64 | 0.987041 11.947913 65 | 0.876107 10.440538 66 | 0.747582 8.942278 67 | 0.117348 3.567821 68 | 0.188617 3.976420 69 | 0.416655 4.928907 70 | 0.192995 3.978365 71 | 0.244888 3.777018 72 | 0.806349 9.685831 73 | 0.417555 4.990148 74 | 0.233805 3.740022 75 | 0.357325 4.325355 76 | 0.190201 3.638493 77 | 0.705127 8.432886 78 | 0.336599 3.868493 79 | 0.473786 
5.871813 80 | 0.384794 4.830712 81 | 0.502217 6.117244 82 | 0.788220 9.454959 83 | 0.478773 5.681631 84 | 0.064296 3.642040 85 | 0.332143 3.886628 86 | 0.618869 7.312725 87 | 0.854981 10.306697 88 | 0.570000 6.764615 89 | 0.512739 6.166836 90 | 0.112285 3.545863 91 | 0.723700 8.526944 92 | 0.192256 3.661033 93 | 0.181268 3.678579 94 | 0.196731 3.916622 95 | 0.510342 6.026652 96 | 0.263713 3.723018 97 | 0.141105 3.529595 98 | 0.150262 3.552314 99 | 0.824724 9.973690 100 | 0.588088 6.893128 101 | 0.411291 4.856380 102 | 0.763717 9.199101 103 | 0.212118 3.740024 104 | 0.264587 3.742917 105 | 0.973524 11.683243 106 | 0.250670 3.679117 107 | 0.823460 9.743861 108 | 0.253752 3.781488 109 | 0.838332 10.172180 110 | 0.501156 6.113263 111 | 0.097275 3.472367 112 | 0.667199 7.948868 113 | 0.487320 6.022060 114 | 0.654640 7.809457 115 | 0.906907 10.775188 116 | 0.821941 9.936140 117 | 0.859396 10.428255 118 | 0.078696 3.490510 119 | 0.938092 11.252471 120 | 0.998868 11.863062 121 | 0.025501 3.515624 122 | 0.451806 5.441171 123 | 0.883872 10.498912 124 | 0.583567 6.912334 125 | 0.823688 10.003723 126 | 0.891032 10.818109 127 | 0.879259 10.639263 128 | 0.163007 3.662715 129 | 0.344263 4.169705 130 | 0.796083 9.422591 131 | 0.903683 10.978834 132 | 0.050129 3.575105 133 | 0.605553 7.306014 134 | 0.628951 7.556742 135 | 0.877052 10.444055 136 | 0.829402 9.856432 137 | 0.121422 3.638276 138 | 0.721517 8.663569 139 | 0.066532 3.673471 140 | 0.996587 11.782002 141 | 0.653384 7.804568 142 | 0.739494 8.817809 143 | 0.640341 7.636812 144 | 0.337828 3.971613 145 | 0.220512 3.713645 146 | 0.368815 4.381696 147 | 0.782509 9.349428 148 | 0.645825 7.790882 149 | 0.277391 3.834258 150 | 0.092569 3.643274 151 | 0.284320 3.609353 152 | 0.344465 4.023259 153 | 0.182523 3.749195 154 | 0.385001 4.426970 155 | 0.747609 8.966676 156 | 0.188907 3.711018 157 | 0.806244 9.610438 158 | 0.014211 3.517818 159 | 0.574813 7.040672 160 | 0.714500 8.525624 161 | 0.538982 6.393940 162 | 0.384638 4.649362 163 | 0.915586 10.936577 164 | 0.883513 10.441493 165 | 0.804148 9.742851 166 | 0.466011 5.833439 167 | 0.800574 9.638874 168 | 0.654980 8.028558 169 | 0.348564 4.064616 170 | 0.978595 11.720218 171 | 0.915906 10.833902 172 | 0.285477 3.818961 173 | 0.988631 11.684010 174 | 0.531069 6.305005 175 | 0.181658 3.806995 176 | 0.039657 3.356861 177 | 0.893344 10.776799 178 | 0.355214 4.263666 179 | 0.783508 9.475445 180 | 0.039768 3.429691 181 | 0.546308 6.472749 182 | 0.786882 9.398951 183 | 0.168282 3.564189 184 | 0.374900 4.399040 185 | 0.737767 8.888536 186 | 0.059849 3.431537 187 | 0.861891 10.246888 188 | 0.597578 7.112627 189 | 0.126050 3.611641 190 | 0.074795 3.609222 191 | 0.634401 7.627416 192 | 0.831633 9.926548 193 | 0.019095 3.470285 194 | 0.396533 4.773104 195 | 0.794973 9.492009 196 | 0.889088 10.420003 197 | 0.003174 3.587139 198 | 0.176767 3.554071 199 | 0.943730 11.227731 200 | 0.758564 8.885337 201 | -------------------------------------------------------------------------------- /ch9/expTest.txt: -------------------------------------------------------------------------------- 1 | 0.042621 0.705087 2 | 0.140649 1.676077 3 | 0.729711 98.287450 4 | 0.420368 0.893020 5 | 0.055112 -1.784342 6 | 0.335700 -2.039774 7 | 0.480745 -1.165972 8 | 0.039408 -2.453546 9 | 0.713000 99.181124 10 | 0.437107 2.288551 11 | 0.553328 99.909260 12 | 0.146352 -3.900741 13 | 0.753615 97.640436 14 | 0.739062 100.411664 15 | 0.391077 0.380562 16 | 0.887119 102.018433 17 | 0.090234 -1.872570 18 | 0.870459 97.253294 19 | 0.174066 0.716029 20 | 
0.698476 96.591450 21 | 0.463064 0.197371 22 | 0.201708 -3.424533 23 | 0.335499 -2.823621 24 | 0.873611 101.105294 25 | 0.315239 1.893852 26 | 0.258688 -0.604888 27 | 0.331030 2.185822 28 | 0.938692 98.758321 29 | 0.390971 5.619469 30 | 0.946373 101.358201 31 | 0.841116 100.136301 32 | 0.652268 101.167615 33 | 0.488903 1.912745 34 | 0.076776 0.631315 35 | 0.078587 -0.173226 36 | 0.690439 103.351735 37 | 0.992771 99.322329 38 | 0.357646 -1.662827 39 | 0.996224 100.969483 40 | 0.431983 -2.332204 41 | 0.084956 -0.153660 42 | 0.416978 -3.185275 43 | 0.483920 -0.400342 44 | 0.351282 -0.212100 45 | 0.696687 100.399345 46 | 0.610816 100.447063 47 | 0.876386 97.717446 48 | 0.290065 -1.402790 49 | 0.561540 97.719979 50 | 0.521387 102.671802 51 | 0.124250 -1.447424 52 | 0.760795 100.973153 53 | 0.813137 98.418078 54 | 0.322203 -0.210448 55 | 0.222080 -2.382876 56 | 0.012078 0.145758 57 | 0.215864 -1.753234 58 | 0.286381 -0.029690 59 | 0.504148 100.382630 60 | 0.853875 97.561672 61 | 0.077604 1.836922 62 | 0.533825 100.804076 63 | 0.197164 -1.982653 64 | 0.915268 96.773211 65 | 0.637298 98.012823 66 | 0.222793 0.879413 67 | 0.403267 1.696757 68 | 0.365798 -1.228388 69 | 0.470756 -3.196883 70 | 0.007890 -0.725592 71 | 0.348122 3.658900 72 | 0.816112 102.003904 73 | 0.752076 101.766783 74 | 0.722139 99.311245 75 | 0.050637 -0.053007 76 | 0.794114 96.183380 77 | 0.416684 -2.133790 78 | 0.019078 -2.772976 79 | 0.875982 99.771033 80 | 0.393920 -0.334077 81 | 0.240991 -1.351481 82 | 0.975677 98.774986 83 | 0.790547 99.321853 84 | 0.437987 -1.925655 85 | 0.164944 1.045779 86 | 0.197404 0.812910 87 | 0.679754 101.643453 88 | 0.579659 101.453164 89 | 0.022060 -0.116585 90 | 0.181261 -2.269127 91 | 0.223999 -2.179047 92 | 0.409925 1.365931 93 | 0.360634 -4.286442 94 | 0.164986 -0.749713 95 | 0.583409 99.378572 96 | 0.741431 102.861904 97 | 0.494034 -1.145858 98 | 0.411789 2.687350 99 | 0.940651 102.052953 100 | 0.680743 99.299124 101 | 0.453674 -3.107414 102 | 0.164892 1.666987 103 | 0.778335 99.863542 104 | 0.336990 0.938736 105 | 0.501560 101.008483 106 | 0.855588 101.125709 107 | 0.654224 100.980805 108 | 0.653707 98.019920 109 | 0.588863 96.945577 110 | 0.385631 3.146359 111 | 0.050457 -0.106757 112 | 0.822597 100.607049 113 | 0.208452 -0.460245 114 | 0.040589 0.069251 115 | 0.731871 104.981635 116 | 0.427191 -3.934995 117 | 0.623521 97.676660 118 | 0.203501 -0.529907 119 | 0.181543 0.705354 120 | 0.289069 1.085134 121 | 0.652419 98.896461 122 | 0.111964 3.514297 123 | 0.277014 2.301090 124 | 0.497381 -1.877630 125 | 0.994973 98.092916 126 | 0.084255 3.147329 127 | 0.084836 -2.263086 128 | 0.629725 103.448042 129 | 0.741841 99.908137 130 | 0.788823 99.790969 131 | 0.063125 -2.847334 132 | 0.413608 -2.245895 133 | 0.527976 101.466569 134 | 0.596276 101.079191 135 | 0.845748 100.308275 136 | 0.976452 100.197745 137 | 0.475051 2.563985 138 | 0.694542 99.125422 139 | 0.390583 -1.652652 140 | 0.580233 99.861938 141 | 0.622445 97.933787 142 | 0.744950 102.392552 143 | 0.414662 -1.727387 144 | 0.648774 101.371751 145 | 0.013468 -1.718182 146 | 0.781245 98.393098 147 | 0.871697 103.241025 148 | 0.198555 0.407556 149 | 0.427669 -1.826682 150 | 0.281457 0.137682 151 | 0.837984 98.909162 152 | 0.424066 1.060564 153 | 0.837252 100.688719 154 | 0.369463 1.061182 155 | 0.034532 -0.423989 156 | 0.481137 -0.008675 157 | 0.156752 -0.713391 158 | 0.661411 99.255937 159 | 0.176114 -0.302831 160 | 0.478959 -0.367422 161 | 0.874168 97.783253 162 | 0.167500 -0.829583 163 | 0.864995 99.961025 164 | 0.915850 99.090509 165 | 
0.717802 100.059025 166 | 0.497465 -2.379605 167 | 0.351879 -1.832181 168 | 0.600021 99.967671 169 | 0.653842 100.114605 170 | 0.235046 -0.002983 171 | 0.608262 99.428381 172 | 0.979362 95.533709 173 | 0.178479 -0.697517 174 | 0.770679 99.313631 175 | 0.605045 101.927861 176 | 0.342313 -1.473575 177 | 0.927246 101.401583 178 | 0.623712 100.875627 179 | 0.764501 97.575820 180 | 0.670568 101.465970 181 | 0.799404 100.978750 182 | 0.999679 95.883283 183 | 0.341203 -2.047895 184 | 0.640206 98.109133 185 | 0.898167 100.648327 186 | 0.538279 97.178557 187 | 0.804254 102.052744 188 | 0.641926 99.911401 189 | 0.248823 -1.025944 190 | 0.830591 100.349505 191 | 0.468414 -2.691770 192 | 0.492944 0.405210 193 | 0.309762 1.995071 194 | 0.951799 99.978873 195 | 0.935369 104.094296 196 | 0.336673 -4.239911 197 | 0.872527 102.585224 198 | 0.837085 103.322194 199 | 0.525039 99.419610 200 | 0.504804 102.986424 201 | -------------------------------------------------------------------------------- /ch9/matplotlib/基于CART算法构建回归树的简单数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex00.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/放大100倍.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex2.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/测试模型树构建函数的测试数据.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("exp2.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,0],myMat[:,1],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/matplotlib/用于测试回归树的分段常数数据集.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import matplotlib 4 | from numpy import * 5 | 6 | #解析文本数据 7 | def loadDataSet(filename): 8 | dataMat=[] 9 | fr=open(filename) 10 | for line in fr.readlines(): 11 | curLine=line.strip().split('\t') 12 | #将每行数据映射为浮点数 13 | fltLine=map(float,curLine) 14 | 
dataMat.append(fltLine) 15 | return dataMat 16 | 17 | if __name__=="__main__": 18 | import matplotlib.pyplot as plt 19 | myDat = loadDataSet("ex0.txt") 20 | myMat = mat(myDat) 21 | plt.plot(myMat[:,1],myMat[:,2],'ro') 22 | plt.show() 23 | 24 | -------------------------------------------------------------------------------- /ch9/screenshot/README.md: -------------------------------------------------------------------------------- 1 | # screenshot 2 | -------------------------------------------------------------------------------- /ch9/screenshot/基于CART算法构建回归树的简单数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/基于CART算法构建回归树的简单数据集.png -------------------------------------------------------------------------------- /ch9/screenshot/放大100倍.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/放大100倍.png -------------------------------------------------------------------------------- /ch9/screenshot/测试模型树构建函数的测试数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/测试模型树构建函数的测试数据.png -------------------------------------------------------------------------------- /ch9/screenshot/用于测试回归树的分段常数数据集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/用于测试回归树的分段常数数据集.png -------------------------------------------------------------------------------- /ch9/screenshot/骑自行车速度.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lzyrapx/Machine-Learning-in-Action/69f8bab179407fb8fb6c39fbebdfca794e5f0b12/ch9/screenshot/骑自行车速度.png -------------------------------------------------------------------------------- /ch9/sine.txt: -------------------------------------------------------------------------------- 1 | 0.190350 0.878049 2 | 0.306657 -0.109413 3 | 0.017568 0.030917 4 | 0.122328 0.951109 5 | 0.076274 0.774632 6 | 0.614127 -0.250042 7 | 0.220722 0.807741 8 | 0.089430 0.840491 9 | 0.278817 0.342210 10 | 0.520287 -0.950301 11 | 0.726976 0.852224 12 | 0.180485 1.141859 13 | 0.801524 1.012061 14 | 0.474273 -1.311226 15 | 0.345116 -0.319911 16 | 0.981951 -0.374203 17 | 0.127349 1.039361 18 | 0.757120 1.040152 19 | 0.345419 -0.429760 20 | 0.314532 -0.075762 21 | 0.250828 0.657169 22 | 0.431255 -0.905443 23 | 0.386669 -0.508875 24 | 0.143794 0.844105 25 | 0.470839 -0.951757 26 | 0.093065 0.785034 27 | 0.205377 0.715400 28 | 0.083329 0.853025 29 | 0.243475 0.699252 30 | 0.062389 0.567589 31 | 0.764116 0.834931 32 | 0.018287 0.199875 33 | 0.973603 -0.359748 34 | 0.458826 -1.113178 35 | 0.511200 -1.082561 36 | 0.712587 0.615108 37 | 0.464745 -0.835752 38 | 0.984328 -0.332495 39 | 0.414291 -0.808822 40 | 0.799551 1.072052 41 | 0.499037 -0.924499 42 | 0.966757 -0.191643 43 | 0.756594 0.991844 44 | 0.444938 -0.969528 45 | 0.410167 -0.773426 46 | 0.532335 -0.631770 47 | 0.343909 -0.313313 48 | 0.854302 0.719307 49 | 0.846882 0.916509 50 | 0.740758 1.009525 51 | 0.150668 0.832433 52 | 0.177606 0.893017 53 | 0.445289 -0.898242 
54 | 0.734653 0.787282 55 | 0.559488 -0.663482 56 | 0.232311 0.499122 57 | 0.934435 -0.121533 58 | 0.219089 0.823206 59 | 0.636525 0.053113 60 | 0.307605 0.027500 61 | 0.713198 0.693978 62 | 0.116343 1.242458 63 | 0.680737 0.368910 64 | 0.484730 -0.891940 65 | 0.929408 0.234913 66 | 0.008507 0.103505 67 | 0.872161 0.816191 68 | 0.755530 0.985723 69 | 0.620671 0.026417 70 | 0.472260 -0.967451 71 | 0.257488 0.630100 72 | 0.130654 1.025693 73 | 0.512333 -0.884296 74 | 0.747710 0.849468 75 | 0.669948 0.413745 76 | 0.644856 0.253455 77 | 0.894206 0.482933 78 | 0.820471 0.899981 79 | 0.790796 0.922645 80 | 0.010729 0.032106 81 | 0.846777 0.768675 82 | 0.349175 -0.322929 83 | 0.453662 -0.957712 84 | 0.624017 -0.169913 85 | 0.211074 0.869840 86 | 0.062555 0.607180 87 | 0.739709 0.859793 88 | 0.985896 -0.433632 89 | 0.782088 0.976380 90 | 0.642561 0.147023 91 | 0.779007 0.913765 92 | 0.185631 1.021408 93 | 0.525250 -0.706217 94 | 0.236802 0.564723 95 | 0.440958 -0.993781 96 | 0.397580 -0.708189 97 | 0.823146 0.860086 98 | 0.370173 -0.649231 99 | 0.791675 1.162927 100 | 0.456647 -0.956843 101 | 0.113350 0.850107 102 | 0.351074 -0.306095 103 | 0.182684 0.825728 104 | 0.914034 0.305636 105 | 0.751486 0.898875 106 | 0.216572 0.974637 107 | 0.013273 0.062439 108 | 0.469726 -1.226188 109 | 0.060676 0.599451 110 | 0.776310 0.902315 111 | 0.061648 0.464446 112 | 0.714077 0.947507 113 | 0.559264 -0.715111 114 | 0.121876 0.791703 115 | 0.330586 -0.165819 116 | 0.662909 0.379236 117 | 0.785142 0.967030 118 | 0.161352 0.979553 119 | 0.985215 -0.317699 120 | 0.457734 -0.890725 121 | 0.171574 0.963749 122 | 0.334277 -0.266228 123 | 0.501065 -0.910313 124 | 0.988736 -0.476222 125 | 0.659242 0.218365 126 | 0.359861 -0.338734 127 | 0.790434 0.843387 128 | 0.462458 -0.911647 129 | 0.823012 0.813427 130 | 0.594668 -0.603016 131 | 0.498207 -0.878847 132 | 0.574882 -0.419598 133 | 0.570048 -0.442087 134 | 0.331570 -0.347567 135 | 0.195407 0.822284 136 | 0.814327 0.974355 137 | 0.641925 0.073217 138 | 0.238778 0.657767 139 | 0.400138 -0.715598 140 | 0.670479 0.469662 141 | 0.069076 0.680958 142 | 0.294373 0.145767 143 | 0.025628 0.179822 144 | 0.697772 0.506253 145 | 0.729626 0.786519 146 | 0.293071 0.259997 147 | 0.531802 -1.095833 148 | 0.487338 -1.034481 149 | 0.215780 0.933506 150 | 0.625818 0.103845 151 | 0.179389 0.892237 152 | 0.192552 0.915516 153 | 0.671661 0.330361 154 | 0.952391 -0.060263 155 | 0.795133 0.945157 156 | 0.950494 -0.071855 157 | 0.194894 1.000860 158 | 0.351460 -0.227946 159 | 0.863456 0.648456 160 | 0.945221 -0.045667 161 | 0.779840 0.979954 162 | 0.996606 -0.450501 163 | 0.632184 -0.036506 164 | 0.790898 0.994890 165 | 0.022503 0.386394 166 | 0.318983 -0.152749 167 | 0.369633 -0.423960 168 | 0.157300 0.962858 169 | 0.153223 0.882873 170 | 0.360068 -0.653742 171 | 0.433917 -0.872498 172 | 0.133461 0.879002 173 | 0.757252 1.123667 174 | 0.309391 -0.102064 175 | 0.195586 0.925339 176 | 0.240259 0.689117 177 | 0.340591 -0.455040 178 | 0.243436 0.415760 179 | 0.612755 -0.180844 180 | 0.089407 0.723702 181 | 0.469695 -0.987859 182 | 0.943560 -0.097303 183 | 0.177241 0.918082 184 | 0.317756 -0.222902 185 | 0.515337 -0.733668 186 | 0.344773 -0.256893 187 | 0.537029 -0.797272 188 | 0.626878 0.048719 189 | 0.208940 0.836531 190 | 0.470697 -1.080283 191 | 0.054448 0.624676 192 | 0.109230 0.816921 193 | 0.158325 1.044485 194 | 0.976650 -0.309060 195 | 0.643441 0.267336 196 | 0.215841 1.018817 197 | 0.905337 0.409871 198 | 0.154354 0.920009 199 | 0.947922 -0.112378 200 | 0.201391 0.768894 201 | 
-------------------------------------------------------------------------------- /ch9/treeExplore.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import regTrees 3 | from numpy import * 4 | import matplotlib 5 | from Tkinter import * 6 | 7 | matplotlib.use('TkAgg') # use the TkAgg backend 8 | # hook TkAgg up to matplotlib 9 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 10 | from matplotlib.figure import Figure 11 | 12 | 13 | def reDraw(tolS, tolN): 14 | reDraw.f.clf() # clear the previous figure 15 | reDraw.a = reDraw.f.add_subplot(111)# add a fresh subplot 16 | if chkBtnVar.get():# is the Model Tree checkbox ticked? 17 | if tolN < 2: tolN = 2 18 | myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,regTrees.modelErr, (tolS, tolN)) 19 | yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval) 20 | else: 21 | myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN)) 22 | yHat = regTrees.createForeCast(myTree, reDraw.testDat) 23 | reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5) # plot the true values 24 | reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0) # plot the predictions 25 | reDraw.canvas.show() 26 | 27 | 28 | def getInputs():# read the parameters typed into the entry widgets 29 | try:# tolN is expected to be an integer 30 | tolN = int(tolNentry.get()) 31 | except:# on bad input, clear the field and fall back to the default 32 | tolN = 10 33 | print("enter Integer for tolN") 34 | tolNentry.delete(0, END) 35 | tolNentry.insert(0, '10') 36 | try:# tolS is expected to be a float 37 | tolS = float(tolSentry.get()) 38 | except: 39 | tolS = 1.0 40 | print("enter Float for tolS") 41 | tolSentry.delete(0, END) 42 | tolSentry.insert(0, '1.0') 43 | return tolN, tolS 44 | 45 | 46 | def drawNewTree(): 47 | tolN, tolS = getInputs() # get the parameters from the input boxes 48 | reDraw(tolS, tolN) # redraw the plot 49 | 50 | root = Tk() 51 | 52 | reDraw.f = Figure(figsize=(5, 4), dpi=100) # create the drawing canvas 53 | reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root) 54 | reDraw.canvas.show() 55 | reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3) 56 | 57 | Label(root, text="tolN").grid(row=1, column=0) 58 | tolNentry = Entry(root) 59 | tolNentry.grid(row=1, column=1) 60 | tolNentry.insert(0, '10') 61 | Label(root, text="tolS").grid(row=2, column=0) 62 | tolSentry = Entry(root) 63 | tolSentry.grid(row=2, column=1) 64 | tolSentry.insert(0, '1.0') 65 | Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3) 66 | chkBtnVar = IntVar() 67 | chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar) 68 | chkBtn.grid(row=3, column=0, columnspan=2) 69 | 70 | reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt')) 71 | reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01) 72 | reDraw(1.0, 10) 73 | 74 | root.mainloop() --------------------------------------------------------------------------------
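treeExplore.py imports createForeCast and modelTreeEval from regTrees, which is not reproduced in this section. The following is a rough sketch of what such forecasting helpers look like for the tree dictionaries built by createTree above, assuming one feature row per prediction; the actual code in ch9/regTrees.py may differ in detail:

```python
# Forecasting sketch: evaluate a regression tree or model tree on new inputs.
from numpy import mat, ones, zeros, shape

def isTree(obj):
    return type(obj).__name__ == 'dict'

def regTreeEval(model, inDat):
    # regression-tree leaf: the stored constant is the prediction
    return float(model)

def modelTreeEval(model, inDat):
    # model-tree leaf: evaluate the stored linear model on the input row
    n = shape(inDat)[1]
    X = mat(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat            # prepend the constant term, as in linearSolve
    return float(X * model)

def treeForeCast(tree, inData, modelEval=regTreeEval):
    # walk the tree for a single input row until a leaf is reached
    if not isTree(tree):
        return modelEval(tree, inData)
    if inData[0, tree['spInd']] > tree['spVal']:
        branch = tree['left']        # createTree sends values > spVal to the left branch
    else:
        branch = tree['right']
    if isTree(branch):
        return treeForeCast(branch, inData, modelEval)
    return modelEval(branch, inData)

def createForeCast(tree, testData, modelEval=regTreeEval):
    # predict every row of testData and return the predictions as a column vector
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat
```

With helpers of this shape, the corrcoef comparison described in the ch9 README can be run on held-out data to compare a regression tree, a model tree and plain linear regression.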