├── README.md
├── blog02-Kmeans
    └── test01.py
├── blog03-Kmeans-yh
    ├── data.txt
    ├── result01.png
    ├── test01.py
    └── test02.py
├── blog04-DTC
    ├── result.png
    ├── test01.py
    ├── test02.py
    ├── test03.py
    └── test04.py
├── blog05-LR
    ├── res.png
    ├── res02.png
    ├── res03.png
    ├── test01.py
    ├── test02.py
    ├── test03.py
    └── test04.py
├── blog06-Numpy+Matplotlib
    ├── data.xls
    ├── test01.py
    ├── test02.py
    ├── test03.py
    ├── test04.py
    ├── test05-matplotlib.py
    ├── test06-matplotlib.py
    └── test07-matplotlib.py
├── blog07-pac
    ├── result01.png
    ├── result02.png
    ├── test01.py
    ├── test02.py
    └── test03.py
├── blog08-Apriori
    └── test01.py
├── blog09-LinearRegression
    ├── Index
    ├── glass.csv
    ├── glass.data
    ├── glass.names
    ├── glass.tag
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── test01.py
    └── test02.py
├── blog10-Pandas
    ├── 41.txt
    ├── bankloan.png
    ├── ccc.png
    ├── data.csv
    ├── data2.xlsx
    ├── guiyang.png
    ├── test01.py
    ├── test02.py
    ├── test03.py
    ├── test04.py
    ├── test05.py
    ├── test06-dalian.py
    ├── test07.py
    ├── 时序图.png
    └── 贵阳自相关图.png
├── blog11-Matplotlib+SQL
    ├── test01.py
    ├── test02.py
    ├── test03.py
    └── test04.py
├── blog12-matplotlib+SQL
    ├── test01.py
    ├── test02.py
    ├── test03.py
    └── test04.py
├── blog13-wordcloud
    ├── cloudimg.png
    ├── mb.png
    ├── result01.png
    ├── test.txt
    ├── test01.py
    └── test02.py
├── blog14-curve_fit
    ├── data.csv
    ├── result01.png
    ├── result02.png
    ├── test01.py
    ├── test02.py
    ├── test03.py
    ├── test04.py
    ├── test3.png
    └── test4.png
├── blog15-imshow
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── test01.py
    ├── test02.py
    ├── test03.py
    ├── test04.py
    ├── test05.py
    ├── test06.py
    └── test07.py
├── blog16-LR
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── test01.py
    ├── test02.py
    └── test03.py
├── blog17-networkx
    ├── result01.png
    ├── test01.py
    └── test02.py
├── blog18-Regression
    ├── blog01-LR.py
    ├── blog02-LR.py
    ├── blog03-boston.py
    ├── blog04-boson.py
    ├── blog05-random.py
    ├── blog06-random.py
    ├── blog07-3Drandom.py
    ├── blog08-PolynomialFeatures.py
    ├── blog09-PolynomialFeatures.py
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── result04.png
    ├── result05.png
    └── result06.png
├── blog19-Iris
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── result04.png
    ├── result05.png
    ├── result06.png
    ├── result07.png
    ├── result08.png
    ├── result09.png
    ├── test01.py
    ├── test02-hist.py
    ├── test03-plot.py
    ├── test04-kde.py
    ├── test05-box.py
    ├── test06-box.py
    ├── test07-show.py
    ├── test08-LR.py
    ├── test09-Kmeans.py
    ├── test10-Kmeans.py
    └── test11-Kmeans.py
├── blog20-KNN
    ├── blog01.py
    ├── blog02.py
    ├── blog03.py
    ├── blog04.py
    ├── result.png
    ├── result02.png
    └── wine.txt
├── blog21-NB
    ├── blog01.py
    ├── blog02.py
    ├── blog03.py
    ├── blog04-getdata.py
    ├── blog05-fenci.py
    ├── blog06-static.py
    ├── blog07-classifier.py
    ├── data.csv
    ├── data_preprocess.py
    ├── result.png
    ├── result2.png
    ├── seed.txt
    ├── seed_x.csv
    └── seed_y.csv
├── blog22-Basemap
    ├── 001.png
    ├── 002.png
    ├── 003.png
    ├── 004.png
    ├── 005.png
    ├── 006.png
    ├── basemap下载.txt
    ├── blog-001.py
    ├── blog-002.py
    ├── blog-003.py
    ├── blog-004.py
    ├── blog-005.py
    ├── blog-006.py
    └── blog-007.py
├── blog23-statsmodels
    ├── blog01.py
    ├── blog02.py
    ├── blog03_show.py
    ├── blog04_show.py
    ├── blog05_groupby.py
    ├── blog06_ARIMA.py
    ├── blog07_ARIMA.py
    ├── blog08_statsmodels.py
    ├── blog09_statsmodels.py
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── result04.png
    └── result05.png
├── blog24-Kmeans-Chinese
    ├── BaiduSpiderSpots.rar
    ├── HudongSpider_Result.txt
    ├── blog01_merge.py
    ├── blog02_spider.py
    ├── blog03_fenci.py
    ├── blog04_kmeans.py
    └── result.png
├── blog25-Matplotlib
    ├── allname.txt
    ├── plot.png
    ├── test01-show.py
    ├── test02-show.py
    ├── test03-kmeans.py
    └── test04-kmeans.py
├── blog26-SnowNLP
    ├── data.txt
    ├── result01.png
    ├── result02.png
    ├── result03.png
    ├── result04.png
    ├── result05.png
    ├── test-douban.csv
    ├── test01-spider.py
    ├── test02-wordcloud.py
    ├── test03-snownlp01.py
    ├── test04-snownlp02.py
    ├── test05-snownlp03.py
    ├── test06-snownlp-show.py
    ├── test07-snownlp-show.py
    └── test08-snownlp-show.py
├── blog27-SVM&WineDataset
    ├── data intro.txt
    ├── result01.png
    ├── result02.png
    ├── test01-svm.py
    ├── test02-datapre.py
    ├── test03-svm.py
    ├── test04-update.py
    └── wine.txt
├── blog28-LDA&pyLDAvis
    ├── data.csv
    ├── result.png
    ├── test01-read.py
    ├── test02-jieba.py
    ├── test03-tfidf.py
    ├── test04-lda.py
    └── test05-pyLDAvis.py
└── blog29-DataPreprocessing&KNN
    ├── kddcup.data_10_percent_corrected
    ├── kddcup.data_10_percent_corrected-result-minmax.csv
    ├── kddcup.data_10_percent_corrected-result.csv
    ├── kddcup.data_10_percent_corrected.csv
    ├── result01.png
    ├── result02.png
    ├── test01-data pre.py
    ├── test02-zscoreNormalization.py
    ├── test03-minmax.py
    ├── test04-knn-roc.py
    ├── test05-knn-gitHub-roc.py
    └── test06-knn.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Python-for-Data-Mining
 2 | 该资源为作者在CSDN撰写的Python数据挖掘和数据分析系列文章的配套代码，主要包括数据挖掘、机器学习、文本挖掘等算法的Python代码实现，希望该资源对您有所帮助，一起加油。
 3 | 
 4 | > 该部分代码修改成了Python 3.x版本，与Python 2.x略微不同。<br />
 5 | > 大家注意其差异即可，这也是为了更好地帮助同学们适应新的版本。
 6 | 
 7 | ---
 8 | 
 9 | 具体内容请参照如下CSDN博客：
10 | 
11 | [【Python数据挖掘课程】一.安装Python及爬虫入门介绍](https://blog.csdn.net/eastmount/article/details/52577215) <br />
12 | [【Python数据挖掘课程】二.Kmeans聚类数据分析及Anaconda介绍](https://blog.csdn.net/eastmount/article/details/52777308) <br />
13 | [【Python数据挖掘课程】三.Kmeans聚类代码实现、作业及优化](https://blog.csdn.net/eastmount/article/details/52793549) <br />
14 | [【Python数据挖掘课程】四.决策树DTC数据分析及鸢尾数据集分析](https://blog.csdn.net/eastmount/article/details/52820400) <br />
15 | [【Python数据挖掘课程】五.线性回归知识及预测糖尿病实例](https://blog.csdn.net/eastmount/article/details/52929765) <br />
16 | [【Python数据挖掘课程】六.Numpy、Pandas和Matplotlib包基础知识](https://blog.csdn.net/eastmount/article/details/53144633) <br />
17 | [【Python数据挖掘课程】七.PCA降维操作及subplot子图绘制](https://blog.csdn.net/eastmount/article/details/53285192) <br />
18 | [【Python数据挖掘课程】八.关联规则挖掘及Apriori实现购物推荐](https://blog.csdn.net/eastmount/article/details/53368440) <br />
19 | [【Python数据挖掘课程】九.回归模型LinearRegression简单分析氧化物数据](https://blog.csdn.net/eastmount/article/details/60468818) <br />
20 | [【python数据挖掘课程】十.Pandas、Matplotlib、PCA绘图实用代码补充](https://blog.csdn.net/eastmount/article/details/60675865) <br />
21 | [【python数据挖掘课程】十一.Pandas、Matplotlib结合SQL语句可视化分析](https://blog.csdn.net/eastmount/article/details/62489186) <br />
22 | [【python数据挖掘课程】十二.Pandas、Matplotlib结合SQL语句对比图分析](https://blog.csdn.net/eastmount/article/details/64127445) <br />
23 | [【python数据挖掘课程】十三.WordCloud词云配置过程及词频分析](https://blog.csdn.net/eastmount/article/details/64438407) <br />
24 | [【python数据挖掘课程】十四.Scipy调用curve_fit实现曲线拟合](https://blog.csdn.net/eastmount/article/details/71308373) <br />
25 | [【python数据挖掘课程】十五.Matplotlib调用imshow()函数绘制热图](https://blog.csdn.net/eastmount/article/details/73392106) <br />
26 | [【python数据挖掘课程】十六.逻辑回归LogisticRegression分析鸢尾花数据](https://blog.csdn.net/eastmount/article/details/77920470) <br />
27 | [【python数据挖掘课程】十七.社交网络Networkx库分析人物关系（初识篇）](https://blog.csdn.net/eastmount/article/details/78452581) <br />
28 | [【python数据挖掘课程】十八.线性回归及多项式回归分析四个案例分享](https://blog.csdn.net/eastmount/article/details/78635096) <br />
29 | [【python数据挖掘课程】十九.鸢尾花数据集可视化、线性回归、决策树花样分析](https://blog.csdn.net/eastmount/article/details/78692227) <br />
30 | [【python数据挖掘课程】二十.KNN最近邻分类算法分析详解及平衡秤TXT数据集读取](https://blog.csdn.net/eastmount/article/details/78747128) <br />
31 | [【python数据挖掘课程】二十一.朴素贝叶斯分类器详解及中文文本舆情分析](https://blog.csdn.net/eastmount/article/details/79128235) <br />
32 | [【python数据挖掘课程】二十二.Basemap地图包安装入门及基础知识讲解](https://blog.csdn.net/eastmount/article/details/79188415) <br />
33 | [【python数据挖掘课程】二十三.时间序列金融数据预测及Pandas库详解](https://blog.csdn.net/eastmount/article/details/79188415) <br />
34 | [【python数据挖掘课程】二十四.KMeans文本聚类分析互动百科语料](https://blog.csdn.net/eastmount/article/details/80935427) <br />
35 | [【python数据挖掘课程】二十五.Matplotlib绘制带主题及聚类类标的散点图](https://blog.csdn.net/Eastmount/article/details/81106487) <br />
36 | [【python数据挖掘课程】二十六.基于SnowNLP的豆瓣评论情感分析](https://blog.csdn.net/Eastmount/article/details/85118818) <br />
37 | [【python数据挖掘课程】二十七.基于SVM分类器的红酒数据分析](https://blog.csdn.net/Eastmount/article/details/86512901) <br />
38 | [【python数据挖掘课程】二十八.基于LDA和pyLDAvis的主题挖掘及可视化分析](https://blog.csdn.net/Eastmount/article/details/91380607) <br />
39 | [【python数据挖掘课程】二十九.数据预处理之字符型转换数值型、标准化、归一化处理](https://blog.csdn.net/Eastmount/article/details/103212931) <br />
40 | 
41 | 
42 | 效果图显示如下：
43 | 
44 | <div align="center">
45 |   <img src="https://img-blog.csdnimg.cn/20190116191907871.png" height="60%" width="60%" />
46 | </div>
47 | <br />
48 | 
49 | <div align="center">
50 |   <img src="https://img-blog.csdnimg.cn/20181221110034516.png" height="60%" width="60%" />
51 | </div>
52 | 
53 | <div align="center">
54 |   <img src="https://img-blog.csdn.net/20180129175514027" height="60%" width="60%" />
55 | </div>
56 | 
57 | 
58 | 
59 | <br />
60 | 
61 | ---
62 | 
63 | 都是非常基础的文章，如果有错误或不足之处，还请告知及海涵，谢谢您的鼓励与支持，请帮忙点个Star！您的支持是我最大的动力，共勉~
64 | 
65 | 数据挖掘相关知识分享。
66 | 
67 | By：杨秀璋 Eastmount
68 | 
69 | 2021-01-21
70 | 
71 | 


--------------------------------------------------------------------------------
/blog02-Kmeans/test01.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 第一部分：导入包
 3 | 从sklearn.cluster机器学习聚类包中导入KMeans聚类
 4 | """
 5 | # coding=utf-8  
 6 | from sklearn.cluster import Birch
 7 | from sklearn.cluster import KMeans
 8 |  
 9 | """
10 | 第二部分：数据集
11 | X表示二维矩阵数据，篮球运动员比赛数据
12 | 总共20行，每行两列数据
13 | 第一列表示球员每分钟助攻数：assists_per_minute
14 | 第二列表示球员每分钟得分数：points_per_minute
15 | """
16 |  
17 | X = [[0.0888, 0.5885],
18 |      [0.1399, 0.8291],
19 |      [0.0747, 0.4974],
20 |      [0.0983, 0.5772],
21 |      [0.1276, 0.5703],
22 |      [0.1671, 0.5835],
23 |      [0.1906, 0.5276],
24 |      [0.1061, 0.5523],
25 |      [0.2446, 0.4007],
26 |      [0.1670, 0.4770],
27 |      [0.2485, 0.4313],
28 |      [0.1227, 0.4909],
29 |      [0.1240, 0.5668],
30 |      [0.1461, 0.5113],
31 |      [0.2315, 0.3788],
32 |      [0.0494, 0.5590],
33 |      [0.1107, 0.4799],
34 |      [0.2521, 0.5735],
35 |      [0.1007, 0.6318],
36 |      [0.1067, 0.4326],
37 |      [0.1956, 0.4280]   
38 |     ]
39 |  
40 | #输出数据集
41 | print(X)
42 |  
43 |  
44 | """
45 | 第三部分：KMeans聚类
46 | clf = KMeans(n_clusters=3) 表示类簇数为3，聚成3类数据，clf即赋值为KMeans
47 | y_pred = clf.fit_predict(X) 载入数据集X，并且将聚类的结果赋值给y_pred
48 | """
49 |  
50 | clf = KMeans(n_clusters=3)
51 | y_pred = clf.fit_predict(X)
52 |  
53 | #输出完整Kmeans函数，包括很多省略参数
54 | print(clf)
55 | #输出聚类预测结果，20行数据，每个y_pred对应X一行或一个球员，聚成3类，类标为0、1、2
56 | print(y_pred)
57 |  
58 |  
59 | """
60 | 第四部分：可视化绘图
61 | Python导入Matplotlib包，专门用于绘图
62 | import matplotlib.pyplot as plt 此处as相当于重命名，plt用于显示图像
63 | """
64 |  
65 | import numpy as np
66 | import matplotlib.pyplot as plt
67 |  
68 | #获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列
69 | x = [n[0] for n in X]
70 | print(x)
71 | y = [n[1] for n in X]
72 | print(y)
73 |  
74 | #绘制散点图 参数：x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点
75 | plt.scatter(x, y, c=y_pred, marker='x')
76 |  
77 | #绘制标题
78 | plt.title("Kmeans-Basketball Data")
79 |  
80 | #绘制x轴和y轴坐标
81 | plt.xlabel("assists_per_minute")
82 | plt.ylabel("points_per_minute")
83 |  
84 | #设置右上角图例
85 | plt.legend(["A","B","C"])
86 |  
87 | #显示图形
88 | plt.show()
89 | 


--------------------------------------------------------------------------------
/blog03-Kmeans-yh/data.txt:
--------------------------------------------------------------------------------
 1 | 0.0888     201     36.02     28     0.5885
 2 | 0.1399     198     39.32     30     0.8291
 3 | 0.0747     198     38.8     26     0.4974
 4 | 0.0983     191     40.71     30     0.5772
 5 | 0.1276     196     38.4     28     0.5703
 6 | 0.1671     201     34.1     31     0.5835
 7 | 0.1906     193     36.2     30     0.5276
 8 | 0.1061     191     36.75     27     0.5523
 9 | 0.2446     185     38.43     29     0.4007
10 | 0.167     203     33.54     24     0.477
11 | 0.2485     188     35.01     27     0.4313
12 | 0.1227     198     36.67     29     0.4909
13 | 0.124     185     33.88     24     0.5668
14 | 0.1461     191     35.59     30     0.5113
15 | 0.2315     191     38.01     28     0.3788
16 | 0.0494     193     32.38     32     0.559
17 | 0.1107     196     35.22     25     0.4799
18 | 0.2521     183     31.73     29     0.5735
19 | 0.1007     193     28.81     34     0.6318
20 | 0.1067     196     35.6     23     0.4326
21 | 0.1956     188     35.28     32     0.428
22 | 0.1828     191     29.54     28     0.4401
23 | 0.1627     196     31.35     28     0.5581
24 | 0.1403     198     33.5     23     0.4866
25 | 0.1563     193     34.56     32     0.5267
26 | 0.2681     183     39.53     27     0.5439
27 | 0.1236     196     26.7     34     0.4419
28 | 0.13     188     30.77     26     0.3998
29 | 0.0896     198     25.67     30     0.4325
30 | 0.2071     178     36.22     30     0.4086
31 | 0.2244     185     36.55     23     0.4624
32 | 0.3437     185     34.91     31     0.4325
33 | 0.1058     191     28.35     28     0.4903
34 | 0.2326     185     33.53     27     0.4802
35 | 0.1577     193     31.07     25     0.4345
36 | 0.2327     185     36.52     32     0.4819
37 | 0.1256     196     27.87     29     0.6244
38 | 0.107     198     24.31     34     0.3991
39 | 0.1343     193     31.26     28     0.4414
40 | 0.0586     196     22.18     23     0.4013
41 | 0.2383     185     35.25     26     0.3801
42 | 0.1006     198     22.87     30     0.3498
43 | 0.2164     193     24.49     32     0.3185
44 | 0.1485     198     23.57     27     0.3097
45 | 0.227     191     31.72     27     0.4319
46 | 0.1649     188     27.9     25     0.3799
47 | 0.1188     191     22.74     24     0.4091
48 | 0.194     193     20.62     27     0.3588
49 | 0.2495     185     30.46     25     0.4727
50 | 0.2378     185     32.38     27     0.3212
51 | 0.1592     191     25.75     31     0.3418
52 | 0.2069     170     33.84     30     0.4285
53 | 0.2084     185     27.83     25     0.3917
54 | 0.0877     193     21.67     26     0.5769
55 | 0.101     193     21.79     24     0.4773
56 | 0.0942     201     20.17     26     0.4512
57 | 0.055     193     29.07     31     0.3096
58 | 0.1071     196     24.28     24     0.3089
59 | 0.0728     193     19.24     27     0.4573
60 | 0.2771     180     27.07     28     0.3214
61 | 0.0528     196     18.95     22     0.5437
62 | 0.213     188     21.59     30     0.4121
63 | 0.1356     193     13.27     31     0.2185
64 | 0.1043     196     16.3     23     0.3313
65 | 0.113     191     23.01     25     0.3302
66 | 0.1477     196     20.31     31     0.4677
67 | 0.1317     188     17.46     33     0.2406
68 | 0.2187     191     21.95     28     0.3007
69 | 0.2127     188     14.57     37     0.2471
70 | 0.2547     160     34.55     28     0.2894
71 | 0.1591     191     22.0     24     0.3682
72 | 0.0898     196     13.37     34     0.389
73 | 0.2146     188     20.51     24     0.512
74 | 0.1871     183     19.78     28     0.4449
75 | 0.1528     191     16.36     33     0.4035
76 | 0.156     191     16.03     23     0.2683
77 | 0.2348     188     24.27     26     0.2719
78 | 0.1623     180     18.49     28     0.3408
79 | 0.1239     180     17.76     26     0.4393
80 | 0.2178     185     13.31     25     0.3004
81 | 0.1608     185     17.41     26     0.3503
82 | 0.0805     193     13.67     25     0.4388
83 | 0.1776     193     17.46     27     0.2578
84 | 0.1668     185     14.38     35     0.2989
85 | 0.1072     188     12.12     31     0.4455
86 | 0.1821     185     12.63     25     0.3087
87 | 0.188     180     12.24     30     0.3678
88 | 0.1167     196     12.0     24     0.3667
89 | 0.2617     185     24.46     27     0.3189
90 | 0.1994     188     20.06     27     0.4187
91 | 0.1706     170     17.0     25     0.5059
92 | 0.1554     183     11.58     24     0.3195
93 | 0.2282     185     10.08     24     0.2381
94 | 0.1778     185     18.56     23     0.2802
95 | 0.1863     185     11.81     23     0.381
96 | 0.1014     193     13.81     32     0.1593


--------------------------------------------------------------------------------
/blog03-Kmeans-yh/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog03-Kmeans-yh/result01.png


--------------------------------------------------------------------------------
/blog03-Kmeans-yh/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 |   
 3 | from sklearn.cluster import Birch  
 4 | from sklearn.cluster import KMeans  
 5 |   
 6 | X = [[0.0888, 0.5885],  
 7 |      [0.1399, 0.8291],  
 8 |      [0.0747, 0.4974],  
 9 |      [0.0983, 0.5772],  
10 |      [0.1276, 0.5703],  
11 |      [0.1671, 0.5835],  
12 |      [0.1906, 0.5276],  
13 |      [0.1061, 0.5523],  
14 |      [0.2446, 0.4007],  
15 |      [0.1670, 0.4770],  
16 |      [0.2485, 0.4313],  
17 |      [0.1227, 0.4909],  
18 |      [0.1240, 0.5668],  
19 |      [0.1461, 0.5113],  
20 |      [0.2315, 0.3788],  
21 |      [0.0494, 0.5590],  
22 |      [0.1107, 0.4799],  
23 |      [0.2521, 0.5735],  
24 |      [0.1007, 0.6318],  
25 |      [0.1067, 0.4326],  
26 |      [0.1956, 0.4280]     
27 |     ]  
28 | print(X)
29 |  
30 | # Kmeans聚类
31 | clf = KMeans(n_clusters=3)  
32 | y_pred = clf.fit_predict(X)  
33 | print(clf)   
34 | print(y_pred)  
35 |  
36 |  
37 | import numpy as np  
38 | import matplotlib.pyplot as plt  
39 |   
40 | x = [n[0] for n in X]  
41 | print(x)
42 | y = [n[1] for n in X]  
43 | print(y)  
44 |  
45 | # 可视化操作
46 | plt.scatter(x, y, c=y_pred, marker='x')   
47 | plt.title("Kmeans-Basketball Data")   
48 | plt.xlabel("assists_per_minute")  
49 | plt.ylabel("points_per_minute")  
50 | plt.legend(["Rank"])   
51 | plt.show()  
52 | 


--------------------------------------------------------------------------------
/blog03-Kmeans-yh/test02.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 |  
  3 | """
  4 | By: Eastmount CSDN 2016-10-12
  5 | 该部分讲数据集读取，然后赋值给X变量
  6 | 读取文件data.txt 保存结果为X
  7 | """
  8 |  
  9 | import os
 10 |  
 11 | data = []
 12 | for line in open("data.txt", "r").readlines():
 13 |     line = line.rstrip()    #删除换行
 14 |     #删除多余空格，保存一个空格连接
 15 |     result = ' '.join(line.split())
 16 |     #获取每行五个值 '0 0.0888 201 36.02 28 0.5885' 注意：字符串转换为浮点型数
 17 |     s = [float(x) for x in result.strip().split(' ')]
 18 |     #输出结果：['0', '0.0888', '201', '36.02', '28', '0.5885']
 19 |     print(s)
 20 |     #数据存储至data
 21 |     data.append(s)
 22 |  
 23 | #输出完整数据集
 24 | print('完整数据集')
 25 | print(data)
 26 | print(type(data))
 27 |  
 28 | '''
 29 | 现在输出数据集：
 30 | ['0 0.0888 201 36.02 28 0.5885', 
 31 |  '1 0.1399 198 39.32 30 0.8291', 
 32 |  '2 0.0747 198 38.80 26 0.4974', 
 33 |  '3 0.0983 191 40.71 30 0.5772', 
 34 |  '4 0.1276 196 38.40 28 0.5703'
 35 | ]
 36 | '''
 37 |  
 38 | print('第一列 第五列数据')
 39 | L2 = [n[0] for n in data]
 40 | print(L2)
 41 | L5 = [n[4] for n in data]
 42 | print(L5)
 43 |  
 44 | '''
 45 | X表示二维矩阵数据，篮球运动员比赛数据
 46 | 总共96行，每行获取两列数据
 47 | 第一列表示球员每分钟助攻数：assists_per_minute
 48 | 第五列表示球员每分钟得分数：points_per_minute
 49 | '''
 50 |  
 51 | #两列数据生成二维数据
 52 | print('两列数据合并成二维矩阵')
 53 | T = dict(zip(L2,L5))
 54 | type(T)
 55 |  
 56 | #dict类型转换为list
 57 | print('List')
 58 | X = list(map(lambda x,y: (x,y), T.keys(),T.values()))
 59 | print(X)
 60 | print(type(X))
 61 |  
 62 |  
 63 | """
 64 | KMeans聚类
 65 | clf = KMeans(n_clusters=3) 表示类簇数为3，聚成3类数据，clf即赋值为KMeans
 66 | y_pred = clf.fit_predict(X) 载入数据集X，并且将聚类的结果赋值给y_pred
 67 | """
 68 |  
 69 | from sklearn.cluster import Birch
 70 | from sklearn.cluster import KMeans
 71 |  
 72 | clf = KMeans(n_clusters=3)
 73 | y_pred = clf.fit_predict(X)
 74 | print(clf)
 75 | #输出聚类预测结果，96行数据，每个y_pred对应X一行或一个球员，聚成3类，类标为0、1、2
 76 | print(y_pred)
 77 |  
 78 |  
 79 | """
 80 | 可视化绘图
 81 | Python导入Matplotlib包，专门用于绘图
 82 | import matplotlib.pyplot as plt 此处as相当于重命名，plt用于显示图像
 83 | """
 84 |  
 85 | import numpy as np
 86 | import matplotlib.pyplot as plt
 87 |  
 88 |  
 89 | #获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列
 90 | x = [n[0] for n in X]
 91 | print(x)
 92 | y = [n[1] for n in X]
 93 | print(y) 
 94 |  
 95 | #绘制散点图 参数：x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点
 96 | #plt.scatter(x, y, c=y_pred, marker='x')
 97 |  
 98 |  
 99 | #坐标
100 | x1 = []
101 | y1 = []
102 |  
103 | x2 = []
104 | y2 = []
105 |  
106 | x3 = []
107 | y3 = []
108 |  
109 | #分布获取类标为0、1、2的数据 赋值给(x1,y1) (x2,y2) (x3,y3)
110 | i = 0
111 | while i < len(X):
112 |     if y_pred[i]==0:
113 |         x1.append(X[i][0])
114 |         y1.append(X[i][1])
115 |     elif y_pred[i]==1:
116 |         x2.append(X[i][0])
117 |         y2.append(X[i][1])
118 |     elif y_pred[i]==2:
119 |         x3.append(X[i][0])
120 |         y3.append(X[i][1])
121 |     
122 |     i = i + 1
123 |  
124 |  
125 | #四种颜色 红 绿 蓝 黑  
126 | plot1, = plt.plot(x1, y1, 'or', marker="x")  
127 | plot2, = plt.plot(x2, y2, 'og', marker="o")  
128 | plot3, = plt.plot(x3, y3, 'ob', marker="*")  
129 |  
130 | #绘制标题
131 | plt.title("Kmeans-Basketball Data")
132 |  
133 | #绘制x轴和y轴坐标
134 | plt.xlabel("assists_per_minute")
135 | plt.ylabel("points_per_minute")
136 |  
137 | #设置右上角图例
138 | plt.legend((plot1, plot2, plot3), ('A', 'B', 'C'), fontsize=10)
139 |  
140 | plt.show()  
141 | 


--------------------------------------------------------------------------------
/blog04-DTC/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog04-DTC/result.png


--------------------------------------------------------------------------------
/blog04-DTC/test01.py:
--------------------------------------------------------------------------------
# Print the feature matrix of the iris data set bundled with scikit-learn.
from sklearn.datasets import load_iris

# load_iris() returns a dict-like container; its "data" entry is the same
# array that is exposed as the ``data`` attribute.
iris = load_iris()
print(iris["data"])
8 | 


--------------------------------------------------------------------------------
/blog04-DTC/test02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Oct 14 21:44:19 2016
 4 | @author: 杨秀璋
 5 | """
 6 |  
 7 | #导入数据集iris
 8 | from sklearn.datasets import load_iris 
 9 |  
10 | #载入数据集
11 | iris = load_iris()
12 |  
13 | print(iris.data)          #输出数据集
14 | print(iris.target)        #输出真实标签
15 | print(len(iris.target))
16 | print(iris.data.shape)    #150个样本 每个样本4个特征
17 |  
18 |  
19 | #导入决策树DTC包
20 | from sklearn.tree import DecisionTreeClassifier
21 |  
22 | #训练
23 | clf = DecisionTreeClassifier()
24 | clf.fit(iris.data, iris.target)
25 | print(clf)
26 |  
27 | #预测
28 | predicted = clf.predict(iris.data)
29 |  
30 | #获取花卉两列数据集
31 | X = iris.data
32 | L1 = [x[0] for x in X]
33 | print(L1)
34 | L2 = [x[1] for x in X]
35 | print(L2)
36 |  
37 | #绘图
38 | import numpy as np
39 | import matplotlib.pyplot as plt
40 | plt.scatter(L1, L2, c=predicted, marker='x')  #cmap=plt.cm.Paired
41 | plt.title("DTC")
42 | plt.show()
43 | 


--------------------------------------------------------------------------------
/blog04-DTC/test03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Oct 14 21:44:19 2016
 4 | @author: 杨秀璋
 5 | """
 6 |  
 7 | #导入数据集iris
 8 | from sklearn.datasets import load_iris
 9 | import numpy as np
10 |  
11 | #载入数据集
12 | iris = load_iris()
13 |  
14 | '''
15 | print iris.data          #输出数据集
16 | print iris.target        #输出真实标签
17 | print len(iris.target)
18 | print iris.data.shape    #150个样本 每个样本4个特征
19 | '''
20 |  
21 | '''
22 | 重点：分割数据集 构造训练集/测试集，120/30
23 |      70%训练  0-40  50-90  100-140
24 |      30%预测  40-50 90-100 140-150
25 | '''
26 | #训练集
27 | train_data = np.concatenate((iris.data[0:40, :], iris.data[50:90, :], iris.data[100:140, :]), axis = 0)
28 | #训练集样本类别
29 | train_target = np.concatenate((iris.target[0:40], iris.target[50:90], iris.target[100:140]), axis = 0)
30 | #测试集
31 | test_data = np.concatenate((iris.data[40:50, :], iris.data[90:100, :], iris.data[140:150, :]), axis = 0)
32 | #测试集样本类别
33 | test_target = np.concatenate((iris.target[40:50], iris.target[90:100], iris.target[140:150]), axis = 0)
34 |  
35 |  
36 | #导入决策树DTC包
37 | from sklearn.tree import DecisionTreeClassifier
38 |  
39 | #训练
40 | clf = DecisionTreeClassifier()
41 | #注意均使用训练数据集和样本类标
42 | clf.fit(train_data, train_target)
43 | print(clf)
44 |  
45 | #预测结果
46 | predict_target = clf.predict(test_data)
47 | print(predict_target)
48 |  
49 | #预测结果与真实结果比对
50 | print(sum(predict_target == test_target))
51 |  
52 | #输出准确率 召回率 F值
53 | from sklearn import metrics
54 | print(metrics.classification_report(test_target, predict_target))
55 | print(metrics.confusion_matrix(test_target, predict_target))
56 |  
57 |  
58 | #获取花卉测试数据集两列数据集
59 | X = test_data
60 | L1 = [n[0] for n in X]
61 | print(L1)
62 | L2 = [n[1] for n in X]
63 | print(L2)
64 |  
65 | #绘图
66 | import numpy as np
67 | import matplotlib.pyplot as plt
68 | plt.scatter(L1, L2, c=predict_target, marker='x')  #cmap=plt.cm.Paired
69 | plt.title("DecisionTreeClassifier")
70 | plt.show()
71 | 


--------------------------------------------------------------------------------
/blog04-DTC/test04.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Wed Oct 12 23:30:34 2016
 4 | @author: yxz15
 5 | """
 6 |  
 7 | print(__doc__)
 8 |  
 9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 |  
12 | from sklearn.datasets import load_iris
13 | from sklearn.tree import DecisionTreeClassifier
14 |  
15 | # Parameters
16 | n_classes = 3
17 | plot_colors = "bry"
18 | plot_step = 0.02
19 |  
20 | # Load data
21 | iris = load_iris()
22 |  
23 | for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
24 |                                 [1, 2], [1, 3], [2, 3]]):
25 |     # We only take the two corresponding features
26 |     X = iris.data[:, pair]
27 |     y = iris.target
28 |  
29 |     # Train
30 |     clf = DecisionTreeClassifier().fit(X, y)
31 |  
32 |     # Plot the decision boundary
33 |     plt.subplot(2, 3, pairidx + 1)
34 |  
35 |     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
36 |     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
37 |     xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
38 |                          np.arange(y_min, y_max, plot_step))
39 |  
40 |     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
41 |     Z = Z.reshape(xx.shape)
42 |     cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
43 |  
44 |     plt.xlabel(iris.feature_names[pair[0]])
45 |     plt.ylabel(iris.feature_names[pair[1]])
46 |     plt.axis("tight")
47 |  
48 |     # Plot the training points
49 |     for i, color in zip(range(n_classes), plot_colors):
50 |         idx = np.where(y == i)
51 |         plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
52 |                     cmap=plt.cm.Paired)
53 |  
54 |     plt.axis("tight")
55 |  
56 | plt.suptitle("Decision surface of a decision tree using paired features")
57 | plt.legend()
58 | plt.show()
59 | 


--------------------------------------------------------------------------------
/blog05-LR/res.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res.png


--------------------------------------------------------------------------------
/blog05-LR/res02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res02.png


--------------------------------------------------------------------------------
/blog05-LR/res03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res03.png


--------------------------------------------------------------------------------
/blog05-LR/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu Oct 27 02:37:05 2016
 4 | @author: yxz15
 5 | """
 6 |  
 7 | from sklearn import datasets
 8 | diabetes = datasets.load_diabetes()                          #载入数据
 9 | print(diabetes.data)                                         #数据
10 | print(diabetes.target)                                       #类标
11 | print('总行数: ', len(diabetes.data), len(diabetes.target))  #数据总行数
12 | print('特征数: ', len(diabetes.data[0]))                     #每行数据集维数
13 | print('数据类型: ', diabetes.data.shape)                     #类型
14 | print(type(diabetes.data), type(diabetes.target))            #数据集类型
15 |  
16 | """
17 | [[ 0.03807591  0.05068012  0.06169621 ..., -0.00259226  0.01990842
18 |   -0.01764613]
19 |  [-0.00188202 -0.04464164 -0.05147406 ..., -0.03949338 -0.06832974
20 |   -0.09220405]
21 |   ...
22 |  [-0.04547248 -0.04464164 -0.0730303  ..., -0.03949338 -0.00421986
23 |    0.00306441]]
24 | [ 151.   75.  141.  206.  135.   97.  138.   63.  110.  310.  101.
25 |   ...
26 | 64.   48.  178.  104.  132.  220.   57.]
27 | 总行数:  442 442
28 | 特征数:  10
29 | 数据类型:  (442L, 10L)
30 | <type 'numpy.ndarray'> <type 'numpy.ndarray'>
31 | """
32 | 


--------------------------------------------------------------------------------
/blog05-LR/test02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Oct 28 00:44:55 2016
 4 | @author: yxz15
 5 | """
 6 |  
 7 | from sklearn import linear_model       #导入线性模型
 8 | import matplotlib.pyplot as plt        #绘图
 9 | import numpy as np
10 |  
11 | #X表示匹萨尺寸 Y表示匹萨价格
12 | X = [[6], [8], [10], [14], [18]]
13 | Y = [[7], [9], [13], [17.5], [18]]
14 |  
15 | print('数据集X: ', X)
16 | print('数据集Y: ', Y)
17 |  
18 | #回归训练
19 | clf = linear_model.LinearRegression() #使用线性回归
20 | clf.fit(X, Y)                         #导入数据集
21 | res = clf.predict(np.array([12]).reshape(-1, 1))[0] #预测结果
22 | print('预测一张12英寸匹萨价格：$%.2f' % res)
23 |  
24 | #预测结果
25 | X2 = [[0], [10], [14], [25]]
26 | Y2 = clf.predict(X2)
27 |  
28 | #绘制线性回归图形
29 | plt.figure()
30 | plt.title(u'diameter-cost curver')   #标题
31 | plt.xlabel(u'diameter')              #x轴坐标
32 | plt.ylabel(u'cost')                  #y轴坐标
33 | plt.axis([0, 25, 0, 25])             #区间
34 | plt.grid(True)                       #显示网格
35 | plt.plot(X, Y, 'k.')                 #绘制训练数据集散点图
36 | plt.plot(X2, Y2, 'g-')               #绘制预测数据集直线
37 | plt.show()
38 |  
39 | 


--------------------------------------------------------------------------------
/blog05-LR/test03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Oct 28 01:21:30 2016
 4 | @author: yxz15
 5 | """
 6 |  
 7 | from sklearn import datasets
 8 | import matplotlib.pyplot as plt
 9 | import numpy as np
10 | from sklearn import linear_model       #导入线性模型
11 |  
12 | #数据集
13 | diabetes = datasets.load_diabetes() #载入数据
14 |  
15 | #获取一个特征
16 | diabetes_x_temp = diabetes.data[:, np.newaxis, 2] 
17 |  
18 | diabetes_x_train = diabetes_x_temp[:-20]   #训练样本
19 | diabetes_x_test = diabetes_x_temp[-20:]    #测试样本 后20行
20 | diabetes_y_train = diabetes.target[:-20]   #训练标记
21 | diabetes_y_test = diabetes.target[-20:]    #预测对比标记
22 |  
23 | #回归训练及预测
24 | clf = linear_model.LinearRegression()
25 | clf.fit(diabetes_x_train, diabetes_y_train)  #注: 训练数据集
26 |  
27 | #系数 残差平法和 方差得分
28 | print('Coefficients :\n', clf.coef_)
29 | print("Residual sum of square: %.2f" %np.mean((clf.predict(diabetes_x_test) - diabetes_y_test) ** 2))
30 | print("variance score: %.2f" % clf.score(diabetes_x_test, diabetes_y_test))
31 |  
32 | #绘图
33 | plt.title(u'LinearRegression Diabetes')   #标题
34 | plt.xlabel(u'Attributes')                 #x轴坐标
35 | plt.ylabel(u'Measure of disease')         #y轴坐标
36 | #点的准确位置
37 | plt.scatter(diabetes_x_test, diabetes_y_test, color = 'black')
38 | #预测结果 直线表示
39 | plt.plot(diabetes_x_test, clf.predict(diabetes_x_test), color='blue', linewidth = 3)
40 | plt.show()
41 | 


--------------------------------------------------------------------------------
/blog05-LR/test04.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu Dec 29 12:47:58 2011
 4 | @author: Administrator
 5 | """
 6 | #第一步 数据集划分
 7 | from sklearn import datasets
 8 | import numpy as np
 9 |  
10 | #获取数据 10*442
11 | d = datasets.load_diabetes()
12 | x = d.data
13 | print('获取x特征')
14 | print(len(x), x.shape)
15 | print(x[:4])
16 |  
17 | #获取一个特征 第3列数据
18 | x_one = x[:,np.newaxis, 2]
19 | print(x_one[:4])
20 |  
21 | #获取的正确结果
22 | y = d.target
23 | print('获取的结果')
24 | print(y[:4])
25 |  
26 | #x特征划分
27 | x_train = x_one[:-42]
28 | x_test = x_one[-42:]
29 | print(len(x_train), len(x_test))
30 | y_train = y[:-42]
31 | y_test = y[-42:]
32 | print(len(y_train), len(y_test))
33 |  
34 |  
35 | #第二步 线性回归实现
36 | from sklearn import linear_model
37 | clf = linear_model.LinearRegression()
38 | print(clf)
39 | clf.fit(x_train, y_train)
40 | pre = clf.predict(x_test)
41 | print('预测结果')
42 | print(pre)
43 | print('真实结果')
44 | print(y_test)
45 |    
46 |    
47 | #第三步 评价结果
48 | cost = np.mean(y_test-pre)**2
49 | print('次方', 2**5)
50 | print('平方和计算:', cost)
51 | print('系数', clf.coef_)
52 | print('截距', clf.intercept_)  
53 | print('方差', clf.score(x_test, y_test))
54 |  
55 |  
56 | #第四步 绘图
57 | import matplotlib.pyplot as plt
58 | plt.title("diabetes")
59 | plt.xlabel("x")
60 | plt.ylabel("y")
61 | plt.plot(x_test, y_test, 'k.')
62 | plt.plot(x_test, pre, 'g-')
63 |  
64 | for idx, m in enumerate(x_test):
65 |     plt.plot([m, m],[y_test[idx], 
66 |               pre[idx]], 'r-')
67 |  
68 | plt.savefig('power.png', dpi=300)
69 |  
70 | plt.show()
71 |  
72 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog06-Numpy+Matplotlib/data.xls


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test01.py:
--------------------------------------------------------------------------------
 1 | #导入包并重命名
 2 | import numpy as np
 3 |  
 4 | #定义一维数组
 5 | a = np.array([2, 0, 1, 5, 8, 3])
 6 | print('原始数据:', a)
 7 |  
 8 | #输出最大、最小值及形状
 9 | print('最小值:', a.min())
10 | print('最大值:', a.max())
11 | print('形状', a.shape)
12 |  
13 | #数据切片
14 | print('切片操作:')
15 | print(a[:-2])
16 | print(a[-2:])
17 | print(a[:1])
18 |  
19 | #排序
20 | print(type(a))
21 | a.sort()
22 | print('排序后:', a)
23 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test02.py:
--------------------------------------------------------------------------------
 1 | #定义二维数组
 2 | import numpy as np
 3 | c = np.array([[1, 2, 3, 4],[4, 5, 6, 7], [7, 8, 9, 10]])
 4 |  
 5 | #获取值
 6 | print('形状:', c.shape)
 7 | print('获取值:', c[1][0])
 8 | print('获取某行:')
 9 | print(c[1][:])
10 | print('获取某行并切片:')
11 | print(c[0][:-1])
12 | print(c[0][-1:])
13 |  
14 | #获取具体某列值
15 | print('获取第3列:')
16 | print(c[:,np.newaxis, 2])
17 |  
18 | #调用sin函数
19 | print(np.sin(np.pi/6))
20 | print(type(np.sin(0.5)))
21 |  
22 | #范围定义
23 | print(np.arange(0,4))
24 | print(type(np.arange(0,4)))
25 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test03.py:
--------------------------------------------------------------------------------
 1 | #读取数据 header设置Excel无标题头
 2 | import pandas as pd
 3 | data = pd.read_excel("data.xls", header=None) 
 4 | print(data)
 5 |  
 6 | #计算数据长度
 7 | print('行数', len(data))
 8 |  
 9 | #计算用户A\B\C用电总和
10 | print(data.sum())
11 |  
12 | #计算用户A\B\C用点量算术平均数
13 | mm = data.sum()
14 | print(mm)
15 |  
16 | #输出预览前5行数据
17 | print('预览前5行数据')
18 | print(data.head())
19 |  
20 | #输出数据基本统计量
21 | print('输出数据基本统计量')
22 | print(data.describe())
23 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test04.py:
--------------------------------------------------------------------------------
 1 | from pandas import Series, DataFrame
 2 |  
 3 | #通过传递一个list对象来创建Series，默认创建整型索引；
 4 | a = Series([4, 7, -5, 3])
 5 | print('创建Series:')
 6 | print(a)
 7 |  
 8 | #创建一个带有索引来确定每一个数据点的Series ;
 9 | b = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
10 | print('创建带有索引的Series:')
11 | print(b)
12 |  
13 | #如果你有一些数据在一个Python字典中，你可以通过传递字典来创建一个Series；
14 | sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
15 | c = Series(sdata)
16 | print('通过传递字典创建Series:')
17 | print(c)
18 | states = ['California', 'Ohio', 'Oregon', 'Texas']
19 | d = Series(sdata, index=states)
20 | print('California没有字典为空:')
21 | print(d)
22 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test05-matplotlib.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Nov 14 04:06:01 2016
 4 | @author: yxz15
 5 | """
 6 |  
 7 | #导入数据集
 8 | import pandas as pd
 9 | data = pd.read_excel("data.xls", header=None) 
10 | mm = data.sum()
11 | print('计算用电量总数:')
12 | print(mm)
13 |  
14 | #绘制图形
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | #中文字体显示
18 | plt.rc('font', family='SimHei', size=13)
19 | N = 3
20 | #3个用户 0 1 2
21 | ind = np.arange(N)  # the x locations for the groups 
22 | print(ind)
23 | #设置宽度
24 | width = 0.35        
25 | x = [u'用户A', u'用户B', u'用户C']
26 | #绘图
27 | plt.bar(ind, mm, width, color='r', label='sum num')
28 | plt.xlabel(u"用户名")
29 | plt.ylabel(u"总耗电量")
30 | plt.title(u'电力窃漏电用户自动识别--总耗电量')
31 | plt.legend()
32 | #设置底部名称
33 | plt.xticks(ind+width/2, x, rotation=40) #旋转40度
34 | plt.show()
35 |  
36 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test06-matplotlib.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 |  
 3 | mm = [45, 30, 25]             #每一块占得比例，总和为100
 4 | n = mm[0]+mm[1]+mm[2]
 5 | a = (mm[0]*1.0*100/n)
 6 | b = (mm[1]*1.0*100/n)
 7 | c = (mm[2]*1.0*100/n)
 8 | print(a, b, c, n)
 9 | fracs = [a, b, c]
10 |  
11 | explode=(0, 0, 0.08)             #离开整体的距离，看效果
12 | labels = 'A', 'B', 'C'           #对应每一块的标志
13 |  
14 | plt.pie(fracs, explode=explode, labels=labels,
15 |                 autopct='%1.1f%%', shadow=True, startangle=90, colors = ("g", "r", "y"))
16 |                                  # startangle是开始的角度，默认为0，从这里开始按逆时针方向依次展开
17 |  
18 | plt.title('Raining Hogs and Dogs')   #标题
19 |  
20 | plt.show()
21 | 


--------------------------------------------------------------------------------
/blog06-Numpy+Matplotlib/test07-matplotlib.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | plt.rc('font', family='SimHei', size=13)
 4 |  
 5 | num = np.array([13325, 9403, 9227, 8651])
 6 | ratio = np.array([0.75, 0.76, 0.72, 0.75])
 7 | men = num * ratio
 8 | women = num * (1-ratio)
 9 | x = [u'聊天',u'支付',u'团购\n优惠券',u'在线视频']
10 |  
11 | width = 0.5
12 | idx = np.arange(len(x))
13 | plt.bar(idx, men, width, color='red', label=u'男性用户')
14 | plt.bar(idx, women, width, bottom=men, color='yellow', label=u'女性用户')
15 | plt.xlabel(u'应用类别')
16 | plt.ylabel(u'男女分布')
17 | plt.xticks(idx+width/2, x, rotation=40)
18 | plt.legend()
19 | plt.show()
20 | 


--------------------------------------------------------------------------------
/blog07-pac/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog07-pac/result01.png


--------------------------------------------------------------------------------
/blog07-pac/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog07-pac/result02.png


--------------------------------------------------------------------------------
/blog07-pac/test01.py:
--------------------------------------------------------------------------------
 1 | #载入数据集
 2 | from sklearn.datasets import load_boston
 3 | d = load_boston()
 4 | x = d.data
 5 | y = d.target
 6 | print(x[:10])
 7 | print('形状:', x.shape)
 8 |  
 9 | #降维
10 | import numpy as np
11 | from sklearn.decomposition import PCA
12 | pca = PCA(n_components=2)
13 | newData = pca.fit_transform(x)
14 | print('降维后数据:')
15 | print(newData[:4])
16 | print('形状:', newData.shape)
17 | 


--------------------------------------------------------------------------------
/blog07-pac/test02.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 |   
 4 | plt.figure(1) # 创建图表1
 5 | plt.figure(2) # 创建图表2
 6 | ax1 = plt.subplot(211) # 在图表2中创建子图1
 7 | ax2 = plt.subplot(212) # 在图表2中创建子图2
 8 |   
 9 | x = np.linspace(0, 3, 100)
10 | for i in range(5):
11 |     plt.figure(1)    # 选择图表1 
12 |     plt.plot(x, np.exp(i*x/3))
13 |     plt.sca(ax1)    # 选择图表2的子图1
14 |     plt.plot(x, np.sin(i*x))
15 |     plt.sca(ax2)    # 选择图表2的子图2
16 |     plt.plot(x, np.cos(i*x))
17 |   
18 | plt.show()
19 | 


--------------------------------------------------------------------------------
/blog07-pac/test03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 |  
 3 | #糖尿病数据集
 4 | from sklearn.datasets import load_diabetes
 5 | data = load_diabetes()
 6 | x = data.data
 7 | print(x[:4])
 8 | y = data.target
 9 | print(y[:4])
10 |  
11 | #KMeans聚类算法
12 | from sklearn.cluster import KMeans
13 | #训练
14 | clf = KMeans(n_clusters=2)
15 | print(clf)
16 | clf.fit(x)
17 | #预测
18 | pre = clf.predict(x)
19 | print(pre[:10])
20 |  
21 | #使用PCA降维操作
22 | from sklearn.decomposition import PCA
23 | pca = PCA(n_components=2)
24 | newData = pca.fit_transform(x)
25 | print(newData[:4])
26 |  
27 | L1 = [n[0] for n in newData]
28 | L2 = [n[1] for n in newData]
29 |  
30 | #绘图
31 | import numpy as np
32 | import matplotlib.pyplot as plt
33 |  
34 | #用来正常显示中文标签
35 | plt.rc('font', family='SimHei', size=8)
36 | #plt.rcParams['font.sans-serif']=['SimHei'] 
37 |  
38 | #用来正常显示负号
39 | plt.rcParams['axes.unicode_minus']=False 
40 |  
41 | p1 = plt.subplot(221)
42 | plt.title(u"Kmeans聚类 n=2")
43 | plt.scatter(L1,L2,c=pre,marker="s")
44 | plt.sca(p1)
45 |  
46 |  
47 | ###################################
48 | # 聚类 类蔟数=3
49 |  
50 | clf = KMeans(n_clusters=3)
51 | clf.fit(x)
52 | pre = clf.predict(x)
53 |  
54 | p2 = plt.subplot(222)
55 | plt.title("Kmeans n=3")
56 | plt.scatter(L1,L2,c=pre,marker="s")
57 | plt.sca(p2)
58 |  
59 |  
60 | ###################################
61 | # 聚类 类蔟数=4
62 |  
63 | clf = KMeans(n_clusters=4)
64 | clf.fit(x)
65 | pre = clf.predict(x)
66 |  
67 | p3 = plt.subplot(223)
68 | plt.title("Kmeans n=4")
69 | plt.scatter(L1,L2,c=pre,marker="+")
70 | plt.sca(p3)
71 |  
72 |  
73 | ###################################
74 | # 聚类 类蔟数=5
75 |  
76 | clf = KMeans(n_clusters=5)
77 | clf.fit(x)
78 | pre = clf.predict(x)
79 |  
80 | p4 = plt.subplot(224)
81 | plt.title("Kmeans n=5")
82 | plt.scatter(L1,L2,c=pre,marker="+")
83 | plt.sca(p4)
84 |  
85 | #保存图片本地
86 | plt.savefig('power.png', dpi=300)  
87 | plt.show()
88 |  
89 | 


--------------------------------------------------------------------------------
/blog08-Apriori/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Nov 28 03:29:51 2016
 4 | 地址：http://blog.csdn.net/u010454729/article/details/49078505
 5 | @author: 参考CSDN u010454729 
 6 | """
 7 | 
 8 | def loadDataSet():  
 9 |     return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]  
10 |   
def createC1(dataSet):
    """Build the candidate 1-itemsets from a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-item frozenset per distinct item, ordered by
        item. Frozensets are used so the itemsets can serve as dict keys.
    """
    distinct_items = sorted({item for transaction in dataSet
                             for item in transaction})
    # Return a real list, not map(): in Python 3 map() is a one-shot iterator
    # and the candidates are iterated more than once by the callers.
    return [frozenset([item]) for item in distinct_items]
19 |   
def scanD(D, Ck, minSupport):
    """Filter candidate itemsets by minimum support.

    Args:
        D: iterable of transactions (sets).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        (retList, supportData): the candidates whose support is at least
        minSupport, and a dict mapping every candidate that occurred in at
        least one transaction to its support.
    """
    # Materialise both inputs once. If either is a one-shot iterator (e.g. a
    # Python 3 map object) the nested loop would exhaust Ck after the first
    # transaction, and counting D afterwards would yield 0.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        # Support = fraction of transactions containing the itemset
        support = count / numItems if numItems > 0 else 0
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
41 |   
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined when their first k-2 items, taken in sorted
    order, are equal. For example with k=3, {0,1} and {0,2} share the prefix
    [0] and are merged into {0,1,2}, while {0,1} and {1,2} are not merged.

    Args:
        Lk: sequence of frozensets, all of size k-1.
        k: size of the itemsets to generate.

    Returns:
        A list with the union of every pair that shares the sorted prefix.
    """
    # Sort BEFORE slicing: a set has no defined iteration order, so slicing
    # first would compare arbitrary items rather than the smallest k-2 ones.
    # The prefixes are computed once instead of once per pair.
    prefixes = [sorted(itemset)[:k-2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
54 |   
def apriori(dataSet, minSupport = 0.5):
    """Run the Apriori frequent-itemset search.

    Args:
        dataSet: list of transactions, each an iterable of items.
        minSupport: minimum support an itemset needs to be kept.

    Returns:
        (L, supportData): L[i] is the list of frequent (i+1)-itemsets (the
        last entry is the empty list that ends the search); supportData maps
        each counted itemset to its support.
    """
    # Both collections are scanned on every pass of the loop below, so they
    # must be real lists: a Python 3 map object would be empty after its
    # first use.
    C1 = list(createC1(dataSet))
    D  = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L  = [L1]                           # frequent itemsets by size, starting with the 1-itemsets
    k  = 2
    while len(L[k-2]) > 0:              # stop once a pass finds no frequent itemsets
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)        # merge the supports found in this pass
        L.append(Lk)
        k += 1
    return L, supportData
68 |   
dataSet = loadDataSet()
# Keep the candidates in a list: they are printed here and then passed on to
# scanD, and a one-shot iterator would already be exhausted by the print loop.
C1 = list(createC1(dataSet))

print("所有候选1项集C1:\n")
for n in C1:
    print(n)

# Same for the transactions: list() so that printing does not consume them
D = list(map(set, dataSet))
print("数据集D:\n")
for n in D:
    print(n)

L1, supportData0 = scanD(D,C1, 0.5)
print("符合最小支持度的频繁1项集L1:\n",L1)

L, suppData = apriori(dataSet)
print("所有符合最小支持度的项集L：\n",L)
print("频繁2项集：\n",aprioriGen(L[0],2))

L, suppData = apriori(dataSet, minSupport=0.7)
print("所有符合最小支持度为0.7的项集L：\n",L)
90 | 
91 | 


--------------------------------------------------------------------------------
/blog09-LinearRegression/Index:
--------------------------------------------------------------------------------
1 | Index of glass
2 | 
3 | 02 Dec 1996      139 Index
4 | 02 Mar 1993    11903 glass.data
5 | 16 Jul 1992      780 glass.tag
6 | 30 May 1989     3506 glass.names
7 | 


--------------------------------------------------------------------------------
/blog09-LinearRegression/glass.names:
--------------------------------------------------------------------------------
 1 | 1. Title: Glass Identification Database
 2 | 
 3 | 2. Sources:
 4 |     (a) Creator: B. German
 5 |         -- Central Research Establishment
 6 |            Home Office Forensic Science Service
 7 |            Aldermaston, Reading, Berkshire RG7 4PN
 8 |     (b) Donor: Vina Spiehler, Ph.D., DABFT
 9 |                Diagnostic Products Corporation
10 |                (213) 776-0180 (ext 3014)
11 |     (c) Date: September, 1987
12 | 
13 | 3. Past Usage:
14 |     -- Rule Induction in Forensic Science
15 |        -- Ian W. Evett and Ernest J. Spiehler
16 |        -- Central Research Establishment
17 |           Home Office Forensic Science Service
18 |           Aldermaston, Reading, Berkshire RG7 4PN
19 |        -- Unknown technical note number (sorry, not listed here)
20 |        -- General Results: nearest neighbor held its own with respect to the
21 |              rule-based system
22 | 
 23 | 4. Relevant Information:
24 |       Vina conducted a comparison test of her rule-based system, BEAGLE, the
25 |       nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is 
26 |       a product available through VRS Consulting, Inc.; 4676 Admiralty Way,
27 |       Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189.
28 |       In determining whether the glass was a type of "float" glass or not,
29 |       the following results were obtained (# incorrect answers):
30 | 
31 |              Type of Sample                            Beagle   NN    DA
32 |              Windows that were float processed (87)     10      12    21
33 |              Windows that were not:            (76)     19      16    22
34 | 
35 |       The study of classification of types of glass was motivated by 
36 |       criminological investigation.  At the scene of the crime, the glass left
37 |       can be used as evidence...if it is correctly identified!
38 | 
39 | 5. Number of Instances: 214
40 | 
41 | 6. Number of Attributes: 10 (including an Id#) plus the class attribute
42 |    -- all attributes are continuously valued
43 | 
44 | 7. Attribute Information:
45 |    1. Id number: 1 to 214
46 |    2. RI: refractive index
47 |    3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as 
48 |                   are attributes 4-10)
49 |    4. Mg: Magnesium
50 |    5. Al: Aluminum
51 |    6. Si: Silicon
52 |    7. K: Potassium
53 |    8. Ca: Calcium
54 |    9. Ba: Barium
55 |   10. Fe: Iron
56 |   11. Type of glass: (class attribute)
57 |       -- 1 building_windows_float_processed
58 |       -- 2 building_windows_non_float_processed
59 |       -- 3 vehicle_windows_float_processed
60 |       -- 4 vehicle_windows_non_float_processed (none in this database)
61 |       -- 5 containers
62 |       -- 6 tableware
63 |       -- 7 headlamps
64 | 
65 | 8. Missing Attribute Values: None
66 | 
67 | Summary Statistics:
68 | Attribute:   Min     Max      Mean     SD      Correlation with class
69 |  2. RI:       1.5112  1.5339   1.5184  0.0030  -0.1642
70 |  3. Na:      10.73   17.38    13.4079  0.8166   0.5030
71 |  4. Mg:       0       4.49     2.6845  1.4424  -0.7447
72 |  5. Al:       0.29    3.5      1.4449  0.4993   0.5988
73 |  6. Si:      69.81   75.41    72.6509  0.7745   0.1515
74 |  7. K:        0       6.21     0.4971  0.6522  -0.0100
75 |  8. Ca:       5.43   16.19     8.9570  1.4232   0.0007
76 |  9. Ba:       0       3.15     0.1750  0.4972   0.5751
77 | 10. Fe:       0       0.51     0.0570  0.0974  -0.1879
78 | 
79 | 9. Class Distribution: (out of 214 total instances)
80 |     -- 163 Window glass (building windows and vehicle windows)
81 |        -- 87 float processed  
82 |           -- 70 building windows
83 |           -- 17 vehicle windows
84 |        -- 76 non-float processed
85 |           -- 76 building windows
86 |           -- 0 vehicle windows
87 |     -- 51 Non-window glass
88 |        -- 13 containers
89 |        -- 9 tableware
90 |        -- 29 headlamps
91 | 
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/blog09-LinearRegression/glass.tag:
--------------------------------------------------------------------------------
 1 | An original file donated by Vina Spiehler
 2 | 
 3 | ID, N    -- numeric identifier of the instance
 4 | RI, N    -- refractive index
 5 | NA2O, N  -- Sodium oxide
 6 | MGO, N   -- magnesium oxide
 7 | AL2O3, N -- aluminum oxide
 8 | SIO2, N  -- silicon oxide
 9 | K2O,  N  -- potassium oxide
10 | CAO, N   -- calcium oxide
11 | BAO, N   -- barium oxide
12 | FE2O3, N -- iron oxide
13 | TYPE, N  -- An unknown, but must correspond to the types in the paper
14 | CAMG, N  -- Unsure
15 | 
16 | Types include:
17 |   1. WF (Float Window)
18 |   2. WNF (Non-float Window)
19 |   3. C (Container)
20 |   4. T (Tableware)
21 |   5. H (Headlamp)     214    2568   14127 glass.dat
22 |       19      92     518 glass.tag
23 |       62     742    4775 glassx.dat
24 |       51     610    3928 nonwindo.dat
25 |        6      14     120 phones
26 |      163    1955   12552 window.dat
27 |      515    5981   36020 total
28 | 


--------------------------------------------------------------------------------
/blog09-LinearRegression/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog09-LinearRegression/result01.png


--------------------------------------------------------------------------------
/blog09-LinearRegression/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog09-LinearRegression/result02.png


--------------------------------------------------------------------------------
/blog09-LinearRegression/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog09-LinearRegression/result03.png


--------------------------------------------------------------------------------
/blog09-LinearRegression/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Sun Mar 05 18:10:07 2017
 4 | @author: eastmount & zj
 5 | """
 6 |  
 7 | #导入玻璃识别数据集
 8 | import pandas as pd
 9 | glass=pd.read_csv("glass.csv")
10 | #显示前6行数据
11 | print(glass.shape)
12 | print(glass.head(6))
13 |  
14 | import seaborn as sns
15 | import matplotlib.pyplot as plt
16 | sns.set(font_scale=1.5)
17 | sns.lmplot(x='al', y='ri', data=glass, ci=None)
18 | #利用Pandas画散点图
19 | glass.plot(kind='scatter', x='al', y='ri')
20 | plt.show()
21 |  
22 | #利用matplotlib做等效的散点图
23 | plt.scatter(glass.al, glass.ri)
24 | plt.xlabel('al')
25 | plt.ylabel('ri')
26 |  
27 | #拟合线性回归模型
28 | from sklearn.linear_model import LinearRegression
29 | linreg = LinearRegression()
30 | feature_cols = ['al']
31 | X = glass[feature_cols]
32 | y = glass.ri
33 | linreg.fit(X, y)
34 | plt.show()
35 |  
36 | #对于所有的x值做出预测       
37 | glass['ri_pred'] = linreg.predict(X)
38 | print("预测的前六行:")
39 | print(glass.head(6))
40 |  
41 | #用直线表示预测结果
42 | plt.plot(glass.al, glass.ri_pred, color='red')
43 | plt.xlabel('al')
44 | plt.ylabel('Predicted ri')
45 | plt.show()
46 |  
47 | #将直线结果和散点图同时显示出来
48 | plt.scatter(glass.al, glass.ri)
49 | plt.plot(glass.al, glass.ri_pred, color='red')
50 | plt.xlabel('al')
51 | plt.ylabel('ri')
52 | plt.show()
53 |  
54 | #利用相关方法线性预测
55 | linreg.intercept_ + linreg.coef_ * 2
56 | #使用预测方法计算Al = 2的预测
57 | linreg.predict(2)
58 |  
59 | #铝检验系数
60 | ai=zip(feature_cols, linreg.coef_)
61 | print(ai)
62 |  
63 | #使用预测方法计算Al = 3的预测
64 | pre=linreg.predict(3)
65 | print(pre)
66 |  
67 | #检查glass_type
68 | sort=glass.glass_type.value_counts().sort_index()
69 | print(sort)
70 |  
71 | #类型1、2、3的窗户玻璃
72 | #类型5，6，7是家用玻璃
73 | glass['household'] = glass.glass_type.map({1:0, 2:0, 3:0, 5:1, 6:1, 7:1})
74 | print(glass.head())
75 |  
76 | plt.scatter(glass.al, glass.household)
77 | plt.xlabel('al')
78 | plt.ylabel('household')
79 | plt.show()
80 |  
81 | #拟合线性回归模型并存储预测
82 | feature_cols = ['al']
83 | X = glass[feature_cols]
84 | y = glass.household
85 | linreg.fit(X, y)
86 | glass['household_pred'] = linreg.predict(X)
87 | plt.show()
88 |  
89 | #包括回归线的散点图
90 | plt.scatter(glass.al, glass.household)
91 | plt.plot(glass.al, glass.household_pred, color='red')
92 | plt.xlabel('al')
93 | plt.ylabel('household')
94 | plt.show()
95 |  
96 | 


--------------------------------------------------------------------------------
/blog09-LinearRegression/test02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Sun Mar 05 18:28:56 2017
 4 | @author: eastmount & zj
 5 | """
 6 | #导入玻璃识别数据集
 7 | import pandas as pd
 8 | glass=pd.read_csv("glass.csv")
 9 | print(glass.shape)
10 | print(glass.head(6))
11 |  
12 | #拟合Logistic回归模型 存储类预测
13 | import numpy as np
14 | nums = np.array([5, 15, 8])
15 | np.where(nums > 10, 'big', 'small')  
16 | #将household_pred转换为 1或0   
17 | glass['household_pred_class'] = np.where(glass.household_pred >= 0.5, 1, 0)
18 | print(glass.head(6))
19 | 
20 | from sklearn.linear_model import LogisticRegression
21 | logreg = LogisticRegression(C=1e9)
22 | feature_cols = ['al']
23 | X = glass[feature_cols]
24 | y = glass.household
25 | logreg.fit(X, y)
26 | glass['household_pred_class'] = logreg.predict(X)
27 |  
28 |  
29 | #绘图-显示预测结果
30 | plt.scatter(glass.al, glass.household)
31 | plt.plot(glass.al, glass.household_pred_class, color='red')
32 | plt.xlabel('al')
33 | plt.ylabel('household')
34 | plt.show()
35 |  
36 | glass['household_pred_prob'] = logreg.predict_proba(X)[:, 1]
37 | #绘图 绘制预测概率
38 |  
39 | plt.scatter(glass.al, glass.household)
40 | plt.plot(glass.al, glass.household_pred_prob, color='red')
41 | plt.xlabel('al')
42 | plt.ylabel('household')
43 | plt.show()
44 |  
45 | #检查一些例子的预测
46 | print (logreg.predict_proba (1))
47 | print (logreg.predict_proba(2))
48 | print (logreg. predict_proba (3))
49 | 


--------------------------------------------------------------------------------
/blog10-Pandas/41.txt:
--------------------------------------------------------------------------------
  1 | 61.5	55
  2 | 59.8	61
  3 | 56.9	65
  4 | 62.4	58
  5 | 63.3	58
  6 | 62.8	57
  7 | 62.3	57
  8 | 61.9	55
  9 | 65.1	61
 10 | 59.4	61
 11 | 64	55
 12 | 62.8	56
 13 | 60.4	61
 14 | 62.2	54
 15 | 60.2	62
 16 | 60.9	58
 17 | 62	54
 18 | 63.4	54
 19 | 63.8	56
 20 | 62.7	59
 21 | 63.3	56
 22 | 63.8	55
 23 | 61	57
 24 | 59.4	62
 25 | 58.1	62
 26 | 60.4	58
 27 | 62.5	57
 28 | 62.2	57
 29 | 60.5	61
 30 | 60.9	57
 31 | 60	57
 32 | 59.8	57
 33 | 60.7	59
 34 | 59.5	58
 35 | 61.9	58
 36 | 58.2	59
 37 | 64.1	59
 38 | 64	54
 39 | 60.8	59
 40 | 61.8	55
 41 | 61.2	56
 42 | 61.1	56
 43 | 65.2	56
 44 | 58.4	63
 45 | 63.1	56
 46 | 62.4	58
 47 | 61.8	55
 48 | 63.8	56
 49 | 63.3	60
 50 | 60.7	60
 51 | 60.9	61
 52 | 61.9	54
 53 | 60.9	55
 54 | 61.6	58
 55 | 59.3	62
 56 | 61	59
 57 | 59.3	61
 58 | 62.6	57
 59 | 63	57
 60 | 63.2	55
 61 | 60.9	57
 62 | 62.6	59
 63 | 62.5	57
 64 | 62.1	56
 65 | 61.5	59
 66 | 61.4	56
 67 | 62	55.3
 68 | 63.3	57
 69 | 61.8	58
 70 | 60.7	58
 71 | 61.5	60
 72 | 63.1	56
 73 | 62.9	59
 74 | 62.5	57
 75 | 63.7	57
 76 | 59.2	60
 77 | 59.9	58
 78 | 62.4	54
 79 | 62.8	60
 80 | 62.6	59
 81 | 63.4	59
 82 | 62.1	60
 83 | 62.9	58
 84 | 61.6	56
 85 | 57.9	60
 86 | 62.3	59
 87 | 61.2	58
 88 | 60.8	59
 89 | 60.7	58
 90 | 62.9	58
 91 | 62.5	57
 92 | 55.1	69
 93 | 61.6	56
 94 | 62.4	57
 95 | 63.8	56
 96 | 57.5	58
 97 | 59.4	62
 98 | 66.3	62
 99 | 61.6	59
100 | 61.5	58
101 | 63.2	56
102 | 59.9	54
103 | 61.6	55
104 | 61.7	58
105 | 62.9	56
106 | 62.2	55
107 | 63	59
108 | 62.3	55
109 | 58.8	57
110 | 62	55
111 | 61.4	57
112 | 62.2	56
113 | 63	58
114 | 62.2	59
115 | 62.6	56
116 | 62.7	53
117 | 61.7	58
118 | 62.4	54
119 | 60.7	58
120 | 59.9	59
121 | 62.3	56
122 | 62.3	54
123 | 61.7	63
124 | 64.5	57
125 | 65.3	55
126 | 61.6	60
127 | 61.4	56
128 | 59.6	57
129 | 64.4	57
130 | 65.7	60
131 | 62	56
132 | 63.6	58
133 | 61.9	59
134 | 62.6	60
135 | 61.3	60
136 | 60.9	60
137 | 60.1	62
138 | 61.8	59
139 | 61.2	57
140 | 61.9	56
141 | 60.9	57
142 | 59.8	56
143 | 61.8	55
144 | 60	57
145 | 61.6	55
146 | 62.1	64
147 | 63.3	59
148 | 60.2	56
149 | 61.1	58
150 | 60.9	57
151 | 61.7	59
152 | 61.3	56
153 | 62.5	60
154 | 61.4	59
155 | 62.9	57
156 | 62.4	57
157 | 60.7	56
158 | 60.7	58
159 | 61.5	58
160 | 59.9	57
161 | 59.2	59
162 | 60.3	56
163 | 61.7	60
164 | 61.9	57
165 | 61.9	55
166 | 60.4	59
167 | 61	57
168 | 61.5	55
169 | 61.7	56
170 | 59.2	61
171 | 61.3	56
172 | 58	62
173 | 60.2	61
174 | 61.7	55
175 | 62.7	55
176 | 64.6	54
177 | 61.3	61
178 | 63.7	56.4
179 | 62.7	58
180 | 62.2	57
181 | 61.6	56
182 | 61.5	57
183 | 61.8	56
184 | 60.7	56
185 | 59.7	60.5
186 | 60.5	56
187 | 62.7	58
188 | 62.1	58
189 | 62.8	57
190 | 63.8	58
191 | 57.8	60
192 | 62.1	55
193 | 61.1	60
194 | 60	59
195 | 61.2	57
196 | 62.7	59
197 | 61	57
198 | 61	58
199 | 61.4	57
200 | 61.8	61
201 | 59.9	63
202 | 61.3	58
203 | 60.5	58
204 | 64.1	59
205 | 67.9	60
206 | 62.4	58
207 | 63.2	60
208 | 61.3	55
209 | 60.8	56
210 | 61.7	56
211 | 63.6	57
212 | 61.2	58
213 | 62.1	54
214 | 61.5	55
215 | 61.4	59
216 | 61.8	60
217 | 62.2	56
218 | 61.2	56
219 | 60.6	63
220 | 57.5	64
221 | 61.3	56
222 | 57.2	62
223 | 62.9	60
224 | 63.1	58
225 | 60.8	57
226 | 62.7	59
227 | 62.8	60
228 | 55.1	67
229 | 61.4	59
230 | 62.2	55
231 | 63	54
232 | 63.7	56
233 | 63.6	58
234 | 62	57
235 | 61.5	56
236 | 60.5	60
237 | 61.1	60
238 | 61.8	56
239 | 63.3	56
240 | 59.4	64
241 | 62.5	55
242 | 64.5	58
243 | 62.7	59
244 | 64.2	52
245 | 63.7	54
246 | 60.4	58
247 | 61.8	58
248 | 63.2	56
249 | 61.6	56
250 | 61.6	56
251 | 60.9	57
252 | 61	61
253 | 62.1	57
254 | 60.9	60
255 | 61.3	60
256 | 65.8	59
257 | 61.3	56
258 | 58.8	59
259 | 62.3	55
260 | 60.1	62
261 | 61.8	59
262 | 63.6	55.8
263 | 62.2	56
264 | 59.2	59
265 | 61.8	59
266 | 61.3	55
267 | 62.1	60
268 | 60.7	60
269 | 59.6	57
270 | 62.2	56
271 | 60.6	57
272 | 62.9	57
273 | 64.1	55
274 | 61.3	56
275 | 62.7	55
276 | 63.2	56
277 | 60.7	56
278 | 61.9	60
279 | 62.6	55
280 | 60.7	60
281 | 62	60
282 | 63	57
283 | 58	59
284 | 62.9	57
285 | 58.2	60
286 | 63.2	58
287 | 61.3	59
288 | 60.3	60
289 | 62.7	60
290 | 61.3	58
291 | 61.6	60
292 | 61.9	55
293 | 61.7	56
294 | 61.9	58
295 | 61.8	58
296 | 61.6	56
297 | 58.8	66
298 | 61	57
299 | 67.4	60
300 | 63.4	60
301 | 61.5	59
302 | 58	62
303 | 62.4	54
304 | 61.9	57
305 | 61.6	56
306 | 62.2	59
307 | 62.2	58
308 | 61.3	56
309 | 62.3	57
310 | 61.8	57
311 | 62.5	59
312 | 62.9	60
313 | 61.8	59
314 | 62.3	56
315 | 59	70
316 | 60.7	55
317 | 62.5	55
318 | 62.7	58
319 | 60.4	57
320 | 62.1	58
321 | 57.8	60
322 | 63.8	58
323 | 62.8	57
324 | 62.2	58
325 | 62.3	58
326 | 59.9	58
327 | 61.9	54
328 | 63	55
329 | 62.4	58
330 | 62.9	58
331 | 63.5	56
332 | 61.3	56
333 | 60.6	54
334 | 65.1	58
335 | 62.6	58
336 | 58	62
337 | 62.4	61
338 | 61.3	57
339 | 59.9	60
340 | 60.8	58
341 | 63.5	55
342 | 62.2	57
343 | 63.8	58
344 | 64	57
345 | 62.5	56
346 | 62.3	58
347 | 61.7	57
348 | 62.2	58
349 | 61.5	56
350 | 61	59
351 | 62.2	56
352 | 61.5	54
353 | 67.3	59
354 | 61.7	58
355 | 61.9	56
356 | 61.8	58
357 | 58.7	66
358 | 62.5	57
359 | 62.8	56
360 | 61.1	68
361 | 64	57
362 | 62.5	60
363 | 60.6	58
364 | 61.6	55
365 | 62.2	58
366 | 60	57
367 | 61.9	57
368 | 62.8	57
369 | 62	57
370 | 66.4	59
371 | 63.4	56
372 | 60.9	56
373 | 63.1	57
374 | 63.1	59
375 | 59.2	57
376 | 60.7	54
377 | 64.6	56
378 | 61.8	56
379 | 59.9	60
380 | 61.7	55
381 | 62.8	61
382 | 62.7	57
383 | 63.4	58
384 | 63.5	54
385 | 65.7	59
386 | 68.1	56
387 | 63	60
388 | 59.5	58
389 | 63.5	59
390 | 61.7	58
391 | 62.7	58
392 | 62.8	58
393 | 62.4	57
394 | 61	59
395 | 63.1	56
396 | 60.7	57
397 | 60.9	59
398 | 60.1	55
399 | 62.9	58
400 | 63.3	56
401 | 63.8	55
402 | 62.9	57
403 | 63.4	60
404 | 63.9	55
405 | 61.4	56
406 | 61.9	55
407 | 62.4	55
408 | 61.8	58
409 | 61.5	56
410 | 60.4	57
411 | 61.8	55
412 | 62	56
413 | 62.3	56
414 | 61.6	56
415 | 60.6	56
416 | 58.4	62
417 | 61.4	58
418 | 61.9	56
419 | 62	56
420 | 61.5	57
421 | 62.3	58
422 | 60.9	61
423 | 62.4	57
424 | 55	61
425 | 58.6	60
426 | 62	57
427 | 59.8	58
428 | 63.4	55
429 | 64.3	58
430 | 62.2	59
431 | 61.7	57
432 | 61.1	59
433 | 61.5	56
434 | 58.5	62
435 | 61.7	58
436 | 60.4	56
437 | 61.4	56
438 | 61.5	55
439 | 61.4	56
440 | 65	56
441 | 56	60
442 | 60.2	59
443 | 58.3	58
444 | 53.1	63
445 | 60.3	58
446 | 61.4	56
447 | 60.1	57
448 | 63.4	55
449 | 61.5	59
450 | 62.7	56
451 | 62.5	55
452 | 61.3	56
453 | 60.2	56
454 | 62.7	57
455 | 62.3	58
456 | 61.5	56
457 | 59.2	59
458 | 61.8	59
459 | 61.3	55
460 | 61.4	58
461 | 62.8	55
462 | 62.8	64
463 | 62.4	61
464 | 59.3	60
465 | 63	60
466 | 61.3	60
467 | 59.3	62
468 | 61	57
469 | 62.9	57
470 | 59.6	57
471 | 61.8	60
472 | 62.7	57
473 | 65.3	62
474 | 63.8	58
475 | 62.3	56
476 | 59.7	63
477 | 64.3	60
478 | 62.9	58
479 | 62	57
480 | 61.6	59
481 | 61.9	55
482 | 61.3	58
483 | 63.6	57
484 | 59.6	61
485 | 62.2	59
486 | 61.7	55
487 | 63.2	58
488 | 60.8	60
489 | 60.3	59
490 | 60.9	60
491 | 62.4	59
492 | 60.2	60
493 | 62	55
494 | 60.8	57
495 | 62.1	55
496 | 62.7	60
497 | 61.3	58
498 | 60.2	60
499 | 60.7	56


--------------------------------------------------------------------------------
/blog10-Pandas/bankloan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/bankloan.png


--------------------------------------------------------------------------------
/blog10-Pandas/ccc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/ccc.png


--------------------------------------------------------------------------------
/blog10-Pandas/data.csv:
--------------------------------------------------------------------------------
 1 | year,Beijing,Chongqing,Shenzhen,Guiyang,Kunming,Shanghai,Wuhai,Changsha
 2 | 2002,4764,1556,5802,1643,2276,4134,1928,1802
 3 | 2003,4737,1596,6256,1949,2233,5118,2072,2040
 4 | 2004,5020.93,1766.24,6756.24,1801.68,2473.78,5855,2516.32,2039.09
 5 | 2005,6788.09,2134.99,7582.27,2168.9,2639.72,6842,3061.77,2313.73
 6 | 2006,8279.51,2269.21,9385.34,2372.66,2903.32,7196,3689.64,2644.15
 7 | 2007,11553.26,2722.58,14049.69,2901.63,3108.12,8361,4664.03,3304.74
 8 | 2008,12418,2785,12665,3149,3750,8195,4781,3288
 9 | 2009,13799,3442,14615,3762,3807,12840,5329,3648
10 | 2010,17782,4281,19170,4410,3660,14464,5746,4418
11 | 2011,16851.95,4733.84,21350.13,5069.52,4715.23,14603.24,7192.9,5862.39
12 | 2012,17021.63,5079.93,19589.82,4846.14,5744.68,14061.37,7344.05,6100.87
13 | 2013,18553,5569,24402,5025,5795,16420,7717,6292
14 | 2014,18833,5519,24723,5608,6384,16787,7951,6116
15 | 


--------------------------------------------------------------------------------
/blog10-Pandas/data2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/data2.xlsx


--------------------------------------------------------------------------------
/blog10-Pandas/guiyang.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/guiyang.png


--------------------------------------------------------------------------------
/blog10-Pandas/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 10:55:17 2017
 4 | @author: eastmount
 5 | """
 6 |  
 7 | import pandas as pd
 8 | data = pd.read_csv("data.csv",index_col='year') #index_col用作行索引的列名 
 9 | #显示前6行数据 
10 | print(data.shape)  
11 | print(data.head(6))
12 |  
13 | import matplotlib.pyplot as plt
14 | plt.rcParams['font.sans-serif'] = ['simHei'] #用来正常显示中文标签
15 | plt.rcParams['axes.unicode_minus'] = False   #用来正常显示负号
16 | data.plot()
17 | plt.savefig(u'时序图.png', dpi=500)
18 | plt.show()
19 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 10:55:17 2017
 4 | @author: eastmount
 5 | """
 6 |  
 7 | import pandas as pd
 8 | data = pd.read_csv("data.csv",index_col='year') #index_col用作行索引的列名 
 9 | #显示前6行数据 
10 | print(data.shape)  
11 | print(data.head(6))
12 |  
13 | import matplotlib.pyplot as plt
14 | plt.rcParams['font.sans-serif'] = ['simHei'] #用来正常显示中文标签
15 | plt.rcParams['axes.unicode_minus'] = False   #用来正常显示负号
16 | data.plot()
17 | plt.savefig(u'时序图.png', dpi=500)
18 | plt.show()
19 |  
20 | #获取贵阳数据集并绘图
21 | gy = data['Guiyang']
22 | print('输出贵阳数据')
23 | print(gy)
24 | gy.plot()
25 | plt.show()
26 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 10:55:17 2017
 4 | @author: eastmount
 5 | """
 6 | import matplotlib.pyplot as plt
 7 | import pandas as pd
 8 | data = pd.read_csv("data.csv",index_col='year') #index_col用作行索引的列名 
 9 | #显示前6行数据 
10 | print(data.shape)  
11 | print(data.head(6))
12 | #获取贵阳数据集并绘图
13 | gy = data['Guiyang']
14 | print('输出贵阳数据')
15 | print(gy)
16 |  
17 | import numpy as np
18 | x = ['2002','2003','2004','2005','2006','2007','2008',
19 |      '2009','2010','2011','2012','2013','2014']
20 | N = 13
21 | ind = np.arange(N)  #赋值0-13
22 | width=0.35
23 | plt.bar(ind, gy, width, color='r', label='sum num') 
24 | #设置底部名称  
25 | plt.xticks(ind+width/2, x, rotation=40) #旋转40度  
26 | plt.title('The price of Guiyang')  
27 | plt.xlabel('year')  
28 | plt.ylabel('price')  
29 | plt.savefig('guiyang.png',dpi=400)  
30 | plt.show()
31 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test04.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pylab as pl
 3 | # make an array of random numbers with a gaussian distribution with
 4 | # mean = 5.0
 5 | # rms = 3.0
 6 | # number of points = 1000
 7 | data = np.random.normal(5.0, 3.0, 1000)
 8 | # make a histogram of the data array
 9 | pl.hist(data, histtype='stepfilled') #去掉黑色轮廓
10 | # make plot labels
11 | pl.xlabel('data') 
12 | pl.show()
13 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test05.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 10:55:17 2017
 4 | @author: yxz15
 5 | """
 6 |  
 7 | import pandas as pd
 8 | data = pd.read_csv("data.csv",index_col='year')
 9 | #显示前6行数据  
10 | print(data.shape)  
11 | print(data.head(6))
12 |  
13 | import matplotlib.pyplot as plt
14 | plt.rcParams['font.sans-serif'] = ['simHei']
15 | plt.rcParams['axes.unicode_minus'] = False
16 | data.plot()
17 | plt.savefig(u'时序图.png', dpi=500)
18 | plt.show()
19 |  
20 | from statsmodels.graphics.tsaplots import plot_acf
21 | gy = data['Guiyang']
22 | print(gy)
23 | plot_acf(gy).show()
24 | plt.savefig(u'贵阳自相关图',dpi=300)
25 |  
26 | from statsmodels.tsa.stattools import adfuller as ADF
27 | print('ADF:',ADF(gy))
28 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test06-dalian.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 10:19:15 2017
 4 | @author: yxz15
 5 | """
 6 |  
 7 | #第一部分：导入数据集
 8 | import pandas as pd
 9 | Coke1 =pd.read_csv("data2.csv")
10 | print(Coke1 [:4])
11 |  
12 | #第二部分：聚类
13 | from sklearn.cluster import KMeans
14 | clf=KMeans(n_clusters=3)
15 | pre=clf.fit_predict(Coke1)
16 | print(pre[:4])
17 |  
18 | #第三部分：降维
19 | from sklearn.decomposition import PCA
20 | pca=PCA(n_components=2)
21 | newData=pca.fit_transform(Coke1)
22 | print(newData[:4])
23 | x1=[n[0] for n in newData]
24 | x2=[n[1] for n in newData]
25 |  
26 | #第四部分：用matplotlib包画图
27 | import matplotlib.pyplot as plt
28 | plt.title
29 | plt.xlabel("x feature")
30 | plt.ylabel("y feature")
31 | plt.scatter(x1,x2,c=pre, marker='x')
32 | plt.savefig("bankloan.png",dpi=400)
33 | plt.show()
34 | 


--------------------------------------------------------------------------------
/blog10-Pandas/test07.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 06 21:47:46 2017
 4 | @author: yxz
 5 | """
 6 |  
 7 | from numpy import *
 8 | import matplotlib
 9 | import matplotlib.pyplot as plt
10 | 
11 | #-----------------------------------------------------------------------
# Load the data
def loadDataSet(fileName,delim='\t'):
    """Read a delimited text file of numbers into a numpy matrix.

    Each non-blank line becomes one row; its fields are separated by
    ``delim`` and converted with ``float``.

    Args:
        fileName: path of the text file to read.
        delim: field separator, a tab by default.

    Returns:
        A numpy matrix with one row per non-blank line of the file.

    Raises:
        ValueError: if a field cannot be converted to float.
    """
    # The context manager closes the file even if parsing fails later.
    with open(fileName) as fr:
        # Blank lines (e.g. a trailing newline) are skipped; they would
        # otherwise yield an empty field that float() rejects.
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    datArr = [list(map(float, fields)) for fields in stringArr]
    # asmatrix is the function the `mat` alias pointed to; the alias itself
    # is no longer available in NumPy 2.0.
    return asmatrix(datArr)
18 |  
def pca(dataMat,topNfeat=9999999):
    """Project data onto its leading principal components and rebuild it.

    Args:
        dataMat: 2-D numeric data with one sample per row (the covariance
            is computed with ``rowvar=0``, i.e. columns are the variables).
            The call site in this file passes a numpy matrix.
        topNfeat: number of leading components to keep; the large default
            keeps every component.

    Returns:
        A tuple ``(lowDDatMat, reconMat)``: the centred data projected onto
        the kept eigenvectors, and that projection mapped back into the
        original space with the column means added again.
    """
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    covMat = cov(meanRemoved, rowvar=0)
    # asmatrix replaces the `mat` alias, which is not available in NumPy 2.0.
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    # argsort is ascending; the reversed slice walks it from the end and so
    # picks the indices of the topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    # Diagnostic output kept from the original script.
    print(meanRemoved)
    print(redEigVects)
    lowDDatMat = meanRemoved * redEigVects
    reconMat = (lowDDatMat * redEigVects.T) + meanVals
    return lowDDatMat, reconMat
32 | 
# Load the samples from '41.txt' (tab-separated by default) and run pca
# keeping a single component.
dataMat=loadDataSet('41.txt')
lowDMat,reconMat=pca(dataMat,1)
35 | 
36 | #-----------------------------------------------------------------------
# Draw the figure
def plotPCA(dataMat,reconMat):
    """Scatter the first two columns of the data and of its reconstruction.

    Rows of ``dataMat`` are drawn as red triangles (size 90) and rows of
    ``reconMat`` as yellow circles (size 50) on the same axes. The figure
    is titled 'PCA', saved to 'ccc.png' at 400 dpi, and then shown.
    """
    original = array(dataMat)
    rebuilt = array(reconMat)

    # Column 0 supplies the x coordinates and column 1 the y coordinates.
    orig_x, orig_y = original[:, 0].tolist(), original[:, 1].tolist()
    recon_x, recon_y = rebuilt[:, 0].tolist(), rebuilt[:, 1].tolist()

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(orig_x, orig_y, s=90, c='red', marker='^')
    axes.scatter(recon_x, recon_y, s=50, c='yellow', marker='o')
    plt.title('PCA')
    plt.savefig('ccc.png', dpi=400)
    plt.show()
58 | 
59 | plotPCA(dataMat,reconMat)
60 | 


--------------------------------------------------------------------------------
/blog10-Pandas/时序图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/时序图.png


--------------------------------------------------------------------------------
/blog10-Pandas/贵阳自相关图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/贵阳自相关图.png


--------------------------------------------------------------------------------
/blog11-Matplotlib+SQL/test01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By:Eastmount CSDN
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import pylab
12 | import MySQLdb
13 | from pylab import *
14 |  
15 |  
# Bar chart of the number of rows per hour of day, counted in SQL.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
    sql = "select HOUR(FBTime) as hh, count(*) as cnt from csdn group by hh;"
    cur.execute(sql)
    result = cur.fetchall()        # every (hour, count) row
    hour1 = [n[0] for n in result]
    print(hour1)
    num1 = [n[1] for n in result]
    print(num1)

    # One bar per returned row: the number of distinct hours present in the
    # table is not fixed, so it is taken from the result, not hard-coded.
    N = len(num1)
    ind = np.arange(N)
    width = 0.35
    # Bars are explicitly centred on `ind`; ticks and value labels use the
    # same x positions so all three line up.
    plt.bar(ind, num1, width, color='r', label='sum num', align='center')
    plt.xticks(ind, hour1, rotation=40)  # tick labels rotated by 40 degrees
    # Write each count above its bar, rotated by 45 degrees.
    for i, count in enumerate(num1):
        plt.text(i, count, str(count),
                 ha='center', va='bottom', rotation=45)
    plt.title('Number-24Hour')
    plt.xlabel('hours')
    plt.ylabel('The number of blog')
    plt.legend()
    plt.savefig('08csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
59 |  
60 |        
61 | 


--------------------------------------------------------------------------------
/blog11-Matplotlib+SQL/test02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By:Eastmount
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import pylab
12 | import MySQLdb
13 | from pylab import *
14 | import matplotlib.pyplot as plt
15 |  
# Scatter plot of the number of rows per year-month, counted in SQL.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
    sql = '''select DATE_FORMAT(FBTime,'%Y%m'), count(*) from csdn
            group by DATE_FORMAT(FBTime,'%Y%m');'''
    cur.execute(sql)
    result = cur.fetchall()        # every (year-month, count) row
    date1 = [n[0] for n in result]
    print(date1)
    num1 = [n[1] for n in result]
    print(num1)
    print(type(date1))
    # Hollow markers: white fill with a coloured edge.
    plt.scatter(date1,num1,25,color='white',marker='o',
                edgecolors='#0D8ECF',linewidth=3,alpha=0.8)
    plt.title('Number-12Month')
    plt.xlabel('Time')
    plt.ylabel('The number of blog')
    plt.savefig('02csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
52 |             
53 | 


--------------------------------------------------------------------------------
/blog11-Matplotlib+SQL/test03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By：Eastmount
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import pylab
12 | import MySQLdb
13 | from pylab import *
14 | from pandas import *
15 |  
16 |  
# Bar chart of the number of rows per year, counted in SQL.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
    sql = '''select DATE_FORMAT(FBTime,'%Y'), Count(*) from csdn
                group by DATE_FORMAT(FBTime,'%Y');'''
    cur.execute(sql)
    result = cur.fetchall()        # every (year, count) row
    day1 = [n[0] for n in result]
    print(len(day1))
    num1 = [n[1] for n in result]
    print("%s %s" % (len(num1), type(num1)))
    matplotlib.style.use('ggplot')
    df=DataFrame(num1, index=day1,columns=['Nums'])
    df.plot(kind='bar')
    # Save before showing: once the window opened by plt.show() is closed
    # the figure is gone and savefig would write an empty image.
    plt.savefig('05csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
49 |  
50 |     
51 | 


--------------------------------------------------------------------------------
/blog11-Matplotlib+SQL/test04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By:Eastmount CSDN
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import pylab
12 | import MySQLdb
13 | from pylab import *
14 |  
15 |  
# Line chart over the days returned by a per-day SQL count.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
    sql = '''select DATE_FORMAT(FBTime,'%Y-%m-%d'), Count(*) from csdn
                group by DATE_FORMAT(FBTime,'%Y-%m-%d');'''
    cur.execute(sql)
    result = cur.fetchall()        # every (day, count) row
    day1 = [n[0] for n in result]
    print(len(day1))
    num1 = [n[1] for n in result]
    print("%s %s" % (len(num1), type(num1)))
    matplotlib.style.use('ggplot')
    # Earliest day in the result
    start = min(day1)
    print(start)
    # NOTE(review): the plotted values are normally distributed random
    # numbers (cumulatively summed), not the counts in num1; num1 only
    # supplies the series length. The index is len(num1) consecutive days
    # from `start`, which matches day1 only if no day is missing. Confirm
    # whether the real counts were meant to be plotted here.
    ts = pd.Series(np.random.randn(len(num1)),
                   index=pd.date_range(start, periods=len(num1)))
    ts = ts.cumsum()
    ts.plot()
    plt.title('Number-365Day')
    plt.xlabel('Time')
    plt.ylabel('The number of blog')
    plt.savefig('04csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
58 |             
59 | 


--------------------------------------------------------------------------------
/blog12-matplotlib+SQL/test01.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | '''
  3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
  4 | ' 统计采用SQL语句进行
  5 | '''
  6 |  
  7 | import matplotlib.pyplot as plt
  8 | import matplotlib
  9 | import pandas as pd
 10 | import numpy as np
 11 | import pylab
 12 | import MySQLdb
 13 | from pylab import *
 14 |  
 15 | # 根据SQL语句输出24小时的柱状图
 16 | try:
 17 |     conn = MySQLdb.connect(host='localhost',user='root',
 18 |                          passwd='123456',port=3306, db='test01')
 19 |     cur = conn.cursor() #数据库游标
 20 |  
 21 |     #防止报错:UnicodeEncodeError: 'latin-1' codec can't encode character
 22 |     conn.set_character_set('utf8')
 23 |     cur.execute('SET NAMES utf8;')
 24 |     cur.execute('SET CHARACTER SET utf8;')
 25 |     cur.execute('SET character_set_connection=utf8;')
 26 |     
 27 |  
 28 |     #################################################
 29 |     # 2014年
 30 |     #################################################
 31 |     sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
 32 |             where DATE_FORMAT(FBTime,'%Y')='2014' group by mm;'''
 33 |     cur.execute(sql)
 34 |     result = cur.fetchall() #获取结果复制给result
 35 |     hour1 = [n[0] for n in result]
 36 |     print hour1
 37 |     num1 = [n[1] for n in result]
 38 |     print num1
 39 |  
 40 |     N =  12
 41 |     ind = np.arange(N)  #赋值0-11  
 42 |     width=0.35
 43 |     p1 = plt.subplot(221)
 44 |     plt.bar(ind, num1, width, color='b', label='sum num')   
 45 |     #设置底部名称    
 46 |     plt.xticks(ind+width/2, hour1, rotation=40) #旋转40度
 47 |     for i in range(12):   #中心底部翻转90度
 48 |         plt.text(i, num1[i], str(num1[i]),
 49 |                  ha='center', va='bottom', rotation=45) 
 50 |     plt.title('2014 Number-12Month')    
 51 |     plt.sca(p1)
 52 |  
 53 |  
 54 |     #################################################
 55 |     # 2015年
 56 |     #################################################
 57 |     sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
 58 |             where DATE_FORMAT(FBTime,'%Y')='2015' group by mm;'''
 59 |     cur.execute(sql)
 60 |     result = cur.fetchall()        
 61 |     hour1 = [n[0] for n in result]
 62 |     print hour1
 63 |     num1 = [n[1] for n in result]
 64 |     print num1
 65 |     
 66 |     N =  12
 67 |     ind = np.arange(N)  #赋值0-11  
 68 |     width=0.35
 69 |     p2 = plt.subplot(222)
 70 |     plt.bar(ind, num1, width, color='r', label='sum num')   
 71 |     #设置底部名称    
 72 |     plt.xticks(ind+width/2, hour1, rotation=40) #旋转40度
 73 |     for i in range(12):   #中心底部翻转90度
 74 |         plt.text(i, num1[i], str(num1[i]),
 75 |                  ha='center', va='bottom', rotation=45) 
 76 |     plt.title('2015 Number-12Month')    
 77 |     plt.sca(p2)
 78 |  
 79 |  
 80 |     #################################################
 81 |     # 2016年
 82 |     #################################################
 83 |     sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
 84 |             where DATE_FORMAT(FBTime,'%Y')='2016' group by mm;'''
 85 |     cur.execute(sql)
 86 |     result = cur.fetchall()        
 87 |     hour1 = [n[0] for n in result]
 88 |     print hour1
 89 |     num1 = [n[1] for n in result]
 90 |     print num1
 91 |  
 92 |     N =  12
 93 |     ind = np.arange(N)  #赋值0-11 
 94 |     width=0.35
 95 |     p3 = plt.subplot(223)
 96 |     plt.bar(ind, num1, width, color='g', label='sum num')   
 97 |     #设置底部名称    
 98 |     plt.xticks(ind+width/2, hour1, rotation=40) #旋转40度
 99 |     for i in range(12):   #中心底部翻转90度
100 |         plt.text(i, num1[i], str(num1[i]),
101 |                  ha='center', va='bottom', rotation=45) 
102 |     plt.title('2016 Number-12Month')    
103 |     plt.sca(p3)
104 |  
105 |     
106 |     #################################################
107 |     # 所有年份数据对比
108 |     #################################################
109 |     sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog group by mm;'''
110 |     cur.execute(sql)
111 |     result = cur.fetchall()     
112 |     hour1 = [n[0] for n in result]
113 |     print hour1
114 |     num1 = [n[1] for n in result]
115 |     print num1
116 |  
117 |     N =  12
118 |     ind = np.arange(N)  #赋值0-11  
119 |     width=0.35
120 |     p4 = plt.subplot(224)
121 |     plt.bar(ind, num1, width, color='y', label='sum num')   
122 |     #设置底部名称    
123 |     plt.xticks(ind+width/2, hour1, rotation=40) #旋转40度
124 |     for i in range(12):   #中心底部翻转90度
125 |         plt.text(i, num1[i], str(num1[i]),
126 |                  ha='center', va='bottom', rotation=45) 
127 |     plt.title('All Year Number-12Month')    
128 |     plt.sca(p4)
129 |  
130 |     plt.savefig('ttt.png',dpi=400)    
131 |     plt.show()
132 |  
133 | #异常处理
134 | except MySQLdb.Error,e:
135 |     print "Mysql Error %d: %s" % (e.args[0], e.args[1])
136 | finally:
137 |     cur.close()
138 |     conn.commit()  
139 |     conn.close()
140 |  
141 |  
142 | 


--------------------------------------------------------------------------------
/blog12-matplotlib+SQL/test02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By：Eastmount CSDN
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import MySQLdb
12 | from pandas import *
13 |  
# Area plot comparing rows per month for all years, 2014, 2015 and 2016.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    # Rows per month over all years
    sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
             group by mm;'''
    cur.execute(sql)
    result = cur.fetchall()        # every (month, count) row
    hour1 = [n[0] for n in result]
    print(hour1)
    num1 = [n[1] for n in result]
    print(num1)

    # Rows per month in 2014
    sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
             where DATE_FORMAT(FBTime,'%Y')='2014' group by mm;'''
    cur.execute(sql)
    result = cur.fetchall()
    num2 = [n[1] for n in result]
    print(num2)

    # Rows per month in 2015
    sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
             where DATE_FORMAT(FBTime,'%Y')='2015' group by mm;'''
    cur.execute(sql)
    result = cur.fetchall()
    num3 = [n[1] for n in result]
    print(num3)

    # Rows per month in 2016
    sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog
             where DATE_FORMAT(FBTime,'%Y')='2016' group by mm;'''
    cur.execute(sql)
    result = cur.fetchall()
    num4 = [n[1] for n in result]
    print(num4)

    # Key step: stack the four series and transpose so that each month is
    # a row and each series a column.
    # NOTE(review): this needs all four lists to have the same length,
    # i.e. every year must have rows for the same months - confirm.
    data = np.array([num1, num2, num3, num4])
    print(data)
    d = data.T  # transpose
    print(d)
    df = DataFrame(d, index=hour1, columns=['All','2014', '2015', '2016'])
    df.plot(kind='area', alpha=0.2)  # alpha sets the fill transparency
    plt.title('Area Plot Blog-Month')
    plt.savefig('csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
77 |     
78 | 


--------------------------------------------------------------------------------
/blog12-matplotlib+SQL/test03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | '''
 3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
 4 | ' 统计采用SQL语句进行 By：Eastmount CSDN
 5 | '''
 6 |  
 7 | import matplotlib.pyplot as plt
 8 | import matplotlib
 9 | import pandas as pd
10 | import numpy as np
11 | import MySQLdb
12 | from pandas import *
13 |  
# Bar chart of the number of rows per day of week, counted in SQL.
# Both handles start as None so that the cleanup in `finally` is safe even
# when the connection itself could not be opened.
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',
                         passwd='123456',port=3306, db='test01')
    cur = conn.cursor()  # database cursor

    # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
    conn.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
    # A single result row with seven counts, one column per dayofweek
    # value 1..7.
    sql = '''select  
            COUNT(case dayofweek(FBTime)  when 1 then 1 end) AS '星期日',
            COUNT(case dayofweek(FBTime)  when 2 then 1 end) AS '星期一',
            COUNT(case dayofweek(FBTime)  when 3 then 1 end) AS '星期二',
            COUNT(case dayofweek(FBTime)  when 4 then 1 end) AS '星期三',
            COUNT(case dayofweek(FBTime)  when 5 then 1 end) AS '星期四',
            COUNT(case dayofweek(FBTime)  when 6 then 1 end) AS '星期五',
            COUNT(case dayofweek(FBTime)  when 7 then 1 end) AS '星期六'
            from csdn_blog;
          '''
    cur.execute(sql)
    result = cur.fetchall()
    print(result)
    # Sample output recorded by the author:
    # ((31704L, 43081L, 42670L, 43550L, 41270L, 39164L, 29931L),)
    name = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
    # Convert to a numpy array and transpose it so that each day becomes
    # one row of a single column.
    data = np.array(result)
    print(data)
    d = data.T  # transpose
    print(d)

    matplotlib.style.use('ggplot')
    df=DataFrame(d, index=name,columns=['Nums'])
    df.plot(kind='bar')
    plt.title('All Year Blog-Week')
    plt.xlabel('Week')
    plt.ylabel('The number of blog')
    plt.savefig('01csdn.png',dpi=400)
    plt.show()

# Error handling
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Only SELECT/SET statements are issued above, so there is nothing to
    # commit; just release whatever was actually opened.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
61 |        
62 | 


--------------------------------------------------------------------------------
/blog12-matplotlib+SQL/test04.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | '''
  3 | ' 这篇代码主要讲述获取MySQL中数据，再进行简单的统计
  4 | ' 统计采用SQL语句进行 By:Eastmount CSDN 杨秀璋
  5 | '''
  6 |  
  7 | import matplotlib.pyplot as plt
  8 | import matplotlib
  9 | import pandas as pd
 10 | import numpy as np
 11 | import MySQLdb
 12 | from pandas import *
 13 |  
 14 | try:
 15 |     conn = MySQLdb.connect(host='localhost',user='root',
 16 |                          passwd='123456',port=3306, db='test01')
 17 |     cur = conn.cursor() #数据库游标
 18 |  
 19 |     #防止报错:UnicodeEncodeError: 'latin-1' codec can't encode character
 20 |     conn.set_character_set('utf8')
 21 |     cur.execute('SET NAMES utf8;')
 22 |     cur.execute('SET CHARACTER SET utf8;')
 23 |     cur.execute('SET character_set_connection=utf8;')
 24 |     sql = '''select  
 25 |             COUNT(case dayofweek(FBTime)  when 1 then 1 end) AS '星期日',
 26 |             COUNT(case dayofweek(FBTime)  when 2 then 1 end) AS '星期一',
 27 |             COUNT(case dayofweek(FBTime)  when 3 then 1 end) AS '星期二',
 28 |             COUNT(case dayofweek(FBTime)  when 4 then 1 end) AS '星期三',
 29 |             COUNT(case dayofweek(FBTime)  when 5 then 1 end) AS '星期四',
 30 |             COUNT(case dayofweek(FBTime)  when 6 then 1 end) AS '星期五',
 31 |             COUNT(case dayofweek(FBTime)  when 7 then 1 end) AS '星期六'
 32 |             from csdn_blog where DATE_FORMAT(FBTime,'%Y')='2008';
 33 |           '''
 34 |     cur.execute(sql)
 35 |     result1 = cur.fetchall()        
 36 |     print result1
 37 |     name = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
 38 |     data = np.array(result1)
 39 |     d1 = data.T #转置
 40 |     print d1
 41 |  
 42 |  
 43 |     sql = '''select  
 44 |             COUNT(case dayofweek(FBTime)  when 1 then 1 end) AS '星期日',
 45 |             COUNT(case dayofweek(FBTime)  when 2 then 1 end) AS '星期一',
 46 |             COUNT(case dayofweek(FBTime)  when 3 then 1 end) AS '星期二',
 47 |             COUNT(case dayofweek(FBTime)  when 4 then 1 end) AS '星期三',
 48 |             COUNT(case dayofweek(FBTime)  when 5 then 1 end) AS '星期四',
 49 |             COUNT(case dayofweek(FBTime)  when 6 then 1 end) AS '星期五',
 50 |             COUNT(case dayofweek(FBTime)  when 7 then 1 end) AS '星期六'
 51 |             from csdn_blog where DATE_FORMAT(FBTime,'%Y')='2011';
 52 |           '''
 53 |     cur.execute(sql)
 54 |     result2 = cur.fetchall()        
 55 |     data = np.array(result2)
 56 |     d2 = data.T #转置
 57 |     print d2
 58 |  
 59 |  
 60 |     sql = '''select  
 61 |             COUNT(case dayofweek(FBTime)  when 1 then 1 end) AS '星期日',
 62 |             COUNT(case dayofweek(FBTime)  when 2 then 1 end) AS '星期一',
 63 |             COUNT(case dayofweek(FBTime)  when 3 then 1 end) AS '星期二',
 64 |             COUNT(case dayofweek(FBTime)  when 4 then 1 end) AS '星期三',
 65 |             COUNT(case dayofweek(FBTime)  when 5 then 1 end) AS '星期四',
 66 |             COUNT(case dayofweek(FBTime)  when 6 then 1 end) AS '星期五',
 67 |             COUNT(case dayofweek(FBTime)  when 7 then 1 end) AS '星期六'
 68 |             from csdn_blog where DATE_FORMAT(FBTime,'%Y')='2016';
 69 |           '''
 70 |     cur.execute(sql)
 71 |     result3 = cur.fetchall()       
 72 |     data = np.array(result3)
 73 |     print type(result3),type(data)
 74 |     d3 = data.T #转置
 75 |     print d3
 76 |  
 77 |  
 78 |     #SQL语句获取3个数组，采用循环复制到一个[7][3]的二维数组中
 79 |     data = np.random.rand(7,3)
 80 |     print data
 81 |     i = 0
 82 |     while i<7:
 83 |         data[i][0] = d1[i]
 84 |         data[i][1] = d2[i]
 85 |         data[i][2] = d3[i]
 86 |         i = i + 1
 87 |  
 88 |     print data
 89 |     print type(data)
 90 |  
 91 |     #绘图
 92 |     matplotlib.style.use('ggplot')
 93 |     #数据[7,3]数组 name为星期 columns对应年份
 94 |     df=DataFrame(data, index=name, columns=['2008','2011','2016'])
 95 |     df.plot(kind='bar')   
 96 |     plt.title('Comparison Chart Blog-Week')    
 97 |     plt.xlabel('Week')
 98 |     plt.ylabel('The number of blog')
 99 |     plt.savefig('03csdn.png', dpi=400)
100 |     plt.show()
101 |  
102 |  
103 |  
104 | #异常处理
105 | except MySQLdb.Error,e:
106 |     print "Mysql Error %d: %s" % (e.args[0], e.args[1])
107 | finally:
108 |     cur.close()
109 |     conn.commit()  
110 |     conn.close()
111 |       
112 | 


--------------------------------------------------------------------------------
/blog13-wordcloud/cloudimg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog13-wordcloud/cloudimg.png


--------------------------------------------------------------------------------
/blog13-wordcloud/mb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog13-wordcloud/mb.png


--------------------------------------------------------------------------------
/blog13-wordcloud/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog13-wordcloud/result01.png


--------------------------------------------------------------------------------
/blog13-wordcloud/test.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog13-wordcloud/test.txt


--------------------------------------------------------------------------------
/blog13-wordcloud/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # By：Eastmount CSDN
 3 | import jieba
 4 | import sys
 5 | import matplotlib.pyplot as plt
 6 | from wordcloud import WordCloud
 7 |  
 8 | #打开本体TXT文件
 9 | text = open('test.txt').read()
10 | print(type(text))
11 |  
12 | #结巴分词 cut_all=True 设置为全模式 
13 | wordlist = jieba.cut(text, cut_all = True)
14 |  
15 | #使用空格连接 进行中文分词
16 | wl_space_split = " ".join(wordlist)
17 | print(wl_space_split)
18 |  
19 | #对分词后的文本生成词云
20 | my_wordcloud = WordCloud().generate(wl_space_split)
21 |  
22 | #显示词云图
23 | plt.imshow(my_wordcloud)
24 | #是否显示x轴、y轴下标
25 | plt.axis("off")
26 | plt.show()
27 | 


--------------------------------------------------------------------------------
/blog13-wordcloud/test02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # By：Eastmount CSDN
 3 | from os import path
 4 | from scipy.misc import imread  
 5 | import jieba
 6 | import sys
 7 | import matplotlib.pyplot as plt
 8 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator  
 9 |  
# Read the whole text corpus; the context manager closes the file again
with open('test.txt') as corpus:
    text = corpus.read()

# Segment the text with jieba (pass cut_all=True for full mode instead)
wordlist = jieba.cut(text)

# Join the tokens with spaces so WordCloud can split them into words
wl_space_split = " ".join(wordlist)
print(wl_space_split)

# Load the picture that is used both as the mask and as the colour source
d = path.dirname(__file__)
nana_coloring = imread(path.join(d, "mb.png"))

# Configure the word cloud
my_wordcloud = WordCloud(background_color='white',  # canvas colour
                         mask=nana_coloring,        # image used as mask
                         max_words=2000,            # maximum number of words shown
                         stopwords=STOPWORDS,       # words to leave out
                         max_font_size=50,          # largest font size
                         random_state=30,           # seed of the random generator (reproducible output)
                         )

# Generate the word cloud from the segmented text
my_wordcloud.generate(wl_space_split)

# Derive the word colours from the image
image_colors = ImageColorGenerator(nana_coloring)

# Recolour the word cloud with the image colours
my_wordcloud.recolor(color_func=image_colors)

plt.imshow(my_wordcloud)    # display the word cloud
plt.axis("off")             # hide axis ticks / labels
plt.show()

# Save the rendered cloud next to this script
my_wordcloud.to_file(path.join(d, "cloudimg.png"))
48 |  
49 |  
50 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/data.csv:
--------------------------------------------------------------------------------
 1 | x,y
 2 | 0,4
 3 | 1,5.2
 4 | 2,5.9
 5 | 3,6.8
 6 | 4,7.34
 7 | 5,8.57
 8 | 6,9.86
 9 | 7,10.12
10 | 8,12.56
11 | 9,14.32
12 | 10,15.42
13 | 11,16.50 
14 | 12,18.92
15 | 13,19.58
16 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/result01.png


--------------------------------------------------------------------------------
/blog14-curve_fit/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/result02.png


--------------------------------------------------------------------------------
/blog14-curve_fit/test01.py:
--------------------------------------------------------------------------------
 1 | #encoding=utf-8
 2 | #By：Eastmount CSDN
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 |  
 6 | #定义x、y散点坐标
 7 | x = np.arange(1, 16, 1)
 8 | num = [4.00, 5.20, 5.900, 6.80, 7.34,
 9 |        8.57, 9.86, 10.12, 12.56, 14.32,
10 |        15.42, 16.50, 18.92, 19.58, 20.00]
11 | y = np.array(num)
12 |  
13 | #用3次多项式拟合
14 | f1 = np.polyfit(x, y, 3)
15 | p1 = np.poly1d(f1)
16 | print(p1)
17 |  
18 | #也可使用yvals=np.polyval(f1, x)
19 | yvals = p1(x)  #拟合y值
20 |  
21 | #绘图
22 | plot1 = plt.plot(x, y, 's',label='original values')
23 | plot2 = plt.plot(x, yvals, 'r',label='polyfit values')
24 | plt.xlabel('x')
25 | plt.ylabel('y')
26 | plt.legend(loc=4) #指定legend的位置右下角
27 | plt.title('polyfitting')
28 | plt.show()
29 | plt.savefig('test.png')
30 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/test02.py:
--------------------------------------------------------------------------------
 1 | #encoding=utf-8
 2 | #By：Eastmount CSDN
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from scipy.optimize import curve_fit
 6 |  
 7 | #自定义函数 e指数形式
 8 | def func(x, a, b):
 9 |     return a*np.exp(b/x)
10 |  
# Scatter data: x = 1..15 with one y value each
x = np.arange(1, 16, 1)
num = [4.00, 5.20, 5.900, 6.80, 7.34,
       8.57, 9.86, 10.12, 12.56, 14.32,
       15.42, 16.50, 18.92, 19.58, 20.00]
y = np.array(num)

# Non-linear least-squares fit of func to the data
popt, pcov = curve_fit(func, x, y)
# popt holds the fitted coefficients
a = popt[0]
b = popt[1]
yvals = func(x, a, b)  # fitted y values
print('系数a:', a)
print('系数b:', b)

# Plot the original points and the fitted curve
plot1 = plt.plot(x, y, 's', label='original values')
plot2 = plt.plot(x, yvals, 'r', label='polyfit values')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc=4)  # legend in the lower right corner
plt.title('curve_fit')
# Save BEFORE show(): once the window opened by show() has been closed the
# figure is gone, and a later savefig() would typically write a blank image.
plt.savefig('test2.png')
plt.show()
36 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/test03.py:
--------------------------------------------------------------------------------
 1 | #encoding=utf-8
 2 | #By：Eastmount CSDN
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from scipy.optimize import curve_fit
 6 | import pandas as pd  
 7 |  
 8 | #自定义函数 e指数形式
 9 | def func(x, a, b):
10 |     return a*pow(x,b)
11 |  
# Load the sample points from the CSV file and show what was read
data = pd.read_csv("data.csv")
print(data)
print(data.shape)
print(data.head(5))  # first five rows
x, y = data['x'], data['y']
print(x)
print(y)

# Non-linear least-squares fit; popt carries the two fitted coefficients
popt, pcov = curve_fit(func, x, y)
a, b = popt
yvals = func(x, a, b)  # fitted y values
print('系数a:', a)
print('系数b:', b)

# Plot the original points and the fitted curve
plot1 = plt.plot(x, y, 's', label='original values')
plot2 = plt.plot(x, yvals, 'r', label='polyfit values')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc=4)  # legend in the lower right corner
plt.title('curve_fit')
plt.savefig('test3.png')
plt.show()
40 |  
41 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/test04.py:
--------------------------------------------------------------------------------
 1 | #encoding:utf-8
 2 | #By：Eastmount CSDN
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from scipy.optimize import curve_fit
 6 |  
 7 | def func(x, a, b, c):
 8 |     return a * np.exp(-b * x) + c
 9 |  
# Synthetic data: the model evaluated at a=2.5, b=1.3, c=0.5 plus
# normally distributed noise
xdata = np.linspace(0, 4, 50)
y = func(xdata, 2.5, 1.3, 0.5)
y_noise = 0.2 * np.random.normal(size=xdata.size)
ydata = y + y_noise
plt.plot(xdata, ydata, 'b-', label='data')

# Fit twice: first unconstrained, then with the parameters bounded to
# 0 <= a <= 3, 0 <= b <= 2 and 0 <= c <= 1
fit_runs = [
    ('r-', 'fit', {}),
    ('g--', 'fit-with-bounds', {'bounds': (0, [3., 2., 1.])}),
]
for line_style, legend_label, fit_options in fit_runs:
    popt, pcov = curve_fit(func, xdata, ydata, **fit_options)
    plt.plot(xdata, func(xdata, *popt), line_style, label=legend_label)

plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
30 | 


--------------------------------------------------------------------------------
/blog14-curve_fit/test3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/test3.png


--------------------------------------------------------------------------------
/blog14-curve_fit/test4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/test4.png


--------------------------------------------------------------------------------
/blog15-imshow/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result01.png


--------------------------------------------------------------------------------
/blog15-imshow/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result02.png


--------------------------------------------------------------------------------
/blog15-imshow/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result03.png


--------------------------------------------------------------------------------
/blog15-imshow/test01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # By：Eastmount CSDN
 3 | from matplotlib import pyplot as plt
 4 |  
 5 | fig = plt.figure()
 6 | ax1 = fig.add_subplot(231)
 7 | ax2 = fig.add_subplot(232)
 8 | ax3 = fig.add_subplot(233)
 9 | ax4 = fig.add_subplot(234) 
10 | ax5 = fig.add_subplot(235)
11 | ax6 = fig.add_subplot(236)
12 | plt.grid(True)
13 | plt.show()
14 | 


--------------------------------------------------------------------------------
/blog15-imshow/test02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # By：Eastmount CSDN
 3 | import numpy as np
 4 | from pylab import *
 5 | from matplotlib import pyplot as plt
 6 |  
 7 | x = [1, 2, 3, 4]
 8 | y = [3, 5, 10, 25]
 9 |  
10 | #创建Figure
11 | fig = plt.figure()
12 |  
13 | #创建一个或多个子图(subplot绘图区才能绘图)
14 | ax1 = fig.add_subplot(231)
15 | plt.plot(x, y, marker='D') #绘图及选择子图
16 | plt.sca(ax1)
17 |   
18 | ax2 = fig.add_subplot(232)
19 | plt.scatter(x, y, marker='s', color='r') 
20 | plt.sca(ax2)
21 | plt.grid(True)
22 |  
23 | ax3 = fig.add_subplot(233)
24 | plt.bar(x, y, 0.5, color='c') #柱状图 width=0.5间距
25 | plt.sca(ax3)
26 |  
27 | ax4 = fig.add_subplot(234) 
28 | #高斯分布   
29 | mean = 0  #均值为0   
30 | sigma = 1 #标准差为1 (反应数据集中还是分散的值)  
31 | data = mean+sigma*np.random.randn(10000)
32 | plt.hist(data,40,normed=1,histtype='bar',facecolor='yellowgreen',alpha=0.75)
33 | plt.sca(ax4)
34 |  
35 | m = np.arange(-5.0, 5.0, 0.02)
36 | n = np.sin(m)
37 | ax5 = fig.add_subplot(235)
38 | plt.plot(m, n)
39 | plt.sca(ax5)
40 |  
41 | ax6 = fig.add_subplot(236)
42 | xlim(-2.5, 2.5) #设置x轴范围
43 | ylim(-1, 1)     #设置y轴范围
44 | plt.plot(m, n)
45 | plt.sca(ax6)
46 | plt.grid(True)
47 |  
48 | plt.show()
49 | 


--------------------------------------------------------------------------------
/blog15-imshow/test03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # By：Eastmount CSDN
3 | from matplotlib import pyplot as plt
4 |  
# Render a 3 x 2 grid of numbers as an image (one coloured cell per value)
X = [
    [1, 2],
    [3, 4],
    [5, 6],
]
plt.imshow(X)
plt.show()
8 | 


--------------------------------------------------------------------------------
/blog15-imshow/test04.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | # By：Eastmount CSDN
3 | from matplotlib import pyplot as plt  
4 |   
# Render a 3 x 2 grid of numbers as an image and add a colour bar for it
X = [
    [1, 2],
    [3, 4],
    [5, 6],
]
plt.imshow(X)
plt.colorbar()
plt.show()
9 | 


--------------------------------------------------------------------------------
/blog15-imshow/test05.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | #By：Eastmount CSDN
 3 | from matplotlib import pyplot as plt  
 4 |   
 5 | X = [[1,2],[3,4]]   
 6 |  
 7 | fig = plt.figure()
 8 | ax = fig.add_subplot(231)
 9 | ax.imshow(X)
10 |  
11 | ax = fig.add_subplot(232)
12 | ax.imshow(X, cmap=plt.cm.gray) #灰度
13 |  
14 | ax = fig.add_subplot(233)
15 | im = ax.imshow(X, cmap=plt.cm.spring) #春
16 | plt.colorbar(im)                
17 |  
18 | ax = fig.add_subplot(234)
19 | im = ax.imshow(X, cmap=plt.cm.summer)
20 | plt.colorbar(im, cax=None, ax=None, shrink=0.5) #长度为半
21 |  
22 | ax = fig.add_subplot(235)
23 | im = ax.imshow(X, cmap=plt.cm.autumn)
24 | plt.colorbar(im, shrink=0.5, ticks=[-1,0,1])
25 |  
26 | ax = fig.add_subplot(236)
27 | im = ax.imshow(X, cmap=plt.cm.winter)
28 | plt.colorbar(im, shrink=0.5)
29 |  
30 | plt.show()
31 | 


--------------------------------------------------------------------------------
/blog15-imshow/test06.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | #By：Eastmount CSDN
 3 | from matplotlib import pyplot as plt  
 4 |   
 5 | X = [[0, 0.25], [0.5, 0.75]]   
 6 |  
 7 |  
 8 | fig = plt.figure()
 9 | ax = fig.add_subplot(121)
10 | im = ax.imshow(X, cmap=plt.get_cmap('hot'))
11 | plt.colorbar(im, shrink=0.5)
12 |  
13 | ax = fig.add_subplot(122)
14 | im = ax.imshow(X, cmap=plt.get_cmap('hot'), interpolation='nearest',
15 |                vmin=0, vmax=1) 
16 | plt.colorbar(im, shrink=0.2)
17 | plt.show()
18 |  
19 | 


--------------------------------------------------------------------------------
/blog15-imshow/test07.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # By：Eastmount CSDN
 3 | import numpy as np
 4 | from matplotlib import pyplot as plt
 5 | from matplotlib import cm 
 6 | from matplotlib import axes
 7 |  
 8 | def draw_heatmap(data,xlabels,ylabels):
 9 |     #cmap=cm.Blues    
10 |     cmap=cm.get_cmap('rainbow',1000)
11 |     figure=plt.figure(facecolor='w')
12 |     ax=figure.add_subplot(1,1,1,position=[0.1,0.15,0.8,0.8])
13 |     ax.set_yticks(range(len(ylabels)))
14 |     ax.set_yticklabels(ylabels)
15 |     ax.set_xticks(range(len(xlabels)))
16 |     ax.set_xticklabels(xlabels)
17 |     vmax=data[0][0]
18 |     vmin=data[0][0]
19 |     for i in data:
20 |         for j in i:
21 |             if j>vmax:
22 |                 vmax=j
23 |             if j<vmin:
24 |                 vmin=j
25 |     map=ax.imshow(data,interpolation='nearest',cmap=cmap,aspect='auto',vmin=vmin,vmax=vmax)
26 |     cb=plt.colorbar(mappable=map,cax=None,ax=None,shrink=0.5)
27 |     plt.show()
28 |             
# Demo: heat map of a random 10 x 10 matrix with letter tick labels
a = np.random.rand(10, 10)
print(a)
xlabels = list('ABCDEFGHIJ')
ylabels = list('abcdefghij')
draw_heatmap(a, xlabels, ylabels)
34 | 


--------------------------------------------------------------------------------
/blog16-LR/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog16-LR/result01.png


--------------------------------------------------------------------------------
/blog16-LR/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog16-LR/result02.png


--------------------------------------------------------------------------------
/blog16-LR/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog16-LR/result03.png


--------------------------------------------------------------------------------
/blog16-LR/test01.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 |  
 4 | def Sigmoid(x):
 5 |     return 1.0 / (1.0 + np.exp(-x))
 6 |  
 7 | x= np.arange(-10, 10, 0.1)
 8 | h = Sigmoid(x)            #Sigmoid函数
 9 | plt.plot(x, h)
10 | plt.axvline(0.0, color='k')   #坐标轴上加一条竖直的线（0位置）
11 | plt.axhspan(0.0, 1.0, facecolor='1.0', alpha=1.0, ls='dotted')  
12 | plt.axhline(y=0.5, ls='dotted', color='k') 
13 | plt.yticks([0.0, 0.5, 1.0])  #y轴标度
14 | plt.ylim(-0.1, 1.1)          #y轴范围
15 | plt.show()  
16 | 


--------------------------------------------------------------------------------
/blog16-LR/test02.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | from sklearn.datasets import load_iris    #导入数据集iris
 4 |   
 5 | #载入数据集  
 6 | iris = load_iris()  
 7 | print(iris.data)          #输出数据集  
 8 | print(iris.target)        #输出真实标签  
 9 | #获取花卉两列数据集  
10 | DD = iris.data  
11 | X = [x[0] for x in DD]  
12 | print(X)
13 | Y = [x[1] for x in DD]  
14 | print(Y)  
15 |   
16 | #plt.scatter(X, Y, c=iris.target, marker='x')
17 | plt.scatter(X[:50], Y[:50], color='red', marker='o', label='setosa') #前50个样本
18 | plt.scatter(X[50:100], Y[50:100], color='blue', marker='x', label='versicolor') #中间50个
19 | plt.scatter(X[100:], Y[100:],color='green', marker='+', label='Virginica') #后50个样本
20 | plt.legend(loc=2) #左上角
21 | plt.show()
22 | 


--------------------------------------------------------------------------------
/blog16-LR/test03.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | from sklearn.datasets import load_iris   
 4 | from sklearn.linear_model import LogisticRegression 
 5 |  
 6 | #载入数据集
 7 | iris = load_iris()         
 8 | X = X = iris.data[:, :2]   #获取花卉两列数据集
 9 | Y = iris.target           
10 |  
11 | #逻辑回归模型
12 | lr = LogisticRegression(C=1e5)  
13 | lr.fit(X,Y)
14 |  
15 | #meshgrid函数生成两个网格矩阵
16 | h = .02
17 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
18 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
19 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
20 |  
21 | #pcolormesh函数将xx,yy两个网格矩阵和对应的预测结果Z绘制在图片上
22 | Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
23 | Z = Z.reshape(xx.shape)
24 | plt.figure(1, figsize=(8,6))
25 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
26 |  
27 | #绘制散点图
28 | plt.scatter(X[:50,0], X[:50,1], color='red',marker='o', label='setosa')
29 | plt.scatter(X[50:100,0], X[50:100,1], color='blue', marker='x', label='versicolor')
30 | plt.scatter(X[100:,0], X[100:,1], color='green', marker='s', label='Virginica') 
31 |  
32 | plt.xlabel('Sepal length')
33 | plt.ylabel('Sepal width')
34 | plt.xlim(xx.min(), xx.max())
35 | plt.ylim(yy.min(), yy.max())
36 | plt.xticks(())
37 | plt.yticks(())
38 | plt.legend(loc=2) 
39 | plt.show()
40 | 


--------------------------------------------------------------------------------
/blog17-networkx/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog17-networkx/result01.png


--------------------------------------------------------------------------------
/blog17-networkx/test01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import networkx as nx
 3 | import matplotlib.pyplot as plt
 4 |  
 5 | #定义有向图
 6 | DG = nx.DiGraph() 
 7 | 
 8 | #添加五个节点(列表)
 9 | DG.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
10 | print(DG.nodes())
11 | 
12 | #添加边(列表)
13 | DG.add_edges_from([('A', 'B'), ('A', 'C'), ('A', 'D'), ('D','A'),('E','A'),('E','D')])
14 | print(DG.edges())
15 | 
16 | #绘制图形 设置节点名显示\节点大小\节点颜色
17 | colors = ['red', 'green', 'blue', 'red', 'yellow']
18 | nx.draw(DG, with_labels=True, node_size=900, node_color = colors)
19 | plt.show()
20 | 


--------------------------------------------------------------------------------
/blog17-networkx/test02.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Thu Nov 02 10:33:58 2017
  4 | @author: eastmount CSDN 杨秀璋
  5 | """
  6 | import pandas as pd
  7 | import numpy as np
  8 | import codecs
  9 | import networkx as nx
 10 | import matplotlib.pyplot as plt
 11 |  
 12 |  
 13 | """ 第一步:读取数据并获取姓名 """
 14 | data = pd.read_csv("data.csv",encoding ="gb2312") #中文乱码
 15 | print data[:4]
 16 | print data[u'姓名'] #获取某一列数据
 17 | print type(data[u'姓名'])
 18 | name = []
 19 | for n in data[u'姓名']:
 20 |     name.append(n)
 21 | print name[0]
 22 |  
 23 |  
 24 | """ 第二步:计算共现矩阵 定义函数实现 """
 25 | a = np.zeros([2,3])
 26 | print a
 27 | print len(name)
 28 | word_vector = np.zeros([len(name),len(name)]) #共现矩阵
 29 |  
 30 | #1.计算学院共线矩阵
 31 | i = 0
 32 | while i<len(name):  #len(name)
 33 |     academy1 = data[u'学院'][i]
 34 |     j = i + 1
 35 |     while j<len(name):
 36 |         academy2 = data[u'学院'][j]
 37 |         if academy1==academy2: #学院相同
 38 |             word_vector[i][j] += 1
 39 |             word_vector[j][i] += 1
 40 |         j = j + 1   
 41 |     i = i + 1
 42 | print word_vector
 43 | np_data = np.array(word_vector)  #矩阵写入文件
 44 | pd_data = pd.DataFrame(np_data)
 45 | pd_data.to_csv('result.csv')
 46 | #2.计算大数据金融班级共线矩阵
 47 | #3.计算性别共线矩阵
 48 | #4.计算宿舍楼层共线矩阵
 49 | """
 50 | i = 0
 51 | while i<len(name):  #len(name)
 52 |     academy1 = data[u'宿舍楼层'][i]
 53 |     j = i + 1
 54 |     while j<len(name):
 55 |         academy2 = data[u'宿舍楼层'][j]
 56 |         if academy1==academy2: #相同
 57 |             word_vector[i][j] += 1
 58 |             word_vector[j][i] += 1
 59 |         j = j + 1   
 60 |     i = i + 1
 61 | print word_vector
 62 | """
 63 |  
 64 |  
 65 | """ 第三步:共现矩阵计算(学生1 学生2 共现词频)文件 """
 66 | words = codecs.open("word_node.txt", "a+", "utf-8")
 67 | i = 0
 68 | while i<len(name):  #len(name)
 69 |     student1 = name[i]
 70 |     j = i + 1
 71 |     while j<len(name):
 72 |         student2 = name[j]
 73 |         #判断学生是否共现 共现词频不为0则加入
 74 |         if word_vector[i][j]>0:
 75 |             words.write(student1 + " " + student2 + " " 
 76 |                 + str(word_vector[i][j]) + "\r\n")
 77 |         j = j + 1
 78 |     i = i + 1
 79 | words.close()
 80 |  
 81 |  
 82 | """ 第四步:图形生成 """
 83 | a = []
 84 | f = codecs.open('word_node.txt','r','utf-8')
 85 | line = f.readline()
 86 | print line
 87 | i = 0
 88 | A = []
 89 | B = []
 90 | while line!="":
 91 |     a.append(line.split())   #保存文件是以空格分离的
 92 |     print a[i][0],a[i][1]
 93 |     A.append(a[i][0])
 94 |     B.append(a[i][1])
 95 |     i = i + 1
 96 |     line = f.readline()
 97 | elem_dic = tuple(zip(A,B)) 
 98 | print type(elem_dic)
 99 | print list(elem_dic)
100 | f.close()
101 |  
102 | import matplotlib
103 | matplotlib.rcParams['font.sans-serif'] = ['SimHei']   
104 | matplotlib.rcParams['font.family']='sans-serif'
105 |  
106 | colors = ["red","green","blue","yellow"]
107 | G = nx.Graph()
108 | G.add_edges_from(list(elem_dic))
109 | #nx.draw(G,with_labels=True,pos=nx.random_layout(G),font_size=12,node_size=2000,node_color=colors) #alpha=0.3
110 | #pos=nx.spring_layout(G,iterations=50)
111 | pos=nx.random_layout(G)
112 | nx.draw_networkx_nodes(G, pos, alpha=0.2,node_size=1200,node_color=colors)
113 | nx.draw_networkx_edges(G, pos, node_color='r', alpha=0.3) #style='dashed'
114 | nx.draw_networkx_labels(G, pos, font_family='sans-serif', alpha=0.5) #font_size=5
115 | plt.show()
116 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog01-LR.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sklearn.linear_model import LinearRegression
 3 |  
 4 | #数据集 直径、价格
 5 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]]
 6 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]]
 7 | print(x)
 8 | print(y)
 9 | 
10 | clf = LinearRegression()
11 | clf.fit(x,y)
12 | pre = clf.predict([[12]])[0]
13 | print(u'预测直径为12英寸的价格: $%.2f' % pre)
14 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog02-LR.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sklearn.linear_model import LinearRegression
 3 |  
 4 | #数据集 直径、价格
 5 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]]
 6 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]]
 7 | print(x)
 8 | print(y)
 9 |  
10 | clf = LinearRegression()
11 | clf.fit(x,y)
12 | pre = clf.predict([[12]])[0]
13 | print('预测直径为12英寸的价格: $%.2f' % pre)
14 | x2 = [[0],[12],[15],[25]]
15 | y2 = clf.predict(x2)
16 |  
17 | import matplotlib.pyplot as plt
18 | plt.figure()
19 | plt.rcParams['font.sans-serif'] = ['SimHei'] #指定默认字体
20 | plt.title(u"线性回归预测Pizza直径和价格")
21 | plt.xlabel(u"x")
22 | plt.ylabel(u"price")
23 | plt.axis([0,25,0,25])
24 | plt.scatter(x,y,marker="s",s=20)
25 | plt.plot(x2,y2,"g-")
26 | plt.show()
27 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog03-boston.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #导入数据集boston
 3 | from sklearn.datasets import load_boston
 4 | import numpy as np
 5 | boston = load_boston()
 6 | print(boston.data.shape, boston.target.shape)
 7 | print(boston.data[0])
 8 | print(boston.target)
 9 | 
10 | #划分数据集
11 | boston_temp = boston.data[:, np.newaxis, 5]   
12 | x_train = boston_temp[:-100]      #训练样本  
13 | x_test = boston_temp[-100:]       #测试样本 后100行  
14 | y_train = boston.target[:-100]    #训练标记  
15 | y_test = boston.target[-100:]     #预测对比标记
16 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog04-boson.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sklearn.datasets import load_boston
 3 | import numpy as np
 4 | boston = load_boston()
 5 | print(boston.data.shape, boston.target.shape)
 6 |  
 7 | #划分数据集
 8 | boston_temp = boston.data[:, np.newaxis, 5]   
 9 | x_train = boston_temp[:-100]      #训练样本  
10 | x_test = boston_temp[-100:]       #测试样本 后100行  
11 | y_train = boston.target[:-100]    #训练标记  
12 | y_test = boston.target[-100:]     #预测对比标记
13 |  
14 | #回归分析
15 | from sklearn.linear_model import LinearRegression 
16 | clf = LinearRegression()  
17 | clf.fit(x_train, y_train)  
18 |  
19 | #算法评估
20 | pre = clf.predict(x_test)
21 | print("预测结果", pre)
22 | print("真实结果", y_test)
23 | cost = np.mean(y_test-pre)**2  
24 | print('平方和计算:', cost)
25 | print('系数', clf.coef_)  
26 | print('截距', clf.intercept_)
27 | print('方差', clf.score(x_test, y_test))
28 |  
29 | #绘图分析
30 | import matplotlib.pyplot  as plt
31 | plt.title(u'LinearRegression Boston')     
32 | plt.xlabel(u'x')                   
33 | plt.ylabel(u'price')          
34 | plt.scatter(x_test, y_test, color = 'black')  
35 | plt.plot(x_test, clf.predict(x_test), color='blue', linewidth = 3)
36 | for idx, m in enumerate(x_test):  
37 |     plt.plot([m, m],[y_test[idx],pre[idx]], 'r-')    
38 | plt.show()   
39 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog05-random.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import math
 3 | X = np.arange(0,50,0.2) 
 4 | print(X)
 5 | xArr = []
 6 | yArr = []
 7 | for n in X:
 8 |     xArr.append(n)
 9 |     y = 0.7*n + np.random.uniform(0,1)*math.sin(n)*2 - 3
10 |     yArr.append(y)
11 |  
12 | import matplotlib.pyplot as plt
13 | plt.plot(X, yArr, 'go')
14 | plt.show()
15 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog06-random.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import math
 4 |  
 5 | #随机数生成
 6 | X =  np.arange(0,50,0.2) 
 7 | print(X)
 8 | xArr = []
 9 | yArr = []
10 | for n in X:
11 |     xArr.append(n)
12 |     y = 0.7*n + np.random.uniform(0,1)*math.sin(n)*2 - 3
13 |     yArr.append(y)
14 |  
15 | #线性回归分析
16 | from sklearn.linear_model import LinearRegression
17 | clf = LinearRegression()
18 | print(clf)
19 | X =  np.array(X).reshape((len(X),1))     #list转化为数组
20 | yArr = np.array(yArr).reshape((len(X),1))
21 | clf.fit(X,yArr)
22 | pre = clf.predict(X)
23 |  
24 | import matplotlib.pyplot as plt
25 | plt.plot(X, yArr, 'go')
26 | plt.plot(X, pre, 'r', linewidth=3)
27 | plt.show()
28 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog07-3Drandom.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | from sklearn import linear_model
 4 | from mpl_toolkits.mplot3d import Axes3D
 5 | import matplotlib.pyplot as plt
 6 | import math
 7 |  
 8 | #linspace:开始值、终值和元素个数创建表示等差数列的一维数组
 9 | xx, yy = np.meshgrid(np.linspace(0,10,20), np.linspace(0,100,20))
10 | zz = 2.4 * xx + 4.5 * yy + np.random.randint(0,100,(20,20))
11 | 
12 | #构建成特征、值的形式
13 | X, Z = np.column_stack((xx.flatten(),yy.flatten())), zz.flatten()
14 | 
15 | #线性回归分析
16 | regr = linear_model.LinearRegression()
17 | regr.fit(X, Z)
18 | 
19 | #预测的一个特征
20 | x_test = np.array([[15.7, 91.6]])
21 | print(regr.predict(x_test))
22 | 
23 | #画图可视化分析
24 | fig = plt.figure()
25 | ax = fig.gca(projection='3d')
26 | ax.scatter(xx, yy, zz) #真实点
27 | 
28 | #拟合的平面
29 | ax.plot_wireframe(xx, yy, regr.predict(X).reshape(20,20))
30 | ax.plot_surface(xx, yy, regr.predict(X).reshape(20,20), alpha=0.3)
31 | plt.show()
32 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog08-PolynomialFeatures.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Sun Nov 26 23:31:16 2017
 4 | @author: yxz15
 5 | """
 6 |  
 7 | # -*- coding: utf-8 -*-
 8 | from sklearn.linear_model import LinearRegression
 9 |  
10 | #数据集 直径、价格
11 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]]
12 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]]
13 | print(x)
14 | print(y)
15 |  
16 | clf = LinearRegression()
17 | clf.fit(x,y)
18 | pre = clf.predict([[12]])[0]
19 | print(u'预测直径为12英寸的价格: $%.2f' % pre)
20 | x2 = [[0],[12],[15],[25]]
21 | y2 = clf.predict(x2)
22 |  
23 | import matplotlib.pyplot as plt
24 | import numpy as np
25 |  
26 | plt.figure()
27 | plt.axis([0,25,0,25])
28 | plt.scatter(x,y,marker="s",s=20)
29 | plt.plot(x2,y2,"g-")
30 |  
31 | #导入多项式回归模型
32 | from sklearn.preprocessing import PolynomialFeatures
33 | xx = np.linspace(0,25,100) #0到25等差数列
34 | quadratic_featurizer = PolynomialFeatures(degree = 2) #实例化一个二次多项式
35 | x_train_quadratic = quadratic_featurizer.fit_transform(x) #用二次多项式多样本x做变换
36 | X_test_quadratic = quadratic_featurizer.transform(x2)
37 | regressor_quadratic = LinearRegression()
38 | regressor_quadratic.fit(x_train_quadratic, y)
39 | xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))# 把训练好X值的多项式特征实例应用到一系列点上,形成矩阵
40 |  
41 | plt.plot(xx, regressor_quadratic.predict(xx_quadratic),
42 |          label="$y = ax^2 + bx + c$",linewidth=2,color="r")
43 | plt.legend()
44 | plt.show()
45 | 


--------------------------------------------------------------------------------
/blog18-Regression/blog09-PolynomialFeatures.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Sun Nov 26 23:31:16 2017
 4 | @author: yxz15
 5 | """
 6 |  
 7 | # -*- coding: utf-8 -*-
 8 | from sklearn.linear_model import LinearRegression
 9 |  
10 | #数据集 直径、价格
11 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]]
12 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]]
13 | print(x)
14 | print(y)
15 |  
16 | clf = LinearRegression()
17 | clf.fit(x,y)
18 | pre = clf.predict([[12]])[0]
19 | print(u'预测直径为12英寸的价格: $%.2f' % pre)
20 | x2 = [[0],[12],[15],[25]]
21 | y2 = clf.predict(x2)
22 |  
23 | import matplotlib.pyplot as plt
24 | import numpy as np
25 |  
26 | plt.figure()
27 | plt.axis([0,25,0,25])
28 | plt.scatter(x,y,marker="s",s=20)
29 | plt.plot(x2,y2,"g-")
30 | 
31 | #导入多项式回归模型
32 | from sklearn.preprocessing import PolynomialFeatures
33 | xx = np.linspace(0,25,100) #0到25等差数列
34 | quadratic_featurizer = PolynomialFeatures(degree = 4) #实例化一个二次多项式
35 | x_train_quadratic = quadratic_featurizer.fit_transform(x) #用二次多项式多样本x做变换
36 | X_test_quadratic = quadratic_featurizer.transform(x2)
37 | regressor_quadratic = LinearRegression()
38 | regressor_quadratic.fit(x_train_quadratic, y)
39 | xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))# 把训练好X值的多项式特征实例应用到一系列点上,形成矩阵
40 |  
41 | plt.plot(xx, regressor_quadratic.predict(xx_quadratic),
42 |          label="$y = ax^4 + bx + c$",linewidth=2,color="r")
43 | plt.legend()
44 | plt.show()
45 | 


--------------------------------------------------------------------------------
/blog18-Regression/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result01.png


--------------------------------------------------------------------------------
/blog18-Regression/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result02.png


--------------------------------------------------------------------------------
/blog18-Regression/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result03.png


--------------------------------------------------------------------------------
/blog18-Regression/result04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result04.png


--------------------------------------------------------------------------------
/blog18-Regression/result05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result05.png


--------------------------------------------------------------------------------
/blog18-Regression/result06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result06.png


--------------------------------------------------------------------------------
/blog19-Iris/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result01.png


--------------------------------------------------------------------------------
/blog19-Iris/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result02.png


--------------------------------------------------------------------------------
/blog19-Iris/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result03.png


--------------------------------------------------------------------------------
/blog19-Iris/result04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result04.png


--------------------------------------------------------------------------------
/blog19-Iris/result05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result05.png


--------------------------------------------------------------------------------
/blog19-Iris/result06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result06.png


--------------------------------------------------------------------------------
/blog19-Iris/result07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result07.png


--------------------------------------------------------------------------------
/blog19-Iris/result08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result08.png


--------------------------------------------------------------------------------
/blog19-Iris/result09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result09.png


--------------------------------------------------------------------------------
/blog19-Iris/test01.py:
--------------------------------------------------------------------------------
 1 | #导入数据集iris  
 2 | from sklearn.datasets import load_iris
 3 | 
 4 | #载入数据集  
 5 | iris = load_iris()
 6 | 
 7 | #输出数据集  
 8 | print(iris.data)
 9 | 
10 | #输出真实标签  
11 | print(iris.target)
12 | print(len(iris.target))
13 | 
14 | #150个样本 每个样本4个特征  
15 | print(iris.data.shape)
16 | 


--------------------------------------------------------------------------------
/blog19-Iris/test02-hist.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | #导入数据集iris  
 5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 7 | 
 8 | #读取csv数据
 9 | dataset = pandas.read_csv(url, names=names)
10 | print(dataset.describe())
11 | 
12 | #直方图 histograms
13 | dataset.hist()
14 | plt.show()
15 | 
16 | 


--------------------------------------------------------------------------------
/blog19-Iris/test03-plot.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | #导入数据集iris  
 5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 7 | 
 8 | #读取csv数据
 9 | dataset = pandas.read_csv(url, names=names)
10 | print(dataset.describe())
11 | 
12 | dataset.plot(x='sepal-length', y='sepal-width', kind='scatter')
13 | plt.show()
14 | 


--------------------------------------------------------------------------------
/blog19-Iris/test04-kde.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | #导入数据集iris
 5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 7 | 
 8 | #读取csv数据
 9 | dataset = pandas.read_csv(url, names=names)
10 | print(dataset.describe())
11 | 
12 | dataset.plot(kind='kde')
13 | plt.show()
14 | 


--------------------------------------------------------------------------------
/blog19-Iris/test05-box.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 6 | 
 7 | #读取csv数据
 8 | dataset = pandas.read_csv(url, names=names)
 9 | print(dataset.describe())
10 | 
11 | dataset.plot(kind='kde')
12 | dataset.plot(kind='box', subplots=True, layout=(2,2), 
13 |              sharex=False, sharey=False)
14 | plt.show()
15 | 


--------------------------------------------------------------------------------
/blog19-Iris/test06-box.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 6 | dataset = pandas.read_csv(url, names=names)
 7 |  
 8 | from pandas.plotting import radviz
 9 | radviz(dataset, 'class')
10 |  
11 | from pandas.plotting import andrews_curves
12 | andrews_curves(dataset, 'class')
13 |  
14 | from pandas.plotting import parallel_coordinates
15 | parallel_coordinates(dataset, 'class')
16 | plt.show()
17 | 


--------------------------------------------------------------------------------
/blog19-Iris/test07-show.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
 6 | dataset = pandas.read_csv(url, names=names)
 7 |  
 8 | from pandas.plotting import scatter_matrix
 9 | scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')
10 | plt.show()
11 | 


--------------------------------------------------------------------------------
/blog19-Iris/test08-LR.py:
--------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | hua = load_iris()
 3 | #获取花瓣的长和宽
 4 | x = [n[0] for n in hua.data]
 5 | y = [n[1] for n in hua.data]
 6 | 
 7 | import numpy as np #转换成数组
 8 | x = np.array(x).reshape(len(x),1)
 9 | y = np.array(y).reshape(len(y),1)
10 |  
11 | from sklearn.linear_model import LinearRegression
12 | clf = LinearRegression()
13 | clf.fit(x,y)
14 | pre = clf.predict(x)
15 |  
16 | #第三步 画图
17 | import matplotlib.pyplot as plt
18 | plt.scatter(x,y,s=100)
19 | plt.plot(x,pre,"r-",linewidth=4)
20 | for idx, m in enumerate(x):
21 |     plt.plot([m,m],[y[idx],pre[idx]], 'g-')
22 | plt.show()
23 | 


--------------------------------------------------------------------------------
/blog19-Iris/test09-Kmeans.py:
--------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | from sklearn.tree import DecisionTreeClassifier
 3 | iris = load_iris()
 4 | clf = DecisionTreeClassifier()
 5 | clf.fit(iris.data, iris.target)
 6 | print(clf)
 7 | predicted = clf.predict(iris.data)
 8 | 
 9 | #获取花卉两列数据集  
10 | X = iris.data
11 | L1 = [x[0] for x in X]
12 | print(L1)
13 | L2 = [x[1] for x in X]
14 | print(L2)
15 | 
16 | import numpy as np
17 | import matplotlib.pyplot as plt
18 | plt.scatter(L1, L2, c=predicted, marker='x') #cmap=plt.cm.Paired
19 | plt.title("DTC")
20 | plt.show()
21 | 


--------------------------------------------------------------------------------
/blog19-Iris/test10-Kmeans.py:
--------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | from sklearn.tree import DecisionTreeClassifier
 3 | import numpy as np
 4 | 
 5 | iris = load_iris()
 6 | #训练集
 7 | train_data = np.concatenate((iris.data[0:40, :], iris.data[50:90, :], iris.data[100:140, :]), axis = 0)
 8 | train_target = np.concatenate((iris.target[0:40], iris.target[50:90], iris.target[100:140]), axis = 0)
 9 | #测试集
10 | test_data = np.concatenate((iris.data[40:50, :], iris.data[90:100, :], iris.data[140:150, :]), axis = 0)
11 | test_target = np.concatenate((iris.target[40:50], iris.target[90:100], iris.target[140:150]), axis = 0)
12 | 
13 | #训练
14 | clf = DecisionTreeClassifier()
15 | clf.fit(train_data, train_target)
16 | predict_target = clf.predict(test_data)
17 | print(predict_target)
18 | 
19 | #预测结果与真实结果比对
20 | print(sum(predict_target == test_target))
21 | 
22 | #输出准确率 召回率 F值
23 | from sklearn import metrics
24 | print(metrics.classification_report(test_target,predict_target))
25 | print(metrics.confusion_matrix(test_target,predict_target))
26 | X = test_data
27 | L1 = [n[0] for n in X]
28 | print(L1)
29 | L2 = [n[1] for n in X]
30 | print(L2)
31 | 
32 | import matplotlib.pyplot as plt
33 | plt.scatter(L1, L2, c=predict_target, marker='x')  #cmap=plt.cm.Paired
34 | plt.title("DecisionTreeClassifier")  
35 | plt.show()
36 | 


--------------------------------------------------------------------------------
/blog19-Iris/test11-Kmeans.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sklearn.datasets import load_iris
 3 | from sklearn.cluster import KMeans
 4 | iris = load_iris()
 5 | clf = KMeans()
 6 | clf.fit(iris.data, iris.target)
 7 | print(clf)
 8 | predicted = clf.predict(iris.data)
 9 | 
10 | #获取花卉两列数据集  
11 | X = iris.data
12 | L1 = [x[0] for x in X]
13 | print(L1)
14 | L2 = [x[1] for x in X]  
15 | print(L2)
16 | 
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | plt.scatter(L1, L2, c=predicted, marker='s',s=200,cmap=plt.cm.Paired)
20 | plt.title("Iris")
21 | plt.show()
22 | 


--------------------------------------------------------------------------------
/blog20-KNN/blog01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np  
 3 | from sklearn.neighbors import KNeighborsClassifier  
 4 |  
 5 | X = np.array([[-1,-1],[-2,-2],[1,2], [1,1],[-3,-4],[3,2]])
 6 | Y = [0,0,1,1,0,1]
 7 | x = [[4,5],[-4,-3],[2,6]]
 8 | knn = KNeighborsClassifier(n_neighbors=3, algorithm="ball_tree")
 9 | knn.fit(X,Y)
10 | pre = knn.predict(x)
11 | print(pre)
12 | 


--------------------------------------------------------------------------------
/blog20-KNN/blog02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os 
 3 | import numpy as np
 4 | data = np.loadtxt("wine.txt",dtype=str,delimiter=",")
 5 | print(data)
 6 |  
 7 | yy, x = np.split(data, (1,), axis=1)
 8 | print(yy.shape, x.shape)
 9 | print(x)
10 | print(yy[:5])
11 | 


--------------------------------------------------------------------------------
/blog20-KNN/blog03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os 
 3 | import numpy as np
 4 | data = np.loadtxt("wine.txt",dtype=str,delimiter=",")
 5 | print(data)
 6 | print(type(data))
 7 |  
 8 | yy, x = np.split(data, (1,), axis=1)
 9 | print(yy.shape, x.shape)
10 | print(x)
11 | print(yy[:5])
12 |  
13 | #从字符型转换为Int整型
14 | X = x.astype(int)
15 | print(X)
16 | #字母转换为数字
17 | y = []
18 | i = 0
19 | print(len(yy))
20 | while i<len(yy):
21 |     if yy[i]=="L":
22 |         y.append(0)
23 |     elif yy[i]=="B":
24 |         y.append(1)
25 |     elif yy[i]=="R":
26 |         y.append(2)
27 |     i = i + 1
28 | print(y[:5])
29 |  
30 | #KNN分析
31 | from sklearn import neighbors
32 | knn = neighbors.KNeighborsClassifier()
33 | print(knn)
34 | knn.fit(X,y)
35 | pre = knn.predict(X)
36 | print(pre)
37 |  
38 | #可视化分析
39 | import matplotlib.pyplot as plt
40 | L1 = [x[0] for x in X]
41 | L2 = [x[2] for x in X]
42 | plt.scatter(L1, L2, c=pre, marker='s', s=200)
43 | plt.show()
44 | 


--------------------------------------------------------------------------------
/blog20-KNN/blog04.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os 
 3 | import numpy as np
 4 |  
 5 | #第一步 导入数据集
 6 | data = np.loadtxt("wine.txt",dtype=str,delimiter=",")
 7 | print(data)
 8 | print(type(data))
 9 | yy, x = np.split(data, (1,), axis=1)
10 | print(yy.shape, x.shape)
11 | #从字符型转换为Int整型
12 | X = x.astype(int)
13 | #获取x两列数据,方便绘图 对应x、y轴
14 | X = X[:, 1:3]  
15 | print(X)
16 | #字母转换为数字
17 | y = []
18 | i = 0
19 | print(len(yy))
20 | while i<len(yy):
21 |     if yy[i]=="L":
22 |         y.append(0)
23 |     elif yy[i]=="B":
24 |         y.append(1)
25 |     elif yy[i]=="R":
26 |         y.append(2)
27 |     i = i + 1
28 | print(y[:5])
29 |  
30 | #第二步 KNN分析
31 | from sklearn import neighbors
32 | knn = neighbors.KNeighborsClassifier()
33 | print(knn)
34 | knn.fit(X,y)
35 | pre = knn.predict(X)
36 | print(pre)
37 |  
38 | #第三步 数据评估  
39 | from sklearn import metrics  
40 | print(sum(pre == y))   #预测结果与真实结果比对
41 | print(metrics.classification_report(y,pre))   #输出准确率 召回率 F值  
42 | print(metrics.confusion_matrix(y,pre)) 
43 |  
44 | #第四步 创建网格 
45 | x1_min, x1_max = X[:,0].min()-0.1, X[:,0].max()+0.1  #第一列
46 | x2_min, x2_max = X[:,1].min()-0.1, X[:,1].max()+0.1  #第二列
47 | xx, yy = np.meshgrid(np.arange(x1_min, x1_max, 0.1),  
48 |                      np.arange(x2_min, x2_max, 0.1))  #生成网格型数据
49 | print(xx.shape, yy.shape) #(42L, 42L) (42L, 42L)
50 | print(xx.ravel().shape, yy.ravel().shape)  #(1764L,) (1764L,)
51 | print(np.c_[xx.ravel(), yy.ravel()].shape) #合并 (1764L, 2L)
52 | #ravel()拉直函数
53 | z = knn.predict(np.c_[xx.ravel(), yy.ravel()])    
54 | print(z)
55 |  
56 | #第五步 绘图可视化
57 | from matplotlib.colors import ListedColormap
58 | import matplotlib.pyplot as plt
59 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])   #颜色Map
60 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
61 | plt.figure()
62 | z = z.reshape(xx.shape)                
63 | plt.pcolormesh(xx, yy, z, cmap=cmap_light)
64 | plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, s=50)
65 | plt.show()
66 | 


--------------------------------------------------------------------------------
/blog20-KNN/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog20-KNN/result.png


--------------------------------------------------------------------------------
/blog20-KNN/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog20-KNN/result02.png


--------------------------------------------------------------------------------
/blog21-NB/blog01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | from sklearn.naive_bayes import GaussianNB
 4 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
 5 | Y = np.array([1, 1, 1, 2, 2, 2])
 6 | clf = GaussianNB()
 7 | clf.fit(X, Y)      
 8 | pre = clf.predict(X)
 9 | print("数据集预测结果:", pre)
10 | print(clf.predict([[-0.8, -1]]))
11 | 
12 | clf_pf = GaussianNB()
13 | clf_pf.partial_fit(X, Y, np.unique(Y)) #增加一部分样本
14 | print(clf_pf.predict([[-0.8, -1]]))
15 | 


--------------------------------------------------------------------------------
/blog21-NB/blog02.py:
--------------------------------------------------------------------------------
 1 | import numpy as np  
 2 | from sklearn.naive_bayes import GaussianNB  
 3 | X = np.array([[-1,-1], [-2,-2], [-3,-3], [-4,-4], [-5,-5], 
 4 |               [1,1], [2,2], [3,3]])  
 5 | y = np.array([1, 1, 1, 1, 1, 2, 2, 2])  
 6 | clf = GaussianNB()  
 7 | clf.partial_fit(X,y,classes=[1,2],
 8 |                 sample_weight=np.array([0.05,0.05,0.1,0.1,0.1,0.2,0.2,0.2]))  
 9 | print(clf.class_prior_)
10 | print(clf.predict([[-6,-6],[4,5],[2,5]]))
11 | print(clf.predict_proba([[-6,-6],[4,5],[2,5]]))
12 | 


--------------------------------------------------------------------------------
/blog21-NB/blog03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #第一部分 载入数据集
 3 | import pandas as pd
 4 | X = pd.read_csv("seed_x.csv")
 5 | Y = pd.read_csv("seed_y.csv")
 6 | print(X)
 7 | print(Y)
 8 |  
 9 | #第二部分 导入模型
10 | from sklearn.naive_bayes import GaussianNB  
11 | clf = GaussianNB()
12 | clf.fit(X, Y)      
13 | pre = clf.predict(X)
14 | print("数据集预测结果:", pre)
15 |  
16 | #第三部分 降维处理
17 | from sklearn.decomposition import PCA
18 | pca = PCA(n_components=2)
19 | newData = pca.fit_transform(X)
20 | print(newData[:4])
21 |  
22 | #第四部分 绘制图形
23 | import matplotlib.pyplot as plt
24 | L1 = [n[0] for n in newData]
25 | L2 = [n[1] for n in newData]
26 | plt.scatter(L1,L2,c=pre,s=200)
27 | plt.show()
28 | 


--------------------------------------------------------------------------------
/blog21-NB/blog04-getdata.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 |  
 5 | data = pd.read_csv("data.csv",encoding='gbk')
 6 | print(data)
 7 |  
 8 | #取表中的第1列的所有值
 9 | print("获取第一列内容")
10 | col = data.iloc[:,0]  
11 | #取表中所有值  
12 | arrs = col.values
13 | for a in arrs:
14 |     print(a)
15 | 


--------------------------------------------------------------------------------
/blog21-NB/blog05-fenci.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 | import jieba
 5 |  
 6 | data = pd.read_csv("data.csv",encoding='gbk')
 7 | print(data)
 8 |  
 9 | #取表中的第1列的所有值
10 | print("获取第一列内容")
11 | col = data.iloc[:,0]  
12 | #取表中所有值  
13 | arrs = col.values
14 | #去除停用词  
15 | stopwords = {}.fromkeys(['，', '。', '！', '这', '我', '非常'])
16 |  
17 | print("\n中文分词后结果:")
18 | for a in arrs:
19 |     #print a
20 |     seglist = jieba.cut(a,cut_all=False)     #精确模式  
21 |     final = ''
22 |     for seg in seglist:
23 |         if seg not in stopwords: #不是停用词的保留
24 |             final += seg
25 |     seg_list = jieba.cut(final, cut_all=False) 
26 |     output = ' '.join(list(seg_list))         #空格拼接
27 |     print(output)
28 | 


--------------------------------------------------------------------------------
/blog21-NB/blog06-static.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 | import jieba
 5 |  
 6 | data = pd.read_csv("data.csv",encoding='gbk')
 7 | print(data)
 8 |  
 9 | #取表中的第1列的所有值
10 | print("获取第一列内容")
11 | col = data.iloc[:,0]  
12 | #取表中所有值  
13 | arrs = col.values
14 | #去除停用词  
15 | stopwords = {}.fromkeys(['，', '。', '！', '这', '我', '非常'])
16 |  
17 | print("\n中文分词后结果:")
18 | corpus = []
19 | for a in arrs:
20 |     #print a
21 |     seglist = jieba.cut(a,cut_all=False)     #精确模式  
22 |     final = ''
23 |     for seg in seglist:
24 |         if seg not in stopwords: #不是停用词的保留
25 |             final += seg
26 |     seg_list = jieba.cut(final, cut_all=False) 
27 |     output = ' '.join(list(seg_list))         #空格拼接
28 |     print(output)
29 |     corpus.append(output)
30 |  
31 | #计算词频
32 | from sklearn.feature_extraction.text import CountVectorizer
33 | from sklearn.feature_extraction.text import TfidfTransformer
34 |   
35 | vectorizer = CountVectorizer() #将文本中的词语转换为词频矩阵  
36 | X = vectorizer.fit_transform(corpus) #计算个词语出现的次数    
37 | word = vectorizer.get_feature_names() #获取词袋中所有文本关键词  
38 | for w in word: #查看词频结果
39 |     print(w,)
40 | print(X.toarray())  
41 | 


--------------------------------------------------------------------------------
/blog21-NB/blog07-classifier.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 | import jieba
 5 |  
 6 | #http://blog.csdn.net/eastmount/article/details/50323063
 7 | #http://blog.csdn.net/eastmount/article/details/50256163
 8 |  
 9 | ####################################
10 | #         第一步 读取数据及分词
11 | #
12 | data = pd.read_csv("data.csv",encoding='gbk')
13 | print(data)
14 |  
15 | #取表中的第1列的所有值
16 | print("获取第一列内容")
17 | col = data.iloc[:,0]  
18 | #取表中所有值  
19 | arrs = col.values
20 |  
21 | #去除停用词  
22 | stopwords = {}.fromkeys(['，', '。', '！', '这', '我', '非常'])
23 |  
24 | print("\n中文分词后结果:")
25 | corpus = []
26 | for a in arrs:
27 |     #print a
28 |     seglist = jieba.cut(a,cut_all=False)     #精确模式  
29 |     final = ''
30 |     for seg in seglist:
31 |         if seg not in stopwords: #不是停用词的保留
32 |             final += seg
33 |     seg_list = jieba.cut(final, cut_all=False) 
34 |     output = ' '.join(list(seg_list))         #空格拼接
35 |     print(output)
36 |     corpus.append(output)
37 |  
38 | ####################################
39 | #         第二步 计算词频
40 | #
41 | from sklearn.feature_extraction.text import CountVectorizer
42 | from sklearn.feature_extraction.text import TfidfTransformer
43 |   
44 | vectorizer = CountVectorizer() #将文本中的词语转换为词频矩阵  
45 | X = vectorizer.fit_transform(corpus) #计算个词语出现的次数    
46 | word = vectorizer.get_feature_names() #获取词袋中所有文本关键词  
47 | for w in word: #查看词频结果
48 |     print(w)
49 | print(X.toarray())
50 |  
51 |  
52 | ####################################
53 | #         第三步 数据分析
54 | #
55 | from sklearn.naive_bayes import MultinomialNB  
56 | from sklearn.metrics import precision_recall_curve  
57 | from sklearn.metrics import classification_report
58 |  
59 | #使用前8行数据集进行训练，最后两行数据集用于预测
60 | print("\n\n数据分析:")
61 | X = X.toarray()
62 | x_train = X[:8]
63 | x_test = X[8:]
64 | #1表示好评 0表示差评
65 | y_train = [1,1,0,0,1,0,0,1]
66 | y_test = [1,0]
67 |  
68 | #调用MultinomialNB分类器  
69 | clf = MultinomialNB().fit(x_train, y_train)
70 | pre = clf.predict(x_test)
71 | print("预测结果:",pre)
72 | print("真实结果:",y_test)
73 |  
74 | from sklearn.metrics import classification_report
75 | print(classification_report(y_test, pre))
76 | 
77 | ####################################
78 | #降维绘制图形
79 | from sklearn.decomposition import PCA
80 | pca = PCA(n_components=2)
81 | newData = pca.fit_transform(X)
82 | print(newData)
83 |  
84 | pre = clf.predict(X)
85 | Y = [1,1,0,0,1,0,0,1,1,0]
86 | import matplotlib.pyplot as plt
87 | L1 = [n[0] for n in newData]
88 | L2 = [n[1] for n in newData]
89 | plt.scatter(L1,L2,c=pre,s=200)
90 | plt.show()
91 | 
92 | 


--------------------------------------------------------------------------------
/blog21-NB/data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog21-NB/data.csv


--------------------------------------------------------------------------------
/blog21-NB/data_preprocess.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import csv
 3 | 
 4 | fw1 = open('seed_x.csv', 'w', newline='')
 5 | fw2 = open('seed_y.csv', 'w', newline='')
 6 | writer1 = csv.writer(fw1)
 7 | writer2 = csv.writer(fw2)
 8 | 
 9 | with open('seed.txt','r',encoding='utf-8') as f:
10 |     for data in f.readlines():
11 |         #print(data)
12 |         value = data.split(" ")
13 |         x0 = value[0]
14 |         x1 = value[1]
15 |         x2 = value[2]
16 |         x3 = value[3]
17 |         x4 = value[4]
18 |         x5 = value[5]
19 |         x6 = value[6]
20 |         y = value[7]
21 |         
22 |         #文件写入
23 |         writer1.writerow([x0,x1,x2,x3,x4,x5,x6])
24 |         writer2.writerow(y)
25 | 
26 | fw1.close()
27 | fw2.close()
28 | 


--------------------------------------------------------------------------------
/blog21-NB/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog21-NB/result.png


--------------------------------------------------------------------------------
/blog21-NB/result2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog21-NB/result2.png


--------------------------------------------------------------------------------
/blog21-NB/seed_y.csv:
--------------------------------------------------------------------------------
  1 | 5
  2 | 1
  3 | 1
  4 | 1
  5 | 1
  6 | 1
  7 | 1
  8 | 1
  9 | 1
 10 | 1
 11 | 1
 12 | 1
 13 | 1
 14 | 1
 15 | 1
 16 | 1
 17 | 1
 18 | 1
 19 | 1
 20 | 1
 21 | 1
 22 | 1
 23 | 1
 24 | 1
 25 | 1
 26 | 1
 27 | 1
 28 | 1
 29 | 1
 30 | 1
 31 | 1
 32 | 1
 33 | 1
 34 | 1
 35 | 1
 36 | 1
 37 | 1
 38 | 1
 39 | 1
 40 | 1
 41 | 1
 42 | 1
 43 | 1
 44 | 1
 45 | 1
 46 | 1
 47 | 1
 48 | 1
 49 | 1
 50 | 1
 51 | 1
 52 | 1
 53 | 1
 54 | 1
 55 | 1
 56 | 1
 57 | 1
 58 | 1
 59 | 1
 60 | 1
 61 | 1
 62 | 1
 63 | 1
 64 | 1
 65 | 1
 66 | 1
 67 | 1
 68 | 1
 69 | 1
 70 | 1
 71 | 2
 72 | 2
 73 | 2
 74 | 2
 75 | 2
 76 | 2
 77 | 2
 78 | 2
 79 | 2
 80 | 2
 81 | 2
 82 | 2
 83 | 2
 84 | 2
 85 | 2
 86 | 2
 87 | 2
 88 | 2
 89 | 2
 90 | 2
 91 | 2
 92 | 2
 93 | 2
 94 | 2
 95 | 2
 96 | 2
 97 | 2
 98 | 2
 99 | 2
100 | 2
101 | 2
102 | 2
103 | 2
104 | 2
105 | 2
106 | 2
107 | 2
108 | 2
109 | 2
110 | 2
111 | 2
112 | 2
113 | 2
114 | 2
115 | 2
116 | 2
117 | 2
118 | 2
119 | 2
120 | 2
121 | 2
122 | 2
123 | 2
124 | 2
125 | 2
126 | 2
127 | 2
128 | 2
129 | 2
130 | 2
131 | 2
132 | 2
133 | 2
134 | 2
135 | 2
136 | 2
137 | 2
138 | 2
139 | 2
140 | 2
141 | 3
142 | 3
143 | 3
144 | 3
145 | 3
146 | 3
147 | 3
148 | 3
149 | 3
150 | 3
151 | 3
152 | 3
153 | 3
154 | 3
155 | 3
156 | 3
157 | 3
158 | 3
159 | 3
160 | 3
161 | 3
162 | 3
163 | 3
164 | 3
165 | 3
166 | 3
167 | 3
168 | 3
169 | 3
170 | 3
171 | 3
172 | 3
173 | 3
174 | 3
175 | 3
176 | 3
177 | 3
178 | 3
179 | 3
180 | 3
181 | 3
182 | 3
183 | 3
184 | 3
185 | 3
186 | 3
187 | 3
188 | 3
189 | 3
190 | 3
191 | 3
192 | 3
193 | 3
194 | 3
195 | 3
196 | 3
197 | 3
198 | 3
199 | 3
200 | 3
201 | 3
202 | 3
203 | 3
204 | 3
205 | 3
206 | 3
207 | 3
208 | 3
209 | 3
210 | 3
211 | 


--------------------------------------------------------------------------------
/blog22-Basemap/001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/001.png


--------------------------------------------------------------------------------
/blog22-Basemap/002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/002.png


--------------------------------------------------------------------------------
/blog22-Basemap/003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/003.png


--------------------------------------------------------------------------------
/blog22-Basemap/004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/004.png


--------------------------------------------------------------------------------
/blog22-Basemap/005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/005.png


--------------------------------------------------------------------------------
/blog22-Basemap/006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/006.png


--------------------------------------------------------------------------------
/blog22-Basemap/basemap下载.txt:
--------------------------------------------------------------------------------
1 | https://www.lfd.uci.edu/~gohlke/pythonlibs/#basemap
2 | 
3 | - pip install pyproj
4 | - pip install basemap-1.2.2-cp37-cp37m-win32.whl
5 | - pip install basemap-1.1.0-cp27-cp27m-win_amd64.whl


--------------------------------------------------------------------------------
/blog22-Basemap/blog-001.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from mpl_toolkits.basemap import Basemap
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | # 设置basemap-Lambert Conformal 
 6 | m = Basemap(width=12000000,height=9000000,projection='lcc',
 7 |             resolution='c',lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
 8 | 
 9 | # 绘制海岸线
10 | m.drawcoastlines()
11 | # 在地图周围绘制边界并填充背景aqua（这个背景最终成为海洋的颜色）
12 | # 将大洲绘制在最上面
13 | m.drawmapboundary(fill_color='aqua')
14 | 
15 | # 填充大陆coral颜色,并设置湖泊颜色为blue
16 | m.fillcontinents(color='coral',lake_color='blue')
17 | plt.show()
18 | 


--------------------------------------------------------------------------------
/blog22-Basemap/blog-002.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from mpl_toolkits.basemap import Basemap
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | # 设置basemap Lambert-Conformal 
 6 | # 设置分辨率参数resolution=None 跳过处理边界数据集
 7 | m = Basemap(width=12000000,height=9000000,projection='lcc',
 8 |             resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
 9 | 
10 | # 为地图背景绘制海陆罩
11 | # lakes=True 意味着内陆湖和海洋颜色一致
12 | m.drawlsmask(land_color='coral',ocean_color='aqua',lakes=True)
13 | plt.show()
14 | 


--------------------------------------------------------------------------------
/blog22-Basemap/blog-003.py:
--------------------------------------------------------------------------------
1 | from mpl_toolkits.basemap import Basemap
2 | import matplotlib.pyplot as plt
3 | 
4 | m = Basemap(width=12000000,height=9000000,projection='lcc',
5 |             resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
6 | m.bluemarble()
7 | plt.show()
8 | 


--------------------------------------------------------------------------------
/blog22-Basemap/blog-004.py:
--------------------------------------------------------------------------------
1 | from mpl_toolkits.basemap import Basemap
2 | import matplotlib.pyplot as plt
3 | 
4 | m = Basemap(width=12000000,height=9000000,projection='lcc',
5 |             resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
6 | m.shadedrelief()
7 | plt.show()
8 | 


--------------------------------------------------------------------------------
/blog22-Basemap/blog-005.py:
--------------------------------------------------------------------------------
1 | from mpl_toolkits.basemap import Basemap
2 | import matplotlib.pyplot as plt
3 | 
4 | m = Basemap(width=12000000,height=9000000,projection='lcc',
5 |             resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
6 | m.etopo()
7 | plt.show()
8 | 


--------------------------------------------------------------------------------
/blog22-Basemap/blog-006.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog22-Basemap/blog-006.py


--------------------------------------------------------------------------------
/blog22-Basemap/blog-007.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from mpl_toolkits.basemap import Basemap
3 | import matplotlib.pyplot as plt
4 | 
5 | m = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
6 |             llcrnrlon=-180, urcrnrlon=180)
7 | m.drawcoastlines()
8 | plt.show()
9 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog01.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 |  
 4 | df = pd.DataFrame([10,20,30,40],columns=['num'],
 5 |                   index=['a','b','c','d'])
 6 |  
 7 | print(df.index)
 8 | print(df.columns)
 9 | print(df.ix['c'])
10 | print(df.ix[df.index[1:3]])
11 | print(df.sum())
12 | print(df.apply(lambda x:x**2))
13 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog02.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/blog02.py


--------------------------------------------------------------------------------
/blog23-statsmodels/blog03_show.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: cp936 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 |  
 6 | a = np.random.standard_normal((9,4))
 7 | df = pd.DataFrame(a)
 8 | df.columns = ["No1", "No2", "No3", "No4"]
 9 | dates = pd.date_range('2015-1-1',periods=9,freq='M')
10 | df.index = dates
11 |  
12 | print(df.cumsum())
13 | df.plot(lw=2.0)
14 | plt.show()
15 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog04_show.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: cp936 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 |  
 6 | a = np.random.standard_normal((9,4))
 7 | df = pd.DataFrame(a)
 8 | df.columns = ["No1", "No2", "No3", "No4"]
 9 | dates = pd.date_range('2015-1-1',periods=9,freq='M')
10 | df.index = dates
11 | print(df['No1'])
12 |  
13 | import matplotlib.pyplot as plt
14 | df['No1'].cumsum().plot(style="r",lw=2.)
15 | plt.xlabel('date')
16 | plt.ylabel('value')
17 | plt.show()
18 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog05_groupby.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: cp936 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 |  
 5 | a = np.random.standard_normal((9,4))
 6 | df = pd.DataFrame(a)
 7 | df.columns = ["No1", "No2", "No3", "No4"]
 8 | dates = pd.date_range('2015-1-1',periods=9,freq='M')
 9 | df.index = dates
10 |  
11 | df['Quarter'] = ['Q1','Q1','Q1','Q2','Q2','Q2','Q3','Q3','Q3']
12 | print(df)
13 | groups = df.groupby('Quarter')
14 | print(groups.sum())
15 | print(groups.mean())
16 | print(groups.max())
17 | print(groups.size())
18 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog06_ARIMA.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 |  
 6 | dta=[10930,10318,10595,10972,7706,6756,9092,10551,9722,10913,11151,8186,6422, 
 7 | 6337,11649,11652,10310,12043,7937,6476,9662,9570,9981,9331,9449,6773,6304,9355, 
 8 | 10477,10148,10395,11261,8713,7299,10424,10795,11069,11602,11427,9095,7707,10767, 
 9 | 12136,12812,12006,12528,10329,7818,11719,11683,12603,11495,13670,11337,10232, 
10 | 13261,13230,15535,16837,19598,14823,11622,19391,18177,19994,14723,15694,13248, 
11 | 9543,12872,13101,15053,12619,13749,10228,9725,14729,12518,14564,15085,14722, 
12 | 11999,9390,13481,14795,15845,15271,14686,11054,10395]
13 |  
14 | dta = np.array(dta,dtype=np.float) #这里要转下数据类型，不然运行会报错
15 | df = pd.Series(dta)
16 | print(df)
17 | dates = pd.date_range('2001', periods=90, freq='A')
18 | df.index = dates
19 | print(df)
20 | df.plot(figsize=(12,8))
21 | plt.show()
22 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog07_ARIMA.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 |  
 5 | dta=[10930,10318,10595,10972,7706,6756,9092,10551,9722,10913,11151,8186,6422, 
 6 | 6337,11649,11652,10310,12043,7937,6476,9662,9570,9981,9331,9449,6773,6304,9355, 
 7 | 10477,10148,10395,11261,8713,7299,10424,10795,11069,11602,11427,9095,7707,10767, 
 8 | 12136,12812,12006,12528,10329,7818,11719,11683,12603,11495,13670,11337,10232, 
 9 | 13261,13230,15535,16837,19598,14823,11622,19391,18177,19994,14723,15694,13248, 
10 | 9543,12872,13101,15053,12619,13749,10228,9725,14729,12518,14564,15085,14722, 
11 | 11999,9390,13481,14795,15845,15271,14686,11054,10395]
12 |  
13 | dta = np.array(dta,dtype=np.float) #这里要转下数据类型，不然运行会报错
14 | df = pd.Series(dta)
15 | print(df)
16 | dates = pd.date_range('2001', periods=90, freq='A')
17 | df.index = dates
18 | print(df)
19 | df.plot(figsize=(12,8))
20 |  
21 | import matplotlib.pyplot as plt
22 | fig = plt.figure(figsize=(12,8))
23 | ax1= fig.add_subplot(111)
24 | diff1 = df.diff(1)
25 | diff1.plot(ax=ax1)
26 | plt.show()
27 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog08_statsmodels.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 |  
 5 | dta=[10930,10318,10595,10972,7706,6756,9092,10551,9722,10913,11151,8186,6422, 
 6 | 6337,11649,11652,10310,12043,7937,6476,9662,9570,9981,9331,9449,6773,6304,9355, 
 7 | 10477,10148,10395,11261,8713,7299,10424,10795,11069,11602,11427,9095,7707,10767, 
 8 | 12136,12812,12006,12528,10329,7818,11719,11683,12603,11495,13670,11337,10232, 
 9 | 13261,13230,15535,16837,19598,14823,11622,19391,18177,19994,14723,15694,13248, 
10 | 9543,12872,13101,15053,12619,13749,10228,9725,14729,12518,14564,15085,14722, 
11 | 11999,9390,13481,14795,15845,15271,14686,11054,10395]
12 |  
13 | dta = np.array(dta,dtype=np.float) #这里要转下数据类型，不然运行会报错
14 | df = pd.Series(dta)
15 | print(df)
16 | dates = pd.date_range('2001', periods=90, freq='A')
17 | df.index = dates
18 | print(df)
19 | df.plot(figsize=(12,8))
20 |  
21 |  
22 | import matplotlib.pyplot as plt
23 | fig = plt.figure(figsize=(12,8))
24 | ax1= fig.add_subplot(111)
25 | diff1 = df.diff(1)
26 | diff1.plot(ax=ax1)
27 |  
28 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
29 | f = plt.figure(facecolor='white')
30 | ax1 = f.add_subplot(211)
31 | plot_acf(df, lags=40, ax=ax1)
32 | ax2 = f.add_subplot(212)
33 | plot_pacf(df, lags=40, ax=ax2)
34 | plt.show()
35 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/blog09_statsmodels.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 |  
 5 | dta=[10930,10318,10595,10972,7706,6756,9092,10551,9722,10913,11151,8186,6422, 
 6 | 6337,11649,11652,10310,12043,7937,6476,9662,9570,9981,9331,9449,6773,6304,9355, 
 7 | 10477,10148,10395,11261,8713,7299,10424,10795,11069,11602,11427,9095,7707,10767, 
 8 | 12136,12812,12006,12528,10329,7818,11719,11683,12603,11495,13670,11337,10232, 
 9 | 13261,13230,15535,16837,19598,14823,11622,19391,18177,19994,14723,15694,13248, 
10 | 9543,12872,13101,15053,12619,13749,10228,9725,14729,12518,14564,15085,14722, 
11 | 11999,9390,13481,14795,15845,15271,14686,11054,10395]
12 |  
13 | dta = np.array(dta,dtype=np.float) 
14 | df = pd.Series(dta)
15 | print(df)
16 | dates = pd.date_range('2001', periods=90, freq='A')
17 | df.index = dates
18 | print(df)
19 | df.plot(figsize=(12,8))
20 |  
21 | import matplotlib.pyplot as plt
22 | fig = plt.figure(figsize=(12,8))
23 | ax1= fig.add_subplot(111)
24 | diff1 = df.diff(1)
25 | diff1.plot(ax=ax1)
26 |  
27 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
28 | f = plt.figure(facecolor='white')
29 | ax1 = f.add_subplot(211)
30 | plot_acf(df, lags=40, ax=ax1)
31 | ax2 = f.add_subplot(212)
32 | plot_pacf(df, lags=40, ax=ax2)
33 | plt.show()
34 |  
35 |  
36 | #预测结果
37 | import statsmodels.api as sm
38 | arma_mod80 =  sm.tsa.ARMA(df,(8,0)).fit()
39 | print(arma_mod80.aic, arma_mod80.bic, arma_mod80.hqic)
40 |  
41 | pre = arma_mod80.predict('2090', '2100', dynamic=True)
42 | print(pre)
43 |  
44 | fig, ax = plt.subplots(figsize=(12, 8))
45 | ax = df.ix['2000':].plot(ax=ax)
46 | fig = arma_mod80.plot_predict('2090', '2100', dynamic=True, ax=ax, plot_insample=False)
47 | plt.show()
48 | 


--------------------------------------------------------------------------------
/blog23-statsmodels/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/result01.png


--------------------------------------------------------------------------------
/blog23-statsmodels/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/result02.png


--------------------------------------------------------------------------------
/blog23-statsmodels/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/result03.png


--------------------------------------------------------------------------------
/blog23-statsmodels/result04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/result04.png


--------------------------------------------------------------------------------
/blog23-statsmodels/result05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog23-statsmodels/result05.png


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/BaiduSpiderSpots.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog24-Kmeans-Chinese/BaiduSpiderSpots.rar


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/blog01_merge.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8            
 2 | import re          
 3 | import os  
 4 | import sys
 5 | import codecs
 6 | import shutil
 7 |  
 8 | def merge_file():
 9 |     path = "BaiduSpiderSpots\\"
10 |     resName = "BaiduSpider_Result.txt"
11 |     if os.path.exists(resName):
12 |         os.remove(resName)
13 |     result = codecs.open(resName, 'w', 'utf-8')
14 |  
15 |     num = 1
16 |     while num <= 100:
17 |         name = "%04d" % num 
18 |         fileName = path + str(name) + ".txt"
19 |         source = open(fileName, 'r', encoding='utf-8')
20 |         line = source.readline()
21 |         line = line.strip('\n')
22 |         line = line.strip('\r')
23 |  
24 |         while line!="":
25 |             line = line.replace('\n',' ')
26 |             line = line.replace('\r',' ')
27 |             result.write(line+ ' ')
28 |             line = source.readline()
29 |         else:
30 |             print('End file: ' + str(num))
31 |             result.write('\r\n')
32 |             source.close()
33 |         num = num + 1
34 |         
35 |     else:
36 |         print('End All')
37 |         result.close()    
38 |  
39 | if __name__ == '__main__':
40 |     merge_file()
41 | 


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/blog02_spider.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8  
 2 | # test09_03.py                
 3 | import os  
 4 | import codecs
 5 | from selenium import webdriver      
 6 | from selenium.webdriver.common.keys import Keys       
 7 |  
 8 | driver = webdriver.Firefox()
 9 | 
10 | """
11 | 该部分代码采用python2编写，并且产生于五年前
12 | 主要提供思路，建议大家结合现在的百科HTML节点进行分析和爬取
13 | 该资源更推荐大家运行其他预处理和分析部分的代码
14 | 祝好！
15 | 
16 | ——BY: Eastmount
17 | """
18 |  
19 | #获取摘要信息
20 | def getAbstract(name):  
21 |     try:
22 |         #新建文件夹及文件
23 |         basePathDirectory = "Hudong_Coding"  
24 |         if not os.path.exists(basePathDirectory):  
25 |             os.makedirs(basePathDirectory)  
26 |         baiduFile = os.path.join(basePathDirectory,"HudongSpider.txt")
27 |         #文件不存在新建,存在则追加写入
28 |         if not os.path.exists(baiduFile):  
29 |             info = codecs.open(baiduFile,'w','utf-8')  
30 |         else:  
31 |             info = codecs.open(baiduFile,'a','utf-8')  
32 |  
33 |         url = "http://www.baike.com/wiki/" + name
34 |         print url
35 |         driver.get(url)  
36 |         elem = driver.find_element_by_xpath("//div[@class='summary']/p")  
37 |         print elem.text
38 |         info.writelines(elem.text+'\r\n')  
39 |           
40 |     except Exception,e: 
41 |         print "Error: ",e  
42 |     finally:  
43 |         print '\n'  
44 |         info.write('\r\n')  
45 |   
46 | #主函数  
47 | def main():
48 |     languages = ["JavaScript", "Java", "Python", "Ruby", "PHP",
49 |                  "C++", "CSS", "C#", "C", "GO"]
50 |     print u'开始爬取'
51 |     for lg in languages:  
52 |         print lg
53 |         getAbstract(lg)  
54 |     print u'结束爬取'  
55 |  
56 | if __name__ == '__main__':
57 |     main()  
58 | 


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/blog03_fenci.py:
--------------------------------------------------------------------------------
 1 | #encoding=utf-8
 2 | import sys
 3 | import re
 4 | import codecs
 5 | import os
 6 | import shutil
 7 | import jieba
 8 | import jieba.analyse
 9 |  
10 | #导入自定义词典
11 | #jieba.load_userdict("dict_baidu.txt")
12 |  
13 | #Read file and cut
14 | def read_file_cut():
15 |  
16 |     fileName = "HudongSpider_Result.txt"
17 |     source = open(fileName, 'r', encoding='utf8')
18 |     resName = "Stop_HudongSpider_Result.txt"
19 |     result = codecs.open(resName, 'w', 'utf-8')
20 |     line = source.readline()
21 |         
22 |     while line!="":
23 |         seglist = jieba.cut(line,cut_all=False)  #精确模式
24 |         output = ' '.join(list(seglist))         #空格拼接
25 |         #print output
26 |         result.write(output)
27 |         line = source.readline()
28 |     else:
29 |         source.close()
30 |         result.close()
31 |         print('End All')
32 |  
33 | #Run function
34 | if __name__ == '__main__':
35 |     read_file_cut()
36 | 


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/blog04_kmeans.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8  
  2 | import time          
  3 | import re          
  4 | import os  
  5 | import sys
  6 | import codecs
  7 | import shutil
  8 | import numpy as np
  9 | import matplotlib
 10 | import scipy
 11 | import matplotlib.pyplot as plt
 12 | from sklearn import feature_extraction  
 13 | from sklearn.feature_extraction.text import TfidfTransformer  
 14 | from sklearn.feature_extraction.text import CountVectorizer
 15 | from sklearn.feature_extraction.text import HashingVectorizer 
 16 |  
 17 | if __name__ == "__main__":
 18 |     
 19 |     #########################################################################
 20 |     #                           第一步 计算TFIDF
 21 |     
 22 |     #文档预料 空格连接
 23 |     corpus = []
 24 |     
 25 |     #读取预料 一行预料为一个文档
 26 |     for line in open('Stop_HudongSpider_Result.txt', 'r', encoding='utf8').readlines():
 27 |         #print line
 28 |         corpus.append(line.strip())
 29 |     
 30 |     #将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频
 31 |     #remove words occuring less than 5 times
 32 |     vectorizer = CountVectorizer(min_df=2)
 33 |     print(type(vectorizer))
 34 |  
 35 |     #该类会统计每个词语的tf-idf权值
 36 |     transformer = TfidfTransformer()
 37 |  
 38 |     #第一个fit_transform是计算tf-idf 第二个fit_transform是将文本转为词频矩阵
 39 |     tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
 40 |     print(type(tfidf))
 41 |  
 42 |     #获取词袋模型中的所有词语  
 43 |     word = vectorizer.get_feature_names()
 44 |     
 45 |     #将tf-idf矩阵抽取出来，元素w[i][j]表示j词在i类文本中的tf-idf权重
 46 |     X = tfidf.toarray()
 47 |     weight = np.array(X, dtype='float32')
 48 |     print(type(weight))
 49 |     print(weight.shape)
 50 | 
 51 |     #MemoryError: Unable to allocate array with shape (400, 143556) and data type float64
 52 |     #remove words occuring less than 5 times
 53 |     
 54 |     #打印特征向量文本内容
 55 |     print('Features length: ' + str(len(word)))
 56 |     
 57 |     """
 58 |     resName = "BHTfidf_Result.txt"
 59 |     result = codecs.open(resName, 'w', 'utf-8')
 60 |     for j in range(len(word)):
 61 |         result.write(word[j] + ' ')
 62 |     result.write('\r\n\r\n')
 63 |  
 64 |     #打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for便利某一类文本下的词语权重  
 65 |     for i in range(len(weight)):
 66 |         #print u"-------这里输出第", i, u"类文本的词语tf-idf权重------"  
 67 |         for j in range(len(word)):
 68 |             #print weight[i][j],
 69 |             result.write(str(weight[i][j]) + ' ')
 70 |         result.write('\r\n\r\n')
 71 |  
 72 |     result.close()
 73 |     """
 74 |  
 75 |  
 76 |     ########################################################################
 77 |     #                               第二步 聚类Kmeans
 78 |  
 79 |     print('Start Kmeans:')
 80 |     from sklearn.cluster import KMeans
 81 |     clf = KMeans(n_clusters=4)   #景区 动物 人物 国家
 82 |     s = clf.fit(weight)
 83 |     print(s)
 84 |  
 85 |     '''
 86 |     print 'Start MiniBatchKmeans:'
 87 |     from sklearn.cluster import MiniBatchKMeans
 88 |     clf = MiniBatchKMeans(n_clusters=20)
 89 |     s = clf.fit(weight)
 90 |     print s
 91 |     '''
 92 |  
 93 |     #中心点
 94 |     print(clf.cluster_centers_)
 95 |     
 96 |     #每个样本所属的簇
 97 |     label = []               #存储400个类标 
 98 |     print(clf.labels_)
 99 |     i = 1
100 |     while i <= len(clf.labels_):
101 |         #print(clf.labels_[i-1])
102 |         label.append(clf.labels_[i-1])
103 |         i = i + 1
104 |  
105 |     #用来评估簇的个数是否合适，距离越小说明簇分的越好，选取临界点的簇个数  958.137281791
106 |     print(clf.inertia_)
107 |     y_pred = clf.labels_
108 |     
109 |  
110 |  
111 |     ########################################################################
112 |     #                               第三步 图形输出 降维
113 |  
114 |     from sklearn.decomposition import PCA
115 |     pca = PCA(n_components=2)             #输出两维
116 |     newData = pca.fit_transform(weight)   #载入N维
117 |     print(newData)
118 |  
119 |     x = [n[0] for n in newData]
120 |     y = [n[1] for n in newData]
121 |  
122 |     x1, y1 = [], []   
123 |     x2, y2 = [], [] 
124 |     x3, y3 = [], []
125 |     x4, y4 = [], []
126 |     
127 |     #分布获取类标为0、1、2、3的数据 赋值给(x1,y1) (x2,y2) (x3,y3) (x4,y4)
128 |     i = 0  
129 |     while i < len(newData):  
130 |         if y_pred[i]==0:  
131 |             x1.append(newData[i][0])  
132 |             y1.append(newData[i][1])  
133 |         elif y_pred[i]==1:  
134 |             x2.append(newData[i][0])  
135 |             y2.append(newData[i][1])  
136 |         elif y_pred[i]==2:  
137 |             x3.append(newData[i][0])  
138 |             y3.append(newData[i][1])
139 |         elif y_pred[i]==3:  
140 |             x4.append(newData[i][0])  
141 |             y4.append(newData[i][1])
142 |         i = i + 1
143 |         
144 |     #四种颜色 红 绿 蓝，marker='x'表示类型，o表示圆点 *表示星型 x表示点   
145 |     plot1, = plt.plot(x1, y1, 'or', marker="o", markersize=10)    
146 |     plot2, = plt.plot(x2, y2, 'og', marker="o", markersize=10)    
147 |     plot3, = plt.plot(x3, y3, 'ob', marker="o", markersize=10)
148 |     plot4, = plt.plot(x4, y4, 'oy', marker="o", markersize=10)   
149 |     #plt.title("K-Means Text Clustering")  #绘制标题
150 |     plt.legend((plot1, plot2, plot3, plot4), ('A', 'B', 'C', 'D'), fontsize=10)  
151 |  
152 |     #四种颜色 红 绿 蓝 黑
153 |     #plt.scatter(x1, x2, c=clf.labels_,  s=100)
154 |     plt.show()
155 |  
156 | 


--------------------------------------------------------------------------------
/blog24-Kmeans-Chinese/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog24-Kmeans-Chinese/result.png


--------------------------------------------------------------------------------
/blog25-Matplotlib/allname.txt:
--------------------------------------------------------------------------------
1 | 鹦鹉
2 | 百灵鸟
3 | 喜鹊
4 | 画眉鸟
5 | 秃鹫
6 | 北极鸥
7 | 丹顶鹤


--------------------------------------------------------------------------------
/blog25-Matplotlib/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog25-Matplotlib/plot.png


--------------------------------------------------------------------------------
/blog25-Matplotlib/test01-show.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | import os
 3 | import codecs
 4 | import numpy as np
 5 | import matplotlib
 6 | import matplotlib.pyplot as plt
 7 |  
 8 | x = [2.3, 4.5, 3, 7, 6.5, 4, 5.3]
 9 | y = [5, 4, 7, 5, 5.3, 5.5, 6.2]
10 |  
11 | num = np.arange(7)
12 | name = ["a", "b", "c", "d", "e", "f", "g"]
13 |  
14 | fig, ax = plt.subplots()
15 | ax.scatter(x,y,c='r',s=100)
16 |  
17 | for i,txt in enumerate(name):  #n  
18 |     ax.annotate(txt,(x[i],y[i]))
19 |  
20 | plt.show()
21 | 


--------------------------------------------------------------------------------
/blog25-Matplotlib/test02-show.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | import os
 3 | import codecs
 4 | import numpy as np
 5 | import matplotlib
 6 | import matplotlib.pyplot as plt
 7 |  
 8 | x = [2.3, 4.5, 3, 7, 6.5, 4, 5.3]
 9 | y = [5, 4, 7, 5, 5.3, 5.5, 6.2]
10 |  
11 | n=np.arange(7)
12 | name = ["a", "b", "c", "d", "e", "f", "g"]
13 |  
14 | fig, ax = plt.subplots()
15 | ax.scatter(x,y,c='r',s=100)
16 |  
17 | #定义数组读取名称
18 | corpus = []
19 | result = codecs.open('allname.txt', 'r', 'utf-8')
20 | for u in result.readlines():
21 |     print(u.strip())
22 |     corpus.append(u.strip())
23 |  
24 | #解决中文和负号'-'显示为方块的问题  
25 | matplotlib.rcParams['font.sans-serif'] = ['SimHei']
26 | matplotlib.rcParams['font.family']='sans-serif'
27 | matplotlib.rcParams['axes.unicode_minus'] = False
28 |  
29 | for i,txt in enumerate(corpus): #n  name  
30 |     ax.annotate(txt,(x[i],y[i]))
31 |  
32 | result.close()
33 | plt.savefig('plot.png', dpi=1200)
34 | plt.show()
35 | 


--------------------------------------------------------------------------------
/blog25-Matplotlib/test03-kmeans.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #载入数据集
 3 | from sklearn.datasets import load_iris 
 4 | iris = load_iris()
 5 | print(iris.data)           #输出数据集
 6 | print(iris.target)         #输出真实标签
 7 | print(len(iris.target))
 8 | print(iris.data.shape)     #150个样本 每个样本4个特征
 9 |  
10 |  
11 | #导入决策树DTC包
12 | from sklearn.cluster import KMeans
13 | clf = KMeans(n_clusters=3)
14 | pre = clf.fit_predict(iris.data)      
15 | print(pre)
16 |  
17 | #获取花卉两列数据集
18 | X = iris.data
19 | L1 = [x[0] for x in X]
20 | print(L1)
21 | L2 = [x[1] for x in X]
22 | print(L2)
23 |  
24 | #绘图
25 | import numpy as np
26 | import matplotlib.pyplot as plt
27 | plt.scatter(L1, L2, c=pre, marker='x', s=100) 
28 | plt.title("KMeans")
29 | plt.show()
30 | 


--------------------------------------------------------------------------------
/blog25-Matplotlib/test04-kmeans.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #载入数据集
 3 | from sklearn.datasets import load_iris 
 4 | iris = load_iris()
 5 | print(iris.data)            #输出数据集
 6 | print(iris.target)          #输出真实标签
 7 | print(len(iris.target))
 8 | print(iris.data.shape)      #150个样本 每个样本4个特征
 9 |  
10 |  
11 | #导入决策树DTC包
12 | from sklearn.cluster import KMeans
13 | clf = KMeans(n_clusters=3)
14 | y_pred = clf.fit_predict(iris.data)      
15 | print(y_pred)
16 |  
17 | #降维绘图
18 | from sklearn.decomposition import PCA
19 | pca = PCA(n_components=2)             #输出两维
20 | newData = pca.fit_transform(iris.data)   #载入N维
21 | print(newData)
22 | x = [n[0] for n in newData]
23 | y = [n[1] for n in newData]
24 |  
25 | x1, y1 = [], []   
26 | x2, y2 = [], [] 
27 | x3, y3 = [], []
28 |     
29 | #分别获取类标为0、1、2的数据 赋值给(x1,y1) (x2,y2) (x3,y3) 
30 | i = 0  
31 | while i < len(newData):  
32 |     if y_pred[i]==0:  
33 |         x1.append(newData[i][0])  
34 |         y1.append(newData[i][1])  
35 |     elif y_pred[i]==1:  
36 |         x2.append(newData[i][0])  
37 |         y2.append(newData[i][1])  
38 |     elif y_pred[i]==2:  
39 |         x3.append(newData[i][0])  
40 |         y3.append(newData[i][1])
41 |     i = i + 1
42 |  
43 |  
44 | import matplotlib.pyplot as plt
45 |  
46 | #三种颜色   
47 | plot1, = plt.plot(x1, y1, 'or', marker="o", markersize=10)    
48 | plot2, = plt.plot(x2, y2, 'og', marker="o", markersize=10)    
49 | plot3, = plt.plot(x3, y3, 'ob', marker="o", markersize=10)
50 | plt.title("K-Means Text Clustering")  #绘制标题
51 | plt.legend((plot1, plot2, plot3), ('A', 'B', 'C'))
52 |  
53 | #plt.scatter(x1, x2, c=clf.labels_,  s=100)
54 | plt.show()
55 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog26-SnowNLP/result01.png


--------------------------------------------------------------------------------
/blog26-SnowNLP/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog26-SnowNLP/result02.png


--------------------------------------------------------------------------------
/blog26-SnowNLP/result03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog26-SnowNLP/result03.png


--------------------------------------------------------------------------------
/blog26-SnowNLP/result04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog26-SnowNLP/result04.png


--------------------------------------------------------------------------------
/blog26-SnowNLP/result05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog26-SnowNLP/result05.png


--------------------------------------------------------------------------------
/blog26-SnowNLP/test01-spider.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8  
 2 | from selenium import webdriver  
 3 | from selenium.webdriver.common.keys import Keys  
 4 | import selenium.webdriver.support.ui as ui  
 5 | from selenium.webdriver.common.action_chains import ActionChains  
 6 | import time      
 7 | import re      
 8 | import os
 9 | import csv
10 | import codecs
11 | 
12 | #写入文件
13 | c = open("test-douban.csv", "wb")  #写文件
14 | c.write(codecs.BOM_UTF8)          #防止乱码
15 | writer = csv.writer(c)                     #写入对象
16 | writer.writerow(['序号','用户名','链接','评分','评分标题','有用数','日期','评论'])
17 | 
18 | #打开Firefox浏览器 设定等待加载时间 访问URL  
19 | driver = webdriver.Firefox()
20 | i = 0
21 | while i<10:
22 |     num = i*20
23 |     url = "https://movie.douban.com/subject/1292052/comments?start=" + str(num) +"&limit=20&sort=new_score&status=P"
24 |     print(url)
25 |     driver.get(url)
26 |     #用户姓名 超链接
27 |     elem1 = driver.find_elements_by_xpath("//div[@class='avatar']/a")     
28 |     #用户评分
29 |     elem2 = driver.find_elements_by_xpath("//span[@class='comment-info']/span[2]")
30 |     #有用数
31 |     elem3 = driver.find_elements_by_xpath("//span[@class='comment-vote']/span[1]")
32 |     #日期
33 |     elem4 = driver.find_elements_by_xpath("//span[@class='comment-time ']")
34 |     #评论
35 |     elem5 = driver.find_elements_by_xpath("//span[@class='short']")
36 | 
37 |     #循环写入20行评价
38 |     tlist = []
39 |     k = 0
40 |     while k<20:
41 |         #序号
42 |         num = i*20+k+1
43 |         print(num)
44 |         #用户姓名
45 |         name = elem1[k].get_attribute("title").encode('utf-8')
46 |         print(name)
47 |         #超链接
48 |         href = elem1[k].get_attribute("href").encode('utf-8')
49 |         print(href)
50 |         #用户评分及内容
51 |         score = elem2[k].get_attribute("class").encode('utf-8')
52 |         print(score)
53 |         content = elem2[k].get_attribute("title").encode('utf-8')
54 |         print(content)
55 |         #有用数
56 |         useful = elem3[k].text.encode('utf-8')
57 |         print(useful)
58 |         #日期
59 |         date = elem4[k].text.encode('utf-8')
60 |         #评论
61 |         shortcon = elem5[k].text.encode('utf-8')
62 |         print(shortcon)
63 | 
64 |         #写入文件
65 |         templist = []
66 |         templist.append(num)
67 |         templist.append(name)
68 |         templist.append(href)
69 |         templist.append(score)
70 |         templist.append(content)
71 |         templist.append(useful)
72 |         templist.append(date)
73 |         templist.append(shortcon)
74 |         writer.writerow(templist)
75 |         
76 |         k = k + 1
77 |             
78 |     i = i + 1
79 | 
80 | c.close()
81 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test02-wordcloud.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import jieba
 3 | import sys
 4 | import matplotlib.pyplot as plt
 5 | from wordcloud import WordCloud
 6 |  
 7 | #打开本体TXT文件
 8 | text = open('data.txt',encoding='utf-8').read()
 9 | print(type(text))
10 |  
11 | #结巴分词 cut_all=True 设置为精准模式 
12 | wordlist = jieba.cut(text, cut_all = False)
13 |  
14 | #使用空格连接 进行中文分词
15 | wl_space_split = " ".join(wordlist)
16 | print(wl_space_split)
17 |  
18 | #对分词后的文本生成词云
19 | my_wordcloud = WordCloud().generate(wl_space_split)
20 |  
21 | #显示词云图
22 | plt.imshow(my_wordcloud)
23 | #是否显示x轴、y轴下标
24 | plt.axis("off")
25 | plt.show()
26 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test03-snownlp01.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | s1 = SnowNLP(u"这本书质量真不太好！")
 4 | print("SnowNLP:")
 5 | print(" ".join(s1.words))
 6 | 
 7 | import jieba
 8 | s2 = jieba.cut(u"这本书质量真不太好！", cut_all=False)
 9 | print("jieba:")
10 | print(" ".join(s2))
11 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test04-snownlp02.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | s = SnowNLP(u"这本书质量真不太好！")
 4 | 
 5 | print(u"\n中文分词:")
 6 | print( " ".join(s.words))
 7 | 
 8 | print(u"\n词性标注:")
 9 | print(s.tags)
10 | for k in s.tags:
11 |     print(k)
12 | 
13 | print(u"\n情感分数:")
14 | print(s.sentiments)
15 | 
16 | print(u"\n转换拼音:")
17 | print(s.pinyin)
18 | 
19 | print(u"\n输出前4个关键词:")
20 | print(s.keywords(4))
21 | for k in s.keywords(4):
22 |     print(k)
23 | 
24 | print(u"\n输出关键句子:")
25 | print(s.summary(1))
26 | for k in s.summary(1):
27 |     print(k)
28 | 
29 | print(u"\n输出tf和idf:")
30 | print(s.tf)
31 | print(s.idf)
32 | 
33 | n = SnowNLP(u'「繁體字」「繁體中文」的叫法在臺灣亦很常見。')
34 | print(u"\n繁简体转换:")
35 | print(n.han)
36 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test05-snownlp03.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | s1 = SnowNLP(u"我今天很开心")
 4 | print("s1情感分数:")
 5 | print(s1.sentiments)
 6 | 
 7 | s2 = SnowNLP(u"我今天很沮丧")
 8 | print("s2情感分数:")
 9 | print(s2.sentiments)
10 | 
11 | s3 = SnowNLP(u"大傻瓜，你脾气真差，动不动就打人")
12 | print("s3情感分数:")
13 | print(s3.sentiments)
14 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test06-snownlp-show.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | import codecs
 4 | import os
 5 | 
 6 | source = open("data.txt","r")
 7 | line = source.readlines()
 8 | sentimentslist = []
 9 | for i in line:
10 |     s = SnowNLP(i.decode("utf-8"))
11 |     print(s.sentiments)
12 |     sentimentslist.append(s.sentiments)
13 | 
14 | import matplotlib.pyplot as plt
15 | import numpy as np
16 | plt.hist(sentimentslist, bins = np.arange(0, 1, 0.01), facecolor = 'g')
17 | plt.xlabel('Sentiments Probability')
18 | plt.ylabel('Quantity')
19 | plt.title('Analysis of Sentiments')
20 | plt.show()
21 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test07-snownlp-show.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | import codecs
 4 | import os
 5 | 
 6 | source = open("data.txt","r", encoding='utf-8')
 7 | line = source.readlines()
 8 | sentimentslist = []
 9 | for i in line:
10 |     s = SnowNLP(i)
11 |     print(s.sentiments)
12 |     sentimentslist.append(s.sentiments)
13 | 
14 | import matplotlib.pyplot as plt
15 | import numpy as np
16 | plt.plot(np.arange(0, 200, 1), sentimentslist, 'k-')
17 | plt.xlabel('Number')
18 | plt.ylabel('Sentiment')
19 | plt.title('Analysis of Sentiments')
20 | plt.show()
21 | 


--------------------------------------------------------------------------------
/blog26-SnowNLP/test08-snownlp-show.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from snownlp import SnowNLP
 3 | import codecs
 4 | import os
 5 | 
 6 | #获取情感分数
 7 | source = open("data.txt","r", encoding='utf-8')
 8 | line = source.readlines()
 9 | sentimentslist = []
10 | for i in line:
11 |     s = SnowNLP(i)
12 |     print(s.sentiments)
13 |     sentimentslist.append(s.sentiments)
14 | 
15 | #区间转换为[-0.5, 0.5]
16 | result = []
17 | i = 0
18 | while i<len(sentimentslist):
19 |     result.append(sentimentslist[i]-0.5)
20 |     i = i + 1
21 | 
22 | #可视化画图
23 | import matplotlib.pyplot as plt
24 | import numpy as np
25 | plt.plot(np.arange(0, 200, 1), result, 'k-')
26 | plt.xlabel('Number')
27 | plt.ylabel('Sentiment')
28 | plt.title('Analysis of Sentiments')
29 | plt.show()
30 | 


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/data intro.txt:
--------------------------------------------------------------------------------
 1 | Data Set Information
 2 | 
 3 | These data are the results of a chemical analysis of wines grown in the same region in
 4 | Italy but derived from three different cultivars. The analysis determined the quantities of 
 5 | 13 constituents found in each of the three types of wines. 
 6 | 
 7 | I think that the initial data set had around 30 variables, but for some reason I only 
 8 | have the 13 dimensional version. I had a list of what the 30 or so variables were, but
 9 |  a.) I lost it, and b.), I would not know which 13 variables are included in the set. 
10 | 
11 | The attributes are (donated by Riccardo Leardi, riclea '@' anchem.unige.it ) 
12 | 1) Alcohol 
13 | 2) Malic acid 
14 | 3) Ash 
15 | 4) Alcalinity of ash 
16 | 5) Magnesium 
17 | 6) Total phenols 
18 | 7) Flavanoids 
19 | 8) Nonflavanoid phenols 
20 | 9) Proanthocyanins 
21 | 10)Color intensity 
22 | 11)Hue 
23 | 12)OD280OD315 of diluted wines 
24 | 13)Proline 
25 | 
26 | In a classification context, this is a well posed problem with well behaved class 
27 | structures. A good data set for first testing of a new classifier, but not very challenging. 
28 | 
29 | Attribute Information
30 | All attributes are continuous 
31 | No statistics available, but suggest to standardise variables for certain uses (e.g. for use with classifiers which are NOT scale invariant) 
32 | 
33 | NOTE 1st attribute is class identifier (1-3)
34 | 


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog27-SVM&WineDataset/result01.png


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog27-SVM&WineDataset/result02.png


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/test01-svm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.svm import SVC
 3 | 
 4 | X = np.array([[-1, -1], [-2, -2], [1, 3], [4, 6]])  
 5 | y = np.array([1, 1, 2, 2])
 6 | clf = SVC()  
 7 | clf.fit(X, y)   
 8 | print(clf.fit(X,y))
 9 | print(clf.predict([[-0.8,-1], [2,1]]))
10 | 
11 | #输出结果：[1, 2]
12 | 


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/test02-datapre.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-  
 2 | import os 
 3 | import numpy as np
 4 | 
 5 | path = "wine.txt"
 6 | data = np.loadtxt(path,dtype=float,delimiter=",")
 7 | print(data)
 8 | 
 9 | yy, x = np.split(data, (1,), axis=1)
10 | print(yy.shape, x.shape)
11 | y = []
12 | for n in yy:
13 |     y.append(int(n))
14 | 
15 | train_data = np.concatenate((x[0:40,:], x[60:100,:], x[140:160,:]), axis = 0) #训练集
16 | train_target = np.concatenate((y[0:40], y[60:100], y[140:160]), axis = 0)  #样本类别
17 | test_data = np.concatenate((x[40:60, :], x[100:140, :], x[160:,:]), axis = 0)  #测试集
18 | test_target = np.concatenate((y[40:60], y[100:140], y[160:]), axis = 0)    #样本类别
19 | 
20 | print(train_data.shape, train_target.shape)
21 | print(test_data.shape, test_target.shape)
22 | 


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/test03-svm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-  
 2 | import os 
 3 | import numpy as np
 4 | from sklearn.svm import SVC  
 5 | from sklearn import metrics
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib.colors import ListedColormap
 8 | 
 9 | #第一步 加载数据集
10 | path = "wine.txt"
11 | data = np.loadtxt(path,dtype=float,delimiter=",")
12 | print(data)
13 | 
14 | #第二步 划分数据集
15 | yy, x = np.split(data, (1,), axis=1) #第一列为类标yy,后面13列特征为x
16 | print(yy.shape, x.shape)
17 | y = []
18 | for n in yy:  #将类标浮点型转化为整数
19 |     y.append(int(n))
20 | x = x[:, :2]  #获取x前两列数据,方便绘图 对应x、y轴
21 | train_data = np.concatenate((x[0:40,:], x[60:100,:], x[140:160,:]), axis = 0) #训练集
22 | train_target = np.concatenate((y[0:40], y[60:100], y[140:160]), axis = 0)  #样本类别
23 | test_data = np.concatenate((x[40:60, :], x[100:140, :], x[160:,:]), axis = 0) #测试集
24 | test_target = np.concatenate((y[40:60], y[100:140], y[160:]), axis = 0)   #样本类别
25 | print(train_data.shape, train_target.shape)
26 | print(test_data.shape, test_target.shape)
27 | 
28 | #第三步 SVC训练
29 | clf = SVC()
30 | clf.fit(train_data,train_target)
31 | result = clf.predict(test_data)
32 | print(result)
33 | 
34 | #第四步 评价算法 
35 | print(sum(result==test_target)) #预测结果与真实结果比对
36 | print(metrics.classification_report(test_target, result))  #准确率 召回率 F值
37 | 
38 | #第五步 创建网格 
39 | x1_min, x1_max = test_data[:,0].min()-0.1, test_data[:,0].max()+0.1    #第一列
40 | x2_min, x2_max = test_data[:,1].min()-0.1, test_data[:,1].max()+0.1    #第二列
41 | xx, yy = np.meshgrid(np.arange(x1_min, x1_max, 0.1),  
42 |                      np.arange(x2_min, x2_max, 0.1))     #生成网格型数据
43 | z = clf.predict(np.c_[xx.ravel(), yy.ravel()])                        
44 | 
45 | #第六步 绘图可视化
46 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])         #颜色Map
47 | cmap_bold = ListedColormap(['#000000', '#00FF00', '#FFFFFF'])
48 | plt.figure()
49 | z = z.reshape(xx.shape)
50 | print(xx.shape, yy.shape, z.shape, test_target.shape)
51 | plt.pcolormesh(xx, yy, z, cmap=cmap_light)
52 | plt.scatter(test_data[:,0], test_data[:,1], c=test_target,
53 |             cmap=cmap_bold, s=50)
54 | plt.show()
55 | 


--------------------------------------------------------------------------------
/blog27-SVM&WineDataset/test04-update.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-  
 2 | import os 
 3 | import numpy as np
 4 | from sklearn.svm import SVC  
 5 | from sklearn import metrics
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib.colors import ListedColormap
 8 | from sklearn.model_selection import train_test_split
 9 | from sklearn.decomposition import PCA
10 | 
11 | #第一步 加载数据集
12 | path = "wine.txt"
13 | data = np.loadtxt(path,dtype=float,delimiter=",")
14 | print(data)
15 | 
16 | #第二步 划分数据集
17 | yy, x = np.split(data, (1,), axis=1) #第一列类标yy,后面13列特征为x
18 | print(yy.shape, x.shape)
19 | y = []
20 | for n in yy: 
21 |     y.append(int(n))
22 | y =  np.array(y, dtype = int) #list转换数组
23 | #划分数据集 测试集40%
24 | train_data, test_data, train_target, test_target = train_test_split(x, y, test_size=0.4, random_state=42)
25 | print(train_data.shape, train_target.shape)
26 | print(test_data.shape, test_target.shape)
27 | 
28 | #第三步 SVC训练
29 | clf = SVC()
30 | clf.fit(train_data, train_target)
31 | result = clf.predict(test_data)
32 | print(result)
33 | print(test_target)
34 | 
35 | #第四步 评价算法 
36 | print(sum(result==test_target)) #预测结果与真实结果比对
37 | print(metrics.classification_report(test_target, result))  #准确率 召回率 F值
38 | 
39 | #第五步 降维操作
40 | pca = PCA(n_components=2)      
41 | newData = pca.fit_transform(test_data)
42 |                   
43 | #第六步 绘图可视化
44 | plt.figure()
45 | cmap_bold = ListedColormap(['#000000', '#00FF00', '#FFFFFF'])
46 | plt.scatter(newData[:,0], newData[:,1], c=test_target, cmap=cmap_bold, s=50)
47 | plt.show()
48 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/data.csv:
--------------------------------------------------------------------------------
 1 | ﻿id,comment
 2 | 1,新春备年货，新年联欢晚会
 3 | 2,新春节目单，春节联欢晚会红火
 4 | 3,大盘下跌股市散户
 5 | 4,下跌股市赚钱
 6 | 5,金猴新春红火新年
 7 | 6,新车新年年货新春
 8 | 7,股市反弹下跌
 9 | 8,股市散户赚钱
10 | 9,"新年,看春节联欢晚会"
11 | 10,大盘下跌散户
12 | 11,贵州省位于中国西南地区，简称黔
13 | 12,走边神州大地，醉美多彩贵州
14 | 13,贵阳市是贵州省省会城市，有林城的美誉
15 | 14,贵州省包括九个市州和一个新区
16 | 15,贵阳市近年发展大数据取得一定成果
17 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog28-LDA&pyLDAvis/result.png


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/test01-read.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import pandas as pd
3 | 
4 | #读取数据
5 | f = open('data.csv',encoding='utf-8')
6 | df = pd.read_csv(f)
7 | print(df.shape)         #查看数据维度
8 | print(df.head())        #查看前几行数据
9 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/test02-jieba.py:
--------------------------------------------------------------------------------
 1 | #coding: utf-8
 2 | import pandas as pd
 3 | 
 4 | #第一步 读取数据
 5 | f = open('data.csv',encoding='utf-8')
 6 | df = pd.read_csv(f)
 7 | print(df.shape)         #查看数据维度
 8 | print(df.head())        #查看前几行数据
 9 | 
10 | #第二步 中文分词
11 | import jieba
12 | import jieba.posseg as psg
13 | 
14 | #格式转换 否则会报错  'float' object has no attribute 'decode'
15 | df = pd.DataFrame(df['comment'].astype(str))
16 | 
17 | def chinese_word_cut(mytext):
18 |     return ' '.join(jieba.cut(mytext))
19 | 
20 | #增加一列数据
21 | df['content_cutted'] = df['comment'].apply(chinese_word_cut)
22 | print(df.content_cutted.head())
23 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/test03-tfidf.py:
--------------------------------------------------------------------------------
 1 | #coding: utf-8
 2 | import pandas as pd
 3 | 
 4 | #第一步 读取数据
 5 | f = open('data.csv',encoding='utf-8')
 6 | df = pd.read_csv(f)
 7 | print(df.shape)         #查看数据维度
 8 | print(df.head())        #查看前几行数据
 9 | 
10 | #第二步 中文分词
11 | import jieba
12 | import jieba.posseg as psg
13 | 
14 | #格式转换 否则会报错  'float' object has no attribute 'decode'
15 | df = pd.DataFrame(df['comment'].astype(str))
16 | 
17 | def chinese_word_cut(mytext):
18 |     return ' '.join(jieba.cut(mytext))
19 | 
20 | #增加一列数据
21 | df['content_cutted'] = df['comment'].apply(chinese_word_cut)
22 | print(df.content_cutted.head())
23 | 
24 | #第三步 计算TF-IDF值
25 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
26 | 
27 | #设置特征数
28 | n_features = 2000
29 | 
30 | tf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
31 |                                 max_features=n_features,
32 |                                 stop_words=['的','或','等','是','有','之','与','可以','还是','比较','这里',
33 |                                             '一个','和','也','被','吗','于','中','最','但是','图片','大家',
34 |                                             '一下','几天','200','还有','一看','300','50','哈哈哈哈',
35 |                                              '“','”','。','，','？','、','；','怎么','本来','发现',
36 |                                              'and','in','of','the','我们','一直','真的','18','一次',
37 |                                            '了','有些','已经','不是','这么','一一','一天','这个','这种',
38 |                                            '一种','位于','之一','天空','没有','很多','有点','什么','五个',
39 |                                            '特别'],
40 |                                 max_df = 0.99,
41 |                                 min_df = 0.002) #去除文档内出现几率过大或过小的词汇
42 | tf = tf_vectorizer.fit_transform(df.content_cutted)
43 | 
44 | print(tf.shape)
45 | print(tf)
46 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/test04-lda.py:
--------------------------------------------------------------------------------
 1 | #coding: utf-8
 2 | import pandas as pd
 3 | 
 4 | #-------------------  第一步 读取数据  ------------------------
 5 | f = open('data.csv', encoding='utf-8')
 6 | df = pd.read_csv(f)
 7 | print(df.shape)         #查看数据维度
 8 | print(df.head())        #查看前几行数据
 9 | 
10 | #-------------------  第二步 中文分词  ----------------------- 
11 | import jieba
12 | import jieba.posseg as psg
13 | 
# Cast every comment to str first; non-string cells (e.g. floats) otherwise fail
# with: 'float' object has no attribute 'decode'
comment_text = df['comment'].astype(str)
df = pd.DataFrame(comment_text)


def chinese_word_cut(mytext):
    """Segment *mytext* with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(mytext)
    return ' '.join(tokens)


# Store the segmented text in a new column
df['content_cutted'] = df['comment'].apply(chinese_word_cut)
print(df['content_cutted'].head())
23 | 
#-------------------  Step 3: compute TF-IDF values  ---------------------
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Upper bound on the vocabulary size
n_features = 2000

# Tokens that must never enter the vocabulary (function words, punctuation,
# numbers and other filler found in the comments).
custom_stop_words = [
    '的','或','等','是','有','之','与','可以','还是','比较','这里',
    '一个','和','也','被','吗','于','中','最','但是','图片','大家',
    '一下','几天','200','还有','一看','300','50','哈哈哈哈',
    '“','”','。','，','？','、','；','怎么','本来','发现',
    'and','in','of','the','我们','一直','真的','18','一次',
    '了','有些','已经','不是','这么','一一','一天','这个','这种',
    '一种','位于','之一','天空','没有','很多','有点','什么','五个',
    '特别',
]

tf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    max_features=n_features,
    stop_words=custom_stop_words,
    max_df=0.99,     # drop terms whose document frequency is too high ...
    min_df=0.002,    # ... or too low
)
tf = tf_vectorizer.fit_transform(df['content_cutted'])

print(tf.shape)
print(tf)
46 | 
#---------------------  Step 4: LDA topic model  ----------------------
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract
n_topics = 3

# Original note: on Python 2.x setups the keyword was spelled n_topics=n_topics
lda_params = dict(
    n_components=n_topics,
    max_iter=100,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_params)
lda.fit(tf)

# Topic-word matrix: one row per topic, one column per vocabulary term
topic_word = lda.components_
print(topic_word)
print(topic_word.shape)

# Perplexity of the fitted model on the training matrix
print(u'困惑度：')
print(lda.perplexity(tf, sub_sampling=False))
69 | 
# Topic -> keyword listing
def print_top_words(model, tf_feature_names, n_top_words):
    """Print, for every topic of *model*, its highest-weighted terms.

    model            -- object exposing ``components_`` (one weight row per topic)
    tf_feature_names -- sequence mapping a column index to its term
    n_top_words      -- how many terms to print per topic
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices sorted by descending weight, truncated to the requested count
        ranked = weights.argsort()[::-1][:n_top_words]
        keywords = [tf_feature_names[i] for i in ranked]
        print('Topic #%d:' % topic_idx)
        print(' '.join(keywords))
        print("")
76 | 
# Print the 20 highest-weighted keywords of every topic
n_top_words = 20
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# Run the report
print_top_words(lda, tf_feature_names, n_top_words)
82 | 


--------------------------------------------------------------------------------
/blog28-LDA&pyLDAvis/test05-pyLDAvis.py:
--------------------------------------------------------------------------------
 1 | #coding: utf-8
 2 | import pandas as pd
 3 | 
 4 | #-------------------  第一步 读取数据  ------------------------
 5 | f = open('data.csv', encoding='utf-8')
 6 | df = pd.read_csv(f)
 7 | print(df.shape)         #查看数据维度
 8 | print(df.head())        #查看前几行数据
 9 | 
10 | #-------------------  第二步 中文分词  ----------------------- 
11 | import jieba
12 | import jieba.posseg as psg
13 | 
# Cast every comment to str first; non-string cells (e.g. floats) otherwise fail
# with: 'float' object has no attribute 'decode'
comment_text = df['comment'].astype(str)
df = pd.DataFrame(comment_text)


def chinese_word_cut(mytext):
    """Segment *mytext* with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(mytext)
    return ' '.join(tokens)


# Store the segmented text in a new column
df['content_cutted'] = df['comment'].apply(chinese_word_cut)
print(df['content_cutted'].head())
23 | 
#-------------------  Step 3: compute TF-IDF values  ---------------------
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Upper bound on the vocabulary size
n_features = 2000

# Tokens that must never enter the vocabulary (function words, punctuation,
# numbers and other filler found in the comments).
custom_stop_words = [
    '的','或','等','是','有','之','与','可以','还是','比较','这里',
    '一个','和','也','被','吗','于','中','最','但是','图片','大家',
    '一下','几天','200','还有','一看','300','50','哈哈哈哈',
    '“','”','。','，','？','、','；','怎么','本来','发现',
    'and','in','of','the','我们','一直','真的','18','一次',
    '了','有些','已经','不是','这么','一一','一天','这个','这种',
    '一种','位于','之一','天空','没有','很多','有点','什么','五个',
    '特别',
]

tf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    max_features=n_features,
    stop_words=custom_stop_words,
    max_df=0.99,     # drop terms whose document frequency is too high ...
    min_df=0.002,    # ... or too low
)
tf = tf_vectorizer.fit_transform(df['content_cutted'])

print(tf.shape)
print(tf)
46 | 
#---------------------  Step 4: LDA topic model  ----------------------
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract
n_topics = 3

# Original note: on Python 2.x setups the keyword was spelled n_topics=n_topics
lda_params = dict(
    n_components=n_topics,
    max_iter=100,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_params)
lda.fit(tf)

# Topic-word matrix: one row per topic, one column per vocabulary term
topic_word = lda.components_
print(topic_word)
print(topic_word.shape)

# Perplexity of the fitted model on the training matrix
print(u'困惑度：')
print(lda.perplexity(tf, sub_sampling=False))
69 | 
# Topic -> keyword listing
def print_top_words(model, tf_feature_names, n_top_words):
    """Print, for every topic of *model*, its highest-weighted terms.

    model            -- object exposing ``components_`` (one weight row per topic)
    tf_feature_names -- sequence mapping a column index to its term
    n_top_words      -- how many terms to print per topic
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices sorted by descending weight, truncated to the requested count
        ranked = weights.argsort()[::-1][:n_top_words]
        keywords = [tf_feature_names[i] for i in ranked]
        print('Topic #%d:' % topic_idx)
        print(' '.join(keywords))
        print("")
76 | 
# Print the 20 highest-weighted keywords of every topic
n_top_words = 20
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# Run the report
print_top_words(lda, tf_feature_names, n_top_words)
82 | 
83 | 
#-------------------  Step 5: interactive visualisation  ---------------------
import pyLDAvis
try:
    # pyLDAvis >= 3.4 ships the scikit-learn adapter under a new module name
    import pyLDAvis.lda_model as pyLDAvis_sklearn
except ImportError:
    # Older releases still expose it as pyLDAvis.sklearn
    import pyLDAvis.sklearn as pyLDAvis_sklearn

# Uncomment when running inside a Jupyter notebook:
#pyLDAvis.enable_notebook()

data = pyLDAvis_sklearn.prepare(lda, tf, tf_vectorizer)
print(data)

# Display the interactive figure
pyLDAvis.show(data)

# Optional export kept from the original script:
#pyLDAvis.save_json(data,' fileobj.html')
97 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/result01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog29-DataPreprocessing&KNN/result01.png


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/result02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog29-DataPreprocessing&KNN/result02.png


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test01-data pre.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import numpy as np
 3 | import pandas as pd
 4 | import csv
 5 | 
 6 | """
 7 | 功能：数据预处理 将KDD99数据集中字符型转换为数值型
 8 | 原文：https://blog.csdn.net/asialee_bird/article/details/80491256
 9 | 
10 | 强烈推荐博友们阅读asialee_bird大神的文章及Github代码，非常厉害的一位博主。
11 | 修订：Eastmount 2019-11-22
12 | """
13 | 
# label_list is declared global here (a no-op at module level, kept as-is)
global label_list

# Input (raw KDD99 records) and output (numeric CSV) file names
source_file='kddcup.data_10_percent_corrected'
handled_file='kddcup.data_10_percent_corrected.csv'

# Output handle for csv.writer. On Python 3 the writer emits str, so the file
# must be opened in text mode ('wb+' raises TypeError); newline='' prevents the
# blank line after every row on Windows.
data_file = open(handled_file, 'w', newline='')
23 | 
24 | 
# Helper used to turn symbolic values into numeric codes
def find_index(x,y):
    """Return the list of every position in *y* whose element equals *x*."""
    positions = []
    for pos, item in enumerate(y):
        if item == x:
            positions.append(pos)
    return positions
28 | 
29 | 
# Convert the protocol type of a record (3 possible values) to a numeric code
def handleProtocol(inputs):
    """Return the numeric code of the protocol stored in ``inputs[1]``.

    The code is the position of the value in ['tcp', 'udp', 'icmp'].
    Returns None when the value is not one of the known protocols.
    """
    protocol_list=['tcp','udp','icmp']
    try:
        # One lookup instead of a membership test followed by a full index scan
        return protocol_list.index(inputs[1])
    except ValueError:
        return None
35 | 
36 | 
# Convert the network service of a record (70 possible values) to a numeric code
def handleService(inputs):
    """Return the numeric code of the service stored in ``inputs[2]``.

    The code is the position of the value in the service list below.
    Returns None when the value is not one of the known services.
    """
    service_list=['aol','auth','bgp','courier','csnet_ns','ctf','daytime','discard','domain','domain_u',
                  'echo','eco_i','ecr_i','efs','exec','finger','ftp','ftp_data','gopher','harvest','hostnames',
                  'http','http_2784','http_443','http_8001','imap4','IRC','iso_tsap','klogin','kshell','ldap',
                  'link','login','mtp','name','netbios_dgm','netbios_ns','netbios_ssn','netstat','nnsp','nntp',
                  'ntp_u','other','pm_dump','pop_2','pop_3','printer','private','red_i','remote_job','rje','shell',
                  'smtp','sql_net','ssh','sunrpc','supdup','systat','telnet','tftp_u','tim_i','time','urh_i','urp_i',
                  'uucp','uucp_path','vmnet','whois','X11','Z39_50']
    try:
        # One lookup instead of a membership test followed by a full index scan
        return service_list.index(inputs[2])
    except ValueError:
        return None
48 | 
49 | 
# Convert the connection status flag of a record (11 possible values) to a numeric code
def handleFlag(inputs):
    """Return the numeric code of the connection flag stored in ``inputs[3]``.

    The code is the position of the value in the flag list below.
    Returns None when the value is not one of the known flags.
    """
    flag_list=['OTH','REJ','RSTO','RSTOS0','RSTR','S0','S1','S2','S3','SF','SH']
    try:
        # One lookup instead of a membership test followed by a full index scan
        return flag_list.index(inputs[3])
    except ValueError:
        return None
55 | 
56 | 
# Convert the attack label of a record to a numeric code.
# (The training set contains 22 attack types; 17 more appear only in the test set.)
#
# The list lives at module level so that labels discovered at run time keep
# their code across calls. The original rebuilt the list inside the function
# (so every unseen label collapsed onto the same code) and declared
# `global label_list` after assigning it, which Python 3 rejects as a SyntaxError.
label_list=['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', 'smurf.',
            'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.', 'ipsweep.', 'land.', 'ftp_write.',
            'back.', 'imap.', 'satan.', 'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
            'spy.', 'rootkit.']


def handleLabel(inputs):
    """Return the numeric code of the label stored in ``inputs[41]``.

    Known labels map to their position in the module-level ``label_list``.
    An unseen label is appended to that list and receives the next free code,
    which it keeps for the rest of the run.
    """
    label = inputs[41]
    if label not in label_list:
        label_list.append(label)
    return label_list.index(label)
70 | 
71 | 
# Script entry point
if __name__=='__main__':
    # (column index, encoder) pairs, applied in this order to every record:
    # protocol type, network service, connection flag, attack label
    column_encoders = (
        (1, handleProtocol),
        (2, handleService),
        (3, handleFlag),
        (41, handleLabel),
    )
    # Stream the source file record by record
    with open(source_file,'r') as data_source:
        csv_writer = csv.writer(data_file)
        count = 0   # number of records processed
        for row in csv.reader(data_source):
            temp_line = np.array(row)
            for column, encode in column_encoders:
                temp_line[column] = encode(row)
            csv_writer.writerow(temp_line)
            count += 1

            # Debug aid: show the converted fields of each record
            #print(count,'status:',temp_line[1],temp_line[2],temp_line[3],temp_line[41])
        data_file.close()
91 | 
92 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test02-zscoreNormalization.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import numpy as np
 3 | import pandas as pd
 4 | import csv
 5 | 
 6 | #全局变量
 7 | global x_mat
 8 | 
 9 | #数据标准化
10 | def ZscoreNormalization(x, n):
11 |     print(len(x))
12 |     i = 0
13 |     while i<len(x):
14 |         x_mat[i][n] = (x[i] - np.mean(x)) / np.std(x)
15 |         #if x_mat[i][n]>0:
16 |         #    print(x_mat[i][n])
17 |         i = i + 1
18 |     print("The ", n , "feature  is normal.")
19 | 
#------------------------------------- Load the file into a matrix -----------------------------------------
# Read the whole input inside a context manager so the handle is always released.
with open("kddcup.data_10_percent_corrected.csv") as fr:
    lines = fr.readlines()
# Output handle for csv.writer: Python 3 needs text mode ('wb+' raises TypeError);
# newline='' prevents the blank line after every row on Windows.
data_file = open("kddcup.data_10_percent_corrected-result.csv", 'w', newline='')
line_nums = len(lines)
print(line_nums)

# Matrix with line_nums rows and 42 columns (all fields of a record)
x_mat = np.zeros((line_nums, 42))

# Parse each line into one matrix row
for i in range(line_nums):
    line = lines[i].strip()
    item_mat = line.split(',')
    x_mat[i, :] = item_mat[0:42]    # the 42 fields of the record
print(x_mat.shape)
37 | 
#-------------------------------- Standardise the selected feature columns -----------------------------
print(len(x_mat[:, 0])) # number of rows (494021 per the original note)
print(len(x_mat[0, :])) # number of columns (42)

# Columns to standardise, with the feature name noted in the original script.
# The original passed x_mat[:, 0] on every call, so each listed column was
# overwritten with values derived from column 0 instead of its own data.
zscore_columns = [
    (0, 'duration'), (4, 'src_bytes'), (5, 'dst_bytes'),
    (7, 'wrong_fragment'), (8, 'urgent'),

    (9, 'hot'), (10, 'num_failed_logins'), (12, 'num_compromised'),
    (14, 'su_attempte'), (15, 'num_root'), (16, 'num_file_creations'),
    (17, 'num_shells'), (18, 'num_access_files'), (19, 'num_outbound_cmds'),

    (22, 'count'), (23, 'srv_count'), (24, 'serror_rate'),
    (25, 'srv_serror_rate'), (26, 'rerror_rate'), (27, 'srv_rerror_rate'),
    (28, 'same_srv_rate'), (29, 'diff_srv_rate'), (30, 'srv_diff_host_rate'),

    (31, 'dst_host_count'), (32, 'dst_host_srv_count'),
    (33, 'dst_host_same_srv_rate'), (34, 'dst_host_diff_srv_rate'),
    (35, 'dst_host_same_src_port_rate'), (36, 'dst_host_srv_diff_host_rate'),
    (37, 'dst_host_serror_rate'), (38, 'dst_host_srv_serror_rate'),
    (39, 'dst_host_rerror_rate'), (40, 'dst_host_srv_rerror_rate'),
]

for column, _feature_name in zscore_columns:
    # Pass a copy: the function writes into x_mat[:, column] while reading x
    ZscoreNormalization(x_mat[:, column].copy(), column)
79 | 
# Write the matrix out, one CSV row per sample, then release the file
csv_writer = csv.writer(data_file)
for sample in x_mat:
    csv_writer.writerow(sample)
data_file.close()
87 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test03-minmax.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import numpy as np
 3 | import pandas as pd
 4 | import csv
 5 | 
 6 | #全局变量
 7 | global x_mat
 8 | 
 9 | #数据归一化
10 | def MinmaxNormalization(x, n):
11 |     print(len(x))
12 |     i = 0
13 |     while i<len(x):
14 |         x_mat[i][n] = (x[i] - np.min(x)) / (np.max(x)-np.min(x))
15 |         #if x_mat[i][n]>0:
16 |         #    print(x_mat[i][n])
17 |         i = i + 1
18 |     print("The ", n , "feature  is normal.")
19 | 
#------------------------------------- Load the file into a matrix -----------------------------------------
# Read the whole input inside a context manager so the handle is always released.
with open("kddcup.data_10_percent_corrected-result.csv") as fr:
    lines = fr.readlines()
# Output handle for csv.writer: Python 3 needs text mode ('wb+' raises TypeError);
# newline='' prevents the blank line after every row on Windows.
data_file = open("kddcup.data_10_percent_corrected-result-minmax.csv", 'w', newline='')
line_nums = len(lines)
print(line_nums)

# Matrix with line_nums rows and 42 columns (all fields of a record)
x_mat = np.zeros((line_nums, 42))

# Parse each line into one matrix row
for i in range(line_nums):
    line = lines[i].strip()
    item_mat = line.split(',')
    x_mat[i, :] = item_mat[0:42]    # the 42 fields of the record
print(x_mat.shape)
37 | 
#-------------------------------- Normalise the selected feature columns -----------------------------
print(len(x_mat[:, 0])) # number of rows (494021 per the original note)
print(len(x_mat[0, :])) # number of columns (42)

# Columns to normalise, with the feature name noted in the original script.
# The original passed x_mat[:, 0] on every call, so each listed column was
# overwritten with values derived from column 0 instead of its own data.
minmax_columns = [
    (0, 'duration'), (4, 'src_bytes'), (5, 'dst_bytes'),
    (7, 'wrong_fragment'), (8, 'urgent'),

    (9, 'hot'), (10, 'num_failed_logins'), (12, 'num_compromised'),
    (14, 'su_attempte'), (15, 'num_root'), (16, 'num_file_creations'),
    (17, 'num_shells'), (18, 'num_access_files'), (19, 'num_outbound_cmds'),

    (22, 'count'), (23, 'srv_count'), (24, 'serror_rate'),
    (25, 'srv_serror_rate'), (26, 'rerror_rate'), (27, 'srv_rerror_rate'),
    (28, 'same_srv_rate'), (29, 'diff_srv_rate'), (30, 'srv_diff_host_rate'),

    (31, 'dst_host_count'), (32, 'dst_host_srv_count'),
    (33, 'dst_host_same_srv_rate'), (34, 'dst_host_diff_srv_rate'),
    (35, 'dst_host_same_src_port_rate'), (36, 'dst_host_srv_diff_host_rate'),
    (37, 'dst_host_serror_rate'), (38, 'dst_host_srv_serror_rate'),
    (39, 'dst_host_rerror_rate'), (40, 'dst_host_srv_rerror_rate'),
]

for column, _feature_name in minmax_columns:
    # Pass a copy: the function writes into x_mat[:, column] while reading x
    MinmaxNormalization(x_mat[:, column].copy(), column)
79 | 
# Write the matrix out, one CSV row per sample, then release the file
csv_writer = csv.writer(data_file)
for sample in x_mat:
    csv_writer.writerow(sample)
data_file.close()
87 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test04-knn-roc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-  
  2 | import os
  3 | import csv
  4 | import numpy as np
  5 | from sklearn.svm import SVC  
  6 | from sklearn import metrics
  7 | import matplotlib.pyplot as plt
  8 | from matplotlib.colors import ListedColormap
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn.decomposition import PCA
 11 | from sklearn import neighbors
 12 | 
 13 | #-----------------------------------------第一步 加载数据集-----------------------------------------
 14 | fr= open("kddcup.data_10_yxz-result-minmax.csv")
 15 | lines = fr.readlines()
 16 | line_nums = len(lines)
 17 | print(line_nums)
 18 | 
 19 | #创建line_nums行 para_num列的矩阵
 20 | x_mat = np.zeros((line_nums, 31))
 21 | y_label = []
 22 | 
 23 | #划分数据集
 24 | for i in range(line_nums):
 25 |     line = lines[i].strip()
 26 |     item_mat = line.split(',')
 27 |     x_mat[i, :] = item_mat[0:31]    #前41个特征
 28 |     y_label.append(item_mat[-1])  #类标
 29 | fr.close()
 30 | print(x_mat.shape)
 31 | print(len(y_label))
 32 | 
 33 | #-----------------------------------------第二步 划分数据集-----------------------------------------
 34 | y = []
 35 | for n in y_label: 
 36 |     y.append(int(float(n)))
 37 | y =  np.array(y, dtype = int) #list转换数组
 38 | 
 39 | #划分数据集 测试集40%
 40 | train_data, test_data, train_target, test_target = train_test_split(x_mat, y, test_size=0.4, random_state=42)
 41 | print(train_data.shape, train_target.shape)
 42 | print(test_data.shape, test_target.shape)
 43 | 
 44 | 
 45 | #-----------------------------------------第三步 KNN训练-----------------------------------------
 46 | def classify(input_vct, data_set):
 47 |     data_set_size = data_set.shape[0]
 48 |     #扩充input_vct到与data_set同型并相减
 49 |     diff_mat = np.tile(input_vct, (data_set_size, 1)) - data_set  
 50 |     sq_diff_mat = diff_mat**2                          #矩阵中每个元素都平方
 51 |     distance = sq_diff_mat.sum(axis=1)**0.5  #每行相加求和并开平方根
 52 |     return distance.min(axis=0)                         #返回最小距离
 53 | 
 54 | test_size = len(test_target)
 55 | result = np.zeros((test_size, 3))
 56 | for i in range(test_size):
 57 |     #序号 最小欧氏距离 测试集数据类别
 58 |     result[i] = i + 1, classify(test_data[i], train_data), test_target[i]
 59 | #矩阵转置
 60 | result = np.transpose(result)  
 61 |     
 62 | #-----------------------------------------第四步 评价及可视化-----------------------------------------
 63 | def roc(data_set):
 64 |     normal = 0
 65 |     data_set_size = data_set.shape[1]
 66 |     roc_rate = np.zeros((2, data_set_size)) #输出ROC曲线 二维矩阵
 67 |     #计算正常请求数量
 68 |     for i in range(data_set_size):
 69 |         if data_set[2][i] == 1:
 70 |             normal += 1
 71 |     abnormal = data_set_size - normal
 72 |     max_dis = data_set[1].max()               #欧式距离最大值
 73 |     for j in range(1000):
 74 |         threshold = max_dis / 1000 * j
 75 |         normal1 = 0
 76 |         abnormal1 = 0
 77 |         for k in range(data_set_size):
 78 |             if data_set[1][k] > threshold and data_set[2][k] == 1:
 79 |                 normal1 += 1
 80 |             if data_set[1][k] > threshold and data_set[2][k] != 1:
 81 |                 abnormal1 += 1
 82 |         roc_rate[0][j] = normal1 / normal           # 阈值以上正常点/全体正常的点
 83 |         roc_rate[1][j] = abnormal1 / abnormal   # 阈值以上异常点/全体异常点
 84 |     return roc_rate
 85 | 
 86 | #图1 散点图
 87 | #横轴为序号 纵轴为最小欧氏距离
 88 | #点中心颜色根据测试集数据类别而定 点外围无颜色 点大小为最小1 灰度为最大1
 89 | plt.figure(1)
 90 | plt.scatter(result[0], result[1], c=result[2], edgecolors='None', s=2, alpha=1)
 91 | 
 92 | #图2 ROC曲线
 93 | #横轴误报率：即阈值以上正常点/全体正常的点
 94 | #纵轴检测率：即阈值以上异常点/全体异常点
 95 | roc_rate = roc(result)
 96 | plt.figure(2)
 97 | plt.scatter(roc_rate[0], roc_rate[1], edgecolors='None', s=1, alpha=1)    
 98 | plt.show()
 99 | 
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test05-knn-gitHub-roc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-  
  2 | import os
  3 | import csv
  4 | import numpy as np
  5 | from sklearn.svm import SVC  
  6 | from sklearn import metrics
  7 | import matplotlib.pyplot as plt
  8 | from matplotlib.colors import ListedColormap
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn.decomposition import PCA
 11 | from sklearn import neighbors
 12 | 
 13 | #-----------------------------------------第一步 加载数据集-----------------------------------------
 14 | fr= open("kddcup.data_10_yxz-result-minmax.csv")
 15 | lines = fr.readlines()
 16 | line_nums = len(lines)
 17 | print(line_nums)
 18 | 
 19 | #创建line_nums行 para_num列的矩阵
 20 | x_mat = np.zeros((line_nums, 31))
 21 | y_label = []
 22 | 
 23 | #划分数据集
 24 | for i in range(line_nums):
 25 |     line = lines[i].strip()
 26 |     item_mat = line.split(',')
 27 |     x_mat[i, :] = item_mat[0:31]    #前41个特征
 28 |     y_label.append(item_mat[-1])  #类标
 29 | fr.close()
 30 | print(x_mat.shape)
 31 | print(len(y_label))
 32 | 
 33 | #-----------------------------------------第二步 划分数据集-----------------------------------------
 34 | y = []
 35 | for n in y_label: 
 36 |     y.append(int(float(n)))
 37 | y =  np.array(y, dtype = int) #list转换数组
 38 | 
 39 | #划分数据集 测试集40%
 40 | train_data, test_data, train_target, test_target = train_test_split(x_mat, y, test_size=0.4, random_state=42)
 41 | print(train_data.shape, train_target.shape)
 42 | print(test_data.shape, test_target.shape)
 43 | 
 44 | 
 45 | #-----------------------------------------第三步 KNN训练-----------------------------------------
 46 | def classify(input_vct, data_set):
 47 |     data_set_size = data_set.shape[0]
 48 |     #扩充input_vct到与data_set同型并相减
 49 |     diff_mat = np.tile(input_vct, (data_set_size, 1)) - data_set  
 50 |     sq_diff_mat = diff_mat**2                          #矩阵中每个元素都平方
 51 |     distance = sq_diff_mat.sum(axis=1)**0.5  #每行相加求和并开平方根
 52 |     return distance.min(axis=0)                         #返回最小距离
 53 | 
 54 | test_size = len(test_target)
 55 | result = np.zeros((test_size, 3))
 56 | for i in range(test_size):
 57 |     #序号 最小欧氏距离 测试集数据类别
 58 |     result[i] = i + 1, classify(test_data[i], train_data), test_target[i]
 59 | #矩阵转置
 60 | result = np.transpose(result)  
 61 |     
 62 | #-----------------------------------------第四步 评价及可视化-----------------------------------------
 63 | def roc(data_set):
 64 |     normal = 0
 65 |     data_set_size = data_set.shape[1]
 66 |     roc_rate = np.zeros((2, data_set_size)) #输出ROC曲线 二维矩阵
 67 |     #计算正常请求数量
 68 |     for i in range(data_set_size):
 69 |         if data_set[2][i] == 1:
 70 |             normal += 1
 71 |     abnormal = data_set_size - normal
 72 |     max_dis = data_set[1].max()               #欧式距离最大值
 73 |     for j in range(1000):
 74 |         threshold = max_dis / 1000 * j
 75 |         normal1 = 0
 76 |         abnormal1 = 0
 77 |         for k in range(data_set_size):
 78 |             if data_set[1][k] > threshold and data_set[2][k] == 1:
 79 |                 normal1 += 1
 80 |             if data_set[1][k] > threshold and data_set[2][k] != 1:
 81 |                 abnormal1 += 1
 82 |         roc_rate[0][j] = normal1 / normal           # 阈值以上正常点/全体正常的点
 83 |         roc_rate[1][j] = abnormal1 / abnormal   # 阈值以上异常点/全体异常点
 84 |     return roc_rate
 85 | 
 86 | #图1 散点图
 87 | #横轴为序号 纵轴为最小欧氏距离
 88 | #点中心颜色根据测试集数据类别而定 点外围无颜色 点大小为最小1 灰度为最大1
 89 | plt.figure(1)
 90 | plt.scatter(result[0], result[1], c=result[2], edgecolors='None', s=2, alpha=1)
 91 | 
 92 | #图2 ROC曲线
 93 | #横轴误报率：即阈值以上正常点/全体正常的点
 94 | #纵轴检测率：即阈值以上异常点/全体异常点
 95 | roc_rate = roc(result)
 96 | plt.figure(2)
 97 | plt.scatter(roc_rate[0], roc_rate[1], edgecolors='None', s=1, alpha=1)    
 98 | plt.show()
 99 | 
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------
/blog29-DataPreprocessing&KNN/test06-knn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-  
 2 | import os
 3 | import csv
 4 | import numpy as np
 5 | import pandas as pd
 6 | from sklearn import metrics
 7 | import matplotlib.pyplot as plt
 8 | from matplotlib.colors import ListedColormap
 9 | from sklearn.model_selection import train_test_split
10 | from sklearn.decomposition import PCA
11 | from sklearn import neighbors
12 | 
#----------------------------------------- Step 1: load the data set -----------------------------------------
# Read everything inside a context manager so the handle is released even if a
# row later fails to parse.
with open("kddcup.data_10_percent_corrected-result-minmax.csv") as fr:
    lines = fr.readlines()
line_nums = len(lines)
print(line_nums)

# Feature matrix: line_nums rows x 41 columns
x_mat = np.zeros((line_nums, 41))
y_label = []

# Split every line into its features and its class label
for i in range(line_nums):
    line = lines[i].strip()
    item_mat = line.split(',')
    x_mat[i, :] = item_mat[0:41]    # first 41 fields are the features
    y_label.append(item_mat[-1])    # last field is the class label
# print() calls instead of Python 2 print statements (SyntaxError on Python 3)
print(x_mat.shape)
print(len(y_label))
32 | 
33 | 
#----------------------------------------- Step 2: split the data set -----------------------------------------
# Labels are stored as text such as "1.0"; convert them to an integer array
y = np.array([int(float(n)) for n in y_label], dtype=int)

# Hold out 40% of the samples for testing
train_data, test_data, train_target, test_target = train_test_split(x_mat, y, test_size=0.4, random_state=42)
# print() calls instead of Python 2 print statements (SyntaxError on Python 3)
print(train_data.shape, train_target.shape)
print(test_data.shape, test_target.shape)
44 | 
45 | 
#----------------------------------------- Step 3: train the KNN classifier -----------------------------------------
clf = neighbors.KNeighborsClassifier()
clf.fit(train_data, train_target)
# print() calls instead of Python 2 print statements (SyntaxError on Python 3)
print(clf)
result = clf.predict(test_data)
print(result)
print(test_target)
53 | 
54 | 
#----------------------------------------- Step 4: evaluate the classifier -----------------------------------------
# Number of predictions that match the true label
# (print() call instead of a Python 2 print statement)
print((result == test_target).sum())
print(metrics.classification_report(test_target, result))  # precision, recall, F-score
58 | 
59 | 
#---------------------------------------- Step 5: 2-D projection plot ---------------------------------------
# Project the test samples onto their first two principal components
pca = PCA(n_components=2)
newData = pca.fit_transform(test_data)
first_component, second_component = newData[:,0], newData[:,1]
plt.figure()
# Colour every point by its true class
plt.scatter(first_component, second_component, c=test_target, s=50)
plt.show()
66 | 


--------------------------------------------------------------------------------