├── README.md ├── blog02-Kmeans └── test01.py ├── blog03-Kmeans-yh ├── data.txt ├── result01.png ├── test01.py └── test02.py ├── blog04-DTC ├── result.png ├── test01.py ├── test02.py ├── test03.py └── test04.py ├── blog05-LR ├── res.png ├── res02.png ├── res03.png ├── test01.py ├── test02.py ├── test03.py └── test04.py ├── blog06-Numpy+Matplotlib ├── data.xls ├── test01.py ├── test02.py ├── test03.py ├── test04.py ├── test05-matplotlib.py ├── test06-matplotlib.py └── test07-matplotlib.py ├── blog07-pac ├── result01.png ├── result02.png ├── test01.py ├── test02.py └── test03.py ├── blog08-Apriori └── test01.py ├── blog09-LinearRegression ├── Index ├── glass.csv ├── glass.data ├── glass.names ├── glass.tag ├── result01.png ├── result02.png ├── result03.png ├── test01.py └── test02.py ├── blog10-Pandas ├── 41.txt ├── bankloan.png ├── ccc.png ├── data.csv ├── data2.xlsx ├── guiyang.png ├── test01.py ├── test02.py ├── test03.py ├── test04.py ├── test05.py ├── test06-dalian.py ├── test07.py ├── 时序图.png └── 贵阳自相关图.png ├── blog11-Matplotlib+SQL ├── test01.py ├── test02.py ├── test03.py └── test04.py ├── blog12-matplotlib+SQL ├── test01.py ├── test02.py ├── test03.py └── test04.py ├── blog13-wordcloud ├── cloudimg.png ├── mb.png ├── result01.png ├── test.txt ├── test01.py └── test02.py ├── blog14-curve_fit ├── data.csv ├── result01.png ├── result02.png ├── test01.py ├── test02.py ├── test03.py ├── test04.py ├── test3.png └── test4.png ├── blog15-imshow ├── result01.png ├── result02.png ├── result03.png ├── test01.py ├── test02.py ├── test03.py ├── test04.py ├── test05.py ├── test06.py └── test07.py ├── blog16-LR ├── result01.png ├── result02.png ├── result03.png ├── test01.py ├── test02.py └── test03.py ├── blog17-networkx ├── result01.png ├── test01.py └── test02.py ├── blog18-Regression ├── blog01-LR.py ├── blog02-LR.py ├── blog03-boston.py ├── blog04-boson.py ├── blog05-random.py ├── blog06-random.py ├── blog07-3Drandom.py ├── blog08-PolynomialFeatures.py ├── 
blog09-PolynomialFeatures.py ├── result01.png ├── result02.png ├── result03.png ├── result04.png ├── result05.png └── result06.png ├── blog19-Iris ├── result01.png ├── result02.png ├── result03.png ├── result04.png ├── result05.png ├── result06.png ├── result07.png ├── result08.png ├── result09.png ├── test01.py ├── test02-hist.py ├── test03-plot.py ├── test04-kde.py ├── test05-box.py ├── test06-box.py ├── test07-show.py ├── test08-LR.py ├── test09-Kmeans.py ├── test10-Kmeans.py └── test11-Kmeans.py ├── blog20-KNN ├── blog01.py ├── blog02.py ├── blog03.py ├── blog04.py ├── result.png ├── result02.png └── wine.txt ├── blog21-NB ├── blog01.py ├── blog02.py ├── blog03.py ├── blog04-getdata.py ├── blog05-fenci.py ├── blog06-static.py ├── blog07-classifier.py ├── data.csv ├── data_preprocess.py ├── result.png ├── result2.png ├── seed.txt ├── seed_x.csv └── seed_y.csv ├── blog22-Basemap ├── 001.png ├── 002.png ├── 003.png ├── 004.png ├── 005.png ├── 006.png ├── basemap下载.txt ├── blog-001.py ├── blog-002.py ├── blog-003.py ├── blog-004.py ├── blog-005.py ├── blog-006.py └── blog-007.py ├── blog23-statsmodels ├── blog01.py ├── blog02.py ├── blog03_show.py ├── blog04_show.py ├── blog05_groupby.py ├── blog06_ARIMA.py ├── blog07_ARIMA.py ├── blog08_statsmodels.py ├── blog09_statsmodels.py ├── result01.png ├── result02.png ├── result03.png ├── result04.png └── result05.png ├── blog24-Kmeans-Chinese ├── BaiduSpiderSpots.rar ├── HudongSpider_Result.txt ├── blog01_merge.py ├── blog02_spider.py ├── blog03_fenci.py ├── blog04_kmeans.py └── result.png ├── blog25-Matplotlib ├── allname.txt ├── plot.png ├── test01-show.py ├── test02-show.py ├── test03-kmeans.py └── test04-kmeans.py ├── blog26-SnowNLP ├── data.txt ├── result01.png ├── result02.png ├── result03.png ├── result04.png ├── result05.png ├── test-douban.csv ├── test01-spider.py ├── test02-wordcloud.py ├── test03-snownlp01.py ├── test04-snownlp02.py ├── test05-snownlp03.py ├── test06-snownlp-show.py ├── test07-snownlp-show.py 
└── test08-snownlp-show.py ├── blog27-SVM&WineDataset ├── data intro.txt ├── result01.png ├── result02.png ├── test01-svm.py ├── test02-datapre.py ├── test03-svm.py ├── test04-update.py └── wine.txt ├── blog28-LDA&pyLDAvis ├── data.csv ├── result.png ├── test01-read.py ├── test02-jieba.py ├── test03-tfidf.py ├── test04-lda.py └── test05-pyLDAvis.py └── blog29-DataPreprocessing&KNN ├── kddcup.data_10_percent_corrected ├── kddcup.data_10_percent_corrected-result-minmax.csv ├── kddcup.data_10_percent_corrected-result.csv ├── kddcup.data_10_percent_corrected.csv ├── result01.png ├── result02.png ├── test01-data pre.py ├── test02-zscoreNormalization.py ├── test03-minmax.py ├── test04-knn-roc.py ├── test05-knn-gitHub-roc.py └── test06-knn.py /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Data-Mining 2 | 该资源为作者在CSDN撰写Python数据挖掘和数据分析文章的支撑,主要是用Python实现数据挖掘、机器学习、文本挖掘等算法的代码,希望该资源对您有所帮助,一起加油。 3 | 4 | > 该部分代码修改成了Python 3.x版本,与Python 2.x略微不同。
5 | > 大家注意其差异即可,这也是为了更好地帮助同学们适应新的版本。 6 | 7 | --- 8 | 9 | 具体内容请参照如下CSDN博客: 10 | 11 | [【Python数据挖掘课程】一.安装Python及爬虫入门介绍](https://blog.csdn.net/eastmount/article/details/52577215)
12 | [【Python数据挖掘课程】二.Kmeans聚类数据分析及Anaconda介绍](https://blog.csdn.net/eastmount/article/details/52777308)
13 | [【Python数据挖掘课程】三.Kmeans聚类代码实现、作业及优化](https://blog.csdn.net/eastmount/article/details/52793549)
14 | [【Python数据挖掘课程】四.决策树DTC数据分析及鸢尾数据集分析](https://blog.csdn.net/eastmount/article/details/52820400)
15 | [【Python数据挖掘课程】五.线性回归知识及预测糖尿病实例](https://blog.csdn.net/eastmount/article/details/52929765)
16 | [【Python数据挖掘课程】六.Numpy、Pandas和Matplotlib包基础知识](https://blog.csdn.net/eastmount/article/details/53144633)
17 | [【Python数据挖掘课程】七.PCA降维操作及subplot子图绘制](https://blog.csdn.net/eastmount/article/details/53285192)
18 | [【Python数据挖掘课程】八.关联规则挖掘及Apriori实现购物推荐](https://blog.csdn.net/eastmount/article/details/53368440)
19 | [【Python数据挖掘课程】九.回归模型LinearRegression简单分析氧化物数据](https://blog.csdn.net/eastmount/article/details/60468818)
20 | [【python数据挖掘课程】十.Pandas、Matplotlib、PCA绘图实用代码补充](https://blog.csdn.net/eastmount/article/details/60675865)
21 | [【python数据挖掘课程】十一.Pandas、Matplotlib结合SQL语句可视化分析](https://blog.csdn.net/eastmount/article/details/62489186)
22 | [【python数据挖掘课程】十二.Pandas、Matplotlib结合SQL语句对比图分析](https://blog.csdn.net/eastmount/article/details/64127445)
23 | [【python数据挖掘课程】十三.WordCloud词云配置过程及词频分析](https://blog.csdn.net/eastmount/article/details/64438407)
24 | [【python数据挖掘课程】十四.Scipy调用curve_fit实现曲线拟合](https://blog.csdn.net/eastmount/article/details/71308373)
25 | [【python数据挖掘课程】十五.Matplotlib调用imshow()函数绘制热图](https://blog.csdn.net/eastmount/article/details/73392106)
26 | [【python数据挖掘课程】十六.逻辑回归LogisticRegression分析鸢尾花数据](https://blog.csdn.net/eastmount/article/details/77920470)
27 | [【python数据挖掘课程】十七.社交网络Networkx库分析人物关系(初识篇)](https://blog.csdn.net/eastmount/article/details/78452581)
28 | [【python数据挖掘课程】十八.线性回归及多项式回归分析四个案例分享](https://blog.csdn.net/eastmount/article/details/78635096)
29 | [【python数据挖掘课程】十九.鸢尾花数据集可视化、线性回归、决策树花样分析](https://blog.csdn.net/eastmount/article/details/78692227)
30 | [【python数据挖掘课程】二十.KNN最近邻分类算法分析详解及平衡秤TXT数据集读取](https://blog.csdn.net/eastmount/article/details/78747128)
31 | [【python数据挖掘课程】二十一.朴素贝叶斯分类器详解及中文文本舆情分析](https://blog.csdn.net/eastmount/article/details/79128235)
32 | [【python数据挖掘课程】二十二.Basemap地图包安装入门及基础知识讲解](https://blog.csdn.net/eastmount/article/details/79188415)
33 | [【python数据挖掘课程】二十三.时间序列金融数据预测及Pandas库详解](https://blog.csdn.net/eastmount/article/details/79188415)
34 | [【python数据挖掘课程】二十四.KMeans文本聚类分析互动百科语料](https://blog.csdn.net/eastmount/article/details/80935427)
35 | [【python数据挖掘课程】二十五.Matplotlib绘制带主题及聚类类标的散点图](https://blog.csdn.net/Eastmount/article/details/81106487)
36 | [【python数据挖掘课程】二十六.基于SnowNLP的豆瓣评论情感分析](https://blog.csdn.net/Eastmount/article/details/85118818)
37 | [【python数据挖掘课程】二十七.基于SVM分类器的红酒数据分析](https://blog.csdn.net/Eastmount/article/details/86512901)
38 | [【python数据挖掘课程】二十八.基于LDA和pyLDAvis的主题挖掘及可视化分析](https://blog.csdn.net/Eastmount/article/details/91380607)
39 | [【python数据挖掘课程】二十九.数据预处理之字符型转换数值型、标准化、归一化处理](https://blog.csdn.net/Eastmount/article/details/103212931)
40 | 41 | 42 | 效果图显示如下: 43 | 44 |
45 | 46 |
47 |
48 | 49 |
50 | 51 |
52 | 53 |
54 | 55 |
56 | 57 | 58 | 59 |
60 | 61 | --- 62 | 63 | 都是非常基础的文章,如果有错误或不足之处,还请告知及海涵,谢谢您的鼓励与支持,请帮忙点个Star!您的支持是我最大的动力,共勉~ 64 | 65 | 数据挖掘相关知识分享。 66 | 67 | By:杨秀璋 Eastmount 68 | 69 | 2021-01-21 70 | 71 | -------------------------------------------------------------------------------- /blog02-Kmeans/test01.py: -------------------------------------------------------------------------------- 1 | """ 2 | 第一部分:导入包 3 | 从sklearn.cluster机器学习聚类包中导入KMeans聚类 4 | """ 5 | # coding=utf-8 6 | from sklearn.cluster import Birch 7 | from sklearn.cluster import KMeans 8 | 9 | """ 10 | 第二部分:数据集 11 | X表示二维矩阵数据,篮球运动员比赛数据 12 | 总共20行,每行两列数据 13 | 第一列表示球员每分钟助攻数:assists_per_minute 14 | 第二列表示球员每分钟得分数:points_per_minute 15 | """ 16 | 17 | X = [[0.0888, 0.5885], 18 | [0.1399, 0.8291], 19 | [0.0747, 0.4974], 20 | [0.0983, 0.5772], 21 | [0.1276, 0.5703], 22 | [0.1671, 0.5835], 23 | [0.1906, 0.5276], 24 | [0.1061, 0.5523], 25 | [0.2446, 0.4007], 26 | [0.1670, 0.4770], 27 | [0.2485, 0.4313], 28 | [0.1227, 0.4909], 29 | [0.1240, 0.5668], 30 | [0.1461, 0.5113], 31 | [0.2315, 0.3788], 32 | [0.0494, 0.5590], 33 | [0.1107, 0.4799], 34 | [0.2521, 0.5735], 35 | [0.1007, 0.6318], 36 | [0.1067, 0.4326], 37 | [0.1956, 0.4280] 38 | ] 39 | 40 | #输出数据集 41 | print(X) 42 | 43 | 44 | """ 45 | 第三部分:KMeans聚类 46 | clf = KMeans(n_clusters=3) 表示类簇数为3,聚成3类数据,clf即赋值为KMeans 47 | y_pred = clf.fit_predict(X) 载入数据集X,并且将聚类的结果赋值给y_pred 48 | """ 49 | 50 | clf = KMeans(n_clusters=3) 51 | y_pred = clf.fit_predict(X) 52 | 53 | #输出完整Kmeans函数,包括很多省略参数 54 | print(clf) 55 | #输出聚类预测结果,20行数据,每个y_pred对应X一行或一个球员,聚成3类,类标为0、1、2 56 | print(y_pred) 57 | 58 | 59 | """ 60 | 第四部分:可视化绘图 61 | Python导入Matplotlib包,专门用于绘图 62 | import matplotlib.pyplot as plt 此处as相当于重命名,plt用于显示图像 63 | """ 64 | 65 | import numpy as np 66 | import matplotlib.pyplot as plt 67 | 68 | #获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列 69 | x = [n[0] for n in X] 70 | print(x) 71 | y = [n[1] for n in X] 72 | print(y) 73 | 74 | #绘制散点图 参数:x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点 75 | plt.scatter(x, y, c=y_pred, 
marker='x') 76 | 77 | #绘制标题 78 | plt.title("Kmeans-Basketball Data") 79 | 80 | #绘制x轴和y轴坐标 81 | plt.xlabel("assists_per_minute") 82 | plt.ylabel("points_per_minute") 83 | 84 | #设置右上角图例 85 | plt.legend(["A","B","C"]) 86 | 87 | #显示图形 88 | plt.show() 89 | -------------------------------------------------------------------------------- /blog03-Kmeans-yh/data.txt: -------------------------------------------------------------------------------- 1 | 0.0888 201 36.02 28 0.5885 2 | 0.1399 198 39.32 30 0.8291 3 | 0.0747 198 38.8 26 0.4974 4 | 0.0983 191 40.71 30 0.5772 5 | 0.1276 196 38.4 28 0.5703 6 | 0.1671 201 34.1 31 0.5835 7 | 0.1906 193 36.2 30 0.5276 8 | 0.1061 191 36.75 27 0.5523 9 | 0.2446 185 38.43 29 0.4007 10 | 0.167 203 33.54 24 0.477 11 | 0.2485 188 35.01 27 0.4313 12 | 0.1227 198 36.67 29 0.4909 13 | 0.124 185 33.88 24 0.5668 14 | 0.1461 191 35.59 30 0.5113 15 | 0.2315 191 38.01 28 0.3788 16 | 0.0494 193 32.38 32 0.559 17 | 0.1107 196 35.22 25 0.4799 18 | 0.2521 183 31.73 29 0.5735 19 | 0.1007 193 28.81 34 0.6318 20 | 0.1067 196 35.6 23 0.4326 21 | 0.1956 188 35.28 32 0.428 22 | 0.1828 191 29.54 28 0.4401 23 | 0.1627 196 31.35 28 0.5581 24 | 0.1403 198 33.5 23 0.4866 25 | 0.1563 193 34.56 32 0.5267 26 | 0.2681 183 39.53 27 0.5439 27 | 0.1236 196 26.7 34 0.4419 28 | 0.13 188 30.77 26 0.3998 29 | 0.0896 198 25.67 30 0.4325 30 | 0.2071 178 36.22 30 0.4086 31 | 0.2244 185 36.55 23 0.4624 32 | 0.3437 185 34.91 31 0.4325 33 | 0.1058 191 28.35 28 0.4903 34 | 0.2326 185 33.53 27 0.4802 35 | 0.1577 193 31.07 25 0.4345 36 | 0.2327 185 36.52 32 0.4819 37 | 0.1256 196 27.87 29 0.6244 38 | 0.107 198 24.31 34 0.3991 39 | 0.1343 193 31.26 28 0.4414 40 | 0.0586 196 22.18 23 0.4013 41 | 0.2383 185 35.25 26 0.3801 42 | 0.1006 198 22.87 30 0.3498 43 | 0.2164 193 24.49 32 0.3185 44 | 0.1485 198 23.57 27 0.3097 45 | 0.227 191 31.72 27 0.4319 46 | 0.1649 188 27.9 25 0.3799 47 | 0.1188 191 22.74 24 0.4091 48 | 0.194 193 20.62 27 0.3588 49 | 0.2495 185 30.46 25 0.4727 50 | 0.2378 185 
32.38 27 0.3212 51 | 0.1592 191 25.75 31 0.3418 52 | 0.2069 170 33.84 30 0.4285 53 | 0.2084 185 27.83 25 0.3917 54 | 0.0877 193 21.67 26 0.5769 55 | 0.101 193 21.79 24 0.4773 56 | 0.0942 201 20.17 26 0.4512 57 | 0.055 193 29.07 31 0.3096 58 | 0.1071 196 24.28 24 0.3089 59 | 0.0728 193 19.24 27 0.4573 60 | 0.2771 180 27.07 28 0.3214 61 | 0.0528 196 18.95 22 0.5437 62 | 0.213 188 21.59 30 0.4121 63 | 0.1356 193 13.27 31 0.2185 64 | 0.1043 196 16.3 23 0.3313 65 | 0.113 191 23.01 25 0.3302 66 | 0.1477 196 20.31 31 0.4677 67 | 0.1317 188 17.46 33 0.2406 68 | 0.2187 191 21.95 28 0.3007 69 | 0.2127 188 14.57 37 0.2471 70 | 0.2547 160 34.55 28 0.2894 71 | 0.1591 191 22.0 24 0.3682 72 | 0.0898 196 13.37 34 0.389 73 | 0.2146 188 20.51 24 0.512 74 | 0.1871 183 19.78 28 0.4449 75 | 0.1528 191 16.36 33 0.4035 76 | 0.156 191 16.03 23 0.2683 77 | 0.2348 188 24.27 26 0.2719 78 | 0.1623 180 18.49 28 0.3408 79 | 0.1239 180 17.76 26 0.4393 80 | 0.2178 185 13.31 25 0.3004 81 | 0.1608 185 17.41 26 0.3503 82 | 0.0805 193 13.67 25 0.4388 83 | 0.1776 193 17.46 27 0.2578 84 | 0.1668 185 14.38 35 0.2989 85 | 0.1072 188 12.12 31 0.4455 86 | 0.1821 185 12.63 25 0.3087 87 | 0.188 180 12.24 30 0.3678 88 | 0.1167 196 12.0 24 0.3667 89 | 0.2617 185 24.46 27 0.3189 90 | 0.1994 188 20.06 27 0.4187 91 | 0.1706 170 17.0 25 0.5059 92 | 0.1554 183 11.58 24 0.3195 93 | 0.2282 185 10.08 24 0.2381 94 | 0.1778 185 18.56 23 0.2802 95 | 0.1863 185 11.81 23 0.381 96 | 0.1014 193 13.81 32 0.1593 -------------------------------------------------------------------------------- /blog03-Kmeans-yh/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog03-Kmeans-yh/result01.png -------------------------------------------------------------------------------- /blog03-Kmeans-yh/test01.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from sklearn.cluster import Birch 4 | from sklearn.cluster import KMeans 5 | 6 | X = [[0.0888, 0.5885], 7 | [0.1399, 0.8291], 8 | [0.0747, 0.4974], 9 | [0.0983, 0.5772], 10 | [0.1276, 0.5703], 11 | [0.1671, 0.5835], 12 | [0.1906, 0.5276], 13 | [0.1061, 0.5523], 14 | [0.2446, 0.4007], 15 | [0.1670, 0.4770], 16 | [0.2485, 0.4313], 17 | [0.1227, 0.4909], 18 | [0.1240, 0.5668], 19 | [0.1461, 0.5113], 20 | [0.2315, 0.3788], 21 | [0.0494, 0.5590], 22 | [0.1107, 0.4799], 23 | [0.2521, 0.5735], 24 | [0.1007, 0.6318], 25 | [0.1067, 0.4326], 26 | [0.1956, 0.4280] 27 | ] 28 | print(X) 29 | 30 | # Kmeans聚类 31 | clf = KMeans(n_clusters=3) 32 | y_pred = clf.fit_predict(X) 33 | print(clf) 34 | print(y_pred) 35 | 36 | 37 | import numpy as np 38 | import matplotlib.pyplot as plt 39 | 40 | x = [n[0] for n in X] 41 | print(x) 42 | y = [n[1] for n in X] 43 | print(y) 44 | 45 | # 可视化操作 46 | plt.scatter(x, y, c=y_pred, marker='x') 47 | plt.title("Kmeans-Basketball Data") 48 | plt.xlabel("assists_per_minute") 49 | plt.ylabel("points_per_minute") 50 | plt.legend(["Rank"]) 51 | plt.show() 52 | -------------------------------------------------------------------------------- /blog03-Kmeans-yh/test02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | By: Eastmount CSDN 2016-10-12 5 | 该部分讲数据集读取,然后赋值给X变量 6 | 读取文件data.txt 保存结果为X 7 | """ 8 | 9 | import os 10 | 11 | data = [] 12 | for line in open("data.txt", "r").readlines(): 13 | line = line.rstrip() #删除换行 14 | #删除多余空格,保存一个空格连接 15 | result = ' '.join(line.split()) 16 | #获取每行五个值 '0 0.0888 201 36.02 28 0.5885' 注意:字符串转换为浮点型数 17 | s = [float(x) for x in result.strip().split(' ')] 18 | #输出结果:['0', '0.0888', '201', '36.02', '28', '0.5885'] 19 | print(s) 20 | #数据存储至data 21 | data.append(s) 22 | 23 | #输出完整数据集 24 | print('完整数据集') 25 | print(data) 26 | 
print(type(data)) 27 | 28 | ''' 29 | 现在输出数据集: 30 | ['0 0.0888 201 36.02 28 0.5885', 31 | '1 0.1399 198 39.32 30 0.8291', 32 | '2 0.0747 198 38.80 26 0.4974', 33 | '3 0.0983 191 40.71 30 0.5772', 34 | '4 0.1276 196 38.40 28 0.5703' 35 | ] 36 | ''' 37 | 38 | print('第一列 第五列数据') 39 | L2 = [n[0] for n in data] 40 | print(L2) 41 | L5 = [n[4] for n in data] 42 | print(L5) 43 | 44 | ''' 45 | X表示二维矩阵数据,篮球运动员比赛数据 46 | 总共96行,每行获取两列数据 47 | 第一列表示球员每分钟助攻数:assists_per_minute 48 | 第五列表示球员每分钟得分数:points_per_minute 49 | ''' 50 | 51 | #两列数据生成二维数据 52 | print('两列数据合并成二维矩阵') 53 | T = dict(zip(L2,L5)) 54 | type(T) 55 | 56 | #dict类型转换为list 57 | print('List') 58 | X = list(map(lambda x,y: (x,y), T.keys(),T.values())) 59 | print(X) 60 | print(type(X)) 61 | 62 | 63 | """ 64 | KMeans聚类 65 | clf = KMeans(n_clusters=3) 表示类簇数为3,聚成3类数据,clf即赋值为KMeans 66 | y_pred = clf.fit_predict(X) 载入数据集X,并且将聚类的结果赋值给y_pred 67 | """ 68 | 69 | from sklearn.cluster import Birch 70 | from sklearn.cluster import KMeans 71 | 72 | clf = KMeans(n_clusters=3) 73 | y_pred = clf.fit_predict(X) 74 | print(clf) 75 | #输出聚类预测结果,96行数据,每个y_pred对应X一行或一个球员,聚成3类,类标为0、1、2 76 | print(y_pred) 77 | 78 | 79 | """ 80 | 可视化绘图 81 | Python导入Matplotlib包,专门用于绘图 82 | import matplotlib.pyplot as plt 此处as相当于重命名,plt用于显示图像 83 | """ 84 | 85 | import numpy as np 86 | import matplotlib.pyplot as plt 87 | 88 | 89 | #获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列 90 | x = [n[0] for n in X] 91 | print(x) 92 | y = [n[1] for n in X] 93 | print(y) 94 | 95 | #绘制散点图 参数:x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点 96 | #plt.scatter(x, y, c=y_pred, marker='x') 97 | 98 | 99 | #坐标 100 | x1 = [] 101 | y1 = [] 102 | 103 | x2 = [] 104 | y2 = [] 105 | 106 | x3 = [] 107 | y3 = [] 108 | 109 | #分布获取类标为0、1、2的数据 赋值给(x1,y1) (x2,y2) (x3,y3) 110 | i = 0 111 | while i < len(X): 112 | if y_pred[i]==0: 113 | x1.append(X[i][0]) 114 | y1.append(X[i][1]) 115 | elif y_pred[i]==1: 116 | x2.append(X[i][0]) 117 | y2.append(X[i][1]) 118 | elif y_pred[i]==2: 119 | x3.append(X[i][0]) 120 | 
y3.append(X[i][1]) 121 | 122 | i = i + 1 123 | 124 | 125 | #四种颜色 红 绿 蓝 黑 126 | plot1, = plt.plot(x1, y1, 'or', marker="x") 127 | plot2, = plt.plot(x2, y2, 'og', marker="o") 128 | plot3, = plt.plot(x3, y3, 'ob', marker="*") 129 | 130 | #绘制标题 131 | plt.title("Kmeans-Basketball Data") 132 | 133 | #绘制x轴和y轴坐标 134 | plt.xlabel("assists_per_minute") 135 | plt.ylabel("points_per_minute") 136 | 137 | #设置右上角图例 138 | plt.legend((plot1, plot2, plot3), ('A', 'B', 'C'), fontsize=10) 139 | 140 | plt.show() 141 | -------------------------------------------------------------------------------- /blog04-DTC/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog04-DTC/result.png -------------------------------------------------------------------------------- /blog04-DTC/test01.py: -------------------------------------------------------------------------------- 1 | #导入数据集iris 2 | from sklearn.datasets import load_iris 3 | 4 | #载入数据集 5 | iris = load_iris() 6 | #输出数据集 7 | print(iris.data) 8 | -------------------------------------------------------------------------------- /blog04-DTC/test02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 14 21:44:19 2016 4 | @author: 杨秀璋 5 | """ 6 | 7 | #导入数据集iris 8 | from sklearn.datasets import load_iris 9 | 10 | #载入数据集 11 | iris = load_iris() 12 | 13 | print(iris.data) #输出数据集 14 | print(iris.target) #输出真实标签 15 | print(len(iris.target)) 16 | print(iris.data.shape) #150个样本 每个样本4个特征 17 | 18 | 19 | #导入决策树DTC包 20 | from sklearn.tree import DecisionTreeClassifier 21 | 22 | #训练 23 | clf = DecisionTreeClassifier() 24 | clf.fit(iris.data, iris.target) 25 | print(clf) 26 | 27 | #预测 28 | predicted = clf.predict(iris.data) 29 | 30 | #获取花卉两列数据集 31 | X = iris.data 32 | L1 = [x[0] for x in X] 33 | print(L1) 34 | L2 = 
[x[1] for x in X] 35 | print(L2) 36 | 37 | #绘图 38 | import numpy as np 39 | import matplotlib.pyplot as plt 40 | plt.scatter(L1, L2, c=predicted, marker='x') #cmap=plt.cm.Paired 41 | plt.title("DTC") 42 | plt.show() 43 | -------------------------------------------------------------------------------- /blog04-DTC/test03.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 14 21:44:19 2016 4 | @author: 杨秀璋 5 | """ 6 | 7 | #导入数据集iris 8 | from sklearn.datasets import load_iris 9 | import numpy as np 10 | 11 | #载入数据集 12 | iris = load_iris() 13 | 14 | ''' 15 | print iris.data #输出数据集 16 | print iris.target #输出真实标签 17 | print len(iris.target) 18 | print iris.data.shape #150个样本 每个样本4个特征 19 | ''' 20 | 21 | ''' 22 | 重点:分割数据集 构造训练集/测试集,120/30 23 | 70%训练 0-40 50-90 100-140 24 | 30%预测 40-50 90-100 140-150 25 | ''' 26 | #训练集 27 | train_data = np.concatenate((iris.data[0:40, :], iris.data[50:90, :], iris.data[100:140, :]), axis = 0) 28 | #训练集样本类别 29 | train_target = np.concatenate((iris.target[0:40], iris.target[50:90], iris.target[100:140]), axis = 0) 30 | #测试集 31 | test_data = np.concatenate((iris.data[40:50, :], iris.data[90:100, :], iris.data[140:150, :]), axis = 0) 32 | #测试集样本类别 33 | test_target = np.concatenate((iris.target[40:50], iris.target[90:100], iris.target[140:150]), axis = 0) 34 | 35 | 36 | #导入决策树DTC包 37 | from sklearn.tree import DecisionTreeClassifier 38 | 39 | #训练 40 | clf = DecisionTreeClassifier() 41 | #注意均使用训练数据集和样本类标 42 | clf.fit(train_data, train_target) 43 | print(clf) 44 | 45 | #预测结果 46 | predict_target = clf.predict(test_data) 47 | print(predict_target) 48 | 49 | #预测结果与真实结果比对 50 | print(sum(predict_target == test_target)) 51 | 52 | #输出准确率 召回率 F值 53 | from sklearn import metrics 54 | print(metrics.classification_report(test_target, predict_target)) 55 | print(metrics.confusion_matrix(test_target, predict_target)) 56 | 57 | 58 | #获取花卉测试数据集两列数据集 59 | X = test_data 60 | 
L1 = [n[0] for n in X] 61 | print(L1) 62 | L2 = [n[1] for n in X] 63 | print(L2) 64 | 65 | #绘图 66 | import numpy as np 67 | import matplotlib.pyplot as plt 68 | plt.scatter(L1, L2, c=predict_target, marker='x') #cmap=plt.cm.Paired 69 | plt.title("DecisionTreeClassifier") 70 | plt.show() 71 | -------------------------------------------------------------------------------- /blog04-DTC/test04.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 12 23:30:34 2016 4 | @author: yxz15 5 | """ 6 | 7 | print(__doc__) 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | from sklearn.datasets import load_iris 13 | from sklearn.tree import DecisionTreeClassifier 14 | 15 | # Parameters 16 | n_classes = 3 17 | plot_colors = "bry" 18 | plot_step = 0.02 19 | 20 | # Load data 21 | iris = load_iris() 22 | 23 | for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], 24 | [1, 2], [1, 3], [2, 3]]): 25 | # We only take the two corresponding features 26 | X = iris.data[:, pair] 27 | y = iris.target 28 | 29 | # Train 30 | clf = DecisionTreeClassifier().fit(X, y) 31 | 32 | # Plot the decision boundary 33 | plt.subplot(2, 3, pairidx + 1) 34 | 35 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 36 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 37 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), 38 | np.arange(y_min, y_max, plot_step)) 39 | 40 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 41 | Z = Z.reshape(xx.shape) 42 | cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired) 43 | 44 | plt.xlabel(iris.feature_names[pair[0]]) 45 | plt.ylabel(iris.feature_names[pair[1]]) 46 | plt.axis("tight") 47 | 48 | # Plot the training points 49 | for i, color in zip(range(n_classes), plot_colors): 50 | idx = np.where(y == i) 51 | plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i], 52 | cmap=plt.cm.Paired) 53 | 54 | plt.axis("tight") 55 | 56 | 
plt.suptitle("Decision surface of a decision tree using paired features") 57 | plt.legend() 58 | plt.show() 59 | -------------------------------------------------------------------------------- /blog05-LR/res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res.png -------------------------------------------------------------------------------- /blog05-LR/res02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res02.png -------------------------------------------------------------------------------- /blog05-LR/res03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog05-LR/res03.png -------------------------------------------------------------------------------- /blog05-LR/test01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Oct 27 02:37:05 2016 4 | @author: yxz15 5 | """ 6 | 7 | from sklearn import datasets 8 | diabetes = datasets.load_diabetes() #载入数据 9 | print(diabetes.data) #数据 10 | print(diabetes.target) #类标 11 | print('总行数: ', len(diabetes.data), len(diabetes.target)) #数据总行数 12 | print('特征数: ', len(diabetes.data[0])) #每行数据集维数 13 | print('数据类型: ', diabetes.data.shape) #类型 14 | print(type(diabetes.data), type(diabetes.target)) #数据集类型 15 | 16 | """ 17 | [[ 0.03807591 0.05068012 0.06169621 ..., -0.00259226 0.01990842 18 | -0.01764613] 19 | [-0.00188202 -0.04464164 -0.05147406 ..., -0.03949338 -0.06832974 20 | -0.09220405] 21 | ... 
22 | [-0.04547248 -0.04464164 -0.0730303 ..., -0.03949338 -0.00421986 23 | 0.00306441]] 24 | [ 151. 75. 141. 206. 135. 97. 138. 63. 110. 310. 101. 25 | ... 26 | 64. 48. 178. 104. 132. 220. 57.] 27 | 总行数: 442 442 28 | 特征数: 10 29 | 数据类型: (442L, 10L) 30 | 31 | """ 32 | -------------------------------------------------------------------------------- /blog05-LR/test02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 28 00:44:55 2016 4 | @author: yxz15 5 | """ 6 | 7 | from sklearn import linear_model #导入线性模型 8 | import matplotlib.pyplot as plt #绘图 9 | import numpy as np 10 | 11 | #X表示匹萨尺寸 Y表示匹萨价格 12 | X = [[6], [8], [10], [14], [18]] 13 | Y = [[7], [9], [13], [17.5], [18]] 14 | 15 | print('数据集X: ', X) 16 | print('数据集Y: ', Y) 17 | 18 | #回归训练 19 | clf = linear_model.LinearRegression() #使用线性回归 20 | clf.fit(X, Y) #导入数据集 21 | res = clf.predict(np.array([12]).reshape(-1, 1))[0] #预测结果 22 | print('预测一张12英寸匹萨价格:$%.2f' % res) 23 | 24 | #预测结果 25 | X2 = [[0], [10], [14], [25]] 26 | Y2 = clf.predict(X2) 27 | 28 | #绘制线性回归图形 29 | plt.figure() 30 | plt.title(u'diameter-cost curver') #标题 31 | plt.xlabel(u'diameter') #x轴坐标 32 | plt.ylabel(u'cost') #y轴坐标 33 | plt.axis([0, 25, 0, 25]) #区间 34 | plt.grid(True) #显示网格 35 | plt.plot(X, Y, 'k.') #绘制训练数据集散点图 36 | plt.plot(X2, Y2, 'g-') #绘制预测数据集直线 37 | plt.show() 38 | 39 | -------------------------------------------------------------------------------- /blog05-LR/test03.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 28 01:21:30 2016 4 | @author: yxz15 5 | """ 6 | 7 | from sklearn import datasets 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from sklearn import linear_model #导入线性模型 11 | 12 | #数据集 13 | diabetes = datasets.load_diabetes() #载入数据 14 | 15 | #获取一个特征 16 | diabetes_x_temp = diabetes.data[:, np.newaxis, 2] 17 | 18 | diabetes_x_train = 
diabetes_x_temp[:-20] #训练样本 19 | diabetes_x_test = diabetes_x_temp[-20:] #测试样本 后20行 20 | diabetes_y_train = diabetes.target[:-20] #训练标记 21 | diabetes_y_test = diabetes.target[-20:] #预测对比标记 22 | 23 | #回归训练及预测 24 | clf = linear_model.LinearRegression() 25 | clf.fit(diabetes_x_train, diabetes_y_train) #注: 训练数据集 26 | 27 | #系数 残差平法和 方差得分 28 | print('Coefficients :\n', clf.coef_) 29 | print("Residual sum of square: %.2f" %np.mean((clf.predict(diabetes_x_test) - diabetes_y_test) ** 2)) 30 | print("variance score: %.2f" % clf.score(diabetes_x_test, diabetes_y_test)) 31 | 32 | #绘图 33 | plt.title(u'LinearRegression Diabetes') #标题 34 | plt.xlabel(u'Attributes') #x轴坐标 35 | plt.ylabel(u'Measure of disease') #y轴坐标 36 | #点的准确位置 37 | plt.scatter(diabetes_x_test, diabetes_y_test, color = 'black') 38 | #预测结果 直线表示 39 | plt.plot(diabetes_x_test, clf.predict(diabetes_x_test), color='blue', linewidth = 3) 40 | plt.show() 41 | -------------------------------------------------------------------------------- /blog05-LR/test04.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 29 12:47:58 2011 4 | @author: Administrator 5 | """ 6 | #第一步 数据集划分 7 | from sklearn import datasets 8 | import numpy as np 9 | 10 | #获取数据 10*442 11 | d = datasets.load_diabetes() 12 | x = d.data 13 | print('获取x特征') 14 | print(len(x), x.shape) 15 | print(x[:4]) 16 | 17 | #获取一个特征 第3列数据 18 | x_one = x[:,np.newaxis, 2] 19 | print(x_one[:4]) 20 | 21 | #获取的正确结果 22 | y = d.target 23 | print('获取的结果') 24 | print(y[:4]) 25 | 26 | #x特征划分 27 | x_train = x_one[:-42] 28 | x_test = x_one[-42:] 29 | print(len(x_train), len(x_test)) 30 | y_train = y[:-42] 31 | y_test = y[-42:] 32 | print(len(y_train), len(y_test)) 33 | 34 | 35 | #第二步 线性回归实现 36 | from sklearn import linear_model 37 | clf = linear_model.LinearRegression() 38 | print(clf) 39 | clf.fit(x_train, y_train) 40 | pre = clf.predict(x_test) 41 | print('预测结果') 42 | print(pre) 43 | 
print('真实结果') 44 | print(y_test) 45 | 46 | 47 | #第三步 评价结果 48 | cost = np.mean(y_test-pre)**2 49 | print('次方', 2**5) 50 | print('平方和计算:', cost) 51 | print('系数', clf.coef_) 52 | print('截距', clf.intercept_) 53 | print('方差', clf.score(x_test, y_test)) 54 | 55 | 56 | #第四步 绘图 57 | import matplotlib.pyplot as plt 58 | plt.title("diabetes") 59 | plt.xlabel("x") 60 | plt.ylabel("y") 61 | plt.plot(x_test, y_test, 'k.') 62 | plt.plot(x_test, pre, 'g-') 63 | 64 | for idx, m in enumerate(x_test): 65 | plt.plot([m, m],[y_test[idx], 66 | pre[idx]], 'r-') 67 | 68 | plt.savefig('power.png', dpi=300) 69 | 70 | plt.show() 71 | 72 | -------------------------------------------------------------------------------- /blog06-Numpy+Matplotlib/data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog06-Numpy+Matplotlib/data.xls -------------------------------------------------------------------------------- /blog06-Numpy+Matplotlib/test01.py: -------------------------------------------------------------------------------- 1 | #导入包并重命名 2 | import numpy as np 3 | 4 | #定义一维数组 5 | a = np.array([2, 0, 1, 5, 8, 3]) 6 | print('原始数据:', a) 7 | 8 | #输出最大、最小值及形状 9 | print('最小值:', a.min()) 10 | print('最大值:', a.max()) 11 | print('形状', a.shape) 12 | 13 | #数据切片 14 | print('切片操作:') 15 | print(a[:-2]) 16 | print(a[-2:]) 17 | print(a[:1]) 18 | 19 | #排序 20 | print(type(a)) 21 | a.sort() 22 | print('排序后:', a) 23 | -------------------------------------------------------------------------------- /blog06-Numpy+Matplotlib/test02.py: -------------------------------------------------------------------------------- 1 | #定义二维数组 2 | import numpy as np 3 | c = np.array([[1, 2, 3, 4],[4, 5, 6, 7], [7, 8, 9, 10]]) 4 | 5 | #获取值 6 | print('形状:', c.shape) 7 | print('获取值:', c[1][0]) 8 | print('获取某行:') 9 | print(c[1][:]) 10 | print('获取某行并切片:') 11 | print(c[0][:-1]) 12 | 
# Load the spreadsheet; header=None because the sheet has no header row
import pandas as pd
data = pd.read_excel("data.xls", header=None)
print(data)

# Number of rows
print('行数', len(data))

# Total electricity consumption per user (one column per user A/B/C)
print(data.sum())

# Arithmetic mean of the consumption per user.
# This step previously recomputed data.sum(), so the total was printed twice
# and the mean announced by the comment was never produced.
mm = data.mean()
print(mm)

# Preview the first 5 rows
print('预览前5行数据')
print(data.head())

# Basic descriptive statistics
print('输出数据基本统计量')
print(data.describe())
# Load the data set and compute the total consumption of every user
import pandas as pd
data = pd.read_excel("data.xls", header=None)
mm = data.sum()
print('计算用电量总数:')
print(mm)

# Draw the bar chart
import numpy as np
import matplotlib.pyplot as plt
# Font able to render the Chinese labels
plt.rc('font', family='SimHei', size=13)
# Tick labels, one per user; the number of bars follows from them
x = [u'用户A', u'用户B', u'用户C']
N = len(x)
# x locations of the bars: 0 1 2
ind = np.arange(N)
print(ind)
# Bar width
width = 0.35
# align='center' is stated explicitly so that the tick positions below are
# right regardless of the matplotlib version's default bar alignment.
plt.bar(ind, mm, width, align='center', color='r', label='sum num')
plt.xlabel(u"用户名")
plt.ylabel(u"总耗电量")
plt.title(u'电力窃漏电用户自动识别--总耗电量')
plt.legend()
# Put the user names under the centre of each bar, rotated by 40 degrees.
# (ind + width/2 is only the centre for edge-aligned bars.)
plt.xticks(ind, x, rotation=40)
plt.show()
0.5 12 | idx = np.arange(len(x)) 13 | plt.bar(idx, men, width, color='red', label=u'男性用户') 14 | plt.bar(idx, women, width, bottom=men, color='yellow', label=u'女性用户') 15 | plt.xlabel(u'应用类别') 16 | plt.ylabel(u'男女分布') 17 | plt.xticks(idx+width/2, x, rotation=40) 18 | plt.legend() 19 | plt.show() 20 | -------------------------------------------------------------------------------- /blog07-pac/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog07-pac/result01.png -------------------------------------------------------------------------------- /blog07-pac/result02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog07-pac/result02.png -------------------------------------------------------------------------------- /blog07-pac/test01.py: -------------------------------------------------------------------------------- 1 | #载入数据集 2 | from sklearn.datasets import load_boston 3 | d = load_boston() 4 | x = d.data 5 | y = d.target 6 | print(x[:10]) 7 | print('形状:', x.shape) 8 | 9 | #降维 10 | import numpy as np 11 | from sklearn.decomposition import PCA 12 | pca = PCA(n_components=2) 13 | newData = pca.fit_transform(x) 14 | print('降维后数据:') 15 | print(newData[:4]) 16 | print('形状:', newData.shape) 17 | -------------------------------------------------------------------------------- /blog07-pac/test02.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | plt.figure(1) # 创建图表1 5 | plt.figure(2) # 创建图表2 6 | ax1 = plt.subplot(211) # 在图表2中创建子图1 7 | ax2 = plt.subplot(212) # 在图表2中创建子图2 8 | 9 | x = np.linspace(0, 3, 100) 10 | for i in range(5): 11 | plt.figure(1) # 选择图表1 12 | plt.plot(x, 
np.exp(i*x/3)) 13 | plt.sca(ax1) # 选择图表2的子图1 14 | plt.plot(x, np.sin(i*x)) 15 | plt.sca(ax2) # 选择图表2的子图2 16 | plt.plot(x, np.cos(i*x)) 17 | 18 | plt.show() 19 | -------------------------------------------------------------------------------- /blog07-pac/test03.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #糖尿病数据集 4 | from sklearn.datasets import load_diabetes 5 | data = load_diabetes() 6 | x = data.data 7 | print(x[:4]) 8 | y = data.target 9 | print(y[:4]) 10 | 11 | #KMeans聚类算法 12 | from sklearn.cluster import KMeans 13 | #训练 14 | clf = KMeans(n_clusters=2) 15 | print(clf) 16 | clf.fit(x) 17 | #预测 18 | pre = clf.predict(x) 19 | print(pre[:10]) 20 | 21 | #使用PCA降维操作 22 | from sklearn.decomposition import PCA 23 | pca = PCA(n_components=2) 24 | newData = pca.fit_transform(x) 25 | print(newData[:4]) 26 | 27 | L1 = [n[0] for n in newData] 28 | L2 = [n[1] for n in newData] 29 | 30 | #绘图 31 | import numpy as np 32 | import matplotlib.pyplot as plt 33 | 34 | #用来正常显示中文标签 35 | plt.rc('font', family='SimHei', size=8) 36 | #plt.rcParams['font.sans-serif']=['SimHei'] 37 | 38 | #用来正常显示负号 39 | plt.rcParams['axes.unicode_minus']=False 40 | 41 | p1 = plt.subplot(221) 42 | plt.title(u"Kmeans聚类 n=2") 43 | plt.scatter(L1,L2,c=pre,marker="s") 44 | plt.sca(p1) 45 | 46 | 47 | ################################### 48 | # 聚类 类蔟数=3 49 | 50 | clf = KMeans(n_clusters=3) 51 | clf.fit(x) 52 | pre = clf.predict(x) 53 | 54 | p2 = plt.subplot(222) 55 | plt.title("Kmeans n=3") 56 | plt.scatter(L1,L2,c=pre,marker="s") 57 | plt.sca(p2) 58 | 59 | 60 | ################################### 61 | # 聚类 类蔟数=4 62 | 63 | clf = KMeans(n_clusters=4) 64 | clf.fit(x) 65 | pre = clf.predict(x) 66 | 67 | p3 = plt.subplot(223) 68 | plt.title("Kmeans n=4") 69 | plt.scatter(L1,L2,c=pre,marker="+") 70 | plt.sca(p3) 71 | 72 | 73 | ################################### 74 | # 聚类 类蔟数=5 75 | 76 | clf = KMeans(n_clusters=5) 77 | clf.fit(x) 78 | pre = 
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 28 03:29:51 2016
Source: http://blog.csdn.net/u010454729/article/details/49078505
@author: adapted from CSDN user u010454729

Minimal Apriori frequent-itemset mining, followed by a small demo run.
"""


def loadDataSet():
    """Return the demo transaction database: four baskets of integer items."""
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    """Build the candidate 1-itemsets of ``dataSet``.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-element ``frozenset`` per distinct item, in
        ascending item order. frozenset is used so the itemsets can later
        serve as dictionary keys. A real list (not a one-shot ``map``
        iterator) is returned so the candidates can be scanned repeatedly.
    """
    distinct_items = sorted({item for transaction in dataSet
                             for item in transaction})
    return [frozenset([item]) for item in distinct_items]


def scanD(D, Ck, minSupport):
    """Keep the candidates of ``Ck`` that reach ``minSupport`` in ``D``.

    Args:
        D: iterable of transactions (sets); may be a fresh iterator.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain an
            itemset for it to be considered frequent.

    Returns:
        ``(retList, supportData)``: the frequent itemsets, and a dict mapping
        every candidate found in at least one transaction to its support.
    """
    # Both inputs are traversed more than once (candidates once per
    # transaction, transactions again for the count), so materialise them:
    # a Python 3 map()/generator would be empty on the second pass.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # ssCnt is non-empty only if there is at least one transaction, so the
    # division below can never see a zero denominator.
    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Same ordering as repeatedly inserting at position 0.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k-2`` items, taken in sorted
    order, are identical. For example with k=3, {0,1} and {0,2} share the
    prefix [0] and give {0,1,2}, while {0,1} and {1,2} are not merged.

    Args:
        Lk: list of frequent itemsets (frozensets), all of size ``k-1``.
        k: size of the itemsets to generate.

    Returns:
        List of candidate itemsets of size ``k``.
    """
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            # Sort first, then slice: the iteration order of a set is not
            # guaranteed, so slicing before sorting could pick the wrong
            # "prefix" and miss or duplicate candidates.
            L1 = sorted(Lk[i])[:k - 2]
            L2 = sorted(Lk[j])[:k - 2]
            if L1 == L2:
                retList.append(Lk[i] | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Mine all frequent itemsets of ``dataSet``.

    Args:
        dataSet: list of transactions, each an iterable of items.
        minSupport: minimum support threshold, as a fraction in [0, 1].

    Returns:
        ``(L, supportData)`` where ``L[i]`` is the list of frequent
        (i+1)-itemsets (the last entry is the empty list that ended the
        search) and ``supportData`` maps counted itemsets to their support.
    """
    C1 = createC1(dataSet)
    # A list, not map(): the transactions are rescanned for every k.
    D = [set(transaction) for transaction in dataSet]
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Grow itemsets one item at a time until a level has no frequent itemset.
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


dataSet = loadDataSet()
C1 = createC1(dataSet)

print("所有候选1项集C1:\n")
for n in C1:
    print(n)

# Materialised so that printing it does not consume it before scanD runs.
D = list(map(set, dataSet))
print("数据集D:\n")
for n in D:
    print(n)

L1, supportData0 = scanD(D, C1, 0.5)
print("符合最小支持度的频繁1项集L1:\n", L1)

L, suppData = apriori(dataSet)
print("所有符合最小支持度的项集L:\n", L)
print("频繁2项集:\n", aprioriGen(L[0], 2))

L, suppData = apriori(dataSet, minSupport=0.7)
print("所有符合最小支持度为0.7的项集L:\n", L)
Past Usage: 14 | -- Rule Induction in Forensic Science 15 | -- Ian W. Evett and Ernest J. Spiehler 16 | -- Central Research Establishment 17 | Home Office Forensic Science Service 18 | Aldermaston, Reading, Berkshire RG7 4PN 19 | -- Unknown technical note number (sorry, not listed here) 20 | -- General Results: nearest neighbor held its own with respect to the 21 | rule-based system 22 | 23 | 4. Relevant Information:n 24 | Vina conducted a comparison test of her rule-based system, BEAGLE, the 25 | nearest-neighbor algorithm, and discriminant analysis. BEAGLE is 26 | a product available through VRS Consulting, Inc.; 4676 Admiralty Way, 27 | Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189. 28 | In determining whether the glass was a type of "float" glass or not, 29 | the following results were obtained (# incorrect answers): 30 | 31 | Type of Sample Beagle NN DA 32 | Windows that were float processed (87) 10 12 21 33 | Windows that were not: (76) 19 16 22 34 | 35 | The study of classification of types of glass was motivated by 36 | criminological investigation. At the scene of the crime, the glass left 37 | can be used as evidence...if it is correctly identified! 38 | 39 | 5. Number of Instances: 214 40 | 41 | 6. Number of Attributes: 10 (including an Id#) plus the class attribute 42 | -- all attributes are continuously valued 43 | 44 | 7. Attribute Information: 45 | 1. Id number: 1 to 214 46 | 2. RI: refractive index 47 | 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as 48 | are attributes 4-10) 49 | 4. Mg: Magnesium 50 | 5. Al: Aluminum 51 | 6. Si: Silicon 52 | 7. K: Potassium 53 | 8. Ca: Calcium 54 | 9. Ba: Barium 55 | 10. Fe: Iron 56 | 11. 
Type of glass: (class attribute) 57 | -- 1 building_windows_float_processed 58 | -- 2 building_windows_non_float_processed 59 | -- 3 vehicle_windows_float_processed 60 | -- 4 vehicle_windows_non_float_processed (none in this database) 61 | -- 5 containers 62 | -- 6 tableware 63 | -- 7 headlamps 64 | 65 | 8. Missing Attribute Values: None 66 | 67 | Summary Statistics: 68 | Attribute: Min Max Mean SD Correlation with class 69 | 2. RI: 1.5112 1.5339 1.5184 0.0030 -0.1642 70 | 3. Na: 10.73 17.38 13.4079 0.8166 0.5030 71 | 4. Mg: 0 4.49 2.6845 1.4424 -0.7447 72 | 5. Al: 0.29 3.5 1.4449 0.4993 0.5988 73 | 6. Si: 69.81 75.41 72.6509 0.7745 0.1515 74 | 7. K: 0 6.21 0.4971 0.6522 -0.0100 75 | 8. Ca: 5.43 16.19 8.9570 1.4232 0.0007 76 | 9. Ba: 0 3.15 0.1750 0.4972 0.5751 77 | 10. Fe: 0 0.51 0.0570 0.0974 -0.1879 78 | 79 | 9. Class Distribution: (out of 214 total instances) 80 | -- 163 Window glass (building windows and vehicle windows) 81 | -- 87 float processed 82 | -- 70 building windows 83 | -- 17 vehicle windows 84 | -- 76 non-float processed 85 | -- 76 building windows 86 | -- 0 vehicle windows 87 | -- 51 Non-window glass 88 | -- 13 containers 89 | -- 9 tableware 90 | -- 29 headlamps 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /blog09-LinearRegression/glass.tag: -------------------------------------------------------------------------------- 1 | An original file donated by Vina Speihler 2 | 3 | ID, N -- numeric identifier of the instance 4 | RI, N -- refractive index 5 | NA2O, N -- Sodium oxide 6 | MGO, N -- magnesium oxide 7 | AL2O3, N -- aluminum oxide 8 | SIO2, N -- silcon oxide 9 | K2O, N -- potassium oxide 10 | CAO, N -- calcium oxide 11 | BAO, N -- barium oxide 12 | FE2O3, N -- iron oxide 13 | TYPE, N -- An unknown, but must correspond to the types in the paper 14 | CAMG, N -- Unsure 15 | 16 | Types include: 17 | 1. WF (Float Window) 18 | 2. WNF (Non-float Window) 19 | 3. C (Container) 20 | 4. 
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 05 18:10:07 2017
@author: eastmount & zj

Linear regression on the glass identification data: first predict the
refractive index ('ri') from the aluminium content ('al'), then fit the same
model to a binary 'household' label derived from 'glass_type'.
"""

# Load the glass identification data set
import pandas as pd
glass = pd.read_csv("glass.csv")
# Show the shape and the first 6 rows
print(glass.shape)
print(glass.head(6))

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font_scale=1.5)
sns.lmplot(x='al', y='ri', data=glass, ci=None)
# Scatter plot drawn through pandas
glass.plot(kind='scatter', x='al', y='ri')
plt.show()

# Equivalent scatter plot drawn directly with matplotlib
plt.scatter(glass.al, glass.ri)
plt.xlabel('al')
plt.ylabel('ri')

# Fit the linear regression model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
feature_cols = ['al']
X = glass[feature_cols]
y = glass.ri
linreg.fit(X, y)
plt.show()

# Predict for every observed x value
glass['ri_pred'] = linreg.predict(X)
print("预测的前六行:")
print(glass.head(6))

# Draw the predictions as a line
plt.plot(glass.al, glass.ri_pred, color='red')
plt.xlabel('al')
plt.ylabel('Predicted ri')
plt.show()

# Draw the fitted line on top of the scatter plot
plt.scatter(glass.al, glass.ri)
plt.plot(glass.al, glass.ri_pred, color='red')
plt.xlabel('al')
plt.ylabel('ri')
plt.show()

# Prediction for al = 2 computed by hand from intercept and coefficient.
# In a script a bare expression is evaluated and thrown away, so the value
# is printed explicitly.
print(linreg.intercept_ + linreg.coef_ * 2)
# Same prediction through the estimator. predict() expects 2-D input of
# shape (n_samples, n_features); a bare scalar is rejected, so the single
# sample is wrapped in a one-row frame carrying the training column name.
print(linreg.predict(pd.DataFrame({'al': [2]})))

# Coefficient of the aluminium feature.
# zip() is lazy in Python 3; materialise it so the pairs are printed rather
# than the zip object itself.
ai = list(zip(feature_cols, linreg.coef_))
print(ai)

# Prediction for al = 3
pre = linreg.predict(pd.DataFrame({'al': [3]}))
print(pre)

# Inspect the distribution of glass_type
sort = glass.glass_type.value_counts().sort_index()
print(sort)

# Types 1, 2, 3 are window glass -> 0
# Types 5, 6, 7 are household glass -> 1
glass['household'] = glass.glass_type.map({1:0, 2:0, 3:0, 5:1, 6:1, 7:1})
print(glass.head())

plt.scatter(glass.al, glass.household)
plt.xlabel('al')
plt.ylabel('household')
plt.show()

# Fit the linear model to the binary label and store its predictions
feature_cols = ['al']
X = glass[feature_cols]
y = glass.household
linreg.fit(X, y)
glass['household_pred'] = linreg.predict(X)
plt.show()

# Scatter plot including the regression line
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred, color='red')
plt.xlabel('al')
plt.ylabel('household')
plt.show()
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 05 18:28:56 2017
@author: eastmount & zj

Logistic regression on the glass data: classify household vs. window glass
from the aluminium content and plot predicted classes and probabilities.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Load the glass identification data set
glass = pd.read_csv("glass.csv")
print(glass.shape)
print(glass.head(6))

# This script reads the 'household' and 'household_pred' columns, which the
# freshly loaded CSV does not contain: they are created in memory by the
# companion script test01.py. They are rebuilt here the same way so the
# script runs on its own.
# Types 1, 2, 3 are window glass -> 0; types 5, 6, 7 are household glass -> 1
glass['household'] = glass.glass_type.map({1:0, 2:0, 3:0, 5:1, 6:1, 7:1})
feature_cols = ['al']
X = glass[feature_cols]
y = glass.household
linreg = LinearRegression()
linreg.fit(X, y)
glass['household_pred'] = linreg.predict(X)

# np.where demo: element-wise choice between two values
nums = np.array([5, 15, 8])
np.where(nums > 10, 'big', 'small')
# Turn the linear predictions into class labels 1 / 0 with a 0.5 threshold
glass['household_pred_class'] = np.where(glass.household_pred >= 0.5, 1, 0)
print(glass.head(6))

# Fit the logistic regression model and store its class predictions
# (a very large C makes the regularisation negligible)
logreg = LogisticRegression(C=1e9)
logreg.fit(X, y)
glass['household_pred_class'] = logreg.predict(X)

# A line plot connects the points in row order; order the rows by 'al' for
# plotting only, so the curves are drawn left to right instead of zig-zagging.
ordered = glass.sort_values('al')

# Plot the predicted classes
plt.scatter(glass.al, glass.household)
plt.plot(ordered.al, ordered.household_pred_class, color='red')
plt.xlabel('al')
plt.ylabel('household')
plt.show()

# Probability of class 1 for every sample
glass['household_pred_prob'] = logreg.predict_proba(X)[:, 1]
ordered = glass.sort_values('al')

# Plot the predicted probabilities
plt.scatter(glass.al, glass.household)
plt.plot(ordered.al, ordered.household_pred_prob, color='red')
plt.xlabel('al')
plt.ylabel('household')
plt.show()

# Inspect the predictions for a few example values of 'al'.
# predict_proba() expects 2-D input (n_samples, n_features), so each value is
# wrapped in a one-row frame carrying the training column name.
print(logreg.predict_proba(pd.DataFrame({'al': [1]})))
print(logreg.predict_proba(pd.DataFrame({'al': [2]})))
print(logreg.predict_proba(pd.DataFrame({'al': [3]})))
138 | 61.8 59 139 | 61.2 57 140 | 61.9 56 141 | 60.9 57 142 | 59.8 56 143 | 61.8 55 144 | 60 57 145 | 61.6 55 146 | 62.1 64 147 | 63.3 59 148 | 60.2 56 149 | 61.1 58 150 | 60.9 57 151 | 61.7 59 152 | 61.3 56 153 | 62.5 60 154 | 61.4 59 155 | 62.9 57 156 | 62.4 57 157 | 60.7 56 158 | 60.7 58 159 | 61.5 58 160 | 59.9 57 161 | 59.2 59 162 | 60.3 56 163 | 61.7 60 164 | 61.9 57 165 | 61.9 55 166 | 60.4 59 167 | 61 57 168 | 61.5 55 169 | 61.7 56 170 | 59.2 61 171 | 61.3 56 172 | 58 62 173 | 60.2 61 174 | 61.7 55 175 | 62.7 55 176 | 64.6 54 177 | 61.3 61 178 | 63.7 56.4 179 | 62.7 58 180 | 62.2 57 181 | 61.6 56 182 | 61.5 57 183 | 61.8 56 184 | 60.7 56 185 | 59.7 60.5 186 | 60.5 56 187 | 62.7 58 188 | 62.1 58 189 | 62.8 57 190 | 63.8 58 191 | 57.8 60 192 | 62.1 55 193 | 61.1 60 194 | 60 59 195 | 61.2 57 196 | 62.7 59 197 | 61 57 198 | 61 58 199 | 61.4 57 200 | 61.8 61 201 | 59.9 63 202 | 61.3 58 203 | 60.5 58 204 | 64.1 59 205 | 67.9 60 206 | 62.4 58 207 | 63.2 60 208 | 61.3 55 209 | 60.8 56 210 | 61.7 56 211 | 63.6 57 212 | 61.2 58 213 | 62.1 54 214 | 61.5 55 215 | 61.4 59 216 | 61.8 60 217 | 62.2 56 218 | 61.2 56 219 | 60.6 63 220 | 57.5 64 221 | 61.3 56 222 | 57.2 62 223 | 62.9 60 224 | 63.1 58 225 | 60.8 57 226 | 62.7 59 227 | 62.8 60 228 | 55.1 67 229 | 61.4 59 230 | 62.2 55 231 | 63 54 232 | 63.7 56 233 | 63.6 58 234 | 62 57 235 | 61.5 56 236 | 60.5 60 237 | 61.1 60 238 | 61.8 56 239 | 63.3 56 240 | 59.4 64 241 | 62.5 55 242 | 64.5 58 243 | 62.7 59 244 | 64.2 52 245 | 63.7 54 246 | 60.4 58 247 | 61.8 58 248 | 63.2 56 249 | 61.6 56 250 | 61.6 56 251 | 60.9 57 252 | 61 61 253 | 62.1 57 254 | 60.9 60 255 | 61.3 60 256 | 65.8 59 257 | 61.3 56 258 | 58.8 59 259 | 62.3 55 260 | 60.1 62 261 | 61.8 59 262 | 63.6 55.8 263 | 62.2 56 264 | 59.2 59 265 | 61.8 59 266 | 61.3 55 267 | 62.1 60 268 | 60.7 60 269 | 59.6 57 270 | 62.2 56 271 | 60.6 57 272 | 62.9 57 273 | 64.1 55 274 | 61.3 56 275 | 62.7 55 276 | 63.2 56 277 | 60.7 56 278 | 61.9 60 279 | 62.6 55 280 | 60.7 60 281 | 62 
60 282 | 63 57 283 | 58 59 284 | 62.9 57 285 | 58.2 60 286 | 63.2 58 287 | 61.3 59 288 | 60.3 60 289 | 62.7 60 290 | 61.3 58 291 | 61.6 60 292 | 61.9 55 293 | 61.7 56 294 | 61.9 58 295 | 61.8 58 296 | 61.6 56 297 | 58.8 66 298 | 61 57 299 | 67.4 60 300 | 63.4 60 301 | 61.5 59 302 | 58 62 303 | 62.4 54 304 | 61.9 57 305 | 61.6 56 306 | 62.2 59 307 | 62.2 58 308 | 61.3 56 309 | 62.3 57 310 | 61.8 57 311 | 62.5 59 312 | 62.9 60 313 | 61.8 59 314 | 62.3 56 315 | 59 70 316 | 60.7 55 317 | 62.5 55 318 | 62.7 58 319 | 60.4 57 320 | 62.1 58 321 | 57.8 60 322 | 63.8 58 323 | 62.8 57 324 | 62.2 58 325 | 62.3 58 326 | 59.9 58 327 | 61.9 54 328 | 63 55 329 | 62.4 58 330 | 62.9 58 331 | 63.5 56 332 | 61.3 56 333 | 60.6 54 334 | 65.1 58 335 | 62.6 58 336 | 58 62 337 | 62.4 61 338 | 61.3 57 339 | 59.9 60 340 | 60.8 58 341 | 63.5 55 342 | 62.2 57 343 | 63.8 58 344 | 64 57 345 | 62.5 56 346 | 62.3 58 347 | 61.7 57 348 | 62.2 58 349 | 61.5 56 350 | 61 59 351 | 62.2 56 352 | 61.5 54 353 | 67.3 59 354 | 61.7 58 355 | 61.9 56 356 | 61.8 58 357 | 58.7 66 358 | 62.5 57 359 | 62.8 56 360 | 61.1 68 361 | 64 57 362 | 62.5 60 363 | 60.6 58 364 | 61.6 55 365 | 62.2 58 366 | 60 57 367 | 61.9 57 368 | 62.8 57 369 | 62 57 370 | 66.4 59 371 | 63.4 56 372 | 60.9 56 373 | 63.1 57 374 | 63.1 59 375 | 59.2 57 376 | 60.7 54 377 | 64.6 56 378 | 61.8 56 379 | 59.9 60 380 | 61.7 55 381 | 62.8 61 382 | 62.7 57 383 | 63.4 58 384 | 63.5 54 385 | 65.7 59 386 | 68.1 56 387 | 63 60 388 | 59.5 58 389 | 63.5 59 390 | 61.7 58 391 | 62.7 58 392 | 62.8 58 393 | 62.4 57 394 | 61 59 395 | 63.1 56 396 | 60.7 57 397 | 60.9 59 398 | 60.1 55 399 | 62.9 58 400 | 63.3 56 401 | 63.8 55 402 | 62.9 57 403 | 63.4 60 404 | 63.9 55 405 | 61.4 56 406 | 61.9 55 407 | 62.4 55 408 | 61.8 58 409 | 61.5 56 410 | 60.4 57 411 | 61.8 55 412 | 62 56 413 | 62.3 56 414 | 61.6 56 415 | 60.6 56 416 | 58.4 62 417 | 61.4 58 418 | 61.9 56 419 | 62 56 420 | 61.5 57 421 | 62.3 58 422 | 60.9 61 423 | 62.4 57 424 | 55 61 425 | 58.6 60 426 | 62 57 
427 | 59.8 58 428 | 63.4 55 429 | 64.3 58 430 | 62.2 59 431 | 61.7 57 432 | 61.1 59 433 | 61.5 56 434 | 58.5 62 435 | 61.7 58 436 | 60.4 56 437 | 61.4 56 438 | 61.5 55 439 | 61.4 56 440 | 65 56 441 | 56 60 442 | 60.2 59 443 | 58.3 58 444 | 53.1 63 445 | 60.3 58 446 | 61.4 56 447 | 60.1 57 448 | 63.4 55 449 | 61.5 59 450 | 62.7 56 451 | 62.5 55 452 | 61.3 56 453 | 60.2 56 454 | 62.7 57 455 | 62.3 58 456 | 61.5 56 457 | 59.2 59 458 | 61.8 59 459 | 61.3 55 460 | 61.4 58 461 | 62.8 55 462 | 62.8 64 463 | 62.4 61 464 | 59.3 60 465 | 63 60 466 | 61.3 60 467 | 59.3 62 468 | 61 57 469 | 62.9 57 470 | 59.6 57 471 | 61.8 60 472 | 62.7 57 473 | 65.3 62 474 | 63.8 58 475 | 62.3 56 476 | 59.7 63 477 | 64.3 60 478 | 62.9 58 479 | 62 57 480 | 61.6 59 481 | 61.9 55 482 | 61.3 58 483 | 63.6 57 484 | 59.6 61 485 | 62.2 59 486 | 61.7 55 487 | 63.2 58 488 | 60.8 60 489 | 60.3 59 490 | 60.9 60 491 | 62.4 59 492 | 60.2 60 493 | 62 55 494 | 60.8 57 495 | 62.1 55 496 | 62.7 60 497 | 61.3 58 498 | 60.2 60 499 | 60.7 56 -------------------------------------------------------------------------------- /blog10-Pandas/bankloan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/bankloan.png -------------------------------------------------------------------------------- /blog10-Pandas/ccc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/ccc.png -------------------------------------------------------------------------------- /blog10-Pandas/data.csv: -------------------------------------------------------------------------------- 1 | year,Beijing,Chongqing,Shenzhen,Guiyang,Kunming,Shanghai,Wuhai,Changsha 2 | 2002,4764,1556,5802,1643,2276,4134,1928,1802 3 | 
2003,4737,1596,6256,1949,2233,5118,2072,2040 4 | 2004,5020.93,1766.24,6756.24,1801.68,2473.78,5855,2516.32,2039.09 5 | 2005,6788.09,2134.99,7582.27,2168.9,2639.72,6842,3061.77,2313.73 6 | 2006,8279.51,2269.21,9385.34,2372.66,2903.32,7196,3689.64,2644.15 7 | 2007,11553.26,2722.58,14049.69,2901.63,3108.12,8361,4664.03,3304.74 8 | 2008,12418,2785,12665,3149,3750,8195,4781,3288 9 | 2009,13799,3442,14615,3762,3807,12840,5329,3648 10 | 2010,17782,4281,19170,4410,3660,14464,5746,4418 11 | 2011,16851.95,4733.84,21350.13,5069.52,4715.23,14603.24,7192.9,5862.39 12 | 2012,17021.63,5079.93,19589.82,4846.14,5744.68,14061.37,7344.05,6100.87 13 | 2013,18553,5569,24402,5025,5795,16420,7717,6292 14 | 2014,18833,5519,24723,5608,6384,16787,7951,6116 15 | -------------------------------------------------------------------------------- /blog10-Pandas/data2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/data2.xlsx -------------------------------------------------------------------------------- /blog10-Pandas/guiyang.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog10-Pandas/guiyang.png -------------------------------------------------------------------------------- /blog10-Pandas/test01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 06 10:55:17 2017 4 | @author: eastmount 5 | """ 6 | 7 | import pandas as pd 8 | data = pd.read_csv("data.csv",index_col='year') #index_col用作行索引的列名 9 | #显示前6行数据 10 | print(data.shape) 11 | print(data.head(6)) 12 | 13 | import matplotlib.pyplot as plt 14 | plt.rcParams['font.sans-serif'] = ['simHei'] #用来正常显示中文标签 15 | plt.rcParams['axes.unicode_minus'] = 
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 06 10:55:17 2017
@author: eastmount

Bar chart of the yearly Guiyang values stored in data.csv.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# index_col names the column used as the row index
data = pd.read_csv("data.csv", index_col='year')
# Show the shape and the first 6 rows
print(data.shape)
print(data.head(6))
# Select the Guiyang column
gy = data['Guiyang']
print('输出贵阳数据')
print(gy)

# Tick labels and bar count come from the data itself, so the chart keeps
# working when rows are added to or removed from the CSV (a hard-coded count
# that differs from len(gy) makes plt.bar fail with a shape mismatch).
x = [str(year) for year in gy.index]
N = len(gy)
ind = np.arange(N)  # bar positions 0 .. N-1
width = 0.35
# align='center' is stated explicitly so that the tick positions below are
# right regardless of the matplotlib version's default bar alignment.
plt.bar(ind, gy, width, align='center', color='r', label='sum num')
# Year labels under the centre of each bar, rotated by 40 degrees
plt.xticks(ind, x, rotation=40)
plt.title('The price of Guiyang')
plt.xlabel('year')
plt.ylabel('price')
# Save before show(): the figure may be discarded once the window is closed
plt.savefig('guiyang.png', dpi=400)
plt.show()
-------------------------------------------------------------------------------- /blog10-Pandas/test04.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pylab as pl 3 | # make an array of random numbers with a gaussian distribution with 4 | # mean = 5.0 5 | # rms = 3.0 6 | # number of points = 1000 7 | data = np.random.normal(5.0, 3.0, 1000) 8 | # make a histogram of the data array 9 | pl.hist(data, histtype='stepfilled') #去掉黑色轮廓 10 | # make plot labels 11 | pl.xlabel('data') 12 | pl.show() 13 | -------------------------------------------------------------------------------- /blog10-Pandas/test05.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 06 10:55:17 2017 4 | @author: yxz15 5 | """ 6 | 7 | import pandas as pd 8 | data = pd.read_csv("data.csv",index_col='year') 9 | #显示前6行数据 10 | print(data.shape) 11 | print(data.head(6)) 12 | 13 | import matplotlib.pyplot as plt 14 | plt.rcParams['font.sans-serif'] = ['simHei'] 15 | plt.rcParams['axes.unicode_minus'] = False 16 | data.plot() 17 | plt.savefig(u'时序图.png', dpi=500) 18 | plt.show() 19 | 20 | from statsmodels.graphics.tsaplots import plot_acf 21 | gy = data['Guiyang'] 22 | print(gy) 23 | plot_acf(gy).show() 24 | plt.savefig(u'贵阳自相关图',dpi=300) 25 | 26 | from statsmodels.tsa.stattools import adfuller as ADF 27 | print('ADF:',ADF(gy)) 28 | -------------------------------------------------------------------------------- /blog10-Pandas/test06-dalian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 06 10:19:15 2017 4 | @author: yxz15 5 | """ 6 | 7 | #第一部分:导入数据集 8 | import pandas as pd 9 | Coke1 =pd.read_csv("data2.csv") 10 | print(Coke1 [:4]) 11 | 12 | #第二部分:聚类 13 | from sklearn.cluster import KMeans 14 | clf=KMeans(n_clusters=3) 15 | pre=clf.fit_predict(Coke1) 16 | 
import numpy as np


def loadDataSet(fileName, delim='\t'):
    """Load a delimiter-separated numeric text file into a NumPy matrix.

    Args:
        fileName: Path of the text file, one sample per line.
        delim: Field separator (tab by default).

    Returns:
        numpy.matrix of floats with one row per non-blank line.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even when parsing fails; the
    # original never closed the file.
    with open(fileName) as fr:
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # skip blank lines such as a trailing newline
        ]
    # np.mat was removed in NumPy 2.0; asmatrix is the supported spelling.
    return np.asmatrix(rows)


def pca(dataMat, topNfeat=9999999, verbose=True):
    """Project data onto its leading principal components and reconstruct it.

    Args:
        dataMat: 2-D array-like or matrix; rows are samples, columns features.
        topNfeat: Number of principal components to keep (all by default).
        verbose: When true, print the centred data and the selected
            eigenvectors, as the original implementation always did.

    Returns:
        Tuple ``(lowDDatMat, reconMat)`` of numpy matrices: the data projected
        onto the kept components, and the data reconstructed from that
        projection in the original feature space.
    """
    # Accept plain arrays/lists too; `*` below relies on matrix semantics.
    dataMat = np.asmatrix(dataMat, dtype=float)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # atleast_2d: np.cov returns a 0-d array for single-feature input.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))
    # The covariance matrix is symmetric, so eigh applies and guarantees real
    # eigenvalues/eigenvectors (eig can return complex values from round-off).
    eigVals, eigVects = np.linalg.eigh(covMat)
    # Indices of the topNfeat largest eigenvalues, largest first.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = np.asmatrix(eigVects[:, eigValInd])
    if verbose:
        print(meanRemoved)
        print(redEigVects)
    lowDDatMat = meanRemoved * redEigVects
    reconMat = (lowDDatMat * redEigVects.T) + meanVals
    return lowDDatMat, reconMat
# coding=utf-8
"""Plot how many CSDN blog posts were published in each hour of the day.

The counts are aggregated by MySQL (``GROUP BY HOUR(FBTime)``) and drawn as
a bar chart with the value written above every bar.
"""
import numpy as np

HOURLY_SQL = "select HOUR(FBTime) as hh, count(*) as cnt from csdn group by hh;"


def split_rows(rows):
    """Split ``(label, count)`` result rows into two parallel lists.

    Args:
        rows: Iterable of 2-item sequences as returned by ``fetchall()``.

    Returns:
        Tuple ``(labels, counts)`` of lists in row order.
    """
    labels = [row[0] for row in rows]
    counts = [row[1] for row in rows]
    return labels, counts


def main():
    """Query the hourly counts and draw/save the bar chart."""
    # Imported lazily so split_rows() stays importable without a MySQL
    # driver or a display backend installed.
    import MySQLdb
    import matplotlib.pyplot as plt

    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root',
                               passwd='123456', port=3306, db='test01')
        cur = conn.cursor()

        # Avoid "UnicodeEncodeError: 'latin-1' codec can't encode character".
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        cur.execute(HOURLY_SQL)
        hour1, num1 = split_rows(cur.fetchall())
        print(hour1)
        print(num1)

        # One bar per returned row. The original hard-coded 23, which does
        # not match the 24 rows of a full day and makes plt.bar fail.
        ind = np.arange(len(num1))
        width = 0.35
        plt.bar(ind, num1, width, color='r', label='sum num', align='center')
        plt.xticks(ind, hour1, rotation=40)  # rotate labels 40 degrees
        for i, value in enumerate(num1):
            # Value label centred above each bar, rotated 45 degrees.
            plt.text(i, value, str(value),
                     ha='center', va='bottom', rotation=45)
        plt.title('Number-24Hour')
        plt.xlabel('hours')
        plt.ylabel('The number of blog')
        plt.legend()
        plt.savefig('08csdn.png', dpi=400)
        plt.show()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Guarded: if connect() itself failed, neither object exists and the
        # original cleanup raised NameError, hiding the real error.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
# coding=utf-8
"""Plot the number of CSDN blog posts per year as a bar chart.

The counts are aggregated by MySQL and wrapped in a pandas DataFrame whose
index is the year label.
"""
import pandas as pd

YEARLY_SQL = '''select DATE_FORMAT(FBTime,'%Y'), Count(*) from csdn
                group by DATE_FORMAT(FBTime,'%Y');'''


def build_year_frame(rows):
    """Build the DataFrame that is plotted from ``(year, count)`` rows.

    Args:
        rows: Iterable of 2-item sequences as returned by ``fetchall()``.

    Returns:
        pandas.DataFrame with a single ``Nums`` column indexed by year.
    """
    day1 = [row[0] for row in rows]
    num1 = [row[1] for row in rows]
    return pd.DataFrame(num1, index=day1, columns=['Nums'])


def main():
    """Query the yearly counts and draw/save the bar chart."""
    # Imported lazily so build_year_frame() stays importable without a MySQL
    # driver or a display backend installed.
    import MySQLdb
    import matplotlib.pyplot as plt

    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root',
                               passwd='123456', port=3306, db='test01')
        cur = conn.cursor()

        # Avoid "UnicodeEncodeError: 'latin-1' codec can't encode character".
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        cur.execute(YEARLY_SQL)
        df = build_year_frame(cur.fetchall())
        print(len(df.index))
        print(len(df['Nums']))

        plt.style.use('ggplot')
        df.plot(kind='bar')
        # Save before show(): the original called plt.show(<Axes>) first and
        # saved afterwards, when the figure may already be closed (blank PNG).
        plt.savefig('05csdn.png', dpi=400)
        plt.show()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Guarded so a failed connect() is not masked by a NameError here.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
# coding=utf-8
"""Compare monthly CSDN blog counts for 2014, 2015, 2016 and all years.

A 2x2 figure is drawn; each panel is a bar chart of posts per month with the
counts aggregated by MySQL.
"""
import numpy as np

# (subplot position, year filter or None for all years, bar colour, title)
PANELS = (
    (221, 2014, 'b', '2014 Number-12Month'),
    (222, 2015, 'r', '2015 Number-12Month'),
    (223, 2016, 'g', '2016 Number-12Month'),
    (224, None, 'y', 'All Year Number-12Month'),
)


def month_query(year=None):
    """Return ``(sql, params)`` for the posts-per-month aggregation.

    Args:
        year: Restrict the count to this calendar year; ``None`` counts all
            years.

    Returns:
        Tuple of the SQL text and the parameter tuple for ``cursor.execute``.
    """
    sql = 'select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog'
    params = ()
    if year is not None:
        # Bound parameter instead of a year pasted into the SQL text.
        sql += ' where YEAR(FBTime)=%s'
        params = (year,)
    return sql + ' group by mm;', params


def split_rows(rows):
    """Split ``(month, count)`` result rows into two parallel lists."""
    months = [row[0] for row in rows]
    counts = [row[1] for row in rows]
    return months, counts


def draw_month_panel(plt, position, months, counts, color, title):
    """Draw one bar-chart panel with a value label above every bar."""
    plt.subplot(position)
    # One bar per returned month. The original assumed exactly 12 rows and
    # failed for a year with months that have no posts.
    ind = np.arange(len(counts))
    plt.bar(ind, counts, 0.35, color=color, label='sum num', align='center')
    plt.xticks(ind, months, rotation=40)  # rotate labels 40 degrees
    for i, value in enumerate(counts):
        plt.text(i, value, str(value),
                 ha='center', va='bottom', rotation=45)
    plt.title(title)


def main():
    """Run the four queries and draw/save the comparison figure."""
    # Imported lazily so the helpers stay importable without a MySQL driver
    # or a display backend installed.
    import MySQLdb
    import matplotlib.pyplot as plt

    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root',
                               passwd='123456', port=3306, db='test01')
        cur = conn.cursor()

        # Avoid "UnicodeEncodeError: 'latin-1' codec can't encode character".
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        for position, year, color, title in PANELS:
            sql, params = month_query(year)
            cur.execute(sql, params or None)
            hour1, num1 = split_rows(cur.fetchall())
            print(hour1)
            print(num1)
            draw_month_panel(plt, position, hour1, num1, color, title)

        plt.savefig('ttt.png', dpi=400)
        plt.show()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Guarded so a failed connect() is not masked by a NameError here.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
cur.execute(sql) 47 | result = cur.fetchall() 48 | num3 = [n[1] for n in result] 49 | print num3 50 | 51 | #2016年博客数 52 | sql = '''select MONTH(FBTime) as mm, count(*) as cnt from csdn_blog 53 | where DATE_FORMAT(FBTime,'%Y')='2016' group by mm;''' 54 | cur.execute(sql) 55 | result = cur.fetchall() 56 | num4 = [n[1] for n in result] 57 | print num4 58 | 59 | #重点: 数据整合 [12,4] 60 | data = np.array([num1, num2, num3, num4]) 61 | print data 62 | d = data.T #转置 63 | print d 64 | df = DataFrame(d, index=hour1, columns=['All','2014', '2015', '2016']) 65 | df.plot(kind='area', alpha=0.2) #设置颜色 透明度 66 | plt.title('Arae Plot Blog-Month') 67 | plt.savefig('csdn.png',dpi=400) 68 | plt.show() 69 | 70 | #异常处理 71 | except MySQLdb.Error,e: 72 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 73 | finally: 74 | cur.close() 75 | conn.commit() 76 | conn.close() 77 | 78 | -------------------------------------------------------------------------------- /blog12-matplotlib+SQL/test03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | ' 这篇代码主要讲述获取MySQL中数据,再进行简单的统计 4 | ' 统计采用SQL语句进行 By:Eastmount CSDN 5 | ''' 6 | 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | import pandas as pd 10 | import numpy as np 11 | import MySQLdb 12 | from pandas import * 13 | 14 | try: 15 | conn = MySQLdb.connect(host='localhost',user='root', 16 | passwd='123456',port=3306, db='test01') 17 | cur = conn.cursor() #数据库游标 18 | 19 | #防止报错:UnicodeEncodeError: 'latin-1' codec can't encode character 20 | conn.set_character_set('utf8') 21 | cur.execute('SET NAMES utf8;') 22 | cur.execute('SET CHARACTER SET utf8;') 23 | cur.execute('SET character_set_connection=utf8;') 24 | sql = '''select 25 | COUNT(case dayofweek(FBTime) when 1 then 1 end) AS '星期日', 26 | COUNT(case dayofweek(FBTime) when 2 then 1 end) AS '星期一', 27 | COUNT(case dayofweek(FBTime) when 3 then 1 end) AS '星期二', 28 | COUNT(case dayofweek(FBTime) when 4 then 1 end) AS '星期三', 29 | 
# coding=utf-8
"""Compare CSDN blog counts per weekday for the years 2008, 2011 and 2016.

For every year one query returns a single row of seven counts (Sunday to
Saturday); the rows become the columns of a 7x3 table drawn as a grouped
bar chart.
"""
import numpy as np
import pandas as pd

WEEKDAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday']
YEARS = (2008, 2011, 2016)


def weekday_query():
    """Return the SQL that counts posts per weekday for one year.

    The statement has a single ``%s`` placeholder for the year. MySQL's
    ``dayofweek`` numbers Sunday as 1 and Saturday as 7, which is the order
    of ``WEEKDAY_NAMES``.
    """
    columns = ',\n    '.join(
        'COUNT(case dayofweek(FBTime) when {0} then 1 end)'.format(day)
        for day in range(1, 8)
    )
    return 'select\n    ' + columns + '\nfrom csdn_blog where YEAR(FBTime)=%s;'


def weekday_matrix(results):
    """Stack per-year weekday counts into a ``(7, len(results))`` array.

    Args:
        results: Sequence of ``fetchall()`` results, each a single row of
            seven counts.

    Returns:
        Float ndarray whose column ``k`` holds the counts of ``results[k]``.
    """
    # Replaces allocating with np.random.rand and copying cell by cell.
    columns = [np.asarray(rows, dtype=float).ravel() for rows in results]
    return np.column_stack(columns)


def main():
    """Run one query per year and draw/save the comparison chart."""
    # Imported lazily so the helpers stay importable without a MySQL driver
    # or a display backend installed.
    import MySQLdb
    import matplotlib.pyplot as plt

    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root',
                               passwd='123456', port=3306, db='test01')
        cur = conn.cursor()

        # Avoid "UnicodeEncodeError: 'latin-1' codec can't encode character".
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        sql = weekday_query()
        results = []
        for year in YEARS:
            cur.execute(sql, (year,))
            rows = cur.fetchall()
            print(rows)
            results.append(rows)

        data = weekday_matrix(results)
        print(data)
        print(type(data))

        # Rows are weekdays, columns are the compared years.
        plt.style.use('ggplot')
        df = pd.DataFrame(data, index=WEEKDAY_NAMES,
                          columns=[str(year) for year in YEARS])
        df.plot(kind='bar')
        plt.title('Comparison Chart Blog-Week')
        plt.xlabel('Week')
        plt.ylabel('The number of blog')
        plt.savefig('03csdn.png', dpi=400)
        plt.show()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Guarded so a failed connect() is not masked by a NameError here.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog13-wordcloud/test.txt -------------------------------------------------------------------------------- /blog13-wordcloud/test01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # By:Eastmount CSDN 3 | import jieba 4 | import sys 5 | import matplotlib.pyplot as plt 6 | from wordcloud import WordCloud 7 | 8 | #打开本体TXT文件 9 | text = open('test.txt').read() 10 | print(type(text)) 11 | 12 | #结巴分词 cut_all=True 设置为全模式 13 | wordlist = jieba.cut(text, cut_all = True) 14 | 15 | #使用空格连接 进行中文分词 16 | wl_space_split = " ".join(wordlist) 17 | print(wl_space_split) 18 | 19 | #对分词后的文本生成词云 20 | my_wordcloud = WordCloud().generate(wl_space_split) 21 | 22 | #显示词云图 23 | plt.imshow(my_wordcloud) 24 | #是否显示x轴、y轴下标 25 | plt.axis("off") 26 | plt.show() 27 | -------------------------------------------------------------------------------- /blog13-wordcloud/test02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # By:Eastmount CSDN 3 | from os import path 4 | from scipy.misc import imread 5 | import jieba 6 | import sys 7 | import matplotlib.pyplot as plt 8 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 9 | 10 | # 打开本体TXT文件 11 | text = open('test.txt').read() 12 | 13 | # 结巴分词 cut_all=True 设置为全模式 14 | wordlist = jieba.cut(text) #cut_all = True 15 | 16 | # 使用空格连接 进行中文分词 17 | wl_space_split = " ".join(wordlist) 18 | print(wl_space_split) 19 | 20 | # 读取mask/color图片 21 | d = path.dirname(__file__) 22 | nana_coloring = imread(path.join(d, "mb.png")) 23 | 24 | # 对分词后的文本生成词云 25 | my_wordcloud = WordCloud( background_color = 'white', # 设置背景颜色 26 | mask = nana_coloring, # 设置背景图片 27 | max_words = 2000, # 设置最大现实的字数 28 | stopwords = STOPWORDS, # 设置停用词 29 | max_font_size = 50, # 设置字体最大值 30 | random_state = 30, # 设置有多少种随机生成状态,即有多少种配色方案 31 | ) 
32 | 33 | # generate word cloud 34 | my_wordcloud.generate(wl_space_split) 35 | 36 | # create coloring from image 37 | image_colors = ImageColorGenerator(nana_coloring) 38 | 39 | # recolor wordcloud and show 40 | my_wordcloud.recolor(color_func=image_colors) 41 | 42 | plt.imshow(my_wordcloud) # 显示词云图 43 | plt.axis("off") # 是否显示x轴、y轴下标 44 | plt.show() 45 | 46 | # save img 47 | my_wordcloud.to_file(path.join(d, "cloudimg.png")) 48 | 49 | 50 | -------------------------------------------------------------------------------- /blog14-curve_fit/data.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 0,4 3 | 1,5.2 4 | 2,5.9 5 | 3,6.8 6 | 4,7.34 7 | 5,8.57 8 | 6,9.86 9 | 7,10.12 10 | 8,12.56 11 | 9,14.32 12 | 10,15.42 13 | 11,16.50 14 | 12,18.92 15 | 13,19.58 16 | -------------------------------------------------------------------------------- /blog14-curve_fit/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/result01.png -------------------------------------------------------------------------------- /blog14-curve_fit/result02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/result02.png -------------------------------------------------------------------------------- /blog14-curve_fit/test01.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | #By:Eastmount CSDN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | #定义x、y散点坐标 7 | x = np.arange(1, 16, 1) 8 | num = [4.00, 5.20, 5.900, 6.80, 7.34, 9 | 8.57, 9.86, 10.12, 12.56, 14.32, 10 | 15.42, 16.50, 18.92, 19.58, 20.00] 11 | y = np.array(num) 12 | 13 | #用3次多项式拟合 14 | f1 = 
#encoding=utf-8
"""Fit ``y = a * exp(b / x)`` to sample points with non-linear least squares."""
import numpy as np
from scipy.optimize import curve_fit


def func(x, a, b):
    """Model function ``a * exp(b / x)`` passed to ``curve_fit``.

    Args:
        x: Scalar or array of non-zero abscissae.
        a: Scale coefficient.
        b: Coefficient in the exponent.

    Returns:
        The model value(s), with the same shape as ``x``.
    """
    return a * np.exp(b / x)


def main():
    """Fit the sample data, print the coefficients and draw/save the plot."""
    # Imported lazily so func() stays importable without a display backend.
    import matplotlib.pyplot as plt

    # Sample points: x = 1..15 with the measured y values.
    x = np.arange(1, 16, 1)
    num = [4.00, 5.20, 5.900, 6.80, 7.34,
           8.57, 9.86, 10.12, 12.56, 14.32,
           15.42, 16.50, 18.92, 19.58, 20.00]
    y = np.array(num)

    # Non-linear least squares; popt holds the fitted coefficients.
    popt, pcov = curve_fit(func, x, y)
    a = popt[0]
    b = popt[1]
    yvals = func(x, a, b)  # fitted y values
    print('系数a:', a)
    print('系数b:', b)

    plt.plot(x, y, 's', label='original values')
    plt.plot(x, yvals, 'r', label='polyfit values')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend(loc=4)  # legend in the lower-right corner
    plt.title('curve_fit')
    # Save before show(): once the window from show() is closed the figure
    # is gone and savefig would write an empty image.
    plt.savefig('test2.png')
    plt.show()


if __name__ == "__main__":
    main()
| print(data.shape) 16 | print(data.head(5)) #显示前5行数据 17 | x = data['x'] 18 | y = data['y'] 19 | print(x) 20 | print(y) 21 | 22 | #非线性最小二乘法拟合 23 | popt, pcov = curve_fit(func, x, y) 24 | #获取popt里面是拟合系数 25 | a = popt[0] 26 | b = popt[1] 27 | yvals = func(x,a,b) #拟合y值 28 | print('系数a:', a) 29 | print('系数b:', b) 30 | 31 | #绘图 32 | plot1 = plt.plot(x, y, 's',label='original values') 33 | plot2 = plt.plot(x, yvals, 'r',label='polyfit values') 34 | plt.xlabel('x') 35 | plt.ylabel('y') 36 | plt.legend(loc=4) #指定legend的位置右下角 37 | plt.title('curve_fit') 38 | plt.savefig('test3.png') 39 | plt.show() 40 | 41 | -------------------------------------------------------------------------------- /blog14-curve_fit/test04.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | #By:Eastmount CSDN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from scipy.optimize import curve_fit 6 | 7 | def func(x, a, b, c): 8 | return a * np.exp(-b * x) + c 9 | 10 | # define the data to be fit with some noise 11 | xdata = np.linspace(0, 4, 50) 12 | y = func(xdata, 2.5, 1.3, 0.5) 13 | y_noise = 0.2 * np.random.normal(size=xdata.size) 14 | ydata = y + y_noise 15 | plt.plot(xdata, ydata, 'b-', label='data') 16 | 17 | # Fit for the parameters a, b, c of the function `func` 18 | popt, pcov = curve_fit(func, xdata, ydata) 19 | plt.plot(xdata, func(xdata, *popt), 'r-', label='fit') 20 | 21 | # Constrain the optimization to the region of ``0 < a < 3``, ``0 < b < 2`` 22 | # and ``0 < c < 1``: 23 | popt, pcov = curve_fit(func, xdata, ydata, bounds=(0, [3., 2., 1.])) 24 | plt.plot(xdata, func(xdata, *popt), 'g--', label='fit-with-bounds') 25 | 26 | plt.xlabel('x') 27 | plt.ylabel('y') 28 | plt.legend() 29 | plt.show() 30 | -------------------------------------------------------------------------------- /blog14-curve_fit/test3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/test3.png -------------------------------------------------------------------------------- /blog14-curve_fit/test4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog14-curve_fit/test4.png -------------------------------------------------------------------------------- /blog15-imshow/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result01.png -------------------------------------------------------------------------------- /blog15-imshow/result02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result02.png -------------------------------------------------------------------------------- /blog15-imshow/result03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog15-imshow/result03.png -------------------------------------------------------------------------------- /blog15-imshow/test01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # By:Eastmount CSDN 3 | from matplotlib import pyplot as plt 4 | 5 | fig = plt.figure() 6 | ax1 = fig.add_subplot(231) 7 | ax2 = fig.add_subplot(232) 8 | ax3 = fig.add_subplot(233) 9 | ax4 = fig.add_subplot(234) 10 | ax5 = fig.add_subplot(235) 11 | ax6 = fig.add_subplot(236) 12 | plt.grid(True) 13 | plt.show() 14 | 
-------------------------------------------------------------------------------- /blog15-imshow/test02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # By:Eastmount CSDN 3 | import numpy as np 4 | from pylab import * 5 | from matplotlib import pyplot as plt 6 | 7 | x = [1, 2, 3, 4] 8 | y = [3, 5, 10, 25] 9 | 10 | #创建Figure 11 | fig = plt.figure() 12 | 13 | #创建一个或多个子图(subplot绘图区才能绘图) 14 | ax1 = fig.add_subplot(231) 15 | plt.plot(x, y, marker='D') #绘图及选择子图 16 | plt.sca(ax1) 17 | 18 | ax2 = fig.add_subplot(232) 19 | plt.scatter(x, y, marker='s', color='r') 20 | plt.sca(ax2) 21 | plt.grid(True) 22 | 23 | ax3 = fig.add_subplot(233) 24 | plt.bar(x, y, 0.5, color='c') #柱状图 width=0.5间距 25 | plt.sca(ax3) 26 | 27 | ax4 = fig.add_subplot(234) 28 | #高斯分布 29 | mean = 0 #均值为0 30 | sigma = 1 #标准差为1 (反应数据集中还是分散的值) 31 | data = mean+sigma*np.random.randn(10000) 32 | plt.hist(data,40,normed=1,histtype='bar',facecolor='yellowgreen',alpha=0.75) 33 | plt.sca(ax4) 34 | 35 | m = np.arange(-5.0, 5.0, 0.02) 36 | n = np.sin(m) 37 | ax5 = fig.add_subplot(235) 38 | plt.plot(m, n) 39 | plt.sca(ax5) 40 | 41 | ax6 = fig.add_subplot(236) 42 | xlim(-2.5, 2.5) #设置x轴范围 43 | ylim(-1, 1) #设置y轴范围 44 | plt.plot(m, n) 45 | plt.sca(ax6) 46 | plt.grid(True) 47 | 48 | plt.show() 49 | -------------------------------------------------------------------------------- /blog15-imshow/test03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # By:Eastmount CSDN 3 | from matplotlib import pyplot as plt 4 | 5 | X = [[1,2],[3,4],[5,6]] 6 | plt.imshow(X) 7 | plt.show() 8 | -------------------------------------------------------------------------------- /blog15-imshow/test04.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | # By:Eastmount CSDN 3 | from matplotlib import pyplot as plt 4 | 5 | X = [[1,2],[3,4],[5,6]] 6 | plt.imshow(X) 7 | 
plt.colorbar() 8 | plt.show() 9 | -------------------------------------------------------------------------------- /blog15-imshow/test05.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #By:Eastmount CSDN 3 | from matplotlib import pyplot as plt 4 | 5 | X = [[1,2],[3,4]] 6 | 7 | fig = plt.figure() 8 | ax = fig.add_subplot(231) 9 | ax.imshow(X) 10 | 11 | ax = fig.add_subplot(232) 12 | ax.imshow(X, cmap=plt.cm.gray) #灰度 13 | 14 | ax = fig.add_subplot(233) 15 | im = ax.imshow(X, cmap=plt.cm.spring) #春 16 | plt.colorbar(im) 17 | 18 | ax = fig.add_subplot(234) 19 | im = ax.imshow(X, cmap=plt.cm.summer) 20 | plt.colorbar(im, cax=None, ax=None, shrink=0.5) #长度为半 21 | 22 | ax = fig.add_subplot(235) 23 | im = ax.imshow(X, cmap=plt.cm.autumn) 24 | plt.colorbar(im, shrink=0.5, ticks=[-1,0,1]) 25 | 26 | ax = fig.add_subplot(236) 27 | im = ax.imshow(X, cmap=plt.cm.winter) 28 | plt.colorbar(im, shrink=0.5) 29 | 30 | plt.show() 31 | -------------------------------------------------------------------------------- /blog15-imshow/test06.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #By:Eastmount CSDN 3 | from matplotlib import pyplot as plt 4 | 5 | X = [[0, 0.25], [0.5, 0.75]] 6 | 7 | 8 | fig = plt.figure() 9 | ax = fig.add_subplot(121) 10 | im = ax.imshow(X, cmap=plt.get_cmap('hot')) 11 | plt.colorbar(im, shrink=0.5) 12 | 13 | ax = fig.add_subplot(122) 14 | im = ax.imshow(X, cmap=plt.get_cmap('hot'), interpolation='nearest', 15 | vmin=0, vmax=1) 16 | plt.colorbar(im, shrink=0.2) 17 | plt.show() 18 | 19 | -------------------------------------------------------------------------------- /blog15-imshow/test07.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # By:Eastmount CSDN 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from matplotlib import cm 6 | from matplotlib import axes 7 | 8 | 
def draw_heatmap(data,xlabels,ylabels): 9 | #cmap=cm.Blues 10 | cmap=cm.get_cmap('rainbow',1000) 11 | figure=plt.figure(facecolor='w') 12 | ax=figure.add_subplot(1,1,1,position=[0.1,0.15,0.8,0.8]) 13 | ax.set_yticks(range(len(ylabels))) 14 | ax.set_yticklabels(ylabels) 15 | ax.set_xticks(range(len(xlabels))) 16 | ax.set_xticklabels(xlabels) 17 | vmax=data[0][0] 18 | vmin=data[0][0] 19 | for i in data: 20 | for j in i: 21 | if j>vmax: 22 | vmax=j 23 | if j0: 75 | words.write(student1 + " " + student2 + " " 76 | + str(word_vector[i][j]) + "\r\n") 77 | j = j + 1 78 | i = i + 1 79 | words.close() 80 | 81 | 82 | """ 第四步:图形生成 """ 83 | a = [] 84 | f = codecs.open('word_node.txt','r','utf-8') 85 | line = f.readline() 86 | print line 87 | i = 0 88 | A = [] 89 | B = [] 90 | while line!="": 91 | a.append(line.split()) #保存文件是以空格分离的 92 | print a[i][0],a[i][1] 93 | A.append(a[i][0]) 94 | B.append(a[i][1]) 95 | i = i + 1 96 | line = f.readline() 97 | elem_dic = tuple(zip(A,B)) 98 | print type(elem_dic) 99 | print list(elem_dic) 100 | f.close() 101 | 102 | import matplotlib 103 | matplotlib.rcParams['font.sans-serif'] = ['SimHei'] 104 | matplotlib.rcParams['font.family']='sans-serif' 105 | 106 | colors = ["red","green","blue","yellow"] 107 | G = nx.Graph() 108 | G.add_edges_from(list(elem_dic)) 109 | #nx.draw(G,with_labels=True,pos=nx.random_layout(G),font_size=12,node_size=2000,node_color=colors) #alpha=0.3 110 | #pos=nx.spring_layout(G,iterations=50) 111 | pos=nx.random_layout(G) 112 | nx.draw_networkx_nodes(G, pos, alpha=0.2,node_size=1200,node_color=colors) 113 | nx.draw_networkx_edges(G, pos, node_color='r', alpha=0.3) #style='dashed' 114 | nx.draw_networkx_labels(G, pos, font_family='sans-serif', alpha=0.5) #font_size=5 115 | plt.show() 116 | -------------------------------------------------------------------------------- /blog18-Regression/blog01-LR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from 
sklearn.linear_model import LinearRegression 3 | 4 | #数据集 直径、价格 5 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]] 6 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]] 7 | print(x) 8 | print(y) 9 | 10 | clf = LinearRegression() 11 | clf.fit(x,y) 12 | pre = clf.predict([[12]])[0] 13 | print(u'预测直径为12英寸的价格: $%.2f' % pre) 14 | -------------------------------------------------------------------------------- /blog18-Regression/blog02-LR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.linear_model import LinearRegression 3 | 4 | #数据集 直径、价格 5 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]] 6 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]] 7 | print(x) 8 | print(y) 9 | 10 | clf = LinearRegression() 11 | clf.fit(x,y) 12 | pre = clf.predict([[12]])[0] 13 | print('预测直径为12英寸的价格: $%.2f' % pre) 14 | x2 = [[0],[12],[15],[25]] 15 | y2 = clf.predict(x2) 16 | 17 | import matplotlib.pyplot as plt 18 | plt.figure() 19 | plt.rcParams['font.sans-serif'] = ['SimHei'] #指定默认字体 20 | plt.title(u"线性回归预测Pizza直径和价格") 21 | plt.xlabel(u"x") 22 | plt.ylabel(u"price") 23 | plt.axis([0,25,0,25]) 24 | plt.scatter(x,y,marker="s",s=20) 25 | plt.plot(x2,y2,"g-") 26 | plt.show() 27 | -------------------------------------------------------------------------------- /blog18-Regression/blog03-boston.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #导入数据集boston 3 | from sklearn.datasets import load_boston 4 | import numpy as np 5 | boston = load_boston() 6 | print(boston.data.shape, boston.target.shape) 7 | print(boston.data[0]) 8 | print(boston.target) 9 | 10 | #划分数据集 11 | boston_temp = boston.data[:, np.newaxis, 5] 12 | x_train = boston_temp[:-100] #训练样本 13 | x_test = boston_temp[-100:] #测试样本 后100行 14 | y_train = boston.target[:-100] #训练标记 15 | y_test = boston.target[-100:] #预测对比标记 16 | 
-------------------------------------------------------------------------------- /blog18-Regression/blog04-boson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.datasets import load_boston 3 | import numpy as np 4 | boston = load_boston() 5 | print(boston.data.shape, boston.target.shape) 6 | 7 | #划分数据集 8 | boston_temp = boston.data[:, np.newaxis, 5] 9 | x_train = boston_temp[:-100] #训练样本 10 | x_test = boston_temp[-100:] #测试样本 后100行 11 | y_train = boston.target[:-100] #训练标记 12 | y_test = boston.target[-100:] #预测对比标记 13 | 14 | #回归分析 15 | from sklearn.linear_model import LinearRegression 16 | clf = LinearRegression() 17 | clf.fit(x_train, y_train) 18 | 19 | #算法评估 20 | pre = clf.predict(x_test) 21 | print("预测结果", pre) 22 | print("真实结果", y_test) 23 | cost = np.mean(y_test-pre)**2 24 | print('平方和计算:', cost) 25 | print('系数', clf.coef_) 26 | print('截距', clf.intercept_) 27 | print('方差', clf.score(x_test, y_test)) 28 | 29 | #绘图分析 30 | import matplotlib.pyplot as plt 31 | plt.title(u'LinearRegression Boston') 32 | plt.xlabel(u'x') 33 | plt.ylabel(u'price') 34 | plt.scatter(x_test, y_test, color = 'black') 35 | plt.plot(x_test, clf.predict(x_test), color='blue', linewidth = 3) 36 | for idx, m in enumerate(x_test): 37 | plt.plot([m, m],[y_test[idx],pre[idx]], 'r-') 38 | plt.show() 39 | -------------------------------------------------------------------------------- /blog18-Regression/blog05-random.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | X = np.arange(0,50,0.2) 4 | print(X) 5 | xArr = [] 6 | yArr = [] 7 | for n in X: 8 | xArr.append(n) 9 | y = 0.7*n + np.random.uniform(0,1)*math.sin(n)*2 - 3 10 | yArr.append(y) 11 | 12 | import matplotlib.pyplot as plt 13 | plt.plot(X, yArr, 'go') 14 | plt.show() 15 | -------------------------------------------------------------------------------- 
/blog18-Regression/blog06-random.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import math 4 | 5 | #随机数生成 6 | X = np.arange(0,50,0.2) 7 | print(X) 8 | xArr = [] 9 | yArr = [] 10 | for n in X: 11 | xArr.append(n) 12 | y = 0.7*n + np.random.uniform(0,1)*math.sin(n)*2 - 3 13 | yArr.append(y) 14 | 15 | #线性回归分析 16 | from sklearn.linear_model import LinearRegression 17 | clf = LinearRegression() 18 | print(clf) 19 | X = np.array(X).reshape((len(X),1)) #list转化为数组 20 | yArr = np.array(yArr).reshape((len(X),1)) 21 | clf.fit(X,yArr) 22 | pre = clf.predict(X) 23 | 24 | import matplotlib.pyplot as plt 25 | plt.plot(X, yArr, 'go') 26 | plt.plot(X, pre, 'r', linewidth=3) 27 | plt.show() 28 | -------------------------------------------------------------------------------- /blog18-Regression/blog07-3Drandom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from sklearn import linear_model 4 | from mpl_toolkits.mplot3d import Axes3D 5 | import matplotlib.pyplot as plt 6 | import math 7 | 8 | #linspace:开始值、终值和元素个数创建表示等差数列的一维数组 9 | xx, yy = np.meshgrid(np.linspace(0,10,20), np.linspace(0,100,20)) 10 | zz = 2.4 * xx + 4.5 * yy + np.random.randint(0,100,(20,20)) 11 | 12 | #构建成特征、值的形式 13 | X, Z = np.column_stack((xx.flatten(),yy.flatten())), zz.flatten() 14 | 15 | #线性回归分析 16 | regr = linear_model.LinearRegression() 17 | regr.fit(X, Z) 18 | 19 | #预测的一个特征 20 | x_test = np.array([[15.7, 91.6]]) 21 | print(regr.predict(x_test)) 22 | 23 | #画图可视化分析 24 | fig = plt.figure() 25 | ax = fig.gca(projection='3d') 26 | ax.scatter(xx, yy, zz) #真实点 27 | 28 | #拟合的平面 29 | ax.plot_wireframe(xx, yy, regr.predict(X).reshape(20,20)) 30 | ax.plot_surface(xx, yy, regr.predict(X).reshape(20,20), alpha=0.3) 31 | plt.show() 32 | -------------------------------------------------------------------------------- 
/blog18-Regression/blog08-PolynomialFeatures.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Nov 26 23:31:16 2017 4 | @author: yxz15 5 | """ 6 | 7 | # -*- coding: utf-8 -*- 8 | from sklearn.linear_model import LinearRegression 9 | 10 | #数据集 直径、价格 11 | x = [[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]] 12 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]] 13 | print(x) 14 | print(y) 15 | 16 | clf = LinearRegression() 17 | clf.fit(x,y) 18 | pre = clf.predict([[12]])[0] 19 | print(u'预测直径为12英寸的价格: $%.2f' % pre) 20 | x2 = [[0],[12],[15],[25]] 21 | y2 = clf.predict(x2) 22 | 23 | import matplotlib.pyplot as plt 24 | import numpy as np 25 | 26 | plt.figure() 27 | plt.axis([0,25,0,25]) 28 | plt.scatter(x,y,marker="s",s=20) 29 | plt.plot(x2,y2,"g-") 30 | 31 | #导入多项式回归模型 32 | from sklearn.preprocessing import PolynomialFeatures 33 | xx = np.linspace(0,25,100) #0到25等差数列 34 | quadratic_featurizer = PolynomialFeatures(degree = 2) #实例化一个二次多项式 35 | x_train_quadratic = quadratic_featurizer.fit_transform(x) #用二次多项式多样本x做变换 36 | X_test_quadratic = quadratic_featurizer.transform(x2) 37 | regressor_quadratic = LinearRegression() 38 | regressor_quadratic.fit(x_train_quadratic, y) 39 | xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))# 把训练好X值的多项式特征实例应用到一系列点上,形成矩阵 40 | 41 | plt.plot(xx, regressor_quadratic.predict(xx_quadratic), 42 | label="$y = ax^2 + bx + c$",linewidth=2,color="r") 43 | plt.legend() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /blog18-Regression/blog09-PolynomialFeatures.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Nov 26 23:31:16 2017 4 | @author: yxz15 5 | """ 6 | 7 | # -*- coding: utf-8 -*- 8 | from sklearn.linear_model import LinearRegression 9 | 10 | #数据集 直径、价格 11 | x = 
[[5],[6],[7],[8],[10],[11],[13],[14],[16],[18]] 12 | y = [[6],[7.5],[8.6],[9],[12],[13.6],[15.8],[18.5],[19.2],[20]] 13 | print(x) 14 | print(y) 15 | 16 | clf = LinearRegression() 17 | clf.fit(x,y) 18 | pre = clf.predict([[12]])[0] 19 | print(u'预测直径为12英寸的价格: $%.2f' % pre) 20 | x2 = [[0],[12],[15],[25]] 21 | y2 = clf.predict(x2) 22 | 23 | import matplotlib.pyplot as plt 24 | import numpy as np 25 | 26 | plt.figure() 27 | plt.axis([0,25,0,25]) 28 | plt.scatter(x,y,marker="s",s=20) 29 | plt.plot(x2,y2,"g-") 30 | 31 | #导入多项式回归模型 32 | from sklearn.preprocessing import PolynomialFeatures 33 | xx = np.linspace(0,25,100) #0到25等差数列 34 | quadratic_featurizer = PolynomialFeatures(degree = 4) #实例化一个二次多项式 35 | x_train_quadratic = quadratic_featurizer.fit_transform(x) #用二次多项式多样本x做变换 36 | X_test_quadratic = quadratic_featurizer.transform(x2) 37 | regressor_quadratic = LinearRegression() 38 | regressor_quadratic.fit(x_train_quadratic, y) 39 | xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))# 把训练好X值的多项式特征实例应用到一系列点上,形成矩阵 40 | 41 | plt.plot(xx, regressor_quadratic.predict(xx_quadratic), 42 | label="$y = ax^4 + bx + c$",linewidth=2,color="r") 43 | plt.legend() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /blog18-Regression/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result01.png -------------------------------------------------------------------------------- /blog18-Regression/result02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result02.png -------------------------------------------------------------------------------- /blog18-Regression/result03.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result03.png -------------------------------------------------------------------------------- /blog18-Regression/result04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result04.png -------------------------------------------------------------------------------- /blog18-Regression/result05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result05.png -------------------------------------------------------------------------------- /blog18-Regression/result06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog18-Regression/result06.png -------------------------------------------------------------------------------- /blog19-Iris/result01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result01.png -------------------------------------------------------------------------------- /blog19-Iris/result02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result02.png -------------------------------------------------------------------------------- /blog19-Iris/result03.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result03.png -------------------------------------------------------------------------------- /blog19-Iris/result04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result04.png -------------------------------------------------------------------------------- /blog19-Iris/result05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result05.png -------------------------------------------------------------------------------- /blog19-Iris/result06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result06.png -------------------------------------------------------------------------------- /blog19-Iris/result07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result07.png -------------------------------------------------------------------------------- /blog19-Iris/result08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result08.png -------------------------------------------------------------------------------- /blog19-Iris/result09.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eastmountyxz/Python-for-Data-Mining/f2dd0b8f3c4f5f51a10613dff99041bca4fd64c5/blog19-Iris/result09.png -------------------------------------------------------------------------------- /blog19-Iris/test01.py: -------------------------------------------------------------------------------- 1 | #导入数据集iris   2 | from sklearn.datasets import load_iris 3 | 4 | #载入数据集   5 | iris = load_iris() 6 | 7 | #输出数据集   8 | print(iris.data) 9 | 10 | #输出真实标签   11 | print(iris.target) 12 | print(len(iris.target)) 13 | 14 | #150个样本 每个样本4个特征   15 | print(iris.data.shape) 16 | -------------------------------------------------------------------------------- /blog19-Iris/test02-hist.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | #导入数据集iris   5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 7 | 8 | #读取csv数据 9 | dataset = pandas.read_csv(url, names=names) 10 | print(dataset.describe()) 11 | 12 | #直方图 histograms 13 | dataset.hist() 14 | plt.show() 15 | 16 | -------------------------------------------------------------------------------- /blog19-Iris/test03-plot.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | #导入数据集iris   5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 7 | 8 | #读取csv数据 9 | dataset = pandas.read_csv(url, names=names) 10 | print(dataset.describe()) 11 | 12 | dataset.plot(x='sepal-length', y='sepal-width', kind='scatter') 13 | plt.show() 14 | -------------------------------------------------------------------------------- 
/blog19-Iris/test04-kde.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | #导入数据集iris 5 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 6 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 7 | 8 | #读取csv数据 9 | dataset = pandas.read_csv(url, names=names) 10 | print(dataset.describe()) 11 | 12 | dataset.plot(kind='kde') 13 | plt.show() 14 | -------------------------------------------------------------------------------- /blog19-Iris/test05-box.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 6 | 7 | #读取csv数据 8 | dataset = pandas.read_csv(url, names=names) 9 | print(dataset.describe()) 10 | 11 | dataset.plot(kind='kde') 12 | dataset.plot(kind='box', subplots=True, layout=(2,2), 13 | sharex=False, sharey=False) 14 | plt.show() 15 | -------------------------------------------------------------------------------- /blog19-Iris/test06-box.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 6 | dataset = pandas.read_csv(url, names=names) 7 | 8 | from pandas.plotting import radviz 9 | radviz(dataset, 'class') 10 | 11 | from pandas.plotting import andrews_curves 12 | andrews_curves(dataset, 'class') 13 | 14 | from pandas.plotting import parallel_coordinates 15 | parallel_coordinates(dataset, 'class') 16 | plt.show() 17 | -------------------------------------------------------------------------------- 
/blog19-Iris/test07-show.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 5 | names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] 6 | dataset = pandas.read_csv(url, names=names) 7 | 8 | from pandas.plotting import scatter_matrix 9 | scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde') 10 | plt.show() 11 | -------------------------------------------------------------------------------- /blog19-Iris/test08-LR.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | hua = load_iris() 3 | #获取花瓣的长和宽 4 | x = [n[0] for n in hua.data] 5 | y = [n[1] for n in hua.data] 6 | 7 | import numpy as np #转换成数组 8 | x = np.array(x).reshape(len(x),1) 9 | y = np.array(y).reshape(len(y),1) 10 | 11 | from sklearn.linear_model import LinearRegression 12 | clf = LinearRegression() 13 | clf.fit(x,y) 14 | pre = clf.predict(x) 15 | 16 | #第三步 画图 17 | import matplotlib.pyplot as plt 18 | plt.scatter(x,y,s=100) 19 | plt.plot(x,pre,"r-",linewidth=4) 20 | for idx, m in enumerate(x): 21 | plt.plot([m,m],[y[idx],pre[idx]], 'g-') 22 | plt.show() 23 | -------------------------------------------------------------------------------- /blog19-Iris/test09-Kmeans.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.tree import DecisionTreeClassifier 3 | iris = load_iris() 4 | clf = DecisionTreeClassifier() 5 | clf.fit(iris.data, iris.target) 6 | print(clf) 7 | predicted = clf.predict(iris.data) 8 | 9 | #获取花卉两列数据集   10 | X = iris.data 11 | L1 = [x[0] for x in X] 12 | print(L1) 13 | L2 = [x[1] for x in X] 14 | print(L2) 15 | 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | plt.scatter(L1, L2, c=predicted, 
marker='x') #cmap=plt.cm.Paired 19 | plt.title("DTC") 20 | plt.show() 21 | -------------------------------------------------------------------------------- /blog19-Iris/test10-Kmeans.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.tree import DecisionTreeClassifier 3 | import numpy as np 4 | 5 | iris = load_iris() 6 | #训练集 7 | train_data = np.concatenate((iris.data[0:40, :], iris.data[50:90, :], iris.data[100:140, :]), axis = 0) 8 | train_target = np.concatenate((iris.target[0:40], iris.target[50:90], iris.target[100:140]), axis = 0) 9 | #测试集 10 | test_data = np.concatenate((iris.data[40:50, :], iris.data[90:100, :], iris.data[140:150, :]), axis = 0) 11 | test_target = np.concatenate((iris.target[40:50], iris.target[90:100], iris.target[140:150]), axis = 0) 12 | 13 | #训练 14 | clf = DecisionTreeClassifier() 15 | clf.fit(train_data, train_target) 16 | predict_target = clf.predict(test_data) 17 | print(predict_target) 18 | 19 | #预测结果与真实结果比对 20 | print(sum(predict_target == test_target)) 21 | 22 | #输出准确率 召回率 F值 23 | from sklearn import metrics 24 | print(metrics.classification_report(test_target,predict_target)) 25 | print(metrics.confusion_matrix(test_target,predict_target)) 26 | X = test_data 27 | L1 = [n[0] for n in X] 28 | print(L1) 29 | L2 = [n[1] for n in X] 30 | print(L2) 31 | 32 | import matplotlib.pyplot as plt 33 | plt.scatter(L1, L2, c=predict_target, marker='x') #cmap=plt.cm.Paired 34 | plt.title("DecisionTreeClassifier") 35 | plt.show() 36 | -------------------------------------------------------------------------------- /blog19-Iris/test11-Kmeans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.datasets import load_iris 3 | from sklearn.cluster import KMeans 4 | iris = load_iris() 5 | clf = KMeans() 6 | clf.fit(iris.data, iris.target) 7 | print(clf) 8 | predicted = 
clf.predict(iris.data) 9 | 10 | #获取花卉两列数据集 11 | X = iris.data 12 | L1 = [x[0] for x in X] 13 | print(L1) 14 | L2 = [x[1] for x in X] 15 | print(L2) 16 | 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | plt.scatter(L1, L2, c=predicted, marker='s',s=200,cmap=plt.cm.Paired) 20 | plt.title("Iris") 21 | plt.show() 22 | -------------------------------------------------------------------------------- /blog20-KNN/blog01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from sklearn.neighbors import KNeighborsClassifier 4 | 5 | X = np.array([[-1,-1],[-2,-2],[1,2], [1,1],[-3,-4],[3,2]]) 6 | Y = [0,0,1,1,0,1] 7 | x = [[4,5],[-4,-3],[2,6]] 8 | knn = KNeighborsClassifier(n_neighbors=3, algorithm="ball_tree") 9 | knn.fit(X,Y) 10 | pre = knn.predict(x) 11 | print(pre) 12 | -------------------------------------------------------------------------------- /blog20-KNN/blog02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | data = np.loadtxt("wine.txt",dtype=str,delimiter=",") 5 | print(data) 6 | 7 | yy, x = np.split(data, (1,), axis=1) 8 | print(yy.shape, x.shape) 9 | print(x) 10 | print(yy[:5]) 11 | -------------------------------------------------------------------------------- /blog20-KNN/blog03.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | data = np.loadtxt("wine.txt",dtype=str,delimiter=",") 5 | print(data) 6 | print(type(data)) 7 | 8 | yy, x = np.split(data, (1,), axis=1) 9 | print(yy.shape, x.shape) 10 | print(x) 11 | print(yy[:5]) 12 | 13 | #从字符型转换为Int整型 14 | X = x.astype(int) 15 | print(X) 16 | #字母转换为数字 17 | y = [] 18 | i = 0 19 | print(len(yy)) 20 | while i0: 16 | # print(x_mat[i][n]) 17 | i = i + 1 18 | print("The ", n , "feature is normal.") 19 | 20 | 
#-------------------------------------读取文件划分数据集----------------------------------------- 21 | fr = open("kddcup.data_10_percent_corrected.csv") 22 | data_file = open("kddcup.data_10_percent_corrected-result.csv",'wb+') 23 | lines = fr.readlines() 24 | line_nums = len(lines) 25 | print(line_nums) 26 | 27 | #创建line_nums行 para_num列的矩阵 28 | x_mat = np.zeros((line_nums, 42)) 29 | 30 | #划分数据集 31 | for i in range(line_nums): 32 | line = lines[i].strip() 33 | item_mat = line.split(',') 34 | x_mat[i, :] = item_mat[0:42] #获取42个特征 35 | fr.close() 36 | print(x_mat.shape) 37 | 38 | #--------------------------------获取某列特征并依次标准化并赋值----------------------------- 39 | print(len(x_mat[:, 0])) #获取某列数据 494021 40 | print(len(x_mat[0, :])) #获取某行数据 42 41 | 42 | #标准化处理 43 | ZscoreNormalization(x_mat[:, 0], 0) #duration 44 | ZscoreNormalization(x_mat[:, 0], 4) #src_bytes 45 | ZscoreNormalization(x_mat[:, 0], 5) #dst_bytes 46 | ZscoreNormalization(x_mat[:, 0], 7) #wrong_fragment 47 | ZscoreNormalization(x_mat[:, 0], 8) #urgent 48 | 49 | ZscoreNormalization(x_mat[:, 0], 9) #hot 50 | ZscoreNormalization(x_mat[:, 0], 10) #num_failed_logins 51 | ZscoreNormalization(x_mat[:, 0], 12) #num_compromised 52 | ZscoreNormalization(x_mat[:, 0], 14) #su_attempte 53 | ZscoreNormalization(x_mat[:, 0], 15) #num_root 54 | ZscoreNormalization(x_mat[:, 0], 16) #num_file_creations 55 | ZscoreNormalization(x_mat[:, 0], 17) #num_shells 56 | ZscoreNormalization(x_mat[:, 0], 18) #num_access_files 57 | ZscoreNormalization(x_mat[:, 0], 19) #num_outbound_cmds 58 | 59 | ZscoreNormalization(x_mat[:, 0], 22) #count 60 | ZscoreNormalization(x_mat[:, 0], 23) #srv_count 61 | ZscoreNormalization(x_mat[:, 0], 24) #serror_rate 62 | ZscoreNormalization(x_mat[:, 0], 25) #srv_serror_rate 63 | ZscoreNormalization(x_mat[:, 0], 26) #rerror_rate 64 | ZscoreNormalization(x_mat[:, 0], 27) #srv_rerror_rate 65 | ZscoreNormalization(x_mat[:, 0], 28) #same_srv_rate 66 | ZscoreNormalization(x_mat[:, 0], 29) #diff_srv_rate 67 | 
ZscoreNormalization(x_mat[:, 0], 30) #srv_diff_host_rate 68 | 69 | ZscoreNormalization(x_mat[:, 0], 31) #dst_host_count 70 | ZscoreNormalization(x_mat[:, 0], 32) #dst_host_srv_count 71 | ZscoreNormalization(x_mat[:, 0], 33) #dst_host_same_srv_rate 72 | ZscoreNormalization(x_mat[:, 0], 34) #dst_host_diff_srv_rate 73 | ZscoreNormalization(x_mat[:, 0], 35) #dst_host_same_src_port_rate 74 | ZscoreNormalization(x_mat[:, 0], 36) #dst_host_srv_diff_host_rate 75 | ZscoreNormalization(x_mat[:, 0], 37) #dst_host_serror_rate 76 | ZscoreNormalization(x_mat[:, 0], 38) #dst_host_srv_serror_rate 77 | ZscoreNormalization(x_mat[:, 0], 39) #dst_host_rerror_rate 78 | ZscoreNormalization(x_mat[:, 0], 40) #dst_host_srv_rerror_rate 79 | 80 | #Write the result rows to data_file. NOTE(review): data_file is opened with mode 'wb+'; csv.writer expects a text-mode file on Python 3 - confirm the target interpreter 81 | csv_writer = csv.writer(data_file) 82 | i = 0 83 | while i0: 16 | # print(x_mat[i][n]) 17 | i = i + 1 18 | print("The ", n , "feature is normal.") 19 | 20 | #-------------------------------------Read the file and build the data matrix----------------------------------------- 21 | fr = open("kddcup.data_10_percent_corrected-result.csv") 22 | data_file = open("kddcup.data_10_percent_corrected-result-minmax.csv",'wb+') 23 | lines = fr.readlines() 24 | line_nums = len(lines) 25 | print(line_nums) 26 | 27 | #Create a zero matrix with line_nums rows and 42 columns 28 | x_mat = np.zeros((line_nums, 42)) 29 | 30 | #Parse every input line into one matrix row 31 | for i in range(line_nums): 32 | line = lines[i].strip() 33 | item_mat = line.split(',') 34 | x_mat[i, :] = item_mat[0:42] #take the first 42 comma-separated fields 35 | fr.close() 36 | print(x_mat.shape) 37 | 38 | #--------------------------------Normalize the selected feature columns one by one----------------------------- 39 | print(len(x_mat[:, 0])) #length of one column, i.e. the number of rows (author's note: 494021) 40 | print(len(x_mat[0, :])) #length of one row, i.e. the number of columns (author's note: 42) 41 | 42 | #Min-max normalization. NOTE(review): every call passes column 0 (x_mat[:, 0]) as the data argument while only the index argument changes - confirm against the definition of MinmaxNormalization whether x_mat[:, n] was intended 43 | MinmaxNormalization(x_mat[:, 0], 0) #duration 44 | MinmaxNormalization(x_mat[:, 0], 4) #src_bytes 45 | MinmaxNormalization(x_mat[:, 0], 5) #dst_bytes 46 | MinmaxNormalization(x_mat[:, 0], 7) #wrong_fragment 47 | MinmaxNormalization(x_mat[:, 0], 8) #urgent 48 | 49 | MinmaxNormalization(x_mat[:, 0], 9) #hot 50 | 
MinmaxNormalization(x_mat[:, 0], 10) #num_failed_logins 51 | MinmaxNormalization(x_mat[:, 0], 12) #num_compromised 52 | MinmaxNormalization(x_mat[:, 0], 14) #su_attempted 53 | MinmaxNormalization(x_mat[:, 0], 15) #num_root 54 | MinmaxNormalization(x_mat[:, 0], 16) #num_file_creations 55 | MinmaxNormalization(x_mat[:, 0], 17) #num_shells 56 | MinmaxNormalization(x_mat[:, 0], 18) #num_access_files 57 | MinmaxNormalization(x_mat[:, 0], 19) #num_outbound_cmds 58 | 59 | MinmaxNormalization(x_mat[:, 0], 22) #count 60 | MinmaxNormalization(x_mat[:, 0], 23) #srv_count 61 | MinmaxNormalization(x_mat[:, 0], 24) #serror_rate 62 | MinmaxNormalization(x_mat[:, 0], 25) #srv_serror_rate 63 | MinmaxNormalization(x_mat[:, 0], 26) #rerror_rate 64 | MinmaxNormalization(x_mat[:, 0], 27) #srv_rerror_rate 65 | MinmaxNormalization(x_mat[:, 0], 28) #same_srv_rate 66 | MinmaxNormalization(x_mat[:, 0], 29) #diff_srv_rate 67 | MinmaxNormalization(x_mat[:, 0], 30) #srv_diff_host_rate 68 | 69 | MinmaxNormalization(x_mat[:, 0], 31) #dst_host_count 70 | MinmaxNormalization(x_mat[:, 0], 32) #dst_host_srv_count 71 | MinmaxNormalization(x_mat[:, 0], 33) #dst_host_same_srv_rate 72 | MinmaxNormalization(x_mat[:, 0], 34) #dst_host_diff_srv_rate 73 | MinmaxNormalization(x_mat[:, 0], 35) #dst_host_same_src_port_rate 74 | MinmaxNormalization(x_mat[:, 0], 36) #dst_host_srv_diff_host_rate 75 | MinmaxNormalization(x_mat[:, 0], 37) #dst_host_serror_rate 76 | MinmaxNormalization(x_mat[:, 0], 38) #dst_host_srv_serror_rate 77 | MinmaxNormalization(x_mat[:, 0], 39) #dst_host_rerror_rate 78 | MinmaxNormalization(x_mat[:, 0], 40) #dst_host_srv_rerror_rate 79 | 80 | #Write the result rows to data_file. NOTE(review): data_file is opened with mode 'wb+'; csv.writer expects a text-mode file on Python 3 - confirm the target interpreter 81 | csv_writer = csv.writer(data_file) 82 | i = 0 83 | while i threshold and data_set[2][k] == 1: 79 | normal1 += 1 80 | if data_set[1][k] > threshold and data_set[2][k] != 1: 81 | abnormal1 += 1 82 | roc_rate[0][j] = normal1 / normal # normal points above the threshold / all normal points 83 | roc_rate[1][j] = abnormal1 / abnormal # abnormal points above the threshold / all abnormal points 84 | return roc_rate 85 | 86 | #Figure 1 
# Scatter plot (continues the "Figure 1" comment on the preceding line).
# x-axis: sample index; y-axis: minimum Euclidean distance.
# Marker fill color follows the test-set class label; markers have no edge
# color, size 2 and alpha 1.
plt.figure(1)
plt.scatter(result[0], result[1], c=result[2], edgecolors='None', s=2, alpha=1)

# Figure 2: ROC curve.
# x-axis (false-alarm rate): normal points above the threshold / all normal points.
# y-axis (detection rate): abnormal points above the threshold / all abnormal points.
roc_rate = roc(result)
plt.figure(2)
plt.scatter(roc_rate[0], roc_rate[1], edgecolors='None', s=1, alpha=1)
plt.show()

# --------------------------------------------------------------------------------
# /blog29-DataPreprocessing&KNN/test05-knn-gitHub-roc.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import csv
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import neighbors

# ----------------------------- Step 1: load the data set -----------------------------
# The context manager closes the file even if reading raises.
with open("kddcup.data_10_yxz-result-minmax.csv") as fr:
    lines = fr.readlines()
line_nums = len(lines)
print(line_nums)

# One matrix row per input line, 31 feature columns.
x_mat = np.zeros((line_nums, 31))
y_label = []

# Parse every line: the first 31 fields are features, the last field is the label.
for i, raw_line in enumerate(lines):
    item_mat = raw_line.strip().split(',')
    x_mat[i, :] = item_mat[0:31]
    y_label.append(item_mat[-1])
print(x_mat.shape)
print(len(y_label))

# ----------------------------- Step 2: split the data set -----------------------------
# int(float(...)) also accepts labels that are written in float notation.
y = np.array([int(float(n)) for n in y_label], dtype=int)

# Hold out 40% of the samples as the test set.
train_data, test_data, train_target, test_target = train_test_split(
    x_mat, y, test_size=0.4, random_state=42)
print(train_data.shape, train_target.shape)
print(test_data.shape, test_target.shape)


# ----------------------------- Step 3: nearest-neighbour distances -----------------------------
def classify(input_vct, data_set):
    """Return the smallest Euclidean distance from input_vct to any row of data_set.

    input_vct is one sample (1-D array) and data_set is a 2-D array with one
    sample per row and the same number of columns as input_vct has elements.
    """
    # Broadcasting subtracts input_vct from every row, so no explicit
    # np.tile copy of the vector is needed.
    diff_mat = input_vct - data_set
    distance = (diff_mat ** 2).sum(axis=1) ** 0.5
    return distance.min(axis=0)


test_size = len(test_target)
result = np.zeros((test_size, 3))
for i in range(test_size):
    # columns: 1-based sample index, minimum Euclidean distance, test-set class label
    result[i] = i + 1, classify(test_data[i], train_data), test_target[i]
# Transpose so that result[0], result[1], result[2] are the three series.
result = np.transpose(result)


# ----------------------------- Step 4: evaluation and visualization -----------------------------
def roc(data_set, num_thresholds=1000):
    """Compute ROC points by sweeping a distance threshold.

    data_set is the transposed result matrix: row 1 holds the minimum
    distances and row 2 the class labels, where label 1 means "normal".
    The threshold takes num_thresholds evenly spaced values from 0 up to
    (but not including) the maximum distance.

    Returns an array of shape (2, num_thresholds): row 0 is the share of
    normal samples above the threshold (false-alarm rate), row 1 the share
    of abnormal samples above it (detection rate).

    Raises ValueError when either class is absent, because the
    corresponding rate would be a division by zero.
    """
    distances = data_set[1]
    is_normal = data_set[2] == 1
    normal = int(np.count_nonzero(is_normal))
    abnormal = int(is_normal.size) - normal
    if normal == 0 or abnormal == 0:
        raise ValueError("roc() needs at least one normal and one abnormal sample")

    max_dis = distances.max()
    # One column per threshold. Sizing this by the number of samples (as the
    # previous version did) overflows for small inputs and leaves unused
    # (0, 0) points for large ones.
    roc_rate = np.zeros((2, num_thresholds))
    for j in range(num_thresholds):
        threshold = max_dis / num_thresholds * j
        above = distances > threshold
        roc_rate[0][j] = np.count_nonzero(above & is_normal) / float(normal)
        roc_rate[1][j] = np.count_nonzero(above & ~is_normal) / float(abnormal)
    return roc_rate


# Figure 1: scatter plot.
# x-axis: sample index; y-axis: minimum Euclidean distance.
# Marker fill color follows the test-set class label; markers have no edge
# color, size 2 and alpha 1.
plt.figure(1)
plt.scatter(result[0], result[1], c=result[2], edgecolors='None', s=2, alpha=1)

# Figure 2: ROC curve.
# x-axis (false-alarm rate): normal points above the threshold / all normal points.
# y-axis (detection rate): abnormal points above the threshold / all abnormal points.
roc_rate = roc(result)
plt.figure(2)
plt.scatter(roc_rate[0], roc_rate[1], edgecolors='None', s=1, alpha=1)
plt.show()
-------------------------------------------------------------------------------- /blog29-DataPreprocessing&KNN/test06-knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import csv 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn import metrics 7 | import matplotlib.pyplot as plt 8 | from matplotlib.colors import ListedColormap 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.decomposition import PCA 11 | from sklearn import neighbors 12 | 13 | #-----------------------------------------第一步 加载数据集----------------------------------------- 14 | fr= open("kddcup.data_10_percent_corrected-result-minmax.csv") 15 | lines = fr.readlines() 16 | line_nums = len(lines) 17 | print(line_nums) 18 | 19 | #创建line_nums行 para_num列的矩阵 20 | x_mat = np.zeros((line_nums, 41)) 21 | y_label = [] 22 | 23 | #划分数据集 24 | for i in range(line_nums): 25 | line = lines[i].strip() 26 | item_mat = line.split(',') 27 | x_mat[i, :] = item_mat[0:41] #前41个特征 28 | y_label.append(item_mat[-1]) #类标 29 | fr.close() 30 | print x_mat.shape 31 | print len(y_label) 32 | 33 | 34 | #-----------------------------------------第二步 划分数据集----------------------------------------- 35 | y = [] 36 | for n in y_label: 37 | y.append(int(float(n))) 38 | y = np.array(y, dtype = int) #list转换数组 39 | 40 | #划分数据集 测试集40% 41 | train_data, test_data, train_target, test_target = train_test_split(x_mat, y, test_size=0.4, random_state=42) 42 | print train_data.shape, train_target.shape 43 | print test_data.shape, test_target.shape 44 | 45 | 46 | #-----------------------------------------第三步 KNN训练----------------------------------------- 47 | clf = neighbors.KNeighborsClassifier() 48 | clf.fit(train_data, train_target) 49 | print clf 50 | result = clf.predict(test_data) 51 | print result 52 | print test_target 53 | 54 | 55 | #-----------------------------------------第四步 评价算法----------------------------------------- 
56 | print sum(result==test_target) #预测结果与真实结果比对 57 | print(metrics.classification_report(test_target, result)) #准确率 召回率 F值 58 | 59 | 60 | #----------------------------------------第五步 降维可视化--------------------------------------- 61 | pca = PCA(n_components=2) 62 | newData = pca.fit_transform(test_data) 63 | plt.figure() 64 | plt.scatter(newData[:,0], newData[:,1], c=test_target, s=50) 65 | plt.show() 66 | --------------------------------------------------------------------------------