├── DBSCAN_data.txt ├── README.md ├── SVM_data.txt ├── sk-Birch.py ├── sk-DBSCAN.py ├── sk-DBSCAN1.py ├── sk-PCA主成分分析.py ├── sk-knn.py ├── sk-k均值聚类.py ├── sk-lasso-多回归.py ├── sk-lasso.py ├── sk-svm.py ├── sk-svm识别手写体.py ├── sk-交叉验证.py ├── sk-优化.py ├── sk-决策树.py ├── sk-分类大全.py ├── sk-卷积神经网络-识别手写数字.py ├── sk-卷积神经网络.py ├── sk-多类多标签.py ├── sk-密度聚类.py ├── sk-小批量k均值聚类.py ├── sk-层次聚类.py ├── sk-层次聚类1.py ├── sk-岭回归.py ├── sk-度量.py ├── sk-数据集-特征选择-交叉验证.py ├── sk-文档贝叶斯.py ├── sk-朴素贝叶斯.py ├── sk-样本数据集.py ├── sk-案例流程.py ├── sk-特征提取.py ├── sk-特征选择.py ├── sk-神经网络.py ├── sk-线性回归.py ├── sk-逻辑分类有b偏量.py ├── sk-逻辑分类没有b偏量.py ├── sk-随机梯度下降.py ├── sk-集成学习.py └── sk-预处理.py /DBSCAN_data.txt: -------------------------------------------------------------------------------- 1 | -2.68420713,1.469732895], 2 | [-2.71539062,-0.763005825], 3 | [-2.88981954,-0.618055245], 4 | [-2.7464372,-1.40005944], 5 | [-2.72859298,1.50266052], 6 | [-2.27989736,3.365022195], 7 | [-2.82089068,-0.369470295], 8 | [-2.62648199,0.766824075], 9 | [-2.88795857,-2.568591135], 10 | [-2.67384469,-0.48011265], 11 | [-2.50652679,2.933707545], 12 | [-2.61314272,0.096842835], 13 | [-2.78743398,-1.024830855], 14 | [-3.22520045,-2.264759595], 15 | [-2.64354322,5.33787705], 16 | [-2.38386932,6.05139453], 17 | [-2.6225262,3.681403515], 18 | [-2.64832273,1.436115015], 19 | [-2.19907796,3.956598405], 20 | [-2.58734619,2.34213138], 21 | [1.28479459,3.084476355], 22 | [0.93241075,1.436391405], 23 | [1.46406132,2.268854235], 24 | [0.18096721,-3.71521773], 25 | [1.08713449,0.339256755], 26 | [0.64043675,-1.87795566], 27 | [1.09522371,1.277510445], 28 | [-0.75146714,-4.504983795], 29 | [1.04329778,1.030306095], 30 | [-0.01019007,-3.242586915], 31 | [-0.5110862,-5.681213775], 32 | [0.51109806,-0.460278495], 33 | [0.26233576,-2.46551985], 34 | [0.98404455,-0.55962189], 35 | [-0.174864,-1.133170065], 36 | [0.92757294,2.107062945], 37 | [0.65959279,-1.583893305], 38 | [0.23454059,-1.493648235], 39 | [0.94236171,-2.43820017], 40 | [0.0432464,-2.616702525], 41 | [4.53172698,-0.05329008], 42 | [3.41407223,-2.58716277], 43 | [4.61648461,1.538708805], 44 | [3.97081495,-0.815065605], 45 | [4.34975798,-0.188471475], 46 | [5.39687992,2.462256225], 47 | [2.51938325,-5.361082605], 48 | [4.9320051,1.585696545], 49 | [4.31967279,-1.104966765], 50 | [4.91813423,3.511712835], 51 | [3.66193495,1.0891728], 52 | [3.80234045,-0.972695745], 53 | [4.16537886,0.96876126], 54 | [3.34459422,-3.493869435], 55 | [3.5852673,-2.426881725], 56 | [3.90474358,0.534685455], 57 | [3.94924878,0.18328617], 58 | [5.48876538,5.27195043], 59 | [5.79468686,1.139695065], 60 | [3.29832982,-3.42456273 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sklearn 2 | 数据挖掘库sklearn的使用教程和demo 3 | -------------------------------------------------------------------------------- /SVM_data.txt: -------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 
-0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /sk-Birch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.cluster import Birch 4 | from sklearn import metrics 5 | 6 | from sklearn.datasets.samples_generator import make_blobs 7 | # X为样本特征,Y为样本簇类别, 共1000个样本,每个样本2个特征,共4个簇,簇中心在[-1,-1], [0,0],[1,1], [2,2] 8 | X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.3, 0.4, 0.3],random_state =9) 9 | plt.scatter(X[:, 0], X[:, 1], marker='o',c=y) 10 | plt.show() 11 | 12 | 13 | # 不设置聚类数目的Birch 14 | y_pred = Birch(n_clusters = None).fit_predict(X) 15 | plt.scatter(X[:, 0], X[:, 1], c=y_pred) 16 | plt.show() 17 | print("CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 18 | 19 | 20 | # 设置聚类数目的Birch 21 | y_pred = Birch(n_clusters = 4).fit_predict(X) 22 | plt.scatter(X[:, 0], X[:, 1], c=y_pred) 23 | plt.show() 24 | print("CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 25 | 26 | 27 | # 尝试多个threshold取值,和多个branching_factor取值 28 | param_grid = {'threshold':[0.5,0.3,0.1],'branching_factor':[50,20,10]} # 定义优化参数字典,字典中的key值必须是分类算法的函数的参数名 29 | for threshold in param_grid['threshold']: 30 | for branching_factor in param_grid['branching_factor']: 31 | clf = Birch(n_clusters = 4,threshold=threshold,branching_factor=branching_factor) 32 | clf.fit(X) 33 | y_pred = clf.predict(X) 
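# CH指标(Calinski-Harabasz)衡量簇间离散度与簇内离散度之比,数值越大说明簇越紧凑、分离越好;
# 较新版本的 scikit-learn 中该函数拼写为 metrics.calinski_harabasz_score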
34 | print(threshold,branching_factor,"CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /sk-DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 数据结构 2 | import sklearn.cluster as skc # 密度聚类 3 | from sklearn import metrics # 评估模型 4 | import matplotlib.pyplot as plt # 可视化绘图 5 | 6 | data=[ 7 | [-2.68420713,1.469732895],[-2.71539062,-0.763005825],[-2.88981954,-0.618055245],[-2.7464372,-1.40005944],[-2.72859298,1.50266052], 8 | [-2.27989736,3.365022195],[-2.82089068,-0.369470295],[-2.62648199,0.766824075],[-2.88795857,-2.568591135],[-2.67384469,-0.48011265], 9 | [-2.50652679,2.933707545],[-2.61314272,0.096842835],[-2.78743398,-1.024830855],[-3.22520045,-2.264759595],[-2.64354322,5.33787705], 10 | [-2.38386932,6.05139453],[-2.6225262,3.681403515],[-2.64832273,1.436115015],[-2.19907796,3.956598405],[-2.58734619,2.34213138], 11 | [1.28479459,3.084476355],[0.93241075,1.436391405],[1.46406132,2.268854235],[0.18096721,-3.71521773],[1.08713449,0.339256755], 12 | [0.64043675,-1.87795566],[1.09522371,1.277510445],[-0.75146714,-4.504983795],[1.04329778,1.030306095],[-0.01019007,-3.242586915], 13 | [-0.5110862,-5.681213775],[0.51109806,-0.460278495],[0.26233576,-2.46551985],[0.98404455,-0.55962189],[-0.174864,-1.133170065], 14 | [0.92757294,2.107062945],[0.65959279,-1.583893305],[0.23454059,-1.493648235],[0.94236171,-2.43820017],[0.0432464,-2.616702525], 15 | [4.53172698,-0.05329008],[3.41407223,-2.58716277],[4.61648461,1.538708805],[3.97081495,-0.815065605],[4.34975798,-0.188471475], 16 | [5.39687992,2.462256225],[2.51938325,-5.361082605],[4.9320051,1.585696545],[4.31967279,-1.104966765],[4.91813423,3.511712835], 17 | [3.66193495,1.0891728],[3.80234045,-0.972695745],[4.16537886,0.96876126],[3.34459422,-3.493869435],[3.5852673,-2.426881725], 18 | [3.90474358,0.534685455],[3.94924878,0.18328617],[5.48876538,5.27195043],[5.79468686,1.139695065],[3.29832982,-3.42456273] 19 | ] 20 | X = np.array(data) 21 | 22 | db = skc.DBSCAN(eps=1.5, min_samples=3).fit(X) #DBSCAN聚类方法 还有参数,matric = ""距离计算方法 23 | labels = db.labels_ #和X同一个维度,labels对应索引序号的值 为她所在簇的序号。若簇编号为-1,表示为噪声 24 | 25 | print('每个样本的簇标号:') 26 | print(labels) 27 | 28 | raito = len(labels[labels[:] == -1]) / len(labels) #计算噪声点个数占总数的比例 29 | print('噪声比:', format(raito, '.2%')) 30 | 31 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # 获取分簇的数目 32 | 33 | print('分簇的数目: %d' % n_clusters_) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) #轮廓系数评价聚类的好坏 35 | 36 | for i in range(n_clusters_): 37 | print('簇 ', i, '的所有样本:') 38 | one_cluster = X[labels == i] 39 | print(one_cluster) 40 | plt.plot(one_cluster[:,0],one_cluster[:,1],'o') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /sk-DBSCAN1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import DBSCAN 3 | from sklearn import metrics 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.preprocessing import StandardScaler 6 | 7 | 8 | # ############################################################################# 9 | # 产生样本数据 10 | centers = [[1, 1], [-1, -1], [1, -1]] # 生成聚类中心点 11 | X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,random_state=0) # 生成样本数据集 12 | 13 | X = StandardScaler().fit_transform(X) # 
StandardScaler作用:去均值和方差归一化。且是针对每一个特征维度来做的,而不是针对样本。 14 | 15 | # ############################################################################# 16 | # 调用密度聚类 DBSCAN 17 | db = DBSCAN(eps=0.3, min_samples=10).fit(X) 18 | # print(db.labels_) # db.labels_为所有样本的聚类索引,没有聚类索引为-1 19 | # print(db.core_sample_indices_) # 所有核心样本的索引 20 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) # 设置一个样本个数长度的全false向量 21 | core_samples_mask[db.core_sample_indices_] = True #将核心样本部分设置为true 22 | labels = db.labels_ 23 | 24 | # 获取聚类个数。(聚类结果中-1表示没有聚类为离散点) 25 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 26 | 27 | # 模型评估 28 | print('估计的聚类个数为: %d' % n_clusters_) 29 | print("同质性: %0.3f" % metrics.homogeneity_score(labels_true, labels)) # 每个群集只包含单个类的成员。 30 | print("完整性: %0.3f" % metrics.completeness_score(labels_true, labels)) # 给定类的所有成员都分配给同一个群集。 31 | print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) # 同质性和完整性的调和平均 32 | print("调整兰德指数: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) 33 | print("调整互信息: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) 35 | 36 | # ############################################################################# 37 | # Plot result 38 | import matplotlib.pyplot as plt 39 | 40 | # 使用黑色标注离散点 41 | unique_labels = set(labels) 42 | colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] 43 | for k, col in zip(unique_labels, colors): 44 | if k == -1: # 聚类结果为-1的样本为离散点 45 | # 使用黑色绘制离散点 46 | col = [0, 0, 0, 1] 47 | 48 | class_member_mask = (labels == k) # 将所有属于该聚类的样本位置置为true 49 | 50 | xy = X[class_member_mask & core_samples_mask] # 将所有属于该类的核心样本取出,使用大图标绘制 51 | plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=14) 52 | 53 | xy = X[class_member_mask & ~core_samples_mask] # 将所有属于该类的非核心样本取出,使用小图标绘制 54 | plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=6) 55 | 56 | plt.title('Estimated number of clusters: %d' % n_clusters_) 57 | plt.show() 58 | 59 | -------------------------------------------------------------------------------- /sk-PCA主成分分析.py: -------------------------------------------------------------------------------- 1 | # # ======================PCA主成分分析================= 2 | # # 花卉样本数据集 3 | # from sklearn import datasets 4 | # import matplotlib.pyplot as plt 5 | # import numpy as np 6 | # iris = datasets.load_iris() 7 | # X = iris.data 8 | # y = iris.target 9 | # 10 | 11 | # from sklearn.decomposition import PCA,IncrementalPCA # 主成分分析(PCA) 12 | # pca = PCA(n_components=2) # PCA降维到2维 13 | # X_pca = pca.fit_transform(X) 14 | # 15 | # ipca = IncrementalPCA(n_components=2, batch_size=10) # 增量PCA降维到2维 16 | # X_ipca = ipca.fit_transform(X) 17 | # 18 | # pca = PCA(n_components=2, svd_solver='randomized', whiten=True) # PCA 使用随机SVD 19 | # X_pca1 = pca.fit_transform(X) 20 | # 21 | # 22 | # # 绘制PCA降维后的显示 23 | # plt.subplot(131) 24 | # plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=.8, lw=2) 25 | # plt.title('PCA') 26 | # 27 | # # 绘制增量PCA降维后的显示 28 | # plt.subplot(132) 29 | # plt.scatter(X_ipca[:, 0], X_ipca[:, 1], c=y, alpha=.8, lw=2) 30 | # plt.title('IPCA') 31 | # 32 | # # 绘制PCA使用随机SVD降维后的显示 33 | # plt.subplot(133) 34 | # plt.scatter(X_pca1[:, 0], X_pca1[:, 1], c=y, alpha=.8, lw=2) 35 | # plt.title('PCA with rand SVD') 36 | # plt.show() 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | # ======================核PCA主成分分析================= 45 | from sklearn.datasets import 
make_circles 46 | from sklearn.decomposition import PCA, KernelPCA 47 | import matplotlib.pyplot as plt 48 | import numpy as np 49 | X, y = make_circles(n_samples=400, factor=.3, noise=.05) # 生成样本数据集 50 | 51 | kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) # 核PCA降维 52 | X_kpca = kpca.fit_transform(X) 53 | X_back = kpca.inverse_transform(X_kpca) 54 | 55 | pca = PCA(n_components=2) # PCA降维到2维 56 | X_pca = pca.fit_transform(X) 57 | 58 | # # 绘制原始数据 59 | plt.subplot(221) 60 | plt.scatter(X[:, 0], X[:, 1], c=y, alpha=.8, lw=2) 61 | plt.title('Original space') 62 | 63 | # 绘制PCA降维后的显示 64 | plt.subplot(222) 65 | plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=.8, lw=2) 66 | plt.title('PCA') 67 | 68 | # 绘制KPCA降维后的显示 69 | plt.subplot(223) 70 | plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=y, alpha=.8, lw=2) 71 | plt.title('KPCA') 72 | 73 | # 绘制逆空间的显示 74 | plt.subplot(224) 75 | plt.scatter(X_back[:, 0], X_back[:, 1], c=y, alpha=.8, lw=2) 76 | plt.title('inverse space') 77 | 78 | plt.show() 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | # # ======================SparsePCA 稀疏主成分分析================= 87 | 88 | 89 | 90 | 91 | # # =================隐 Dirichlet 分配================= 92 | # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # 隐 Dirichlet 分配 93 | # lda = LinearDiscriminantAnalysis(n_components=2) # 降维到2维 94 | # X_r2 = lda.fit(X, y).transform(X) 95 | # 96 | # # Percentage of variance explained for each components 97 | # print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_)) 98 | # 99 | # 100 | # 101 | # plt.subplot(122) 102 | # for color, i, target_name in zip(colors, [0, 1, 2], target_names): 103 | # plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,label=target_name) 104 | # 105 | # plt.legend(loc='best', shadow=False, scatterpoints=1) 106 | # plt.title('LDA of IRIS dataset') 107 | # 108 | # plt.show() -------------------------------------------------------------------------------- /sk-knn.py: -------------------------------------------------------------------------------- 1 | # # ==============================无监督查找最近邻(常在聚类中使用,例如变色龙聚类算法)======================== 2 | # 3 | # from sklearn.neighbors import NearestNeighbors 4 | # import numpy as np # 快速操作结构数组的工具 5 | # 6 | # X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) # 样本数据 7 | # test_x = np.array([[-3.2, -2.1], [-2.6, -1.3], [1.4, 1.0], [3.1, 2.6], [2.5, 1.0], [-1.2, -1.3]]) # 设置测试数据 8 | # # test_x=X # 测试数据等于样本数据。这样就相当于在样本数据内部查找每个样本的邻节点了。 9 | # nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X) # 为X生成knn模型 10 | # distances, indices = nbrs.kneighbors(test_x) # 为test_x中的数据寻找模型中的邻节点 11 | # print('邻节点:',indices) 12 | # print('邻节点距离:',distances) 13 | # 14 | # # ==============================使用kd树和Ball树实现无监督查找最近邻======================== 15 | # 16 | # from sklearn.neighbors import KDTree,BallTree 17 | # import numpy as np # 快速操作结构数组的工具 18 | # 19 | # X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 20 | # # test_x = np.array([[-3.2, -2.1], [-2.6, -1.3], [1.4, 1.0], [3.1, 2.6], [2.5, 1.0], [-1.2, -1.3]]) # 设置测试数据 21 | # test_x=X # 测试数据等于样本数据。这样就相当于在样本数据内部查找每个样本的邻节点了。 22 | # kdt = KDTree(X, leaf_size=30, metric='euclidean') 23 | # distances,indices = kdt.query(test_x, k=2, return_distance=True) 24 | # print('邻节点:',indices) 25 | # print('邻节点距离:',distances) 26 | 27 | 28 | 29 | # # ==============================k最近邻分类======================== 30 | # import numpy as np # 快速操作结构数组的工具 31 | # from sklearn.neighbors 
import KNeighborsClassifier,KDTree # 导入knn分类器 32 | # 33 | # 34 | # # 数据集。4种属性,3种类别 35 | # data=[ 36 | # [ 5.1, 3.5, 1.4, 0.2, 0], 37 | # [ 4.9, 3.0, 1.4, 0.2, 0], 38 | # [ 4.7, 3.2, 1.3, 0.2, 0], 39 | # [ 4.6, 3.1, 1.5, 0.2, 0], 40 | # [ 5.0, 3.6, 1.4, 0.2, 0], 41 | # [ 7.0, 3.2, 4.7, 1.4, 1], 42 | # [ 6.4, 3.2, 4.5, 1.5, 1], 43 | # [ 6.9, 3.1, 4.9, 1.5, 1], 44 | # [ 5.5, 2.3, 4.0, 1.3, 1], 45 | # [ 6.5, 2.8, 4.6, 1.5, 1], 46 | # [ 6.3, 3.3, 6.0, 2.5, 2], 47 | # [ 5.8, 2.7, 5.1, 1.9, 2], 48 | # [ 7.1, 3.0, 5.9, 2.1, 2], 49 | # [ 6.3, 2.9, 5.6, 1.8, 2], 50 | # [ 6.5, 3.0, 5.8, 2.2, 2], 51 | # ] 52 | # 53 | # # 构造数据集 54 | # dataMat = np.array(data) 55 | # X = dataMat[:,0:4] 56 | # y = dataMat[:,4] 57 | # 58 | # knn = KNeighborsClassifier(n_neighbors=2,weights='distance') # 初始化一个knn模型,设置k=2。weights='distance'样本权重等于距离的倒数。'uniform'为统一权重 59 | # knn.fit(X, y) #根据样本集、结果集,对knn进行建模 60 | # result = knn.predict([[3, 2, 2, 5]]) #使用knn对新对象进行预测 61 | # print(result) 62 | 63 | 64 | # ==============================k最近邻回归======================== 65 | 66 | import numpy as np 67 | import matplotlib.pyplot as plt 68 | from sklearn import neighbors 69 | 70 | np.random.seed(0) 71 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 72 | T = np.linspace(0, 5, 500)[:, np.newaxis] 73 | y = np.sin(X).ravel() 74 | 75 | # 为输出值添加噪声 76 | y[::5] += 1 * (0.5 - np.random.rand(8)) 77 | 78 | # 训练回归模型 79 | n_neighbors = 5 80 | 81 | for i, weights in enumerate(['uniform', 'distance']): 82 | knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) 83 | y_ = knn.fit(X, y).predict(T) 84 | 85 | plt.subplot(2, 1, i + 1) 86 | plt.scatter(X, y, c='k', label='data') 87 | plt.plot(T, y_, c='g', label='prediction') 88 | plt.axis('tight') 89 | plt.legend() 90 | plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors,weights)) 91 | 92 | plt.show() -------------------------------------------------------------------------------- /sk-k均值聚类.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.cluster import Birch # 从sklearn.cluster机器学习聚类包中导入Birch聚类 3 | from sklearn.cluster import KMeans # 从sklearn.cluster机器学习聚类包中导入KMeans聚类 4 | 5 | """ 6 | 第1部分:数据集 7 | X表示二维矩阵数据,篮球运动员比赛数据 8 | 总共20行,每行两列数据 9 | 第一列表示球员每分钟助攻数:x1 10 | 第二列表示球员每分钟得分数:x2 11 | """ 12 | 13 | X = [[0.0888, 0.5885],[0.1399, 0.8291],[0.0747, 0.4974],[0.0983, 0.5772],[0.1276, 0.5703], 14 | [0.1671, 0.5835],[0.1906, 0.5276],[0.1061, 0.5523],[0.2446, 0.4007],[0.1670, 0.4770], 15 | [0.2485, 0.4313],[0.1227, 0.4909],[0.1240, 0.5668],[0.1461, 0.5113],[0.2315, 0.3788], 16 | [0.0494, 0.5590],[0.1107, 0.4799],[0.2521, 0.2735],[0.1007, 0.6318],[0.1067, 0.4326], 17 | [0.1456, 0.8280] 18 | ] 19 | 20 | """ 21 | 第2部分:KMeans聚类 22 | clf = KMeans(n_clusters=3) 表示类簇数为3,聚成3类数据,clf即赋值为KMeans 23 | y_pred = clf.fit_predict(X) 载入数据集X,并且将聚类的结果赋值给y_pred 24 | """ 25 | 26 | clf = KMeans(n_clusters=3) # 聚类算法,参数n_clusters=3,聚成3类 27 | y_pred = clf.fit_predict(X) # 直接对数据进行聚类,聚类不需要进行预测 28 | 29 | # 输出完整Kmeans函数,包括很多省略参数 30 | print('k均值模型:\n',clf) 31 | # 输出聚类预测结果,20行数据,每个y_pred对应X一行或一个球员,聚成3类,类标为0、1、2 32 | print('聚类结果:\n',y_pred) 33 | 34 | """ 35 | 第3部分:可视化绘图 36 | Python导入Matplotlib包,专门用于绘图 37 | import matplotlib.pyplot as plt 此处as相当于重命名,plt用于显示图像 38 | """ 39 | 40 | import numpy as np 41 | import matplotlib.pyplot as plt 42 | 43 | # 获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列 44 | x1 = [n[0] for n in X] 45 | x2 = [n[1] for n in X] 46 | 47 | # 绘制散点图 参数:x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点 48 | plt.scatter(x1, x2, c=y_pred, marker='x') 49 | 50 | # 
绘制标题 51 | plt.title("Kmeans-Basketball Data") 52 | 53 | # 绘制x轴和y轴坐标 54 | plt.xlabel("x1") 55 | plt.ylabel("x2") 56 | 57 | # 显示图形 58 | plt.show() -------------------------------------------------------------------------------- /sk-lasso-多回归.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.linear_model import MultiTaskLasso, Lasso 4 | 5 | rng = np.random.RandomState(42) 6 | # ===========================产生模拟样本数据========================= 7 | # 用随机的频率、相位产生正弦波的二维系数 8 | n_samples, n_features, n_tasks = 100, 30, 40 # n_samples样本个数,n_features特征个数,n_tasks估计值的个数 9 | n_relevant_features = 5 # 自定义实际有用特征的个数 10 | coef = np.zeros((n_tasks, n_features)) # 系数矩阵的维度 11 | 12 | times = np.linspace(0, 2 * np.pi, n_tasks) 13 | for k in range(n_relevant_features): 14 | coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1)) # 自定义数据矩阵,用来生成模拟输出值 15 | 16 | X = rng.randn(n_samples, n_features) # 产生随机输入矩阵 17 | Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks) # 输入*系数+噪声=模拟输出 18 | # ==============================使用样本数据训练系数矩阵============================ 19 | coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T]) 20 | coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_ # 多任务训练 21 | 22 | # ############################################################################# 23 | # Plot support and time series 24 | fig = plt.figure(figsize=(8, 5)) 25 | plt.subplot(1, 2, 1) 26 | plt.spy(coef_lasso_) 27 | plt.xlabel('Feature') 28 | plt.ylabel('Time (or Task)') 29 | plt.text(10, 5, 'Lasso') 30 | plt.subplot(1, 2, 2) 31 | plt.spy(coef_multi_task_lasso_) 32 | plt.xlabel('Feature') 33 | plt.ylabel('Time (or Task)') 34 | plt.text(10, 5, 'MultiTaskLasso') 35 | fig.suptitle('Coefficient non-zero location') 36 | 37 | feature_to_plot = 0 38 | plt.figure() 39 | lw = 2 40 | plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw, 41 | label='Ground truth') 42 | plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw, 43 | label='Lasso') 44 | plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw, 45 | label='MultiTaskLasso') 46 | plt.legend(loc='upper center') 47 | plt.axis('tight') 48 | plt.ylim([-1.1, 1.1]) 49 | plt.show() -------------------------------------------------------------------------------- /sk-lasso.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV # Lasso回归,LassoCV交叉验证实现alpha的选取,LassoLarsCV基于最小角回归交叉验证实现alpha的选取 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # 
========Lasso回归======== 25 | model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 26 | # model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 27 | # model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha 28 | model.fit(X, y) # 线性回归建模 29 | print('系数矩阵:\n',model.coef_) 30 | print('线性回归模型:\n',model) 31 | # print('最佳的alpha:',model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 32 | # 使用模型预测 33 | predicted = model.predict(X) 34 | 35 | # 绘制散点图 参数:x横轴 y纵轴 36 | plt.scatter(X, y, marker='x') 37 | plt.plot(X, predicted,c='r') 38 | 39 | # 绘制x轴和y轴坐标 40 | plt.xlabel("x") 41 | plt.ylabel("y") 42 | 43 | # 显示图形 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /sk-svm.py: -------------------------------------------------------------------------------- 1 | # 2 | # import numpy as np # 快速操作结构数组的工具 3 | # from sklearn import svm # svm支持向量机 4 | # import matplotlib.pyplot as plt # 可视化绘图 5 | # 6 | # 7 | # data_set = np.loadtxt("SVM_data.txt") 8 | # train_data = data_set[:,0:2] # 训练特征空间 9 | # train_target = np.sign(data_set[:,2]) # 训练集类标号 10 | # 11 | # test_data = [[3,-1], [1,1], [7,-3], [9,0]] # 测试特征空间 12 | # test_target = [-1, -1, 1, 1] # 测试集类标号 13 | # 14 | # plt.scatter(data_set[:,0],data_set[:,1],c=data_set[:,2]) # 绘制可视化图 15 | # plt.show() 16 | # 17 | # # 创建模型 18 | # clf = svm.SVC() 19 | # clf.fit(X=train_data, y=train_target,sample_weight=None) # 训练模型。参数sample_weight为每个样本设置权重。应对非均衡问题 20 | # result = clf.predict(test_data) # 使用模型预测值 21 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 22 | # 23 | # # 获得支持向量 24 | # print('支持向量:',clf.support_vectors_) 25 | # # 获得支持向量的索引 26 | # print('支持向量索引:',clf.support_) 27 | # # 为每一个类别获得支持向量的数量 28 | # print('支持向量数量:',clf.n_support_) 29 | # 30 | # 31 | # # # ===============================Linear SVM====================== 32 | # from sklearn.svm import LinearSVC 33 | # 34 | # clf = LinearSVC() # 创建线性可分svm模型,参数均使用默认值 35 | # clf.fit(train_data, train_target) # 训练模型 36 | # result = clf.predict(test_data) # 使用模型预测值 37 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 38 | # 39 | # 40 | # # # ===============================Linear NuSVC====================== 41 | # from sklearn.svm import NuSVC 42 | # 43 | # clf = NuSVC() # 创建线性可分svm模型,参数均使用默认值 44 | # clf.fit(train_data, train_target) # 训练模型 45 | # result = clf.predict(test_data) # 使用模型预测值 46 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 
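# 评估预测效果的简单示意(假设上面被注释掉的 SVC 代码已取消注释,result 与 test_target 均已定义,
# 此处仅为演示写法,可用 sklearn.metrics 对预测结果进行评估):
# from sklearn import metrics
# print('准确率:', metrics.accuracy_score(test_target, result))      # 预测正确的样本比例
# print(metrics.classification_report(test_target, result))          # 每个类别的精度、召回率和f1值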
47 | 48 | 49 | # ===============================样本不平衡、多分类的情况======================== 50 | import numpy as np 51 | import matplotlib.pyplot as plt 52 | from sklearn import svm 53 | 54 | # 创建不均衡样本 55 | rng = np.random.RandomState(0) 56 | n_samples_1 = 1000 57 | n_samples_2 = 100 58 | n_samples_3 = 100 59 | X = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2],0.5 * rng.randn(n_samples_3, 2) + [-3, 3]] # 三类样本点中心为(1.5,1.5)、(2,2)、(-3,3) 60 | y = [0] * (n_samples_1) + [1] * (n_samples_2)+ [2] * (n_samples_3) # 前面的1000个为类别0,后面的100个为类别1,最后100个类别为2 61 | 62 | # 创建模型获取分离超平面 63 | clf = svm.SVC(decision_function_shape='ovo',kernel='linear', C=1.0) # decision_function_shape='ovo'为使用1对1多分类处理。会创建n(n-1)/2个二分类。ovr为一对所有的处理方式 64 | clf.fit(X, y) 65 | 66 | # 多分类的情况下,获取其中二分类器的个数。 67 | dec = clf.decision_function([[1.5,1.5]]) # decision_function()的功能:计算样本点到分割超平面的函数距离。 包含几个2分类器,就有几个函数距离。 68 | print('二分类器个数:',dec.shape[1]) 69 | 70 | # 绘制,第一个二分类器的分割超平面 71 | w = clf.coef_[0] 72 | a = -w[0] / w[1] # a可以理解为斜率 73 | xx = np.linspace(-5, 5) 74 | yy = a * xx - clf.intercept_[0] / w[1] # 二维坐标下的直线方程 75 | 76 | # 使用类权重,获取分割超平面 77 | wclf = svm.SVC(kernel='linear', class_weight={1: 10}) 78 | wclf.fit(X, y) 79 | 80 | 81 | # 绘制 分割分割超平面 82 | ww = wclf.coef_[0] 83 | wa = -ww[0] / ww[1] 84 | wyy = wa * xx - wclf.intercept_[0] / ww[1] # 带权重的直线 85 | 86 | # 绘制第一个二分类器的分割超平面和样本点 87 | h0 = plt.plot(xx, yy, 'k-', label='no weights') 88 | h1 = plt.plot(xx, wyy, 'k--', label='with weights') 89 | plt.scatter(X[:, 0], X[:, 1], c=y) 90 | plt.legend() 91 | 92 | plt.show() 93 | 94 | # ===============================SVM回归预测======================== 95 | X = [[0, 0], [2, 2]] 96 | y = [0.5, 2.5] 97 | clf = svm.SVR() 98 | clf.fit(X, y) 99 | clf.predict([[1, 1]]) -------------------------------------------------------------------------------- /sk-svm识别手写体.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import operator 4 | from os import listdir 5 | from sklearn.svm import SVC 6 | 7 | 8 | def img2vector(filename): 9 | """ 10 | 将32x32的二进制图像转换为1x1024向量。 11 | Parameters: 12 | filename - 文件名 13 | Returns: 14 | returnVect - 返回的二进制图像的1x1024向量 15 | """ 16 | #创建1x1024零向量 17 | returnVect = np.zeros((1, 1024)) 18 | #打开文件 19 | fr = open(filename) 20 | #按行读取 21 | for i in range(32): 22 | #读一行数据 23 | lineStr = fr.readline() 24 | #每一行的前32个元素依次添加到returnVect中 25 | for j in range(32): 26 | returnVect[0, 32*i+j] = int(lineStr[j]) 27 | #返回转换后的1x1024向量 28 | return returnVect 29 | 30 | # 手写数字分类测试 31 | def handwritingClassTest(): 32 | #测试集的Labels 33 | hwLabels = [] 34 | #返回trainingDigits目录下的文件名 35 | trainingFileList = listdir('trainingDigits') 36 | #返回文件夹下文件的个数 37 | m = len(trainingFileList) 38 | #初始化训练的Mat矩阵,测试集 39 | trainingMat = np.zeros((m, 1024)) 40 | #从文件名中解析出训练集的类别 41 | for i in range(m): 42 | #获得文件的名字 43 | fileNameStr = trainingFileList[i] 44 | #获得分类的数字 45 | classNumber = int(fileNameStr.split('_')[0]) 46 | #将获得的类别添加到hwLabels中 47 | hwLabels.append(classNumber) 48 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 49 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 50 | clf = SVC(C=200,kernel='rbf') 51 | clf.fit(trainingMat,hwLabels) 52 | #返回testDigits目录下的文件列表 53 | testFileList = listdir('testDigits') 54 | #错误检测计数 55 | errorCount = 0.0 56 | #测试数据的数量 57 | mTest = len(testFileList) 58 | #从文件中解析出测试集的类别并进行分类测试 59 | for i in range(mTest): 60 | #获得文件的名字 61 | fileNameStr = testFileList[i] 62 | #获得分类的数字 63 | classNumber = int(fileNameStr.split('_')[0]) 
64 | #获得测试集的1x1024向量,用于训练 65 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 66 | #获得预测结果 67 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 68 | classifierResult = clf.predict(vectorUnderTest) 69 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 70 | if(classifierResult != classNumber): 71 | errorCount += 1.0 72 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 73 | 74 | if __name__ == '__main__': 75 | handwritingClassTest() -------------------------------------------------------------------------------- /sk-交叉验证.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split,cross_val_score,cross_validate # 交叉验证所需的函数 2 | from sklearn.model_selection import KFold,LeaveOneOut,LeavePOut,ShuffleSplit # 交叉验证所需的子集划分方法 3 | from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit # 分层分割 4 | from sklearn.model_selection import GroupKFold,LeaveOneGroupOut,LeavePGroupsOut,GroupShuffleSplit # 分组分割 5 | from sklearn.model_selection import TimeSeriesSplit # 时间序列分割 6 | from sklearn import datasets # 自带数据集 7 | from sklearn import svm # SVM算法 8 | from sklearn import preprocessing # 预处理模块 9 | from sklearn.metrics import recall_score # 模型度量 10 | 11 | iris = datasets.load_iris() # 加载数据集 12 | print('样本集大小:',iris.data.shape,iris.target.shape) 13 | 14 | # ===================================数据集划分,训练模型========================== 15 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) # 交叉验证划分训练集和测试集.test_size为测试集所占的比例 16 | print('训练集大小:',X_train.shape,y_train.shape) # 训练集样本大小 17 | print('测试集大小:',X_test.shape,y_test.shape) # 测试集样本大小 18 | clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) # 使用训练集训练模型 19 | print('准确率:',clf.score(X_test, y_test)) # 计算测试集的度量值(准确率) 20 | 21 | 22 | # 如果涉及到归一化,则在测试集上也要使用训练集模型提取的归一化函数。 23 | scaler = preprocessing.StandardScaler().fit(X_train) # 通过训练集获得归一化函数模型。(也就是先减几,再除以几的函数)。在训练集和测试集上都使用这个归一化函数 24 | X_train_transformed = scaler.transform(X_train) 25 | clf = svm.SVC(kernel='linear', C=1).fit(X_train_transformed, y_train) # 使用训练集训练模型 26 | X_test_transformed = scaler.transform(X_test) 27 | print(clf.score(X_test_transformed, y_test)) # 计算测试集的度量值(准确度) 28 | 29 | # ===================================直接调用交叉验证评估模型========================== 30 | clf = svm.SVC(kernel='linear', C=1) 31 | scores = cross_val_score(clf, iris.data, iris.target, cv=5) #cv为迭代次数。 32 | print(scores) # 打印输出每次迭代的度量值(准确度) 33 | print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # 获取置信区间。(也就是均值和方差) 34 | 35 | # ===================================多种度量结果====================================== 36 | scoring = ['precision_macro', 'recall_macro'] # precision_macro为精度,recall_macro为召回率 37 | scores = cross_validate(clf, iris.data, iris.target, scoring=scoring,cv=5, return_train_score=True) 38 | sorted(scores.keys()) 39 | print('测试结果:',scores) # scores类型为字典。包含训练得分,拟合次数, score-times (得分次数) 40 | 41 | 42 | # ==================================K折交叉验证、留一交叉验证、留p交叉验证、随机排列交叉验证========================================== 43 | # k折划分子集 44 | kf = KFold(n_splits=2) 45 | for train, test in kf.split(iris.data): 46 | print("k折划分:%s %s" % (train.shape, test.shape)) 47 | break 48 | 49 | # 留一划分子集 50 | loo = LeaveOneOut() 51 | for train, test in loo.split(iris.data): 52 | print("留一划分:%s %s" % (train.shape, test.shape)) 53 | break 54 | 55 | # 留p划分子集 56 | lpo = LeavePOut(p=2) 57 | for train, test in 
loo.split(iris.data): 58 | print("留p划分:%s %s" % (train.shape, test.shape)) 59 | break 60 | 61 | # 随机排列划分子集 62 | ss = ShuffleSplit(n_splits=3, test_size=0.25,random_state=0) 63 | for train_index, test_index in ss.split(iris.data): 64 | print("随机排列划分:%s %s" % (train.shape, test.shape)) 65 | break 66 | 67 | # ==================================分层K折交叉验证、分层随机交叉验证========================================== 68 | skf = StratifiedKFold(n_splits=3) #各个类别的比例大致和完整数据集中相同 69 | for train, test in skf.split(iris.data, iris.target): 70 | print("分层K折划分:%s %s" % (train.shape, test.shape)) 71 | break 72 | 73 | skf = StratifiedShuffleSplit(n_splits=3) # 划分中每个类的比例和完整数据集中的相同 74 | for train, test in skf.split(iris.data, iris.target): 75 | print("分层随机划分:%s %s" % (train.shape, test.shape)) 76 | break 77 | 78 | 79 | # ==================================组 k-fold交叉验证、留一组交叉验证、留 P 组交叉验证、Group Shuffle Split========================================== 80 | X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10] 81 | y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] 82 | groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] 83 | 84 | # k折分组 85 | gkf = GroupKFold(n_splits=3) # 训练集和测试集属于不同的组 86 | for train, test in gkf.split(X, y, groups=groups): 87 | print("组 k-fold分割:%s %s" % (train, test)) 88 | 89 | # 留一分组 90 | logo = LeaveOneGroupOut() 91 | for train, test in logo.split(X, y, groups=groups): 92 | print("留一组分割:%s %s" % (train, test)) 93 | 94 | # 留p分组 95 | lpgo = LeavePGroupsOut(n_groups=2) 96 | for train, test in lpgo.split(X, y, groups=groups): 97 | print("留 P 组分割:%s %s" % (train, test)) 98 | 99 | # 随机分组 100 | gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0) 101 | for train, test in gss.split(X, y, groups=groups): 102 | print("随机分割:%s %s" % (train, test)) 103 | 104 | 105 | # ==================================时间序列分割========================================== 106 | tscv = TimeSeriesSplit(n_splits=3) 107 | TimeSeriesSplit(max_train_size=None, n_splits=3) 108 | for train, test in tscv.split(iris.data): 109 | print("时间序列分割:%s %s" % (train, test)) -------------------------------------------------------------------------------- /sk-优化.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris # 自带的样本数据集 2 | from sklearn.neighbors import KNeighborsClassifier # 要估计的是knn里面的参数,包括k的取值和样本权重分布方式 3 | import matplotlib.pyplot as plt # 可视化绘图 4 | from sklearn.model_selection import GridSearchCV,RandomizedSearchCV # 网格搜索和随机搜索 5 | 6 | iris = load_iris() 7 | 8 | X = iris.data # 150个样本,4个属性 9 | y = iris.target # 150个类标号 10 | 11 | k_range = range(1, 31) # 优化参数k的取值范围 12 | weight_options = ['uniform', 'distance'] # 代估参数权重的取值范围。uniform为统一取权值,distance表示距离倒数取权值 13 | # 下面是构建parameter grid,其结构是key为参数名称,value是待搜索的数值列表的一个字典结构 14 | param_grid = {'n_neighbors':k_range,'weights':weight_options} # 定义优化参数字典,字典中的key值必须是分类算法的函数的参数名 15 | print(param_grid) 16 | 17 | knn = KNeighborsClassifier(n_neighbors=5) # 定义分类算法。n_neighbors和weights的参数名称和param_grid字典中的key名对应 18 | 19 | 20 | # ================================网格搜索======================================= 21 | # 这里GridSearchCV的参数形式和cross_val_score的形式差不多,其中param_grid是parameter grid所对应的参数 22 | # GridSearchCV中的n_jobs设置为-1时,可以实现并行计算(如果你的电脑支持的情况下) 23 | grid = GridSearchCV(estimator = knn, param_grid = param_grid, cv=10, scoring='accuracy') #针对每个参数对进行了10次交叉验证。scoring='accuracy'使用准确率为结果的度量指标。可以添加多个度量指标 24 | grid.fit(X, y) 25 | 26 | print('网格搜索-度量记录:',grid.cv_results_) # 包含每次训练的相关信息 27 | print('网格搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 28 | 
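# 说明:best_score_ 是最佳参数组合在交叉验证中的平均得分(这里即平均准确率)。
# 在默认 refit=True 的情况下,grid 本身已用最佳参数在全部数据上重新拟合,
# 因此也可以直接用它进行预测(示意写法):
# print(grid.predict([[3, 5, 4, 2]]))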
print('网格搜索-最佳参数:',grid.best_params_) # 获取最佳度量值时的代定参数的值。是一个字典 29 | print('网格搜索-最佳模型:',grid.best_estimator_) # 获取最佳度量时的分类器模型 30 | 31 | 32 | # 使用获取的最佳参数生成模型,预测数据 33 | knn = KNeighborsClassifier(n_neighbors=grid.best_params_['n_neighbors'], weights=grid.best_params_['weights']) # 取出最佳参数进行建模 34 | knn.fit(X, y) # 训练模型 35 | print(knn.predict([[3, 5, 4, 2]])) # 预测新对象 36 | 37 | 38 | 39 | # =====================================随机搜索=========================================== 40 | rand = RandomizedSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_iter=10, random_state=5) # 41 | rand.fit(X, y) 42 | 43 | print('随机搜索-度量记录:',grid.cv_results_) # 包含每次训练的相关信息 44 | print('随机搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 45 | print('随机搜索-最佳参数:',grid.best_params_) # 获取最佳度量值时的代定参数的值。是一个字典 46 | print('随机搜索-最佳模型:',grid.best_estimator_) # 获取最佳度量时的分类器模型 47 | 48 | 49 | # 使用获取的最佳参数生成模型,预测数据 50 | knn = KNeighborsClassifier(n_neighbors=grid.best_params_['n_neighbors'], weights=grid.best_params_['weights']) # 取出最佳参数进行建模 51 | knn.fit(X, y) # 训练模型 52 | print(knn.predict([[3, 5, 4, 2]])) # 预测新对象 53 | 54 | 55 | # =====================================自定义度量=========================================== 56 | from sklearn import metrics 57 | # 自定义度量函数 58 | def scorerfun(estimator, X, y): 59 | y_pred = estimator.predict(X) 60 | return metrics.accuracy_score(y, y_pred) 61 | 62 | rand = RandomizedSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_iter=10, random_state=5) # 63 | rand.fit(X, y) 64 | 65 | print('随机搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /sk-决策树.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import pandas as pd # 数据分析处理工具 3 | import matplotlib.pyplot as plt # 画图工具 4 | from sklearn import datasets # 机器学习库 5 | from sklearn.preprocessing import LabelEncoder 6 | from sklearn import tree 7 | 8 | 9 | 10 | # 下面的数据分为为每个用户的来源网站、位置、是否阅读FAQ、浏览网页数目、选择的服务类型(目标结果) 11 | attr_arr=[['slashdot','USA','yes',18,'None'], 12 | ['google','France','yes',23,'Premium'], 13 | ['digg','USA','yes',24,'Basic'], 14 | ['kiwitobes','France','yes',23,'Basic'], 15 | ['google','UK','no',21,'Premium'], 16 | ['(direct)','New Zealand','no',12,'None'], 17 | ['(direct)','UK','no',21,'Basic'], 18 | ['google','USA','no',24,'Premium'], 19 | ['slashdot','France','yes',19,'None'], 20 | ['digg','USA','no',18,'None'], 21 | ['google','UK','no',18,'None'], 22 | ['kiwitobes','UK','no',19,'None'], 23 | ['digg','New Zealand','yes',12,'Basic'], 24 | ['slashdot','UK','no',21,'None'], 25 | ['google','UK','yes',18,'Basic'], 26 | ['kiwitobes','France','yes',19,'Basic']] 27 | 28 | #生成属性数据集和结果数据集 29 | dataMat = np.mat(attr_arr) 30 | arrMat = dataMat[:,0:4] 31 | resultMat = dataMat[:,4] 32 | 33 | # 构造数据集成pandas结构,为了能理解属性的名称 34 | attr_names = ['src', 'address', 'FAQ', 'num'] #特征属性的名称 35 | attr_pd = pd.DataFrame(data=arrMat,columns=attr_names) #每行为一个对象,每列为一种属性,最后一个为结果值 36 | print(attr_pd) 37 | 38 | #将数据集中的字符串转化为代表类别的数字。因为sklearn的决策树只识别数字 39 | le = LabelEncoder() 40 | for col in attr_pd.columns: #为每一列序列化,就是将每种字符串转化为对应的数字。用数字代表类别 41 | attr_pd[col] = le.fit_transform(attr_pd[col]) 42 | print(attr_pd) 43 | 44 | # 构建决策树 45 | clf = tree.DecisionTreeClassifier() 46 | clf.fit(attr_pd, resultMat) 47 | print(clf) 48 | 49 | # 使用决策树进行预测 50 | result = clf.predict([[1,1,1,0]]) # 输入也必须是数字的。分别代表了每个数字所代表的属性的字符串值 51 | print(result) 52 | 53 | # 将决策树保存成图片 54 | from sklearn.externals.six import StringIO 55 | import 
pydotplus 56 | 57 | dot_data = StringIO() 58 | target_name=['None','Basic','Premium'] 59 | tree.export_graphviz(clf, out_file=dot_data,feature_names=attr_names, 60 | class_names=target_name,filled=True,rounded=True, 61 | special_characters=True) 62 | graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 63 | graph.write_png('tree.png') 64 | 65 | -------------------------------------------------------------------------------- /sk-分类大全.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | # ========加载数据(Data Loading)======== 4 | import numpy as np 5 | import urllib.request 6 | 7 | # 数据集的请求地址 8 | url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 9 | # 下载响应的csv文件 10 | raw_data = urllib.request.urlopen(url) 11 | # 加载csv文件成numpy中的矩阵 12 | dataset = np.loadtxt(raw_data, delimiter=",") 13 | # 分割成属性集和结果集 14 | X = dataset[:,0:7] # 特征矩阵 15 | y = dataset[:,8] #目标矩阵 16 | # print('特征矩阵:\n',X) 17 | # print('结果矩阵:\n',y) 18 | 19 | # ========数据归一化(Data Normalization)======== 20 | from sklearn import preprocessing 21 | # 归一化数据集 22 | normalized_X = preprocessing.normalize(X) 23 | # 标准话数据集 24 | standardized_X = preprocessing.scale(X) 25 | 26 | # ========特征选择(Feature Selection)======== 27 | # 树算法(Tree algorithms)计算特征的信息量 28 | from sklearn import metrics 29 | from sklearn.ensemble import ExtraTreesClassifier 30 | model = ExtraTreesClassifier() 31 | model.fit(X, y) 32 | # 显示每个特征的重要性 33 | print('属性重要性:\n',model.feature_importances_) 34 | 35 | # ========逻辑回归======== 36 | from sklearn import metrics 37 | from sklearn.linear_model import LogisticRegression 38 | model = LogisticRegression() 39 | model.fit(X, y) 40 | print('逻辑回归模型:\n',model) 41 | # 使用模型预测 42 | expected = y 43 | predicted = model.predict(X) 44 | # 评估模型 45 | print(metrics.classification_report(expected, predicted)) #评估模型 46 | print(metrics.confusion_matrix(expected, predicted)) # 使用混淆矩阵评估模型 47 | 48 | # ========朴素贝叶斯======== 49 | from sklearn import metrics 50 | from sklearn.naive_bayes import GaussianNB 51 | model = GaussianNB() 52 | model.fit(X, y) 53 | print('朴素贝叶斯模型:\n',model) 54 | # 使用模型预测 55 | expected = y 56 | predicted = model.predict(X) 57 | # 评估模型 58 | print(metrics.classification_report(expected, predicted)) 59 | print(metrics.confusion_matrix(expected, predicted)) 60 | 61 | # ========k近邻======== 62 | from sklearn import metrics 63 | from sklearn.neighbors import KNeighborsClassifier 64 | # 使用样本数据构建knn模型 65 | model = KNeighborsClassifier() 66 | model.fit(X, y) 67 | print('KNN模型:\n',model) 68 | # 使用模型预测 69 | expected = y 70 | predicted = model.predict(X) 71 | # 评估模型 72 | print(metrics.classification_report(expected, predicted)) 73 | print(metrics.confusion_matrix(expected, predicted)) 74 | 75 | 76 | # ========决策树======== 77 | from sklearn import metrics 78 | from sklearn.tree import DecisionTreeClassifier 79 | # 构建决策树模型 80 | model = DecisionTreeClassifier() 81 | model.fit(X, y) 82 | print('决策树模型:\n',model) 83 | # 使用模型预测 84 | expected = y 85 | predicted = model.predict(X) 86 | # 评估模型 87 | print(metrics.classification_report(expected, predicted)) 88 | print(metrics.confusion_matrix(expected, predicted)) 89 | 90 | 91 | # ========支持向量机======== 92 | from sklearn import metrics 93 | from sklearn.svm import SVC 94 | # 构建svm模型 95 | model = SVC() 96 | model.fit(X, y) 97 | print('SVM模型:\n',model) 98 | # 使用模型预测 99 | expected = y 100 | predicted = model.predict(X) 101 | # 评估模型 102 | 
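# classification_report 给出每个类别的精度(precision)、召回率(recall)和 f1 值;confusion_matrix 则统计真实类别与预测类别的交叉计数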
print(metrics.classification_report(expected, predicted)) 103 | print(metrics.confusion_matrix(expected, predicted)) 104 | 105 | # ========优化算法参数======== 106 | import numpy as np 107 | from sklearn.linear_model import Ridge #岭回归模型 108 | from scipy.stats import uniform as sp_rand 109 | from sklearn.grid_search import GridSearchCV #网格搜索 110 | from sklearn.grid_search import RandomizedSearchCV # 随机搜索 111 | 112 | # 准备参数的可取值 113 | alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) 114 | # 构建岭回归模型,并尝试参数每一个可取值 115 | model = Ridge() 116 | rsearch = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas)) 117 | 118 | # # 只给定区间,参数随机取值 119 | # param_grid = {'alpha': sp_rand()} 120 | # # 构建岭回归模型,并尝试参数随机值 121 | # model = Ridge() 122 | # rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100) 123 | 124 | rsearch.fit(X, y) 125 | print(rsearch) 126 | # 评估搜索结果 127 | print(rsearch.best_score_) 128 | print(rsearch.best_estimator_.alpha) 129 | -------------------------------------------------------------------------------- /sk-卷积神经网络-识别手写数字.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from scipy.ndimage import convolve 5 | from sklearn import linear_model, datasets, metrics 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.neural_network import BernoulliRBM 8 | from sklearn.pipeline import Pipeline 9 | 10 | 11 | ############################################################################### 12 | # Setting up 13 | 14 | def nudge_dataset(X, Y): 15 | """ 16 | This produces a dataset 5 times bigger than the original one, 17 | by moving the 8x8 images in X around by 1px to left, right, down, up 18 | """ 19 | direction_vectors = [ 20 | [[0, 1, 0], 21 | [0, 0, 0], 22 | [0, 0, 0]], 23 | 24 | [[0, 0, 0], 25 | [1, 0, 0], 26 | [0, 0, 0]], 27 | 28 | [[0, 0, 0], 29 | [0, 0, 1], 30 | [0, 0, 0]], 31 | 32 | [[0, 0, 0], 33 | [0, 0, 0], 34 | [0, 1, 0]]] 35 | 36 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',weights=w).ravel() 37 | X = np.concatenate([X] +[np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]) 38 | Y = np.concatenate([Y for _ in range(5)], axis=0) 39 | return X, Y 40 | 41 | 42 | # 记载数据集 43 | digits = datasets.load_digits() 44 | X = np.asarray(digits.data, 'float32') 45 | X, Y = nudge_dataset(X, digits.target) 46 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 归一化到0-1 47 | 48 | 49 | # 交叉验证 50 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=0) 51 | 52 | # 逻辑回归模型 53 | logistic = linear_model.LogisticRegression() 54 | rbm = BernoulliRBM(random_state=0, verbose=True) 55 | 56 | classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) 57 | 58 | ############################################################################### 59 | # Training 60 | 61 | # Hyper-parameters. These were set by cross-validation, 62 | # using a GridSearchCV. Here we are not performing cross-validation to 63 | # save time. 
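# A rough, commented-out sketch of what that grid search could look like (the candidate values
# below are illustrative only; GridSearchCV lives in sklearn.model_selection, or sklearn.grid_search
# in very old releases). Pipeline steps are addressed with the step-name + double-underscore syntax:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'rbm__learning_rate': [0.01, 0.06, 0.1],
#               'rbm__n_components': [50, 100, 200],
#               'logistic__C': [1.0, 100.0, 6000.0]}
# search = GridSearchCV(classifier, param_grid, cv=3)
# search.fit(X_train, Y_train)
# print(search.best_params_)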
64 | rbm.learning_rate = 0.06 65 | rbm.n_iter = 20 66 | # More components tend to give better prediction performance, but larger 67 | # fitting time 68 | rbm.n_components = 100 69 | logistic.C = 6000.0 70 | 71 | # Training RBM-Logistic Pipeline 72 | classifier.fit(X_train, Y_train) 73 | 74 | # Training Logistic regression 75 | logistic_classifier = linear_model.LogisticRegression(C=100.0) 76 | logistic_classifier.fit(X_train, Y_train) 77 | 78 | ############################################################################### 79 | # Evaluation 80 | 81 | print() 82 | print("Logistic regression using RBM features:\n%s\n" % ( 83 | metrics.classification_report( 84 | Y_test, 85 | classifier.predict(X_test)))) 86 | 87 | print("Logistic regression using raw pixel features:\n%s\n" % ( 88 | metrics.classification_report(Y_test,logistic_classifier.predict(X_test)))) 89 | 90 | ############################################################################### 91 | # Plotting 92 | 93 | plt.figure(figsize=(4.2, 4)) 94 | for i, comp in enumerate(rbm.components_): 95 | plt.subplot(10, 10, i + 1) 96 | plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,interpolation='nearest') 97 | plt.xticks(()) 98 | plt.yticks(()) 99 | plt.suptitle('100 components extracted by RBM', fontsize=16) 100 | plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) 101 | plt.show() -------------------------------------------------------------------------------- /sk-卷积神经网络.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | from urllib import request 5 | import pickle 6 | import os 7 | import gzip 8 | import numpy as np 9 | import theano 10 | import lasagne 11 | from lasagne import layers 12 | from lasagne.updates import nesterov_momentum 13 | from nolearn.lasagne import NeuralNet 14 | from nolearn.lasagne import visualize 15 | from sklearn.metrics import classification_report 16 | from sklearn.metrics import confusion_matrix 17 | 18 | def load_dataset(): 19 | url = 'http://deeplearning.net/data/mnist/mnist.pkl.gz' 20 | filename = 'mnist.pkl.gz' 21 | if not os.path.exists(filename): 22 | print("Downloading MNIST dataset...") 23 | request.urlretrieve(url, filename) 24 | with gzip.open(filename, 'rb') as f: 25 | data = pickle.load(f) 26 | X_train, y_train = data[0] 27 | X_val, y_val = data[1] 28 | X_test, y_test = data[2] 29 | X_train = X_train.reshape((-1, 1, 28, 28)) 30 | X_val = X_val.reshape((-1, 1, 28, 28)) 31 | X_test = X_test.reshape((-1, 1, 28, 28)) 32 | y_train = y_train.astype(np.uint8) 33 | y_val = y_val.astype(np.uint8) 34 | y_test = y_test.astype(np.uint8) 35 | return X_train, y_train, X_val, y_val, X_test, y_test 36 | 37 | # 加载MNIST数据集并检验它 38 | X_train, y_train, X_val, y_val, X_test, y_test = load_dataset() 39 | plt.imshow(X_train[0][0], cmap=cm.binary) -------------------------------------------------------------------------------- /sk-多类多标签.py: -------------------------------------------------------------------------------- 1 | # 多标签分类格式。将多分类转换为二分类的格式,类似于one-hot编码 2 | from sklearn.preprocessing import MultiLabelBinarizer 3 | y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]] 4 | y_new = MultiLabelBinarizer().fit_transform(y) 5 | print('新的输出格式:\n',y_new) 6 | 7 | 8 | 9 | # =========1对其余的多分类构造方式================ 10 | from sklearn import datasets 11 | from sklearn.multiclass import OneVsRestClassifier 12 | from sklearn.svm import LinearSVC 13 | 14 | iris = datasets.load_iris() 15 | X, y = 
iris.data, iris.target 16 | clf = LinearSVC(random_state=0) # 构建二分类器 17 | clf = OneVsRestClassifier(clf) # 根据二分类器构建多分类器 18 | clf.fit(X, y) # 训练模型 19 | y_pred = clf.predict(X) # 预测样本 20 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 21 | 22 | 23 | # =========1对1的多分类构造方式================ 24 | from sklearn import datasets 25 | from sklearn.multiclass import OneVsOneClassifier 26 | from sklearn.svm import LinearSVC 27 | 28 | iris = datasets.load_iris() 29 | X, y = iris.data, iris.target 30 | clf = LinearSVC(random_state=0) # 构建二分类器 31 | clf = OneVsOneClassifier(clf) # 根据二分类器构建多分类器 32 | clf.fit(X, y) # 训练模型 33 | y_pred = clf.predict(X) # 预测样本 34 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 35 | 36 | 37 | # =========误差校正输出代码================ 38 | from sklearn import datasets 39 | from sklearn.multiclass import OutputCodeClassifier 40 | from sklearn.svm import LinearSVC 41 | 42 | iris = datasets.load_iris() 43 | X, y = iris.data, iris.target 44 | clf = LinearSVC(random_state=0) # 构建二分类器 45 | clf = OutputCodeClassifier(clf,code_size=2, random_state=0) # 根据二分类器构建多分类器 46 | clf.fit(X, y) # 训练模型 47 | y_pred = clf.predict(X) # 预测样本 48 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 49 | 50 | 51 | # =========多输出回归================ 52 | from sklearn.datasets import make_regression 53 | from sklearn.multioutput import MultiOutputRegressor 54 | from sklearn.ensemble import GradientBoostingRegressor 55 | from sklearn import metrics 56 | X, y = make_regression(n_samples=10, n_targets=3, random_state=1) # 产生10个样本,每个样本100个属性,每个样本3个输出值 57 | print('样本特征维度',X.shape) 58 | print('样本输出维度',y.shape) 59 | clf = GradientBoostingRegressor(random_state=0) 60 | clf =MultiOutputRegressor(clf) 61 | clf.fit(X, y) 62 | y_pred = clf.predict(X) # 预测样本 63 | print('均方误差:',metrics.mean_squared_error(y, y_pred)) # 均方误差 64 | 65 | 66 | # =========多输出分类================ 67 | from sklearn.datasets import make_classification 68 | from sklearn.multioutput import MultiOutputClassifier 69 | from sklearn.ensemble import RandomForestClassifier 70 | from sklearn.utils import shuffle 71 | import numpy as np 72 | X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1) # 生成分类数据集,10个样本,100个特征,30个有效特征,3种分类 73 | y2 = shuffle(y1, random_state=1) # 分类结果随机排序 74 | y3 = shuffle(y1, random_state=2) # 分类结果随机排序 75 | Y = np.vstack((y1, y2, y3)).T # 多种分类结果组合成 76 | print('多输出多分类器真实输出分类:\n',Y) 77 | n_samples, n_features = X.shape # 10,100 78 | n_outputs = Y.shape[1] # 3个输出 79 | n_classes = 3 # 每种输出有3种分类 80 | forest = RandomForestClassifier(n_estimators=100, random_state=1) # 生成随机森林多分类器 81 | multi_target_forest = MultiOutputClassifier(forest) # 构建多输出多分类器 82 | y_pred = multi_target_forest.fit(X, Y).predict(X) 83 | print('多输出多分类器预测输出分类:\n',y_pred) 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /sk-密度聚类.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 数据结构 2 | import sklearn.cluster as skc # 密度聚类 3 | from sklearn import metrics # 评估模型 4 | import matplotlib.pyplot as plt # 可视化绘图 5 | 6 | data=[ 7 | [-2.68420713,1.469732895],[-2.71539062,-0.763005825],[-2.88981954,-0.618055245],[-2.7464372,-1.40005944],[-2.72859298,1.50266052], 8 | [-2.27989736,3.365022195],[-2.82089068,-0.369470295],[-2.62648199,0.766824075],[-2.88795857,-2.568591135],[-2.67384469,-0.48011265], 9 | 
[-2.50652679,2.933707545],[-2.61314272,0.096842835],[-2.78743398,-1.024830855],[-3.22520045,-2.264759595],[-2.64354322,5.33787705], 10 | [-2.38386932,6.05139453],[-2.6225262,3.681403515],[-2.64832273,1.436115015],[-2.19907796,3.956598405],[-2.58734619,2.34213138], 11 | [1.28479459,3.084476355],[0.93241075,1.436391405],[1.46406132,2.268854235],[0.18096721,-3.71521773],[1.08713449,0.339256755], 12 | [0.64043675,-1.87795566],[1.09522371,1.277510445],[-0.75146714,-4.504983795],[1.04329778,1.030306095],[-0.01019007,-3.242586915], 13 | [-0.5110862,-5.681213775],[0.51109806,-0.460278495],[0.26233576,-2.46551985],[0.98404455,-0.55962189],[-0.174864,-1.133170065], 14 | [0.92757294,2.107062945],[0.65959279,-1.583893305],[0.23454059,-1.493648235],[0.94236171,-2.43820017],[0.0432464,-2.616702525], 15 | [4.53172698,-0.05329008],[3.41407223,-2.58716277],[4.61648461,1.538708805],[3.97081495,-0.815065605],[4.34975798,-0.188471475], 16 | [5.39687992,2.462256225],[2.51938325,-5.361082605],[4.9320051,1.585696545],[4.31967279,-1.104966765],[4.91813423,3.511712835], 17 | [3.66193495,1.0891728],[3.80234045,-0.972695745],[4.16537886,0.96876126],[3.34459422,-3.493869435],[3.5852673,-2.426881725], 18 | [3.90474358,0.534685455],[3.94924878,0.18328617],[5.48876538,5.27195043],[5.79468686,1.139695065],[3.29832982,-3.42456273] 19 | ] 20 | X = np.array(data) 21 | 22 | db = skc.DBSCAN(eps=1.5, min_samples=3).fit(X) #DBSCAN聚类方法 还有参数,matric = ""距离计算方法 23 | labels = db.labels_ #和X同一个维度,labels对应索引序号的值 为她所在簇的序号。若簇编号为-1,表示为噪声 24 | 25 | print('每个样本的簇标号:') 26 | print(labels) 27 | 28 | raito = len(labels[labels[:] == -1]) / len(labels) #计算噪声点个数占总数的比例 29 | print('噪声比:', format(raito, '.2%')) 30 | 31 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # 获取分簇的数目 32 | 33 | print('分簇的数目: %d' % n_clusters_) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) #轮廓系数评价聚类的好坏 35 | 36 | for i in range(n_clusters_): 37 | print('簇 ', i, '的所有样本:') 38 | one_cluster = X[labels == i] 39 | print(one_cluster) 40 | plt.plot(one_cluster[:,0],one_cluster[:,1],'o') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /sk-小批量k均值聚类.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.cluster import MiniBatchKMeans, KMeans 6 | from sklearn.metrics.pairwise import pairwise_distances_argmin 7 | from sklearn.datasets.samples_generator import make_blobs 8 | 9 | # ############################################################################# 10 | # 产生样本数据 11 | np.random.seed(0) 12 | 13 | batch_size = 45 14 | centers = [[1, 1], [-1, -1], [1, -1]] # 三种聚类的中心 15 | n_clusters = len(centers) 16 | X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7) # 生成样本随机数 17 | 18 | # ############################################################################# 19 | # k均值聚类 20 | 21 | k_means = KMeans(init='k-means++', n_clusters=3, n_init=10) 22 | begin_time = time.time() # 记录训练开始时间 23 | k_means.fit(X) # 聚类模型 24 | t_batch = time.time() - begin_time # 记录训练用时 25 | print('k均值聚类时长:',t_batch) 26 | # ############################################################################# 27 | # 小批量k均值聚类 28 | # batch_size为每次更新使用的样本数 29 | mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size, 30 | n_init=10, max_no_improvement=10, verbose=0) 31 | begin_time = time.time() # 记录训练开始时间 32 | mbk.fit(X) # 聚类模型 33 | t_mini_batch = time.time() - begin_time # 记录训练用时 
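# MiniBatchKMeans 每次只用 batch_size 个随机样本更新聚类中心,因此训练时间通常远小于标准 KMeans,代价是 inertia(簇内平方和)略差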
34 | print('小批量k均值聚类时长:',t_mini_batch) 35 | # ############################################################################# 36 | # 结果可视化 37 | fig = plt.figure(figsize=(16, 6)) # 窗口大小 38 | fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9) # # 窗口四周留白 39 | colors = ['#4EACC5', '#FF9C34', '#4E9A06'] # 三种聚类的颜色 40 | 41 | # 在两种聚类算法中,样本的所属类标号和聚类中心 42 | k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0) # 三个聚类点排序 43 | mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0) # 三个聚类点排序 44 | k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers) # 计算X中每个样本与k_means_cluster_centers中的哪个样本最近。也就是获取所有对象的所属的类标签 45 | mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers) # 计算X中每个样本与k_means_cluster_centers中的哪个样本最近。也就是获取所有对象的所属的类标签 46 | order = pairwise_distances_argmin(k_means_cluster_centers,mbk_means_cluster_centers) # 计算k均值聚类点相对于小批量k均值聚类点的索引。因为要比较两次聚类的结果的区别,所以类标号要对应上 47 | 48 | 49 | # 绘制KMeans 50 | ax = fig.add_subplot(1, 3, 1) 51 | for k, col in zip(range(n_clusters), colors): 52 | my_members = k_means_labels == k # 获取属于当前类别的样本 53 | cluster_center = k_means_cluster_centers[k] # 获取当前聚类中心 54 | ax.plot(X[my_members, 0], X[my_members, 1], 'w',markerfacecolor=col, marker='.') # 绘制当前聚类的样本点 55 | ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,markeredgecolor='k', markersize=6) # 绘制聚类中心点 56 | ax.set_title('KMeans') 57 | ax.set_xticks(()) 58 | ax.set_yticks(()) 59 | plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (t_batch, k_means.inertia_)) 60 | 61 | # 绘制MiniBatchKMeans 62 | ax = fig.add_subplot(1, 3, 2) 63 | for k, col in zip(range(n_clusters), colors): 64 | my_members = mbk_means_labels == k # 获取属于当前类别的样本 65 | cluster_center = mbk_means_cluster_centers[k] # 获取当前聚类中心 66 | ax.plot(X[my_members, 0], X[my_members, 1], 'w',markerfacecolor=col, marker='.') # 绘制当前聚类的样本点 67 | ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,markeredgecolor='k', markersize=6) # 绘制聚类中心点 68 | ax.set_title('MiniBatchKMeans') 69 | ax.set_xticks(()) 70 | ax.set_yticks(()) 71 | plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %(t_mini_batch, mbk.inertia_)) 72 | 73 | # 初始化两次结果中 74 | different = (mbk_means_labels == 4) 75 | ax = fig.add_subplot(1, 3, 3) 76 | 77 | for k in range(n_clusters): 78 | different += ((k_means_labels == k) != (mbk_means_labels == order[k])) # 将两种聚类算法中聚类结果不一样的样本设置为true,聚类结果相同的样本设置为false 79 | 80 | identic = np.logical_not(different) # 向量取反,也就是聚类结果相同设置true,聚类结果不相同设置为false 81 | 82 | ax.plot(X[identic, 0], X[identic, 1], 'w',markerfacecolor='#bbbbbb', marker='.') # 绘制聚类结果相同的样本点 83 | ax.plot(X[different, 0], X[different, 1], 'w',markerfacecolor='m', marker='.') # 绘制聚类结果不同的样本点 84 | ax.set_title('Difference') 85 | ax.set_xticks(()) 86 | ax.set_yticks(()) 87 | 88 | plt.show() -------------------------------------------------------------------------------- /sk-层次聚类.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | import numpy as np 4 | from scipy import ndimage 5 | from matplotlib import pyplot as plt 6 | 7 | from sklearn import manifold, datasets 8 | 9 | digits = datasets.load_digits(n_class=10) # 生成10种类别的样本数据 10 | X = digits.data 11 | y = digits.target 12 | n_samples, n_features = X.shape 13 | 14 | np.random.seed(0) 15 | 16 | def nudge_images(X, y): 17 | # Having a larger dataset shows more clearly the behavior of the 18 | # methods, but we multiply the size of the dataset only by 2, as the 19 | # cost of the hierarchical 
clustering methods are strongly 20 | # super-linear in n_samples 21 | shift = lambda x: ndimage.shift(x.reshape((8, 8)),.3 * np.random.normal(size=2),mode='constant',).ravel() 22 | X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) 23 | Y = np.concatenate([y, y], axis=0) 24 | return X, Y 25 | 26 | 27 | X, y = nudge_images(X, y) 28 | print(y) 29 | 30 | #---------------------------------------------------------------------- 31 | # 可视化聚类 32 | def plot_clustering(X_red, X, labels, title=None): 33 | x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0) 34 | X_red = (X_red - x_min) / (x_max - x_min) 35 | 36 | plt.figure(figsize=(6, 4)) 37 | for i in range(X_red.shape[0]): 38 | plt.text(X_red[i, 0], X_red[i, 1], str(y[i]), 39 | color=plt.cm.spectral(labels[i] / 10.), 40 | fontdict={'weight': 'bold', 'size': 9}) 41 | 42 | plt.xticks([]) 43 | plt.yticks([]) 44 | if title is not None: 45 | plt.title(title, size=17) 46 | plt.axis('off') 47 | plt.tight_layout() 48 | 49 | #---------------------------------------------------------------------- 50 | # 2D embedding of the digits dataset 51 | print("Computing embedding") 52 | X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) 53 | print("Done.") 54 | 55 | from sklearn.cluster import AgglomerativeClustering # 引入层次聚类 56 | 57 | for linkage in ('ward', 'average', 'complete'): 58 | clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) # 通过聚类个数和聚类合并准则创建聚类模型 59 | begin_time = time() # 记录开始时间 60 | clustering.fit(X_red) 61 | print(linkage,"聚类合并方法进行聚类用时: %.2fs" % (time() - begin_time)) 62 | 63 | plot_clustering(X_red, X, clustering.labels_, "%s linkage" % linkage) # 可视化聚类结果 64 | 65 | 66 | plt.show() -------------------------------------------------------------------------------- /sk-层次聚类1.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import numpy as np 3 | from scipy import ndimage 4 | from matplotlib import pyplot as plt 5 | from sklearn import manifold, datasets 6 | from sklearn.datasets.samples_generator import make_blobs 7 | 8 | # 产生样本数据 9 | np.random.seed(0) 10 | 11 | centers = [[1, 1], [-1, -1], [1, -1]] # 三种聚类的中心 12 | n_clusters = len(centers) 13 | X, y = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7) # 生成样本随机数 14 | 15 | 16 | #---------------------------------------------------------------------- 17 | # 可视化聚类 18 | def plot_clustering(X_red, X, labels, title=None): 19 | x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0) 20 | X_red = (X_red - x_min) / (x_max - x_min) 21 | 22 | plt.figure(figsize=(6, 4)) 23 | for i in range(X_red.shape[0]): 24 | plt.text(X_red[i, 0], X_red[i, 1], str(y[i]), 25 | color=plt.cm.spectral(labels[i] / 10.), 26 | fontdict={'weight': 'bold', 'size': 9}) 27 | 28 | plt.xticks([]) 29 | plt.yticks([]) 30 | if title is not None: 31 | plt.title(title, size=17) 32 | plt.axis('off') 33 | plt.tight_layout() 34 | 35 | #---------------------------------------------------------------------- 36 | # 手写体数据集 37 | print("Computing embedding") 38 | X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) 39 | print("Done.") 40 | 41 | from sklearn.cluster import AgglomerativeClustering # 引入层次聚类 42 | 43 | for linkage in ('ward', 'average', 'complete'): 44 | clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) # 通过聚类个数和聚类合并准则创建聚类模型 45 | begin_time = time() # 记录开始时间 46 | clustering.fit(X_red) 47 | print(linkage,"聚类合并方法进行聚类用时: %.2fs" % (time() - begin_time)) 48 | 49 | plot_clustering(X_red, X, 
clustering.labels_, "%s linkage" % linkage) # 可视化聚类结果 50 | 51 | 52 | plt.show() -------------------------------------------------------------------------------- /sk-岭回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import Ridge,RidgeCV # Ridge岭回归,RidgeCV带有广义交叉验证的岭回归 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # ========岭回归======== 25 | model = Ridge(alpha=0.5) 26 | model = RidgeCV(alphas=[0.1, 1.0, 10.0]) # 通过RidgeCV可以设置多个参数值,算法使用交叉验证获取最佳参数值 27 | model.fit(X, y) # 线性回归建模 28 | print('系数矩阵:\n',model.coef_) 29 | print('线性回归模型:\n',model) 30 | # print('交叉验证最佳alpha值',model.alpha_) # 只有在使用RidgeCV算法时才有效 31 | # 使用模型预测 32 | predicted = model.predict(X) 33 | 34 | # 绘制散点图 参数:x横轴 y纵轴 35 | plt.scatter(X, y, marker='x') 36 | plt.plot(X, predicted,c='r') 37 | 38 | # 绘制x轴和y轴坐标 39 | plt.xlabel("x") 40 | plt.ylabel("y") 41 | 42 | # 显示图形 43 | plt.show() 44 | 45 | -------------------------------------------------------------------------------- /sk-度量.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import cross_val_score,cross_validate # 交叉验证中的模型度量 3 | import numpy as np # 快速操作结构数组的工具 4 | import matplotlib.pyplot as plt # 可视化绘制 5 | from sklearn.linear_model import LinearRegression # 线性回归 6 | from sklearn.metrics import make_scorer 7 | from sklearn import metrics 8 | 9 | # =============================分类度量=============================== 10 | print('=============================分类度量===============================') 11 | iris = datasets.load_iris() # 加载iris 数据集;用于分类问题 12 | X, y = iris.data, iris.target # 150个样本,4个属性,3种分类 13 | 14 | 15 | clf = svm.SVC(probability=True, random_state=0) 16 | 17 | # ===========================交叉验证获取度量======================= 18 | score = cross_val_score(clf, X, y, scoring='accuracy',cv=3) # 默认进行三次交叉验证 19 | print('交叉验证度量:',score) 20 | 21 | 22 | # ===========================自定义度量======================= 23 | 24 | # 自定义度量函数,输入为真实值和预测值 25 | def my_custom_loss_func(ground_truth, predictions): 26 | diff = np.abs(ground_truth - predictions).max() 27 | return np.log(1 + diff) 28 | 29 | loss = make_scorer(my_custom_loss_func, greater_is_better=False) # 自定义度量对象。结果越小越好。greater_is_better设置为false,系统认为是损失函数,则会将计分函数取反 30 | score = make_scorer(my_custom_loss_func, greater_is_better=True) # 自定义度量对象。结果越大越好 31 | clf = svm.SVC() 32 | clf.fit(X, y) 33 | 34 | print(loss(clf,X,y)) # 对模型进行度量,系统会自动调用模型对输入进行预测,并和真实输出值进行比较,计算损失函数 35 | print(score(clf,X,y)) # 对模型进行度量,系统会自动调用模型对输入进行预测,并和真实输出值进行比较,计算得分 36 | 37 | 38 | # ============================多种度量值========================= 39 | scoring = 
['precision_macro', 'recall_macro'] # precision_macro为精度,recall_macro为召回率 40 | scores = cross_validate(clf, X, y,scoring=scoring,cv=5, return_train_score=True) 41 | sorted(scores.keys()) 42 | print('多种度量的测试结果:',scores) # scores类型为字典。包含训练得分,拟合次数, score-times (得分次数) 43 | 44 | 45 | 46 | # ============================分类指标========================= 47 | clf = svm.SVC() # 构建模型 48 | clf.fit(X, y) # 训练模型 49 | predict_y = clf.predict(X) # 预测数据 50 | 51 | print('准确率指标:',metrics.accuracy_score(y, predict_y)) # 计算准确率 52 | print('Kappa指标:',metrics.cohen_kappa_score(y, predict_y)) # Kappa 检验 53 | print('混淆矩阵:\n',metrics.confusion_matrix(y, predict_y)) # 混淆矩阵 54 | 55 | target_names = ['class 0', 'class 1', 'class 2'] 56 | print('分类报告:\n',metrics.classification_report(y, predict_y, target_names=target_names)) # 分类报告 57 | print('汉明损失:',metrics.hamming_loss(y, predict_y)) #汉明损失 。在多分类中, 汉明损失对应于 y 和 predict_y 之间的汉明距离 58 | print('Jaccard 相似系数:',metrics.jaccard_similarity_score(y, predict_y)) # Jaccard 相似系数 59 | 60 | 61 | 62 | # 下面的系数在在二分类中不需要使用average参数,在多分类中需要使用average参数进行多个二分类的平均 63 | # average可取值:macro(宏)、weighted(加权)、micro(微)、samples(样本)、None(返回每个类的分数) 64 | 65 | print('精度计算:',metrics.precision_score(y, predict_y, average='macro')) 66 | print('召回率:',metrics.recall_score(y, predict_y,average='micro')) 67 | print('F1值:',metrics.f1_score(y, predict_y,average='weighted')) 68 | 69 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=0.5)) 70 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=1)) 71 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=2)) 72 | print('精确召回曲线:',metrics.precision_recall_fscore_support(y, predict_y,beta=0.5,average=None)) 73 | print('零一损失:',metrics.zero_one_loss(y, predict_y)) 74 | 75 | # ROC曲线(二分类) 76 | y1 = np.array([0, 0, 1, 1]) # 样本类标号 77 | y_scores = np.array([0.1, 0.4, 0.35, 0.8]) # 样本的得分(属于正样本的概率估计、或置信度值) 78 | fpr, tpr, thresholds = metrics.roc_curve(y1, y_scores, pos_label=1) 79 | print('假正率:',fpr) 80 | print('真正率:',tpr) 81 | print('门限:',thresholds) 82 | print('AUC值:',metrics.roc_auc_score(y1, y_scores)) 83 | 84 | 85 | labels = np.array([0, 1, 2]) # 三种分类的类标号 86 | pred_decision = clf.decision_function(X) # 计算样本属于每种分类的得分,所以pred_decision是一个3列的矩阵 87 | print('hinge_loss:',metrics.hinge_loss(y, pred_decision, labels = labels)) 88 | 89 | # 逻辑回归损失,对真实分类和预测分类概率进行对比的损失 90 | y_true = [0, 0, 1, 1] 91 | y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]] 92 | print('log_loss:',metrics.log_loss(y_true, y_pred)) 93 | 94 | 95 | # ===============================回归度量============================== 96 | print(' ===============================回归度量==============================') 97 | diabetes = datasets.load_diabetes() # 加载糖尿病数据集;用于回归问题 98 | X, y = diabetes.data, diabetes.target # 442个样本,10个属性,数值输出 99 | 100 | model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 101 | model.fit(X, y) # 线性回归建模 102 | predicted_y = model.predict(X) # 使用模型预测 103 | 104 | print('解释方差得分:',metrics.explained_variance_score(y, predicted_y)) # 解释方差得分 105 | print('平均绝对误差:',metrics.mean_absolute_error(y, predicted_y)) # 平均绝对误差 106 | print('均方误差:',metrics.mean_squared_error(y, predicted_y)) # 均方误差 107 | print('均方误差对数:',metrics.mean_squared_log_error(y, predicted_y)) # 均方误差对数 108 | print('中位绝对误差:',metrics.median_absolute_error(y, predicted_y)) # 中位绝对误差 109 | print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='variance_weighted')) #可决系数 110 | print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='raw_values')) #可决系数 111 | 
print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='uniform_average')) #可决系数 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /sk-数据集-特征选择-交叉验证.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | import matplotlib.pyplot as plt # 画图工具 6 | from sklearn import datasets # 机器学习库 7 | 8 | 9 | # 使用鸢尾花卉样本数据,对待测对象进行分类:分别包括为山鸢尾、变色鸢尾、维吉尼亚尾 10 | 11 | 12 | # =======加载样本数据集,清洗转化数据格式======= 13 | 14 | #数据集 0-山鸢尾、1-变色鸢尾、2-维吉尼亚尾 15 | scikit_iris = datasets.load_iris() #加载鸢尾花卉数据集。每行一个对象,每列一种属性。['data']为样本数据集,['target']为结果数据集,['target_names']为类别名称,.feature_names属性名称 16 | # 转换成pandas的DataFrame数据格式,方便观察数据 17 | iris = pd.DataFrame(data=np.c_[scikit_iris['data'], scikit_iris['target']],columns=np.append(scikit_iris.feature_names, ['y'])) #每行为一个对象,每列为一种属性,最后一个为结果值 18 | # print(iris.head(2)) #查看前两行,观察数据格式 19 | # print(iris.isnull().sum()) # isnull()返回布尔矩阵,sum()按列求和。检查数据是否有缺失 20 | # print(iris.groupby('y').count()) # 观察样本中各类别数量是否比较均衡 21 | 22 | 23 | # =======选择全部特征训练模型、预测新对象的分类======= 24 | 25 | X = iris[scikit_iris.feature_names] #获取样本集 26 | y = iris['y'] #获取结果集 27 | 28 | # 第一步,选择model 29 | from sklearn.neighbors import KNeighborsClassifier # 导入knn分类器 30 | 31 | knn = KNeighborsClassifier(n_neighbors=1) # 初始化一个knn模型,设置k=1 32 | # 第二步,fit X、y 33 | knn.fit(X, y) #根据样本集合结果集,对knn进行建模 34 | # 第三步,predict新数据 35 | result = knn.predict([[3, 2, 2, 5]]) #使用knn对新对象进行预测 36 | print(result) 37 | 38 | 39 | # =======使用交叉验证评估模型======= 40 | from sklearn.cross_validation import train_test_split 41 | from sklearn import metrics 42 | 43 | # 分割训练-测试集 44 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4) #划分训练集合测试集 45 | 46 | # K=15 47 | knn = KNeighborsClassifier(n_neighbors=15) #创建knn模型 48 | knn.fit(X_train, y_train) #训练knn模型 49 | 50 | y_pred_on_train = knn.predict(X_train) # 预测训练集,为了和预测测试集对比,查看拟合情况 51 | y_pred_on_test = knn.predict(X_test) # 预测测试集 52 | # print(metrics.accuracy_score(y_train, y_pred_on_train)) # 计算样本集的正确率 53 | print('正确率: :{}'.format(metrics.accuracy_score(y_test, y_pred_on_test))) # 计算测试集的正确率 54 | -------------------------------------------------------------------------------- /sk-文档贝叶斯.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | """ 4 | 这个指南的目的是在一个实际任务上探索scikit-learn的主要工具,在二十个不同的主题上分析一个文本集合。 5 | 在这一节中,可以看到: 6 | 1、加载文本文件和类别 7 | 2、适合机器学习的特征向量提取 8 | 3、训练线性模型进行分类 9 | 4、使用网格搜索策略,找到一个很好的配置的特征提取组件和分类器 10 | """ 11 | 12 | """ 13 | 1、Loading the 20 newsgroups dataset 加载20个新闻组数据集 14 | 为了获得更快的执行时间为第一个例子,我们将工作在部分数据集只有4个类别的数据集中: 15 | """ 16 | categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'] 17 | from sklearn.datasets import fetch_20newsgroups 18 | 19 | twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) 20 | print(twenty_train.target) 21 | print(twenty_train.target_names) # 训练集中类别的名字,这里只有四个类别 22 | print(len(twenty_train.data)) # 训练集中数据的长度 23 | print(len(twenty_train.filenames)) # 训练集文件名长度 24 | print('-----') 25 | print("\n".join(twenty_train.data[0].split("\n")[:3])) 26 | print('-----') 27 | print(twenty_train.target_names[twenty_train.target[0]]) 28 | print('-----') 29 | print(twenty_train.target[:10]) # 前十个的类别 30 | print('-----') 31 | for t in twenty_train.target[:10]: 32 | print(twenty_train.target_names[t]) # 类别的名字 33 | 
print('-----') 34 | """ 35 | 2、Extracting features from text files 从文本文件中提取特征 36 | 为了在文本文件中使用机器学习算法,首先需要将文本内容转换为数值特征向量 37 | """ 38 | 39 | """ 40 | Bags of words 词袋 41 | 最直接的方式就是词袋表示法 42 | 1、为训练集的任何文档中的每个单词分配一个固定的整数ID(例如通过从字典到整型索引建立字典) 43 | 2、对于每个文档,计算每个词出现的次数,并存储到X[i,j]中。 44 | 45 | 词袋表示:n_features 是语料中不同单词的数量,这个数量通常大于100000. 46 | 如果 n_samples == 10000,存储X的数组就需要10000*10000*4byte=4GB,这么大的存储在今天的计算机上是不可能实现的。 47 | 幸运的是,X中的大多数值都是0,基于这种原因,我们说词袋是典型的高维稀疏数据集,我们可以只存储那些非0的特征向量。 48 | scipy.sparse 矩阵就是这种数据结构,而scikit-learn内置了这种数据结构。 49 | """ 50 | 51 | """ 52 | Tokenizing text with scikit-learn 使用scikit-learn标记文本 53 | 文本处理、分词、过滤停用词都在这些高级组件中,能够建立特征字典并将文档转换成特征向量。 54 | """ 55 | from sklearn.feature_extraction.text import CountVectorizer # sklearn中的文本特征提取组件中,导入特征向量计数函数 56 | 57 | count_vect = CountVectorizer() # 特征向量计数函数 58 | X_train_counts = count_vect.fit_transform(twenty_train.data) # 对文本进行特征向量处理 59 | print(X_train_counts) # 特征向量和特征标签 60 | print(X_train_counts.shape) # 形状 61 | print('-----') 62 | 63 | """ 64 | CountVectorizer支持计算单词或序列的N-grams,一旦合适,这个向量化就可以建立特征词典。 65 | 在整个训练预料中,词汇中的词汇索引值与其频率有关。 66 | """ 67 | print(count_vect.vocabulary_.get(u'algorithm')) 68 | print('-----') 69 | 70 | """ 71 | From occurrences to frequencies 从事件到频率 72 | 计数是一个好的开始,但是也存在一个问题:较长的文本将会比较短的文本有很高的平均计数值,即使他们所表示的话题是一样的。 73 | 为了避免潜在的差异,它可以将文档中的每个单词出现的次数在文档的总字数的比例:这个新的特征叫做词频:tf 74 | tf-idf:词频-逆文档频率 75 | """ 76 | from sklearn.feature_extraction.text import TfidfTransformer # sklearn中的文本特征提取组件中,导入词频统计函数 77 | 78 | tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts) # 建立词频统计函数,注意这里idf=False 79 | print(tf_transformer) # 输出函数属性 TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False, use_idf=False) 80 | print('-----') 81 | X_train_tf = tf_transformer.transform(X_train_counts) # 使用函数对文本文档进行tf-idf频率计算 82 | print(X_train_tf) 83 | print('-----') 84 | print(X_train_tf.shape) 85 | print('-----') 86 | """ 87 | 在上面的例子中,使用fit()方法来构建基于数据的预测器,然后使用transform()方法来将计数矩阵用tf-idf表示。 88 | 这两个步骤可以通过跳过冗余处理,来更快的达到相同的最终结果。 89 | 这些可以通过使用fit_transform()方法来实现: 90 | """ 91 | tfidf_transformer = TfidfTransformer() # 这里使用的是tf-idf 92 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 93 | print(X_train_tfidf) 94 | print(X_train_tfidf.shape) 95 | print('-----') 96 | """ 97 | Training a classifier 训练一个分类器 98 | 既然已经有了特征,就可以训练分类器来试图预测一个帖子的类别,先使用贝叶斯分类器,贝叶斯分类器提供了一个良好的基线来完成这个任务。 99 | scikit-learn中包括这个分类器的许多变量,最适合进行单词计数的是多项式变量。 100 | """ 101 | from sklearn.naive_bayes import MultinomialNB # 使用sklearn中的贝叶斯分类器,并且加载贝叶斯分类器 102 | 103 | # 中的MultinomialNB多项式函数 104 | clf = MultinomialNB() # 加载多项式函数 105 | x_clf = clf.fit(X_train_tfidf, twenty_train.target) # 构造基于数据的分类器 106 | print(x_clf) # 分类器属性:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 107 | print('-----') 108 | """ 109 | 为了预测输入的新的文档,我们需要使用与前面相同的特征提取链进行提取特征。 110 | 不同的是,在转换中,使用transform来代替fit_transform,因为训练集已经构造了分类器 111 | """ 112 | docs_new = ['God is love', 'OpenGL on the GPU is fast'] # 文档 113 | X_new_counts = count_vect.transform(docs_new) # 构建文档计数 114 | X_new_tfidf = tfidf_transformer.transform(X_new_counts) # 构建文档tfidf 115 | predicted = clf.predict(X_new_tfidf) # 预测文档 116 | print(predicted) # 预测类别 [3 1],一个属于3类,一个属于1类 117 | for doc, category in zip(docs_new, predicted): 118 | print('%r => %s' % (doc, twenty_train.target_names[category])) # 将文档和类别名字对应起来 119 | print('-----') 120 | """ 121 | Building a pipeline 建立管道 122 | 为了使向量转换更加简单(vectorizer => transformer => classifier),scikit-learn提供了pipeline类来表示为一个复合分类器 123 | """ 124 | from sklearn.pipeline import Pipeline 125 | 126 | 
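# Added note (not in the original file): Pipeline takes a list of (name, step)
# tuples. Every step except the last must be a transformer (implementing
# fit_transform); the last step is the estimator. Calling fit/predict on the
# pipeline therefore runs CountVectorizer -> TfidfTransformer -> MultinomialNB
# in sequence, so raw documents can be passed in directly.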
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]) 127 | text_clf = text_clf.fit(twenty_train.data, twenty_train.target) 128 | print(text_clf) # 构造分类器,分类器的属性 129 | predicted = text_clf.predict(docs_new) # 预测新文档 130 | print(predicted) # 获取预测值 131 | print('-----') 132 | 133 | """ 134 | 分析总结: 135 | 1、加载数据集,主要是加载训练集,用于对数据进行训练 136 | 2、文本特征提取: 137 | 对文本进行计数统计 CountVectorizer 138 | 词频统计 TfidfTransformer (先计算tf,再计算tfidf) 139 | 3、训练分类器: 140 | 贝叶斯多项式训练器 MultinomialNB 141 | 4、预测文档: 142 | 通过构造的训练器进行构造分类器,来进行文档的预测 143 | 5、最简单的方式: 144 | 通过使用pipeline管道形式,来讲上述所有功能通过管道来一步实现,更加简单的就可以进行预测 145 | """ 146 | 147 | """ 148 | Evaluation of the performance on the test set 测试集性能评价 149 | 评估模型的预测精度同样容易: 150 | """ 151 | import numpy as np 152 | 153 | twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42) 154 | docs_test = twenty_test.data 155 | predicted = text_clf.predict(docs_test) 156 | print(np.mean(predicted == twenty_test.target)) # 预测的值和测试值的比例,mean就是比例函数 157 | print('-----') # 精度已经为0.834886817577 158 | 159 | """ 160 | 精度已经实现了83.4%,那么使用支持向量机(SVM)是否能够做的更好呢,支持向量机(SVM)被广泛认为是最好的文本分类算法之一。 161 | 尽管,SVM经常比贝叶斯要慢一些。 162 | 我们可以改变学习方式,使用管道来实现分类: 163 | """ 164 | from sklearn.linear_model import SGDClassifier 165 | 166 | text_clf = Pipeline( 167 | [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 168 | ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))]) 169 | # _ = text_clf.fit(twenty_train.data, twenty_train.target) # 和下面一句的意思一样,一个杠,表示本身 170 | text_clf = text_clf.fit(twenty_train.data, twenty_train.target) 171 | predicted = text_clf.predict(docs_test) 172 | print(np.mean(predicted == twenty_test.target)) # 精度 0.912782956059 173 | print('-----') 174 | """ 175 | sklearn进一步提供了结果的更详细的性能分析工具: 176 | """ 177 | from sklearn import metrics 178 | print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)) 179 | print(metrics.confusion_matrix(twenty_test.target, predicted)) -------------------------------------------------------------------------------- /sk-朴素贝叶斯.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | iris = datasets.load_iris() 3 | 4 | from sklearn.naive_bayes import GaussianNB 5 | clf = GaussianNB() 6 | clf = clf.fit(iris.data, iris.target) 7 | y_pred=clf.predict(iris.data) 8 | print("高斯朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 9 | 10 | from sklearn.naive_bayes import MultinomialNB 11 | clf = MultinomialNB() 12 | clf = clf.fit(iris.data, iris.target) 13 | y_pred=clf.predict(iris.data) 14 | print("多项分布朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 15 | 16 | from sklearn.naive_bayes import BernoulliNB 17 | clf = BernoulliNB() 18 | clf = clf.fit(iris.data, iris.target) 19 | y_pred=clf.predict(iris.data) 20 | print("伯努利朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /sk-样本数据集.py: -------------------------------------------------------------------------------- 1 | # import numpy as np # 快速操作结构数组的工具 2 | # import pandas as pd # 数据分析处理工具 3 | # import matplotlib.pyplot as plt # 画图工具 4 | # from sklearn import datasets # 机器学习数据集 5 | # from sklearn.datasets import make_blobs 6 | # from sklearn import datasets 7 | 8 | # 
load_boston([return_X_y]) 加载波士顿房价数据;用于回归问题 9 | # load_iris([return_X_y]) 加载iris 数据集;用于分类问题 10 | # load_diabetes([return_X_y]) 加载糖尿病数据集;用于回归问题 11 | # load_digits([n_class, return_X_y]) 加载手写字符集;用于分类问题 12 | # load_linnerud([return_X_y]) 加载linnerud 数据集;用于多元回归问题 13 | 14 | 15 | # # ===========房价数据=========== 16 | # from sklearn.datasets import load_boston 17 | # from sklearn import linear_model 18 | # boston = load_boston() 19 | # data=boston.data 20 | # target = boston.target 21 | # print(data.shape) 22 | # print(target.shape) 23 | # 24 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 25 | # 26 | # 27 | # # ===========花卉数据=========== 28 | # from sklearn.datasets import load_iris 29 | # from sklearn import svm 30 | # iris = load_iris() 31 | # data=iris.data 32 | # target = iris.target 33 | # print(data.shape) 34 | # print(target.shape) 35 | # 36 | # print('svm模型:\n',svm.SVC().fit(data,target)) 37 | 38 | # # ===========糖尿病数据集=========== 39 | # from sklearn.datasets import load_diabetes 40 | # from sklearn import linear_model 41 | # diabetes = load_diabetes() 42 | # data=diabetes.data 43 | # target = diabetes.target 44 | # print(data.shape) 45 | # print(target.shape) 46 | # 47 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 48 | 49 | 50 | 51 | # # ===========手写体数据=========== 52 | # from sklearn.datasets import load_digits 53 | # import matplotlib.pyplot as plt # 画图工具 54 | # digits = load_digits() 55 | # data=digits.data 56 | # print(data.shape) 57 | # plt.matshow(digits.images[3]) # 矩阵像素点的样式显示3 58 | # # plt.imshow(digits.images[3]) # 图片渐变的样式显示3 59 | # # plt.gray() # 图片显示为灰度模式 60 | # plt.show() 61 | 62 | 63 | # # # ===========多元回归=========== 64 | # from sklearn.datasets import load_linnerud 65 | # from sklearn import linear_model 66 | # linnerud = load_linnerud() 67 | # data=linnerud.data 68 | # target = linnerud.target 69 | # print(data.shape) 70 | # print(target.shape) 71 | # 72 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 73 | 74 | 75 | 76 | # # ===========图像样本数据集=========== 77 | # from sklearn.datasets import load_sample_image 78 | # import matplotlib.pyplot as plt # 画图工具 79 | # img=load_sample_image('flower.jpg') # 加载sk自带的花朵图案 80 | # plt.imshow(img) 81 | # plt.show() 82 | 83 | 84 | 85 | # # ===========生成分类样本数据集=========== 86 | # from sklearn import datasets 87 | # import matplotlib.pyplot as plt # 画图工具 88 | # data,target=datasets.make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,n_repeated=0, n_classes=2, n_clusters_per_class=1) 89 | # print(data.shape) 90 | # print(target.shape) 91 | # plt.scatter(data[:,0],data[:,1],c=target) 92 | # plt.show() 93 | 94 | 95 | # import matplotlib.pyplot as plt 96 | # from sklearn.datasets import make_classification 97 | # from sklearn.datasets import make_blobs 98 | # from sklearn.datasets import make_gaussian_quantiles 99 | # from sklearn.datasets import make_hastie_10_2 100 | # 101 | # plt.figure(figsize=(10, 10)) 102 | # plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95) 103 | # 104 | # plt.subplot(421) 105 | # plt.title("One informative feature, one cluster per class", fontsize='small') 106 | # X1, Y1 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) 107 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 108 | # 109 | # plt.subplot(422) 110 | # plt.title("Two informative features, one cluster per class", fontsize='small') 111 | # X1, Y1 = make_classification(n_samples=1000, 
n_features=2, n_redundant=0, n_informative=2,n_clusters_per_class=1) 112 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 113 | # 114 | # plt.subplot(423) 115 | # plt.title("Two informative features, two clusters per class", fontsize='small') 116 | # X2, Y2 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=2) 117 | # plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2) 118 | # 119 | # plt.subplot(424) 120 | # plt.title("Multi-class, two informative features, one cluster",fontsize='small') 121 | # X1, Y1 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=2,n_clusters_per_class=1, n_classes=3) 122 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 123 | # 124 | # plt.subplot(425) 125 | # plt.title("Three blobs", fontsize='small') 126 | # # 1000个样本,2个属性,3种类别,方差分别为1.0,3.0,2.0 127 | # X1, Y1 = make_blobs(n_samples=1000, n_features=2, centers=3,cluster_std=[1.0,3.0,2.0]) 128 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 129 | # 130 | # plt.subplot(426) 131 | # plt.title("Gaussian divided into four quantiles", fontsize='small') 132 | # X1, Y1 = make_gaussian_quantiles(n_samples=1000, n_features=2, n_classes=4) 133 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 134 | # 135 | # plt.subplot(427) 136 | # plt.title("hastie data ", fontsize='small') 137 | # X1, Y1 = make_hastie_10_2(n_samples=1000) 138 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 139 | # plt.show() 140 | 141 | 142 | 143 | # # ===========生成圆形或月亮型分类数据=========== 144 | 145 | from sklearn.datasets import make_circles 146 | from sklearn.datasets import make_moons 147 | import matplotlib.pyplot as plt 148 | 149 | fig = plt.figure(1) 150 | x1, y1 = make_circles(n_samples=1000, factor=0.5, noise=0.1) 151 | plt.subplot(121) 152 | plt.title('make_circles function example') 153 | plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1) 154 | 155 | plt.subplot(122) 156 | x1, y1 = make_moons(n_samples=1000, noise=0.1) 157 | plt.title('make_moons function example') 158 | plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1) 159 | plt.show() 160 | 161 | 162 | 163 | # # =======清洗转化数据格式====== 164 | # # 转换成pandas的DataFrame数据格式,方便观察数据 165 | # pddata = pd.DataFrame(data=np.c_[data, target],columns=np.append(['x1','x2'], ['y'])) #每行为一个对象,每列为一种属性,最后一个为结果值 166 | # # print(iris.head(2)) #查看前两行,观察数据格式 167 | # # print(iris.isnull().sum()) # isnull()返回布尔矩阵,sum()按列求和。检查数据是否有缺失 168 | # # print(iris.groupby('y').count()) # 观察样本中各类别数量是否比较均衡 169 | -------------------------------------------------------------------------------- /sk-案例流程.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | # ========加载数据(Data Loading)======== 4 | import numpy as np 5 | import urllib.request 6 | 7 | # 数据集的请求地址 8 | url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 9 | # 下载响应的csv文件 10 | raw_data = urllib.request.urlopen(url) 11 | # 加载csv文件成numpy中的矩阵 12 | dataset = np.loadtxt(raw_data, delimiter=",") 13 | # 分割成属性集和结果集 14 | X = dataset[:,0:7] # 特征矩阵 15 | y = dataset[:,8] #目标矩阵 16 | # print('特征矩阵:\n',X) 17 | # print('结果矩阵:\n',y) 18 | 19 | # ========数据归一化(Data Normalization)======== 20 | from sklearn import preprocessing 21 | # 归一化数据集 22 | normalized_X = preprocessing.normalize(X) 23 | # 标准话数据集 24 | standardized_X = preprocessing.scale(X) 25 | 26 | # ========特征选择(Feature Selection)======== 27 | # 树算法(Tree algorithms)计算特征的信息量 28 | from sklearn import metrics 29 | from sklearn.ensemble 
import ExtraTreesClassifier 30 | model = ExtraTreesClassifier() 31 | model.fit(X, y) 32 | # 显示每个特征的重要性 33 | print('属性重要性:\n',model.feature_importances_) 34 | 35 | # ========逻辑回归======== 36 | from sklearn import metrics 37 | from sklearn.linear_model import LogisticRegression 38 | model = LogisticRegression() 39 | model.fit(X, y) 40 | print('逻辑回归模型:\n',model) 41 | # 使用模型预测 42 | expected = y 43 | predicted = model.predict(X) 44 | # 评估模型 45 | print(metrics.classification_report(expected, predicted)) #评估模型 46 | print(metrics.confusion_matrix(expected, predicted)) # 使用混淆矩阵评估模型 47 | 48 | # ========朴素贝叶斯======== 49 | from sklearn import metrics 50 | from sklearn.naive_bayes import GaussianNB 51 | model = GaussianNB() 52 | model.fit(X, y) 53 | print('朴素贝叶斯模型:\n',model) 54 | # 使用模型预测 55 | expected = y 56 | predicted = model.predict(X) 57 | # 评估模型 58 | print(metrics.classification_report(expected, predicted)) 59 | print(metrics.confusion_matrix(expected, predicted)) 60 | 61 | # ========k近邻======== 62 | from sklearn import metrics 63 | from sklearn.neighbors import KNeighborsClassifier 64 | # 使用样本数据构建knn模型 65 | model = KNeighborsClassifier() 66 | model.fit(X, y) 67 | print('KNN模型:\n',model) 68 | # 使用模型预测 69 | expected = y 70 | predicted = model.predict(X) 71 | # 评估模型 72 | print(metrics.classification_report(expected, predicted)) 73 | print(metrics.confusion_matrix(expected, predicted)) 74 | 75 | 76 | # ========决策树======== 77 | from sklearn import metrics 78 | from sklearn.tree import DecisionTreeClassifier 79 | # 构建决策树模型 80 | model = DecisionTreeClassifier() 81 | model.fit(X, y) 82 | print('决策树模型:\n',model) 83 | # 使用模型预测 84 | expected = y 85 | predicted = model.predict(X) 86 | # 评估模型 87 | print(metrics.classification_report(expected, predicted)) 88 | print(metrics.confusion_matrix(expected, predicted)) 89 | 90 | 91 | # ========支持向量机======== 92 | from sklearn import metrics 93 | from sklearn.svm import SVC 94 | # 构建svm模型 95 | model = SVC() 96 | model.fit(X, y) 97 | print('SVM模型:\n',model) 98 | # 使用模型预测 99 | expected = y 100 | predicted = model.predict(X) 101 | # 评估模型 102 | print(metrics.classification_report(expected, predicted)) 103 | print(metrics.confusion_matrix(expected, predicted)) 104 | 105 | # ========优化算法参数======== 106 | import numpy as np 107 | from sklearn.linear_model import Ridge #岭回归模型 108 | from scipy.stats import uniform as sp_rand 109 | from sklearn.grid_search import GridSearchCV #网格搜索 110 | from sklearn.grid_search import RandomizedSearchCV # 随机搜索 111 | 112 | # 准备参数的可取值 113 | alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) 114 | # 构建岭回归模型,并尝试参数每一个可取值 115 | model = Ridge() 116 | rsearch = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas)) 117 | 118 | # # 只给定区间,参数随机取值 119 | # param_grid = {'alpha': sp_rand()} 120 | # # 构建岭回归模型,并尝试参数随机值 121 | # model = Ridge() 122 | # rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100) 123 | 124 | rsearch.fit(X, y) 125 | print(rsearch) 126 | # 评估搜索结果 127 | print(rsearch.best_score_) 128 | print(rsearch.best_estimator_.alpha) 129 | -------------------------------------------------------------------------------- /sk-特征提取.py: -------------------------------------------------------------------------------- 1 | # ===========从字典类型加载特征。形成系数矩阵结构========== 2 | from sklearn.feature_extraction import DictVectorizer 3 | measurements = [ 4 | {'name': 'student1', 'age': 12}, 5 | {'boy':True, 'parents': 'baba'}, 6 | {'size':16}, 7 | ] 8 | 9 | vec = DictVectorizer().fit(measurements) # 
定义一个加载器,后对一个字典对象提取特征。(值为数值型、布尔型的属性为单独的属性。值为字符串型的属性,形成"属性=值"的新属性) 10 | print('提取的特征:',vec.get_feature_names()) # 查看提取的新属性 11 | print('稀疏矩阵形式:\n',vec.transform(measurements)) 12 | print('二维矩阵形式:\n',vec.transform(measurements).toarray()) 13 | 14 | # =================文本特征提取============== 15 | from sklearn.feature_extraction.text import CountVectorizer 16 | corpus = ['This is the first document.', 17 | 'This is the second second document.', 18 | 'And the third one.', 19 | 'Is this the first document?',] 20 | vectorizer = CountVectorizer() 21 | X = vectorizer.fit_transform(corpus) # 默认提取至少 包含2个字母的单词 22 | print('所有特征:',vectorizer.get_feature_names()) 23 | print('样本特征向量:\n',X.toarray()) # X本身为稀疏矩阵存储形式,toarray转换为二维矩阵形式 24 | 25 | print('document属性的列索引:',vectorizer.vocabulary_.get('document')) # 从 特征 名称到矩阵的(列索引) 26 | 27 | # 提取一个单词或两个单词形成的词组。这样就能识别“is this”和“this is”这两种词汇了 28 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1) 29 | analyze = bigram_vectorizer.build_analyzer() 30 | print('所有分词:',analyze('Bi-grams are cool!')) 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /sk-特征选择.py: -------------------------------------------------------------------------------- 1 | # # ============去除方差小于阈值的特征============ 2 | # from sklearn.feature_selection import VarianceThreshold #移除低方差特征 3 | # from sklearn.datasets import load_iris # 引入花卉数据集 4 | # iris = load_iris() 5 | # X= iris.data 6 | # print(X.shape) 7 | # print(X.var(axis=0)) 8 | # 9 | # sel = VarianceThreshold(threshold=0.2) 10 | # X_transformed=sel.fit_transform(X) 11 | # print('去除低方差特征:\n',X_transformed.shape) 12 | 13 | 14 | 15 | 16 | 17 | # # ============排序选择优秀特征============ 18 | # from sklearn.datasets import load_iris 19 | # from sklearn.feature_selection import SelectKBest 20 | # from sklearn.feature_selection import chi2 # 引入卡方检验统计量 21 | # # 对于回归: f_regression , mutual_info_regression 22 | # # 对于分类: chi2 , f_classif , mutual_info_classif 23 | # iris = load_iris() 24 | # X, y = iris.data, iris.target 25 | # print('源样本维度:',X.shape) 26 | # 27 | # X_new = SelectKBest(chi2, k=2).fit_transform(X, y) 28 | # print('新样本维度:',X_new.shape) 29 | 30 | 31 | 32 | 33 | # # ============递归式特征消除============ 34 | # # 这里递归的移除最不重要的像素点来对每个像素点(特征)进行排序 35 | # from sklearn.svm import SVC 36 | # from sklearn.datasets import load_digits 37 | # from sklearn.feature_selection import RFE 38 | # import matplotlib.pyplot as plt 39 | # 40 | # digits = load_digits() # 加载手写体数据集 41 | # X = digits.images.reshape((len(digits.images), -1)) 42 | # y = digits.target 43 | # 44 | # # 创建ref对象和每个像素点的重要度排名 45 | # svc = SVC(kernel="linear", C=1) 46 | # rfe = RFE(estimator=svc, n_features_to_select=1, step=1) 47 | # rfe.fit(X, y) 48 | # ranking = rfe.ranking_.reshape(digits.images[0].shape) 49 | # 50 | # # 绘制像素点排名 51 | # plt.matshow(ranking, cmap=plt.cm.Blues) 52 | # plt.colorbar() 53 | # plt.title("Ranking of pixels with RFE") 54 | # plt.show() 55 | 56 | 57 | 58 | 59 | 60 | # # ============使用 SelectFromModel 选取特征============ 61 | # 62 | # import matplotlib.pyplot as plt 63 | # from sklearn.datasets import load_boston 64 | # from sklearn.feature_selection import SelectFromModel 65 | # from sklearn.linear_model import LassoCV 66 | # boston = load_boston() # 加载波士顿房价回归数据 67 | # X, y = boston['data'], boston['target'] # 取特征数据和输出数据 68 | # n_features =[13] # 记录循环中的特征个数,最开始数据集是有13个特征的 69 | # thresholds=[0] # 记录门限值,最开始是没有门限值的 70 | # 71 | # clf = LassoCV() # 使用Lasso回归 72 | # 73 | # # 
设置最小门限为0.25。coef_ 或者 featureimportances 属性值低于门限的都会被去除调 74 | # sfm = SelectFromModel(clf, threshold=0.1) 75 | # sfm.fit(X, y) # 训练模型。找出模型回归系数。 76 | # X_transform = sfm.transform(X) # 根据回归系数、门限,变换数据集。 77 | # n_feature =X_transform.shape[1] # 获取训练以后的特征数目 78 | # n_features.append(n_feature) 79 | # thresholds.append(0.1) 80 | # while n_feature > 2: # 如果特征数大于2,则从新转换,找最好的两个特征 81 | # sfm.threshold += 0.1 # 逐渐增加门限,进一步减少特征数目 82 | # X_transform = sfm.transform(X) # 变换数据集 83 | # n_feature = X_transform.shape[1] 84 | # n_features.append(n_feature) # 记录训练以后的特征数目 85 | # thresholds.append(sfm.threshold) # 记录门限值 86 | # 87 | # plt.title("Features with threshold %0.3f." % sfm.threshold) 88 | # plt.plot(thresholds, n_features, 'r') 89 | # plt.xlabel("thresholds") 90 | # plt.ylabel("Feature number") 91 | # plt.show() 92 | 93 | 94 | 95 | 96 | # # ============基于 L1 的特征选取============ 97 | # from sklearn.svm import LinearSVC 98 | # from sklearn.datasets import load_iris 99 | # from sklearn.feature_selection import SelectFromModel 100 | # iris = load_iris() 101 | # X, y = iris.data, iris.target 102 | # print('原数据集维度:',X.shape) 103 | # lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y) 104 | # model = SelectFromModel(lsvc, prefit=True) 105 | # X_new = model.transform(X) 106 | # print('新数据集维度:',X_new.shape) 107 | 108 | 109 | 110 | # ============基于 Tree(树)的特征选取============ 111 | from sklearn.ensemble import ExtraTreesClassifier 112 | from sklearn.datasets import load_iris 113 | from sklearn.feature_selection import SelectFromModel 114 | dataset = load_iris() 115 | X, y = dataset.data, dataset.target 116 | print('原数据集维度:',X.shape) 117 | clf = ExtraTreesClassifier() 118 | clf = clf.fit(X, y) 119 | print('属性重要程度:',clf.feature_importances_) 120 | 121 | model = SelectFromModel(clf, prefit=True) 122 | X_new = model.transform(X) 123 | print('新数据集维度:',X.shape) 124 | 125 | 126 | 127 | # ============特征选取作为 pipeline(管道)的一部分============ 128 | # from sklearn.pipeline import Pipeline 129 | # clf = Pipeline([ 130 | # ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))), 131 | # ('classification', RandomForestClassifier()) 132 | # ]) 133 | # clf.fit(X, y) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /sk-神经网络.py: -------------------------------------------------------------------------------- 1 | # # =============神经网络用于分类============= 2 | # import numpy as np 3 | # import matplotlib.pyplot as plt 4 | # from sklearn.neural_network import MLPClassifier 5 | # from sklearn.preprocessing import StandardScaler 6 | # data = [ 7 | # [-0.017612, 14.053064, 0],[-1.395634, 4.662541, 1],[-0.752157, 6.53862, 0],[-1.322371, 7.152853, 0],[0.423363, 11.054677, 0], 8 | # [0.406704, 7.067335, 1],[0.667394, 12.741452, 0],[-2.46015, 6.866805, 1],[0.569411, 9.548755, 0],[-0.026632, 10.427743, 0], 9 | # [0.850433, 6.920334, 1],[1.347183, 13.1755, 0],[1.176813, 3.16702, 1],[-1.781871, 9.097953, 0],[-0.566606, 5.749003, 1], 10 | # [0.931635, 1.589505, 1],[-0.024205, 6.151823, 1],[-0.036453, 2.690988, 1],[-0.196949, 0.444165, 1],[1.014459, 5.754399, 1], 11 | # [1.985298, 3.230619, 1],[-1.693453, -0.55754, 1],[-0.576525, 11.778922, 0],[-0.346811, -1.67873, 1],[-2.124484, 2.672471, 1], 12 | # [1.217916, 9.597015, 0],[-0.733928, 9.098687, 0],[1.416614, 9.619232, 0],[1.38861, 9.341997, 0],[0.317029, 14.739025, 0] 13 | # ] 14 | # 15 | # dataMat = np.array(data) 16 | # X=dataMat[:,0:2] 17 | # y = dataMat[:,2] 18 | # # 
神经网络对数据尺度敏感,所以最好在训练前标准化,或者归一化,或者缩放到[-1,1] 19 | # scaler = StandardScaler() # 标准化转换 20 | # scaler.fit(X) # 训练标准化对象 21 | # X = scaler.transform(X) # 转换数据集 22 | # # solver='lbfgs', MLP的求解方法:L-BFGS 在小数据上表现较好,Adam 较为鲁棒,SGD在参数调整较优时会有最佳表现(分类效果与迭代次数);SGD标识随机梯度下降。 23 | # # alpha:L2的参数:MLP是可以支持正则化的,默认为L2,具体参数需要调整 24 | # # hidden_layer_sizes=(5, 2) hidden层2层,第一层5个神经元,第二层2个神经元),2层隐藏层,也就有3层神经网络 25 | # 26 | # clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,2), random_state=1) # 神经网络输入为2,第一隐藏层神经元个数为5,第二隐藏层神经元个数为2,输出结果为2分类。 27 | # clf.fit(X, y) 28 | # print('每层网络层系数矩阵维度:\n',[coef.shape for coef in clf.coefs_]) 29 | # y_pred = clf.predict([[0.317029, 14.739025]]) 30 | # print('预测结果:',y_pred) 31 | # y_pred_pro =clf.predict_proba([[0.317029, 14.739025]]) 32 | # print('预测结果概率:\n',y_pred_pro) 33 | # 34 | # cengindex = 0 35 | # for wi in clf.coefs_: 36 | # cengindex += 1 # 表示底第几层神经网络。 37 | # print('第%d层网络层:' % cengindex) 38 | # print('权重矩阵维度:',wi.shape) 39 | # print('系数矩阵:\n',wi) 40 | # 41 | # 42 | # # 绘制分割区域 43 | # x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 # 寻找每个维度的范围 44 | # y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 # 寻找每个维度的范围 45 | # xx1, xx2 = np.meshgrid(np.arange(x_min, x_max, 0.01),np.arange(y_min, y_max,0.01)) # 在特征范围以0.01位步长预测每一个点的输出结果 46 | # Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()]) # 先形成待测样本的形式,在通过模型进行预测。 47 | # Z = Z.reshape(xx1.shape) # 将输出结果转换为和网格的矩阵形式,以便绘图 48 | # # 绘制区域网格图 49 | # plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 50 | # # 绘制样本点 51 | # plt.scatter(X[:,0],X[:,1],c=y) 52 | # plt.show() 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | # # =============神经网络用于回归============= 61 | 62 | import numpy as np 63 | from sklearn.neural_network import MLPRegressor # 多层线性回归 64 | from sklearn.preprocessing import StandardScaler 65 | data = [ 66 | [ -0.017612,14.053064,14.035452],[ -1.395634, 4.662541, 3.266907],[ -0.752157, 6.53862,5.786463],[ -1.322371, 7.152853, 5.830482], 67 | [0.423363,11.054677,11.47804 ],[0.406704, 7.067335, 7.474039],[0.667394,12.741452,13.408846],[ -2.46015,6.866805, 4.406655], 68 | [0.569411, 9.548755,10.118166],[ -0.026632,10.427743,10.401111],[0.850433, 6.920334, 7.770767],[1.347183,13.1755,14.522683], 69 | [1.176813, 3.16702,4.343833],[ -1.781871, 9.097953, 7.316082],[ -0.566606, 5.749003, 5.182397],[0.931635, 1.589505, 2.52114 ], 70 | [ -0.024205, 6.151823, 6.127618],[ -0.036453, 2.690988, 2.654535],[ -0.196949, 0.444165, 0.247216],[1.014459, 5.754399, 6.768858], 71 | [1.985298, 3.230619, 5.215917],[ -1.693453,-0.55754, -2.250993],[ -0.576525,11.778922,11.202397],[ -0.346811,-1.67873, -2.025541], 72 | [ -2.124484, 2.672471, 0.547987],[1.217916, 9.597015,10.814931],[ -0.733928, 9.098687, 8.364759],[1.416614, 9.619232,11.035846], 73 | [1.38861,9.341997,10.730607],[0.317029,14.739025,15.056054] 74 | ] 75 | 76 | dataMat = np.array(data) 77 | X=dataMat[:,0:2] 78 | y = dataMat[:,2] 79 | scaler = StandardScaler() # 标准化转换 80 | scaler.fit(X) # 训练标准化对象 81 | X = scaler.transform(X) # 转换数据集 82 | 83 | # solver='lbfgs', MLP的求解方法:L-BFGS 在小数据上表现较好,Adam 较为鲁棒,SGD在参数调整较优时会有最佳表现(分类效果与迭代次数);SGD标识随机梯度下降。 84 | # alpha:L2的参数:MLP是可以支持正则化的,默认为L2,具体参数需要调整 85 | # hidden_layer_sizes=(5, 2) hidden层2层,第一层5个神经元,第二层2个神经元),2层隐藏层,也就有3层神经网络 86 | clf = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1) 87 | clf.fit(X, y) 88 | print('预测结果:', clf.predict([[0.317029, 14.739025]])) # 预测某个输入对象 89 | 90 | cengindex = 0 91 | for wi in clf.coefs_: 92 | cengindex += 1 # 表示底第几层神经网络。 93 | print('第%d层网络层:' % cengindex) 94 | print('权重矩阵维度:',wi.shape) 
95 | print('系数矩阵:\n',wi) 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /sk-线性回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import LinearRegression # 线性回归 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # ========线性回归======== 25 | model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 26 | model.fit(X, y) # 线性回归建模 27 | print('系数矩阵:\n',model.coef_) 28 | print('线性回归模型:\n',model) 29 | # 使用模型预测 30 | predicted = model.predict(X) 31 | 32 | # 绘制散点图 参数:x横轴 y纵轴 33 | plt.scatter(X, y, marker='x') 34 | plt.plot(X, predicted,c='r') 35 | 36 | # 绘制x轴和y轴坐标 37 | plt.xlabel("x") 38 | plt.ylabel("y") 39 | 40 | # 显示图形 41 | plt.show() 42 | 43 | -------------------------------------------------------------------------------- /sk-逻辑分类有b偏量.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | 6 | 7 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(三种类别) 8 | data=[ 9 | [-2.68420713, 0.32660731, 0],[-2.71539062, -0.16955685, 0],[-2.88981954, -0.13734561, 0],[-2.7464372, -0.31112432, 0],[-2.72859298, 0.33392456, 0], 10 | [-2.27989736, 0.74778271, 0],[-2.82089068, -0.08210451, 0],[-2.62648199, 0.17040535, 0],[-2.88795857, -0.57079803, 0],[-2.67384469, -0.1066917, 0], 11 | [-2.50652679,0.65193501,0],[-2.61314272,0.02152063,0],[-2.78743398,-0.22774019,0],[-3.22520045,-0.50327991,0],[-2.64354322,1.1861949,0], 12 | [-2.38386932,1.34475434,0],[-2.6225262,0.81808967,0],[-2.64832273,0.31913667,0],[-2.19907796,0.87924409,0],[-2.58734619,0.52047364,0], 13 | [1.28479459, 0.68543919, 1],[0.93241075, 0.31919809, 1],[1.46406132, 0.50418983, 1],[0.18096721, -0.82560394, 1],[1.08713449, 0.07539039, 1], 14 | [0.64043675, -0.41732348, 1],[1.09522371, 0.28389121, 1],[-0.75146714, -1.00110751, 1],[1.04329778, 0.22895691, 1],[-0.01019007, -0.72057487, 1], 15 | [-0.5110862,-1.26249195,1],[0.51109806,-0.10228411,1],[0.26233576,-0.5478933,1],[0.98404455,-0.12436042,1],[-0.174864,-0.25181557,1], 16 | [0.92757294,0.46823621,1],[0.65959279,-0.35197629,1],[0.23454059,-0.33192183,1],[0.94236171,-0.54182226,1],[0.0432464,-0.58148945,1], 17 | [2.53172698, -0.01184224, 2],[1.41407223, -0.57492506, 2],[2.61648461, 0.34193529, 2],[1.97081495, -0.18112569, 2],[2.34975798, -0.04188255, 2], 18 | [3.39687992, 0.54716805, 2],[0.51938325, -1.19135169, 2],[2.9320051, 0.35237701, 2],[2.31967279, -0.24554817, 2],[2.91813423, 0.78038063, 2], 19 | 
[1.66193495,0.2420384,2],[1.80234045,-0.21615461,2],[2.16537886,0.21528028,2],[1.34459422,-0.77641543,2],[1.5852673,-0.53930705,2], 20 | [1.90474358,0.11881899,2],[1.94924878,0.04073026,2],[3.48876538,1.17154454,2],[3.79468686,0.25326557,2],[1.29832982,-0.76101394,2], 21 | ] 22 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(2种类别) 23 | data1=[ 24 | [-0.017612,14.053064,0], 25 | [-1.395634,4.662541,1], 26 | [-0.752157,6.538620,0], 27 | [-1.322371,7.152853,0], 28 | [0.423363,11.054677,0], 29 | [0.406704,7.067335,1], 30 | [0.667394,12.741452,0], 31 | [-2.460150,6.866805,1], 32 | [0.569411,9.548755,0], 33 | [-0.026632,10.427743,0], 34 | [0.850433,6.920334,1], 35 | [1.347183,13.175500,0], 36 | [1.176813,3.167020,1], 37 | [-1.781871,9.097953,0], 38 | [-0.566606,5.749003,1], 39 | [0.931635,1.589505,1], 40 | [-0.024205,6.151823,1], 41 | [-0.036453,2.690988,1], 42 | [-0.196949,0.444165,1], 43 | [1.014459,5.754399,1] 44 | ] 45 | #生成X和y矩阵 46 | dataMat = np.mat(data) 47 | y = dataMat[:,2] # 类别变量 48 | b = np.ones(y.shape) # 添加全1列向量代表b偏量 49 | X = np.column_stack((b, dataMat[:,0:2])) # 特征属性集和b偏量组成x 50 | X = np.mat(X) 51 | 52 | 53 | # 特征数据归一化 54 | # import sklearn.preprocessing as preprocessing #sk的去均值和归一化 55 | # scaler=preprocessing.StandardScaler() 56 | # X = scaler.fit_transform(X) # 对特征数据集去均值和归一化,可以加快机器性能 57 | # X = np.mat(X) 58 | # # print(X) 59 | # ========逻辑回归======== 60 | 61 | from sklearn import metrics 62 | from sklearn.linear_model import LogisticRegression 63 | model = LogisticRegression() 64 | model.fit(X, y) 65 | print('逻辑回归模型:\n',model) 66 | # 使用模型预测 67 | predicted = model.predict(X) #预测分类 68 | answer = model.predict_proba(X) #预测分类概率 69 | print(answer) 70 | 71 | 72 | 73 | import matplotlib.pyplot as plt 74 | 75 | # 绘制边界和散点 76 | # 先产生x1和x2取值范围上的网格点,并预测每个网格点上的值。 77 | h = 0.02 78 | x1_min, x1_max = X[:,1].min() - .5, X[:,1].max() + .5 79 | x2_min, x2_max = X[:,2].min() - .5, X[:,2].max() + .5 80 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h)) 81 | testMat = np.c_[xx1.ravel(), xx2.ravel()] #形成测试特征数据集 82 | testMat = np.column_stack((np.ones(((testMat.shape[0]),1)),testMat)) #添加第一列为全1代表b偏量 83 | testMat = np.mat(testMat) 84 | Z = model.predict(testMat) 85 | 86 | # 绘制区域网格图 87 | Z = Z.reshape(xx1.shape) 88 | plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 89 | 90 | 91 | # 绘制散点图 参数:x横轴 y纵轴,颜色代表分类。x图标为样本点,.表示预测点 92 | plt.scatter(X[:,1].flatten().A[0], X[:,2].flatten().A[0],c=y.flatten().A[0],marker='x') # 绘制样本数据集 93 | plt.scatter(X[:,1].flatten().A[0], X[:,2].flatten().A[0],c=predicted.tolist(),marker='.') # 绘制预测数据集 94 | 95 | # 绘制x轴和y轴坐标 96 | plt.xlabel("x") 97 | plt.ylabel("y") 98 | 99 | # 显示图形 100 | plt.show() -------------------------------------------------------------------------------- /sk-逻辑分类没有b偏量.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | 6 | 7 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(三种类别) 8 | data=[ 9 | [-2.68420713, 0.32660731, 0],[-2.71539062, -0.16955685, 0],[-2.88981954, -0.13734561, 0],[-2.7464372, -0.31112432, 0],[-2.72859298, 0.33392456, 0], 10 | [-2.27989736, 0.74778271, 0],[-2.82089068, -0.08210451, 0],[-2.62648199, 0.17040535, 0],[-2.88795857, -0.57079803, 0],[-2.67384469, -0.1066917, 0], 11 | [-2.50652679,0.65193501,0],[-2.61314272,0.02152063,0],[-2.78743398,-0.22774019,0],[-3.22520045,-0.50327991,0],[-2.64354322,1.1861949,0], 12 | 
[-2.38386932,1.34475434,0],[-2.6225262,0.81808967,0],[-2.64832273,0.31913667,0],[-2.19907796,0.87924409,0],[-2.58734619,0.52047364,0], 13 | [1.28479459, 0.68543919, 1],[0.93241075, 0.31919809, 1],[1.46406132, 0.50418983, 1],[0.18096721, -0.82560394, 1],[1.08713449, 0.07539039, 1], 14 | [0.64043675, -0.41732348, 1],[1.09522371, 0.28389121, 1],[-0.75146714, -1.00110751, 1],[1.04329778, 0.22895691, 1],[-0.01019007, -0.72057487, 1], 15 | [-0.5110862,-1.26249195,1],[0.51109806,-0.10228411,1],[0.26233576,-0.5478933,1],[0.98404455,-0.12436042,1],[-0.174864,-0.25181557,1], 16 | [0.92757294,0.46823621,1],[0.65959279,-0.35197629,1],[0.23454059,-0.33192183,1],[0.94236171,-0.54182226,1],[0.0432464,-0.58148945,1], 17 | [2.53172698, -0.01184224, 2],[1.41407223, -0.57492506, 2],[2.61648461, 0.34193529, 2],[1.97081495, -0.18112569, 2],[2.34975798, -0.04188255, 2], 18 | [3.39687992, 0.54716805, 2],[0.51938325, -1.19135169, 2],[2.9320051, 0.35237701, 2],[2.31967279, -0.24554817, 2],[2.91813423, 0.78038063, 2], 19 | [1.66193495,0.2420384,2],[1.80234045,-0.21615461,2],[2.16537886,0.21528028,2],[1.34459422,-0.77641543,2],[1.5852673,-0.53930705,2], 20 | [1.90474358,0.11881899,2],[1.94924878,0.04073026,2],[3.48876538,1.17154454,2],[3.79468686,0.25326557,2],[1.29832982,-0.76101394,2], 21 | ] 22 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(2种类别) 23 | data1=[ 24 | [-0.017612,14.053064,0], 25 | [-1.395634,4.662541,1], 26 | [-0.752157,6.538620,0], 27 | [-1.322371,7.152853,0], 28 | [0.423363,11.054677,0], 29 | [0.406704,7.067335,1], 30 | [0.667394,12.741452,0], 31 | [-2.460150,6.866805,1], 32 | [0.569411,9.548755,0], 33 | [-0.026632,10.427743,0], 34 | [0.850433,6.920334,1], 35 | [1.347183,13.175500,0], 36 | [1.176813,3.167020,1], 37 | [-1.781871,9.097953,0], 38 | [-0.566606,5.749003,1], 39 | [0.931635,1.589505,1], 40 | [-0.024205,6.151823,1], 41 | [-0.036453,2.690988,1], 42 | [-0.196949,0.444165,1], 43 | [1.014459,5.754399,1] 44 | ] 45 | 46 | #生成X和y矩阵 47 | dataMat = np.mat(data) 48 | X = dataMat[:,0:2] # 特征数据集 49 | y = dataMat[:,2] # 类别变量 50 | 51 | 52 | import sklearn.preprocessing as preprocessing #sk的去均值和归一化 53 | scaler=preprocessing.StandardScaler() 54 | X = scaler.fit_transform(X) # 对特征数据集去均值和归一化,可以加快机器性能 55 | X = np.mat(X) 56 | print(X) 57 | 58 | # ========逻辑回归======== 59 | 60 | from sklearn import metrics 61 | from sklearn.linear_model import LogisticRegression 62 | model = LogisticRegression() 63 | model.fit(X, y) 64 | print('逻辑回归模型:\n',model) 65 | # 使用模型预测 66 | predicted = model.predict(X) #预测分类 67 | answer = model.predict_proba(X) #预测分类概率 68 | print(answer) 69 | 70 | 71 | 72 | import matplotlib.pyplot as plt 73 | 74 | # 绘制边界和散点 75 | # 先产生x1和x2取值范围上的网格点,并预测每个网格点上的值。 76 | h = 0.02 77 | x1_min, x1_max = X[:,0].min() - .5, X[:,0].max() + .5 78 | x2_min, x2_max = X[:,1].min() - .5, X[:,1].max() + .5 79 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h)) 80 | Z = model.predict(np.c_[xx1.ravel(), xx2.ravel()]) 81 | 82 | # 绘制区域网格图 83 | Z = Z.reshape(xx1.shape) 84 | print(type(Z)) 85 | plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 86 | 87 | 88 | # 绘制散点图 参数:x横轴 y纵轴,颜色代表分类。x图标为样本点,.表示预测点 89 | plt.scatter(X[:,0].flatten().A[0], X[:,1].flatten().A[0],c=y.flatten().A[0],marker='x') # 绘制样本数据集 90 | plt.scatter(X[:,0].flatten().A[0], X[:,1].flatten().A[0],c=predicted.tolist(),marker='.') # 绘制预测数据集 91 | 92 | # 绘制x轴和y轴坐标 93 | plt.xlabel("x") 94 | plt.ylabel("y") 95 | 96 | # 显示图形 97 | plt.show() -------------------------------------------------------------------------------- /sk-随机梯度下降.py: 
-------------------------------------------------------------------------------- 1 | # #===============随机梯度下降法分类=============== 2 | # 3 | # from sklearn.linear_model import SGDClassifier 4 | # from sklearn.datasets.samples_generator import make_blobs 5 | # import numpy as np 6 | # import matplotlib.pyplot as plt 7 | # 8 | # X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60) 9 | # # loss:损失项。hinge:(软-间隔)线性支持向量机,modified_huber:平滑的 hinge 损失,log:logistic 回归,其他所有的回归损失 10 | # # penalty:惩罚项。l2:L2正则,l1:L1正则,elasticnet:(1 - l1_ratio) * L2 + l1_ratio * L1 11 | # clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200, fit_intercept=True) # 12 | # clf.fit(X, y) # 训练模型 13 | # 14 | # print('回归系数:',clf.coef_) 15 | # print('偏差:',clf.intercept_ ) 16 | # 17 | # # 绘制线,点 18 | # xx1 = np.linspace(-1, 5, 10) 19 | # xx2 = np.linspace(-1, 5, 10) 20 | # 21 | # X1, X2 = np.meshgrid(xx1, xx2) # X1、X2都是10*10的矩阵 22 | # Z = np.empty(X1.shape) 23 | # for (i, j), val in np.ndenumerate(X1): # 迭代第i行第j列的坐标xx1取值为val 24 | # x1 = val 25 | # x2 = X2[i, j] # 26 | # p = clf.decision_function([[x1, x2]]) # 计算输出值,也就是到超平面的符号距离。(支持向量到最佳超平面的符号距离为-1和+1) 27 | # Z[i, j] = p[0] 28 | # levels = [-1.0, 0.0, 1.0] # 将输出值分为-1,0,1几个区间 29 | # linestyles = ['dashed', 'solid', 'dashed'] 30 | # plt.contour(X1, X2, Z, levels, colors='k', linestyles=linestyles) # 绘制等高线图,高度为-1,0,1,也就是支持向量形成的线和最佳分割超平面 31 | # plt.scatter(X[:, 0], X[:, 1], c=y, s=20) # 绘制样本点 32 | # plt.show() 33 | 34 | 35 | 36 | # # ==============随机梯度下降法进行多分类============= 37 | # from sklearn.linear_model import SGDClassifier 38 | # from sklearn.metrics import accuracy_score 39 | # from sklearn import datasets 40 | # iris = datasets.load_iris() 41 | # X,y=iris.data,iris.target 42 | # clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y) 43 | # y_pred = clf.predict(X) 44 | # print('三分类花卉数据准确率:',accuracy_score(y,y_pred)) 45 | # print('包含的二分类器索引:',clf.classes_) # one versus all 方法来组合多个二分类器 46 | # print('回归系数:',clf.coef_) # 每一个二分类器的回归系数 47 | # print('偏差:',clf.intercept_ ) # 每一个二分类器的偏差 48 | 49 | 50 | 51 | # #===============随机梯度下降法回归=============== 52 | from sklearn import linear_model 53 | from sklearn.datasets import load_boston 54 | X,y = load_boston().data,load_boston().target 55 | clf = linear_model.SGDRegressor(loss='squared_loss',penalty='l2',alpha=0.01,max_iter=1000) 56 | clf.fit(X, y) 57 | print('得分:',clf.score(X,y)) 58 | print('回归系数:',clf.coef_) 59 | print('偏差:',clf.intercept_ ) 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /sk-集成学习.py: -------------------------------------------------------------------------------- 1 | # 产生样本数据集 2 | from sklearn.model_selection import cross_val_score 3 | from sklearn import datasets 4 | iris = datasets.load_iris() 5 | X, y = iris.data[:, 1:3], iris.target 6 | 7 | # # ==================Bagging 元估计器============= 8 | # from sklearn.ensemble import BaggingClassifier 9 | # from sklearn.neighbors import KNeighborsClassifier 10 | # bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) 11 | # scores = cross_val_score(bagging, X, y) 12 | # print('Bagging准确率:',scores.mean()) 13 | # 14 | # # ==================决策树、随机森林、极限森林对比=============== 15 | # 16 | # # 决策树 17 | # from sklearn.tree import DecisionTreeClassifier 18 | # clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0) 19 | # scores = cross_val_score(clf, X, y) 20 | # print('决策树准确率:',scores.mean()) 21 | # 22 | # # 随机森林 23 | # from sklearn.ensemble import 
--------------------------------------------------------------------------------
/sk-集成学习.py:
--------------------------------------------------------------------------------
# Build the sample data set
from sklearn.model_selection import cross_val_score
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# # ================== Bagging meta-estimator =============
# from sklearn.ensemble import BaggingClassifier
# from sklearn.neighbors import KNeighborsClassifier
# bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
# scores = cross_val_score(bagging, X, y)
# print('Bagging准确率:',scores.mean())
#
# # ================== Decision tree vs. random forest vs. extremely randomized trees ===============
#
# # Decision tree
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0)
# scores = cross_val_score(clf, X, y)
# print('决策树准确率:',scores.mean())
#
# # Random forest
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=10,max_features=2)
# scores = cross_val_score(clf, X, y)
# print('随机森林准确率:',scores.mean())
#
# # Extremely randomized trees
# from sklearn.ensemble import ExtraTreesClassifier
# clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
# scores = cross_val_score(clf, X, y)
# print('极限随机树准确率:',scores.mean())
#
# print('模型中各属性的重要程度:',clf.feature_importances_)
#
#
# # ==================== AdaBoost =========================
# from sklearn.ensemble import AdaBoostClassifier
# clf = AdaBoostClassifier(n_estimators=100)
# scores = cross_val_score(clf, X, y)
# print('AdaBoost准确率:',scores.mean())
#
#
# # ==================== Gradient Tree Boosting =========================
# # Classification
# from sklearn.ensemble import GradientBoostingClassifier
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
# scores = cross_val_score(clf, X, y)
# print('GDBT分类准确率:',scores.mean())
#
# # Regression
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.metrics import mean_squared_error
# from sklearn.datasets import load_boston
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.utils import shuffle
# from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
#
# boston = load_boston()  # load the Boston house-price regression data
# X1, y1 = shuffle(boston.data, boston.target, random_state=13)  # shuffle the data set
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.1, random_state=0)  # split into train and test sets; test_size is the fraction held out for testing
# clf = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,max_depth=4,min_samples_split=2,loss='ls')
# clf.fit(X_train, y_train)  # fit on the training split only, so the MSE below is measured on unseen data
# print('GDBT回归MSE:',mean_squared_error(y_test, clf.predict(X_test)))
# # print('每次训练的得分记录:',clf.train_score_)
# print('各特征的重要程度:',clf.feature_importances_)
# plt.plot(np.arange(500), clf.train_score_, 'b-')  # training score at each boosting iteration
# plt.show()



# ==================== Voting Classifier =========================

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')  # unweighted majority ('hard') voting
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[2,1,2])  # weighted soft voting (this assignment replaces the hard-voting ensemble above)

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf,X,y,cv=5, scoring='accuracy')
    print("准确率: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Combine with grid search
from sklearn.model_selection import GridSearchCV
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}  # search for the best C of the 'lr' member and n_estimators of the 'rf' member
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(iris.data, iris.target)
print('最优参数:',grid.best_params_)
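# --- Editor's sketch (not part of the original script): with voting='soft' the fitted ensemble
# --- averages the members' class probabilities, so it can also report them directly:
eclf.fit(X, y)                                               # fit the soft-voting ensemble on the 2-feature data
print('soft-voting probabilities for the first sample:', eclf.predict_proba(X[:1]))
print('predicted class:', eclf.predict(X[:1]))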
--------------------------------------------------------------------------------
/sk-预处理.py:
--------------------------------------------------------------------------------
from sklearn import preprocessing
import numpy as np

X_train = np.array([[ 1., -1., -2.],
                    [ 2., 0., 0.],
                    [ 3., 1., -1.]])
X_test = [[-1., 1., 0.]]


# =============== Standardization ====================
# Learn the scale of the data set (the per-column mean and standard deviation)
scaler = preprocessing.StandardScaler().fit(X_train)  # compute the mean and the scale
print('均值:',scaler.mean_ )
print('方差:',scaler.scale_ )

# Apply the learned scale to another data set (which can of course be the training set itself).
X_scaled = scaler.transform(X_train)
print('均值:',X_scaled.mean(axis=0))  # transform produces columns with zero mean
print('方差:',X_scaled.std(axis=0))   # transform produces columns with unit variance

# The two steps combined: scale the samples so that each column has zero mean and unit variance
X_scaled = preprocessing.scale(X_train,axis=0)  # standardization: remove the mean and scale to unit variance
print('均值:',X_scaled.mean(axis=0))
print('方差:',X_scaled.std(axis=0))

# ===================== Feature scaling ====================
# MinMaxScaler scales features to a given range (0-1 by default)
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)  # fit and transform in one step
print('每列最大值:',X_train_minmax.max(axis=0))  # the maximum of every column is 1
print('每列最小值:',X_train_minmax.min(axis=0))  # the minimum of every column is 0
# The scaler records the shift and the scale factor that it applies to the data
print('先平移:',min_max_scaler.min_)
print('再缩放:',min_max_scaler.scale_)

X_test_minmax = min_max_scaler.transform(X_test)  # apply the same shift and scaling to the test data



# MaxAbsScaler divides each feature by its maximum absolute value, scaling the training data into [-1, 1]. It can be applied to sparse matrices and preserves their sparsity.
X_train = np.array([[ 0., -1., 0.],
                    [ 0., 0., 0.2],
                    [ 2., 0., 0]])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
print('每列最大值:',X_train_maxabs.max(axis=0))  # the maximum of every column is 1
print('每列最小值:',X_train_maxabs.min(axis=0))  # the minimum of every column is no less than -1
print('缩放比例:',max_abs_scaler.scale_)
X_test_maxabs = max_abs_scaler.transform(X_test)  # apply the same scaling to the test data
print('缩放后的矩阵仍然具有稀疏性:\n',X_train_maxabs)



# =================== Scaling data with outliers ========================
X_train = np.array([[ 1., -11., -2.],
                    [ 2., 2., 0.],
                    [ 13., 1., -11.]])
robust_scale = preprocessing.RobustScaler()
X_train_robust = robust_scale.fit_transform(X_train)  # fit and transform in one step
print('缩放后的矩阵离群点被处理了:\n',X_train_robust)




# =================== Non-linear transformation ===================
X_train = np.array([[ 1., -1., -2.],
                    [ 2., 0., 0.],
                    [ 3., 1., -1.]])
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)  # maps the data onto a uniform distribution over [0, 1] (uniform is the default)
X_train_trans = quantile_transformer.fit_transform(X_train)

# Inspect the quantiles: the quantile structure is essentially preserved by the transformation
print('源分位数情况:',np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]))
print('变换后分位数情况:',np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]))

# The following maps the data onto a normal distribution: the median of the input becomes the mean of the output, centred at 0. The outputs are clipped so that the minimum and maximum of the input map to the 1e-7 and 1-1e-7 quantiles respectively
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal',random_state=0)


X = [[ 1., -1., 2.],
     [ 2., 0., 0.],
     [ 0., 1., -1.]]
# =================== Sample normalization ===================
X_normalized = preprocessing.normalize(X, norm='l1')  # use the l1 or l2 norm; each sample (row) is scaled to unit l1 or l2 norm
print('样本归一化:\n',X_normalized)
# You can also fit a transformer on some data first and then use it to normalize other data
normalizer = preprocessing.Normalizer().fit(X)  # obtain the transformer
normalizer.transform(X)  # transform any data: X or a test set
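# --- Editor's sketch (not part of the original script): in practice a scaler is usually
# --- combined with the estimator in a Pipeline, so the parameters learned on the training
# --- data are re-applied automatically at prediction time. A minimal example on the iris data:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
iris_X, iris_y = load_iris(return_X_y=True)
pipe = Pipeline([('scale', preprocessing.StandardScaler()), ('clf', LogisticRegression(max_iter=200))])
print('cross-validated accuracy with scaling:', cross_val_score(pipe, iris_X, iris_y, cv=5).mean())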
# =================== Feature binarization ===================
binarizer = preprocessing.Binarizer().fit(X)  # obtain the transformer; the threshold defaults to 0
print(binarizer)
# binarizer = preprocessing.Binarizer(threshold=1)  # custom transformer: values above the threshold become 1, values at or below it become 0
X_normalized = binarizer.transform(X)  # transform any data: X or a test set
print('特征二值化:\n',X_normalized)



# =================== Categorical feature encoding (one-hot encoding) ===================
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 1, 2],   # one categorical feature per column, each with its own set of values
         [1, 0, 0],
         [0, 2, 1],
         [1, 0, 1]])
print('取值范围整数个数:',enc.n_values_)  # number of distinct values of each feature: 2, 3, 3 (newer sklearn exposes enc.categories_ instead)
print('编码后:',enc.transform([[0, 1, 1]]).toarray())  # encode a sample: the first 2 positions are the one-hot code of the first feature, the next 3 the second feature, the last 3 the third feature
print('特征开始位置的索引:',enc.feature_indices_)  # cumulative sum of n_values_: the start position of each encoded feature: 0, 2, 5, 8


# =================== Missing-value imputation ===================
from sklearn.preprocessing import Imputer   # note: newer sklearn replaces this with sklearn.impute.SimpleImputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # values equal to missing_values are treated as missing and ignored when computing the mean
imp.fit([[1, 2],        # compute the mean of the non-missing values in each column
         [np.nan, 3],
         [7, 6]])

X = [[np.nan, 2], [6, np.nan], [7, 6]]
print('缺失值插值后:\n',imp.transform(X))  # fill the missing values in each column with that column's mean


# =================== Generating polynomial features ===================
from sklearn.preprocessing import PolynomialFeatures
X = np.array([[0, 1],
              [2, 3],
              [4, 5]])
poly = PolynomialFeatures(2,interaction_only=False)  # degree 2 at most; with interaction_only=True only the interaction terms are kept
print('生成多项式:\n',poly.fit_transform(X))  # maps (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1*X_2, X_2^2)

--------------------------------------------------------------------------------