├── DBSCAN_data.txt ├── README.md ├── SVM_data.txt ├── sk-Birch.py ├── sk-DBSCAN.py ├── sk-DBSCAN1.py ├── sk-PCA主成分分析.py ├── sk-knn.py ├── sk-k均值聚类.py ├── sk-lasso-多回归.py ├── sk-lasso.py ├── sk-svm.py ├── sk-svm识别手写体.py ├── sk-交叉验证.py ├── sk-优化.py ├── sk-决策树.py ├── sk-分类大全.py ├── sk-卷积神经网络-识别手写数字.py ├── sk-卷积神经网络.py ├── sk-多类多标签.py ├── sk-密度聚类.py ├── sk-小批量k均值聚类.py ├── sk-层次聚类.py ├── sk-层次聚类1.py ├── sk-岭回归.py ├── sk-度量.py ├── sk-数据集-特征选择-交叉验证.py ├── sk-文档贝叶斯.py ├── sk-朴素贝叶斯.py ├── sk-样本数据集.py ├── sk-案例流程.py ├── sk-特征提取.py ├── sk-特征选择.py ├── sk-神经网络.py ├── sk-线性回归.py ├── sk-逻辑分类有b偏量.py ├── sk-逻辑分类没有b偏量.py ├── sk-随机梯度下降.py ├── sk-集成学习.py └── sk-预处理.py /DBSCAN_data.txt: -------------------------------------------------------------------------------- 1 | -2.68420713,1.469732895], 2 | [-2.71539062,-0.763005825], 3 | [-2.88981954,-0.618055245], 4 | [-2.7464372,-1.40005944], 5 | [-2.72859298,1.50266052], 6 | [-2.27989736,3.365022195], 7 | [-2.82089068,-0.369470295], 8 | [-2.62648199,0.766824075], 9 | [-2.88795857,-2.568591135], 10 | [-2.67384469,-0.48011265], 11 | [-2.50652679,2.933707545], 12 | [-2.61314272,0.096842835], 13 | [-2.78743398,-1.024830855], 14 | [-3.22520045,-2.264759595], 15 | [-2.64354322,5.33787705], 16 | [-2.38386932,6.05139453], 17 | [-2.6225262,3.681403515], 18 | [-2.64832273,1.436115015], 19 | [-2.19907796,3.956598405], 20 | [-2.58734619,2.34213138], 21 | [1.28479459,3.084476355], 22 | [0.93241075,1.436391405], 23 | [1.46406132,2.268854235], 24 | [0.18096721,-3.71521773], 25 | [1.08713449,0.339256755], 26 | [0.64043675,-1.87795566], 27 | [1.09522371,1.277510445], 28 | [-0.75146714,-4.504983795], 29 | [1.04329778,1.030306095], 30 | [-0.01019007,-3.242586915], 31 | [-0.5110862,-5.681213775], 32 | [0.51109806,-0.460278495], 33 | [0.26233576,-2.46551985], 34 | [0.98404455,-0.55962189], 35 | [-0.174864,-1.133170065], 36 | [0.92757294,2.107062945], 37 | [0.65959279,-1.583893305], 38 | [0.23454059,-1.493648235], 39 | [0.94236171,-2.43820017], 40 | [0.0432464,-2.616702525], 41 | [4.53172698,-0.05329008], 42 | [3.41407223,-2.58716277], 43 | [4.61648461,1.538708805], 44 | [3.97081495,-0.815065605], 45 | [4.34975798,-0.188471475], 46 | [5.39687992,2.462256225], 47 | [2.51938325,-5.361082605], 48 | [4.9320051,1.585696545], 49 | [4.31967279,-1.104966765], 50 | [4.91813423,3.511712835], 51 | [3.66193495,1.0891728], 52 | [3.80234045,-0.972695745], 53 | [4.16537886,0.96876126], 54 | [3.34459422,-3.493869435], 55 | [3.5852673,-2.426881725], 56 | [3.90474358,0.534685455], 57 | [3.94924878,0.18328617], 58 | [5.48876538,5.27195043], 59 | [5.79468686,1.139695065], 60 | [3.29832982,-3.42456273 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sklearn 2 | 数据挖掘库sklearn的使用教程和demo 3 | -------------------------------------------------------------------------------- /SVM_data.txt: -------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 
-0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 -0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /sk-Birch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.cluster import Birch 4 | from sklearn import metrics 5 | 6 | from sklearn.datasets.samples_generator import make_blobs 7 | # X为样本特征,Y为样本簇类别, 共1000个样本,每个样本2个特征,共4个簇,簇中心在[-1,-1], [0,0],[1,1], [2,2] 8 | X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.3, 0.4, 0.3],random_state =9) 9 | plt.scatter(X[:, 0], X[:, 1], marker='o',c=y) 10 | plt.show() 11 | 12 | 13 | # 不设置聚类数目的Birch 14 | y_pred = Birch(n_clusters = None).fit_predict(X) 15 | plt.scatter(X[:, 0], X[:, 1], c=y_pred) 16 | plt.show() 17 | print("CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 18 | 19 | 20 | # 设置聚类数目的Birch 21 | y_pred = Birch(n_clusters = 4).fit_predict(X) 22 | plt.scatter(X[:, 0], X[:, 1], c=y_pred) 23 | plt.show() 24 | print("CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 25 | 26 | 27 | # 尝试多个threshold取值,和多个branching_factor取值 28 | param_grid = {'threshold':[0.5,0.3,0.1],'branching_factor':[50,20,10]} # 定义优化参数字典,字典中的key值必须是分类算法的函数的参数名 29 | for threshold in param_grid['threshold']: 30 | for branching_factor in param_grid['branching_factor']: 31 | clf = Birch(n_clusters = 4,threshold=threshold,branching_factor=branching_factor) 32 | clf.fit(X) 33 | y_pred = clf.predict(X) 
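# CH指标(Calinski-Harabasz)衡量簇间离散度与簇内离散度之比,数值越大说明簇越紧凑、分离越好;
# 较新版本的 scikit-learn 中该函数拼写为 metrics.calinski_harabasz_score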
34 | print(threshold,branching_factor,"CH指标:", metrics.calinski_harabaz_score(X, y_pred)) 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /sk-DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 数据结构 2 | import sklearn.cluster as skc # 密度聚类 3 | from sklearn import metrics # 评估模型 4 | import matplotlib.pyplot as plt # 可视化绘图 5 | 6 | data=[ 7 | [-2.68420713,1.469732895],[-2.71539062,-0.763005825],[-2.88981954,-0.618055245],[-2.7464372,-1.40005944],[-2.72859298,1.50266052], 8 | [-2.27989736,3.365022195],[-2.82089068,-0.369470295],[-2.62648199,0.766824075],[-2.88795857,-2.568591135],[-2.67384469,-0.48011265], 9 | [-2.50652679,2.933707545],[-2.61314272,0.096842835],[-2.78743398,-1.024830855],[-3.22520045,-2.264759595],[-2.64354322,5.33787705], 10 | [-2.38386932,6.05139453],[-2.6225262,3.681403515],[-2.64832273,1.436115015],[-2.19907796,3.956598405],[-2.58734619,2.34213138], 11 | [1.28479459,3.084476355],[0.93241075,1.436391405],[1.46406132,2.268854235],[0.18096721,-3.71521773],[1.08713449,0.339256755], 12 | [0.64043675,-1.87795566],[1.09522371,1.277510445],[-0.75146714,-4.504983795],[1.04329778,1.030306095],[-0.01019007,-3.242586915], 13 | [-0.5110862,-5.681213775],[0.51109806,-0.460278495],[0.26233576,-2.46551985],[0.98404455,-0.55962189],[-0.174864,-1.133170065], 14 | [0.92757294,2.107062945],[0.65959279,-1.583893305],[0.23454059,-1.493648235],[0.94236171,-2.43820017],[0.0432464,-2.616702525], 15 | [4.53172698,-0.05329008],[3.41407223,-2.58716277],[4.61648461,1.538708805],[3.97081495,-0.815065605],[4.34975798,-0.188471475], 16 | [5.39687992,2.462256225],[2.51938325,-5.361082605],[4.9320051,1.585696545],[4.31967279,-1.104966765],[4.91813423,3.511712835], 17 | [3.66193495,1.0891728],[3.80234045,-0.972695745],[4.16537886,0.96876126],[3.34459422,-3.493869435],[3.5852673,-2.426881725], 18 | [3.90474358,0.534685455],[3.94924878,0.18328617],[5.48876538,5.27195043],[5.79468686,1.139695065],[3.29832982,-3.42456273] 19 | ] 20 | X = np.array(data) 21 | 22 | db = skc.DBSCAN(eps=1.5, min_samples=3).fit(X) #DBSCAN聚类方法 还有参数,matric = ""距离计算方法 23 | labels = db.labels_ #和X同一个维度,labels对应索引序号的值 为她所在簇的序号。若簇编号为-1,表示为噪声 24 | 25 | print('每个样本的簇标号:') 26 | print(labels) 27 | 28 | raito = len(labels[labels[:] == -1]) / len(labels) #计算噪声点个数占总数的比例 29 | print('噪声比:', format(raito, '.2%')) 30 | 31 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # 获取分簇的数目 32 | 33 | print('分簇的数目: %d' % n_clusters_) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) #轮廓系数评价聚类的好坏 35 | 36 | for i in range(n_clusters_): 37 | print('簇 ', i, '的所有样本:') 38 | one_cluster = X[labels == i] 39 | print(one_cluster) 40 | plt.plot(one_cluster[:,0],one_cluster[:,1],'o') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /sk-DBSCAN1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import DBSCAN 3 | from sklearn import metrics 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.preprocessing import StandardScaler 6 | 7 | 8 | # ############################################################################# 9 | # 产生样本数据 10 | centers = [[1, 1], [-1, -1], [1, -1]] # 生成聚类中心点 11 | X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,random_state=0) # 生成样本数据集 12 | 13 | X = StandardScaler().fit_transform(X) # 
StandardScaler作用:去均值和方差归一化。且是针对每一个特征维度来做的,而不是针对样本。 14 | 15 | # ############################################################################# 16 | # 调用密度聚类 DBSCAN 17 | db = DBSCAN(eps=0.3, min_samples=10).fit(X) 18 | # print(db.labels_) # db.labels_为所有样本的聚类索引,没有聚类索引为-1 19 | # print(db.core_sample_indices_) # 所有核心样本的索引 20 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) # 设置一个样本个数长度的全false向量 21 | core_samples_mask[db.core_sample_indices_] = True #将核心样本部分设置为true 22 | labels = db.labels_ 23 | 24 | # 获取聚类个数。(聚类结果中-1表示没有聚类为离散点) 25 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 26 | 27 | # 模型评估 28 | print('估计的聚类个数为: %d' % n_clusters_) 29 | print("同质性: %0.3f" % metrics.homogeneity_score(labels_true, labels)) # 每个群集只包含单个类的成员。 30 | print("完整性: %0.3f" % metrics.completeness_score(labels_true, labels)) # 给定类的所有成员都分配给同一个群集。 31 | print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) # 同质性和完整性的调和平均 32 | print("调整兰德指数: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) 33 | print("调整互信息: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) 35 | 36 | # ############################################################################# 37 | # Plot result 38 | import matplotlib.pyplot as plt 39 | 40 | # 使用黑色标注离散点 41 | unique_labels = set(labels) 42 | colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] 43 | for k, col in zip(unique_labels, colors): 44 | if k == -1: # 聚类结果为-1的样本为离散点 45 | # 使用黑色绘制离散点 46 | col = [0, 0, 0, 1] 47 | 48 | class_member_mask = (labels == k) # 将所有属于该聚类的样本位置置为true 49 | 50 | xy = X[class_member_mask & core_samples_mask] # 将所有属于该类的核心样本取出,使用大图标绘制 51 | plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=14) 52 | 53 | xy = X[class_member_mask & ~core_samples_mask] # 将所有属于该类的非核心样本取出,使用小图标绘制 54 | plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=6) 55 | 56 | plt.title('Estimated number of clusters: %d' % n_clusters_) 57 | plt.show() 58 | 59 | -------------------------------------------------------------------------------- /sk-PCA主成分分析.py: -------------------------------------------------------------------------------- 1 | # # ======================PCA主成分分析================= 2 | # # 花卉样本数据集 3 | # from sklearn import datasets 4 | # import matplotlib.pyplot as plt 5 | # import numpy as np 6 | # iris = datasets.load_iris() 7 | # X = iris.data 8 | # y = iris.target 9 | # 10 | 11 | # from sklearn.decomposition import PCA,IncrementalPCA # 主成分分析(PCA) 12 | # pca = PCA(n_components=2) # PCA降维到2维 13 | # X_pca = pca.fit_transform(X) 14 | # 15 | # ipca = IncrementalPCA(n_components=2, batch_size=10) # 增量PCA降维到2维 16 | # X_ipca = ipca.fit_transform(X) 17 | # 18 | # pca = PCA(n_components=2, svd_solver='randomized', whiten=True) # PCA 使用随机SVD 19 | # X_pca1 = pca.fit_transform(X) 20 | # 21 | # 22 | # # 绘制PCA降维后的显示 23 | # plt.subplot(131) 24 | # plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=.8, lw=2) 25 | # plt.title('PCA') 26 | # 27 | # # 绘制增量PCA降维后的显示 28 | # plt.subplot(132) 29 | # plt.scatter(X_ipca[:, 0], X_ipca[:, 1], c=y, alpha=.8, lw=2) 30 | # plt.title('IPCA') 31 | # 32 | # # 绘制PCA使用随机SVD降维后的显示 33 | # plt.subplot(133) 34 | # plt.scatter(X_pca1[:, 0], X_pca1[:, 1], c=y, alpha=.8, lw=2) 35 | # plt.title('PCA with rand SVD') 36 | # plt.show() 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | # ======================核PCA主成分分析================= 45 | from sklearn.datasets import 
make_circles 46 | from sklearn.decomposition import PCA, KernelPCA 47 | import matplotlib.pyplot as plt 48 | import numpy as np 49 | X, y = make_circles(n_samples=400, factor=.3, noise=.05) # 生成样本数据集 50 | 51 | kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) # 核PCA降维 52 | X_kpca = kpca.fit_transform(X) 53 | X_back = kpca.inverse_transform(X_kpca) 54 | 55 | pca = PCA(n_components=2) # PCA降维到2维 56 | X_pca = pca.fit_transform(X) 57 | 58 | # # 绘制原始数据 59 | plt.subplot(221) 60 | plt.scatter(X[:, 0], X[:, 1], c=y, alpha=.8, lw=2) 61 | plt.title('Original space') 62 | 63 | # 绘制PCA降维后的显示 64 | plt.subplot(222) 65 | plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=.8, lw=2) 66 | plt.title('PCA') 67 | 68 | # 绘制KPCA降维后的显示 69 | plt.subplot(223) 70 | plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=y, alpha=.8, lw=2) 71 | plt.title('KPCA') 72 | 73 | # 绘制逆空间的显示 74 | plt.subplot(224) 75 | plt.scatter(X_back[:, 0], X_back[:, 1], c=y, alpha=.8, lw=2) 76 | plt.title('inverse space') 77 | 78 | plt.show() 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | # # ======================SparsePCA 稀疏主成分分析================= 87 | 88 | 89 | 90 | 91 | # # =================隐 Dirichlet 分配================= 92 | # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # 隐 Dirichlet 分配 93 | # lda = LinearDiscriminantAnalysis(n_components=2) # 降维到2维 94 | # X_r2 = lda.fit(X, y).transform(X) 95 | # 96 | # # Percentage of variance explained for each components 97 | # print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_)) 98 | # 99 | # 100 | # 101 | # plt.subplot(122) 102 | # for color, i, target_name in zip(colors, [0, 1, 2], target_names): 103 | # plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,label=target_name) 104 | # 105 | # plt.legend(loc='best', shadow=False, scatterpoints=1) 106 | # plt.title('LDA of IRIS dataset') 107 | # 108 | # plt.show() -------------------------------------------------------------------------------- /sk-knn.py: -------------------------------------------------------------------------------- 1 | # # ==============================无监督查找最近邻(常在聚类中使用,例如变色龙聚类算法)======================== 2 | # 3 | # from sklearn.neighbors import NearestNeighbors 4 | # import numpy as np # 快速操作结构数组的工具 5 | # 6 | # X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) # 样本数据 7 | # test_x = np.array([[-3.2, -2.1], [-2.6, -1.3], [1.4, 1.0], [3.1, 2.6], [2.5, 1.0], [-1.2, -1.3]]) # 设置测试数据 8 | # # test_x=X # 测试数据等于样本数据。这样就相当于在样本数据内部查找每个样本的邻节点了。 9 | # nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X) # 为X生成knn模型 10 | # distances, indices = nbrs.kneighbors(test_x) # 为test_x中的数据寻找模型中的邻节点 11 | # print('邻节点:',indices) 12 | # print('邻节点距离:',distances) 13 | # 14 | # # ==============================使用kd树和Ball树实现无监督查找最近邻======================== 15 | # 16 | # from sklearn.neighbors import KDTree,BallTree 17 | # import numpy as np # 快速操作结构数组的工具 18 | # 19 | # X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 20 | # # test_x = np.array([[-3.2, -2.1], [-2.6, -1.3], [1.4, 1.0], [3.1, 2.6], [2.5, 1.0], [-1.2, -1.3]]) # 设置测试数据 21 | # test_x=X # 测试数据等于样本数据。这样就相当于在样本数据内部查找每个样本的邻节点了。 22 | # kdt = KDTree(X, leaf_size=30, metric='euclidean') 23 | # distances,indices = kdt.query(test_x, k=2, return_distance=True) 24 | # print('邻节点:',indices) 25 | # print('邻节点距离:',distances) 26 | 27 | 28 | 29 | # # ==============================k最近邻分类======================== 30 | # import numpy as np # 快速操作结构数组的工具 31 | # from sklearn.neighbors 
import KNeighborsClassifier,KDTree # 导入knn分类器 32 | # 33 | # 34 | # # 数据集。4种属性,3种类别 35 | # data=[ 36 | # [ 5.1, 3.5, 1.4, 0.2, 0], 37 | # [ 4.9, 3.0, 1.4, 0.2, 0], 38 | # [ 4.7, 3.2, 1.3, 0.2, 0], 39 | # [ 4.6, 3.1, 1.5, 0.2, 0], 40 | # [ 5.0, 3.6, 1.4, 0.2, 0], 41 | # [ 7.0, 3.2, 4.7, 1.4, 1], 42 | # [ 6.4, 3.2, 4.5, 1.5, 1], 43 | # [ 6.9, 3.1, 4.9, 1.5, 1], 44 | # [ 5.5, 2.3, 4.0, 1.3, 1], 45 | # [ 6.5, 2.8, 4.6, 1.5, 1], 46 | # [ 6.3, 3.3, 6.0, 2.5, 2], 47 | # [ 5.8, 2.7, 5.1, 1.9, 2], 48 | # [ 7.1, 3.0, 5.9, 2.1, 2], 49 | # [ 6.3, 2.9, 5.6, 1.8, 2], 50 | # [ 6.5, 3.0, 5.8, 2.2, 2], 51 | # ] 52 | # 53 | # # 构造数据集 54 | # dataMat = np.array(data) 55 | # X = dataMat[:,0:4] 56 | # y = dataMat[:,4] 57 | # 58 | # knn = KNeighborsClassifier(n_neighbors=2,weights='distance') # 初始化一个knn模型,设置k=2。weights='distance'样本权重等于距离的倒数。'uniform'为统一权重 59 | # knn.fit(X, y) #根据样本集、结果集,对knn进行建模 60 | # result = knn.predict([[3, 2, 2, 5]]) #使用knn对新对象进行预测 61 | # print(result) 62 | 63 | 64 | # ==============================k最近邻回归======================== 65 | 66 | import numpy as np 67 | import matplotlib.pyplot as plt 68 | from sklearn import neighbors 69 | 70 | np.random.seed(0) 71 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 72 | T = np.linspace(0, 5, 500)[:, np.newaxis] 73 | y = np.sin(X).ravel() 74 | 75 | # 为输出值添加噪声 76 | y[::5] += 1 * (0.5 - np.random.rand(8)) 77 | 78 | # 训练回归模型 79 | n_neighbors = 5 80 | 81 | for i, weights in enumerate(['uniform', 'distance']): 82 | knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) 83 | y_ = knn.fit(X, y).predict(T) 84 | 85 | plt.subplot(2, 1, i + 1) 86 | plt.scatter(X, y, c='k', label='data') 87 | plt.plot(T, y_, c='g', label='prediction') 88 | plt.axis('tight') 89 | plt.legend() 90 | plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors,weights)) 91 | 92 | plt.show() -------------------------------------------------------------------------------- /sk-k均值聚类.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.cluster import Birch # 从sklearn.cluster机器学习聚类包中导入Birch聚类 3 | from sklearn.cluster import KMeans # 从sklearn.cluster机器学习聚类包中导入KMeans聚类 4 | 5 | """ 6 | 第1部分:数据集 7 | X表示二维矩阵数据,篮球运动员比赛数据 8 | 总共20行,每行两列数据 9 | 第一列表示球员每分钟助攻数:x1 10 | 第二列表示球员每分钟得分数:x2 11 | """ 12 | 13 | X = [[0.0888, 0.5885],[0.1399, 0.8291],[0.0747, 0.4974],[0.0983, 0.5772],[0.1276, 0.5703], 14 | [0.1671, 0.5835],[0.1906, 0.5276],[0.1061, 0.5523],[0.2446, 0.4007],[0.1670, 0.4770], 15 | [0.2485, 0.4313],[0.1227, 0.4909],[0.1240, 0.5668],[0.1461, 0.5113],[0.2315, 0.3788], 16 | [0.0494, 0.5590],[0.1107, 0.4799],[0.2521, 0.2735],[0.1007, 0.6318],[0.1067, 0.4326], 17 | [0.1456, 0.8280] 18 | ] 19 | 20 | """ 21 | 第2部分:KMeans聚类 22 | clf = KMeans(n_clusters=3) 表示类簇数为3,聚成3类数据,clf即赋值为KMeans 23 | y_pred = clf.fit_predict(X) 载入数据集X,并且将聚类的结果赋值给y_pred 24 | """ 25 | 26 | clf = KMeans(n_clusters=3) # 聚类算法,参数n_clusters=3,聚成3类 27 | y_pred = clf.fit_predict(X) # 直接对数据进行聚类,聚类不需要进行预测 28 | 29 | # 输出完整Kmeans函数,包括很多省略参数 30 | print('k均值模型:\n',clf) 31 | # 输出聚类预测结果,20行数据,每个y_pred对应X一行或一个球员,聚成3类,类标为0、1、2 32 | print('聚类结果:\n',y_pred) 33 | 34 | """ 35 | 第3部分:可视化绘图 36 | Python导入Matplotlib包,专门用于绘图 37 | import matplotlib.pyplot as plt 此处as相当于重命名,plt用于显示图像 38 | """ 39 | 40 | import numpy as np 41 | import matplotlib.pyplot as plt 42 | 43 | # 获取第一列和第二列数据 使用for循环获取 n[0]表示X第一列 44 | x1 = [n[0] for n in X] 45 | x2 = [n[1] for n in X] 46 | 47 | # 绘制散点图 参数:x横轴 y纵轴 c=y_pred聚类预测结果 marker类型 o表示圆点 *表示星型 x表示点 48 | plt.scatter(x1, x2, c=y_pred, marker='x') 49 | 50 | # 
绘制标题 51 | plt.title("Kmeans-Basketball Data") 52 | 53 | # 绘制x轴和y轴坐标 54 | plt.xlabel("x1") 55 | plt.ylabel("x2") 56 | 57 | # 显示图形 58 | plt.show() -------------------------------------------------------------------------------- /sk-lasso-多回归.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.linear_model import MultiTaskLasso, Lasso 4 | 5 | rng = np.random.RandomState(42) 6 | # ===========================产生模拟样本数据========================= 7 | # 用随机的频率、相位产生正弦波的二维系数 8 | n_samples, n_features, n_tasks = 100, 30, 40 # n_samples样本个数,n_features特征个数,n_tasks估计值的个数 9 | n_relevant_features = 5 # 自定义实际有用特征的个数 10 | coef = np.zeros((n_tasks, n_features)) # 系数矩阵的维度 11 | 12 | times = np.linspace(0, 2 * np.pi, n_tasks) 13 | for k in range(n_relevant_features): 14 | coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1)) # 自定义数据矩阵,用来生成模拟输出值 15 | 16 | X = rng.randn(n_samples, n_features) # 产生随机输入矩阵 17 | Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks) # 输入*系数+噪声=模拟输出 18 | # ==============================使用样本数据训练系数矩阵============================ 19 | coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T]) 20 | coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_ # 多任务训练 21 | 22 | # ############################################################################# 23 | # Plot support and time series 24 | fig = plt.figure(figsize=(8, 5)) 25 | plt.subplot(1, 2, 1) 26 | plt.spy(coef_lasso_) 27 | plt.xlabel('Feature') 28 | plt.ylabel('Time (or Task)') 29 | plt.text(10, 5, 'Lasso') 30 | plt.subplot(1, 2, 2) 31 | plt.spy(coef_multi_task_lasso_) 32 | plt.xlabel('Feature') 33 | plt.ylabel('Time (or Task)') 34 | plt.text(10, 5, 'MultiTaskLasso') 35 | fig.suptitle('Coefficient non-zero location') 36 | 37 | feature_to_plot = 0 38 | plt.figure() 39 | lw = 2 40 | plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw, 41 | label='Ground truth') 42 | plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw, 43 | label='Lasso') 44 | plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw, 45 | label='MultiTaskLasso') 46 | plt.legend(loc='upper center') 47 | plt.axis('tight') 48 | plt.ylim([-1.1, 1.1]) 49 | plt.show() -------------------------------------------------------------------------------- /sk-lasso.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV # Lasso回归,LassoCV交叉验证实现alpha的选取,LassoLarsCV基于最小角回归交叉验证实现alpha的选取 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # 
========Lasso回归======== 25 | model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 26 | # model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 27 | # model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha 28 | model.fit(X, y) # 线性回归建模 29 | print('系数矩阵:\n',model.coef_) 30 | print('线性回归模型:\n',model) 31 | # print('最佳的alpha:',model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 32 | # 使用模型预测 33 | predicted = model.predict(X) 34 | 35 | # 绘制散点图 参数:x横轴 y纵轴 36 | plt.scatter(X, y, marker='x') 37 | plt.plot(X, predicted,c='r') 38 | 39 | # 绘制x轴和y轴坐标 40 | plt.xlabel("x") 41 | plt.ylabel("y") 42 | 43 | # 显示图形 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /sk-svm.py: -------------------------------------------------------------------------------- 1 | # 2 | # import numpy as np # 快速操作结构数组的工具 3 | # from sklearn import svm # svm支持向量机 4 | # import matplotlib.pyplot as plt # 可视化绘图 5 | # 6 | # 7 | # data_set = np.loadtxt("SVM_data.txt") 8 | # train_data = data_set[:,0:2] # 训练特征空间 9 | # train_target = np.sign(data_set[:,2]) # 训练集类标号 10 | # 11 | # test_data = [[3,-1], [1,1], [7,-3], [9,0]] # 测试特征空间 12 | # test_target = [-1, -1, 1, 1] # 测试集类标号 13 | # 14 | # plt.scatter(data_set[:,0],data_set[:,1],c=data_set[:,2]) # 绘制可视化图 15 | # plt.show() 16 | # 17 | # # 创建模型 18 | # clf = svm.SVC() 19 | # clf.fit(X=train_data, y=train_target,sample_weight=None) # 训练模型。参数sample_weight为每个样本设置权重。应对非均衡问题 20 | # result = clf.predict(test_data) # 使用模型预测值 21 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 22 | # 23 | # # 获得支持向量 24 | # print('支持向量:',clf.support_vectors_) 25 | # # 获得支持向量的索引 26 | # print('支持向量索引:',clf.support_) 27 | # # 为每一个类别获得支持向量的数量 28 | # print('支持向量数量:',clf.n_support_) 29 | # 30 | # 31 | # # # ===============================Linear SVM====================== 32 | # from sklearn.svm import LinearSVC 33 | # 34 | # clf = LinearSVC() # 创建线性可分svm模型,参数均使用默认值 35 | # clf.fit(train_data, train_target) # 训练模型 36 | # result = clf.predict(test_data) # 使用模型预测值 37 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 38 | # 39 | # 40 | # # # ===============================Linear NuSVC====================== 41 | # from sklearn.svm import NuSVC 42 | # 43 | # clf = NuSVC() # 创建线性可分svm模型,参数均使用默认值 44 | # clf.fit(train_data, train_target) # 训练模型 45 | # result = clf.predict(test_data) # 使用模型预测值 46 | # print('预测结果:',result) # 输出预测值[-1. -1. 1. 1.] 
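# 评估预测效果的简单示意(假设上面被注释掉的 SVC 代码已取消注释,result 与 test_target 均已定义,
# 此处仅为演示写法,可用 sklearn.metrics 对预测结果进行评估):
# from sklearn import metrics
# print('准确率:', metrics.accuracy_score(test_target, result))      # 预测正确的样本比例
# print(metrics.classification_report(test_target, result))          # 每个类别的精度、召回率和f1值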
47 | 48 | 49 | # ===============================样本不平衡、多分类的情况======================== 50 | import numpy as np 51 | import matplotlib.pyplot as plt 52 | from sklearn import svm 53 | 54 | # 创建不均衡样本 55 | rng = np.random.RandomState(0) 56 | n_samples_1 = 1000 57 | n_samples_2 = 100 58 | n_samples_3 = 100 59 | X = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2],0.5 * rng.randn(n_samples_3, 2) + [-3, 3]] # 三类样本点中心为(1.5,1.5)、(2,2)、(-3,3) 60 | y = [0] * (n_samples_1) + [1] * (n_samples_2)+ [2] * (n_samples_3) # 前面的1000个为类别0,后面的100个为类别1,最后100个类别为2 61 | 62 | # 创建模型获取分离超平面 63 | clf = svm.SVC(decision_function_shape='ovo',kernel='linear', C=1.0) # decision_function_shape='ovo'为使用1对1多分类处理。会创建n(n-1)/2个二分类。ovr为一对所有的处理方式 64 | clf.fit(X, y) 65 | 66 | # 多分类的情况下,获取其中二分类器的个数。 67 | dec = clf.decision_function([[1.5,1.5]]) # decision_function()的功能:计算样本点到分割超平面的函数距离。 包含几个2分类器,就有几个函数距离。 68 | print('二分类器个数:',dec.shape[1]) 69 | 70 | # 绘制,第一个二分类器的分割超平面 71 | w = clf.coef_[0] 72 | a = -w[0] / w[1] # a可以理解为斜率 73 | xx = np.linspace(-5, 5) 74 | yy = a * xx - clf.intercept_[0] / w[1] # 二维坐标下的直线方程 75 | 76 | # 使用类权重,获取分割超平面 77 | wclf = svm.SVC(kernel='linear', class_weight={1: 10}) 78 | wclf.fit(X, y) 79 | 80 | 81 | # 绘制 分割分割超平面 82 | ww = wclf.coef_[0] 83 | wa = -ww[0] / ww[1] 84 | wyy = wa * xx - wclf.intercept_[0] / ww[1] # 带权重的直线 85 | 86 | # 绘制第一个二分类器的分割超平面和样本点 87 | h0 = plt.plot(xx, yy, 'k-', label='no weights') 88 | h1 = plt.plot(xx, wyy, 'k--', label='with weights') 89 | plt.scatter(X[:, 0], X[:, 1], c=y) 90 | plt.legend() 91 | 92 | plt.show() 93 | 94 | # ===============================SVM回归预测======================== 95 | X = [[0, 0], [2, 2]] 96 | y = [0.5, 2.5] 97 | clf = svm.SVR() 98 | clf.fit(X, y) 99 | clf.predict([[1, 1]]) -------------------------------------------------------------------------------- /sk-svm识别手写体.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import operator 4 | from os import listdir 5 | from sklearn.svm import SVC 6 | 7 | 8 | def img2vector(filename): 9 | """ 10 | 将32x32的二进制图像转换为1x1024向量。 11 | Parameters: 12 | filename - 文件名 13 | Returns: 14 | returnVect - 返回的二进制图像的1x1024向量 15 | """ 16 | #创建1x1024零向量 17 | returnVect = np.zeros((1, 1024)) 18 | #打开文件 19 | fr = open(filename) 20 | #按行读取 21 | for i in range(32): 22 | #读一行数据 23 | lineStr = fr.readline() 24 | #每一行的前32个元素依次添加到returnVect中 25 | for j in range(32): 26 | returnVect[0, 32*i+j] = int(lineStr[j]) 27 | #返回转换后的1x1024向量 28 | return returnVect 29 | 30 | # 手写数字分类测试 31 | def handwritingClassTest(): 32 | #测试集的Labels 33 | hwLabels = [] 34 | #返回trainingDigits目录下的文件名 35 | trainingFileList = listdir('trainingDigits') 36 | #返回文件夹下文件的个数 37 | m = len(trainingFileList) 38 | #初始化训练的Mat矩阵,测试集 39 | trainingMat = np.zeros((m, 1024)) 40 | #从文件名中解析出训练集的类别 41 | for i in range(m): 42 | #获得文件的名字 43 | fileNameStr = trainingFileList[i] 44 | #获得分类的数字 45 | classNumber = int(fileNameStr.split('_')[0]) 46 | #将获得的类别添加到hwLabels中 47 | hwLabels.append(classNumber) 48 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 49 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 50 | clf = SVC(C=200,kernel='rbf') 51 | clf.fit(trainingMat,hwLabels) 52 | #返回testDigits目录下的文件列表 53 | testFileList = listdir('testDigits') 54 | #错误检测计数 55 | errorCount = 0.0 56 | #测试数据的数量 57 | mTest = len(testFileList) 58 | #从文件中解析出测试集的类别并进行分类测试 59 | for i in range(mTest): 60 | #获得文件的名字 61 | fileNameStr = testFileList[i] 62 | #获得分类的数字 63 | classNumber = int(fileNameStr.split('_')[0]) 
64 | #获得测试集的1x1024向量,用于训练 65 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 66 | #获得预测结果 67 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 68 | classifierResult = clf.predict(vectorUnderTest) 69 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 70 | if(classifierResult != classNumber): 71 | errorCount += 1.0 72 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 73 | 74 | if __name__ == '__main__': 75 | handwritingClassTest() -------------------------------------------------------------------------------- /sk-交叉验证.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split,cross_val_score,cross_validate # 交叉验证所需的函数 2 | from sklearn.model_selection import KFold,LeaveOneOut,LeavePOut,ShuffleSplit # 交叉验证所需的子集划分方法 3 | from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit # 分层分割 4 | from sklearn.model_selection import GroupKFold,LeaveOneGroupOut,LeavePGroupsOut,GroupShuffleSplit # 分组分割 5 | from sklearn.model_selection import TimeSeriesSplit # 时间序列分割 6 | from sklearn import datasets # 自带数据集 7 | from sklearn import svm # SVM算法 8 | from sklearn import preprocessing # 预处理模块 9 | from sklearn.metrics import recall_score # 模型度量 10 | 11 | iris = datasets.load_iris() # 加载数据集 12 | print('样本集大小:',iris.data.shape,iris.target.shape) 13 | 14 | # ===================================数据集划分,训练模型========================== 15 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) # 交叉验证划分训练集和测试集.test_size为测试集所占的比例 16 | print('训练集大小:',X_train.shape,y_train.shape) # 训练集样本大小 17 | print('测试集大小:',X_test.shape,y_test.shape) # 测试集样本大小 18 | clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) # 使用训练集训练模型 19 | print('准确率:',clf.score(X_test, y_test)) # 计算测试集的度量值(准确率) 20 | 21 | 22 | # 如果涉及到归一化,则在测试集上也要使用训练集模型提取的归一化函数。 23 | scaler = preprocessing.StandardScaler().fit(X_train) # 通过训练集获得归一化函数模型。(也就是先减几,再除以几的函数)。在训练集和测试集上都使用这个归一化函数 24 | X_train_transformed = scaler.transform(X_train) 25 | clf = svm.SVC(kernel='linear', C=1).fit(X_train_transformed, y_train) # 使用训练集训练模型 26 | X_test_transformed = scaler.transform(X_test) 27 | print(clf.score(X_test_transformed, y_test)) # 计算测试集的度量值(准确度) 28 | 29 | # ===================================直接调用交叉验证评估模型========================== 30 | clf = svm.SVC(kernel='linear', C=1) 31 | scores = cross_val_score(clf, iris.data, iris.target, cv=5) #cv为迭代次数。 32 | print(scores) # 打印输出每次迭代的度量值(准确度) 33 | print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # 获取置信区间。(也就是均值和方差) 34 | 35 | # ===================================多种度量结果====================================== 36 | scoring = ['precision_macro', 'recall_macro'] # precision_macro为精度,recall_macro为召回率 37 | scores = cross_validate(clf, iris.data, iris.target, scoring=scoring,cv=5, return_train_score=True) 38 | sorted(scores.keys()) 39 | print('测试结果:',scores) # scores类型为字典。包含训练得分,拟合次数, score-times (得分次数) 40 | 41 | 42 | # ==================================K折交叉验证、留一交叉验证、留p交叉验证、随机排列交叉验证========================================== 43 | # k折划分子集 44 | kf = KFold(n_splits=2) 45 | for train, test in kf.split(iris.data): 46 | print("k折划分:%s %s" % (train.shape, test.shape)) 47 | break 48 | 49 | # 留一划分子集 50 | loo = LeaveOneOut() 51 | for train, test in loo.split(iris.data): 52 | print("留一划分:%s %s" % (train.shape, test.shape)) 53 | break 54 | 55 | # 留p划分子集 56 | lpo = LeavePOut(p=2) 57 | for train, test in 
loo.split(iris.data): 58 | print("留p划分:%s %s" % (train.shape, test.shape)) 59 | break 60 | 61 | # 随机排列划分子集 62 | ss = ShuffleSplit(n_splits=3, test_size=0.25,random_state=0) 63 | for train_index, test_index in ss.split(iris.data): 64 | print("随机排列划分:%s %s" % (train.shape, test.shape)) 65 | break 66 | 67 | # ==================================分层K折交叉验证、分层随机交叉验证========================================== 68 | skf = StratifiedKFold(n_splits=3) #各个类别的比例大致和完整数据集中相同 69 | for train, test in skf.split(iris.data, iris.target): 70 | print("分层K折划分:%s %s" % (train.shape, test.shape)) 71 | break 72 | 73 | skf = StratifiedShuffleSplit(n_splits=3) # 划分中每个类的比例和完整数据集中的相同 74 | for train, test in skf.split(iris.data, iris.target): 75 | print("分层随机划分:%s %s" % (train.shape, test.shape)) 76 | break 77 | 78 | 79 | # ==================================组 k-fold交叉验证、留一组交叉验证、留 P 组交叉验证、Group Shuffle Split========================================== 80 | X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10] 81 | y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] 82 | groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] 83 | 84 | # k折分组 85 | gkf = GroupKFold(n_splits=3) # 训练集和测试集属于不同的组 86 | for train, test in gkf.split(X, y, groups=groups): 87 | print("组 k-fold分割:%s %s" % (train, test)) 88 | 89 | # 留一分组 90 | logo = LeaveOneGroupOut() 91 | for train, test in logo.split(X, y, groups=groups): 92 | print("留一组分割:%s %s" % (train, test)) 93 | 94 | # 留p分组 95 | lpgo = LeavePGroupsOut(n_groups=2) 96 | for train, test in lpgo.split(X, y, groups=groups): 97 | print("留 P 组分割:%s %s" % (train, test)) 98 | 99 | # 随机分组 100 | gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0) 101 | for train, test in gss.split(X, y, groups=groups): 102 | print("随机分割:%s %s" % (train, test)) 103 | 104 | 105 | # ==================================时间序列分割========================================== 106 | tscv = TimeSeriesSplit(n_splits=3) 107 | TimeSeriesSplit(max_train_size=None, n_splits=3) 108 | for train, test in tscv.split(iris.data): 109 | print("时间序列分割:%s %s" % (train, test)) -------------------------------------------------------------------------------- /sk-优化.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris # 自带的样本数据集 2 | from sklearn.neighbors import KNeighborsClassifier # 要估计的是knn里面的参数,包括k的取值和样本权重分布方式 3 | import matplotlib.pyplot as plt # 可视化绘图 4 | from sklearn.model_selection import GridSearchCV,RandomizedSearchCV # 网格搜索和随机搜索 5 | 6 | iris = load_iris() 7 | 8 | X = iris.data # 150个样本,4个属性 9 | y = iris.target # 150个类标号 10 | 11 | k_range = range(1, 31) # 优化参数k的取值范围 12 | weight_options = ['uniform', 'distance'] # 代估参数权重的取值范围。uniform为统一取权值,distance表示距离倒数取权值 13 | # 下面是构建parameter grid,其结构是key为参数名称,value是待搜索的数值列表的一个字典结构 14 | param_grid = {'n_neighbors':k_range,'weights':weight_options} # 定义优化参数字典,字典中的key值必须是分类算法的函数的参数名 15 | print(param_grid) 16 | 17 | knn = KNeighborsClassifier(n_neighbors=5) # 定义分类算法。n_neighbors和weights的参数名称和param_grid字典中的key名对应 18 | 19 | 20 | # ================================网格搜索======================================= 21 | # 这里GridSearchCV的参数形式和cross_val_score的形式差不多,其中param_grid是parameter grid所对应的参数 22 | # GridSearchCV中的n_jobs设置为-1时,可以实现并行计算(如果你的电脑支持的情况下) 23 | grid = GridSearchCV(estimator = knn, param_grid = param_grid, cv=10, scoring='accuracy') #针对每个参数对进行了10次交叉验证。scoring='accuracy'使用准确率为结果的度量指标。可以添加多个度量指标 24 | grid.fit(X, y) 25 | 26 | print('网格搜索-度量记录:',grid.cv_results_) # 包含每次训练的相关信息 27 | print('网格搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 28 | 
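# 说明:best_score_ 是最佳参数组合在交叉验证中的平均得分(这里即平均准确率)。
# 在默认 refit=True 的情况下,grid 本身已用最佳参数在全部数据上重新拟合,
# 因此也可以直接用它进行预测(示意写法):
# print(grid.predict([[3, 5, 4, 2]]))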
print('网格搜索-最佳参数:',grid.best_params_) # 获取最佳度量值时的代定参数的值。是一个字典 29 | print('网格搜索-最佳模型:',grid.best_estimator_) # 获取最佳度量时的分类器模型 30 | 31 | 32 | # 使用获取的最佳参数生成模型,预测数据 33 | knn = KNeighborsClassifier(n_neighbors=grid.best_params_['n_neighbors'], weights=grid.best_params_['weights']) # 取出最佳参数进行建模 34 | knn.fit(X, y) # 训练模型 35 | print(knn.predict([[3, 5, 4, 2]])) # 预测新对象 36 | 37 | 38 | 39 | # =====================================随机搜索=========================================== 40 | rand = RandomizedSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_iter=10, random_state=5) # 41 | rand.fit(X, y) 42 | 43 | print('随机搜索-度量记录:',grid.cv_results_) # 包含每次训练的相关信息 44 | print('随机搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 45 | print('随机搜索-最佳参数:',grid.best_params_) # 获取最佳度量值时的代定参数的值。是一个字典 46 | print('随机搜索-最佳模型:',grid.best_estimator_) # 获取最佳度量时的分类器模型 47 | 48 | 49 | # 使用获取的最佳参数生成模型,预测数据 50 | knn = KNeighborsClassifier(n_neighbors=grid.best_params_['n_neighbors'], weights=grid.best_params_['weights']) # 取出最佳参数进行建模 51 | knn.fit(X, y) # 训练模型 52 | print(knn.predict([[3, 5, 4, 2]])) # 预测新对象 53 | 54 | 55 | # =====================================自定义度量=========================================== 56 | from sklearn import metrics 57 | # 自定义度量函数 58 | def scorerfun(estimator, X, y): 59 | y_pred = estimator.predict(X) 60 | return metrics.accuracy_score(y, y_pred) 61 | 62 | rand = RandomizedSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_iter=10, random_state=5) # 63 | rand.fit(X, y) 64 | 65 | print('随机搜索-最佳度量值:',grid.best_score_) # 获取最佳度量值 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /sk-决策树.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import pandas as pd # 数据分析处理工具 3 | import matplotlib.pyplot as plt # 画图工具 4 | from sklearn import datasets # 机器学习库 5 | from sklearn.preprocessing import LabelEncoder 6 | from sklearn import tree 7 | 8 | 9 | 10 | # 下面的数据分为为每个用户的来源网站、位置、是否阅读FAQ、浏览网页数目、选择的服务类型(目标结果) 11 | attr_arr=[['slashdot','USA','yes',18,'None'], 12 | ['google','France','yes',23,'Premium'], 13 | ['digg','USA','yes',24,'Basic'], 14 | ['kiwitobes','France','yes',23,'Basic'], 15 | ['google','UK','no',21,'Premium'], 16 | ['(direct)','New Zealand','no',12,'None'], 17 | ['(direct)','UK','no',21,'Basic'], 18 | ['google','USA','no',24,'Premium'], 19 | ['slashdot','France','yes',19,'None'], 20 | ['digg','USA','no',18,'None'], 21 | ['google','UK','no',18,'None'], 22 | ['kiwitobes','UK','no',19,'None'], 23 | ['digg','New Zealand','yes',12,'Basic'], 24 | ['slashdot','UK','no',21,'None'], 25 | ['google','UK','yes',18,'Basic'], 26 | ['kiwitobes','France','yes',19,'Basic']] 27 | 28 | #生成属性数据集和结果数据集 29 | dataMat = np.mat(attr_arr) 30 | arrMat = dataMat[:,0:4] 31 | resultMat = dataMat[:,4] 32 | 33 | # 构造数据集成pandas结构,为了能理解属性的名称 34 | attr_names = ['src', 'address', 'FAQ', 'num'] #特征属性的名称 35 | attr_pd = pd.DataFrame(data=arrMat,columns=attr_names) #每行为一个对象,每列为一种属性,最后一个为结果值 36 | print(attr_pd) 37 | 38 | #将数据集中的字符串转化为代表类别的数字。因为sklearn的决策树只识别数字 39 | le = LabelEncoder() 40 | for col in attr_pd.columns: #为每一列序列化,就是将每种字符串转化为对应的数字。用数字代表类别 41 | attr_pd[col] = le.fit_transform(attr_pd[col]) 42 | print(attr_pd) 43 | 44 | # 构建决策树 45 | clf = tree.DecisionTreeClassifier() 46 | clf.fit(attr_pd, resultMat) 47 | print(clf) 48 | 49 | # 使用决策树进行预测 50 | result = clf.predict([[1,1,1,0]]) # 输入也必须是数字的。分别代表了每个数字所代表的属性的字符串值 51 | print(result) 52 | 53 | # 将决策树保存成图片 54 | from sklearn.externals.six import StringIO 55 | import 
pydotplus 56 | 57 | dot_data = StringIO() 58 | target_name=['None','Basic','Premium'] 59 | tree.export_graphviz(clf, out_file=dot_data,feature_names=attr_names, 60 | class_names=target_name,filled=True,rounded=True, 61 | special_characters=True) 62 | graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 63 | graph.write_png('tree.png') 64 | 65 | -------------------------------------------------------------------------------- /sk-分类大全.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | # ========加载数据(Data Loading)======== 4 | import numpy as np 5 | import urllib.request 6 | 7 | # 数据集的请求地址 8 | url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 9 | # 下载响应的csv文件 10 | raw_data = urllib.request.urlopen(url) 11 | # 加载csv文件成numpy中的矩阵 12 | dataset = np.loadtxt(raw_data, delimiter=",") 13 | # 分割成属性集和结果集 14 | X = dataset[:,0:7] # 特征矩阵 15 | y = dataset[:,8] #目标矩阵 16 | # print('特征矩阵:\n',X) 17 | # print('结果矩阵:\n',y) 18 | 19 | # ========数据归一化(Data Normalization)======== 20 | from sklearn import preprocessing 21 | # 归一化数据集 22 | normalized_X = preprocessing.normalize(X) 23 | # 标准话数据集 24 | standardized_X = preprocessing.scale(X) 25 | 26 | # ========特征选择(Feature Selection)======== 27 | # 树算法(Tree algorithms)计算特征的信息量 28 | from sklearn import metrics 29 | from sklearn.ensemble import ExtraTreesClassifier 30 | model = ExtraTreesClassifier() 31 | model.fit(X, y) 32 | # 显示每个特征的重要性 33 | print('属性重要性:\n',model.feature_importances_) 34 | 35 | # ========逻辑回归======== 36 | from sklearn import metrics 37 | from sklearn.linear_model import LogisticRegression 38 | model = LogisticRegression() 39 | model.fit(X, y) 40 | print('逻辑回归模型:\n',model) 41 | # 使用模型预测 42 | expected = y 43 | predicted = model.predict(X) 44 | # 评估模型 45 | print(metrics.classification_report(expected, predicted)) #评估模型 46 | print(metrics.confusion_matrix(expected, predicted)) # 使用混淆矩阵评估模型 47 | 48 | # ========朴素贝叶斯======== 49 | from sklearn import metrics 50 | from sklearn.naive_bayes import GaussianNB 51 | model = GaussianNB() 52 | model.fit(X, y) 53 | print('朴素贝叶斯模型:\n',model) 54 | # 使用模型预测 55 | expected = y 56 | predicted = model.predict(X) 57 | # 评估模型 58 | print(metrics.classification_report(expected, predicted)) 59 | print(metrics.confusion_matrix(expected, predicted)) 60 | 61 | # ========k近邻======== 62 | from sklearn import metrics 63 | from sklearn.neighbors import KNeighborsClassifier 64 | # 使用样本数据构建knn模型 65 | model = KNeighborsClassifier() 66 | model.fit(X, y) 67 | print('KNN模型:\n',model) 68 | # 使用模型预测 69 | expected = y 70 | predicted = model.predict(X) 71 | # 评估模型 72 | print(metrics.classification_report(expected, predicted)) 73 | print(metrics.confusion_matrix(expected, predicted)) 74 | 75 | 76 | # ========决策树======== 77 | from sklearn import metrics 78 | from sklearn.tree import DecisionTreeClassifier 79 | # 构建决策树模型 80 | model = DecisionTreeClassifier() 81 | model.fit(X, y) 82 | print('决策树模型:\n',model) 83 | # 使用模型预测 84 | expected = y 85 | predicted = model.predict(X) 86 | # 评估模型 87 | print(metrics.classification_report(expected, predicted)) 88 | print(metrics.confusion_matrix(expected, predicted)) 89 | 90 | 91 | # ========支持向量机======== 92 | from sklearn import metrics 93 | from sklearn.svm import SVC 94 | # 构建svm模型 95 | model = SVC() 96 | model.fit(X, y) 97 | print('SVM模型:\n',model) 98 | # 使用模型预测 99 | expected = y 100 | predicted = model.predict(X) 101 | # 评估模型 102 | 
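# classification_report 给出每个类别的精度(precision)、召回率(recall)和 f1 值;confusion_matrix 则统计真实类别与预测类别的交叉计数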
print(metrics.classification_report(expected, predicted)) 103 | print(metrics.confusion_matrix(expected, predicted)) 104 | 105 | # ========优化算法参数======== 106 | import numpy as np 107 | from sklearn.linear_model import Ridge #岭回归模型 108 | from scipy.stats import uniform as sp_rand 109 | from sklearn.grid_search import GridSearchCV #网格搜索 110 | from sklearn.grid_search import RandomizedSearchCV # 随机搜索 111 | 112 | # 准备参数的可取值 113 | alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) 114 | # 构建岭回归模型,并尝试参数每一个可取值 115 | model = Ridge() 116 | rsearch = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas)) 117 | 118 | # # 只给定区间,参数随机取值 119 | # param_grid = {'alpha': sp_rand()} 120 | # # 构建岭回归模型,并尝试参数随机值 121 | # model = Ridge() 122 | # rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100) 123 | 124 | rsearch.fit(X, y) 125 | print(rsearch) 126 | # 评估搜索结果 127 | print(rsearch.best_score_) 128 | print(rsearch.best_estimator_.alpha) 129 | -------------------------------------------------------------------------------- /sk-卷积神经网络-识别手写数字.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from scipy.ndimage import convolve 5 | from sklearn import linear_model, datasets, metrics 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.neural_network import BernoulliRBM 8 | from sklearn.pipeline import Pipeline 9 | 10 | 11 | ############################################################################### 12 | # Setting up 13 | 14 | def nudge_dataset(X, Y): 15 | """ 16 | This produces a dataset 5 times bigger than the original one, 17 | by moving the 8x8 images in X around by 1px to left, right, down, up 18 | """ 19 | direction_vectors = [ 20 | [[0, 1, 0], 21 | [0, 0, 0], 22 | [0, 0, 0]], 23 | 24 | [[0, 0, 0], 25 | [1, 0, 0], 26 | [0, 0, 0]], 27 | 28 | [[0, 0, 0], 29 | [0, 0, 1], 30 | [0, 0, 0]], 31 | 32 | [[0, 0, 0], 33 | [0, 0, 0], 34 | [0, 1, 0]]] 35 | 36 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',weights=w).ravel() 37 | X = np.concatenate([X] +[np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]) 38 | Y = np.concatenate([Y for _ in range(5)], axis=0) 39 | return X, Y 40 | 41 | 42 | # 记载数据集 43 | digits = datasets.load_digits() 44 | X = np.asarray(digits.data, 'float32') 45 | X, Y = nudge_dataset(X, digits.target) 46 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 归一化到0-1 47 | 48 | 49 | # 交叉验证 50 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=0) 51 | 52 | # 逻辑回归模型 53 | logistic = linear_model.LogisticRegression() 54 | rbm = BernoulliRBM(random_state=0, verbose=True) 55 | 56 | classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) 57 | 58 | ############################################################################### 59 | # Training 60 | 61 | # Hyper-parameters. These were set by cross-validation, 62 | # using a GridSearchCV. Here we are not performing cross-validation to 63 | # save time. 
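# A rough, commented-out sketch of what that grid search could look like (the candidate values
# below are illustrative only; GridSearchCV lives in sklearn.model_selection, or sklearn.grid_search
# in very old releases). Pipeline steps are addressed with the step-name + double-underscore syntax:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'rbm__learning_rate': [0.01, 0.06, 0.1],
#               'rbm__n_components': [50, 100, 200],
#               'logistic__C': [1.0, 100.0, 6000.0]}
# search = GridSearchCV(classifier, param_grid, cv=3)
# search.fit(X_train, Y_train)
# print(search.best_params_)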
64 | rbm.learning_rate = 0.06 65 | rbm.n_iter = 20 66 | # More components tend to give better prediction performance, but larger 67 | # fitting time 68 | rbm.n_components = 100 69 | logistic.C = 6000.0 70 | 71 | # Training RBM-Logistic Pipeline 72 | classifier.fit(X_train, Y_train) 73 | 74 | # Training Logistic regression 75 | logistic_classifier = linear_model.LogisticRegression(C=100.0) 76 | logistic_classifier.fit(X_train, Y_train) 77 | 78 | ############################################################################### 79 | # Evaluation 80 | 81 | print() 82 | print("Logistic regression using RBM features:\n%s\n" % ( 83 | metrics.classification_report( 84 | Y_test, 85 | classifier.predict(X_test)))) 86 | 87 | print("Logistic regression using raw pixel features:\n%s\n" % ( 88 | metrics.classification_report(Y_test,logistic_classifier.predict(X_test)))) 89 | 90 | ############################################################################### 91 | # Plotting 92 | 93 | plt.figure(figsize=(4.2, 4)) 94 | for i, comp in enumerate(rbm.components_): 95 | plt.subplot(10, 10, i + 1) 96 | plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,interpolation='nearest') 97 | plt.xticks(()) 98 | plt.yticks(()) 99 | plt.suptitle('100 components extracted by RBM', fontsize=16) 100 | plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) 101 | plt.show() -------------------------------------------------------------------------------- /sk-卷积神经网络.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | from urllib import request 5 | import pickle 6 | import os 7 | import gzip 8 | import numpy as np 9 | import theano 10 | import lasagne 11 | from lasagne import layers 12 | from lasagne.updates import nesterov_momentum 13 | from nolearn.lasagne import NeuralNet 14 | from nolearn.lasagne import visualize 15 | from sklearn.metrics import classification_report 16 | from sklearn.metrics import confusion_matrix 17 | 18 | def load_dataset(): 19 | url = 'http://deeplearning.net/data/mnist/mnist.pkl.gz' 20 | filename = 'mnist.pkl.gz' 21 | if not os.path.exists(filename): 22 | print("Downloading MNIST dataset...") 23 | request.urlretrieve(url, filename) 24 | with gzip.open(filename, 'rb') as f: 25 | data = pickle.load(f) 26 | X_train, y_train = data[0] 27 | X_val, y_val = data[1] 28 | X_test, y_test = data[2] 29 | X_train = X_train.reshape((-1, 1, 28, 28)) 30 | X_val = X_val.reshape((-1, 1, 28, 28)) 31 | X_test = X_test.reshape((-1, 1, 28, 28)) 32 | y_train = y_train.astype(np.uint8) 33 | y_val = y_val.astype(np.uint8) 34 | y_test = y_test.astype(np.uint8) 35 | return X_train, y_train, X_val, y_val, X_test, y_test 36 | 37 | # 加载MNIST数据集并检验它 38 | X_train, y_train, X_val, y_val, X_test, y_test = load_dataset() 39 | plt.imshow(X_train[0][0], cmap=cm.binary) -------------------------------------------------------------------------------- /sk-多类多标签.py: -------------------------------------------------------------------------------- 1 | # 多标签分类格式。将多分类转换为二分类的格式,类似于one-hot编码 2 | from sklearn.preprocessing import MultiLabelBinarizer 3 | y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]] 4 | y_new = MultiLabelBinarizer().fit_transform(y) 5 | print('新的输出格式:\n',y_new) 6 | 7 | 8 | 9 | # =========1对其余的多分类构造方式================ 10 | from sklearn import datasets 11 | from sklearn.multiclass import OneVsRestClassifier 12 | from sklearn.svm import LinearSVC 13 | 14 | iris = datasets.load_iris() 15 | X, y = 
iris.data, iris.target 16 | clf = LinearSVC(random_state=0) # 构建二分类器 17 | clf = OneVsRestClassifier(clf) # 根据二分类器构建多分类器 18 | clf.fit(X, y) # 训练模型 19 | y_pred = clf.predict(X) # 预测样本 20 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 21 | 22 | 23 | # =========1对1的多分类构造方式================ 24 | from sklearn import datasets 25 | from sklearn.multiclass import OneVsOneClassifier 26 | from sklearn.svm import LinearSVC 27 | 28 | iris = datasets.load_iris() 29 | X, y = iris.data, iris.target 30 | clf = LinearSVC(random_state=0) # 构建二分类器 31 | clf = OneVsOneClassifier(clf) # 根据二分类器构建多分类器 32 | clf.fit(X, y) # 训练模型 33 | y_pred = clf.predict(X) # 预测样本 34 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 35 | 36 | 37 | # =========误差校正输出代码================ 38 | from sklearn import datasets 39 | from sklearn.multiclass import OutputCodeClassifier 40 | from sklearn.svm import LinearSVC 41 | 42 | iris = datasets.load_iris() 43 | X, y = iris.data, iris.target 44 | clf = LinearSVC(random_state=0) # 构建二分类器 45 | clf = OutputCodeClassifier(clf,code_size=2, random_state=0) # 根据二分类器构建多分类器 46 | clf.fit(X, y) # 训练模型 47 | y_pred = clf.predict(X) # 预测样本 48 | print('预测正确的个数:%d,预测错误的个数:%d' %((y==y_pred).sum(),(y!=y_pred).sum())) 49 | 50 | 51 | # =========多输出回归================ 52 | from sklearn.datasets import make_regression 53 | from sklearn.multioutput import MultiOutputRegressor 54 | from sklearn.ensemble import GradientBoostingRegressor 55 | from sklearn import metrics 56 | X, y = make_regression(n_samples=10, n_targets=3, random_state=1) # 产生10个样本,每个样本100个属性,每个样本3个输出值 57 | print('样本特征维度',X.shape) 58 | print('样本输出维度',y.shape) 59 | clf = GradientBoostingRegressor(random_state=0) 60 | clf =MultiOutputRegressor(clf) 61 | clf.fit(X, y) 62 | y_pred = clf.predict(X) # 预测样本 63 | print('均方误差:',metrics.mean_squared_error(y, y_pred)) # 均方误差 64 | 65 | 66 | # =========多输出分类================ 67 | from sklearn.datasets import make_classification 68 | from sklearn.multioutput import MultiOutputClassifier 69 | from sklearn.ensemble import RandomForestClassifier 70 | from sklearn.utils import shuffle 71 | import numpy as np 72 | X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1) # 生成分类数据集,10个样本,100个特征,30个有效特征,3种分类 73 | y2 = shuffle(y1, random_state=1) # 分类结果随机排序 74 | y3 = shuffle(y1, random_state=2) # 分类结果随机排序 75 | Y = np.vstack((y1, y2, y3)).T # 多种分类结果组合成 76 | print('多输出多分类器真实输出分类:\n',Y) 77 | n_samples, n_features = X.shape # 10,100 78 | n_outputs = Y.shape[1] # 3个输出 79 | n_classes = 3 # 每种输出有3种分类 80 | forest = RandomForestClassifier(n_estimators=100, random_state=1) # 生成随机森林多分类器 81 | multi_target_forest = MultiOutputClassifier(forest) # 构建多输出多分类器 82 | y_pred = multi_target_forest.fit(X, Y).predict(X) 83 | print('多输出多分类器预测输出分类:\n',y_pred) 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /sk-密度聚类.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 数据结构 2 | import sklearn.cluster as skc # 密度聚类 3 | from sklearn import metrics # 评估模型 4 | import matplotlib.pyplot as plt # 可视化绘图 5 | 6 | data=[ 7 | [-2.68420713,1.469732895],[-2.71539062,-0.763005825],[-2.88981954,-0.618055245],[-2.7464372,-1.40005944],[-2.72859298,1.50266052], 8 | [-2.27989736,3.365022195],[-2.82089068,-0.369470295],[-2.62648199,0.766824075],[-2.88795857,-2.568591135],[-2.67384469,-0.48011265], 9 | 
[-2.50652679,2.933707545],[-2.61314272,0.096842835],[-2.78743398,-1.024830855],[-3.22520045,-2.264759595],[-2.64354322,5.33787705], 10 | [-2.38386932,6.05139453],[-2.6225262,3.681403515],[-2.64832273,1.436115015],[-2.19907796,3.956598405],[-2.58734619,2.34213138], 11 | [1.28479459,3.084476355],[0.93241075,1.436391405],[1.46406132,2.268854235],[0.18096721,-3.71521773],[1.08713449,0.339256755], 12 | [0.64043675,-1.87795566],[1.09522371,1.277510445],[-0.75146714,-4.504983795],[1.04329778,1.030306095],[-0.01019007,-3.242586915], 13 | [-0.5110862,-5.681213775],[0.51109806,-0.460278495],[0.26233576,-2.46551985],[0.98404455,-0.55962189],[-0.174864,-1.133170065], 14 | [0.92757294,2.107062945],[0.65959279,-1.583893305],[0.23454059,-1.493648235],[0.94236171,-2.43820017],[0.0432464,-2.616702525], 15 | [4.53172698,-0.05329008],[3.41407223,-2.58716277],[4.61648461,1.538708805],[3.97081495,-0.815065605],[4.34975798,-0.188471475], 16 | [5.39687992,2.462256225],[2.51938325,-5.361082605],[4.9320051,1.585696545],[4.31967279,-1.104966765],[4.91813423,3.511712835], 17 | [3.66193495,1.0891728],[3.80234045,-0.972695745],[4.16537886,0.96876126],[3.34459422,-3.493869435],[3.5852673,-2.426881725], 18 | [3.90474358,0.534685455],[3.94924878,0.18328617],[5.48876538,5.27195043],[5.79468686,1.139695065],[3.29832982,-3.42456273] 19 | ] 20 | X = np.array(data) 21 | 22 | db = skc.DBSCAN(eps=1.5, min_samples=3).fit(X) #DBSCAN聚类方法 还有参数,matric = ""距离计算方法 23 | labels = db.labels_ #和X同一个维度,labels对应索引序号的值 为她所在簇的序号。若簇编号为-1,表示为噪声 24 | 25 | print('每个样本的簇标号:') 26 | print(labels) 27 | 28 | raito = len(labels[labels[:] == -1]) / len(labels) #计算噪声点个数占总数的比例 29 | print('噪声比:', format(raito, '.2%')) 30 | 31 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # 获取分簇的数目 32 | 33 | print('分簇的数目: %d' % n_clusters_) 34 | print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels)) #轮廓系数评价聚类的好坏 35 | 36 | for i in range(n_clusters_): 37 | print('簇 ', i, '的所有样本:') 38 | one_cluster = X[labels == i] 39 | print(one_cluster) 40 | plt.plot(one_cluster[:,0],one_cluster[:,1],'o') 41 | 42 | plt.show() -------------------------------------------------------------------------------- /sk-小批量k均值聚类.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.cluster import MiniBatchKMeans, KMeans 6 | from sklearn.metrics.pairwise import pairwise_distances_argmin 7 | from sklearn.datasets.samples_generator import make_blobs 8 | 9 | # ############################################################################# 10 | # 产生样本数据 11 | np.random.seed(0) 12 | 13 | batch_size = 45 14 | centers = [[1, 1], [-1, -1], [1, -1]] # 三种聚类的中心 15 | n_clusters = len(centers) 16 | X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7) # 生成样本随机数 17 | 18 | # ############################################################################# 19 | # k均值聚类 20 | 21 | k_means = KMeans(init='k-means++', n_clusters=3, n_init=10) 22 | begin_time = time.time() # 记录训练开始时间 23 | k_means.fit(X) # 聚类模型 24 | t_batch = time.time() - begin_time # 记录训练用时 25 | print('k均值聚类时长:',t_batch) 26 | # ############################################################################# 27 | # 小批量k均值聚类 28 | # batch_size为每次更新使用的样本数 29 | mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size, 30 | n_init=10, max_no_improvement=10, verbose=0) 31 | begin_time = time.time() # 记录训练开始时间 32 | mbk.fit(X) # 聚类模型 33 | t_mini_batch = time.time() - begin_time # 记录训练用时 
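# MiniBatchKMeans 每次只用 batch_size 个随机样本更新聚类中心,因此训练时间通常远小于标准 KMeans,代价是 inertia(簇内平方和)略差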
34 | print('小批量k均值聚类时长:',t_mini_batch) 35 | # ############################################################################# 36 | # 结果可视化 37 | fig = plt.figure(figsize=(16, 6)) # 窗口大小 38 | fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9) # # 窗口四周留白 39 | colors = ['#4EACC5', '#FF9C34', '#4E9A06'] # 三种聚类的颜色 40 | 41 | # 在两种聚类算法中,样本的所属类标号和聚类中心 42 | k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0) # 三个聚类点排序 43 | mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0) # 三个聚类点排序 44 | k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers) # 计算X中每个样本与k_means_cluster_centers中的哪个样本最近。也就是获取所有对象的所属的类标签 45 | mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers) # 计算X中每个样本与k_means_cluster_centers中的哪个样本最近。也就是获取所有对象的所属的类标签 46 | order = pairwise_distances_argmin(k_means_cluster_centers,mbk_means_cluster_centers) # 计算k均值聚类点相对于小批量k均值聚类点的索引。因为要比较两次聚类的结果的区别,所以类标号要对应上 47 | 48 | 49 | # 绘制KMeans 50 | ax = fig.add_subplot(1, 3, 1) 51 | for k, col in zip(range(n_clusters), colors): 52 | my_members = k_means_labels == k # 获取属于当前类别的样本 53 | cluster_center = k_means_cluster_centers[k] # 获取当前聚类中心 54 | ax.plot(X[my_members, 0], X[my_members, 1], 'w',markerfacecolor=col, marker='.') # 绘制当前聚类的样本点 55 | ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,markeredgecolor='k', markersize=6) # 绘制聚类中心点 56 | ax.set_title('KMeans') 57 | ax.set_xticks(()) 58 | ax.set_yticks(()) 59 | plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (t_batch, k_means.inertia_)) 60 | 61 | # 绘制MiniBatchKMeans 62 | ax = fig.add_subplot(1, 3, 2) 63 | for k, col in zip(range(n_clusters), colors): 64 | my_members = mbk_means_labels == k # 获取属于当前类别的样本 65 | cluster_center = mbk_means_cluster_centers[k] # 获取当前聚类中心 66 | ax.plot(X[my_members, 0], X[my_members, 1], 'w',markerfacecolor=col, marker='.') # 绘制当前聚类的样本点 67 | ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,markeredgecolor='k', markersize=6) # 绘制聚类中心点 68 | ax.set_title('MiniBatchKMeans') 69 | ax.set_xticks(()) 70 | ax.set_yticks(()) 71 | plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %(t_mini_batch, mbk.inertia_)) 72 | 73 | # 初始化两次结果中 74 | different = (mbk_means_labels == 4) 75 | ax = fig.add_subplot(1, 3, 3) 76 | 77 | for k in range(n_clusters): 78 | different += ((k_means_labels == k) != (mbk_means_labels == order[k])) # 将两种聚类算法中聚类结果不一样的样本设置为true,聚类结果相同的样本设置为false 79 | 80 | identic = np.logical_not(different) # 向量取反,也就是聚类结果相同设置true,聚类结果不相同设置为false 81 | 82 | ax.plot(X[identic, 0], X[identic, 1], 'w',markerfacecolor='#bbbbbb', marker='.') # 绘制聚类结果相同的样本点 83 | ax.plot(X[different, 0], X[different, 1], 'w',markerfacecolor='m', marker='.') # 绘制聚类结果不同的样本点 84 | ax.set_title('Difference') 85 | ax.set_xticks(()) 86 | ax.set_yticks(()) 87 | 88 | plt.show() -------------------------------------------------------------------------------- /sk-层次聚类.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | import numpy as np 4 | from scipy import ndimage 5 | from matplotlib import pyplot as plt 6 | 7 | from sklearn import manifold, datasets 8 | 9 | digits = datasets.load_digits(n_class=10) # 生成10种类别的样本数据 10 | X = digits.data 11 | y = digits.target 12 | n_samples, n_features = X.shape 13 | 14 | np.random.seed(0) 15 | 16 | def nudge_images(X, y): 17 | # Having a larger dataset shows more clearly the behavior of the 18 | # methods, but we multiply the size of the dataset only by 2, as the 19 | # cost of the hierarchical 
clustering methods are strongly 20 | # super-linear in n_samples 21 | shift = lambda x: ndimage.shift(x.reshape((8, 8)),.3 * np.random.normal(size=2),mode='constant',).ravel() 22 | X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) 23 | Y = np.concatenate([y, y], axis=0) 24 | return X, Y 25 | 26 | 27 | X, y = nudge_images(X, y) 28 | print(y) 29 | 30 | #---------------------------------------------------------------------- 31 | # 可视化聚类 32 | def plot_clustering(X_red, X, labels, title=None): 33 | x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0) 34 | X_red = (X_red - x_min) / (x_max - x_min) 35 | 36 | plt.figure(figsize=(6, 4)) 37 | for i in range(X_red.shape[0]): 38 | plt.text(X_red[i, 0], X_red[i, 1], str(y[i]), 39 | color=plt.cm.spectral(labels[i] / 10.), 40 | fontdict={'weight': 'bold', 'size': 9}) 41 | 42 | plt.xticks([]) 43 | plt.yticks([]) 44 | if title is not None: 45 | plt.title(title, size=17) 46 | plt.axis('off') 47 | plt.tight_layout() 48 | 49 | #---------------------------------------------------------------------- 50 | # 2D embedding of the digits dataset 51 | print("Computing embedding") 52 | X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) 53 | print("Done.") 54 | 55 | from sklearn.cluster import AgglomerativeClustering # 引入层次聚类 56 | 57 | for linkage in ('ward', 'average', 'complete'): 58 | clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) # 通过聚类个数和聚类合并准则创建聚类模型 59 | begin_time = time() # 记录开始时间 60 | clustering.fit(X_red) 61 | print(linkage,"聚类合并方法进行聚类用时: %.2fs" % (time() - begin_time)) 62 | 63 | plot_clustering(X_red, X, clustering.labels_, "%s linkage" % linkage) # 可视化聚类结果 64 | 65 | 66 | plt.show() -------------------------------------------------------------------------------- /sk-层次聚类1.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import numpy as np 3 | from scipy import ndimage 4 | from matplotlib import pyplot as plt 5 | from sklearn import manifold, datasets 6 | from sklearn.datasets.samples_generator import make_blobs 7 | 8 | # 产生样本数据 9 | np.random.seed(0) 10 | 11 | centers = [[1, 1], [-1, -1], [1, -1]] # 三种聚类的中心 12 | n_clusters = len(centers) 13 | X, y = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7) # 生成样本随机数 14 | 15 | 16 | #---------------------------------------------------------------------- 17 | # 可视化聚类 18 | def plot_clustering(X_red, X, labels, title=None): 19 | x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0) 20 | X_red = (X_red - x_min) / (x_max - x_min) 21 | 22 | plt.figure(figsize=(6, 4)) 23 | for i in range(X_red.shape[0]): 24 | plt.text(X_red[i, 0], X_red[i, 1], str(y[i]), 25 | color=plt.cm.spectral(labels[i] / 10.), 26 | fontdict={'weight': 'bold', 'size': 9}) 27 | 28 | plt.xticks([]) 29 | plt.yticks([]) 30 | if title is not None: 31 | plt.title(title, size=17) 32 | plt.axis('off') 33 | plt.tight_layout() 34 | 35 | #---------------------------------------------------------------------- 36 | # 手写体数据集 37 | print("Computing embedding") 38 | X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) 39 | print("Done.") 40 | 41 | from sklearn.cluster import AgglomerativeClustering # 引入层次聚类 42 | 43 | for linkage in ('ward', 'average', 'complete'): 44 | clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) # 通过聚类个数和聚类合并准则创建聚类模型 45 | begin_time = time() # 记录开始时间 46 | clustering.fit(X_red) 47 | print(linkage,"聚类合并方法进行聚类用时: %.2fs" % (time() - begin_time)) 48 | 49 | plot_clustering(X_red, X, 
clustering.labels_, "%s linkage" % linkage) # 可视化聚类结果 50 | 51 | 52 | plt.show() -------------------------------------------------------------------------------- /sk-岭回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import Ridge,RidgeCV # Ridge岭回归,RidgeCV带有广义交叉验证的岭回归 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # ========岭回归======== 25 | model = Ridge(alpha=0.5) 26 | model = RidgeCV(alphas=[0.1, 1.0, 10.0]) # 通过RidgeCV可以设置多个参数值,算法使用交叉验证获取最佳参数值 27 | model.fit(X, y) # 线性回归建模 28 | print('系数矩阵:\n',model.coef_) 29 | print('线性回归模型:\n',model) 30 | # print('交叉验证最佳alpha值',model.alpha_) # 只有在使用RidgeCV算法时才有效 31 | # 使用模型预测 32 | predicted = model.predict(X) 33 | 34 | # 绘制散点图 参数:x横轴 y纵轴 35 | plt.scatter(X, y, marker='x') 36 | plt.plot(X, predicted,c='r') 37 | 38 | # 绘制x轴和y轴坐标 39 | plt.xlabel("x") 40 | plt.ylabel("y") 41 | 42 | # 显示图形 43 | plt.show() 44 | 45 | -------------------------------------------------------------------------------- /sk-度量.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import cross_val_score,cross_validate # 交叉验证中的模型度量 3 | import numpy as np # 快速操作结构数组的工具 4 | import matplotlib.pyplot as plt # 可视化绘制 5 | from sklearn.linear_model import LinearRegression # 线性回归 6 | from sklearn.metrics import make_scorer 7 | from sklearn import metrics 8 | 9 | # =============================分类度量=============================== 10 | print('=============================分类度量===============================') 11 | iris = datasets.load_iris() # 加载iris 数据集;用于分类问题 12 | X, y = iris.data, iris.target # 150个样本,4个属性,3种分类 13 | 14 | 15 | clf = svm.SVC(probability=True, random_state=0) 16 | 17 | # ===========================交叉验证获取度量======================= 18 | score = cross_val_score(clf, X, y, scoring='accuracy',cv=3) # 默认进行三次交叉验证 19 | print('交叉验证度量:',score) 20 | 21 | 22 | # ===========================自定义度量======================= 23 | 24 | # 自定义度量函数,输入为真实值和预测值 25 | def my_custom_loss_func(ground_truth, predictions): 26 | diff = np.abs(ground_truth - predictions).max() 27 | return np.log(1 + diff) 28 | 29 | loss = make_scorer(my_custom_loss_func, greater_is_better=False) # 自定义度量对象。结果越小越好。greater_is_better设置为false,系统认为是损失函数,则会将计分函数取反 30 | score = make_scorer(my_custom_loss_func, greater_is_better=True) # 自定义度量对象。结果越大越好 31 | clf = svm.SVC() 32 | clf.fit(X, y) 33 | 34 | print(loss(clf,X,y)) # 对模型进行度量,系统会自动调用模型对输入进行预测,并和真实输出值进行比较,计算损失函数 35 | print(score(clf,X,y)) # 对模型进行度量,系统会自动调用模型对输入进行预测,并和真实输出值进行比较,计算得分 36 | 37 | 38 | # ============================多种度量值========================= 39 | scoring = 
['precision_macro', 'recall_macro'] # precision_macro为精度,recall_macro为召回率 40 | scores = cross_validate(clf, X, y,scoring=scoring,cv=5, return_train_score=True) 41 | sorted(scores.keys()) 42 | print('多种度量的测试结果:',scores) # scores类型为字典。包含训练得分,拟合次数, score-times (得分次数) 43 | 44 | 45 | 46 | # ============================分类指标========================= 47 | clf = svm.SVC() # 构建模型 48 | clf.fit(X, y) # 训练模型 49 | predict_y = clf.predict(X) # 预测数据 50 | 51 | print('准确率指标:',metrics.accuracy_score(y, predict_y)) # 计算准确率 52 | print('Kappa指标:',metrics.cohen_kappa_score(y, predict_y)) # Kappa 检验 53 | print('混淆矩阵:\n',metrics.confusion_matrix(y, predict_y)) # 混淆矩阵 54 | 55 | target_names = ['class 0', 'class 1', 'class 2'] 56 | print('分类报告:\n',metrics.classification_report(y, predict_y, target_names=target_names)) # 分类报告 57 | print('汉明损失:',metrics.hamming_loss(y, predict_y)) #汉明损失 。在多分类中, 汉明损失对应于 y 和 predict_y 之间的汉明距离 58 | print('Jaccard 相似系数:',metrics.jaccard_similarity_score(y, predict_y)) # Jaccard 相似系数 59 | 60 | 61 | 62 | # 下面的系数在在二分类中不需要使用average参数,在多分类中需要使用average参数进行多个二分类的平均 63 | # average可取值:macro(宏)、weighted(加权)、micro(微)、samples(样本)、None(返回每个类的分数) 64 | 65 | print('精度计算:',metrics.precision_score(y, predict_y, average='macro')) 66 | print('召回率:',metrics.recall_score(y, predict_y,average='micro')) 67 | print('F1值:',metrics.f1_score(y, predict_y,average='weighted')) 68 | 69 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=0.5)) 70 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=1)) 71 | print('FB值:',metrics.fbeta_score(y, predict_y,average='macro', beta=2)) 72 | print('精确召回曲线:',metrics.precision_recall_fscore_support(y, predict_y,beta=0.5,average=None)) 73 | print('零一损失:',metrics.zero_one_loss(y, predict_y)) 74 | 75 | # ROC曲线(二分类) 76 | y1 = np.array([0, 0, 1, 1]) # 样本类标号 77 | y_scores = np.array([0.1, 0.4, 0.35, 0.8]) # 样本的得分(属于正样本的概率估计、或置信度值) 78 | fpr, tpr, thresholds = metrics.roc_curve(y1, y_scores, pos_label=1) 79 | print('假正率:',fpr) 80 | print('真正率:',tpr) 81 | print('门限:',thresholds) 82 | print('AUC值:',metrics.roc_auc_score(y1, y_scores)) 83 | 84 | 85 | labels = np.array([0, 1, 2]) # 三种分类的类标号 86 | pred_decision = clf.decision_function(X) # 计算样本属于每种分类的得分,所以pred_decision是一个3列的矩阵 87 | print('hinge_loss:',metrics.hinge_loss(y, pred_decision, labels = labels)) 88 | 89 | # 逻辑回归损失,对真实分类和预测分类概率进行对比的损失 90 | y_true = [0, 0, 1, 1] 91 | y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]] 92 | print('log_loss:',metrics.log_loss(y_true, y_pred)) 93 | 94 | 95 | # ===============================回归度量============================== 96 | print(' ===============================回归度量==============================') 97 | diabetes = datasets.load_diabetes() # 加载糖尿病数据集;用于回归问题 98 | X, y = diabetes.data, diabetes.target # 442个样本,10个属性,数值输出 99 | 100 | model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 101 | model.fit(X, y) # 线性回归建模 102 | predicted_y = model.predict(X) # 使用模型预测 103 | 104 | print('解释方差得分:',metrics.explained_variance_score(y, predicted_y)) # 解释方差得分 105 | print('平均绝对误差:',metrics.mean_absolute_error(y, predicted_y)) # 平均绝对误差 106 | print('均方误差:',metrics.mean_squared_error(y, predicted_y)) # 均方误差 107 | print('均方误差对数:',metrics.mean_squared_log_error(y, predicted_y)) # 均方误差对数 108 | print('中位绝对误差:',metrics.median_absolute_error(y, predicted_y)) # 中位绝对误差 109 | print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='variance_weighted')) #可决系数 110 | print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='raw_values')) #可决系数 111 | 
print('可决系数:',metrics.r2_score(y, predicted_y, multioutput='uniform_average')) #可决系数 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /sk-数据集-特征选择-交叉验证.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | import matplotlib.pyplot as plt # 画图工具 6 | from sklearn import datasets # 机器学习库 7 | 8 | 9 | # 使用鸢尾花卉样本数据,对待测对象进行分类:分别包括为山鸢尾、变色鸢尾、维吉尼亚尾 10 | 11 | 12 | # =======加载样本数据集,清洗转化数据格式======= 13 | 14 | #数据集 0-山鸢尾、1-变色鸢尾、2-维吉尼亚尾 15 | scikit_iris = datasets.load_iris() #加载鸢尾花卉数据集。每行一个对象,每列一种属性。['data']为样本数据集,['target']为结果数据集,['target_names']为类别名称,.feature_names属性名称 16 | # 转换成pandas的DataFrame数据格式,方便观察数据 17 | iris = pd.DataFrame(data=np.c_[scikit_iris['data'], scikit_iris['target']],columns=np.append(scikit_iris.feature_names, ['y'])) #每行为一个对象,每列为一种属性,最后一个为结果值 18 | # print(iris.head(2)) #查看前两行,观察数据格式 19 | # print(iris.isnull().sum()) # isnull()返回布尔矩阵,sum()按列求和。检查数据是否有缺失 20 | # print(iris.groupby('y').count()) # 观察样本中各类别数量是否比较均衡 21 | 22 | 23 | # =======选择全部特征训练模型、预测新对象的分类======= 24 | 25 | X = iris[scikit_iris.feature_names] #获取样本集 26 | y = iris['y'] #获取结果集 27 | 28 | # 第一步,选择model 29 | from sklearn.neighbors import KNeighborsClassifier # 导入knn分类器 30 | 31 | knn = KNeighborsClassifier(n_neighbors=1) # 初始化一个knn模型,设置k=1 32 | # 第二步,fit X、y 33 | knn.fit(X, y) #根据样本集合结果集,对knn进行建模 34 | # 第三步,predict新数据 35 | result = knn.predict([[3, 2, 2, 5]]) #使用knn对新对象进行预测 36 | print(result) 37 | 38 | 39 | # =======使用交叉验证评估模型======= 40 | from sklearn.cross_validation import train_test_split 41 | from sklearn import metrics 42 | 43 | # 分割训练-测试集 44 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4) #划分训练集合测试集 45 | 46 | # K=15 47 | knn = KNeighborsClassifier(n_neighbors=15) #创建knn模型 48 | knn.fit(X_train, y_train) #训练knn模型 49 | 50 | y_pred_on_train = knn.predict(X_train) # 预测训练集,为了和预测测试集对比,查看拟合情况 51 | y_pred_on_test = knn.predict(X_test) # 预测测试集 52 | # print(metrics.accuracy_score(y_train, y_pred_on_train)) # 计算样本集的正确率 53 | print('正确率: :{}'.format(metrics.accuracy_score(y_test, y_pred_on_test))) # 计算测试集的正确率 54 | -------------------------------------------------------------------------------- /sk-文档贝叶斯.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | """ 4 | 这个指南的目的是在一个实际任务上探索scikit-learn的主要工具,在二十个不同的主题上分析一个文本集合。 5 | 在这一节中,可以看到: 6 | 1、加载文本文件和类别 7 | 2、适合机器学习的特征向量提取 8 | 3、训练线性模型进行分类 9 | 4、使用网格搜索策略,找到一个很好的配置的特征提取组件和分类器 10 | """ 11 | 12 | """ 13 | 1、Loading the 20 newsgroups dataset 加载20个新闻组数据集 14 | 为了获得更快的执行时间为第一个例子,我们将工作在部分数据集只有4个类别的数据集中: 15 | """ 16 | categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'] 17 | from sklearn.datasets import fetch_20newsgroups 18 | 19 | twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) 20 | print(twenty_train.target) 21 | print(twenty_train.target_names) # 训练集中类别的名字,这里只有四个类别 22 | print(len(twenty_train.data)) # 训练集中数据的长度 23 | print(len(twenty_train.filenames)) # 训练集文件名长度 24 | print('-----') 25 | print("\n".join(twenty_train.data[0].split("\n")[:3])) 26 | print('-----') 27 | print(twenty_train.target_names[twenty_train.target[0]]) 28 | print('-----') 29 | print(twenty_train.target[:10]) # 前十个的类别 30 | print('-----') 31 | for t in twenty_train.target[:10]: 32 | print(twenty_train.target_names[t]) # 类别的名字 33 | 
print('-----') 34 | """ 35 | 2、Extracting features from text files 从文本文件中提取特征 36 | 为了在文本文件中使用机器学习算法,首先需要将文本内容转换为数值特征向量 37 | """ 38 | 39 | """ 40 | Bags of words 词袋 41 | 最直接的方式就是词袋表示法 42 | 1、为训练集的任何文档中的每个单词分配一个固定的整数ID(例如通过从字典到整型索引建立字典) 43 | 2、对于每个文档,计算每个词出现的次数,并存储到X[i,j]中。 44 | 45 | 词袋表示:n_features 是语料中不同单词的数量,这个数量通常大于100000. 46 | 如果 n_samples == 10000,存储X的数组就需要10000*10000*4byte=4GB,这么大的存储在今天的计算机上是不可能实现的。 47 | 幸运的是,X中的大多数值都是0,基于这种原因,我们说词袋是典型的高维稀疏数据集,我们可以只存储那些非0的特征向量。 48 | scipy.sparse 矩阵就是这种数据结构,而scikit-learn内置了这种数据结构。 49 | """ 50 | 51 | """ 52 | Tokenizing text with scikit-learn 使用scikit-learn标记文本 53 | 文本处理、分词、过滤停用词都在这些高级组件中,能够建立特征字典并将文档转换成特征向量。 54 | """ 55 | from sklearn.feature_extraction.text import CountVectorizer # sklearn中的文本特征提取组件中,导入特征向量计数函数 56 | 57 | count_vect = CountVectorizer() # 特征向量计数函数 58 | X_train_counts = count_vect.fit_transform(twenty_train.data) # 对文本进行特征向量处理 59 | print(X_train_counts) # 特征向量和特征标签 60 | print(X_train_counts.shape) # 形状 61 | print('-----') 62 | 63 | """ 64 | CountVectorizer支持计算单词或序列的N-grams,一旦合适,这个向量化就可以建立特征词典。 65 | 在整个训练预料中,词汇中的词汇索引值与其频率有关。 66 | """ 67 | print(count_vect.vocabulary_.get(u'algorithm')) 68 | print('-----') 69 | 70 | """ 71 | From occurrences to frequencies 从事件到频率 72 | 计数是一个好的开始,但是也存在一个问题:较长的文本将会比较短的文本有很高的平均计数值,即使他们所表示的话题是一样的。 73 | 为了避免潜在的差异,它可以将文档中的每个单词出现的次数在文档的总字数的比例:这个新的特征叫做词频:tf 74 | tf-idf:词频-逆文档频率 75 | """ 76 | from sklearn.feature_extraction.text import TfidfTransformer # sklearn中的文本特征提取组件中,导入词频统计函数 77 | 78 | tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts) # 建立词频统计函数,注意这里idf=False 79 | print(tf_transformer) # 输出函数属性 TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False, use_idf=False) 80 | print('-----') 81 | X_train_tf = tf_transformer.transform(X_train_counts) # 使用函数对文本文档进行tf-idf频率计算 82 | print(X_train_tf) 83 | print('-----') 84 | print(X_train_tf.shape) 85 | print('-----') 86 | """ 87 | 在上面的例子中,使用fit()方法来构建基于数据的预测器,然后使用transform()方法来将计数矩阵用tf-idf表示。 88 | 这两个步骤可以通过跳过冗余处理,来更快的达到相同的最终结果。 89 | 这些可以通过使用fit_transform()方法来实现: 90 | """ 91 | tfidf_transformer = TfidfTransformer() # 这里使用的是tf-idf 92 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 93 | print(X_train_tfidf) 94 | print(X_train_tfidf.shape) 95 | print('-----') 96 | """ 97 | Training a classifier 训练一个分类器 98 | 既然已经有了特征,就可以训练分类器来试图预测一个帖子的类别,先使用贝叶斯分类器,贝叶斯分类器提供了一个良好的基线来完成这个任务。 99 | scikit-learn中包括这个分类器的许多变量,最适合进行单词计数的是多项式变量。 100 | """ 101 | from sklearn.naive_bayes import MultinomialNB # 使用sklearn中的贝叶斯分类器,并且加载贝叶斯分类器 102 | 103 | # 中的MultinomialNB多项式函数 104 | clf = MultinomialNB() # 加载多项式函数 105 | x_clf = clf.fit(X_train_tfidf, twenty_train.target) # 构造基于数据的分类器 106 | print(x_clf) # 分类器属性:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 107 | print('-----') 108 | """ 109 | 为了预测输入的新的文档,我们需要使用与前面相同的特征提取链进行提取特征。 110 | 不同的是,在转换中,使用transform来代替fit_transform,因为训练集已经构造了分类器 111 | """ 112 | docs_new = ['God is love', 'OpenGL on the GPU is fast'] # 文档 113 | X_new_counts = count_vect.transform(docs_new) # 构建文档计数 114 | X_new_tfidf = tfidf_transformer.transform(X_new_counts) # 构建文档tfidf 115 | predicted = clf.predict(X_new_tfidf) # 预测文档 116 | print(predicted) # 预测类别 [3 1],一个属于3类,一个属于1类 117 | for doc, category in zip(docs_new, predicted): 118 | print('%r => %s' % (doc, twenty_train.target_names[category])) # 将文档和类别名字对应起来 119 | print('-----') 120 | """ 121 | Building a pipeline 建立管道 122 | 为了使向量转换更加简单(vectorizer => transformer => classifier),scikit-learn提供了pipeline类来表示为一个复合分类器 123 | """ 124 | from sklearn.pipeline import Pipeline 125 | 126 | 
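# Added note (not in the original file): Pipeline takes a list of (name, step)
# tuples. Every step except the last must be a transformer (implementing
# fit_transform); the last step is the estimator. Calling fit/predict on the
# pipeline therefore runs CountVectorizer -> TfidfTransformer -> MultinomialNB
# in sequence, so raw documents can be passed in directly.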
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]) 127 | text_clf = text_clf.fit(twenty_train.data, twenty_train.target) 128 | print(text_clf) # 构造分类器,分类器的属性 129 | predicted = text_clf.predict(docs_new) # 预测新文档 130 | print(predicted) # 获取预测值 131 | print('-----') 132 | 133 | """ 134 | 分析总结: 135 | 1、加载数据集,主要是加载训练集,用于对数据进行训练 136 | 2、文本特征提取: 137 | 对文本进行计数统计 CountVectorizer 138 | 词频统计 TfidfTransformer (先计算tf,再计算tfidf) 139 | 3、训练分类器: 140 | 贝叶斯多项式训练器 MultinomialNB 141 | 4、预测文档: 142 | 通过构造的训练器进行构造分类器,来进行文档的预测 143 | 5、最简单的方式: 144 | 通过使用pipeline管道形式,来讲上述所有功能通过管道来一步实现,更加简单的就可以进行预测 145 | """ 146 | 147 | """ 148 | Evaluation of the performance on the test set 测试集性能评价 149 | 评估模型的预测精度同样容易: 150 | """ 151 | import numpy as np 152 | 153 | twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42) 154 | docs_test = twenty_test.data 155 | predicted = text_clf.predict(docs_test) 156 | print(np.mean(predicted == twenty_test.target)) # 预测的值和测试值的比例,mean就是比例函数 157 | print('-----') # 精度已经为0.834886817577 158 | 159 | """ 160 | 精度已经实现了83.4%,那么使用支持向量机(SVM)是否能够做的更好呢,支持向量机(SVM)被广泛认为是最好的文本分类算法之一。 161 | 尽管,SVM经常比贝叶斯要慢一些。 162 | 我们可以改变学习方式,使用管道来实现分类: 163 | """ 164 | from sklearn.linear_model import SGDClassifier 165 | 166 | text_clf = Pipeline( 167 | [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 168 | ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))]) 169 | # _ = text_clf.fit(twenty_train.data, twenty_train.target) # 和下面一句的意思一样,一个杠,表示本身 170 | text_clf = text_clf.fit(twenty_train.data, twenty_train.target) 171 | predicted = text_clf.predict(docs_test) 172 | print(np.mean(predicted == twenty_test.target)) # 精度 0.912782956059 173 | print('-----') 174 | """ 175 | sklearn进一步提供了结果的更详细的性能分析工具: 176 | """ 177 | from sklearn import metrics 178 | print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)) 179 | print(metrics.confusion_matrix(twenty_test.target, predicted)) -------------------------------------------------------------------------------- /sk-朴素贝叶斯.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | iris = datasets.load_iris() 3 | 4 | from sklearn.naive_bayes import GaussianNB 5 | clf = GaussianNB() 6 | clf = clf.fit(iris.data, iris.target) 7 | y_pred=clf.predict(iris.data) 8 | print("高斯朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 9 | 10 | from sklearn.naive_bayes import MultinomialNB 11 | clf = MultinomialNB() 12 | clf = clf.fit(iris.data, iris.target) 13 | y_pred=clf.predict(iris.data) 14 | print("多项分布朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 15 | 16 | from sklearn.naive_bayes import BernoulliNB 17 | clf = BernoulliNB() 18 | clf = clf.fit(iris.data, iris.target) 19 | y_pred=clf.predict(iris.data) 20 | print("伯努利朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /sk-样本数据集.py: -------------------------------------------------------------------------------- 1 | # import numpy as np # 快速操作结构数组的工具 2 | # import pandas as pd # 数据分析处理工具 3 | # import matplotlib.pyplot as plt # 画图工具 4 | # from sklearn import datasets # 机器学习数据集 5 | # from sklearn.datasets import make_blobs 6 | # from sklearn import datasets 7 | 8 | # 
load_boston([return_X_y]) 加载波士顿房价数据;用于回归问题 9 | # load_iris([return_X_y]) 加载iris 数据集;用于分类问题 10 | # load_diabetes([return_X_y]) 加载糖尿病数据集;用于回归问题 11 | # load_digits([n_class, return_X_y]) 加载手写字符集;用于分类问题 12 | # load_linnerud([return_X_y]) 加载linnerud 数据集;用于多元回归问题 13 | 14 | 15 | # # ===========房价数据=========== 16 | # from sklearn.datasets import load_boston 17 | # from sklearn import linear_model 18 | # boston = load_boston() 19 | # data=boston.data 20 | # target = boston.target 21 | # print(data.shape) 22 | # print(target.shape) 23 | # 24 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 25 | # 26 | # 27 | # # ===========花卉数据=========== 28 | # from sklearn.datasets import load_iris 29 | # from sklearn import svm 30 | # iris = load_iris() 31 | # data=iris.data 32 | # target = iris.target 33 | # print(data.shape) 34 | # print(target.shape) 35 | # 36 | # print('svm模型:\n',svm.SVC().fit(data,target)) 37 | 38 | # # ===========糖尿病数据集=========== 39 | # from sklearn.datasets import load_diabetes 40 | # from sklearn import linear_model 41 | # diabetes = load_diabetes() 42 | # data=diabetes.data 43 | # target = diabetes.target 44 | # print(data.shape) 45 | # print(target.shape) 46 | # 47 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 48 | 49 | 50 | 51 | # # ===========手写体数据=========== 52 | # from sklearn.datasets import load_digits 53 | # import matplotlib.pyplot as plt # 画图工具 54 | # digits = load_digits() 55 | # data=digits.data 56 | # print(data.shape) 57 | # plt.matshow(digits.images[3]) # 矩阵像素点的样式显示3 58 | # # plt.imshow(digits.images[3]) # 图片渐变的样式显示3 59 | # # plt.gray() # 图片显示为灰度模式 60 | # plt.show() 61 | 62 | 63 | # # # ===========多元回归=========== 64 | # from sklearn.datasets import load_linnerud 65 | # from sklearn import linear_model 66 | # linnerud = load_linnerud() 67 | # data=linnerud.data 68 | # target = linnerud.target 69 | # print(data.shape) 70 | # print(target.shape) 71 | # 72 | # print('系数矩阵:\n',linear_model.LinearRegression().fit(data,target).coef_) 73 | 74 | 75 | 76 | # # ===========图像样本数据集=========== 77 | # from sklearn.datasets import load_sample_image 78 | # import matplotlib.pyplot as plt # 画图工具 79 | # img=load_sample_image('flower.jpg') # 加载sk自带的花朵图案 80 | # plt.imshow(img) 81 | # plt.show() 82 | 83 | 84 | 85 | # # ===========生成分类样本数据集=========== 86 | # from sklearn import datasets 87 | # import matplotlib.pyplot as plt # 画图工具 88 | # data,target=datasets.make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,n_repeated=0, n_classes=2, n_clusters_per_class=1) 89 | # print(data.shape) 90 | # print(target.shape) 91 | # plt.scatter(data[:,0],data[:,1],c=target) 92 | # plt.show() 93 | 94 | 95 | # import matplotlib.pyplot as plt 96 | # from sklearn.datasets import make_classification 97 | # from sklearn.datasets import make_blobs 98 | # from sklearn.datasets import make_gaussian_quantiles 99 | # from sklearn.datasets import make_hastie_10_2 100 | # 101 | # plt.figure(figsize=(10, 10)) 102 | # plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95) 103 | # 104 | # plt.subplot(421) 105 | # plt.title("One informative feature, one cluster per class", fontsize='small') 106 | # X1, Y1 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) 107 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 108 | # 109 | # plt.subplot(422) 110 | # plt.title("Two informative features, one cluster per class", fontsize='small') 111 | # X1, Y1 = make_classification(n_samples=1000, 
n_features=2, n_redundant=0, n_informative=2,n_clusters_per_class=1) 112 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 113 | # 114 | # plt.subplot(423) 115 | # plt.title("Two informative features, two clusters per class", fontsize='small') 116 | # X2, Y2 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=2) 117 | # plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2) 118 | # 119 | # plt.subplot(424) 120 | # plt.title("Multi-class, two informative features, one cluster",fontsize='small') 121 | # X1, Y1 = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=2,n_clusters_per_class=1, n_classes=3) 122 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 123 | # 124 | # plt.subplot(425) 125 | # plt.title("Three blobs", fontsize='small') 126 | # # 1000个样本,2个属性,3种类别,方差分别为1.0,3.0,2.0 127 | # X1, Y1 = make_blobs(n_samples=1000, n_features=2, centers=3,cluster_std=[1.0,3.0,2.0]) 128 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 129 | # 130 | # plt.subplot(426) 131 | # plt.title("Gaussian divided into four quantiles", fontsize='small') 132 | # X1, Y1 = make_gaussian_quantiles(n_samples=1000, n_features=2, n_classes=4) 133 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 134 | # 135 | # plt.subplot(427) 136 | # plt.title("hastie data ", fontsize='small') 137 | # X1, Y1 = make_hastie_10_2(n_samples=1000) 138 | # plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1) 139 | # plt.show() 140 | 141 | 142 | 143 | # # ===========生成圆形或月亮型分类数据=========== 144 | 145 | from sklearn.datasets import make_circles 146 | from sklearn.datasets import make_moons 147 | import matplotlib.pyplot as plt 148 | 149 | fig = plt.figure(1) 150 | x1, y1 = make_circles(n_samples=1000, factor=0.5, noise=0.1) 151 | plt.subplot(121) 152 | plt.title('make_circles function example') 153 | plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1) 154 | 155 | plt.subplot(122) 156 | x1, y1 = make_moons(n_samples=1000, noise=0.1) 157 | plt.title('make_moons function example') 158 | plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1) 159 | plt.show() 160 | 161 | 162 | 163 | # # =======清洗转化数据格式====== 164 | # # 转换成pandas的DataFrame数据格式,方便观察数据 165 | # pddata = pd.DataFrame(data=np.c_[data, target],columns=np.append(['x1','x2'], ['y'])) #每行为一个对象,每列为一种属性,最后一个为结果值 166 | # # print(iris.head(2)) #查看前两行,观察数据格式 167 | # # print(iris.isnull().sum()) # isnull()返回布尔矩阵,sum()按列求和。检查数据是否有缺失 168 | # # print(iris.groupby('y').count()) # 观察样本中各类别数量是否比较均衡 169 | -------------------------------------------------------------------------------- /sk-案例流程.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | # ========加载数据(Data Loading)======== 4 | import numpy as np 5 | import urllib.request 6 | 7 | # 数据集的请求地址 8 | url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data" 9 | # 下载响应的csv文件 10 | raw_data = urllib.request.urlopen(url) 11 | # 加载csv文件成numpy中的矩阵 12 | dataset = np.loadtxt(raw_data, delimiter=",") 13 | # 分割成属性集和结果集 14 | X = dataset[:,0:7] # 特征矩阵 15 | y = dataset[:,8] #目标矩阵 16 | # print('特征矩阵:\n',X) 17 | # print('结果矩阵:\n',y) 18 | 19 | # ========数据归一化(Data Normalization)======== 20 | from sklearn import preprocessing 21 | # 归一化数据集 22 | normalized_X = preprocessing.normalize(X) 23 | # 标准话数据集 24 | standardized_X = preprocessing.scale(X) 25 | 26 | # ========特征选择(Feature Selection)======== 27 | # 树算法(Tree algorithms)计算特征的信息量 28 | from sklearn import metrics 29 | from sklearn.ensemble 
import ExtraTreesClassifier 30 | model = ExtraTreesClassifier() 31 | model.fit(X, y) 32 | # 显示每个特征的重要性 33 | print('属性重要性:\n',model.feature_importances_) 34 | 35 | # ========逻辑回归======== 36 | from sklearn import metrics 37 | from sklearn.linear_model import LogisticRegression 38 | model = LogisticRegression() 39 | model.fit(X, y) 40 | print('逻辑回归模型:\n',model) 41 | # 使用模型预测 42 | expected = y 43 | predicted = model.predict(X) 44 | # 评估模型 45 | print(metrics.classification_report(expected, predicted)) #评估模型 46 | print(metrics.confusion_matrix(expected, predicted)) # 使用混淆矩阵评估模型 47 | 48 | # ========朴素贝叶斯======== 49 | from sklearn import metrics 50 | from sklearn.naive_bayes import GaussianNB 51 | model = GaussianNB() 52 | model.fit(X, y) 53 | print('朴素贝叶斯模型:\n',model) 54 | # 使用模型预测 55 | expected = y 56 | predicted = model.predict(X) 57 | # 评估模型 58 | print(metrics.classification_report(expected, predicted)) 59 | print(metrics.confusion_matrix(expected, predicted)) 60 | 61 | # ========k近邻======== 62 | from sklearn import metrics 63 | from sklearn.neighbors import KNeighborsClassifier 64 | # 使用样本数据构建knn模型 65 | model = KNeighborsClassifier() 66 | model.fit(X, y) 67 | print('KNN模型:\n',model) 68 | # 使用模型预测 69 | expected = y 70 | predicted = model.predict(X) 71 | # 评估模型 72 | print(metrics.classification_report(expected, predicted)) 73 | print(metrics.confusion_matrix(expected, predicted)) 74 | 75 | 76 | # ========决策树======== 77 | from sklearn import metrics 78 | from sklearn.tree import DecisionTreeClassifier 79 | # 构建决策树模型 80 | model = DecisionTreeClassifier() 81 | model.fit(X, y) 82 | print('决策树模型:\n',model) 83 | # 使用模型预测 84 | expected = y 85 | predicted = model.predict(X) 86 | # 评估模型 87 | print(metrics.classification_report(expected, predicted)) 88 | print(metrics.confusion_matrix(expected, predicted)) 89 | 90 | 91 | # ========支持向量机======== 92 | from sklearn import metrics 93 | from sklearn.svm import SVC 94 | # 构建svm模型 95 | model = SVC() 96 | model.fit(X, y) 97 | print('SVM模型:\n',model) 98 | # 使用模型预测 99 | expected = y 100 | predicted = model.predict(X) 101 | # 评估模型 102 | print(metrics.classification_report(expected, predicted)) 103 | print(metrics.confusion_matrix(expected, predicted)) 104 | 105 | # ========优化算法参数======== 106 | import numpy as np 107 | from sklearn.linear_model import Ridge #岭回归模型 108 | from scipy.stats import uniform as sp_rand 109 | from sklearn.grid_search import GridSearchCV #网格搜索 110 | from sklearn.grid_search import RandomizedSearchCV # 随机搜索 111 | 112 | # 准备参数的可取值 113 | alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) 114 | # 构建岭回归模型,并尝试参数每一个可取值 115 | model = Ridge() 116 | rsearch = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas)) 117 | 118 | # # 只给定区间,参数随机取值 119 | # param_grid = {'alpha': sp_rand()} 120 | # # 构建岭回归模型,并尝试参数随机值 121 | # model = Ridge() 122 | # rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100) 123 | 124 | rsearch.fit(X, y) 125 | print(rsearch) 126 | # 评估搜索结果 127 | print(rsearch.best_score_) 128 | print(rsearch.best_estimator_.alpha) 129 | -------------------------------------------------------------------------------- /sk-特征提取.py: -------------------------------------------------------------------------------- 1 | # ===========从字典类型加载特征。形成系数矩阵结构========== 2 | from sklearn.feature_extraction import DictVectorizer 3 | measurements = [ 4 | {'name': 'student1', 'age': 12}, 5 | {'boy':True, 'parents': 'baba'}, 6 | {'size':16}, 7 | ] 8 | 9 | vec = DictVectorizer().fit(measurements) # 
定义一个加载器,后对一个字典对象提取特征。(值为数值型、布尔型的属性为单独的属性。值为字符串型的属性,形成"属性=值"的新属性) 10 | print('提取的特征:',vec.get_feature_names()) # 查看提取的新属性 11 | print('稀疏矩阵形式:\n',vec.transform(measurements)) 12 | print('二维矩阵形式:\n',vec.transform(measurements).toarray()) 13 | 14 | # =================文本特征提取============== 15 | from sklearn.feature_extraction.text import CountVectorizer 16 | corpus = ['This is the first document.', 17 | 'This is the second second document.', 18 | 'And the third one.', 19 | 'Is this the first document?',] 20 | vectorizer = CountVectorizer() 21 | X = vectorizer.fit_transform(corpus) # 默认提取至少 包含2个字母的单词 22 | print('所有特征:',vectorizer.get_feature_names()) 23 | print('样本特征向量:\n',X.toarray()) # X本身为稀疏矩阵存储形式,toarray转换为二维矩阵形式 24 | 25 | print('document属性的列索引:',vectorizer.vocabulary_.get('document')) # 从 特征 名称到矩阵的(列索引) 26 | 27 | # 提取一个单词或两个单词形成的词组。这样就能识别“is this”和“this is”这两种词汇了 28 | bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1) 29 | analyze = bigram_vectorizer.build_analyzer() 30 | print('所有分词:',analyze('Bi-grams are cool!')) 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /sk-特征选择.py: -------------------------------------------------------------------------------- 1 | # # ============去除方差小于阈值的特征============ 2 | # from sklearn.feature_selection import VarianceThreshold #移除低方差特征 3 | # from sklearn.datasets import load_iris # 引入花卉数据集 4 | # iris = load_iris() 5 | # X= iris.data 6 | # print(X.shape) 7 | # print(X.var(axis=0)) 8 | # 9 | # sel = VarianceThreshold(threshold=0.2) 10 | # X_transformed=sel.fit_transform(X) 11 | # print('去除低方差特征:\n',X_transformed.shape) 12 | 13 | 14 | 15 | 16 | 17 | # # ============排序选择优秀特征============ 18 | # from sklearn.datasets import load_iris 19 | # from sklearn.feature_selection import SelectKBest 20 | # from sklearn.feature_selection import chi2 # 引入卡方检验统计量 21 | # # 对于回归: f_regression , mutual_info_regression 22 | # # 对于分类: chi2 , f_classif , mutual_info_classif 23 | # iris = load_iris() 24 | # X, y = iris.data, iris.target 25 | # print('源样本维度:',X.shape) 26 | # 27 | # X_new = SelectKBest(chi2, k=2).fit_transform(X, y) 28 | # print('新样本维度:',X_new.shape) 29 | 30 | 31 | 32 | 33 | # # ============递归式特征消除============ 34 | # # 这里递归的移除最不重要的像素点来对每个像素点(特征)进行排序 35 | # from sklearn.svm import SVC 36 | # from sklearn.datasets import load_digits 37 | # from sklearn.feature_selection import RFE 38 | # import matplotlib.pyplot as plt 39 | # 40 | # digits = load_digits() # 加载手写体数据集 41 | # X = digits.images.reshape((len(digits.images), -1)) 42 | # y = digits.target 43 | # 44 | # # 创建ref对象和每个像素点的重要度排名 45 | # svc = SVC(kernel="linear", C=1) 46 | # rfe = RFE(estimator=svc, n_features_to_select=1, step=1) 47 | # rfe.fit(X, y) 48 | # ranking = rfe.ranking_.reshape(digits.images[0].shape) 49 | # 50 | # # 绘制像素点排名 51 | # plt.matshow(ranking, cmap=plt.cm.Blues) 52 | # plt.colorbar() 53 | # plt.title("Ranking of pixels with RFE") 54 | # plt.show() 55 | 56 | 57 | 58 | 59 | 60 | # # ============使用 SelectFromModel 选取特征============ 61 | # 62 | # import matplotlib.pyplot as plt 63 | # from sklearn.datasets import load_boston 64 | # from sklearn.feature_selection import SelectFromModel 65 | # from sklearn.linear_model import LassoCV 66 | # boston = load_boston() # 加载波士顿房价回归数据 67 | # X, y = boston['data'], boston['target'] # 取特征数据和输出数据 68 | # n_features =[13] # 记录循环中的特征个数,最开始数据集是有13个特征的 69 | # thresholds=[0] # 记录门限值,最开始是没有门限值的 70 | # 71 | # clf = LassoCV() # 使用Lasso回归 72 | # 73 | # # 
设置最小门限为0.25。coef_ 或者 featureimportances 属性值低于门限的都会被去除调 74 | # sfm = SelectFromModel(clf, threshold=0.1) 75 | # sfm.fit(X, y) # 训练模型。找出模型回归系数。 76 | # X_transform = sfm.transform(X) # 根据回归系数、门限,变换数据集。 77 | # n_feature =X_transform.shape[1] # 获取训练以后的特征数目 78 | # n_features.append(n_feature) 79 | # thresholds.append(0.1) 80 | # while n_feature > 2: # 如果特征数大于2,则从新转换,找最好的两个特征 81 | # sfm.threshold += 0.1 # 逐渐增加门限,进一步减少特征数目 82 | # X_transform = sfm.transform(X) # 变换数据集 83 | # n_feature = X_transform.shape[1] 84 | # n_features.append(n_feature) # 记录训练以后的特征数目 85 | # thresholds.append(sfm.threshold) # 记录门限值 86 | # 87 | # plt.title("Features with threshold %0.3f." % sfm.threshold) 88 | # plt.plot(thresholds, n_features, 'r') 89 | # plt.xlabel("thresholds") 90 | # plt.ylabel("Feature number") 91 | # plt.show() 92 | 93 | 94 | 95 | 96 | # # ============基于 L1 的特征选取============ 97 | # from sklearn.svm import LinearSVC 98 | # from sklearn.datasets import load_iris 99 | # from sklearn.feature_selection import SelectFromModel 100 | # iris = load_iris() 101 | # X, y = iris.data, iris.target 102 | # print('原数据集维度:',X.shape) 103 | # lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y) 104 | # model = SelectFromModel(lsvc, prefit=True) 105 | # X_new = model.transform(X) 106 | # print('新数据集维度:',X_new.shape) 107 | 108 | 109 | 110 | # ============基于 Tree(树)的特征选取============ 111 | from sklearn.ensemble import ExtraTreesClassifier 112 | from sklearn.datasets import load_iris 113 | from sklearn.feature_selection import SelectFromModel 114 | dataset = load_iris() 115 | X, y = dataset.data, dataset.target 116 | print('原数据集维度:',X.shape) 117 | clf = ExtraTreesClassifier() 118 | clf = clf.fit(X, y) 119 | print('属性重要程度:',clf.feature_importances_) 120 | 121 | model = SelectFromModel(clf, prefit=True) 122 | X_new = model.transform(X) 123 | print('新数据集维度:',X.shape) 124 | 125 | 126 | 127 | # ============特征选取作为 pipeline(管道)的一部分============ 128 | # from sklearn.pipeline import Pipeline 129 | # clf = Pipeline([ 130 | # ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))), 131 | # ('classification', RandomForestClassifier()) 132 | # ]) 133 | # clf.fit(X, y) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /sk-神经网络.py: -------------------------------------------------------------------------------- 1 | # # =============神经网络用于分类============= 2 | # import numpy as np 3 | # import matplotlib.pyplot as plt 4 | # from sklearn.neural_network import MLPClassifier 5 | # from sklearn.preprocessing import StandardScaler 6 | # data = [ 7 | # [-0.017612, 14.053064, 0],[-1.395634, 4.662541, 1],[-0.752157, 6.53862, 0],[-1.322371, 7.152853, 0],[0.423363, 11.054677, 0], 8 | # [0.406704, 7.067335, 1],[0.667394, 12.741452, 0],[-2.46015, 6.866805, 1],[0.569411, 9.548755, 0],[-0.026632, 10.427743, 0], 9 | # [0.850433, 6.920334, 1],[1.347183, 13.1755, 0],[1.176813, 3.16702, 1],[-1.781871, 9.097953, 0],[-0.566606, 5.749003, 1], 10 | # [0.931635, 1.589505, 1],[-0.024205, 6.151823, 1],[-0.036453, 2.690988, 1],[-0.196949, 0.444165, 1],[1.014459, 5.754399, 1], 11 | # [1.985298, 3.230619, 1],[-1.693453, -0.55754, 1],[-0.576525, 11.778922, 0],[-0.346811, -1.67873, 1],[-2.124484, 2.672471, 1], 12 | # [1.217916, 9.597015, 0],[-0.733928, 9.098687, 0],[1.416614, 9.619232, 0],[1.38861, 9.341997, 0],[0.317029, 14.739025, 0] 13 | # ] 14 | # 15 | # dataMat = np.array(data) 16 | # X=dataMat[:,0:2] 17 | # y = dataMat[:,2] 18 | # # 
神经网络对数据尺度敏感,所以最好在训练前标准化,或者归一化,或者缩放到[-1,1] 19 | # scaler = StandardScaler() # 标准化转换 20 | # scaler.fit(X) # 训练标准化对象 21 | # X = scaler.transform(X) # 转换数据集 22 | # # solver='lbfgs', MLP的求解方法:L-BFGS 在小数据上表现较好,Adam 较为鲁棒,SGD在参数调整较优时会有最佳表现(分类效果与迭代次数);SGD标识随机梯度下降。 23 | # # alpha:L2的参数:MLP是可以支持正则化的,默认为L2,具体参数需要调整 24 | # # hidden_layer_sizes=(5, 2) hidden层2层,第一层5个神经元,第二层2个神经元),2层隐藏层,也就有3层神经网络 25 | # 26 | # clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,2), random_state=1) # 神经网络输入为2,第一隐藏层神经元个数为5,第二隐藏层神经元个数为2,输出结果为2分类。 27 | # clf.fit(X, y) 28 | # print('每层网络层系数矩阵维度:\n',[coef.shape for coef in clf.coefs_]) 29 | # y_pred = clf.predict([[0.317029, 14.739025]]) 30 | # print('预测结果:',y_pred) 31 | # y_pred_pro =clf.predict_proba([[0.317029, 14.739025]]) 32 | # print('预测结果概率:\n',y_pred_pro) 33 | # 34 | # cengindex = 0 35 | # for wi in clf.coefs_: 36 | # cengindex += 1 # 表示底第几层神经网络。 37 | # print('第%d层网络层:' % cengindex) 38 | # print('权重矩阵维度:',wi.shape) 39 | # print('系数矩阵:\n',wi) 40 | # 41 | # 42 | # # 绘制分割区域 43 | # x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 # 寻找每个维度的范围 44 | # y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 # 寻找每个维度的范围 45 | # xx1, xx2 = np.meshgrid(np.arange(x_min, x_max, 0.01),np.arange(y_min, y_max,0.01)) # 在特征范围以0.01位步长预测每一个点的输出结果 46 | # Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()]) # 先形成待测样本的形式,在通过模型进行预测。 47 | # Z = Z.reshape(xx1.shape) # 将输出结果转换为和网格的矩阵形式,以便绘图 48 | # # 绘制区域网格图 49 | # plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 50 | # # 绘制样本点 51 | # plt.scatter(X[:,0],X[:,1],c=y) 52 | # plt.show() 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | # # =============神经网络用于回归============= 61 | 62 | import numpy as np 63 | from sklearn.neural_network import MLPRegressor # 多层线性回归 64 | from sklearn.preprocessing import StandardScaler 65 | data = [ 66 | [ -0.017612,14.053064,14.035452],[ -1.395634, 4.662541, 3.266907],[ -0.752157, 6.53862,5.786463],[ -1.322371, 7.152853, 5.830482], 67 | [0.423363,11.054677,11.47804 ],[0.406704, 7.067335, 7.474039],[0.667394,12.741452,13.408846],[ -2.46015,6.866805, 4.406655], 68 | [0.569411, 9.548755,10.118166],[ -0.026632,10.427743,10.401111],[0.850433, 6.920334, 7.770767],[1.347183,13.1755,14.522683], 69 | [1.176813, 3.16702,4.343833],[ -1.781871, 9.097953, 7.316082],[ -0.566606, 5.749003, 5.182397],[0.931635, 1.589505, 2.52114 ], 70 | [ -0.024205, 6.151823, 6.127618],[ -0.036453, 2.690988, 2.654535],[ -0.196949, 0.444165, 0.247216],[1.014459, 5.754399, 6.768858], 71 | [1.985298, 3.230619, 5.215917],[ -1.693453,-0.55754, -2.250993],[ -0.576525,11.778922,11.202397],[ -0.346811,-1.67873, -2.025541], 72 | [ -2.124484, 2.672471, 0.547987],[1.217916, 9.597015,10.814931],[ -0.733928, 9.098687, 8.364759],[1.416614, 9.619232,11.035846], 73 | [1.38861,9.341997,10.730607],[0.317029,14.739025,15.056054] 74 | ] 75 | 76 | dataMat = np.array(data) 77 | X=dataMat[:,0:2] 78 | y = dataMat[:,2] 79 | scaler = StandardScaler() # 标准化转换 80 | scaler.fit(X) # 训练标准化对象 81 | X = scaler.transform(X) # 转换数据集 82 | 83 | # solver='lbfgs', MLP的求解方法:L-BFGS 在小数据上表现较好,Adam 较为鲁棒,SGD在参数调整较优时会有最佳表现(分类效果与迭代次数);SGD标识随机梯度下降。 84 | # alpha:L2的参数:MLP是可以支持正则化的,默认为L2,具体参数需要调整 85 | # hidden_layer_sizes=(5, 2) hidden层2层,第一层5个神经元,第二层2个神经元),2层隐藏层,也就有3层神经网络 86 | clf = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1) 87 | clf.fit(X, y) 88 | print('预测结果:', clf.predict([[0.317029, 14.739025]])) # 预测某个输入对象 89 | 90 | cengindex = 0 91 | for wi in clf.coefs_: 92 | cengindex += 1 # 表示底第几层神经网络。 93 | print('第%d层网络层:' % cengindex) 94 | print('权重矩阵维度:',wi.shape) 
95 | print('系数矩阵:\n',wi) 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /sk-线性回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np # 快速操作结构数组的工具 2 | import matplotlib.pyplot as plt # 可视化绘制 3 | from sklearn.linear_model import LinearRegression # 线性回归 4 | 5 | 6 | # 样本数据集,第一列为x,第二列为y,在x和y之间建立回归模型 7 | data=[ 8 | [0.067732,3.176513],[0.427810,3.816464],[0.995731,4.550095],[0.738336,4.256571],[0.981083,4.560815], 9 | [0.526171,3.929515],[0.378887,3.526170],[0.033859,3.156393],[0.132791,3.110301],[0.138306,3.149813], 10 | [0.247809,3.476346],[0.648270,4.119688],[0.731209,4.282233],[0.236833,3.486582],[0.969788,4.655492], 11 | [0.607492,3.965162],[0.358622,3.514900],[0.147846,3.125947],[0.637820,4.094115],[0.230372,3.476039], 12 | [0.070237,3.210610],[0.067154,3.190612],[0.925577,4.631504],[0.717733,4.295890],[0.015371,3.085028], 13 | [0.335070,3.448080],[0.040486,3.167440],[0.212575,3.364266],[0.617218,3.993482],[0.541196,3.891471] 14 | ] 15 | 16 | 17 | #生成X和y矩阵 18 | dataMat = np.array(data) 19 | X = dataMat[:,0:1] # 变量x 20 | y = dataMat[:,1] #变量y 21 | 22 | 23 | 24 | # ========线性回归======== 25 | model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 26 | model.fit(X, y) # 线性回归建模 27 | print('系数矩阵:\n',model.coef_) 28 | print('线性回归模型:\n',model) 29 | # 使用模型预测 30 | predicted = model.predict(X) 31 | 32 | # 绘制散点图 参数:x横轴 y纵轴 33 | plt.scatter(X, y, marker='x') 34 | plt.plot(X, predicted,c='r') 35 | 36 | # 绘制x轴和y轴坐标 37 | plt.xlabel("x") 38 | plt.ylabel("y") 39 | 40 | # 显示图形 41 | plt.show() 42 | 43 | -------------------------------------------------------------------------------- /sk-逻辑分类有b偏量.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | 6 | 7 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(三种类别) 8 | data=[ 9 | [-2.68420713, 0.32660731, 0],[-2.71539062, -0.16955685, 0],[-2.88981954, -0.13734561, 0],[-2.7464372, -0.31112432, 0],[-2.72859298, 0.33392456, 0], 10 | [-2.27989736, 0.74778271, 0],[-2.82089068, -0.08210451, 0],[-2.62648199, 0.17040535, 0],[-2.88795857, -0.57079803, 0],[-2.67384469, -0.1066917, 0], 11 | [-2.50652679,0.65193501,0],[-2.61314272,0.02152063,0],[-2.78743398,-0.22774019,0],[-3.22520045,-0.50327991,0],[-2.64354322,1.1861949,0], 12 | [-2.38386932,1.34475434,0],[-2.6225262,0.81808967,0],[-2.64832273,0.31913667,0],[-2.19907796,0.87924409,0],[-2.58734619,0.52047364,0], 13 | [1.28479459, 0.68543919, 1],[0.93241075, 0.31919809, 1],[1.46406132, 0.50418983, 1],[0.18096721, -0.82560394, 1],[1.08713449, 0.07539039, 1], 14 | [0.64043675, -0.41732348, 1],[1.09522371, 0.28389121, 1],[-0.75146714, -1.00110751, 1],[1.04329778, 0.22895691, 1],[-0.01019007, -0.72057487, 1], 15 | [-0.5110862,-1.26249195,1],[0.51109806,-0.10228411,1],[0.26233576,-0.5478933,1],[0.98404455,-0.12436042,1],[-0.174864,-0.25181557,1], 16 | [0.92757294,0.46823621,1],[0.65959279,-0.35197629,1],[0.23454059,-0.33192183,1],[0.94236171,-0.54182226,1],[0.0432464,-0.58148945,1], 17 | [2.53172698, -0.01184224, 2],[1.41407223, -0.57492506, 2],[2.61648461, 0.34193529, 2],[1.97081495, -0.18112569, 2],[2.34975798, -0.04188255, 2], 18 | [3.39687992, 0.54716805, 2],[0.51938325, -1.19135169, 2],[2.9320051, 0.35237701, 2],[2.31967279, -0.24554817, 2],[2.91813423, 0.78038063, 2], 19 | 
[1.66193495,0.2420384,2],[1.80234045,-0.21615461,2],[2.16537886,0.21528028,2],[1.34459422,-0.77641543,2],[1.5852673,-0.53930705,2], 20 | [1.90474358,0.11881899,2],[1.94924878,0.04073026,2],[3.48876538,1.17154454,2],[3.79468686,0.25326557,2],[1.29832982,-0.76101394,2], 21 | ] 22 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(2种类别) 23 | data1=[ 24 | [-0.017612,14.053064,0], 25 | [-1.395634,4.662541,1], 26 | [-0.752157,6.538620,0], 27 | [-1.322371,7.152853,0], 28 | [0.423363,11.054677,0], 29 | [0.406704,7.067335,1], 30 | [0.667394,12.741452,0], 31 | [-2.460150,6.866805,1], 32 | [0.569411,9.548755,0], 33 | [-0.026632,10.427743,0], 34 | [0.850433,6.920334,1], 35 | [1.347183,13.175500,0], 36 | [1.176813,3.167020,1], 37 | [-1.781871,9.097953,0], 38 | [-0.566606,5.749003,1], 39 | [0.931635,1.589505,1], 40 | [-0.024205,6.151823,1], 41 | [-0.036453,2.690988,1], 42 | [-0.196949,0.444165,1], 43 | [1.014459,5.754399,1] 44 | ] 45 | #生成X和y矩阵 46 | dataMat = np.mat(data) 47 | y = dataMat[:,2] # 类别变量 48 | b = np.ones(y.shape) # 添加全1列向量代表b偏量 49 | X = np.column_stack((b, dataMat[:,0:2])) # 特征属性集和b偏量组成x 50 | X = np.mat(X) 51 | 52 | 53 | # 特征数据归一化 54 | # import sklearn.preprocessing as preprocessing #sk的去均值和归一化 55 | # scaler=preprocessing.StandardScaler() 56 | # X = scaler.fit_transform(X) # 对特征数据集去均值和归一化,可以加快机器性能 57 | # X = np.mat(X) 58 | # # print(X) 59 | # ========逻辑回归======== 60 | 61 | from sklearn import metrics 62 | from sklearn.linear_model import LogisticRegression 63 | model = LogisticRegression() 64 | model.fit(X, y) 65 | print('逻辑回归模型:\n',model) 66 | # 使用模型预测 67 | predicted = model.predict(X) #预测分类 68 | answer = model.predict_proba(X) #预测分类概率 69 | print(answer) 70 | 71 | 72 | 73 | import matplotlib.pyplot as plt 74 | 75 | # 绘制边界和散点 76 | # 先产生x1和x2取值范围上的网格点,并预测每个网格点上的值。 77 | h = 0.02 78 | x1_min, x1_max = X[:,1].min() - .5, X[:,1].max() + .5 79 | x2_min, x2_max = X[:,2].min() - .5, X[:,2].max() + .5 80 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h)) 81 | testMat = np.c_[xx1.ravel(), xx2.ravel()] #形成测试特征数据集 82 | testMat = np.column_stack((np.ones(((testMat.shape[0]),1)),testMat)) #添加第一列为全1代表b偏量 83 | testMat = np.mat(testMat) 84 | Z = model.predict(testMat) 85 | 86 | # 绘制区域网格图 87 | Z = Z.reshape(xx1.shape) 88 | plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 89 | 90 | 91 | # 绘制散点图 参数:x横轴 y纵轴,颜色代表分类。x图标为样本点,.表示预测点 92 | plt.scatter(X[:,1].flatten().A[0], X[:,2].flatten().A[0],c=y.flatten().A[0],marker='x') # 绘制样本数据集 93 | plt.scatter(X[:,1].flatten().A[0], X[:,2].flatten().A[0],c=predicted.tolist(),marker='.') # 绘制预测数据集 94 | 95 | # 绘制x轴和y轴坐标 96 | plt.xlabel("x") 97 | plt.ylabel("y") 98 | 99 | # 显示图形 100 | plt.show() -------------------------------------------------------------------------------- /sk-逻辑分类没有b偏量.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import numpy as np # 快速操作结构数组的工具 4 | import pandas as pd # 数据分析处理工具 5 | 6 | 7 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(三种类别) 8 | data=[ 9 | [-2.68420713, 0.32660731, 0],[-2.71539062, -0.16955685, 0],[-2.88981954, -0.13734561, 0],[-2.7464372, -0.31112432, 0],[-2.72859298, 0.33392456, 0], 10 | [-2.27989736, 0.74778271, 0],[-2.82089068, -0.08210451, 0],[-2.62648199, 0.17040535, 0],[-2.88795857, -0.57079803, 0],[-2.67384469, -0.1066917, 0], 11 | [-2.50652679,0.65193501,0],[-2.61314272,0.02152063,0],[-2.78743398,-0.22774019,0],[-3.22520045,-0.50327991,0],[-2.64354322,1.1861949,0], 12 | 
[-2.38386932,1.34475434,0],[-2.6225262,0.81808967,0],[-2.64832273,0.31913667,0],[-2.19907796,0.87924409,0],[-2.58734619,0.52047364,0], 13 | [1.28479459, 0.68543919, 1],[0.93241075, 0.31919809, 1],[1.46406132, 0.50418983, 1],[0.18096721, -0.82560394, 1],[1.08713449, 0.07539039, 1], 14 | [0.64043675, -0.41732348, 1],[1.09522371, 0.28389121, 1],[-0.75146714, -1.00110751, 1],[1.04329778, 0.22895691, 1],[-0.01019007, -0.72057487, 1], 15 | [-0.5110862,-1.26249195,1],[0.51109806,-0.10228411,1],[0.26233576,-0.5478933,1],[0.98404455,-0.12436042,1],[-0.174864,-0.25181557,1], 16 | [0.92757294,0.46823621,1],[0.65959279,-0.35197629,1],[0.23454059,-0.33192183,1],[0.94236171,-0.54182226,1],[0.0432464,-0.58148945,1], 17 | [2.53172698, -0.01184224, 2],[1.41407223, -0.57492506, 2],[2.61648461, 0.34193529, 2],[1.97081495, -0.18112569, 2],[2.34975798, -0.04188255, 2], 18 | [3.39687992, 0.54716805, 2],[0.51938325, -1.19135169, 2],[2.9320051, 0.35237701, 2],[2.31967279, -0.24554817, 2],[2.91813423, 0.78038063, 2], 19 | [1.66193495,0.2420384,2],[1.80234045,-0.21615461,2],[2.16537886,0.21528028,2],[1.34459422,-0.77641543,2],[1.5852673,-0.53930705,2], 20 | [1.90474358,0.11881899,2],[1.94924878,0.04073026,2],[3.48876538,1.17154454,2],[3.79468686,0.25326557,2],[1.29832982,-0.76101394,2], 21 | ] 22 | # 样本数据集,第一列为x1,第二列为x2,第三列为分类(2种类别) 23 | data1=[ 24 | [-0.017612,14.053064,0], 25 | [-1.395634,4.662541,1], 26 | [-0.752157,6.538620,0], 27 | [-1.322371,7.152853,0], 28 | [0.423363,11.054677,0], 29 | [0.406704,7.067335,1], 30 | [0.667394,12.741452,0], 31 | [-2.460150,6.866805,1], 32 | [0.569411,9.548755,0], 33 | [-0.026632,10.427743,0], 34 | [0.850433,6.920334,1], 35 | [1.347183,13.175500,0], 36 | [1.176813,3.167020,1], 37 | [-1.781871,9.097953,0], 38 | [-0.566606,5.749003,1], 39 | [0.931635,1.589505,1], 40 | [-0.024205,6.151823,1], 41 | [-0.036453,2.690988,1], 42 | [-0.196949,0.444165,1], 43 | [1.014459,5.754399,1] 44 | ] 45 | 46 | #生成X和y矩阵 47 | dataMat = np.mat(data) 48 | X = dataMat[:,0:2] # 特征数据集 49 | y = dataMat[:,2] # 类别变量 50 | 51 | 52 | import sklearn.preprocessing as preprocessing #sk的去均值和归一化 53 | scaler=preprocessing.StandardScaler() 54 | X = scaler.fit_transform(X) # 对特征数据集去均值和归一化,可以加快机器性能 55 | X = np.mat(X) 56 | print(X) 57 | 58 | # ========逻辑回归======== 59 | 60 | from sklearn import metrics 61 | from sklearn.linear_model import LogisticRegression 62 | model = LogisticRegression() 63 | model.fit(X, y) 64 | print('逻辑回归模型:\n',model) 65 | # 使用模型预测 66 | predicted = model.predict(X) #预测分类 67 | answer = model.predict_proba(X) #预测分类概率 68 | print(answer) 69 | 70 | 71 | 72 | import matplotlib.pyplot as plt 73 | 74 | # 绘制边界和散点 75 | # 先产生x1和x2取值范围上的网格点,并预测每个网格点上的值。 76 | h = 0.02 77 | x1_min, x1_max = X[:,0].min() - .5, X[:,0].max() + .5 78 | x2_min, x2_max = X[:,1].min() - .5, X[:,1].max() + .5 79 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h)) 80 | Z = model.predict(np.c_[xx1.ravel(), xx2.ravel()]) 81 | 82 | # 绘制区域网格图 83 | Z = Z.reshape(xx1.shape) 84 | print(type(Z)) 85 | plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired) 86 | 87 | 88 | # 绘制散点图 参数:x横轴 y纵轴,颜色代表分类。x图标为样本点,.表示预测点 89 | plt.scatter(X[:,0].flatten().A[0], X[:,1].flatten().A[0],c=y.flatten().A[0],marker='x') # 绘制样本数据集 90 | plt.scatter(X[:,0].flatten().A[0], X[:,1].flatten().A[0],c=predicted.tolist(),marker='.') # 绘制预测数据集 91 | 92 | # 绘制x轴和y轴坐标 93 | plt.xlabel("x") 94 | plt.ylabel("y") 95 | 96 | # 显示图形 97 | plt.show() -------------------------------------------------------------------------------- /sk-随机梯度下降.py: 
-------------------------------------------------------------------------------- 1 | # #===============随机梯度下降法分类=============== 2 | # 3 | # from sklearn.linear_model import SGDClassifier 4 | # from sklearn.datasets.samples_generator import make_blobs 5 | # import numpy as np 6 | # import matplotlib.pyplot as plt 7 | # 8 | # X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60) 9 | # # loss:损失项。hinge:(软-间隔)线性支持向量机,modified_huber:平滑的 hinge 损失,log:logistic 回归,其他所有的回归损失 10 | # # penalty:惩罚项。l2:L2正则,l1:L1正则,elasticnet:(1 - l1_ratio) * L2 + l1_ratio * L1 11 | # clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200, fit_intercept=True) # 12 | # clf.fit(X, y) # 训练模型 13 | # 14 | # print('回归系数:',clf.coef_) 15 | # print('偏差:',clf.intercept_ ) 16 | # 17 | # # 绘制线,点 18 | # xx1 = np.linspace(-1, 5, 10) 19 | # xx2 = np.linspace(-1, 5, 10) 20 | # 21 | # X1, X2 = np.meshgrid(xx1, xx2) # X1、X2都是10*10的矩阵 22 | # Z = np.empty(X1.shape) 23 | # for (i, j), val in np.ndenumerate(X1): # 迭代第i行第j列的坐标xx1取值为val 24 | # x1 = val 25 | # x2 = X2[i, j] # 26 | # p = clf.decision_function([[x1, x2]]) # 计算输出值,也就是到超平面的符号距离。(支持向量到最佳超平面的符号距离为-1和+1) 27 | # Z[i, j] = p[0] 28 | # levels = [-1.0, 0.0, 1.0] # 将输出值分为-1,0,1几个区间 29 | # linestyles = ['dashed', 'solid', 'dashed'] 30 | # plt.contour(X1, X2, Z, levels, colors='k', linestyles=linestyles) # 绘制等高线图,高度为-1,0,1,也就是支持向量形成的线和最佳分割超平面 31 | # plt.scatter(X[:, 0], X[:, 1], c=y, s=20) # 绘制样本点 32 | # plt.show() 33 | 34 | 35 | 36 | # # ==============随机梯度下降法进行多分类============= 37 | # from sklearn.linear_model import SGDClassifier 38 | # from sklearn.metrics import accuracy_score 39 | # from sklearn import datasets 40 | # iris = datasets.load_iris() 41 | # X,y=iris.data,iris.target 42 | # clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y) 43 | # y_pred = clf.predict(X) 44 | # print('三分类花卉数据准确率:',accuracy_score(y,y_pred)) 45 | # print('包含的二分类器索引:',clf.classes_) # one versus all 方法来组合多个二分类器 46 | # print('回归系数:',clf.coef_) # 每一个二分类器的回归系数 47 | # print('偏差:',clf.intercept_ ) # 每一个二分类器的偏差 48 | 49 | 50 | 51 | # #===============随机梯度下降法回归=============== 52 | from sklearn import linear_model 53 | from sklearn.datasets import load_boston 54 | X,y = load_boston().data,load_boston().target 55 | clf = linear_model.SGDRegressor(loss='squared_loss',penalty='l2',alpha=0.01,max_iter=1000) 56 | clf.fit(X, y) 57 | print('得分:',clf.score(X,y)) 58 | print('回归系数:',clf.coef_) 59 | print('偏差:',clf.intercept_ ) 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /sk-集成学习.py: -------------------------------------------------------------------------------- 1 | # 产生样本数据集 2 | from sklearn.model_selection import cross_val_score 3 | from sklearn import datasets 4 | iris = datasets.load_iris() 5 | X, y = iris.data[:, 1:3], iris.target 6 | 7 | # # ==================Bagging 元估计器============= 8 | # from sklearn.ensemble import BaggingClassifier 9 | # from sklearn.neighbors import KNeighborsClassifier 10 | # bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) 11 | # scores = cross_val_score(bagging, X, y) 12 | # print('Bagging准确率:',scores.mean()) 13 | # 14 | # # ==================决策树、随机森林、极限森林对比=============== 15 | # 16 | # # 决策树 17 | # from sklearn.tree import DecisionTreeClassifier 18 | # clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0) 19 | # scores = cross_val_score(clf, X, y) 20 | # print('决策树准确率:',scores.mean()) 21 | # 22 | # # 随机森林 23 | # from sklearn.ensemble import 
--------------------------------------------------------------------------------
/sk-集成学习.py:
--------------------------------------------------------------------------------
# Build the sample data set
from sklearn.model_selection import cross_val_score
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# # ================== Bagging meta-estimator =============
# from sklearn.ensemble import BaggingClassifier
# from sklearn.neighbors import KNeighborsClassifier
# bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
# scores = cross_val_score(bagging, X, y)
# print('Bagging准确率:',scores.mean())
#
# # ================== Decision tree vs. random forest vs. extremely randomized trees ===============
#
# # Decision tree
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0)
# scores = cross_val_score(clf, X, y)
# print('决策树准确率:',scores.mean())
#
# # Random forest
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=10,max_features=2)
# scores = cross_val_score(clf, X, y)
# print('随机森林准确率:',scores.mean())
#
# # Extremely randomized trees
# from sklearn.ensemble import ExtraTreesClassifier
# clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
# scores = cross_val_score(clf, X, y)
# print('极限随机树准确率:',scores.mean())
#
# print('模型中各属性的重要程度:',clf.feature_importances_)
#
#
# # ==================== AdaBoost =========================
# from sklearn.ensemble import AdaBoostClassifier
# clf = AdaBoostClassifier(n_estimators=100)
# scores = cross_val_score(clf, X, y)
# print('AdaBoost准确率:',scores.mean())
#
#
# # ==================== Gradient Tree Boosting =========================
# # Classification
# from sklearn.ensemble import GradientBoostingClassifier
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
# scores = cross_val_score(clf, X, y)
# print('GDBT分类准确率:',scores.mean())
#
# # Regression
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.metrics import mean_squared_error
# from sklearn.datasets import load_boston
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.utils import shuffle
# from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
#
# boston = load_boston()  # load the Boston house-price regression data
# X1, y1 = shuffle(boston.data, boston.target, random_state=13)  # shuffle the data set
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.1, random_state=0)  # split into train and test sets; test_size is the fraction held out for testing
# clf = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,max_depth=4,min_samples_split=2,loss='ls')
# clf.fit(X_train, y_train)  # fit on the training split only, so the MSE below is measured on unseen data
# print('GDBT回归MSE:',mean_squared_error(y_test, clf.predict(X_test)))
# # print('每次训练的得分记录:',clf.train_score_)
# print('各特征的重要程度:',clf.feature_importances_)
# plt.plot(np.arange(500), clf.train_score_, 'b-')  # training score at each boosting iteration
# plt.show()



# ==================== Voting Classifier =========================

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')  # unweighted majority ('hard') voting
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[2,1,2])  # weighted soft voting (this assignment replaces the hard-voting ensemble above)

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf,X,y,cv=5, scoring='accuracy')
    print("准确率: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Combine with grid search
from sklearn.model_selection import GridSearchCV
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}  # search for the best C of the 'lr' member and n_estimators of the 'rf' member
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(iris.data, iris.target)
print('最优参数:',grid.best_params_)
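# --- Editor's sketch (not part of the original script): with voting='soft' the fitted ensemble
# --- averages the members' class probabilities, so it can also report them directly:
eclf.fit(X, y)                                               # fit the soft-voting ensemble on the 2-feature data
print('soft-voting probabilities for the first sample:', eclf.predict_proba(X[:1]))
print('predicted class:', eclf.predict(X[:1]))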
--------------------------------------------------------------------------------
/sk-预处理.py:
--------------------------------------------------------------------------------
from sklearn import preprocessing
import numpy as np

X_train = np.array([[ 1., -1., -2.],
                    [ 2., 0., 0.],
                    [ 3., 1., -1.]])
X_test = [[-1., 1., 0.]]


# =============== Standardization ====================
# Learn the scale of the data set (the per-column mean and standard deviation)
scaler = preprocessing.StandardScaler().fit(X_train)  # compute the mean and the scale
print('均值:',scaler.mean_ )
print('方差:',scaler.scale_ )

# Apply the learned scale to another data set (which can of course be the training set itself).
X_scaled = scaler.transform(X_train)
print('均值:',X_scaled.mean(axis=0))  # transform produces columns with zero mean
print('方差:',X_scaled.std(axis=0))   # transform produces columns with unit variance

# The two steps combined: scale the samples so that each column has zero mean and unit variance
X_scaled = preprocessing.scale(X_train,axis=0)  # standardization: remove the mean and scale to unit variance
print('均值:',X_scaled.mean(axis=0))
print('方差:',X_scaled.std(axis=0))

# ===================== Feature scaling ====================
# MinMaxScaler scales features to a given range (0-1 by default)
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)  # fit and transform in one step
print('每列最大值:',X_train_minmax.max(axis=0))  # the maximum of every column is 1
print('每列最小值:',X_train_minmax.min(axis=0))  # the minimum of every column is 0
# The scaler records the shift and the scale factor that it applies to the data
print('先平移:',min_max_scaler.min_)
print('再缩放:',min_max_scaler.scale_)

X_test_minmax = min_max_scaler.transform(X_test)  # apply the same shift and scaling to the test data



# MaxAbsScaler divides each feature by its maximum absolute value, scaling the training data into [-1, 1]. It can be applied to sparse matrices and preserves their sparsity.
X_train = np.array([[ 0., -1., 0.],
                    [ 0., 0., 0.2],
                    [ 2., 0., 0]])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
print('每列最大值:',X_train_maxabs.max(axis=0))  # the maximum of every column is 1
print('每列最小值:',X_train_maxabs.min(axis=0))  # the minimum of every column is no less than -1
print('缩放比例:',max_abs_scaler.scale_)
X_test_maxabs = max_abs_scaler.transform(X_test)  # apply the same scaling to the test data
print('缩放后的矩阵仍然具有稀疏性:\n',X_train_maxabs)



# =================== Scaling data with outliers ========================
X_train = np.array([[ 1., -11., -2.],
                    [ 2., 2., 0.],
                    [ 13., 1., -11.]])
robust_scale = preprocessing.RobustScaler()
X_train_robust = robust_scale.fit_transform(X_train)  # fit and transform in one step
print('缩放后的矩阵离群点被处理了:\n',X_train_robust)




# =================== Non-linear transformation ===================
X_train = np.array([[ 1., -1., -2.],
                    [ 2., 0., 0.],
                    [ 3., 1., -1.]])
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)  # maps the data onto a uniform distribution over [0, 1] (uniform is the default)
X_train_trans = quantile_transformer.fit_transform(X_train)

# Inspect the quantiles: the quantile structure is essentially preserved by the transformation
print('源分位数情况:',np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]))
print('变换后分位数情况:',np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]))

# The following maps the data onto a normal distribution: the median of the input becomes the mean of the output, centred at 0. The outputs are clipped so that the minimum and maximum of the input map to the 1e-7 and 1-1e-7 quantiles respectively
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal',random_state=0)


X = [[ 1., -1., 2.],
     [ 2., 0., 0.],
     [ 0., 1., -1.]]
# =================== Sample normalization ===================
X_normalized = preprocessing.normalize(X, norm='l1')  # use the l1 or l2 norm; each sample (row) is scaled to unit l1 or l2 norm
print('样本归一化:\n',X_normalized)
# You can also fit a transformer on some data first and then use it to normalize other data
normalizer = preprocessing.Normalizer().fit(X)  # obtain the transformer
normalizer.transform(X)  # transform any data: X or a test set
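# --- Editor's sketch (not part of the original script): in practice a scaler is usually
# --- combined with the estimator in a Pipeline, so the parameters learned on the training
# --- data are re-applied automatically at prediction time. A minimal example on the iris data:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
iris_X, iris_y = load_iris(return_X_y=True)
pipe = Pipeline([('scale', preprocessing.StandardScaler()), ('clf', LogisticRegression(max_iter=200))])
print('cross-validated accuracy with scaling:', cross_val_score(pipe, iris_X, iris_y, cv=5).mean())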
# =================== Feature binarization ===================
binarizer = preprocessing.Binarizer().fit(X)  # obtain the transformer; the threshold defaults to 0
print(binarizer)
# binarizer = preprocessing.Binarizer(threshold=1)  # custom transformer: values above the threshold become 1, values at or below it become 0
X_normalized = binarizer.transform(X)  # transform any data: X or a test set
print('特征二值化:\n',X_normalized)



# =================== Categorical feature encoding (one-hot encoding) ===================
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 1, 2],   # one categorical feature per column, each with its own set of values
         [1, 0, 0],
         [0, 2, 1],
         [1, 0, 1]])
print('取值范围整数个数:',enc.n_values_)  # number of distinct values of each feature: 2, 3, 3 (newer sklearn exposes enc.categories_ instead)
print('编码后:',enc.transform([[0, 1, 1]]).toarray())  # encode a sample: the first 2 positions are the one-hot code of the first feature, the next 3 the second feature, the last 3 the third feature
print('特征开始位置的索引:',enc.feature_indices_)  # cumulative sum of n_values_: the start position of each encoded feature: 0, 2, 5, 8


# =================== Missing-value imputation ===================
from sklearn.preprocessing import Imputer   # note: newer sklearn replaces this with sklearn.impute.SimpleImputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # values equal to missing_values are treated as missing and ignored when computing the mean
imp.fit([[1, 2],        # compute the mean of the non-missing values in each column
         [np.nan, 3],
         [7, 6]])

X = [[np.nan, 2], [6, np.nan], [7, 6]]
print('缺失值插值后:\n',imp.transform(X))  # fill the missing values in each column with that column's mean


# =================== Generating polynomial features ===================
from sklearn.preprocessing import PolynomialFeatures
X = np.array([[0, 1],
              [2, 3],
              [4, 5]])
poly = PolynomialFeatures(2,interaction_only=False)  # degree 2 at most; with interaction_only=True only the interaction terms are kept
print('生成多项式:\n',poly.fit_transform(X))  # maps (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1*X_2, X_2^2)

--------------------------------------------------------------------------------