├── 大作业
│   └── test.ipynb
├── 期末考试
│   └── exam.ipynb
├── 随堂测试
│   └── quiz.ipynb
├── 01_Supervised_learning
│   ├── 01_06_Naive_bayes.ipynb
│   ├── 01_10_Feature_selection.ipynb
│   ├── 01_11_Neural_network.ipynb
│   ├── 01_04_Stochastic_gradient_descent.ipynb
│   ├── 01_09_Multiclass_and_multioutput.ipynb
│   ├── 01_02_Logistic_regression.ipynb
│   ├── 01_05_Knn.ipynb
│   ├── 01_08_Random_forests.ipynb
│   └── 01_03_Svm.ipynb
├── 04_Visualizations
│   └── 04_01_Visualizations.ipynb
├── 07_Model_persistence
│   └── 07_01_Model_persistence.ipynb
├── 05_Dataset_transformations
│   ├── 05_06_Random_projection.ipynb
│   ├── 05_05_Unsupervised_dimensionality_reduction.ipynb
│   ├── 05_04_Imputation_of_missing_values.ipynb
│   └── 05_02_Feature_extraction.ipynb
├── 02_Unsupervised_learning
│   └── 02_02_Neural_network_models _unsupervised.ipynb
├── 06_Dataset_loading
│   ├── README.MD
│   ├── 06_02_Real_world_datasets.ipynb
│   ├── 06_03_Generated datasets.ipynb
│   ├── 06_01_Toy_datasets.ipynb
│   └── 06_04_load_files.ipynb
├── _Datasets
│   ├── iris_miss.data
│   ├── iris_noheader.data
│   └── iris.data
├── README.md
├── LICENSE
└── 00_Python_basics
    ├── 00_04_EDA.ipynb
    └── 00_01_Numpy_basic.ipynb

/大作业/test.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/期末考试/exam.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/随堂测试/quiz.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_06_Naive_bayes.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/04_Visualizations/04_01_Visualizations.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_10_Feature_selection.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_11_Neural_network.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/07_Model_persistence/07_01_Model_persistence.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/05_Dataset_transformations/05_06_Random_projection.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_04_Stochastic_gradient_descent.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_09_Multiclass_and_multioutput.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/02_Unsupervised_learning/02_02_Neural_network_models _unsupervised.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/06_Dataset_loading/README.MD:
--------------------------------------------------------------------------------
1 | ## UCI Datasets
2 | https://archive.ics.uci.edu/
--------------------------------------------------------------------------------
/05_Dataset_transformations/05_05_Unsupervised_dimensionality_reduction.ipynb:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/06_Dataset_loading/06_02_Real_world_datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from sklearn.datasets import fetch_california_housing"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "### 6.2.1 The California housing dataset\n",
19 |     "#sklearn.datasets.fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False)\n",
20 |     "housing = fetch_california_housing(download_if_missing=True) # may fail to load due to network issues; the dataset can be downloaded manually and loaded with pandas\n",
21 |     "print(housing.data.shape)\n",
22 |     "print(housing.target.shape)"
23 |    ]
24 |   },
25 |   {
26 |    "cell_type": "code",
27 |    "execution_count": null,
28 |    "metadata": {},
29 |    "outputs": [],
30 |    "source": [
31 |     "## Summary:\n",
32 |     "\n",
33 |     "#1. Loading works the same way as for the toy datasets (e.g. the iris data); there is simply more data.\n",
34 |     "#2. Real-world data may fail to download (anti-crawler measures and other network issues). These are all public datasets, so you can download them manually and load them with read_csv or similar.\n"
35 |    ]
36 |   }
37 |  ],
38 |  "metadata": {
39 |   "kernelspec": {
40 |    "display_name": "Python 3",
41 |    "language": "python",
42 |    "name": "python3"
43 |   },
44 |   "language_info": {
45 |    "codemirror_mode": {
46 |     "name": "ipython",
47 |     "version": 3
48 |    },
49 |    "file_extension": ".py",
50 |    "mimetype": "text/x-python",
51 |    "name": "python",
52 |    "nbconvert_exporter": "python",
53 |    "pygments_lexer": "ipython3",
54 |    "version": "3.11.8"
55 |   }
56 |  },
57 |  "nbformat": 4,
58 |  "nbformat_minor": 2
59 | }
60 | 
--------------------------------------------------------------------------------
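
The summary above suggests falling back to pandas when fetch_california_housing cannot reach the network. A minimal sketch of that fallback, assuming a manually downloaded copy saved as california_housing.csv (hypothetical path) whose header contains the eight feature columns plus a MedHouseVal target column:

```python
# Hedged sketch of the pandas fallback mentioned in the summary above.
# Assumes "california_housing.csv" (hypothetical local path) has a header row
# with the 8 feature columns and "MedHouseVal" as the target column.
import pandas as pd

df = pd.read_csv("california_housing.csv")
X = df.drop(columns=["MedHouseVal"])  # features, expected shape (20640, 8)
y = df["MedHouseVal"]                 # regression target
print(X.shape, y.shape)
```
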
/06_Dataset_loading/06_03_Generated datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from sklearn.datasets import make_blobs"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": 4,
15 |    "metadata": {},
16 |    "outputs": [
17 |     {
18 |      "name": "stdout",
19 |      "output_type": "stream",
20 |      "text": [
21 |       "(10, 2)\n"
22 |      ]
23 |     }
24 |    ],
25 |    "source": [
26 |     "### 6.3.1 Generating isotropic Gaussian blobs for clustering\n",
27 |     "## sklearn.datasets.make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, \n",
28 |     "# center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False)\n",
29 |     "X, y = make_blobs(n_samples=10, centers=3, n_features=2,random_state=0)\n",
30 |     "print(X.shape)"
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "code",
35 |    "execution_count": 5,
36 |    "metadata": {},
37 |    "outputs": [
38 |     {
39 |      "data": {
40 |       "text/plain": [
41 |        "array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])"
42 |       ]
43 |      },
44 |      "execution_count": 5,
45 |      "metadata": {},
46 |      "output_type": "execute_result"
47 |     }
48 |    ],
49 |    "source": [
50 |     "y"
51 |    ]
52 |   },
53 |   {
54 |    "cell_type": "code",
55 |    "execution_count": null,
56 |    "metadata": {},
57 |    "outputs": [],
58 |    "source": [
59 |     "## Summary\n",
60 |     "\n",
61 |     "#1. Synthetic data can be generated to evaluate the effect of dataset scale (n_samples and n_features)\n",
62 |     "# while controlling the statistical properties of the data (typically the correlation and informativeness of the features).\n",
63 |     "#2. Only make_blobs is shown here; for more generators see https://scikit-learn.org/stable/datasets/sample_generators.html"
64 |    ]
65 |   }
66 |  ],
67 |  "metadata": {
68 |   "kernelspec": {
69 |    "display_name": "Python 3",
70 |    "language": "python",
71 |    "name": "python3"
72 |   },
73 |   "language_info": {
74 |    "codemirror_mode": {
75 |     "name": "ipython",
76 |     "version": 3
77 |    },
78 |    "file_extension": ".py",
79 |    "mimetype": "text/x-python",
80 |    "name": "python",
81 |    "nbconvert_exporter": "python",
82 |    "pygments_lexer": "ipython3",
83 |    "version": "3.11.8"
84 |   }
85 |  },
86 |  "nbformat": 4,
87 |  "nbformat_minor": 2
88 | }
89 | 
--------------------------------------------------------------------------------
/01_Supervised_learning/01_02_Logistic_regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 11,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from sklearn import datasets, linear_model, neighbors\n",
10 |     "from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score\n",
11 |     "\n",
12 |     "#class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, \n",
13 |     "#intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, \n",
14 |     "#warm_start=False, n_jobs=None, l1_ratio=None)"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 26,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stdout",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "accuracy_score: \n",
27 |       " 0.9277777777777778\n",
28 |       "precision_score: 0.93\n",
29 |       "recall_score: 0.93\n",
30 |       "f1_score: 0.93\n"
31 |      ]
32 |     }
33 |    ],
34 |    "source": [
35 |     "# Load the digits handwritten-digit dataset\n",
36 |     "X_digits, y_digits = datasets.load_digits(return_X_y=True)\n",
37 |     "#X_digits = X_digits / X_digits.max()\n",
38 |     "\n",
39 |     "n_samples = len(X_digits)\n",
40 |     "\n",
41 |     "# Train/test split\n",
42 |     "X_train = X_digits[: int(0.9 * n_samples)] # first 90% of the data for training, the rest for testing\n",
43 |     "y_train = y_digits[: int(0.9 * n_samples)]\n",
44 |     "X_test = X_digits[int(0.9 * n_samples) :]\n",
45 |     "y_test = y_digits[int(0.9 * n_samples) :]\n",
46 |     "\n",
47 |     "# Create the model\n",
48 |     "logistic = linear_model.LogisticRegression(max_iter=1000,penalty=\"l2\") # max_iter: maximum number of iterations\n",
49 |     "                                                                       # penalty: regularization, l1 or l2\n",
50 |     " \n",
51 |     "# Train the model\n",
52 |     "logistic.fit(X_train, y_train)\n",
53 |     "\n",
54 |     "# Predict\n",
55 |     "y_pred = logistic.predict(X_test)\n",
56 |     "\n",
57 |     "## Evaluation metrics\n",
58 |     "# accuracy\n",
59 |     "print(\"accuracy_score: \\n\", accuracy_score(y_test,y_pred)) # can be misleading under imbalanced class distributions\n",
60 |     "\n",
61 |     "# precision\n",
62 |     "print(\"precision_score: %.2f\" % precision_score(y_test,y_pred,average='macro')) # multiclass: the average parameter is required\n",
63 |     "\n",
64 |     "# recall\n",
65 |     "print(\"recall_score: %.2f\" % recall_score(y_test,y_pred,average='macro'))\n",
66 |     "\n",
67 |     "# f1\n",
68 |     "print(\"f1_score: %.2f\" % f1_score(y_test,y_pred,average='macro'))\n"
69 |    ]
70 |   }
71 |  ],
72 |  "metadata": {
73 |   "kernelspec": {
74 |    "display_name": "Python 3",
75 |    "language": "python",
76 |    "name": "python3"
77 |   },
78 |   "language_info":
{ 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.11.8" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /_Datasets/iris_miss.data: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,,1.4,0.2,setosa 3 | 4.9,3.0,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,,1.5,0.2,setosa 6 | 5.0,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5.0,3.4,1.5,0.2, 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2, 14 | 4.8,3.0,1.4,0.1,setosa 15 | 4.3,3.0,1.1,0.1,setosa 16 | 5.8,4.0,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | ,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1.0,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | ,3.4,1.9,0.2,setosa 27 | 5.0,3.0,1.6,0.2,setosa 28 | 5.0,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 5.0,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3.0,1.3,0.2,setosa 41 | 5.1,3.4,1.5,,setosa 42 | 5.0,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5.0,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3.0,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | ,3.7,1.5,0.2,setosa 51 | 5.0,3.3,1.4,0.2,setosa 52 | 7.0,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,,4.9,1.5,versicolor 55 | 5.5,2.3,4.0,1.3,versicolor 56 | 6.5,2.8,4.6,1.5, 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1.0,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5.0,2.0,3.5,1.0,versicolor 63 | 5.9,3.0,4.2,1.5,versicolor 64 | 6.0,2.2,4.0,1.0,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3.0,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1.0,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4.0,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3.0,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3.0,5.0,1.7,versicolor 80 | 6.0,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1.0,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1.0,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6.0,2.7,5.1,1.6,versicolor 86 | 5.4,3.0,4.5,1.5,versicolor 87 | 6.0,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5, 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3.0,4.1,1.3,versicolor 91 | 5.5,2.5,4.0,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,,4.6,1.4,versicolor 94 | 5.8,2.6,4.0,1.2,versicolor 95 | 5.0,2.3,3.3,1.0,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3.0,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3.0,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6.0,2.5,virginica 
103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3.0,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3.0,5.8,2.2,virginica 107 | 7.6,3.0,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2.0,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3.0,5.5,2.1,virginica 115 | 5.7,2.5,5.0,2.0,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3.0,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2, 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6.0,2.2,5.0,1.5,virginica 122 | 6.9,3.2,,2.3,virginica 123 | 5.6,2.8,4.9,2.0,virginica 124 | 7.7,2.8,6.7,2.0,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6.0,1.8,virginica 128 | 6.2,,4.8,1.8,virginica 129 | 6.1,3.0,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3.0,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2.0,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3.0,6.1,2.3,virginica 138 | 6.3,,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6.0,3.0,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3.0,5.2,2.3,virginica 148 | 6.3,2.5,,1.9,virginica 149 | 6.5,3.0,5.2,2.0,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3.0,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /_Datasets/iris_noheader.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,setosa 2 | 4.9,3.0,1.4,0.2,setosa 3 | 4.7,3.2,1.3,0.2,setosa 4 | 4.6,3.1,1.5,0.2,setosa 5 | 5.0,3.6,1.4,0.2,setosa 6 | 5.4,3.9,1.7,0.4,setosa 7 | 4.6,3.4,1.4,0.3,setosa 8 | 5.0,3.4,1.5,0.2,setosa 9 | 4.4,2.9,1.4,0.2,setosa 10 | 4.9,3.1,1.5,0.1,setosa 11 | 5.4,3.7,1.5,0.2,setosa 12 | 4.8,3.4,1.6,0.2,setosa 13 | 4.8,3.0,1.4,0.1,setosa 14 | 4.3,3.0,1.1,0.1,setosa 15 | 5.8,4.0,1.2,0.2,setosa 16 | 5.7,4.4,1.5,0.4,setosa 17 | 5.4,3.9,1.3,0.4,setosa 18 | 5.1,3.5,1.4,0.3,setosa 19 | 5.7,3.8,1.7,0.3,setosa 20 | 5.1,3.8,1.5,0.3,setosa 21 | 5.4,3.4,1.7,0.2,setosa 22 | 5.1,3.7,1.5,0.4,setosa 23 | 4.6,3.6,1.0,0.2,setosa 24 | 5.1,3.3,1.7,0.5,setosa 25 | 4.8,3.4,1.9,0.2,setosa 26 | 5.0,3.0,1.6,0.2,setosa 27 | 5.0,3.4,1.6,0.4,setosa 28 | 5.2,3.5,1.5,0.2,setosa 29 | 5.2,3.4,1.4,0.2,setosa 30 | 4.7,3.2,1.6,0.2,setosa 31 | 4.8,3.1,1.6,0.2,setosa 32 | 5.4,3.4,1.5,0.4,setosa 33 | 5.2,4.1,1.5,0.1,setosa 34 | 5.5,4.2,1.4,0.2,setosa 35 | 4.9,3.1,1.5,0.1,setosa 36 | 5.0,3.2,1.2,0.2,setosa 37 | 5.5,3.5,1.3,0.2,setosa 38 | 4.9,3.1,1.5,0.1,setosa 39 | 4.4,3.0,1.3,0.2,setosa 40 | 5.1,3.4,1.5,0.2,setosa 41 | 5.0,3.5,1.3,0.3,setosa 42 | 4.5,2.3,1.3,0.3,setosa 43 | 4.4,3.2,1.3,0.2,setosa 44 | 5.0,3.5,1.6,0.6,setosa 45 | 5.1,3.8,1.9,0.4,setosa 46 | 4.8,3.0,1.4,0.3,setosa 47 | 5.1,3.8,1.6,0.2,setosa 48 | 4.6,3.2,1.4,0.2,setosa 49 | 5.3,3.7,1.5,0.2,setosa 50 | 5.0,3.3,1.4,0.2,setosa 51 | 7.0,3.2,4.7,1.4,versicolor 52 | 6.4,3.2,4.5,1.5,versicolor 53 | 6.9,3.1,4.9,1.5,versicolor 54 | 5.5,2.3,4.0,1.3,versicolor 55 | 6.5,2.8,4.6,1.5,versicolor 56 | 5.7,2.8,4.5,1.3,versicolor 57 | 6.3,3.3,4.7,1.6,versicolor 58 | 4.9,2.4,3.3,1.0,versicolor 59 | 6.6,2.9,4.6,1.3,versicolor 60 | 5.2,2.7,3.9,1.4,versicolor 61 | 5.0,2.0,3.5,1.0,versicolor 62 | 5.9,3.0,4.2,1.5,versicolor 63 | 6.0,2.2,4.0,1.0,versicolor 
64 | 6.1,2.9,4.7,1.4,versicolor 65 | 5.6,2.9,3.6,1.3,versicolor 66 | 6.7,3.1,4.4,1.4,versicolor 67 | 5.6,3.0,4.5,1.5,versicolor 68 | 5.8,2.7,4.1,1.0,versicolor 69 | 6.2,2.2,4.5,1.5,versicolor 70 | 5.6,2.5,3.9,1.1,versicolor 71 | 5.9,3.2,4.8,1.8,versicolor 72 | 6.1,2.8,4.0,1.3,versicolor 73 | 6.3,2.5,4.9,1.5,versicolor 74 | 6.1,2.8,4.7,1.2,versicolor 75 | 6.4,2.9,4.3,1.3,versicolor 76 | 6.6,3.0,4.4,1.4,versicolor 77 | 6.8,2.8,4.8,1.4,versicolor 78 | 6.7,3.0,5.0,1.7,versicolor 79 | 6.0,2.9,4.5,1.5,versicolor 80 | 5.7,2.6,3.5,1.0,versicolor 81 | 5.5,2.4,3.8,1.1,versicolor 82 | 5.5,2.4,3.7,1.0,versicolor 83 | 5.8,2.7,3.9,1.2,versicolor 84 | 6.0,2.7,5.1,1.6,versicolor 85 | 5.4,3.0,4.5,1.5,versicolor 86 | 6.0,3.4,4.5,1.6,versicolor 87 | 6.7,3.1,4.7,1.5,versicolor 88 | 6.3,2.3,4.4,1.3,versicolor 89 | 5.6,3.0,4.1,1.3,versicolor 90 | 5.5,2.5,4.0,1.3,versicolor 91 | 5.5,2.6,4.4,1.2,versicolor 92 | 6.1,3.0,4.6,1.4,versicolor 93 | 5.8,2.6,4.0,1.2,versicolor 94 | 5.0,2.3,3.3,1.0,versicolor 95 | 5.6,2.7,4.2,1.3,versicolor 96 | 5.7,3.0,4.2,1.2,versicolor 97 | 5.7,2.9,4.2,1.3,versicolor 98 | 6.2,2.9,4.3,1.3,versicolor 99 | 5.1,2.5,3.0,1.1,versicolor 100 | 5.7,2.8,4.1,1.3,versicolor 101 | 6.3,3.3,6.0,2.5,virginica 102 | 5.8,2.7,5.1,1.9,virginica 103 | 7.1,3.0,5.9,2.1,virginica 104 | 6.3,2.9,5.6,1.8,virginica 105 | 6.5,3.0,5.8,2.2,virginica 106 | 7.6,3.0,6.6,2.1,virginica 107 | 4.9,2.5,4.5,1.7,virginica 108 | 7.3,2.9,6.3,1.8,virginica 109 | 6.7,2.5,5.8,1.8,virginica 110 | 7.2,3.6,6.1,2.5,virginica 111 | 6.5,3.2,5.1,2.0,virginica 112 | 6.4,2.7,5.3,1.9,virginica 113 | 6.8,3.0,5.5,2.1,virginica 114 | 5.7,2.5,5.0,2.0,virginica 115 | 5.8,2.8,5.1,2.4,virginica 116 | 6.4,3.2,5.3,2.3,virginica 117 | 6.5,3.0,5.5,1.8,virginica 118 | 7.7,3.8,6.7,2.2,virginica 119 | 7.7,2.6,6.9,2.3,virginica 120 | 6.0,2.2,5.0,1.5,virginica 121 | 6.9,3.2,5.7,2.3,virginica 122 | 5.6,2.8,4.9,2.0,virginica 123 | 7.7,2.8,6.7,2.0,virginica 124 | 6.3,2.7,4.9,1.8,virginica 125 | 6.7,3.3,5.7,2.1,virginica 126 | 7.2,3.2,6.0,1.8,virginica 127 | 6.2,2.8,4.8,1.8,virginica 128 | 6.1,3.0,4.9,1.8,virginica 129 | 6.4,2.8,5.6,2.1,virginica 130 | 7.2,3.0,5.8,1.6,virginica 131 | 7.4,2.8,6.1,1.9,virginica 132 | 7.9,3.8,6.4,2.0,virginica 133 | 6.4,2.8,5.6,2.2,virginica 134 | 6.3,2.8,5.1,1.5,virginica 135 | 6.1,2.6,5.6,1.4,virginica 136 | 7.7,3.0,6.1,2.3,virginica 137 | 6.3,3.4,5.6,2.4,virginica 138 | 6.4,3.1,5.5,1.8,virginica 139 | 6.0,3.0,4.8,1.8,virginica 140 | 6.9,3.1,5.4,2.1,virginica 141 | 6.7,3.1,5.6,2.4,virginica 142 | 6.9,3.1,5.1,2.3,virginica 143 | 5.8,2.7,5.1,1.9,virginica 144 | 6.8,3.2,5.9,2.3,virginica 145 | 6.7,3.3,5.7,2.5,virginica 146 | 6.7,3.0,5.2,2.3,virginica 147 | 6.3,2.5,5.0,1.9,virginica 148 | 6.5,3.0,5.2,2.0,virginica 149 | 6.2,3.4,5.4,2.3,virginica 150 | 5.9,3.0,5.1,1.8,virginica 151 | -------------------------------------------------------------------------------- /_Datasets/iris.data: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3.0,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5.0,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5.0,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3.0,1.4,0.1,setosa 15 | 4.3,3.0,1.1,0.1,setosa 16 | 5.8,4.0,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 
5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1.0,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5.0,3.0,1.6,0.2,setosa 28 | 5.0,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 5.0,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3.0,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5.0,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5.0,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3.0,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5.0,3.3,1.4,0.2,setosa 52 | 7.0,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4.0,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1.0,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5.0,2.0,3.5,1.0,versicolor 63 | 5.9,3.0,4.2,1.5,versicolor 64 | 6.0,2.2,4.0,1.0,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3.0,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1.0,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4.0,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3.0,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3.0,5.0,1.7,versicolor 80 | 6.0,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1.0,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1.0,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6.0,2.7,5.1,1.6,versicolor 86 | 5.4,3.0,4.5,1.5,versicolor 87 | 6.0,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3.0,4.1,1.3,versicolor 91 | 5.5,2.5,4.0,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3.0,4.6,1.4,versicolor 94 | 5.8,2.6,4.0,1.2,versicolor 95 | 5.0,2.3,3.3,1.0,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3.0,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3.0,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6.0,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3.0,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3.0,5.8,2.2,virginica 107 | 7.6,3.0,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2.0,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3.0,5.5,2.1,virginica 115 | 5.7,2.5,5.0,2.0,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3.0,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6.0,2.2,5.0,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2.0,virginica 124 | 7.7,2.8,6.7,2.0,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6.0,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3.0,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3.0,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2.0,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 
6.3,2.8,5.1,1.5,virginica
136 | 6.1,2.6,5.6,1.4,virginica
137 | 7.7,3.0,6.1,2.3,virginica
138 | 6.3,3.4,5.6,2.4,virginica
139 | 6.4,3.1,5.5,1.8,virginica
140 | 6.0,3.0,4.8,1.8,virginica
141 | 6.9,3.1,5.4,2.1,virginica
142 | 6.7,3.1,5.6,2.4,virginica
143 | 6.9,3.1,5.1,2.3,virginica
144 | 5.8,2.7,5.1,1.9,virginica
145 | 6.8,3.2,5.9,2.3,virginica
146 | 6.7,3.3,5.7,2.5,virginica
147 | 6.7,3.0,5.2,2.3,virginica
148 | 6.3,2.5,5.0,1.9,virginica
149 | 6.5,3.0,5.2,2.0,virginica
150 | 6.2,3.4,5.4,2.3,virginica
151 | 5.9,3.0,5.1,1.8,virginica
152 | 
--------------------------------------------------------------------------------
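
Among the three iris files above, _Datasets/iris_noheader.data ships without a header row. A minimal sketch (not part of the original repository) of loading it with pandas, reusing the column names from iris.data, which does have a header:

```python
# Sketch: load the headerless iris file with explicit column names.
import pandas as pd

cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris = pd.read_csv("_Datasets/iris_noheader.data", header=None, names=cols)
print(iris.shape)   # (150, 5)
print(iris.head())
```
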
/01_Supervised_learning/01_05_Knn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 4,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from sklearn import datasets,neighbors\n",
10 |     "from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score\n",
11 |     "from sklearn.metrics import mean_squared_error, r2_score\n",
12 |     "\n",
13 |     "#class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, weights='uniform', \n",
14 |     "#algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 7,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stdout",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "accuracy_score: \n",
27 |       " 0.9611111111111111\n",
28 |       "precision_score: 0.96\n",
29 |       "recall_score: 0.96\n",
30 |       "f1_score: 0.96\n"
31 |      ]
32 |     }
33 |    ],
34 |    "source": [
35 |     "### 1.5.1 KNN classification\n",
36 |     "# Load the digits handwritten-digit dataset\n",
37 |     "X_digits, y_digits = datasets.load_digits(return_X_y=True)\n",
38 |     "#X_digits = X_digits / X_digits.max()\n",
39 |     "\n",
40 |     "n_samples = len(X_digits)\n",
41 |     "\n",
42 |     "# Train/test split\n",
43 |     "X_train = X_digits[: int(0.9 * n_samples)] # first 90% of the data for training, the rest for testing\n",
44 |     "y_train = y_digits[: int(0.9 * n_samples)]\n",
45 |     "X_test = X_digits[int(0.9 * n_samples) :]\n",
46 |     "y_test = y_digits[int(0.9 * n_samples) :]\n",
47 |     "\n",
48 |     "# Create the model: a KNeighborsClassifier\n",
49 |     "knn = neighbors.KNeighborsClassifier(n_neighbors=11) # 11 nearest neighbors \n",
50 |     " \n",
51 |     "# Train the model\n",
52 |     "knn.fit(X_train, y_train) # for classification, the predicted class is decided by a simple majority vote\n",
53 |     "\n",
54 |     "# Predict\n",
55 |     "y_pred = knn.predict(X_test)\n",
56 |     "\n",
57 |     "## Evaluation metrics\n",
58 |     "# accuracy\n",
59 |     "print(\"accuracy_score: \\n\", accuracy_score(y_test,y_pred)) # can be misleading under imbalanced class distributions\n",
60 |     "\n",
61 |     "# precision\n",
62 |     "print(\"precision_score: %.2f\" % precision_score(y_test,y_pred,average='macro')) # multiclass: the average parameter is required\n",
63 |     "\n",
64 |     "# recall\n",
65 |     "print(\"recall_score: %.2f\" % recall_score(y_test,y_pred,average='macro'))\n",
66 |     "\n",
67 |     "# f1\n",
68 |     "print(\"f1_score: %.2f\" % f1_score(y_test,y_pred,average='macro'))\n"
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 6,
74 |    "metadata": {},
75 |    "outputs": [
76 |     {
77 |      "name": "stdout",
78 |      "output_type": "stream",
79 |      "text": [
80 |       "Mean squared error: 2454.55\n",
81 |       "Coefficient of determination: 0.49\n"
82 |      ]
83 |     }
84 |    ],
85 |    "source": [
86 |     "### 1.5.2 KNN regression\n",
87 |     "# Load the diabetes dataset\n",
88 |     "diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)\n",
89 |     "\n",
90 |     "# Split the feature data into training and test sets\n",
91 |     "diabetes_X_train = diabetes_X[:-20] # last 20 samples as the test set, the rest for training\n",
92 |     "diabetes_X_test = diabetes_X[-20:]\n",
93 |     "\n",
94 |     "# Split the target data into training and test sets\n",
95 |     "diabetes_y_train = diabetes_y[:-20] # last 20 samples as the test set, the rest for training\n",
96 |     "diabetes_y_test = diabetes_y[-20:]\n",
97 |     "\n",
98 |     "# Create the model: a KNeighborsRegressor\n",
99 |     "knn = neighbors.KNeighborsRegressor(n_neighbors=11) # 11 nearest neighbors \n",
100 |     "\n",
101 |     "# Train the model\n",
102 |     "knn.fit(diabetes_X_train, diabetes_y_train) # for regression, the prediction is typically the mean (or weighted mean) of the k nearest neighbors' targets \n",
103 |     "\n",
104 |     "# Predict\n",
105 |     "y_pred = knn.predict(diabetes_X_test) \n",
106 |     "\n",
107 |     "# Evaluate with the MSE metric\n",
108 |     "print(\"Mean squared error: %.2f\" % mean_squared_error(diabetes_y_test, y_pred))\n",
109 |     "\n",
110 |     "# Evaluate with the R2 metric\n",
111 |     "print(\"Coefficient of determination: %.2f\" % r2_score(diabetes_y_test, y_pred))\n"
112 |    ]
113 |   }
114 |  ],
115 |  "metadata": {
116 |   "kernelspec": {
117 |    "display_name": "Python 3",
118 |    "language": "python",
119 |    "name": "python3"
120 |   },
121 |   "language_info": {
122 |    "codemirror_mode": {
123 |     "name": "ipython",
124 |     "version": 3
125 |    },
126 |    "file_extension": ".py",
127 |    "mimetype": "text/x-python",
128 |    "name": "python",
129 |    "nbconvert_exporter": "python",
130 |    "pygments_lexer": "ipython3",
131 |    "version": "3.11.8"
132 |   }
133 |  },
134 |  "nbformat": 4,
135 |  "nbformat_minor": 2
136 | }
137 | 
--------------------------------------------------------------------------------
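
The notebooks in this folder split digits by slicing off the first 90% of rows. A hedged sketch of the usual alternative, a shuffled and stratified split with scikit-learn's train_test_split (same 90/10 ratio; illustrative only, not part of the original notebook):

```python
# Sketch: shuffled, stratified 90/10 split instead of manual slicing.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y)  # stratify preserves class ratios
print(X_train.shape, X_test.shape)  # (1617, 64) (180, 64)
```
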
70 | "cell_type": "code", 71 | "execution_count": 13, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Mean squared error: 2270.99\n", 79 | "Coefficient of determination: 0.53\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "### 1.8.2 随机森林回归\n", 85 | "# 加载diabetes数据集\n", 86 | "diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)\n", 87 | "\n", 88 | "# 特征数据划分训练集和测试集\n", 89 | "diabetes_X_train = diabetes_X[:-20] #后20个样本作为测试集,其余作为训练集\n", 90 | "diabetes_X_test = diabetes_X[-20:]\n", 91 | "\n", 92 | "# 标签(类别)数据划分训练集和测试集\n", 93 | "diabetes_y_train = diabetes_y[:-20] #后20个样本作为测试集,其余作为训练集\n", 94 | "diabetes_y_test = diabetes_y[-20:]\n", 95 | "\n", 96 | "#创建模型,RandomForestRegressor回归器\n", 97 | "clf = ensemble.RandomForestRegressor(n_estimators=500, max_depth=3) #n_estimators:弱分类器数量\n", 98 | " #max_depth树的最大深度,还有很多其他参数,此处使用默认值\n", 99 | "#模型训练\n", 100 | "clf.fit(diabetes_X_train, diabetes_y_train) #对于回归问题,预测值通常是这k个最近邻的目标值的平均值(或加权平均值) \n", 101 | "\n", 102 | "#模型预测\n", 103 | "y_pred = clf.predict(diabetes_X_test) \n", 104 | "\n", 105 | "# 模型评估MSE指标\n", 106 | "print(\"Mean squared error: %.2f\" % mean_squared_error(diabetes_y_test, y_pred))\n", 107 | "\n", 108 | "# 模型评估R2指标\n", 109 | "print(\"Coefficient of determination: %.2f\" % r2_score(diabetes_y_test, y_pred))\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "### 总结\n", 119 | "#随机森林里的树都是决策树,参数复杂,需要调试才会有好效果。\n", 120 | "#随机森林里的样本抽样思想以及各决策树可以并行训练,使得随机森林训练较大数据集时效果较好。\n", 121 | "#某种角度讲,随机森林是最好的机器学习算法,再往后就是深度学习。" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "name": "python", 133 | "version": "3.11.8" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /01_Supervised_learning/01_03_Svm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn import datasets,svm\n", 10 | "from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score\n", 11 | "from sklearn.metrics import mean_squared_error, r2_score\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 7, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "(1617, 45)" 23 | ] 24 | }, 25 | "execution_count": 7, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "### 1.3.1 svm分类\n", 32 | "\n", 33 | "# 加载digits手写数字数据集\n", 34 | "X_digits, y_digits = datasets.load_digits(return_X_y=True)\n", 35 | "#X_digits = X_digits / X_digits.max()\n", 36 | "\n", 37 | "n_samples = len(X_digits)\n", 38 | "\n", 39 | "#训练集和测试集划分\n", 40 | "X_train = X_digits[: int(0.9 * n_samples)] #取前90%的数据作为训练集,其余作为测试集\n", 41 | "y_train = y_digits[: int(0.9 * n_samples)]\n", 42 | "X_test = X_digits[int(0.9 * n_samples) :]\n", 43 | "y_test = y_digits[int(0.9 * n_samples) :]\n", 44 | "\n", 45 | "#创建模型,svm.SVC 是用于支持向量分类(SVC)的类\n", 46 | "clf = svm.SVC(decision_function_shape='ovo') ##可通过kernel参数设置核函数,例如'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'\n", 47 | " ##vO(One-vs-One)和OvR(One-vs-Rest)是处理多类别分类问题\n", 48 | 
"\n", 49 | "#模型训练\n", 50 | "clf.fit(X_train, y_train)\n", 51 | "dec = clf.decision_function(X_train) #decision_values 将返回一个数组,其中包含每个样本点到分隔超平面的有符号距离。\n", 52 | " #从而可以根据这些距离来进行分类或者其他后续处理。\n", 53 | "dec.shape \n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "accuracy_score: \n", 66 | " 0.95\n", 67 | "precision_score: 0.96\n", 68 | "recall_score: 0.95\n", 69 | "f1_score: 0.95\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "\n", 75 | "#模型预测\n", 76 | "y_pred = clf.predict(X_test)\n", 77 | "\n", 78 | "##评价指标\n", 79 | "# accuracy\n", 80 | "print(\"accuracy_score: \\n\", accuracy_score(y_test,y_pred)) #不平衡类别分布的情况下可能会失效\n", 81 | "\n", 82 | "# precision指标\n", 83 | "print(\"precision_score: %.2f\" % precision_score(y_test,y_pred,average='macro')) #多分类,需要加average参数\n", 84 | "\n", 85 | "# recall_score指标\n", 86 | "print(\"recall_score: %.2f\" % recall_score(y_test,y_pred,average='macro'))\n", 87 | "\n", 88 | "#f1指标\n", 89 | "print(\"f1_score: %.2f\" % f1_score(y_test,y_pred,average='macro'))\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 12, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Mean squared error: 3767.77\n", 102 | "Coefficient of determination: 0.22\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "### 1.3.2 svm回归\n", 108 | "\n", 109 | "# 加载diabetes数据集\n", 110 | "diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)\n", 111 | "\n", 112 | "# 特征数据划分训练集和测试集\n", 113 | "diabetes_X_train = diabetes_X[:-20] #后20个样本作为测试集,其余作为训练集\n", 114 | "diabetes_X_test = diabetes_X[-20:]\n", 115 | "\n", 116 | "# 标签(类别)数据划分训练集和测试集\n", 117 | "diabetes_y_train = diabetes_y[:-20] #后20个样本作为测试集,其余作为训练集\n", 118 | "diabetes_y_test = diabetes_y[-20:]\n", 119 | "\n", 120 | "#创建模型,svm.SVR 是用于支持向量回归(SVR)的类\n", 121 | "clf = svm.SVR() #可通过kernel参数设置核函数,例如'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'\n", 122 | "\n", 123 | "#模型训练\n", 124 | "clf.fit(diabetes_X_train, diabetes_y_train) \n", 125 | "\n", 126 | "#模型预测\n", 127 | "y_pred = clf.predict(diabetes_X_test) \n", 128 | "\n", 129 | "# 模型评估MSE指标\n", 130 | "print(\"Mean squared error: %.2f\" % mean_squared_error(diabetes_y_test, y_pred))\n", 131 | "\n", 132 | "# 模型评估R2指标\n", 133 | "print(\"Coefficient of determination: %.2f\" % r2_score(diabetes_y_test, y_pred))\n" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.11.8" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | -------------------------------------------------------------------------------- /05_Dataset_transformations/05_04_Imputation_of_missing_values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "###<进阶> 5.4 缺失值处理\n", 10 | "#此节可以作为进阶内容,简单点直接用pandas DataFrame进行补全\n", 11 | "\n", 12 | "## 5.4.1 单变量插补\n", 13 | 
"#单变量算法,它只使用第i个特征维度中的非缺失值(如impute.SimpleImputer)来插补第i个特征维中的值。\n", 14 | "#SimpleImputer类提供了计算缺失值的基本策略。缺失值可以用提供的常数值计算,也可以使用缺失值所在的行/列中的统计数据(平均值、中位数或者众数)来计算。\n", 15 | "\n", 16 | "#以下代码段演示了如何使用包含缺失值的列(轴0)的平均值来替换编码为 np.nan 的缺失值:\n", 17 | "import numpy as np\n", 18 | "from sklearn.impute import SimpleImputer\n", 19 | "imp = SimpleImputer(missing_values=np.nan, strategy='mean') #用均值补全\n", 20 | "imp.fit([[1, 2], [np.nan, 3], [7, 6]])\n", 21 | "X = [[np.nan, 2], [6, np.nan], [7, 6]] \n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 13, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | ">>>>>均值补全前>>>>>>\n", 34 | "[nan, 2]\n", 35 | "[6, nan]\n", 36 | "[7, 6]\n", 37 | "\n", 38 | "\n", 39 | ">>>>>均值补全后>>>>>>\n", 40 | "[[4. 2. ]\n", 41 | " [6. 3.66666667]\n", 42 | " [7. 6. ]]\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "print('>>>>>均值补全前>>>>>>')\n", 48 | "for i in X:\n", 49 | " print(i)\n", 50 | "print('\\n')\n", 51 | "print('>>>>>均值补全后>>>>>>')\n", 52 | "print(imp.transform(X)) #缺失值补全(用列的均值)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 15, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "[['a' 'x']\n", 65 | " ['a' 'y']\n", 66 | " ['a' 'y']\n", 67 | " ['b' 'y']]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "## 类别特征补全\n", 73 | "#当使用 'most_frequent' 或 'constant' 策略时,SimpleImputer类还支持以 string values 或 pandas categoricals 表示的分类数据(categorical data)\n", 74 | "import pandas as pd\n", 75 | "df = pd.DataFrame([[\"a\", \"x\"],\n", 76 | " [np.nan, \"y\"],\n", 77 | " [\"a\", np.nan],\n", 78 | " [\"b\", \"y\"]], dtype=\"category\")\n", 79 | "imp = SimpleImputer(strategy=\"most_frequent\")\n", 80 | "print(imp.fit_transform(df)) #把第2行确实特征用出现频次最高的值(样例中是a)补全" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 16, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "[[ 1. 2.]\n", 93 | " [ 6. 12.]\n", 94 | " [ 3. 6.]]\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "## 5.4.3 多元特征补全 \n", 100 | "#一种更复杂的方法是使用IterativeImputter类,该类将具有缺失值的每个特征建模为其他特征的函数,并使用该估计值进行插补。\n", 101 | "#它以迭代循环的方式进行:在每一步中,一个特征列被指定为输出y,其他特征列被视为输入X。对已知y拟合回归器(X,y)。然后,使用回归器来预测y的缺失值。\n", 102 | "#这是以迭代的方式对每个特征进行的,然后在max_iter插补循环中重复。返回最后一轮插补的结果。\n", 103 | "\n", 104 | "#总结:简而言之,基于其他特征用回归算法预测缺失特征结果。\n", 105 | "\n", 106 | "import numpy as np\n", 107 | "from sklearn.experimental import enable_iterative_imputer\n", 108 | "from sklearn.impute import IterativeImputer\n", 109 | "imp = IterativeImputer(max_iter=10, random_state=0)\n", 110 | "imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])\n", 111 | "X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]\n", 112 | "# the model learns that the second feature is double the first\n", 113 | "print(np.round(imp.transform(X_test)))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 17, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "array([[1. , 2. , 4. ],\n", 125 | " [3. , 4. , 3. ],\n", 126 | " [5.5, 6. , 5. ],\n", 127 | " [8. , 8. , 7. 
]])" 128 | ] 129 | }, 130 | "execution_count": 17, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "## 5.4.4 最近邻补全\n", 137 | "# KNNImputer类提供用于填充缺失值的插补 使用 k 最近邻方法。\n", 138 | "import numpy as np\n", 139 | "from sklearn.impute import KNNImputer\n", 140 | "nan = np.nan\n", 141 | "X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]\n", 142 | "imputer = KNNImputer(n_neighbors=2, weights=\"uniform\")\n", 143 | "imputer.fit_transform(X)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 19, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "array([[ True, True, False, False],\n", 155 | " [False, True, False, True],\n", 156 | " [False, True, False, False]])" 157 | ] 158 | }, 159 | "execution_count": 19, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "## 5.5.5 标记缺失值\n", 166 | "#MissingIndicator可用于将数据集转换为 相应的二进制矩阵,指示缺失数据\n", 167 | "from sklearn.impute import MissingIndicator\n", 168 | "X = np.array([[-1, -1, 1, 3],\n", 169 | " [4, -1, 0, -1],\n", 170 | " [8, -1, 1, 0]])\n", 171 | "indicator = MissingIndicator(missing_values=-1,features='all')\n", 172 | "mask_missing_values_only = indicator.fit_transform(X)\n", 173 | "mask_missing_values_only #缺失值用-1填补的返回True,非缺失值返回False" 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.11.8" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 2 198 | } 199 | -------------------------------------------------------------------------------- /05_Dataset_transformations/05_02_Feature_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "array([[ 1., 0., 0., 33.],\n", 12 | " [ 0., 1., 0., 12.],\n", 13 | " [ 0., 0., 1., 18.]])" 14 | ] 15 | }, 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "output_type": "execute_result" 19 | } 20 | ], 21 | "source": [ 22 | "###<进阶> 5.2.1 从字典类型加载特征\n", 23 | "from sklearn.feature_extraction import DictVectorizer\n", 24 | "\n", 25 | "#类 DictVectorizer 可用于将标准的Python字典(dict)对象列表的要素数组转换为 scikit-learn 估计器使用的 NumPy/SciPy 表示形式。\n", 26 | "#类 DictVectorizer 实现了 “one-of-K” 或 “one-hot” 编码,用于分类(也称为标称,离散)特征。\n", 27 | "\n", 28 | "#在下面的例子,”城市” 是一个分类属性,而 “温度” 是传统的数字特征,对“城市”属性进行了one-hot编码:\n", 29 | "measurements = [\n", 30 | " {'city': 'Dubai', 'temperature': 33.},\n", 31 | " {'city': 'London', 'temperature': 12.},\n", 32 | " {'city': 'San Francisco', 'temperature': 18.},\n", 33 | "]\n", 34 | "\n", 35 | "vec = DictVectorizer()\n", 36 | "vec.fit_transform(measurements).toarray()\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array(['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'],\n", 48 | " dtype=object)" 49 | ] 50 | }, 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 
| "vec.get_feature_names_out()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 15, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],\n", 69 | " [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])" 70 | ] 71 | }, 72 | "execution_count": 15, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "###<进阶> 5.2.2 特征哈希(相当于一种降维技巧)\n", 79 | "#class sklearn.feature_extraction.FeatureHasher(n_features=1048576, *, input_type='dict', dtype=, alternate_sign=True)[source]¶\n", 80 | "#此类将符号特征名称(字符串)序列转换为scipy.sparse矩阵,使用哈希函数计算与名称对应的矩阵列。所使用的哈希函数是Murmurhash3的32位签名版本。\n", 81 | "\n", 82 | "## input_type是字典结构时\n", 83 | "from sklearn.feature_extraction import FeatureHasher\n", 84 | "h = FeatureHasher(n_features=10) #输出特征维度\n", 85 | "D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}] #hash字典结构的,2个样本\n", 86 | "f = h.transform(D)\n", 87 | "arr_hash = f.toarray()\n", 88 | "arr_hash\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 16, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "array([[ 0., 0., 0., -1., 0., 0., 0., 0., 0., 0.]])" 100 | ] 101 | }, 102 | "execution_count": 16, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "# 特征hash其实就是对所有输入特征分别hash,再对结果合并\n", 109 | "# 我们把{'dog': 1, 'cat':2, 'elephant':4}样本拆分成3个样本理解,每次hash只输入单个特征的样本\n", 110 | "h = FeatureHasher(n_features=10) #输出特征维度\n", 111 | "D1 = [{'dog': 1}] \n", 112 | "D2 = [{'cat':2}] \n", 113 | "D3 = [{'elephant':4}] \n", 114 | "\n", 115 | "f1 = h.transform(D1) #hash特征dog,hash值如下\n", 116 | "arr1 = f1.toarray()\n", 117 | "arr1" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 17, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 2.]])" 129 | ] 130 | }, 131 | "execution_count": 17, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "f2 = h.transform(D2) #hash特征cat,hash值如下\n", 138 | "arr2 = f2.toarray()\n", 139 | "arr2" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 18, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "array([[ 0., 0., -4., 0., 0., 0., 0., 0., 0., 0.]])" 151 | ] 152 | }, 153 | "execution_count": 18, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "f3 = h.transform(D3) #hash特征elephant,hash值如下\n", 160 | "arr3 = f3.toarray()\n", 161 | "arr3" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 19, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.]])" 173 | ] 174 | }, 175 | "execution_count": 19, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "arr_result = arr1 + arr2 + arr3 # 3个特征分别hash后拼成一条样\n", 182 | "arr_result # 对比可知样本hash后的特征是每个特征hash后的汇总,即arr_resut = arr_hash" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 20, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "array([[ 0., 0., 0., -1., 0., -1., 0., 1.],\n", 194 | " [ 0., 0., 0., -1., 0., -1., 0., 0.],\n", 195 | " [ 0., -1., 0., 0., 0., 0., 0., 1.]])" 196 | ] 197 | }, 198 | 
"execution_count": 20, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "# input_type=“string”时,输入必须是字符串的可迭代项上的可迭代值:\n", 205 | "h = FeatureHasher(n_features=8, input_type=\"string\")\n", 206 | "raw_X = [[\"dog\", \"cat\", \"snake\"], [\"snake\", \"dog\"], [\"cat\", \"bird\"]]\n", 207 | "f = h.transform(raw_X)\n", 208 | "f.toarray()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "## 总结\n", 218 | "\n", 219 | "#1、hash函数的概念,请复习数据结构\n", 220 | "#2、对特征空间很大的特征(例如one-hot展开空间很大,上万/十万/百万特征),为了节省内存或提高训练效率,可特征hash进行降维,当然这也会损失一些信息。\n", 221 | "#3、其他文本、图像处理特征处理暂未列出,更多请参考 https://scikit-learn.org/stable/modules/feature_extraction.html" 222 | ] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.11.8" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 机器学习Sklearn入门指南 2 | 机器学习入门指南,基于SKlearn讲解如何学习《机器学习》,更新中。 3 | 4 | ## 注意 5 | 如果通过Github站内超链接打开Jupyter Notebook文件发生错误,可以点击根据 https://nbviewer.org 生成的“备用链接”间接访问对应文件。 6 | 或者通过以下链接访问整个项目的站外备用链接,注意点击站外备用链接里的非Jupyter Notebook格式文件会跳转回到Github仓库内: 7 | ● [**Machine_Learning_Sklearn_Examples**](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/tree/master/) 8 | 9 | 10 | ## 机器学习Python基础 11 | ● [**Numpy科学计算**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_01_Numpy_basic.ipynb)     [~~*(备用链接)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_01_Numpy_basic.ipynb)] 12 | ● [**Pandas数据分析**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_02_Pandas_basic.ipynb)     [~~*(备用链接)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_02_Pandas_basic.ipynb)] 13 | ● [**Matplotlib可视化**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_03_Matplotlib_basic.ipynb)     [~~*(备用链接)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_03_Matplotlib_basic.ipynb)] 14 | ● [**数据探索EDA**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_04_EDA.ipynb)     [~~*(备用链接)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_04_EDA.ipynb)] 15 | 16 | ## 监督学习 17 | ● [**线性回归**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_01_Linear_regression.ipynb)     [~~*(备用链接)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_01_Linear_regression.ipynb)] 18 | ● [**逻辑回归**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_02_Logistic_regression.ipynb)     
/README.md:
--------------------------------------------------------------------------------
1 | # A Beginner's Guide to Machine Learning with Sklearn
2 | An introductory guide to machine learning, taught with scikit-learn; still being updated.
3 | 
4 | ## Note
5 | If opening a Jupyter Notebook file through an in-site GitHub hyperlink fails, you can click the "backup link" generated via https://nbviewer.org to access the file indirectly.
6 | Alternatively, use the link below to browse an off-site backup of the whole project; note that clicking a non-Jupyter-Notebook file inside the off-site backup jumps back into the GitHub repository:
7 | ● [**Machine_Learning_Sklearn_Examples**](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/tree/master/)
8 | 
9 | 
10 | ## Python Basics for Machine Learning
11 | ● [**NumPy scientific computing**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_01_Numpy_basic.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_01_Numpy_basic.ipynb)
12 | ● [**Pandas data analysis**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_02_Pandas_basic.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_02_Pandas_basic.ipynb)
13 | ● [**Matplotlib visualization**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_03_Matplotlib_basic.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_03_Matplotlib_basic.ipynb)
14 | ● [**Exploratory data analysis (EDA)**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_04_EDA.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/00_Python_basics/00_04_EDA.ipynb)
15 | 
16 | ## Supervised Learning
17 | ● [**Linear regression**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_01_Linear_regression.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_01_Linear_regression.ipynb)
18 | ● [**Logistic regression**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_02_Logistic_regression.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_02_Logistic_regression.ipynb)
19 | ● [**Support vector machines (SVM)**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_03_Svm.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_03_Svm.ipynb)
20 | ● Stochastic gradient descent (SGD)
21 | ● [**K-nearest neighbors (KNN)**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_05_Knn.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_05_Knn.ipynb)
22 | ● Naive Bayes
23 | ● [**Decision trees**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_07_Decision_trees.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_07_Decision_trees.ipynb)
24 | ● [**Random forests**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/01_Supervised_learning/01_08_Random_forests.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/01_Supervised_learning/01_08_Random_forests.ipynb)
25 | ● Multiclass and multioutput
26 | ● Feature selection
27 | ● Neural networks
28 | 
29 | ## Unsupervised Learning
30 | ● [**K-means clustering**](https://github.com/solidglue/Machine_Learning_Sklearn_Jupyter_Demo/blob/master/02_Unsupervised_learning/02_01_Kmeans_clustering.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/02_Unsupervised_learning/02_01_Kmeans_clustering.ipynb)
31 | ● Neural network models (unsupervised)
32 | 
33 | ## Model Selection and Evaluation
34 | ● [**Cross-validation**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_01_Cross_validation.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_01_Cross_validation.ipynb)
35 | ● [**Tuning estimator hyperparameters**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_02_hyper_parameters_estimator.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_02_hyper_parameters_estimator.ipynb)
36 | ● [**Metrics and scoring**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_03_Metrics_and_scoring.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_03_Metrics_and_scoring.ipynb)
37 | ● [**Validation curves: plotting scores to evaluate models**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_04_Validation_curves.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/03_Model_selection_and_evaluation/03_04_Validation_curves.ipynb)
38 | 
39 | ## Visualization
40 | ● Visualization
41 | 
42 | ## Dataset Transformations
43 | ● [**Pipelines and composite estimators**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_01_Pipelines_and_composite_estimators.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_01_Pipelines_and_composite_estimators.ipynb)
44 | ● [**Feature extraction**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_02_Feature_extraction.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_02_Feature_extraction.ipynb)
45 | ● [**Preprocessing data**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_03_Preprocessing_data.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_03_Preprocessing_data.ipynb)
46 | ● [**Imputation of missing values**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_04_Imputation_of_missing_values.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/05_Dataset_transformations/05_04_Imputation_of_missing_values.ipynb)
47 | ● Unsupervised dimensionality reduction
48 | ● Random projection
49 | 
50 | ## Dataset Loading
51 | ● [**Toy datasets**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_01_Toy_datasets.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_01_Toy_datasets.ipynb)
52 | ● [**Real-world datasets**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_02_Real_world_datasets.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_02_Real_world_datasets.ipynb)
53 | ● [**Generated datasets**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_03_Generated%20datasets.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_03_Generated%20datasets.ipynb)
54 | ● [**Loading other datasets**](https://github.com/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_04_load_files.ipynb)     [~~*(backup link)*~~](https://nbviewer.org/github/solidglue/Machine_Learning_Sklearn_Examples/blob/master/06_Dataset_loading/06_04_load_files.ipynb)
55 | 
56 | 
57 | ## Model Persistence
58 | ● Model persistence
59 | 
60 | 
61 | ## *Extensions
62 | 
63 | 
64 | 1. **Recommender systems**
65 | Wang Shusen's open course on recommender systems - real industrial recommender systems explained through Xiaohongshu scenarios.
66 | ● [**Recommender_System**](https://github.com/solidglue/Recommender_System)
67 | 
68 | 2. **YouTube recommender ranking model**
69 | Built on the "DNN_for_YouTube_Recommendations" model and the movie-rating dataset (ml-1m), a detailed walkthrough of implementing a recommender ranking model with TensorFlow 2.
70 | ● [**YouTube deep ranking model (multi-valued embeddings, multi-objective learning)**](https://github.com/solidglue/DNN_for_YouTube_Recommendations)
71 | 
72 | 3. **Recommender system inference services**
73 | High-concurrency, high-performance, highly available recommender inference microservices built on Golang, Docker, and microservice principles, covering several recall/ranking services and multiple access interfaces (REST, gRPC, and Dubbo), handling tens of millions of inference requests per day.
74 | ● [**Recommender inference microservices in Golang**](https://github.com/solidglue/Recommender_System_Inference_Services)
75 | 
76 | 4. **Introductory deep learning with TensorFlow**
77 | ● [**Deep learning TensorFlow tutorials**](https://github.com/solidglue/Deep_Learning_TensorFlow2_Examples)
78 | 
79 | 
80 | 
81 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /00_Python_basics/00_04_EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import matplotlib.pyplot as plt\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "from sklearn import datasets, linear_model\n", 13 | "from sklearn.metrics import mean_squared_error, r2_score\n", 14 | "\n", 15 | "import pandas as pd\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error\n", 18 | "import seaborn as sns" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 27, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.1NaN1.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.6NaN1.50.2setosa
45.03.61.40.2setosa
\n", 97 | "
" 98 | ], 99 | "text/plain": [ 100 | " sepal_length sepal_width petal_length petal_width species\n", 101 | "0 5.1 NaN 1.4 0.2 setosa\n", 102 | "1 4.9 3.0 1.4 0.2 setosa\n", 103 | "2 4.7 3.2 1.3 0.2 setosa\n", 104 | "3 4.6 NaN 1.5 0.2 setosa\n", 105 | "4 5.0 3.6 1.4 0.2 setosa" 106 | ] 107 | }, 108 | "execution_count": 27, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "### 1.0.0 数据探索EDA <每次面对新数据都应该进行数据探索,依据探索结果进行特征工程和选择模型>\n", 115 | "#加载iris数据集,数据集被手动删除一些特征值。\n", 116 | "iris_data = pd.read_csv(\"../_Datasets/iris_miss.data\",sep=',') #指定字段分隔符,默认逗号\n", 117 | "\n", 118 | "##(1)查看前几行数据。\n", 119 | "iris_data.head()\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 28, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "(150, 5)" 131 | ] 132 | }, 133 | "execution_count": 28, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "##(2)查看数据维度(例如二维数据的行列数)\n", 140 | "iris_data.shape #150行5列\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 29, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", 152 | " 'species'],\n", 153 | " dtype='object')" 154 | ] 155 | }, 156 | "execution_count": 29, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "##(3)查看特征列名。\n", 163 | "iris_data.columns" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 30, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "Index(['SL', 'SW', 'PL', 'PW', 'species'], dtype='object')" 175 | ] 176 | }, 177 | "execution_count": 30, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "##(4)特征列重命名\n", 184 | "\n", 185 | "#可以根据列名筛选特征。也可以重命名该列。例如将diabetes_X特征s1-s6重命名为feature1 - feature6。\n", 186 | "##方法1-暴力重命名,此方法需要写全所有列名,否则报错。\n", 187 | "#diabetes_X.columns = ['feature1', 'feature2', 'feature3', 'feature4', 'species']\n", 188 | "#diabetes_X.columns\n", 189 | "\n", 190 | "#方法2-rename方法,此方法只需写需要重命名的字段。根据需要,非必需\n", 191 | "iris_data.rename(columns={'sepal_length':'SL','sepal_width':'SW','petal_length':'PL','petal_width':'PW'},inplace=True)\n", 192 | "iris_data.columns" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 31, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/html": [ 203 | "
\n", 204 | "\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | "
SLSWPLPW
count147.000000143.000000148.000000149.000000
mean5.8591843.0426573.7371621.205369
std0.8284130.4320751.7660550.761292
min4.3000002.0000001.0000000.100000
25%5.1000002.8000001.5750000.300000
50%5.8000003.0000004.3000001.300000
75%6.4000003.3000005.1000001.800000
max7.9000004.4000006.9000002.500000
\n", 286 | "
" 287 | ], 288 | "text/plain": [ 289 | " SL SW PL PW\n", 290 | "count 147.000000 143.000000 148.000000 149.000000\n", 291 | "mean 5.859184 3.042657 3.737162 1.205369\n", 292 | "std 0.828413 0.432075 1.766055 0.761292\n", 293 | "min 4.300000 2.000000 1.000000 0.100000\n", 294 | "25% 5.100000 2.800000 1.575000 0.300000\n", 295 | "50% 5.800000 3.000000 4.300000 1.300000\n", 296 | "75% 6.400000 3.300000 5.100000 1.800000\n", 297 | "max 7.900000 4.400000 6.900000 2.500000" 298 | ] 299 | }, 300 | "execution_count": 31, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "##(5)统计摘要describe\n", 307 | "#describe() 方法用于生成DataFrame中数值列的统计摘要。\n", 308 | "#它提供了每个数值列的计数、均值、标准差、最小值、25th、50th(中位数)、75th 四分位数和最大值。\n", 309 | "iris_data.describe()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 32, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "\n", 322 | "RangeIndex: 150 entries, 0 to 149\n", 323 | "Data columns (total 5 columns):\n", 324 | " # Column Non-Null Count Dtype \n", 325 | "--- ------ -------------- ----- \n", 326 | " 0 SL 147 non-null float64\n", 327 | " 1 SW 143 non-null float64\n", 328 | " 2 PL 148 non-null float64\n", 329 | " 3 PW 149 non-null float64\n", 330 | " 4 species 145 non-null object \n", 331 | "dtypes: float64(4), object(1)\n", 332 | "memory usage: 6.0+ KB\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "##(6)INFO摘要\n", 338 | "#info() 方法用于获取DataFrame的摘要信息,包括每列的非空值数量、列的数据类型等\n", 339 | "iris_data.info()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 33, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "SL 3\n", 351 | "SW 7\n", 352 | "PL 2\n", 353 | "PW 1\n", 354 | "species 5\n", 355 | "dtype: int64" 356 | ] 357 | }, 358 | "execution_count": 33, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "# 也可以用isnull方法简单计算含缺失特征的样本数。\n", 365 | "missing_values = iris_data.isnull().sum()\n", 366 | "missing_values" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 34, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/html": [ 377 | "
\n", 378 | "\n", 391 | "\n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
SLSWPLPWspecies
14.93.01.40.2setosa
24.73.21.30.2setosa
45.03.61.40.2setosa
55.43.91.70.4setosa
64.63.41.40.3setosa
..................
1446.73.35.72.5virginica
1456.73.05.22.3virginica
1476.53.05.22.0virginica
1486.23.45.42.3virginica
1495.93.05.11.8virginica
\n", 493 | "

132 rows × 5 columns

\n", 494 | "
" 495 | ], 496 | "text/plain": [ 497 | " SL SW PL PW species\n", 498 | "1 4.9 3.0 1.4 0.2 setosa\n", 499 | "2 4.7 3.2 1.3 0.2 setosa\n", 500 | "4 5.0 3.6 1.4 0.2 setosa\n", 501 | "5 5.4 3.9 1.7 0.4 setosa\n", 502 | "6 4.6 3.4 1.4 0.3 setosa\n", 503 | ".. ... ... ... ... ...\n", 504 | "144 6.7 3.3 5.7 2.5 virginica\n", 505 | "145 6.7 3.0 5.2 2.3 virginica\n", 506 | "147 6.5 3.0 5.2 2.0 virginica\n", 507 | "148 6.2 3.4 5.4 2.3 virginica\n", 508 | "149 5.9 3.0 5.1 1.8 virginica\n", 509 | "\n", 510 | "[132 rows x 5 columns]" 511 | ] 512 | }, 513 | "execution_count": 34, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "## (7)缺失值处理\n", 520 | "#可简单删除缺失值样本,也可补全(特征补全参考5.4节)\n", 521 | "df_filtered = iris_data.dropna()\n", 522 | "df_filtered #一共有18条样本包含缺失值,删除后剩余132条样本" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 35, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "### 总结\n", 532 | "# 数据探索完后,根据探索结果进行样本补全、特征工程等,再训练模型" 533 | ] 534 | } 535 | ], 536 | "metadata": { 537 | "kernelspec": { 538 | "display_name": "Python 3", 539 | "language": "python", 540 | "name": "python3" 541 | }, 542 | "language_info": { 543 | "codemirror_mode": { 544 | "name": "ipython", 545 | "version": 3 546 | }, 547 | "file_extension": ".py", 548 | "mimetype": "text/x-python", 549 | "name": "python", 550 | "nbconvert_exporter": "python", 551 | "pygments_lexer": "ipython3", 552 | "version": "3.11.8" 553 | } 554 | }, 555 | "nbformat": 4, 556 | "nbformat_minor": 2 557 | } 558 | -------------------------------------------------------------------------------- /00_Python_basics/00_01_Numpy_basic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 100, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np \n", 10 | "\n", 11 | "#NumPy(Numerical Python)是一个开源的Python库,提供了高性能的多维数组对象和用于处理数组的函数。\n", 12 | "#它是许多科学计算和数据分析任务的基础库之一。" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 101, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "array([1, 2, 3, 4, 5])" 24 | ] 25 | }, 26 | "execution_count": 101, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "### 0.1.1 numpy数组\n", 33 | "## 创建数组\n", 34 | "# 使用np.array直接创建\n", 35 | "np.array([1,2,3,4,5])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 102, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "array([1., 1., 1., 1., 1.])" 47 | ] 48 | }, 49 | "execution_count": 102, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# 使用np.ones、np.zeros创建全1或全0数组\n", 56 | "np.ones(5)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 103, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([0., 0., 0., 0., 0.])" 68 | ] 69 | }, 70 | "execution_count": 103, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "np.zeros(5)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 104, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "[1 2 3 4 5]\n", 89 | "[2 2 2 2 2]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "## 数组运算(加减乘除等)\n", 95 | "arr1 = np.array([1,2,3,4,5])\n", 96 | "arr2 = 
np.array([2,2,2,2,2])\n", 97 | "print(arr1)\n", 98 | "print(arr2)\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 105, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "array([3, 4, 5, 6, 7])" 110 | ] 111 | }, 112 | "execution_count": 105, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "#数组相加\n", 119 | "arr3 = arr1 + arr2\n", 120 | "arr3" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 106, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([ 2, 4, 6, 8, 10])" 132 | ] 133 | }, 134 | "execution_count": 106, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "#数组相乘法\n", 141 | "arr4 = arr1 * arr2\n", 142 | "arr4" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 107, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "array([ 2, 4, 6, 8, 10])" 154 | ] 155 | }, 156 | "execution_count": 107, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "#数组乘以整数(本质上是数组想成的特例)\n", 163 | "arr5 = arr1 * 2\n", 164 | "arr5" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 108, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "array([ 1, 2, 5, 7, 9, 11])" 176 | ] 177 | }, 178 | "execution_count": 108, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "## 数组索引,跟python索引一致\n", 185 | "arr1 = np.array([1,2,5,7,9,11])\n", 186 | "arr1\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 109, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "1" 198 | ] 199 | }, 200 | "execution_count": 109, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "#输出第1个元素\n", 207 | "arr1[0] " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 110, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "11" 219 | ] 220 | }, 221 | "execution_count": 110, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "#输出最后1个元素\n", 228 | "arr1[-1]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 111, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "array([5, 7])" 240 | ] 241 | }, 242 | "execution_count": 111, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "#输出索引2-4的元素(左闭右开)\n", 249 | "arr1[2:4]\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 112, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "array([1, 2, 5])" 261 | ] 262 | }, 263 | "execution_count": 112, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "#输出索引3及之前的所有元素(左闭右开)\n", 270 | "arr1[:3]#输出索引3及之前的所有元素(左闭右开)\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 113, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "array([ 7, 9, 11])" 282 | ] 283 | }, 284 | "execution_count": 113, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 
289 | "source": [ 290 | "#输出索引3及之后的所有元素\n", 291 | "arr1[3:]" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 114, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "array([ 1, 2, 5, 7, 9, 11])" 303 | ] 304 | }, 305 | "execution_count": 114, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "## 数组聚合\n", 312 | "arr1 = np.array([1,2,5,7,9,11])\n", 313 | "arr1\n", 314 | "\n" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 115, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "11" 326 | ] 327 | }, 328 | "execution_count": 115, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "#求数组最大值\n", 335 | "arr1.max()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 116, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "1" 347 | ] 348 | }, 349 | "execution_count": 116, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "#求数组最小值\n", 356 | "arr1.min()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 117, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "35" 368 | ] 369 | }, 370 | "execution_count": 117, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "#数组求和\n", 377 | "arr1.sum()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 118, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "5.833333333333333" 389 | ] 390 | }, 391 | "execution_count": 118, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [ 397 | "#求数组均值\n", 398 | "arr1.mean()" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 119, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "array([[1, 2, 3],\n", 410 | " [4, 5, 6],\n", 411 | " [7, 8, 9]])" 412 | ] 413 | }, 414 | "execution_count": 119, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "### 0.1.2 numpy矩阵\n", 421 | "\n", 422 | "##创建矩阵\n", 423 | "#使用np.array直接创建\n", 424 | "np.array([[1,2,3,],[4,5,6],[7,8,9]])" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 120, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "array([[1., 1., 1.],\n", 436 | " [1., 1., 1.],\n", 437 | " [1., 1., 1.]])" 438 | ] 439 | }, 440 | "execution_count": 120, 441 | "metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "# 使用np.ones、np.zeros创建全1或全0矩阵\n", 447 | "np.ones((3,3)) #创建3行3列矩阵" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 121, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "array([[0., 0., 0.],\n", 459 | " [0., 0., 0.],\n", 460 | " [0., 0., 0.]])" 461 | ] 462 | }, 463 | "execution_count": 121, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "np.zeros((3,3)) #创建3行3列矩阵" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 122, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | 
"text": [ 481 | "[[1 2 3]\n", 482 | " [4 5 6]\n", 483 | " [7 8 9]]\n", 484 | "[[1 1 1]\n", 485 | " [2 2 2]\n", 486 | " [3 3 3]]\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "## 矩阵运算\n", 492 | "arr1 = np.array([[1,2,3,],[4,5,6],[7,8,9]])\n", 493 | "arr2 = np.array([[1,1,1,],[2,2,2],[3,3,3]])\n", 494 | "print(arr1)\n", 495 | "print(arr2)\n" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 123, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "array([[ 2, 3, 4],\n", 507 | " [ 6, 7, 8],\n", 508 | " [10, 11, 12]])" 509 | ] 510 | }, 511 | "execution_count": 123, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "#矩阵相加(对应索引元素相加)\n", 518 | "arr3 = arr1 + arr2\n", 519 | "arr3\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 124, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "array([[ 1, 2, 3],\n", 531 | " [ 8, 10, 12],\n", 532 | " [21, 24, 27]])" 533 | ] 534 | }, 535 | "execution_count": 124, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "\n", 542 | "#矩阵相乘(对应索引元素相乘)\n", 543 | "arr4 = arr1 * arr2\n", 544 | "arr4" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 125, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "[[1 2]\n", 557 | " [4 5]\n", 558 | " [7 8]]\n", 559 | "[[1 1 1]\n", 560 | " [2 2 2]] \n", 561 | "\n", 562 | "[[ 5 5 5]\n", 563 | " [14 14 14]\n", 564 | " [23 23 23]]\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "## 矩阵点积(线性代数)\n", 570 | "arr1 = np.array([[1,2,],[4,5],[7,8]])\n", 571 | "arr2 = np.array([[1,1,1,],[2,2,2]])\n", 572 | "print(arr1)\n", 573 | "print(arr2,'\\n')\n", 574 | "\n", 575 | "#矩阵点积\n", 576 | "arr3 = arr1.dot(arr2)\n", 577 | "print(arr3)\n" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 126, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "array([[ 1, 2, 3, 4],\n", 589 | " [ 4, 5, 6, 7],\n", 590 | " [ 7, 8, 9, 10],\n", 591 | " [10, 11, 12, 13]])" 592 | ] 593 | }, 594 | "execution_count": 126, 595 | "metadata": {}, 596 | "output_type": "execute_result" 597 | } 598 | ], 599 | "source": [ 600 | "## 矩阵索引\n", 601 | "# 第一个索引取行,第二个索引取列\n", 602 | "arr1 = np.array([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]])\n", 603 | "arr1\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 127, 609 | "metadata": {}, 610 | "outputs": [ 611 | { 612 | "data": { 613 | "text/plain": [ 614 | "2" 615 | ] 616 | }, 617 | "execution_count": 127, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "#输出第0行第1列元素\n", 624 | "arr1[0,1]\n" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 128, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/plain": [ 635 | "array([4, 5, 6, 7])" 636 | ] 637 | }, 638 | "execution_count": 128, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "#输出第1行所有元素\n", 645 | "arr1[1]" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 129, 651 | "metadata": {}, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "array([ 2, 5, 8, 11])" 657 | ] 658 | }, 659 | "execution_count": 129, 660 | "metadata": {}, 661 | "output_type": 
"execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "#输出第1列所有元素\n", 666 | "arr1[:,1]" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 130, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/plain": [ 677 | "array([[ 7, 8, 9, 10],\n", 678 | " [10, 11, 12, 13]])" 679 | ] 680 | }, 681 | "execution_count": 130, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "#输出第2到第4行所有元素\n", 688 | "arr1[2:4]" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 131, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "array([[ 3, 4],\n", 700 | " [ 6, 7],\n", 701 | " [ 9, 10],\n", 702 | " [12, 13]])" 703 | ] 704 | }, 705 | "execution_count": 131, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "#输出第2到第4列所有元素\n", 712 | "arr1[:,2:4]" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 132, 718 | "metadata": {}, 719 | "outputs": [ 720 | { 721 | "data": { 722 | "text/plain": [ 723 | "array([[ 9, 10],\n", 724 | " [12, 13]])" 725 | ] 726 | }, 727 | "execution_count": 132, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "#输出第2到第4行&第2到第4列元素\n", 734 | "arr1[2:4,2:4]" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 133, 740 | "metadata": {}, 741 | "outputs": [ 742 | { 743 | "name": "stdout", 744 | "output_type": "stream", 745 | "text": [ 746 | "13\n", 747 | "1\n", 748 | "112\n", 749 | "7.0\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "## 矩阵聚合\n", 755 | "\n", 756 | "#矩阵全部聚合\n", 757 | "arr1 = np.array([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]])\n", 758 | "print(arr1.max()) #求矩阵最大值\n", 759 | "print(arr1.min()) #求矩阵最小值\n", 760 | "print(arr1.sum()) #矩阵求和\n", 761 | "print(arr1.mean()) #求矩阵均值\n" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 134, 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "name": "stdout", 771 | "output_type": "stream", 772 | "text": [ 773 | "arr1= [[ 1 2 3 4]\n", 774 | " [ 4 5 6 7]\n", 775 | " [ 7 8 9 10]\n", 776 | " [10 11 12 13]] \n", 777 | "\n", 778 | "7\n", 779 | "11\n" 780 | ] 781 | } 782 | ], 783 | "source": [ 784 | "#矩阵按行列聚合(参考按数组聚合)\n", 785 | "arr1 = np.array([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]])\n", 786 | "print('arr1=',arr1,'\\n')\n", 787 | "\n", 788 | "print(arr1[1].max()) #求矩阵第1行最大值\n", 789 | "print(arr1[:,1].max()) #求矩阵第1列最大值\n" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 135, 795 | "metadata": {}, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "arr1= [[1 2 3 4]\n", 802 | " [4 5 6 7]] \n", 803 | "\n", 804 | "arr1= [[1 4]\n", 805 | " [2 5]\n", 806 | " [3 6]\n", 807 | " [4 7]] \n", 808 | "\n" 809 | ] 810 | } 811 | ], 812 | "source": [ 813 | "## 矩阵转置(线性代数)\n", 814 | "arr1 = np.array([[1,2,3,4],[4,5,6,7]])\n", 815 | "print('arr1=',arr1,'\\n')\n", 816 | "\n", 817 | "arr2 = arr1.T\n", 818 | "print('arr1=',arr2,'\\n')" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 136, 824 | "metadata": {}, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "text/plain": [ 829 | "(4, 4)" 830 | ] 831 | }, 832 | "execution_count": 136, 833 | "metadata": {}, 834 | "output_type": "execute_result" 835 | } 836 | ], 837 | "source": [ 838 | "## 矩阵重塑\n", 839 | "# 查看矩阵维度\n", 840 | "arr1 = 
np.array([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]])\n", 841 | "arr1.shape" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": 137, 847 | "metadata": {}, 848 | "outputs": [ 849 | { 850 | "name": "stdout", 851 | "output_type": "stream", 852 | "text": [ 853 | "[[ 1 2 3 4 4 5 6 7]\n", 854 | " [ 7 8 9 10 10 11 12 13]] \n", 855 | "\n", 856 | "(2, 8)\n" 857 | ] 858 | } 859 | ], 860 | "source": [ 861 | "#将4行4列的矩阵重塑为2行8列\n", 862 | "arr2 = arr1.reshape(2,8)\n", 863 | "print(arr2,'\\n')\n", 864 | "print(arr2.shape)" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 138, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "array([[[ 1, 2],\n", 876 | " [ 3, 4]],\n", 877 | "\n", 878 | " [[ 4, 5],\n", 879 | " [ 6, 7]],\n", 880 | "\n", 881 | " [[ 7, 8],\n", 882 | " [ 9, 10]],\n", 883 | "\n", 884 | " [[10, 11],\n", 885 | " [12, 13]]])" 886 | ] 887 | }, 888 | "execution_count": 138, 889 | "metadata": {}, 890 | "output_type": "execute_result" 891 | } 892 | ], 893 | "source": [ 894 | "## 高纬数组\n", 895 | "# 例如构建3维数值\n", 896 | "arr1 = np.array([[[1,2],[3,4]],[[4,5],[6,7]],[[7,8],[9,10]],[[10,11],[12,13]]])\n", 897 | "arr1\n" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 139, 903 | "metadata": {}, 904 | "outputs": [ 905 | { 906 | "data": { 907 | "text/plain": [ 908 | "(4, 2, 2)" 909 | ] 910 | }, 911 | "execution_count": 139, 912 | "metadata": {}, 913 | "output_type": "execute_result" 914 | } 915 | ], 916 | "source": [ 917 | "arr1.shape" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": 140, 923 | "metadata": {}, 924 | "outputs": [], 925 | "source": [ 926 | "## 参考文档:\n", 927 | "#1) https://numpy.org/doc/stable/user/index.html\n", 928 | "#2) https://zhuanlan.zhihu.com/p/81815234" 929 | ] 930 | } 931 | ], 932 | "metadata": { 933 | "kernelspec": { 934 | "display_name": "Python 3", 935 | "language": "python", 936 | "name": "python3" 937 | }, 938 | "language_info": { 939 | "name": "python", 940 | "version": "3.11.8" 941 | } 942 | }, 943 | "nbformat": 4, 944 | "nbformat_minor": 2 945 | } 946 | -------------------------------------------------------------------------------- /06_Dataset_loading/06_01_Toy_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn.datasets import load_iris" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 14, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "{'data': array([[5.1, 3.5, 1.4, 0.2],\n", 21 | " [4.9, 3. , 1.4, 0.2],\n", 22 | " [4.7, 3.2, 1.3, 0.2],\n", 23 | " [4.6, 3.1, 1.5, 0.2],\n", 24 | " [5. , 3.6, 1.4, 0.2],\n", 25 | " [5.4, 3.9, 1.7, 0.4],\n", 26 | " [4.6, 3.4, 1.4, 0.3],\n", 27 | " [5. , 3.4, 1.5, 0.2],\n", 28 | " [4.4, 2.9, 1.4, 0.2],\n", 29 | " [4.9, 3.1, 1.5, 0.1],\n", 30 | " [5.4, 3.7, 1.5, 0.2],\n", 31 | " [4.8, 3.4, 1.6, 0.2],\n", 32 | " [4.8, 3. , 1.4, 0.1],\n", 33 | " [4.3, 3. , 1.1, 0.1],\n", 34 | " [5.8, 4. , 1.2, 0.2],\n", 35 | " [5.7, 4.4, 1.5, 0.4],\n", 36 | " [5.4, 3.9, 1.3, 0.4],\n", 37 | " [5.1, 3.5, 1.4, 0.3],\n", 38 | " [5.7, 3.8, 1.7, 0.3],\n", 39 | " [5.1, 3.8, 1.5, 0.3],\n", 40 | " [5.4, 3.4, 1.7, 0.2],\n", 41 | " [5.1, 3.7, 1.5, 0.4],\n", 42 | " [4.6, 3.6, 1. , 0.2],\n", 43 | " [5.1, 3.3, 1.7, 0.5],\n", 44 | " [4.8, 3.4, 1.9, 0.2],\n", 45 | " [5. 
, 3. , 1.6, 0.2],\n", 46 | " [5. , 3.4, 1.6, 0.4],\n", 47 | " [5.2, 3.5, 1.5, 0.2],\n", 48 | " [5.2, 3.4, 1.4, 0.2],\n", 49 | " [4.7, 3.2, 1.6, 0.2],\n", 50 | " [4.8, 3.1, 1.6, 0.2],\n", 51 | " [5.4, 3.4, 1.5, 0.4],\n", 52 | " [5.2, 4.1, 1.5, 0.1],\n", 53 | " [5.5, 4.2, 1.4, 0.2],\n", 54 | " [4.9, 3.1, 1.5, 0.2],\n", 55 | " [5. , 3.2, 1.2, 0.2],\n", 56 | " [5.5, 3.5, 1.3, 0.2],\n", 57 | " [4.9, 3.6, 1.4, 0.1],\n", 58 | " [4.4, 3. , 1.3, 0.2],\n", 59 | " [5.1, 3.4, 1.5, 0.2],\n", 60 | " [5. , 3.5, 1.3, 0.3],\n", 61 | " [4.5, 2.3, 1.3, 0.3],\n", 62 | " [4.4, 3.2, 1.3, 0.2],\n", 63 | " [5. , 3.5, 1.6, 0.6],\n", 64 | " [5.1, 3.8, 1.9, 0.4],\n", 65 | " [4.8, 3. , 1.4, 0.3],\n", 66 | " [5.1, 3.8, 1.6, 0.2],\n", 67 | " [4.6, 3.2, 1.4, 0.2],\n", 68 | " [5.3, 3.7, 1.5, 0.2],\n", 69 | " [5. , 3.3, 1.4, 0.2],\n", 70 | " [7. , 3.2, 4.7, 1.4],\n", 71 | " [6.4, 3.2, 4.5, 1.5],\n", 72 | " [6.9, 3.1, 4.9, 1.5],\n", 73 | " [5.5, 2.3, 4. , 1.3],\n", 74 | " [6.5, 2.8, 4.6, 1.5],\n", 75 | " [5.7, 2.8, 4.5, 1.3],\n", 76 | " [6.3, 3.3, 4.7, 1.6],\n", 77 | " [4.9, 2.4, 3.3, 1. ],\n", 78 | " [6.6, 2.9, 4.6, 1.3],\n", 79 | " [5.2, 2.7, 3.9, 1.4],\n", 80 | " [5. , 2. , 3.5, 1. ],\n", 81 | " [5.9, 3. , 4.2, 1.5],\n", 82 | " [6. , 2.2, 4. , 1. ],\n", 83 | " [6.1, 2.9, 4.7, 1.4],\n", 84 | " [5.6, 2.9, 3.6, 1.3],\n", 85 | " [6.7, 3.1, 4.4, 1.4],\n", 86 | " [5.6, 3. , 4.5, 1.5],\n", 87 | " [5.8, 2.7, 4.1, 1. ],\n", 88 | " [6.2, 2.2, 4.5, 1.5],\n", 89 | " [5.6, 2.5, 3.9, 1.1],\n", 90 | " [5.9, 3.2, 4.8, 1.8],\n", 91 | " [6.1, 2.8, 4. , 1.3],\n", 92 | " [6.3, 2.5, 4.9, 1.5],\n", 93 | " [6.1, 2.8, 4.7, 1.2],\n", 94 | " [6.4, 2.9, 4.3, 1.3],\n", 95 | " [6.6, 3. , 4.4, 1.4],\n", 96 | " [6.8, 2.8, 4.8, 1.4],\n", 97 | " [6.7, 3. , 5. , 1.7],\n", 98 | " [6. , 2.9, 4.5, 1.5],\n", 99 | " [5.7, 2.6, 3.5, 1. ],\n", 100 | " [5.5, 2.4, 3.8, 1.1],\n", 101 | " [5.5, 2.4, 3.7, 1. ],\n", 102 | " [5.8, 2.7, 3.9, 1.2],\n", 103 | " [6. , 2.7, 5.1, 1.6],\n", 104 | " [5.4, 3. , 4.5, 1.5],\n", 105 | " [6. , 3.4, 4.5, 1.6],\n", 106 | " [6.7, 3.1, 4.7, 1.5],\n", 107 | " [6.3, 2.3, 4.4, 1.3],\n", 108 | " [5.6, 3. , 4.1, 1.3],\n", 109 | " [5.5, 2.5, 4. , 1.3],\n", 110 | " [5.5, 2.6, 4.4, 1.2],\n", 111 | " [6.1, 3. , 4.6, 1.4],\n", 112 | " [5.8, 2.6, 4. , 1.2],\n", 113 | " [5. , 2.3, 3.3, 1. ],\n", 114 | " [5.6, 2.7, 4.2, 1.3],\n", 115 | " [5.7, 3. , 4.2, 1.2],\n", 116 | " [5.7, 2.9, 4.2, 1.3],\n", 117 | " [6.2, 2.9, 4.3, 1.3],\n", 118 | " [5.1, 2.5, 3. , 1.1],\n", 119 | " [5.7, 2.8, 4.1, 1.3],\n", 120 | " [6.3, 3.3, 6. , 2.5],\n", 121 | " [5.8, 2.7, 5.1, 1.9],\n", 122 | " [7.1, 3. , 5.9, 2.1],\n", 123 | " [6.3, 2.9, 5.6, 1.8],\n", 124 | " [6.5, 3. , 5.8, 2.2],\n", 125 | " [7.6, 3. , 6.6, 2.1],\n", 126 | " [4.9, 2.5, 4.5, 1.7],\n", 127 | " [7.3, 2.9, 6.3, 1.8],\n", 128 | " [6.7, 2.5, 5.8, 1.8],\n", 129 | " [7.2, 3.6, 6.1, 2.5],\n", 130 | " [6.5, 3.2, 5.1, 2. ],\n", 131 | " [6.4, 2.7, 5.3, 1.9],\n", 132 | " [6.8, 3. , 5.5, 2.1],\n", 133 | " [5.7, 2.5, 5. , 2. ],\n", 134 | " [5.8, 2.8, 5.1, 2.4],\n", 135 | " [6.4, 3.2, 5.3, 2.3],\n", 136 | " [6.5, 3. , 5.5, 1.8],\n", 137 | " [7.7, 3.8, 6.7, 2.2],\n", 138 | " [7.7, 2.6, 6.9, 2.3],\n", 139 | " [6. , 2.2, 5. , 1.5],\n", 140 | " [6.9, 3.2, 5.7, 2.3],\n", 141 | " [5.6, 2.8, 4.9, 2. ],\n", 142 | " [7.7, 2.8, 6.7, 2. ],\n", 143 | " [6.3, 2.7, 4.9, 1.8],\n", 144 | " [6.7, 3.3, 5.7, 2.1],\n", 145 | " [7.2, 3.2, 6. , 1.8],\n", 146 | " [6.2, 2.8, 4.8, 1.8],\n", 147 | " [6.1, 3. , 4.9, 1.8],\n", 148 | " [6.4, 2.8, 5.6, 2.1],\n", 149 | " [7.2, 3. 
, 5.8, 1.6],\n", 150 | " [7.4, 2.8, 6.1, 1.9],\n", 151 | " [7.9, 3.8, 6.4, 2. ],\n", 152 | " [6.4, 2.8, 5.6, 2.2],\n", 153 | " [6.3, 2.8, 5.1, 1.5],\n", 154 | " [6.1, 2.6, 5.6, 1.4],\n", 155 | " [7.7, 3. , 6.1, 2.3],\n", 156 | " [6.3, 3.4, 5.6, 2.4],\n", 157 | " [6.4, 3.1, 5.5, 1.8],\n", 158 | " [6. , 3. , 4.8, 1.8],\n", 159 | " [6.9, 3.1, 5.4, 2.1],\n", 160 | " [6.7, 3.1, 5.6, 2.4],\n", 161 | " [6.9, 3.1, 5.1, 2.3],\n", 162 | " [5.8, 2.7, 5.1, 1.9],\n", 163 | " [6.8, 3.2, 5.9, 2.3],\n", 164 | " [6.7, 3.3, 5.7, 2.5],\n", 165 | " [6.7, 3. , 5.2, 2.3],\n", 166 | " [6.3, 2.5, 5. , 1.9],\n", 167 | " [6.5, 3. , 5.2, 2. ],\n", 168 | " [6.2, 3.4, 5.4, 2.3],\n", 169 | " [5.9, 3. , 5.1, 1.8]]),\n", 170 | " 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 171 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 172 | " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 173 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 174 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 175 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 176 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),\n", 177 | " 'frame': None,\n", 178 | " 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='\n", 632 | "\n", 645 | "\n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | "
[tag-stripped text/html output removed: HTML render of the iris feature DataFrame returned with as_frame=True, 150 rows × 4 columns (sepal length, sepal width, petal length, petal width); the equivalent text/plain output follows]
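A minimal sketch of the loading style this notebook's summary recommends, extended with a typical train/test split; the split itself is an editor's assumption and is not shown in the notebook.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True, as_frame=True)  # X: 150x4 DataFrame, y: Series
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0  # illustrative choices
)
print(X_train.shape, X_test.shape)  # (120, 4) (30, 4)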
\n", 736 | "" 737 | ], 738 | "text/plain": [ 739 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", 740 | "0 5.1 3.5 1.4 0.2\n", 741 | "1 4.9 3.0 1.4 0.2\n", 742 | "2 4.7 3.2 1.3 0.2\n", 743 | "3 4.6 3.1 1.5 0.2\n", 744 | "4 5.0 3.6 1.4 0.2\n", 745 | ".. ... ... ... ...\n", 746 | "145 6.7 3.0 5.2 2.3\n", 747 | "146 6.3 2.5 5.0 1.9\n", 748 | "147 6.5 3.0 5.2 2.0\n", 749 | "148 6.2 3.4 5.4 2.3\n", 750 | "149 5.9 3.0 5.1 1.8\n", 751 | "\n", 752 | "[150 rows x 4 columns]" 753 | ] 754 | }, 755 | "execution_count": 20, 756 | "metadata": {}, 757 | "output_type": "execute_result" 758 | } 759 | ], 760 | "source": [ 761 | "## as_frame参数 - 返回pandas DataFrame结构。\n", 762 | "data,target = load_iris(return_X_y=True,as_frame=True)\n", 763 | "\n", 764 | "#查看data\n", 765 | "data" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 21, 771 | "metadata": {}, 772 | "outputs": [ 773 | { 774 | "data": { 775 | "text/plain": [ 776 | "0 0\n", 777 | "1 0\n", 778 | "2 0\n", 779 | "3 0\n", 780 | "4 0\n", 781 | " ..\n", 782 | "145 2\n", 783 | "146 2\n", 784 | "147 2\n", 785 | "148 2\n", 786 | "149 2\n", 787 | "Name: target, Length: 150, dtype: int32" 788 | ] 789 | }, 790 | "execution_count": 21, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "#查看target\n", 797 | "target" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 22, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [ 806 | "## 总结\n", 807 | "\n", 808 | "#1、推荐使用 load_iris(return_X_y=True,as_frame=True)方式加载数据,直接返回pandas DataFrame格式的一对数据与标签。\n", 809 | "#2、其他数据集请参考 https://scikitlearn.com.cn/0.21.3/47/#62" 810 | ] 811 | } 812 | ], 813 | "metadata": { 814 | "kernelspec": { 815 | "display_name": "Python 3", 816 | "language": "python", 817 | "name": "python3" 818 | }, 819 | "language_info": { 820 | "codemirror_mode": { 821 | "name": "ipython", 822 | "version": 3 823 | }, 824 | "file_extension": ".py", 825 | "mimetype": "text/x-python", 826 | "name": "python", 827 | "nbconvert_exporter": "python", 828 | "pygments_lexer": "ipython3", 829 | "version": "3.11.8" 830 | } 831 | }, 832 | "nbformat": 4, 833 | "nbformat_minor": 2 834 | } 835 | -------------------------------------------------------------------------------- /06_Dataset_loading/06_04_load_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 57, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 58, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " sepal_length sepal_width petal_length petal_width species\n", 92 | "0 5.1 3.5 1.4 0.2 setosa\n", 93 | "1 4.9 3.0 1.4 0.2 setosa\n", 94 | "2 4.7 3.2 1.3 0.2 setosa\n", 95 | "3 4.6 3.1 1.5 0.2 setosa\n", 96 | "4 5.0 3.6 1.4 0.2 setosa" 97 | ] 98 | }, 99 | "execution_count": 58, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "### 6.4.1 加载csv格式数据\n", 106 | "\n", 107 | "### 读取文件,默认参数\n", 108 | "iris_data = pd.read_csv(\"../_Datasets/iris.data\") #sep指定字段分隔符,默认逗号。\n", 109 | "iris_data.head()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 59, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
\n", 121 | "\n", 134 | "\n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " sepal_length sepal_width petal_length petal_width species\n", 192 | "0 5.1 3.5 1.4 0.2 setosa\n", 193 | "1 4.9 3.0 1.4 0.2 setosa\n", 194 | "2 4.7 3.2 1.3 0.2 setosa\n", 195 | "3 4.6 3.1 1.5 0.2 setosa\n", 196 | "4 5.0 3.6 1.4 0.2 setosa" 197 | ] 198 | }, 199 | "execution_count": 59, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "### sep\n", 206 | "#读取文件,指定字段分隔符\n", 207 | "iris_data = pd.read_csv(\"../_Datasets/iris.data\",sep=',') \n", 208 | "iris_data.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 60, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/html": [ 219 | "
\n", 220 | "\n", 233 | "\n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 287 | "
" 288 | ], 289 | "text/plain": [ 290 | " sepal_length sepal_width petal_length petal_width species\n", 291 | "0 5.1 3.5 1.4 0.2 setosa\n", 292 | "1 4.9 3.0 1.4 0.2 setosa\n", 293 | "2 4.7 3.2 1.3 0.2 setosa\n", 294 | "3 4.6 3.1 1.5 0.2 setosa\n", 295 | "4 5.0 3.6 1.4 0.2 setosa" 296 | ] 297 | }, 298 | "execution_count": 60, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "### encoding\n", 305 | "#读取文件,指定编码\n", 306 | "#如果出现中文乱码,可更换为中文编码\"gb2312\"或\"gbk\"\n", 307 | "iris_data = pd.read_csv(\"../_Datasets/iris.data\", encoding=\"utf-8\") \n", 308 | "iris_data.head()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 61, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/html": [ 319 | "
\n", 320 | "\n", 333 | "\n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | "
5.13.51.40.2setosa
04.93.01.40.2setosa
14.73.21.30.2setosa
24.63.11.50.2setosa
35.03.61.40.2setosa
45.43.91.70.4setosa
\n", 387 | "
" 388 | ], 389 | "text/plain": [ 390 | " 5.1 3.5 1.4 0.2 setosa\n", 391 | "0 4.9 3.0 1.4 0.2 setosa\n", 392 | "1 4.7 3.2 1.3 0.2 setosa\n", 393 | "2 4.6 3.1 1.5 0.2 setosa\n", 394 | "3 5.0 3.6 1.4 0.2 setosa\n", 395 | "4 5.4 3.9 1.7 0.4 setosa" 396 | ] 397 | }, 398 | "execution_count": 61, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "### header\n", 405 | "## 默认header,默认取第0行作为header\n", 406 | "# 注意对比iris.data和iris_noheader.data2个原始数据的差异\n", 407 | "iris_data2 = pd.read_csv(\"../_Datasets/iris_noheader.data\") \n", 408 | "iris_data2.head()" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 62, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/html": [ 419 | "
\n", 420 | "\n", 433 | "\n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | "
5.03.61.40.2setosa
05.43.91.70.4setosa
14.63.41.40.3setosa
25.03.41.50.2setosa
34.42.91.40.2setosa
44.93.11.50.1setosa
\n", 487 | "
" 488 | ], 489 | "text/plain": [ 490 | " 5.0 3.6 1.4 0.2 setosa\n", 491 | "0 5.4 3.9 1.7 0.4 setosa\n", 492 | "1 4.6 3.4 1.4 0.3 setosa\n", 493 | "2 5.0 3.4 1.5 0.2 setosa\n", 494 | "3 4.4 2.9 1.4 0.2 setosa\n", 495 | "4 4.9 3.1 1.5 0.1 setosa" 496 | ] 497 | }, 498 | "execution_count": 62, 499 | "metadata": {}, 500 | "output_type": "execute_result" 501 | } 502 | ], 503 | "source": [ 504 | "## 指定header\n", 505 | "#header参数可以是一个list例如:[0,1,3],这个list表示将文件中的这些行作为列标题(意味着每一列有多个标题),介于中间的行将被忽略掉\n", 506 | "#注意:真实数据很多时候没有那么规范,且存在多级标题的可能性。\n", 507 | "iris_data3 = pd.read_csv(\"../_Datasets/iris_noheader.data\",header=4) #取第1行作为header,上面的行抛弃\n", 508 | "iris_data3.head()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 63, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "(150, 5)\n", 521 | "(149, 5)\n", 522 | "(145, 5)\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "#作为header的行,不再作为数据行。以下可知iris_data2的数据少了1行,iris_data3的数据少了5行\n", 528 | "print(iris_data.shape)\n", 529 | "print(iris_data2.shape)\n", 530 | "print(iris_data3.shape)" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 64, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/html": [ 541 | "
\n", 542 | "\n", 555 | "\n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | "
01234
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 609 | "
" 610 | ], 611 | "text/plain": [ 612 | " 0 1 2 3 4\n", 613 | "0 5.1 3.5 1.4 0.2 setosa\n", 614 | "1 4.9 3.0 1.4 0.2 setosa\n", 615 | "2 4.7 3.2 1.3 0.2 setosa\n", 616 | "3 4.6 3.1 1.5 0.2 setosa\n", 617 | "4 5.0 3.6 1.4 0.2 setosa" 618 | ] 619 | }, 620 | "execution_count": 64, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "## 不指定header,默认给1个列名\n", 627 | "iris_data4 = pd.read_csv(\"../_Datasets/iris_noheader.data\",header=None) \n", 628 | "iris_data4.head()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 65, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "(150, 5)" 640 | ] 641 | }, 642 | "execution_count": 65, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "iris_data4.shape" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 66, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "data": { 658 | "text/html": [ 659 | "
\n", 660 | "\n", 673 | "\n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 727 | "
" 728 | ], 729 | "text/plain": [ 730 | " sepal_length sepal_width petal_length petal_width species\n", 731 | "0 5.1 3.5 1.4 0.2 setosa\n", 732 | "1 4.9 3.0 1.4 0.2 setosa\n", 733 | "2 4.7 3.2 1.3 0.2 setosa\n", 734 | "3 4.6 3.1 1.5 0.2 setosa\n", 735 | "4 5.0 3.6 1.4 0.2 setosa" 736 | ] 737 | }, 738 | "execution_count": 66, 739 | "metadata": {}, 740 | "output_type": "execute_result" 741 | } 742 | ], 743 | "source": [ 744 | "### names \n", 745 | "#指定列名\n", 746 | "#iris_noheader.data没有字段名,默认第0行是字段名,但实际第0行是数据行,这样有问题,需要设置字段名。\n", 747 | "#例如把字段名设置为[sepal_length,sepal_width,petal_length,petal_width,species]\n", 748 | "iris_data5 = pd.read_csv(\"../_Datasets/iris_noheader.data\",\n", 749 | " names=[\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\",\"species\"]) \n", 750 | "iris_data5.head()\n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 67, 756 | "metadata": {}, 757 | "outputs": [ 758 | { 759 | "data": { 760 | "text/html": [ 761 | "
\n", 762 | "\n", 775 | "\n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 829 | "
" 830 | ], 831 | "text/plain": [ 832 | " sepal_length sepal_width petal_length petal_width species\n", 833 | "0 5.1 3.5 1.4 0.2 setosa\n", 834 | "1 4.9 3.0 1.4 0.2 setosa\n", 835 | "2 4.7 3.2 1.3 0.2 setosa\n", 836 | "3 4.6 3.1 1.5 0.2 setosa\n", 837 | "4 5.0 3.6 1.4 0.2 setosa" 838 | ] 839 | }, 840 | "execution_count": 67, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "### dtypes \n", 847 | "#指定数据格式, dtype是字典结构\n", 848 | "iris_data6 = pd.read_csv(\"../_Datasets/iris_noheader.data\",\n", 849 | " names=[\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\",\"species\"],\n", 850 | " dtype={\"sepal_length\":float}) \n", 851 | "iris_data6.head()" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": 68, 857 | "metadata": {}, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/html": [ 862 | "
\n", 863 | "\n", 876 | "\n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 930 | "
" 931 | ], 932 | "text/plain": [ 933 | " sepal_length sepal_width petal_length petal_width species\n", 934 | "0 5.1 3.5 1.4 0.2 setosa\n", 935 | "1 4.9 3.0 1.4 0.2 setosa\n", 936 | "2 4.7 3.2 1.3 0.2 setosa\n", 937 | "3 4.6 3.1 1.5 0.2 setosa\n", 938 | "4 5.0 3.6 1.4 0.2 setosa" 939 | ] 940 | }, 941 | "execution_count": 68, 942 | "metadata": {}, 943 | "output_type": "execute_result" 944 | } 945 | ], 946 | "source": [ 947 | "### nrows\n", 948 | "#设置读取行数,对大文件可部分读取。\n", 949 | "iris_data7 = pd.read_csv(\"../_Datasets/iris.data\",nrows=50) \n", 950 | "iris_data7.head()" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 69, 956 | "metadata": {}, 957 | "outputs": [ 958 | { 959 | "data": { 960 | "text/plain": [ 961 | "(50, 5)" 962 | ] 963 | }, 964 | "execution_count": 69, 965 | "metadata": {}, 966 | "output_type": "execute_result" 967 | } 968 | ], 969 | "source": [ 970 | "iris_data7.shape" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 70, 976 | "metadata": {}, 977 | "outputs": [ 978 | { 979 | "data": { 980 | "text/html": [ 981 | "
\n", 982 | "\n", 995 | "\n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | "
5.03.31.40.2setosa
07.03.24.71.4versicolor
16.43.24.51.5versicolor
26.93.14.91.5versicolor
35.52.34.01.3versicolor
46.52.84.61.5versicolor
\n", 1049 | "
" 1050 | ], 1051 | "text/plain": [ 1052 | " 5.0 3.3 1.4 0.2 setosa\n", 1053 | "0 7.0 3.2 4.7 1.4 versicolor\n", 1054 | "1 6.4 3.2 4.5 1.5 versicolor\n", 1055 | "2 6.9 3.1 4.9 1.5 versicolor\n", 1056 | "3 5.5 2.3 4.0 1.3 versicolor\n", 1057 | "4 6.5 2.8 4.6 1.5 versicolor" 1058 | ] 1059 | }, 1060 | "execution_count": 70, 1061 | "metadata": {}, 1062 | "output_type": "execute_result" 1063 | } 1064 | ], 1065 | "source": [ 1066 | "### skiprows \n", 1067 | "#从头部跳过多少行。注意先过滤,然后再确定表头\n", 1068 | "iris_data8 = pd.read_csv(\"../_Datasets/iris.data\",skiprows=50) \n", 1069 | "iris_data8.head()" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": 71, 1075 | "metadata": {}, 1076 | "outputs": [ 1077 | { 1078 | "data": { 1079 | "text/plain": [ 1080 | "(100, 5)" 1081 | ] 1082 | }, 1083 | "execution_count": 71, 1084 | "metadata": {}, 1085 | "output_type": "execute_result" 1086 | } 1087 | ], 1088 | "source": [ 1089 | "iris_data8.shape" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": 75, 1095 | "metadata": {}, 1096 | "outputs": [ 1097 | { 1098 | "data": { 1099 | "text/html": [ 1100 | "
\n", 1101 | "\n", 1114 | "\n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", 1168 | "
" 1169 | ], 1170 | "text/plain": [ 1171 | " sepal_length sepal_width petal_length petal_width species\n", 1172 | "0 5.1 3.5 1.4 0.2 setosa\n", 1173 | "1 4.9 3.0 1.4 0.2 setosa\n", 1174 | "2 4.7 3.2 1.3 0.2 setosa\n", 1175 | "3 4.6 3.1 1.5 0.2 setosa\n", 1176 | "4 5.0 3.6 1.4 0.2 setosa" 1177 | ] 1178 | }, 1179 | "execution_count": 75, 1180 | "metadata": {}, 1181 | "output_type": "execute_result" 1182 | } 1183 | ], 1184 | "source": [ 1185 | "### skipfooter \n", 1186 | "#从文件末尾过滤行\n", 1187 | "iris_data9 = pd.read_csv(\"../_Datasets/iris.data\",skipfooter=50,engine=\"python\") \n", 1188 | "iris_data9.head()" 1189 | ] 1190 | }, 1191 | { 1192 | "cell_type": "code", 1193 | "execution_count": 73, 1194 | "metadata": {}, 1195 | "outputs": [ 1196 | { 1197 | "data": { 1198 | "text/plain": [ 1199 | "(100, 5)" 1200 | ] 1201 | }, 1202 | "execution_count": 73, 1203 | "metadata": {}, 1204 | "output_type": "execute_result" 1205 | } 1206 | ], 1207 | "source": [ 1208 | "iris_data9.shape" 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "code", 1213 | "execution_count": 74, 1214 | "metadata": {}, 1215 | "outputs": [], 1216 | "source": [ 1217 | "## 总结\n", 1218 | "\n", 1219 | "# 1、scikit-learn使用任何存储为numpy数组或者scipy稀疏数组的数值数据。 其他可以转化成数值数组的类型也可以接受,如pandas中的DataFrame。\n", 1220 | "# 2、本样例主要讲解最基本的pandas.io 的read_csv导入csv格式的数据集,此外还支持json、html、sql、html等格式数据;\n", 1221 | "# 3、pandas.io参考 https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html\n", 1222 | "# 4、推荐的一些将标准纵列形式的数据转换为scikit-learn可以使用的格式的方法:\n", 1223 | "# pandas.io、scipy.io 、numpy/routines.io、datasets.load_svmlight_file、datasets.load_files 。\n", 1224 | "# 对于一些杂项数据,例如图像,视屏,音频。您可以参考:skimage.io 、 Imageio 、scipy.misc.imread和scipy.io.wavfile.read。\n", 1225 | "# 5、pandas.read_csv在读取文件时有很多多文件内容的操作方法,但是一般建议先读进DataFrame再处理,更加方便。 \n" 1226 | ] 1227 | } 1228 | ], 1229 | "metadata": { 1230 | "kernelspec": { 1231 | "display_name": "Python 3", 1232 | "language": "python", 1233 | "name": "python3" 1234 | }, 1235 | "language_info": { 1236 | "codemirror_mode": { 1237 | "name": "ipython", 1238 | "version": 3 1239 | }, 1240 | "file_extension": ".py", 1241 | "mimetype": "text/x-python", 1242 | "name": "python", 1243 | "nbconvert_exporter": "python", 1244 | "pygments_lexer": "ipython3", 1245 | "version": "3.11.8" 1246 | } 1247 | }, 1248 | "nbformat": 4, 1249 | "nbformat_minor": 2 1250 | } 1251 | --------------------------------------------------------------------------------