├── Report.pdf ├── 9_集成学习.ipynb └── 3_逻辑回归.ipynb /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TavinWang/Machine-Learning/HEAD/Report.pdf -------------------------------------------------------------------------------- /9_集成学习.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d09eb5dc", 6 | "metadata": {}, 7 | "source": [ 8 | "## 使用随机森林的方法解决手写数字识别问题" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "1a7e1688", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.datasets import load_digits\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from sklearn.ensemble import RandomForestClassifier\n", 21 | "from sklearn.metrics import accuracy_score, confusion_matrix\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from sklearn.datasets import fetch_openml\n", 24 | "import numpy as np\n", 25 | "\n", 26 | "plt.rcParams['font.sans-serif'] = ['SimHei']" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "7a5bac87", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Accuracy: 0.9666666666666667\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "digits = load_digits()\n", 45 | "X, y = digits.data, digits.target\n", 46 | "\n", 47 | "# 数据集划分\n", 48 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1017)\n", 49 | "\n", 50 | "# 创建随机森林分类器\n", 51 | "rf_classifier = RandomForestClassifier(n_estimators=100, random_state=1017)\n", 52 | "\n", 53 | "rf_classifier.fit(X_train, y_train)\n", 54 | "\n", 55 | "y_pred = rf_classifier.predict(X_test)\n", 56 | "\n", 57 | "# 计算准确率\n", 58 | "accuracy = accuracy_score(y_test, y_pred)\n", 59 | "print(f'Accuracy: {accuracy}')" 60 | ] 61 | }, 62 | 
{ 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "88ac4339", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "image/png": "", 71 | "text/plain": [ 72 | "
" 73 | ] 74 | }, 75 | "metadata": {}, 76 | "output_type": "display_data" 77 | } 78 | ], 79 | "source": [ 80 | "import seaborn as sns\n", 81 | "\n", 82 | "# 计算混淆矩阵\n", 83 | "cm = confusion_matrix(y_test, y_pred)\n", 84 | "\n", 85 | "# 使用Seaborn库绘制热力图\n", 86 | "plt.figure(figsize=(8, 6))\n", 87 | "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", xticklabels=digits.target_names, yticklabels=digits.target_names)\n", 88 | "plt.title('混淆矩阵')\n", 89 | "plt.xlabel('预测值')\n", 90 | "plt.ylabel('真实值')\n", 91 | "plt.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "d5a9a869", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "image/png": "", 103 | "text/plain": [ 104 | "
" 105 | ] 106 | }, 107 | "metadata": {}, 108 | "output_type": "display_data" 109 | } 110 | ], 111 | "source": [ 112 | "# 可视化一些样本的预测结果\n", 113 | "n_samples = 10\n", 114 | "indices = np.random.choice(len(X_test), n_samples, replace=False)\n", 115 | "\n", 116 | "plt.figure(figsize=(10, 5))\n", 117 | "for i, index in enumerate(indices):\n", 118 | " plt.subplot(2, n_samples//2, i + 1)\n", 119 | " plt.imshow(X_test[index].reshape(8, 8), cmap='gray', interpolation='none')\n", 120 | " plt.title(f'真实值: {y_test[index]}\\n预测值: {y_pred[index]}', fontsize=10)\n", 121 | " plt.axis('off')\n", 122 | "\n", 123 | "plt.tight_layout()\n", 124 | "plt.show()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "2dc1a654", 130 | "metadata": {}, 131 | "source": [ 132 | "调整n_estimators参数" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 20, 138 | "id": "0bf0177b", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Accuracy: 0.9465\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "rf_classifier_1 = RandomForestClassifier(n_estimators=10, random_state=1017)\n", 151 | "\n", 152 | "rf_classifier_1.fit(X_train, y_train)\n", 153 | "\n", 154 | "y_pred = rf_classifier_1.predict(X_test)\n", 155 | "\n", 156 | "accuracy = accuracy_score(y_test, y_pred)\n", 157 | "print(f'Accuracy: {accuracy}')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 22, 163 | "id": "b58fb885", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Accuracy: 0.9660714285714286\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "rf_classifier_2 = RandomForestClassifier(n_estimators=50, random_state=1017)\n", 176 | "\n", 177 | "rf_classifier_2.fit(X_train, y_train)\n", 178 | "\n", 179 | "y_pred = rf_classifier_2.predict(X_test)\n", 180 | "\n", 181 | "accuracy = accuracy_score(y_test, y_pred)\n", 182 | 
"print(f'Accuracy: {accuracy}')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "dc3e8568", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3 (ipykernel)", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.11.5" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 5 215 | } 216 | -------------------------------------------------------------------------------- /3_逻辑回归.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f62b57e3", 6 | "metadata": {}, 7 | "source": [ 8 | "# 逻辑回归" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a0584555", 14 | "metadata": {}, 15 | "source": [ 16 | "## (1)梯度下降实现逻辑回归" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 386, 22 | "id": "89b1ae22", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import pandas as pd\n", 29 | "plt.rcParams['font.sans-serif'] = ['SimHei']" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 387, 35 | "id": "199bbe8d", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# def sigmoid(z):\n", 40 | "# return 1/(1+np.exp(-z))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 388, 46 | "id": "117637fc", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# def gradient_descent(X, y, rate, iterations=10000):\n", 51 | "# m = len(y) \n", 52 | "# X = np.c_[np.ones(m), X] # 
添加常数项\n", 53 | "# theta = np.zeros(X.shape[1]) # 初始化参数,包括截距\n", 54 | "# cost_record = np.zeros(iterations) # 用来记录cost function的变化\n", 55 | " \n", 56 | "# for i in range(iterations):\n", 57 | "# h = sigmoid(np.dot(X, theta))\n", 58 | " \n", 59 | "# # 计算梯度\n", 60 | "# gradient = np.dot(X.T, (h-y))\n", 61 | " \n", 62 | "# # 更新参数\n", 63 | "# theta = theta - rate * gradient\n", 64 | " \n", 65 | "# # 记录cost function\n", 66 | "# cost = -np.dot(y.T, np.log(sigmoid(np.dot(X, theta)))) - np.dot((1-y).T, np.log(1-sigmoid(np.dot(X, theta))))\n", 67 | "# cost_record[i] = cost\n", 68 | " \n", 69 | "# return theta, cost_record" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 462, 75 | "id": "67d2a9f1", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "class LogisticRegression:\n", 80 | " def __init__(self, learning_rate=0.01, num_iterations=10000):\n", 81 | " self.learning_rate = learning_rate\n", 82 | " self.num_iterations = num_iterations\n", 83 | " self.weights = None\n", 84 | " self.bias = None\n", 85 | " self.loss_record = []\n", 86 | " self.gradient_record = []\n", 87 | " self.accuracy_record = []\n", 88 | "\n", 89 | " def sigmoid(self, z):\n", 90 | " return 1 / (1 + np.exp(-z))\n", 91 | "\n", 92 | " def fit(self, X, y):\n", 93 | " m, n = X.shape\n", 94 | " self.weights = np.zeros(n)\n", 95 | " self.bias = 0\n", 96 | "\n", 97 | " for iteration in range(self.num_iterations):\n", 98 | " # 计算预测值\n", 99 | " z = np.dot(X, self.weights) + self.bias\n", 100 | " predictions = self.sigmoid(z)\n", 101 | "\n", 102 | " # 计算损失函数\n", 103 | " cost = -1/m * np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))\n", 104 | " self.loss_record.append(cost)\n", 105 | "\n", 106 | " # 计算梯度\n", 107 | " dw = 1/m * np.dot(X.T, (predictions - y))\n", 108 | "\n", 109 | " db = 1/m * np.sum((predictions - y))\n", 110 | " self.gradient_record.append(np.linalg.norm(dw))\n", 111 | " \n", 112 | " # 计算准确率\n", 113 | " accuracy = 
self.compute_accuracy(predictions.round(), y)\n", 114 | " self.accuracy_record.append(accuracy)\n", 115 | "\n", 116 | " # 更新参数\n", 117 | " self.weights -= (self.learning_rate * dw)\n", 118 | " self.bias -= self.learning_rate * db\n", 119 | " \n", 120 | " return self.loss_record, self.gradient_record\n", 121 | " \n", 122 | " def compute_accuracy(self, predictions, y):\n", 123 | " accuracy = np.mean(predictions == y)\n", 124 | " return accuracy\n", 125 | "\n", 126 | " def predict(self, X):\n", 127 | " z = np.dot(X, self.weights) + self.bias\n", 128 | " predictions = self.sigmoid(z)\n", 129 | " return (predictions > 0.5).astype(int)\n", 130 | " \n", 131 | " def plot_accuracy(self):\n", 132 | " plt.plot(range(1, self.num_iterations + 1), self.accuracy_record, marker='o', color='green')\n", 133 | " plt.title('Accuracy over iterations')\n", 134 | " plt.xlabel('Iterations')\n", 135 | " plt.ylabel('Accuracy')\n", 136 | " plt.show()\n", 137 | " \n", 138 | " def get_theta(self):\n", 139 | " return self.weights, self.bias\n", 140 | " " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 390, 146 | "id": "5217314d", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "def draw(num_iterations, loss_record, gradient_norm_record):\n", 151 | " plt.figure(figsize=(10, 4))\n", 152 | " plt.subplot(1, 2, 1)\n", 153 | " plt.plot(range(num_iterations), loss_record)\n", 154 | " plt.xlabel('迭代次数')\n", 155 | " plt.ylabel('损失函数值')\n", 156 | " plt.title('损失函数变化')\n", 157 | " \n", 158 | " plt.subplot(1, 2, 2)\n", 159 | " plt.plot(range(num_iterations), gradient_norm_record)\n", 160 | " plt.xlabel('迭代次数')\n", 161 | " plt.ylabel('梯度模')\n", 162 | " plt.title('梯度模变化')\n", 163 | " \n", 164 | " plt.tight_layout()\n", 165 | " plt.show()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "4cbc80d1", 171 | "metadata": {}, 172 | "source": [ 173 | "## (2)鸢尾花数据集" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "9163bc8a", 179 | 
"metadata": {}, 180 | "source": [ 181 | "iris 数据中有三个标签,需要考虑如何使用二分类算法解决多分类问题" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 391, 187 | "id": "2aa898ce", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "from sklearn.datasets import load_iris\n", 192 | "from sklearn.model_selection import train_test_split\n", 193 | "from sklearn.metrics import accuracy_score" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 392, 199 | "id": "601a126f", 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "(150, 4)\n", 207 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n", 208 | "['setosa' 'versicolor' 'virginica']\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "# 加载iris数据集\n", 214 | "iris = load_iris()\n", 215 | "X = iris.data\n", 216 | "y = iris.target\n", 217 | "print(X.shape)\n", 218 | "\n", 219 | "feature_names = iris.feature_names\n", 220 | "print(feature_names)\n", 221 | "\n", 222 | "target_names = iris.target_names\n", 223 | "print(target_names)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 393, 229 | "id": "8e773e11", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.preprocessing import StandardScaler\n", 234 | "# 标准化\n", 235 | "scaler = StandardScaler()\n", 236 | "X_scaled = scaler.fit_transform(X)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "id": "35f21907", 242 | "metadata": {}, 243 | "source": [ 244 | "### OvO" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "6a48a69d", 250 | "metadata": {}, 251 | "source": [ 252 | "思路:训练三个分类器,每个分类器对两个类别进行分类,最后通过三个分类器的结果“投票”决定划分的类别" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "id": "32339e21", 258 | "metadata": {}, 259 | "source": [ 260 | "#### 分类器1(setosa vs. 
versicolor)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 394, 266 | "id": "b60d3837", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=66)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 395, 276 | "id": "ef080a5d", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "indices_01 = (y_train == 0) | (y_train == 1) # boolean mask of samples whose class is 0 or 1\n", 281 | "indices_01_t = (y_test == 0) | (y_test == 1)\n", 282 | "\n", 283 | "X_train_1 = X_train[indices_01]\n", 284 | "X_test_1 = X_test[indices_01_t]\n", 285 | "\n", 286 | "y_train_1 = y_train[indices_01]\n", 287 | "y_test_1 = y_test[indices_01_t]" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "5d902334", 293 | "metadata": {}, 294 | "source": [ 295 | "下面使用编写的梯度下降逻辑回归训练" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 463, 301 | "id": "bf8acf69", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "[1 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 0 0]\n", 309 | "[1 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 0 0]\n" 310 | ] 311 | } 312 | ], 313 | "source": [ 314 | "if __name__ == \"__main__\":\n", 315 | " # instantiate and train the hand-written logistic regression model\n", 316 | " model_1 = LogisticRegression()\n", 317 | " ls_1, gr_1 = model_1.fit(X_train_1, y_train_1)\n", 318 | "\n", 319 | "\n", 320 | " print(model_1.predict(X_test_1))\n", 321 | " print(y_test_1)\n", 322 | " \n", 323 | " theta = model_1.get_theta()\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 464, 329 | "id": "1094df79", 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "(array([ 1.02352992, -2.24077983, 2.08131386, 1.89977742]),\n", 336 | " 2.104288827931714)" 337 | ] 338 | }, 339 | "execution_count": 464, 340 | "metadata": {}, 341 | "output_type": 
"execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "theta" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "id": "1619b4dd", 351 | "metadata": {}, 352 | "source": [ 353 | "模型在测试集上分类完全正确!!!" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 465, 359 | "id": "c78bd14b", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "image/png": "", 365 | "text/plain": [ 366 | "
" 367 | ] 368 | }, 369 | "metadata": {}, 370 | "output_type": "display_data" 371 | } 372 | ], 373 | "source": [ 374 | "draw(len(ls_1), ls_1, gr_1)  # plot the loss / gradient-norm history returned by model_1.fit" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 398, 380 | "id": "e8792b72", 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "image/png": "", 386 | "text/plain": [ 387 | "
" 388 | ] 389 | }, 390 | "metadata": {}, 391 | "output_type": "display_data" 392 | } 393 | ], 394 | "source": [ 395 | "model_1.plot_accuracy() # plot the accuracy curve" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "id": "8a74cc7a", 401 | "metadata": {}, 402 | "source": [ 403 | "使用Sklearn包验证" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 456, 409 | "id": "00cf077f", 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "coefficients: [[ 0.93436309 -1.35444543 1.58684153 1.50716883]]\n", 417 | "intercept: [2.25128365]\n", 418 | "预测结果: [1 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 0 0]\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "from sklearn.linear_model import LogisticRegression  # NOTE: rebinds the name LogisticRegression, replacing the hand-written class for every cell executed after this one\n", 424 | "model_s1 = LogisticRegression()\n", 425 | "model_s1.fit(X_train_1, y_train_1)\n", 426 | "weights = model_s1.coef_\n", 427 | "intercept = model_s1.intercept_\n", 428 | "\n", 429 | "# print the fitted parameters and the sklearn model's own predictions\n", 430 | "print(\"coefficients:\", weights)\n", 431 | "print(\"intercept:\", intercept)\n", 432 | "print(\"预测结果:\", model_s1.predict(X_test_1))" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "id": "bca8eedc", 438 | "metadata": {}, 439 | "source": [ 440 | "#### 分类器2(setosa vs. 
virginica)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 400, 446 | "id": "6591c6db", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "indices_02 = (y_train == 0) | (y_train == 2) # boolean mask of samples whose class is 0 or 2\n", 451 | "indices_02_t = (y_test == 0) | (y_test == 2)\n", 452 | "\n", 453 | "X_train_2 = X_train[indices_02]\n", 454 | "X_test_2 = X_test[indices_02_t]\n", 455 | "\n", 456 | "y_train_2 = y_train[indices_02]\n", 457 | "y_test_2 = y_test[indices_02_t]\n", 458 | "\n", 459 | "y_train_2[y_train_2 == 2] = 1  # relabel class 2 as 1 so the labels are binary (0 = setosa, 1 = virginica)\n", 460 | "y_test_2[y_test_2 == 2] = 1" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 401, 466 | "id": "3ea7803b", 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "name": "stdout", 471 | "output_type": "stream", 472 | "text": [ 473 | "[0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1]\n", 474 | "[0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1]\n" 475 | ] 476 | } 477 | ], 478 | "source": [ 479 | "# instantiate and train the logistic regression model\n", 480 | "model_2 = LogisticRegression()\n", 481 | "model_2.fit(X_train_2, y_train_2)\n", 482 | "\n", 483 | "print(model_2.predict(X_test_2))\n", 484 | "print(y_test_2)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "id": "fb9a2dec", 490 | "metadata": {}, 491 | "source": [ 492 | "使用sklearn包验证" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 402, 498 | "id": "4db6316f", 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "coefficients: [[ 0.8224266 -0.7385878 1.33829004 1.39546167]]\n", 506 | "intercept: [0.71541032]\n", 507 | "预测结果: [0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1]\n" 508 | ] 509 | } 510 | ], 511 | "source": [ 512 | "model_s2 = LogisticRegression()\n", 513 | "model_s2.fit(X_train_2, y_train_2)\n", 514 | "weights = model_s2.coef_\n", 515 | "intercept = model_s2.intercept_\n", 516 | "\n", 517 | "# print the fitted parameters and the sklearn model's own predictions\n", 518 | "print(\"coefficients:\", weights)\n", 519 | 
"print(\"intercept:\", intercept)\n", 520 | "print(\"预测结果:\", model_s2.predict(X_test_2))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "id": "8afc7a59", 526 | "metadata": {}, 527 | "source": [ 528 | "#### 分类器3(versicolor vs. virginica)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 403, 534 | "id": "c21350c2", 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "indices_03 = (y_train == 1) | (y_train == 2) # boolean mask of samples whose class is 1 or 2\n", 539 | "indices_03_t = (y_test == 1) | (y_test == 2)\n", 540 | "\n", 541 | "X_train_3 = X_train[indices_03]\n", 542 | "X_test_3 = X_test[indices_03_t]\n", 543 | "\n", 544 | "y_train_3 = y_train[indices_03]\n", 545 | "y_test_3 = y_test[indices_03_t]\n", 546 | "\n", 547 | "y_train_3[y_train_3 == 2] = 0  # relabel class 2 as 0 so the labels are binary (1 = versicolor, 0 = virginica)\n", 548 | "y_test_3[y_test_3 == 2] = 0" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 404, 554 | "id": "b4c57d13", 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "[1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0]\n", 562 | "[1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0]\n" 563 | ] 564 | } 565 | ], 566 | "source": [ 567 | "# instantiate and train the logistic regression model\n", 568 | "model_3 = LogisticRegression()\n", 569 | "model_3.fit(X_train_3, y_train_3)\n", 570 | "\n", 571 | "print(model_3.predict(X_test_3))\n", 572 | "print(y_test_3)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "id": "be7d1819", 578 | "metadata": {}, 579 | "source": [ 580 | "使用sklearn包验证" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 405, 586 | "id": "39abca1c", 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "name": "stdout", 591 | "output_type": "stream", 592 | "text": [ 593 | "coefficients: [[-0.25269278 0.57601981 -2.14118478 -2.89522098]]\n", 594 | "intercept: [3.57649947]\n", 595 | "预测结果: [1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0]\n" 596 | ] 597 | } 598 | ], 599 | "source": [ 600 | "model_s3 
= LogisticRegression()\n", 601 | "model_s3.fit(X_train_3, y_train_3)\n", 602 | "weights = model_s3.coef_\n", 603 | "intercept = model_s3.intercept_\n", 604 | "\n", 605 | "# print the fitted parameters and the sklearn model's own predictions\n", 606 | "print(\"coefficients:\", weights)\n", 607 | "print(\"intercept:\", intercept)\n", 608 | "print(\"预测结果:\", model_s3.predict(X_test_3))" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "id": "a7def043", 614 | "metadata": {}, 615 | "source": [ 616 | "三个分类器均已训练完成,接下来用三个分类器进行分类" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 410, 622 | "id": "60f352c3", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "p1 = model_1.predict(X_test)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 411, 632 | "id": "0866d288", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "p2 = model_2.predict(X_test)" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 412, 642 | "id": "3a8b3fdd", 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "p3 = model_3.predict(X_test)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 417, 652 | "id": "a4130681", 653 | "metadata": {}, 654 | "outputs": [ 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "30\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "print(X_test.shape[0])" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 420, 670 | "id": "bc7a6f6f", 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "name": "stdout", 675 | "output_type": "stream", 676 | "text": [ 677 | "[1 1 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1]\n" 678 | ] 679 | } 680 | ], 681 | "source": [ 682 | "print(p1)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "id": "985feb1a", 688 | "metadata": {}, 689 | "source": [ 690 | "根据二分类结果进行投票" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 437, 696 | "id": 
"e938c97d", 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "pre = np.zeros(X_test.shape[0])\n", 701 | "\n", 702 | "for i in range(X_test.shape[0]):\n", 703 | " sp_dict = {'0': 0, '1': 0, '2': 0}\n", 704 | " if p1[i] == 0:\n", 705 | " sp_dict['0'] += 1\n", 706 | " else:\n", 707 | " sp_dict['1'] += 1\n", 708 | " if p2[i] == 0:\n", 709 | " sp_dict['0'] += 1\n", 710 | " else:\n", 711 | " sp_dict['2'] += 1\n", 712 | " if p3[i] == 0:\n", 713 | " sp_dict['2'] += 1\n", 714 | " else:\n", 715 | " sp_dict['1'] += 1\n", 716 | " max_key = max(sp_dict, key=sp_dict.get) \n", 717 | " pre[i] = (max_key)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 443, 723 | "id": "1cd6104c", 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "data": { 728 | "text/plain": [ 729 | "array([1., 1., 1., 0., 1., 1., 0., 0., 0., 2., 2., 2., 0., 2., 2., 0., 1.,\n", 730 | " 1., 2., 2., 0., 1., 1., 2., 1., 2., 0., 0., 2., 2.])" 731 | ] 732 | }, 733 | "execution_count": 443, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "pre" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 444, 745 | "id": "a2a57c59", 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "data": { 750 | "text/plain": [ 751 | "array([1, 1, 1, 0, 1, 1, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1,\n", 752 | " 1, 2, 1, 2, 0, 0, 2, 2])" 753 | ] 754 | }, 755 | "execution_count": 444, 756 | "metadata": {}, 757 | "output_type": "execute_result" 758 | } 759 | ], 760 | "source": [ 761 | "y_test" 762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "id": "ecc9a099", 767 | "metadata": {}, 768 | "source": [ 769 | "对比可见,成功通过二分类解决多分类问题,在测试集上分类结果完全正确!!!" 
770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "58b94c81", 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [] 779 | } 780 | ], 781 | "metadata": { 782 | "kernelspec": { 783 | "display_name": "Python 3 (ipykernel)", 784 | "language": "python", 785 | "name": "python3" 786 | }, 787 | "language_info": { 788 | "codemirror_mode": { 789 | "name": "ipython", 790 | "version": 3 791 | }, 792 | "file_extension": ".py", 793 | "mimetype": "text/x-python", 794 | "name": "python", 795 | "nbconvert_exporter": "python", 796 | "pygments_lexer": "ipython3", 797 | "version": "3.11.5" 798 | } 799 | }, 800 | "nbformat": 4, 801 | "nbformat_minor": 5 802 | } 803 | --------------------------------------------------------------------------------