├── README.md ├── mashroom.ipynb ├── pysaprk.ml.clustering 学习.ipynb ├── pyspark-RDD.ipynb ├── pyspark-sql-dataframe.ipynb ├── pyspark-sql-functions.ipynb ├── pyspark.ml.classification.ipynb ├── pyspark.ml.feature.ipynb └── pyspark.ml.regression.ipynb /README.md: -------------------------------------------------------------------------------- 1 | ### 学习pyspark 2 | -------------------------------------------------------------------------------- /mashroom.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "spark = SparkSession.builder.appName('mushroom').master('local[1]').getOrCreate()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### 导入数据并确定数据类型" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 10, 23 | "metadata": { 24 | "scrolled": false 25 | }, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "23" 31 | ] 32 | }, 33 | "execution_count": 10, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')\n", 40 | "len(df0.columns)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 14, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "+---------+\n", 53 | "|cap-shape|\n", 54 | "+---------+\n", 55 | "| x|\n", 56 | "| f|\n", 57 | "| k|\n", 58 | "| c|\n", 59 | "| b|\n", 60 | "| s|\n", 61 | "+---------+\n", 62 | "\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "df0.select('cap-shape').distinct().show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "label = df0.rdd.map(lambda row: row[0])\n", 77 | "row = df0.rdd.map(lambda row: row[1:])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','row'])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": { 93 | "scrolled": true 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "Row(label=0.0, row=['b', 'y', 'y', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 's', 'm'])" 100 | ] 101 | }, 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "dfi.first()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 15, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# from pyspark.ml.feature import VectorAssembler\n", 118 | "# vecAss = VectorAssembler(inputCols=df0.columns[1:], outputCol='feature')\n", 119 | "# df0 = vecAss.transform(df0)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 16, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from pyspark.ml.feature import CountVectorizer\n", 129 | "import numpy as np\n", 130 | "from numpy import allclose\n", 131 | "cv = CountVectorizer(inputCol='row', outputCol='vectors')\n", 132 | "model = cv.fit(dfi)\n", 133 | 
"tf = model.transform(dfi)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 17, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "[Row(label=0.0, row=['x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'], vectors=SparseVector(24, {0: 3.0, 1: 1.0, 2: 3.0, 3: 4.0, 4: 2.0, 6: 2.0, 7: 1.0, 8: 2.0, 9: 1.0, 10: 1.0, 15: 1.0, 20: 1.0}))]" 145 | ] 146 | }, 147 | "execution_count": 17, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "tf.take(1)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 19, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "(train_data, test_data) = tf.randomSplit([0.8, 0.2])" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 20, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "SparseVector(24, {0: 0.0532, 1: 0.0375, 2: 0.0577, 3: 0.0947, 4: 0.064, 5: 0.0519, 6: 0.0436, 7: 0.022, 8: 0.0487, 9: 0.0411, 10: 0.0427, 11: 0.0299, 12: 0.0552, 13: 0.0683, 14: 0.0247, 15: 0.0164, 16: 0.0247, 17: 0.072, 18: 0.0844, 19: 0.0326, 20: 0.0135, 21: 0.0045, 22: 0.0132, 23: 0.0033})" 174 | ] 175 | }, 176 | "execution_count": 20, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "from pyspark.ml.classification import RandomForestClassifier\n", 183 | "rf = RandomForestClassifier(numTrees=40, maxDepth=20, labelCol=\"label\", featuresCol='vectors')\n", 184 | "model = rf.fit(train_data)\n", 185 | "model.featureImportances" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 32, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "result = model.transform(test_data)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 43, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "+----------+\n", 207 | "|prediction|\n", 208 | "+----------+\n", 209 | "| 0.0|\n", 210 | "| 0.0|\n", 211 | "| 1.0|\n", 212 | "| 1.0|\n", 213 | "| 1.0|\n", 214 | "+----------+\n", 215 | "only showing top 5 rows\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "result.select('prediction').show(5)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 34, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n", 234 | "|label| row| vectors| rawPrediction| probability|prediction|\n", 235 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n", 236 | "| 0.0|[b, e, e, ?, s, s...|(24,[0,1,3,5,6,7,...|[28.4161036920659...|[0.71040259230164...| 0.0|\n", 237 | "| 0.0|[b, f, y, f, f, f...|(24,[0,1,2,5,6,7,...|[37.1750915750915...|[0.92937728937728...| 0.0|\n", 238 | "| 0.0|[b, n, w, f, n, f...|(24,[0,1,2,4,5,6,...|[4.02235172235172...|[0.10055879305879...| 1.0|\n", 239 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n", 240 | "only showing top 3 rows\n", 241 | "\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "result.show(3)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 36, 252 | "metadata": {}, 253 | 
"outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "1287" 258 | ] 259 | }, 260 | "execution_count": 36, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 45, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "0.8880822746521476" 278 | ] 279 | }, 280 | "execution_count": 45, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()/result.count()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 14, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stderr", 296 | "output_type": "stream", 297 | "text": [ 298 | "/home/ffzs/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 299 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "from sklearn.ensemble import RandomForestClassifier\n", 305 | "import pandas as pd\n", 306 | "from sklearn import cross_validation\n", 307 | "from sklearn.model_selection import train_test_split\n", 308 | "from sklearn.cross_validation import cross_val_score" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 15, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "dfp = tf.toPandas()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 16, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/html": [ 328 | "
\n", 329 | "\n", 342 | "\n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | "
labelrowvectors
00.0[\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ...(0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ...
11.0[x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ...(3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ...
\n", 366 | "
" 367 | ], 368 | "text/plain": [ 369 | " label row \\\n", 370 | "0 0.0 [\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ... \n", 371 | "1 1.0 [x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ... \n", 372 | "\n", 373 | " vectors \n", 374 | "0 (0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ... \n", 375 | "1 (3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ... " 376 | ] 377 | }, 378 | "execution_count": 16, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "dfp.head(2)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 17, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "clf = RandomForestClassifier(random_state=22, n_estimators = 30, min_samples_split=3, min_samples_leaf=2)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 18, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "X = dfp['vectors'].tolist()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 19, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "y = dfp['label'].tolist()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 20, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 21, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 432 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 433 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 434 | " min_samples_leaf=2, min_samples_split=3,\n", 435 | " min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,\n", 436 | " oob_score=False, random_state=22, verbose=0, warm_start=False)" 437 | ] 438 | }, 439 | "execution_count": 21, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "clf.fit(X_train, y_train)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 23, 451 | "metadata": { 452 | "scrolled": true 453 | }, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "0.9218461538461539\n" 460 | ] 461 | } 462 | ], 463 | "source": [ 464 | "print(clf.score(X_test, y_test))" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 32, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "scores = cross_val_score(clf, X, y, cv=10)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 33, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "0.8905588981998195" 485 | ] 486 | }, 487 | "execution_count": 33, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "scores.mean()" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 1, 520 | 
"metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "from pyspark.sql import SparkSession\n", 524 | "spark = SparkSession.builder.appName('mushroom').getOrCreate()" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 32, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/stock.csv',encoding='gbk',header=True, inferSchema=True)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 33, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "[('日期', 'timestamp'),\n", 545 | " ('股票代码', 'string'),\n", 546 | " ('名称', 'string'),\n", 547 | " ('收盘价', 'double'),\n", 548 | " ('最高价', 'double'),\n", 549 | " ('最低价', 'double'),\n", 550 | " ('开盘价', 'double'),\n", 551 | " ('前收盘', 'double'),\n", 552 | " ('涨跌额', 'string'),\n", 553 | " ('涨跌幅', 'string'),\n", 554 | " ('换手率', 'double'),\n", 555 | " ('成交量', 'int'),\n", 556 | " ('成交金额', 'double'),\n", 557 | " ('总市值', 'double'),\n", 558 | " ('流通市值', 'double')]" 559 | ] 560 | }, 561 | "execution_count": 33, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "df.dtypes" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 34, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "# from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType ,DoubleType # 导入类型\n", 577 | "# schema = StructType([\n", 578 | "# StructField(\"日期\", DateType(), True),\n", 579 | "# StructField(\"收盘价\", DoubleType(), True),\n", 580 | "# StructField(\"成交量\", LongType(), True),\n", 581 | "# StructField(\"名称\", StringType(), True)\n", 582 | "# ])" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 35, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "df.write.csv(path='hdfs:///user/csv/stock.csv', header=True, sep=\",\", mode='overwrite')" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 49, 597 | "metadata": { 598 | "scrolled": true 599 | }, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "'股票代码'" 605 | ] 606 | }, 607 | "execution_count": 49, 608 | "metadata": {}, 609 | "output_type": "execute_result" 610 | } 611 | ], 612 | "source": [ 613 | "df.columns[1]" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 61, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "df0 = spark.read.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666\", table=\"mashroom\")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 63, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "8124" 634 | ] 635 | }, 636 | "execution_count": 63, 637 | "metadata": {}, 638 | "output_type": "execute_result" 639 | } 640 | ], 641 | "source": [ 642 | "df0.count()" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 64, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "df0.write.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666&useUnicode=true&characterEncoding=GBK\",\n", 652 | " mode=\"overwrite\",\n", 653 | " table=\"test\",\n", 654 | " properties={\"driver\":'com.mysql.jdbc.Driver'})" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 65, 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "spark.stop()" 664 | ] 665 | }, 666 | 
{ 667 | "cell_type": "code", 668 | "execution_count": 68, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "+--------------+------+\n", 676 | "| country|median|\n", 677 | "+--------------+------+\n", 678 | "| New Zealand| 39.0|\n", 679 | "| Spain| 37.0|\n", 680 | "| Ireland| 35.0|\n", 681 | "| Sweden| 34.0|\n", 682 | "| Italy| 34.0|\n", 683 | "| Norway| 34.0|\n", 684 | "| Denmark| 34.0|\n", 685 | "| Israel| 34.0|\n", 686 | "| Australia| 34.0|\n", 687 | "| Netherlands| 34.0|\n", 688 | "| Argentina| 33.5|\n", 689 | "| Canada| 33.5|\n", 690 | "| Belgium| 33.0|\n", 691 | "| Switzerland| 33.0|\n", 692 | "| Japan| 33.0|\n", 693 | "|United Kingdom| 33.0|\n", 694 | "| United States| 32.0|\n", 695 | "| Portugal| 32.0|\n", 696 | "| Romania| 32.0|\n", 697 | "| Germany| 31.0|\n", 698 | "+--------------+------+\n", 699 | "only showing top 20 rows\n", 700 | "\n" 701 | ] 702 | } 703 | ], 704 | "source": [ 705 | "spark = SparkSession.builder.enableHiveSupport().master(\"local[*]\").appName(\"read_hive\").getOrCreate()\n", 706 | "\n", 707 | "df=spark.sql(\"select * from age\")\n", 708 | "df.show()" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 87, 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": [ 719 | "DataFrame[]" 720 | ] 721 | }, 722 | "execution_count": 87, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "spark.sql('create table if not exists age2(name string, num int)')\n", 729 | "#df0.write.mode(\"overwrite\").insertInto(\"age2\")" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 80, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "name": "stdout", 739 | "output_type": "stream", 740 | "text": [ 741 | "+--------+---------+-----------+\n", 742 | "|database|tableName|isTemporary|\n", 743 | "+--------+---------+-----------+\n", 744 | "| default| age| false|\n", 745 | "| default| age2| false|\n", 746 | "| default| country| false|\n", 747 | "| default| qn| false|\n", 748 | "+--------+---------+-----------+\n", 749 | "\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "spark.sql('show tables').show()" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 81, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "df.write.mode(\"overwrite\").insertInto(\"age2\")" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 86, 769 | "metadata": {}, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "+-----------+---+\n", 776 | "| name|num|\n", 777 | "+-----------+---+\n", 778 | "|New Zealand| 39|\n", 779 | "| Spain| 37|\n", 780 | "| Ireland| 35|\n", 781 | "| Sweden| 34|\n", 782 | "| Italy| 34|\n", 783 | "| Norway| 34|\n", 784 | "| Denmark| 34|\n", 785 | "| Israel| 34|\n", 786 | "| Australia| 34|\n", 787 | "|Netherlands| 34|\n", 788 | "+-----------+---+\n", 789 | "\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "spark.sql('select * from age2 sort by num limit 10 ').show()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 18, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "spark.stop()" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [] 812 | } 813 | ], 814 | "metadata": { 815 | "kernelspec": { 816 | "display_name": "Python 3", 817 
| "language": "python", 818 | "name": "python3" 819 | }, 820 | "language_info": { 821 | "codemirror_mode": { 822 | "name": "ipython", 823 | "version": 3 824 | }, 825 | "file_extension": ".py", 826 | "mimetype": "text/x-python", 827 | "name": "python", 828 | "nbconvert_exporter": "python", 829 | "pygments_lexer": "ipython3", 830 | "version": "3.6.4" 831 | } 832 | }, 833 | "nbformat": 4, 834 | "nbformat_minor": 2 835 | } 836 | -------------------------------------------------------------------------------- /pysaprk.ml.clustering 学习.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "data:https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "spark = SparkSession.builder.master('local[1]').appName('learn_cluster').getOrCreate()" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/Mall_Customers.csv', header=True, inferSchema=True)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "scrolled": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "df = df.withColumnRenamed('Annual Income (k$)', 'Income').withColumnRenamed('Spending Score (1-100)', 'Spend')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "+----------+------+---+------+-----+\n", 50 | "|CustomerID|Gender|Age|Income|Spend|\n", 51 | "+----------+------+---+------+-----+\n", 52 | "| 1| Male| 19| 15| 39|\n", 53 | "| 2| Male| 21| 15| 81|\n", 54 | "| 3|Female| 20| 16| 6|\n", 55 | "+----------+------+---+------+-----+\n", 56 | "only showing top 3 rows\n", 57 | "\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "df.show(3)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "CustomerID 0\n", 74 | "Gender 0\n", 75 | "Age 0\n", 76 | "Income 0\n", 77 | "Spend 0\n", 78 | "dtype: int64" 79 | ] 80 | }, 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "# 查看是否有缺失值\n", 88 | "df.toPandas().isna().sum()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "from pyspark.ml.feature import VectorAssembler\n", 98 | "vecAss = VectorAssembler(inputCols = df.columns[3:], outputCol = 'features')\n", 99 | "df_km = vecAss.transform(df).select('CustomerID', 'features')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": { 106 | "scrolled": true 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "+----------+-----------+\n", 114 | "|CustomerID| features|\n", 115 | "+----------+-----------+\n", 116 | "| 1|[15.0,39.0]|\n", 117 | "| 2|[15.0,81.0]|\n", 118 | "| 3| [16.0,6.0]|\n", 119 | "+----------+-----------+\n", 120 | "only showing top 3 rows\n", 121 | "\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | 
"df_km.show(3)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "pd_df = df.toPandas()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 10, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/html": [ 146 | "
\n", 147 | "\n", 160 | "\n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
CustomerIDGenderAgeIncomeSpend
01Male191539
12Male211581
23Female20166
34Female231677
45Female311740
\n", 214 | "
" 215 | ], 216 | "text/plain": [ 217 | " CustomerID Gender Age Income Spend\n", 218 | "0 1 Male 19 15 39\n", 219 | "1 2 Male 21 15 81\n", 220 | "2 3 Female 20 16 6\n", 221 | "3 4 Female 23 16 77\n", 222 | "4 5 Female 31 17 40" 223 | ] 224 | }, 225 | "execution_count": 10, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "pd_df.head()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 18, 237 | "metadata": { 238 | "scrolled": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/html": [ 244 | "" 245 | ], 246 | "text/vnd.plotly.v1+html": [ 247 | "" 248 | ] 249 | }, 250 | "metadata": {}, 251 | "output_type": "display_data" 252 | }, 253 | { 254 | "data": { 255 | "application/vnd.plotly.v1+json": { 256 | "data": [ 257 | { 258 | "marker": { 259 | "size": 6 260 | }, 261 | "mode": "markers", 262 | "type": "scatter", 263 | "x": [ 264 | 15, 265 | 15, 266 | 16, 267 | 16, 268 | 17, 269 | 17, 270 | 18, 271 | 18, 272 | 19, 273 | 19, 274 | 19, 275 | 19, 276 | 20, 277 | 20, 278 | 20, 279 | 20, 280 | 21, 281 | 21, 282 | 23, 283 | 23, 284 | 24, 285 | 24, 286 | 25, 287 | 25, 288 | 28, 289 | 28, 290 | 28, 291 | 28, 292 | 29, 293 | 29, 294 | 30, 295 | 30, 296 | 33, 297 | 33, 298 | 33, 299 | 33, 300 | 34, 301 | 34, 302 | 37, 303 | 37, 304 | 38, 305 | 38, 306 | 39, 307 | 39, 308 | 39, 309 | 39, 310 | 40, 311 | 40, 312 | 40, 313 | 40, 314 | 42, 315 | 42, 316 | 43, 317 | 43, 318 | 43, 319 | 43, 320 | 44, 321 | 44, 322 | 46, 323 | 46, 324 | 46, 325 | 46, 326 | 47, 327 | 47, 328 | 48, 329 | 48, 330 | 48, 331 | 48, 332 | 48, 333 | 48, 334 | 49, 335 | 49, 336 | 50, 337 | 50, 338 | 54, 339 | 54, 340 | 54, 341 | 54, 342 | 54, 343 | 54, 344 | 54, 345 | 54, 346 | 54, 347 | 54, 348 | 54, 349 | 54, 350 | 57, 351 | 57, 352 | 58, 353 | 58, 354 | 59, 355 | 59, 356 | 60, 357 | 60, 358 | 60, 359 | 60, 360 | 60, 361 | 60, 362 | 61, 363 | 61, 364 | 62, 365 | 62, 366 | 62, 367 | 62, 368 | 62, 369 | 62, 370 | 63, 371 | 63, 372 | 63, 373 | 63, 374 | 63, 375 | 63, 376 | 64, 377 | 64, 378 | 65, 379 | 65, 380 | 65, 381 | 65, 382 | 67, 383 | 67, 384 | 67, 385 | 67, 386 | 69, 387 | 69, 388 | 70, 389 | 70, 390 | 71, 391 | 71, 392 | 71, 393 | 71, 394 | 71, 395 | 71, 396 | 72, 397 | 72, 398 | 73, 399 | 73, 400 | 73, 401 | 73, 402 | 74, 403 | 74, 404 | 75, 405 | 75, 406 | 76, 407 | 76, 408 | 77, 409 | 77, 410 | 77, 411 | 77, 412 | 78, 413 | 78, 414 | 78, 415 | 78, 416 | 78, 417 | 78, 418 | 78, 419 | 78, 420 | 78, 421 | 78, 422 | 78, 423 | 78, 424 | 79, 425 | 79, 426 | 81, 427 | 81, 428 | 85, 429 | 85, 430 | 86, 431 | 86, 432 | 87, 433 | 87, 434 | 87, 435 | 87, 436 | 87, 437 | 87, 438 | 88, 439 | 88, 440 | 88, 441 | 88, 442 | 93, 443 | 93, 444 | 97, 445 | 97, 446 | 98, 447 | 98, 448 | 99, 449 | 99, 450 | 101, 451 | 101, 452 | 103, 453 | 103, 454 | 103, 455 | 103, 456 | 113, 457 | 113, 458 | 120, 459 | 120, 460 | 126, 461 | 126, 462 | 137, 463 | 137 464 | ], 465 | "y": [ 466 | 39, 467 | 81, 468 | 6, 469 | 77, 470 | 40, 471 | 76, 472 | 6, 473 | 94, 474 | 3, 475 | 72, 476 | 14, 477 | 99, 478 | 15, 479 | 77, 480 | 13, 481 | 79, 482 | 35, 483 | 66, 484 | 29, 485 | 98, 486 | 35, 487 | 73, 488 | 5, 489 | 73, 490 | 14, 491 | 82, 492 | 32, 493 | 61, 494 | 31, 495 | 87, 496 | 4, 497 | 73, 498 | 4, 499 | 92, 500 | 14, 501 | 81, 502 | 17, 503 | 73, 504 | 26, 505 | 75, 506 | 35, 507 | 92, 508 | 36, 509 | 61, 510 | 28, 511 | 65, 512 | 55, 513 | 47, 514 | 42, 515 | 42, 516 | 52, 517 | 60, 518 | 54, 519 | 60, 520 | 45, 521 | 41, 522 | 50, 523 | 46, 524 | 51, 525 | 
46, 526 | 56, 527 | 55, 528 | 52, 529 | 59, 530 | 51, 531 | 59, 532 | 50, 533 | 48, 534 | 59, 535 | 47, 536 | 55, 537 | 42, 538 | 49, 539 | 56, 540 | 47, 541 | 54, 542 | 53, 543 | 48, 544 | 52, 545 | 42, 546 | 51, 547 | 55, 548 | 41, 549 | 44, 550 | 57, 551 | 46, 552 | 58, 553 | 55, 554 | 60, 555 | 46, 556 | 55, 557 | 41, 558 | 49, 559 | 40, 560 | 42, 561 | 52, 562 | 47, 563 | 50, 564 | 42, 565 | 49, 566 | 41, 567 | 48, 568 | 59, 569 | 55, 570 | 56, 571 | 42, 572 | 50, 573 | 46, 574 | 43, 575 | 48, 576 | 52, 577 | 54, 578 | 42, 579 | 46, 580 | 48, 581 | 50, 582 | 43, 583 | 59, 584 | 43, 585 | 57, 586 | 56, 587 | 40, 588 | 58, 589 | 91, 590 | 29, 591 | 77, 592 | 35, 593 | 95, 594 | 11, 595 | 75, 596 | 9, 597 | 75, 598 | 34, 599 | 71, 600 | 5, 601 | 88, 602 | 7, 603 | 73, 604 | 10, 605 | 72, 606 | 5, 607 | 93, 608 | 40, 609 | 87, 610 | 12, 611 | 97, 612 | 36, 613 | 74, 614 | 22, 615 | 90, 616 | 17, 617 | 88, 618 | 20, 619 | 76, 620 | 16, 621 | 89, 622 | 1, 623 | 78, 624 | 1, 625 | 73, 626 | 35, 627 | 83, 628 | 5, 629 | 93, 630 | 26, 631 | 75, 632 | 20, 633 | 95, 634 | 27, 635 | 63, 636 | 13, 637 | 75, 638 | 10, 639 | 92, 640 | 13, 641 | 86, 642 | 15, 643 | 69, 644 | 14, 645 | 90, 646 | 32, 647 | 86, 648 | 15, 649 | 88, 650 | 39, 651 | 97, 652 | 24, 653 | 68, 654 | 17, 655 | 85, 656 | 23, 657 | 69, 658 | 8, 659 | 91, 660 | 16, 661 | 79, 662 | 28, 663 | 74, 664 | 18, 665 | 83 666 | ] 667 | } 668 | ], 669 | "layout": {} 670 | }, 671 | "text/html": [ 672 | "
" 673 | ], 674 | "text/vnd.plotly.v1+html": [ 675 | "
" 676 | ] 677 | }, 678 | "metadata": {}, 679 | "output_type": "display_data" 680 | } 681 | ], 682 | "source": [ 683 | "from plotly.offline import iplot, init_notebook_mode\n", 684 | "import plotly.graph_objs as go\n", 685 | "init_notebook_mode(connected=True)\n", 686 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend , \n", 687 | " mode='markers',\n", 688 | " marker = {'size':6})\n", 689 | "iplot([trace])" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "## KMeans\n", 697 | "`class pyspark.ml.clustering.KMeans(self, featuresCol=\"features\", predictionCol=\"prediction\", k=2, initMode=\"k-means||\", initSteps=2, tol=1e-4, maxIter=20, seed=None)\n", 698 | "`" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "**参数解释**" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "`\n", 713 | "initMode: 初始化算法,可以使随机的“random\",也可以是”k-means||\"\n", 714 | "initSteps: k-means||初始化的步数,需>0\n", 715 | "fit(datast,params=None)方法\n", 716 | "`" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "`\n", 724 | "cluster: 每个训练数据点预测的聚类中心数据框\n", 725 | "clusterSize: 每个簇的大小(簇内数据点的个数)\n", 726 | "k: 模型训练的簇个数\n", 727 | "predictions: 由模型transform方法产生的数据框\n", 728 | "`" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 20, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "from pyspark.ml.clustering import KMeans\n", 738 | "\n", 739 | "cost = list(range(2,20))\n", 740 | "for k in range(2, 20):\n", 741 | " kmeans = KMeans(k=k, seed=1)\n", 742 | " km_model = kmeans.fit(df_km)\n", 743 | " # computeCost:计算输入点与其对应的聚类中心之间的平方距离之和。\n", 744 | " cost[k-2] = km_model.computeCost(df_km)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 21, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "import matplotlib.pyplot as plt\n", 754 | "%matplotlib inline" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 22, 760 | "metadata": { 761 | "scrolled": false 762 | }, 763 | "outputs": [ 764 | { 765 | "data": { 766 | "text/plain": [ 767 | "Text(0,0.5,'cost')" 768 | ] 769 | }, 770 | "execution_count": 22, 771 | "metadata": {}, 772 | "output_type": "execute_result" 773 | }, 774 | { 775 | "data": { 776 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAgEAAAFzCAYAAACn5No2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl0XOWd5vHnV6VdqpIlWbLKq7yALbM4JDIYcLCBNiRpIIekk5COg6GTJkv3pM/Qk+ltOJlepqe7p3s6OZMJabrTDQEmSSeEsKQTwuoE8IJY7IBtjGxkS7ZsyZKtfa93/qjrRbZsl1BV3Vq+n3Pq6NZ7r1y/l0ud++i9977XnHMCAAC5J+B3AQAAwB+EAAAAchQhAACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHJUnt8FJNvMmTNdXV2d32UAAJAyr7766hHnXPX5tsv6EFBXV6fGxka/ywAAIGXMbF8823E6AACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEgCnoHhjVMzsOq3do1O9SAACYNkLAFGw/cEyf/26jtrd2+10KAADTRgiYgvpIWJK0s63H50oAAJg+QsAUzCwrVHWoUDsIAQCALEAImKL6SFg723r9LgMAgGkjBExRfSSkpvZejYxF/S4FAIBpIQRM0fJIWKPjTns6+vwuBQCAaSEETNFyLg4EAGQJQsAULZxZqoK8ACEAAJDxkhICzCzfzJ7wltea2Yveq8XMNpjZSjNrPaV9qZkVmdmTZrbNzB60mLjaktGHs8kLBrR0VoiLAwEAGS/hIcDMiiW9KmmdJDnnXnDOrXbOrZa0XdLrkiok3Xu83Tn3tqT1klqdcyu89eum0JZS9ZGQdrb1yDmX6o8GACBhEh4CnHODzrlLJbWe2m5mJZKWOOe2K3bw/riZbTWzR7y/5q+T9LS3+XOSrp1CW0rVR8Lq7B9Re+9wqj8aAICESeU1AeskPestN0m6xzl3uaSIpDWSqiQdn4+3R1LlFNomMLO7zKzRzBo7OjoS3pHjMwcyaRAAIJOlMgTcLOlJb7lZ0jOnLNdIOiKp3Gsr997H2zaBc+4+51yDc66huro6oZ2QpPpa7hAAAGS+lIQAb7h/rWLD95J0t6TbzCwg6WJJbyo2SnCDt/46Sc9PoS2lykvyNWdGMRcHAgAyWqpGAlZK2uGcG/Lef1PSnZK2SHrUObdD0sOS5pjZdkldih3s421LueMXBwIAkKnykvUPO+eWnLK8VdItp7xvU2xk4NTthyXddNo/E29bytVHwnpuV7uGRsdVlB/0uxwAAKaMyYLeo+WRsKJO2n2YUwIAgMxECHiP6pk+GACQ4QgB79H8yhKVFgS5OBAAkLEIAe9RIGBaWhtirgAAQMYiBExDfSTM9MEAgIxFCJiG+khYvUNjaj066HcpAABMGSFgGrg4EACQyQgB07CsNiQzcXEgACAjEQKmobQwTwsqSxgJAABkJELANNVHwtp5iBAAAMg8hIBpqo+Eta9zQH3DY36XAgDAlBACpmm5d3Hg24wGAAAyDCFgmupnx0LADi4OBABkGELANM0uL1K4KI+LAwEAGYcQME1mdmLmQAAAMgkhIAHqI2HtauvVeJTpgwEAmYMQkADLI2ENjo5rX2e/36UAABA3QkACnJw+mIsDAQCZgxCQABfMKlMwYFwXAADIKISABCjKD2rRzFJCAAAgoxACEoQ7BAAAmYYQkCD1kbAOdg/p2MCI36UAABAXQkCCLJ/NxYEAgMxCCEiQ+khIkjglAADIGISABKkJFWlmWQEhAACQMQgBCVQfCWsnTxMEAGQIQkAC1UfC2n2oT6PjUb9LAQDgvAgBCVQfCWlkPKq9HUwfDABIf4SABDo5fTCnBAAA6Y8QkECLq8tUEAwQAgAAGYEQkED5wYCW1JRpByEAAJABCAEJFps+mAmDAADpjxCQYPWRkI70Daujd9jvUgAAOCdCQIKdnD6YUwIAgPRGCEiw5dwhAADIEEkJAWaWb2ZPeMsrzazVzF70XkvNrMjMnjSzbWb2oMW857Zk9OG9mlFSoEh5ESEAAJD2Eh4CzKxY0quS1nlNFZLudc6t9l5vS1ovqdU5t8Jbv26abWmFiwMBAJkg4SHAOTfonLtUUqvXVCHp42a21cwe8f5yv07S09765yRdO822tFIfCampo09Do+N+lwIAwFml4pqAJkn3OOculxSRtEZSlaRub32PpMpptqWV+khY41GnpvY+v0sBAOCsUhECmiU9c8pyjaQjksq9tnLv/XTaJjCzu8ys0cwaOzo6EtiV+ByfPphJgwAA6SwVIeBuSbeZWUDSxZLelPSspBu89ddJen6abRM45+5zzjU45xqqq6sT3qHzqasqVVE+0wcDANJbKkLANyXdKWmLpEedczskPSxpjpltl9Sl2IF9Om1pJRgwLa0NEwIAAGktL1n/sHNuifezTdLa09YNS7rptF+ZTlvaWR4J6T9+fUjOOaXZXYwAAEhisqCkWR4Jq3twVG3dQ36XAgDApAgBSVLPzIEAgDRHCEiSZYQAAECaIwQkSVlhnuZXljBzIAAgbRECkqg+EmKuAABA2iIEJFF9JKzmzn4NjIz5XQoAAGcgBCRRfSQs56RdhzglAABIP4SAJFrOxYEAgDRGCEiiuRXFChXmEQIAAGmJEJBEZqZlkRB3CAAA0hIhIMnqI2HtautRNOr8LgUAgAkIAUm2PBJW/8i4Wo4O+F0KAAATEAKSjOmDAQDpihCQZEtrQwqYtIPrAgAAaYYQkGRF+UEtnFnKSAAAIO0QAlKgPhLWjoOEAABAeiEEpEB9JKwDxwbVPTjqdykAAJxACEiB4zMH7uKUAAAgjRACUoA7BAAA6YgQkAKzwoWqKMln5kAAQFohBKSAmak+EtbOQ4wEAADSByEgReojYb19qFdj41G/SwEAQBIhIGWWR8IaHouqubPf71IAAJBECEiZ4xcHMnMgACBdEAJSZElNmfKDxh0CAIC0QQhIkYK8gBZXlxECAABpgxCQQsuZPhgAkEYIASlUHwmrvXdYnX3DfpcCAAAhIJVOzhzIxYEAAP8RAlKoPhKSxPTBAID0QAhIoaqyQtWECgkBAIC0QAhIsfpIWDsIAQCANEAISLH6SFh7Ovo0Msb0wQAAfxECUmz57LBGx52a2vv8LgUAkOMIASm2nIsDAQBpghCQYnVVpSrMCxACAAC+S0oIMLN8M3vilPcPmNlmM3vczPLMbKWZtZrZi95rqZkVmdmTZrbNzB60mLjaktGHZMkLBrS0NqSdhwgBAAB/JTwEmFmxpFclrfPer5aU55xbJSks6QZJFZLudc6t9l5vS1ovqdU5t8Jbv24KbRmlvjY2fbBzzu9SAAA5LOEhwDk36Jy7VFKr13RY0jdO+7wKSR83s61m9oj31/x1kp721j8n6doptGWU+khIRwdGdbiH6YMBAP5J+jUBzr
l3nHNbzexWSVFJv5DUJOke59zlkiKS1kiqktTt/VqPpMoptE1gZneZWaOZNXZ0dCSnY9NwcvpgTgkAAPyTkgsDzewWSV+RdLNzbkxSs6RnvNXNkmokHZFU7rWVe+/jbZvAOXefc67BOddQXV2d6O5M2zIvBDBpEADAT0kPAWZWK+mrkm5yzh1/cs7dkm4zs4CkiyW9KelZxa4XkGJD/s9PoS2jlBfna86MYkYCAAC+SsVIwAbFhvyf8u4E+B1J35R0p6Qtkh51zu2Q9LCkOWa2XVKXYgf7eNsyTn0kTAgAAPgqL1n/sHNuiffzbyX97SSbrD1t+2FJN522TbxtGWf57LCe23VYQ6PjKsoP+l0OACAHMVmQT5ZHQoo66e1DveffGACAJCAE+IQ7BAAAfiME+GReRYlKC4KEAACAbwgBPgkETMsiYW4TBAD4hhDgo/pISLvaepk+GADgC0KAj+ojYfUOj6n16KDfpQAAchAhwEf1zBwIAPARIcBHy2pDMuMOAQCAPwgBPiopyFNdVSkhAADgC0KAz+ojIe1sY8IgAEDqEQJ8tjwS1v6uAfUOjfpdCgAgxxACfHb84kCmDwYApBohwGdMHwwA8AshwGeR8iKVF+drB9cFAABSjBDgMzNTfSTEXAEAgJQjBKSB+khYbx/q0XiU6YMBAKlDCEgD9ZGwhkajau7s97sUAEAOIQSkgeVcHAgA8AEhIA0sqSlTMGCEAABAShEC0kBRflCLq0uZORAAkFKEgDRRHwkzEgAASClCQJpYHgmrrXtIxwZG/C4FAJAjCAFp4vjMgcwXAABIFUJAmjg5fTDXBQAAUoMQkCaqQ4WaWVbIdQEAgJQhBKSR+khIOw4SAgAAqUEISCPLI2E1tfdpdDzqdykAgBxACEgjl82v0Mh4VL96p8PvUgAAOYAQkEaur69RTahQD7y8z+9SAAA5gBCQRvKDAX3migXauLtDezv6/C4HAJDlCAFp5tNXzFN+0PTdTYwGAACSixCQZmpCRfrIJRH96NVW9Q2P+V0OACCLEQLS0Iar6tQ3PKYfv9bqdykAgCxGCEhDl82boUvnluuBl5vlnPO7HABAliIEpCEz04Yr67Sno18vNXX6XQ4AIEslJQSYWb6ZPeEtF5nZk2a2zcwetJiEtiWjD377zUsjqiot0P0vN/tdCgAgSyU8BJhZsaRXJa3zmtZLanXOrZBU4bUnui3rFOUHddvl8/TsrsNq6RrwuxwAQBZKeAhwzg065y6VdPyqtuskPe0tPyfp2iS0ZaX1qxYoYKYHN3O7IAAg8d5TCDCzq6eweZWkbm+5R1JlEtpOr+8uM2s0s8aOjsydgjdSXqwbL5qlH7zSosGRcb/LAQBkmbhCgJk9d1rT303hM45IKveWy733iW6bwDl3n3OuwTnXUF1dPYVS08+GK+vUPTiqx9444HcpAIAsc84QYGaXmtkGSbPN7Hbv9WVJQ1P4jGcl3eAtXyfp+SS0Za3LF1ZqWW1I93O7IAAgwc43EmCT/Dwi6bem8BkPS5pjZtsldSl2EE90W9YyM224qk67DvVq67tdfpcDAMgiFs9fl2b2N865P05BPQnX0NDgGhsb/S5jWgZHxrXqfz6rq5dU6Vuf+YDf5QAA0pyZveqcazjfdvFeGPinZhY2s6CZXWtmoWnWhykoLgjqUyvn6am3Dqute9DvcgAAWSLeEPADSddI+ntJn5P0k6RVhEl9dtUCRZ3Tw5v3+10KACBLxBsCIs65JyUtcs6tl1SWxJowiXmVJbp+2Sx9b+t+DY1yuyAAYPriDQFdZvYTSb82s5skHUtiTTiLO66qU2f/iH66vc3vUgAAWSDeEPAJSX/hnPtvis0E+MnklYSzuXpJlRZXl+q7m5r9LgUAkAXiDQHjkj5gZv8oqUFSf/JKwtkcv11wW2u3Xt9/1O9yAAAZLt4QcL+kOZJ+7v28P0n14Dw+9v65KivM0wM8XRAAME3xhoA659x/d8495Zz7c0l1SawJ51BWmKff+sBc/fTXbWrvncrEjQAATBRvCNhvZn9mZteZ2Z9J4j41H91+5QKNjjt9b0uL36UAADJYvCHgi5KCik0X3CPpC0mrCOe1qLpM11xYrYe37NPoeNTvcgAAGSreEPBdxe4K+D3Fntz3b0mrCHG546oFau8d1s/fPOR3KQCADBVvCKhxzv2ri/krSbOSWRTOb+2FNVpQVcIFggCA9yzeELDPzP7Ie27AH0s6mMyicH6BgOmzqxaocd9RvXmg2+9yAAAZKN4QcIekAcWuCeiXtCFZBSF+n2iYp+L8IKMBAID3JK4Q4Jwbds79H+fc73k/uTctDZQX5+vW98/RY9sO6mj/iN/lAAAyTLwjAUhTG66s08hYVN9/hdsFAQBTQwjIcEtrQ7pyUZUe2rxPY9wuCACYAkJAFthwVZ0OHBvUMzvb/S4FAJBBCAFZ4DfqazRnRjEXCAIApoQQkAXyggGtX7VAm/Z2avfhXr/LAQBkCEJAlvjUynkqyAswGgAAiBshIEtUlhbooytm68evHVD34Kjf5QAAMgAhIItsuKpOg6Pj+mEjtwsCAM6PEJBFLp5TroYFFXpw8z5Fo87vcgAAaY4QkGU2XFWnfZ0D2ri7w+9SAABpjhCQZT50ca1qQoW6nwsEAQDnQQjIMvnBgD5zxQJt3N2hvR19fpcDAEhjhIAs9Okr5ik/aPrupn1+lwIASGOEgCxUEyrSb14S0Y9ebVXf8Jjf5QAA0hQhIEttuKpOfcNjevS1Vr9LAQCkKUJAlnrfvBm6dG65Hti0T85xuyAA4EyEgCxlZtpwZZ2a2vv0UlOn3+UAANIQISCL3bQioqrSAm4XBABMihCQxQrzgvr05fP17K7Dauka8LscAECaIQRkuc+smq+AmR7azO2CAICJUhICzGytmb3ovVrM7Gtm1npK21IzKzKzJ81sm5k9aDFxtaWiD5kqUl6sD11Uq++/0qLBkXG/ywEApJGUhADn3AvOudXOudWStks6Kune423OubclrZfU6pxbIalC0roptOEcbr9ygboHR/XYGwf8LgUAkEZSejrAzEokLZF0WNLHzWyrmT3i/TV/naSnvU2fk3TtFNpwDpcvrNSy2pDuf7mZ2wUBACek+pqAdZKeldQk6R7n3OWSIpLWSKqS1O1t1yOpcgptE5jZXWbWaGaNHR08Tc/MdMdVddp1qFdb3+3yuxwAQJpIdQi4WdKTkpolPeO1NUuqkXREUrnXVu69j7dtAufcfc65BudcQ3V1dcI7kYk++r45Ki/O53kCAIATUhYCvCH/tYoN4d8t6TYzC0i6WNKbio0Q3OBtfp2k56fQhvMoLgjqtpXz9PO3DnG7IABAUmpHAlZK2uGcG5L0TUl3Stoi6VHn3A5JD0uaY2bbJXUpdrCPtw1xuPPqhQqY9C+/2ut3KQCANJCXqg9yzm2VdIu33KbYqMCp64cl3XTar8XbhjjUlhfpY5fN1fdfadF/uv4CzSwr9LskAICPmCwox9y1ZpFGxqN6gKmEASDnEQJyzOLqMn3oolo98HKz+obH/
C4HAOAjQkAO+uKaxeoZGtP3tuz3uxQAgI8IATloxbwZunpJlf7lxb0aHmMqYQDIVYSAHPWlNUt0uGdYP3mdqYQBIFcRAnLU1UuqdMmccn17416NR5lKGAByESEgR5mZvrR2sd490q+n3jrkdzkAAB8QAnLYjRfVauHMUt37wh4eLAQAOYgQkMOCAdMXrlmkXx/o1ktNnX6XAwBIMUJAjrv1/XNUEyrUvRub/C4FAJBihIAcV5gX1Oc/uFAvNXVqW8sxv8sBAKQQIQD69OXzFS7K07c37vG7FABAChECoFBRvm6/sk4/f+uQ9nT0+V0OACBFCAGQJN1xdZ0KggHdt5HHDANAriAEQJI0s6xQn1o5Tz9+vVWHuof8LgcAkAKEAJzwux9cpKiTvvMiowEAkAsIAThhXmWJbr40ov+3Zb+ODYz4XQ4AIMkIAZjgi2sXq39kXA9u2ud3KQCAJCMEYIJltWFdt6xG//ZyswZHeMwwAGQzQgDO8KW1i9XVP6J/b2zxuxQAQBIRAnCGlXWValhQoft+uVej41G/ywEAJAkhAJP60trFOnBsUE9uP+h3KQCAJCEEYFLXLq3R0lkh3fvCHkWjPGYYALIRIQCTCgRMX1y7SLsP9+n5t9v9LgcAkASEAJzVTZfO1pwZxbr3BR4sBADZiBCAs8oPBnTXNYvUuO+oXmnu8rscAECCEQJwTp9smKfK0gJGAwAgCxECcE7FBUHdeVWdntvVrp1tPX6XAwBIIEIAzuv2K+tUWhDUP21kNAAAsgkhAOdVXpKv375ivp7Y3qaWrgG/ywEAJAghAHH53OpFCpj0z7/iMcMAkC0IAYhLbXmRPnbZXP3glRYd6Rv2uxwAQAIQAhC3u9Ys0sh4VPe/1Ox3KQCABCAEIG6Lq8v0oYtq9d1NzeodGvW7HADANBECMCVfXLNYPUNj+t7W/X6XAgCYppSEADNbaWatZvai91phZk+a2TYze9Biit5rWyr6gJgV82bo6iVV+pdfvavhsXG/ywEATEOqRgIqJN3rnFvtnFstaaWkVufcCm/dOknrp9GGFPrSmiVq7x3Wj1874HcpAIBpSGUI+LiZbTWzRyRdL+lpb91zkq6VdN002pBCVy+p0iVzyvVPG/donMcMA0DGSlUIaJJ0j3PuckkRSR+T1O2t65FUKalqGm1IITPTl9YuVnPngH7+5iG/ywEAvEepCgHNkp45ZTkqqdx7Xy7piPd6r20TmNldZtZoZo0dHR2J7Ac8N15Uq4UzS3XvxiY5x2gAAGSiVIWAuyXdZmYBSRdL+kNJN3jrrpP0vKRnp9E2gXPuPudcg3Ouobq6OvG9gYIB0xeuWaQ3D/ToxaYzchgAIAOkKgR8U9KdkrZIelTSdyTNMbPtkroUO7A/PI02+ODW989RTaiQxwwDQIbKS8WHOOfaJK09rfmm094PT6MNPijMC+rzH1yov/6PXXqj5ZjeN2+G3yUBAKaAyYIwLZ++fL7CRXn6NqMBAJBxCAGYllBRvm6/sk5P7TikpvY+v8sBAEwBIQDTdsfVdSoIBnTfLxkNAIBMQgjAtM0sK9RtK+fp0dcPqK170O9yAABxIgQgIT7/wUWKOukbz7yjkbGo3+UAAOJACEBCzKss0Scb5ur7r7Toqr95Tv/rqV1qPTrgd1kAgHOwbJ/traGhwTU2NvpdRk6IRp02vtOhhzfv03O72iVJ1y6t0fpVC3TNhdUKBnjgIwCkgpm96pxrOO92hAAkQ+vRAX1/a4u+/0qLjvQNa25FsX77ivn6ZMM8zSwr9Ls8AMhqhAAPIcBfI2NR/WLHIT20eZ827+1SftD04YsjWr9qgVbWVciM0QEASDRCgIcQkD6a2nv10Ob9euS1VvUOjenCWWVav2qBbr1sjkJF+X6XBwBZgxDgIQSkn4GRMT2x7aAe2rxfvz7QrZKCoD76vjlav2q+Lppdfv5/AABwToQADyEgvW1rOaaHNu/TE9sPamg0qsvmz9D6KxboNy+NqCg/6Hd5AJCRCAEeQkBm6B4Y1SOvteqhLfu0t6NfM0ry9YkPzNVvX7FAC2eW+l0eAGQUQoCHEJBZnHPatLdTD2/er6feOqSxqNMHL5ipz1wxX79RP0t5Qaa2AIDzIQR4CAGZq71nSD94pUXf27pfB7uHNCtcqM+vXqQ7rq5TPmEAAM6KEOAhBGS+sfGonn+7Qw+83KwXm45o6ayQ/urWi7WyrtLv0gAgLcUbAvhzCmkvLxjQuuWz9NDnr9A/396gvuExfeLbm/RHP9quo/0jfpcHABmLEICMsm75LD199zX6wjWL9KPXWnX9/96oHza2KNtHtAAgGQgByDglBXn6k4/U66dfWa2FM0v11R9t16fu26x3Dvf6XRoAZBRCADLWstqwfviFK/W3H79Euw/36sPf+JX+7ue7NDgy7ndpAJARCAHIaIGA6VMr5+vZu9foo++bo2+9sEc3fH2jnveeYggAODtCALJCVVmh/uGTK/S9312lgmBAd97/ir788Ks61D3kd2kAkLYIAcgqVy6u0s/+4Bp99calenZnu67/hxf0ry++q7HxqN+lAUDaIQQg6xTkBfR71y7R0/95jRrqKvUXT+7QR//vS3qj5ZjfpQFAWiEEIGvNryrR/Xeu1Lc+834d6RvWrd96Sff85E11D476XRoApAVCALKamekjl0T0zN1rdMdVdXp4yz5d/w8b9dgbB5hbAEDOIwQgJ4SK8vW1my/S47+/WrNnFOkPvv+GPvudrXr3SL/fpQGAbwgByCkXzynXo1++Wn/50Yu0reWYbvz6L/X1Z3ZraJS5BQDkHkIAck4wYPrslXV69g/X6MaLavX1Z97Rh7/xK734zhG/SwOAlOIpgsh5v9zdoXsee1P7Oge0dFZIS2rKtLimTEtqyrSkukyLqktVlB/0u0wAiBuPEvYQAhCPodFx3f9ys155t0tNHX1q6RpQ1PtqmElzK4q1pLpMi6u9cFATW64oLfC3cACYBCHAQwjAezE0Oq7mzn41tfdpT3u/mjr61NTep70dfRoeOznxUFVpgRbXnB4OSjW7vFiBgPnYAwC5LN4QkJeKYoBMU5Qf1LLasJbVhie0j0edDh4bjIUDLxg0tffpZ2+26djAyfkHivODWlxTGgsHp40eEA4ApAtCADAFwYBpXmWJ5lWW6NplNRPWdfYNe+EgNoLQ1NGnxuajeuyNgye2qQ0X6aZLI7rlfbN1yZxymREIAPiH0wFAkg2MjGlvR792tvXoqbcOa+Pudo2OOy2cWaqbvUCwpCbkd5kAsgjXBHgIAUg33QOj+vlbbXp820Ft2tOpqJPqI2HdsmK2bl4R0dyKEr9LBJDh0i4EmNkDkpZKapf0l5IeldTsrf6cpH2SfiRpnqTtkm6XVBhPmztHJwgBSGftvUP66fZYIHh9f+wBRx9YUKFbVszWRy6JqDpU6HOFADJRvCEgJZMFmdlqSXnOuVWSwpIiku51zq32Xm9LWi+p1Tm3QlKFpHVTaAMyUk2oSHdevVCPfvlq/eq/Xquv3rhU/cNj+trjb+mKv35Gn/3O
Fv17YwsPPQKQFCkZCTCzCyRVOOe2mtkvJX1H0h9IGpPUIum3JD0s6RHn3CNmdrekakkL4mlzzv3J2T6bkQBkot2He/X4Gwf1+LaD2t81oIJgQGuXVuuW983W9ctmqbiAyYsAnF1a3SLonHtHkszsVklRSbsk3eOc+6mZvSxpjaQqSd3er/Qoduog3rYJzOwuSXdJ0vz585PQIyC5LpwV0n+5can+8IYLtb21W49vO6gntx/UL3YcVklBUOuWz9ItK2brgxdUqyAvMQN6I2NRHRsYUdfAiI72j+rYwIiODozq6MCIjvaPaHgsqvcvmKEPXlCtmWWcpgCyQcpuETSzWyR9RdLNkgokveGtapZUI+mIpHKvrdx7XxZn2wTOufsk3SfFRgIS2xMgdcxMK+bN0Ip5M/SnH6nX1ne79Pi2g/rZm2167I2DmlGSrw9fXKubV8zWFQuotJkrAAALJElEQVSrFAyYnHPqHxnX0f4RHTt+EPcO5EcHTju4n3LA7x85+0OUivODCgZMD27eJ0m6ZE651lxYrTVLq3XZvBnKC/IYEiATpep0QK2kH0r6kHOu38z+h6Tdkh5ULAzcJmmVpCucc18ws59K+kdJ8+Npc849c7bP5nQAstHIWFQvNnXo8TdiowMDI+OqLC1QXsB0bGBUI+PRs/5uuChPlaUFmlFSoIqSfFWUxJYrS/O9tlh7rK1AM0ryVZQfVDTq9ObBbm18u0O/fKdDr+0/pvGoU6gwT1cvmak1S6t1zYXVmjOjOIX/JQBMJq3uDjCzP5L0u5IOeU0/U+yCvlJJ/+Gc+5qZFUp6RLGD/DbF7gQoiKeNuwOQywZHxvXsrsN6fleH8gKmitJTD+75Ew745cX5CfurvXtwVC83HdHG3R3auLtDbd1DkqQLasq05sJYILh8YSUPXwJ8kFYhwE+EACD5nHNqau87EQi2vNulkbGoivIDWrWoKnbq4MJqLZxZyiyJQAoQAjyEACD1BkfGtfndztipg90d2nukX5I0r7JY11wQCwRXLZmpskJmLgeSgRDgIQQA/tvfOaCN73Ro49sd2rTniPpHxpUXMDXUVWjNhTVac2G16iMhRgmABCEEeAgBQHoZGYvq1X1HT5w62NnWIyn2WObls8Oqj4RVHwlpWW1Yi6vLEnYLJJBLCAEeQgCQ3tp7hvTLd45o895O7TrUo92H+zQyFru7IT9oWlxdpuWRWDhYFgmpPhJmngLgPAgBHkIAkFnGxqN690i/drT1aNehXu1s69HOth4d7hk+sc3MskLVe4GAUQPgTGk1YyAAxCsvGNAFs0K6YFZIHz2lvat/RLvaerTTCwa7DvXo/pebJ4waLKkJqb42xKgBECdCAICMUFlaoKuWzNRVS2aeaJts1OClPUf049cPnNjm+KjB4uoyVYcKVVVaoKqyQlWVFZxYLi0IclEichIhAEDGmsqowQ8bW846NXJhXkAzywpVWVrghYNCzSwr8N5PDAxVpQVMgISsQQgAkHUmGzWQpKHRcXX2j6izb9j7edpy/7A6+0b0zuE+dfQNnzjVcLqywrwzAsOMkgKFivJUVhh7hYryVFaUp1Bhfuynt64wL8CoA9IGIQBAzijKD2rOjOK4nm9w/EFM5wsMB44NanvrsfM+s+G4/KApVJR/IiyUFeUpXHRyuawwX6FTQkMsUOSrIC+ggEkBMwXMZN6ynWiLPXDq9G3slPfHt5nsd8oK8wgnOYgQAACTMLMTB+EFVaVx/c7w2Lj6hsbUNzym3qHYq294TH3DoxPfD42pd2j0xHZt3UMn1vUOjWp0PPV3bVWHCrVqUZWuXFSlVYsqmeI5RxACACBBCvOCKiwLqmqadyQMj43HQoEXDHqGRjUyFpVTbIQiGpWizinqvPdOcjr1/clt3CnrJvxO1MlJirrYBZY72nq0aU+nnth2UJI0K3xqKKjSgqoSQkEWIgQAQJo5HiZSfXujc07vHunXpr2d2ry3Sy81deqxN2KhIFJedCIQXLm4SnMrigkFWYDJggAAk3LOaU+HFwr2dGrz3k519o9IkubMKNYq79RBLBSU+FwtTsWMgR5CAAAkhnNO77T3afPeTm3yQsHRgVFJsSdErloYGyVYtahKs+O4+BLJQwjwEAIAIDmiUafd7b3avKdTm/Z2asu7XTrmhYIFVSUTQkFteZHP1eYWQoCHEAAAqRGNOu061OtdU9CpLXs71TM0JkkqL87XvMpiza8s0bzKEs2rKNH8ythr9oxinvuQYIQADyEAAPwxHnXa2dajre926d0j/drfNaCWowNq7RqcMKdCwKRIebHmVRafDAdVJZrrLc8sK+AixCniAUIAAF8FA6aL55Tr4jnlE9qjUafDvUNq6RrU/q6BWDjwXht3d6i9d3jC9sX5wclHEapKNLeiWCUFmXsoc85paDSqonx/ZpLM3P9yAICMFAiYIuXFipQX6/KFlWesHxodV+vRWDjY3zmglqODJ4LCpj2dZzwDoqq0QOXF+SopDKqkIE+lBUGVFOapJD+o0sI8lRSc8rMgTyWF3s+CM9eXFOQpGDj3wdg5p4GRcfUPH58M6uQkUP0jY+objk0ader6M5a9OSD6R8Y1HnV6689vVGlh6g/JhAAAQFopyg9qSU1IS2pCZ6xzzqmrf8Q7tTColq4BtR4dVN/wmAaGYwfhTm/98QP18QNt/J8fUEnBydBQmB84edD3DvTx/HMBk0oLT07/fHx5VqjIWw6qrCjWHvDpdAchAACQMczMe7JjoS6bXxHX7zjnNDIe1cDwuPpHxk4c0AdHxtU/Mq6BkTH1D5/2c2TsxPZDo1HNrYgFgrKiiQf0M5eDJ7Ypzk//R1QTAgAAWc3MYrMw5gVVUVrgdzlphXsyAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEAAAAchQhAACAHEUIAAAgRxECAADIUeZc/M9YzkRm1iFpn48lzJR0xMfPTzX6m71yqa8S/c122d7fBc656vNtlPUhwG9m1uica/C7jlShv9krl/oq0d9sl2v9PRtOBwAAkKMIAQAA5ChCQPLd53cBKUZ/s1cu9VWiv9ku1/o7Ka4JAAAgRzESAABAjiIEJIiZPWBmm83scTPLm2T9SjNrNbMXvddSP+pMhHj6YmZFZvakmW0zswfNzPyoNRHMbO0pfW0xsw2TbJPx+9fM8s3sCW85rv2Xyfv51P5678/5Hfa2ydj9fNr+jasfmbp/T+vreb+/3nYZu2+ngxCQAGa2WlKec26VpLCkGybZrELSvc651d7r7ZQWmVjx9GW9pFbn3Apv+3UprTCBnHMvHO+rpO2SXp9ks4zev2ZWLOlVndxP8e6/jNzPp/c3zu+wlKH7eZL9G28/Mm7/nt7XOL+/Uobu2+kiBCTGYUnf8JbP9t+0QtLHzWyrmT2SKYn6LOLpy3WSnvaWn5N0bcqqSxIzK5G0xDm3fZLVGb1/nXODzrlLJbV6TfHuv4zcz5P0N57vsJSh+3mS/sbbj4zbv5P0VdJ5v79Shu7b6SI
EJIBz7h3n3FYzu1VSVNIvJtmsSdI9zrnLJUUkrUlljQkWT1+qJHV7yz2SKlNUWzKtk/TsWdZl0/6V4t9/WbGf4/wOS9mzn+PtR1bsX8+5vr9S9uzbKZn0vBemzsxukfQVSTc758Ym2aRZ0punLNekprKkaNb5+3JEUrm3XK7smJ7zZkk/Psu6ZmXP/pXi339Zs5/j+A5L2bOfmxVfP7Jm/+rc318pe/btlDASkABmVivpq5Jucs71nmWzuyXdZmYBSRfr5P9smSievjyrk+dVr5P0fIpqSwpvaHCtYkOik8mm/SvFv/+yYj/H+R2Wsmc/x9uPbNm/5/v+Stmzb6eEEJAYGxQbPnrKu6r0c2b296dt801Jd0raIulR59yOVBeZQBP6Imlwkv4+LGmOmW2X1KVzD8NlgpWSdjjnhsxsYZbvX2mS/XeWfmfLfj79O/w7Wb6fz+hHlu/fE99fScryfTslTBYEAECOYiQAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQBAwpnZHWZ2h991ADg3QgAAADmKEAAgaczsIjN73sxCftcC4ExMGwwgWSKKTTbzofPMwgfAJ4wEAEiW31fsSW4L/C4EwOQIAQCS5S8lfcn7CSANEQIAJMuQc65F0i7vCX0A0gzPDgAAIEcxEgAAQI4iBAAAkKMIAQAA5ChCAAAAOYoQAABAjiIEAACQowgBAADkqP8P9dzuqF7ZCiIAAAAASUVORK5CYII=\n", 777 | "text/plain": [ 778 | "" 779 | ] 780 | }, 781 | "metadata": {}, 782 | "output_type": "display_data" 783 | } 784 | ], 785 | "source": [ 786 | "fig, ax = plt.subplots(1,1, figsize=(8,6))\n", 787 | "ax.plot(range(2,20), cost)\n", 788 | "ax.set_xlabel('k')\n", 789 | "ax.set_ylabel('cost')" 790 | ] 791 | }, 792 | { 793 | "cell_type": "markdown", 794 | "metadata": {}, 795 | "source": [ 796 | "可以见到在k=5时,出现了拐角,我们取k=5" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": 23, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "kmeans = KMeans(k=5, seed=1)\n", 806 | "km_model = kmeans.fit(df_km)\n", 807 | "centers = km_model.clusterCenters()" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 24, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/plain": [ 818 | "[array([55.2962963 , 49.51851852]),\n", 819 | " array([25.72727273, 79.36363636]),\n", 820 | " array([86.53846154, 82.12820513]),\n", 821 | " array([88.2 , 17.11428571]),\n", 822 | " array([26.30434783, 20.91304348])]" 823 | ] 824 | }, 825 | "execution_count": 24, 826 | "metadata": {}, 827 | "output_type": "execute_result" 828 | } 829 | ], 830 | "source": [ 831 | "centers" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 32, 837 | "metadata": {}, 838 | "outputs": [], 839 | "source": [ 840 | "transformed = km_model.transform(df_km).select('CustomerID', 'prediction')" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 33, 846 | "metadata": {}, 847 | "outputs": [ 848 | { 849 | "name": "stdout", 850 | "output_type": "stream", 851 | "text": [ 852 | "+----------+----------+\n", 853 | "|CustomerID|prediction|\n", 854 | "+----------+----------+\n", 855 | "| 1| 4|\n", 856 | "| 2| 1|\n", 857 | "| 3| 4|\n", 858 | "+----------+----------+\n", 859 | "only showing top 3 rows\n", 860 | "\n" 861 | ] 862 | } 863 | ], 864 | "source": [ 865 | "transformed.show(3)" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 35, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "df_pred = df.join(transformed, 'CustomerID')" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 36, 880 | "metadata": {}, 881 | "outputs": [ 882 | { 883 | "name": "stdout", 884 | "output_type": "stream", 885 | "text": [ 886 | "+----------+------+---+------+-----+----------+\n", 887 | "|CustomerID|Gender|Age|Income|Spend|prediction|\n", 888 | "+----------+------+---+------+-----+----------+\n", 889 | "| 1| Male| 19| 15| 39| 4|\n", 890 | "| 2| Male| 21| 15| 81| 1|\n", 891 | "| 3|Female| 20| 16| 6| 4|\n", 892 | "+----------+------+---+------+-----+----------+\n", 893 | "only showing top 3 rows\n", 894 | "\n" 895 | ] 896 | } 897 | ], 898 | "source": [ 
899 | "df_pred.show(3)" 900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": 39, 905 | "metadata": {}, 906 | "outputs": [ 907 | { 908 | "data": { 909 | "application/vnd.plotly.v1+json": { 910 | "data": [ 911 | { 912 | "marker": { 913 | "color": [ 914 | 4, 915 | 1, 916 | 4, 917 | 1, 918 | 4, 919 | 1, 920 | 4, 921 | 1, 922 | 4, 923 | 1, 924 | 4, 925 | 1, 926 | 4, 927 | 1, 928 | 4, 929 | 1, 930 | 4, 931 | 1, 932 | 4, 933 | 1, 934 | 4, 935 | 1, 936 | 4, 937 | 1, 938 | 4, 939 | 1, 940 | 4, 941 | 1, 942 | 4, 943 | 1, 944 | 4, 945 | 1, 946 | 4, 947 | 1, 948 | 4, 949 | 1, 950 | 4, 951 | 1, 952 | 4, 953 | 1, 954 | 4, 955 | 1, 956 | 4, 957 | 0, 958 | 4, 959 | 1, 960 | 0, 961 | 0, 962 | 0, 963 | 0, 964 | 0, 965 | 0, 966 | 0, 967 | 0, 968 | 0, 969 | 0, 970 | 0, 971 | 0, 972 | 0, 973 | 0, 974 | 0, 975 | 0, 976 | 0, 977 | 0, 978 | 0, 979 | 0, 980 | 0, 981 | 0, 982 | 0, 983 | 0, 984 | 0, 985 | 0, 986 | 0, 987 | 0, 988 | 0, 989 | 0, 990 | 0, 991 | 0, 992 | 0, 993 | 0, 994 | 0, 995 | 0, 996 | 0, 997 | 0, 998 | 0, 999 | 0, 1000 | 0, 1001 | 0, 1002 | 0, 1003 | 0, 1004 | 0, 1005 | 0, 1006 | 0, 1007 | 0, 1008 | 0, 1009 | 0, 1010 | 0, 1011 | 0, 1012 | 0, 1013 | 0, 1014 | 0, 1015 | 0, 1016 | 0, 1017 | 0, 1018 | 0, 1019 | 0, 1020 | 0, 1021 | 0, 1022 | 0, 1023 | 0, 1024 | 0, 1025 | 0, 1026 | 0, 1027 | 0, 1028 | 0, 1029 | 0, 1030 | 0, 1031 | 0, 1032 | 0, 1033 | 0, 1034 | 0, 1035 | 0, 1036 | 0, 1037 | 2, 1038 | 3, 1039 | 2, 1040 | 0, 1041 | 2, 1042 | 3, 1043 | 2, 1044 | 3, 1045 | 2, 1046 | 0, 1047 | 2, 1048 | 3, 1049 | 2, 1050 | 3, 1051 | 2, 1052 | 3, 1053 | 2, 1054 | 3, 1055 | 2, 1056 | 0, 1057 | 2, 1058 | 3, 1059 | 2, 1060 | 3, 1061 | 2, 1062 | 3, 1063 | 2, 1064 | 3, 1065 | 2, 1066 | 3, 1067 | 2, 1068 | 3, 1069 | 2, 1070 | 3, 1071 | 2, 1072 | 3, 1073 | 2, 1074 | 3, 1075 | 2, 1076 | 3, 1077 | 2, 1078 | 3, 1079 | 2, 1080 | 3, 1081 | 2, 1082 | 3, 1083 | 2, 1084 | 3, 1085 | 2, 1086 | 3, 1087 | 2, 1088 | 3, 1089 | 2, 1090 | 3, 1091 | 2, 1092 | 3, 1093 | 2, 1094 | 3, 1095 | 2, 1096 | 3, 1097 | 2, 1098 | 3, 1099 | 2, 1100 | 3, 1101 | 2, 1102 | 3, 1103 | 2, 1104 | 3, 1105 | 2, 1106 | 3, 1107 | 2, 1108 | 3, 1109 | 2, 1110 | 3, 1111 | 2, 1112 | 3, 1113 | 2 1114 | ], 1115 | "colorscale": "Viridis", 1116 | "size": 10 1117 | }, 1118 | "mode": "markers", 1119 | "type": "scatter", 1120 | "x": [ 1121 | 15, 1122 | 15, 1123 | 16, 1124 | 16, 1125 | 17, 1126 | 17, 1127 | 18, 1128 | 18, 1129 | 19, 1130 | 19, 1131 | 19, 1132 | 19, 1133 | 20, 1134 | 20, 1135 | 20, 1136 | 20, 1137 | 21, 1138 | 21, 1139 | 23, 1140 | 23, 1141 | 24, 1142 | 24, 1143 | 25, 1144 | 25, 1145 | 28, 1146 | 28, 1147 | 28, 1148 | 28, 1149 | 29, 1150 | 29, 1151 | 30, 1152 | 30, 1153 | 33, 1154 | 33, 1155 | 33, 1156 | 33, 1157 | 34, 1158 | 34, 1159 | 37, 1160 | 37, 1161 | 38, 1162 | 38, 1163 | 39, 1164 | 39, 1165 | 39, 1166 | 39, 1167 | 40, 1168 | 40, 1169 | 40, 1170 | 40, 1171 | 42, 1172 | 42, 1173 | 43, 1174 | 43, 1175 | 43, 1176 | 43, 1177 | 44, 1178 | 44, 1179 | 46, 1180 | 46, 1181 | 46, 1182 | 46, 1183 | 47, 1184 | 47, 1185 | 48, 1186 | 48, 1187 | 48, 1188 | 48, 1189 | 48, 1190 | 48, 1191 | 49, 1192 | 49, 1193 | 50, 1194 | 50, 1195 | 54, 1196 | 54, 1197 | 54, 1198 | 54, 1199 | 54, 1200 | 54, 1201 | 54, 1202 | 54, 1203 | 54, 1204 | 54, 1205 | 54, 1206 | 54, 1207 | 57, 1208 | 57, 1209 | 58, 1210 | 58, 1211 | 59, 1212 | 59, 1213 | 60, 1214 | 60, 1215 | 60, 1216 | 60, 1217 | 60, 1218 | 60, 1219 | 61, 1220 | 61, 1221 | 62, 1222 | 62, 1223 | 62, 1224 | 62, 1225 | 62, 1226 | 62, 1227 | 63, 1228 | 63, 1229 | 63, 1230 | 63, 1231 | 63, 1232 | 63, 
1233 | 64, 1234 | 64, 1235 | 65, 1236 | 65, 1237 | 65, 1238 | 65, 1239 | 67, 1240 | 67, 1241 | 67, 1242 | 67, 1243 | 69, 1244 | 69, 1245 | 70, 1246 | 70, 1247 | 71, 1248 | 71, 1249 | 71, 1250 | 71, 1251 | 71, 1252 | 71, 1253 | 72, 1254 | 72, 1255 | 73, 1256 | 73, 1257 | 73, 1258 | 73, 1259 | 74, 1260 | 74, 1261 | 75, 1262 | 75, 1263 | 76, 1264 | 76, 1265 | 77, 1266 | 77, 1267 | 77, 1268 | 77, 1269 | 78, 1270 | 78, 1271 | 78, 1272 | 78, 1273 | 78, 1274 | 78, 1275 | 78, 1276 | 78, 1277 | 78, 1278 | 78, 1279 | 78, 1280 | 78, 1281 | 79, 1282 | 79, 1283 | 81, 1284 | 81, 1285 | 85, 1286 | 85, 1287 | 86, 1288 | 86, 1289 | 87, 1290 | 87, 1291 | 87, 1292 | 87, 1293 | 87, 1294 | 87, 1295 | 88, 1296 | 88, 1297 | 88, 1298 | 88, 1299 | 93, 1300 | 93, 1301 | 97, 1302 | 97, 1303 | 98, 1304 | 98, 1305 | 99, 1306 | 99, 1307 | 101, 1308 | 101, 1309 | 103, 1310 | 103, 1311 | 103, 1312 | 103, 1313 | 113, 1314 | 113, 1315 | 120, 1316 | 120, 1317 | 126, 1318 | 126, 1319 | 137, 1320 | 137 1321 | ], 1322 | "y": [ 1323 | 39, 1324 | 81, 1325 | 6, 1326 | 77, 1327 | 40, 1328 | 76, 1329 | 6, 1330 | 94, 1331 | 3, 1332 | 72, 1333 | 14, 1334 | 99, 1335 | 15, 1336 | 77, 1337 | 13, 1338 | 79, 1339 | 35, 1340 | 66, 1341 | 29, 1342 | 98, 1343 | 35, 1344 | 73, 1345 | 5, 1346 | 73, 1347 | 14, 1348 | 82, 1349 | 32, 1350 | 61, 1351 | 31, 1352 | 87, 1353 | 4, 1354 | 73, 1355 | 4, 1356 | 92, 1357 | 14, 1358 | 81, 1359 | 17, 1360 | 73, 1361 | 26, 1362 | 75, 1363 | 35, 1364 | 92, 1365 | 36, 1366 | 61, 1367 | 28, 1368 | 65, 1369 | 55, 1370 | 47, 1371 | 42, 1372 | 42, 1373 | 52, 1374 | 60, 1375 | 54, 1376 | 60, 1377 | 45, 1378 | 41, 1379 | 50, 1380 | 46, 1381 | 51, 1382 | 46, 1383 | 56, 1384 | 55, 1385 | 52, 1386 | 59, 1387 | 51, 1388 | 59, 1389 | 50, 1390 | 48, 1391 | 59, 1392 | 47, 1393 | 55, 1394 | 42, 1395 | 49, 1396 | 56, 1397 | 47, 1398 | 54, 1399 | 53, 1400 | 48, 1401 | 52, 1402 | 42, 1403 | 51, 1404 | 55, 1405 | 41, 1406 | 44, 1407 | 57, 1408 | 46, 1409 | 58, 1410 | 55, 1411 | 60, 1412 | 46, 1413 | 55, 1414 | 41, 1415 | 49, 1416 | 40, 1417 | 42, 1418 | 52, 1419 | 47, 1420 | 50, 1421 | 42, 1422 | 49, 1423 | 41, 1424 | 48, 1425 | 59, 1426 | 55, 1427 | 56, 1428 | 42, 1429 | 50, 1430 | 46, 1431 | 43, 1432 | 48, 1433 | 52, 1434 | 54, 1435 | 42, 1436 | 46, 1437 | 48, 1438 | 50, 1439 | 43, 1440 | 59, 1441 | 43, 1442 | 57, 1443 | 56, 1444 | 40, 1445 | 58, 1446 | 91, 1447 | 29, 1448 | 77, 1449 | 35, 1450 | 95, 1451 | 11, 1452 | 75, 1453 | 9, 1454 | 75, 1455 | 34, 1456 | 71, 1457 | 5, 1458 | 88, 1459 | 7, 1460 | 73, 1461 | 10, 1462 | 72, 1463 | 5, 1464 | 93, 1465 | 40, 1466 | 87, 1467 | 12, 1468 | 97, 1469 | 36, 1470 | 74, 1471 | 22, 1472 | 90, 1473 | 17, 1474 | 88, 1475 | 20, 1476 | 76, 1477 | 16, 1478 | 89, 1479 | 1, 1480 | 78, 1481 | 1, 1482 | 73, 1483 | 35, 1484 | 83, 1485 | 5, 1486 | 93, 1487 | 26, 1488 | 75, 1489 | 20, 1490 | 95, 1491 | 27, 1492 | 63, 1493 | 13, 1494 | 75, 1495 | 10, 1496 | 92, 1497 | 13, 1498 | 86, 1499 | 15, 1500 | 69, 1501 | 14, 1502 | 90, 1503 | 32, 1504 | 86, 1505 | 15, 1506 | 88, 1507 | 39, 1508 | 97, 1509 | 24, 1510 | 68, 1511 | 17, 1512 | 85, 1513 | 23, 1514 | 69, 1515 | 8, 1516 | 91, 1517 | 16, 1518 | 79, 1519 | 28, 1520 | 74, 1521 | 18, 1522 | 83 1523 | ] 1524 | } 1525 | ], 1526 | "layout": {} 1527 | }, 1528 | "text/html": [ 1529 | "
" 1530 | ], 1531 | "text/vnd.plotly.v1+html": [ 1532 | "
" 1533 | ] 1534 | }, 1535 | "metadata": {}, 1536 | "output_type": "display_data" 1537 | } 1538 | ], 1539 | "source": [ 1540 | "pd_df = df_pred.toPandas()\n", 1541 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend, \n", 1542 | " mode='markers',\n", 1543 | " marker = {'size':10,'color':pd_df.prediction,'colorscale':'Viridis'})\n", 1544 | "iplot([trace])" 1545 | ] 1546 | }, 1547 | { 1548 | "cell_type": "markdown", 1549 | "metadata": {}, 1550 | "source": [ 1551 | "## BisectingKMeans 二分k均值\n", 1552 | "`pyspark.ml.clustering.BisectingKMeans(featuresCol='features', predictionCol='prediction', maxIter=20, seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure='euclidean')`" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "markdown", 1557 | "metadata": {}, 1558 | "source": [ 1559 | "二分k均值(bisecting k-means)算法的主要思想是:首先将所有点作为一个簇,然后将该簇一分为二。之后选择能最大程度降低聚类代价函数(也就是误差平方和)的簇划分为两个簇。以此进行下去,直到簇的数目等于用户给定的数目k为止。\n", 1560 | "\n", 1561 | " 以上隐含着一个原则是:因为聚类的误差平方和能够衡量聚类性能,该值越小表示数据点月接近于它们的质心,聚类效果就越好。所以我们就需要对误差平方和最大的簇进行再一次的划分,因为误差平方和越大,表示该簇聚类越不好,越有可能是多个簇被当成一个簇了,所以我们首先需要对这个簇进行划分。" 1562 | ] 1563 | }, 1564 | { 1565 | "cell_type": "markdown", 1566 | "metadata": {}, 1567 | "source": [ 1568 | "**参数**\n", 1569 | "\n", 1570 | "`maxIter: 最大迭代次数\n", 1571 | "K:聚类簇数\n", 1572 | "minDivisibleClusterSize: 聚类的最少数据点数(>1)或比例(0-1之间)\n", 1573 | "fit(dataset, params=None)方法`" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "markdown", 1578 | "metadata": {}, 1579 | "source": [ 1580 | "**model属性**\n", 1581 | "\n", 1582 | "`\n", 1583 | "clusterCenters(): 获取聚类中心,numpy array类型\n", 1584 | "computeCost():计算点与其中心的平方和距离\n", 1585 | "Transform():对预测数据进行预测\n", 1586 | "hasSummary:训练模型是否有summary\n", 1587 | "Summary:获取summary\n", 1588 | "`" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "markdown", 1593 | "metadata": {}, 1594 | "source": [ 1595 | "**Summary拥有的属性**\n", 1596 | "\n", 1597 | "`\n", 1598 | "cluster:预测的聚类中心\n", 1599 | "clusterSizes:每个聚类的大小\n", 1600 | "K:聚类个数\n", 1601 | "Predictions:由模型的transforn方法产生的预测数据框\n", 1602 | "`" 1603 | ] 1604 | }, 1605 | { 1606 | "cell_type": "markdown", 1607 | "metadata": {}, 1608 | "source": [ 1609 | "## GaussianMixture 高斯混合模型\n", 1610 | "`pyspark.ml.clustering.GaussianMixture(featuresCol='features', predictionCol='prediction', k=2, probabilityCol='probability', tol=0.01, maxIter=100, seed=None)`" 1611 | ] 1612 | }, 1613 | { 1614 | "cell_type": "markdown", 1615 | "metadata": {}, 1616 | "source": [ 1617 | "对象实现了用来拟合高斯混合模型的 期望最大化 (EM) 算法。它还可以为多变量模型绘制置信区间,同时计算 BIC(Bayesian Information Criterion,贝叶斯信息准则)来评估数据中聚类的数量。" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "markdown", 1622 | "metadata": {}, 1623 | "source": [ 1624 | "优点:GMM的优点是投影后样本点不是得到一个确定的分类标记,而是得到每个类的概率,这是一个重要信息。GMM不仅可以用在聚类上,也可以用在概率密度估计上。缺点:当每个混合模型没有足够多的点时,估算协方差变得困难起来,同时算法会发散并且找具有无穷大似然函数值的解,除非人为地对协方差进行正则化。GMM每一步迭代的计算量比较大,大于k-means。GMM的求解办法基于EM算法,因此有可能陷入局部极值,这和初始值的选取十分相关了。\n", 1625 | "\n" 1626 | ] 1627 | }, 1628 | { 1629 | "cell_type": "markdown", 1630 | "metadata": {}, 1631 | "source": [ 1632 | "注意对于高维数据(具有许多功能),此算法可能表现不佳。这是由于高维数据(a)使得难以聚类(基于统计/理论论证)和(b)高斯分布的数值问题。" 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "markdown", 1637 | "metadata": {}, 1638 | "source": [ 1639 | "**参数**\n", 1640 | "\n", 1641 | "`fit(dataset,params=None)方法\n", 1642 | "k: 独立高斯分布的个数,>1\n", 1643 | "maxIter: 最大迭代次数 >=0\n", 1644 | "tol: 迭代算法的收敛偏差 >=0\n", 1645 | "Setter方法和getter方法`" 1646 | ] 1647 | }, 1648 | { 1649 | "cell_type": "markdown", 1650 | "metadata": {}, 1651 | "source": [ 1652 | "**model属性**\n", 1653 | "\n", 1654 | "`\n", 1655 | "gaussianDF: 
抽取高斯分布作为数据框,每一行代表高斯分布,有两列:mean(vector)和 cov(Matrix)\n", 1656 | "hasSummary: 模型是否有总括函数\n", 1657 | "summary: 获取总括信息\n", 1658 | "transform(dataset,params=None)方法\n", 1659 | "weights: 高斯混合模型的权重,和为1\n", 1660 | "`" 1661 | ] 1662 | }, 1663 | { 1664 | "cell_type": "code", 1665 | "execution_count": 45, 1666 | "metadata": {}, 1667 | "outputs": [], 1668 | "source": [ 1669 | "spark.stop()" 1670 | ] 1671 | }, 1672 | { 1673 | "cell_type": "code", 1674 | "execution_count": null, 1675 | "metadata": {}, 1676 | "outputs": [], 1677 | "source": [] 1678 | } 1679 | ], 1680 | "metadata": { 1681 | "kernelspec": { 1682 | "display_name": "Python 3", 1683 | "language": "python", 1684 | "name": "python3" 1685 | }, 1686 | "language_info": { 1687 | "codemirror_mode": { 1688 | "name": "ipython", 1689 | "version": 3 1690 | }, 1691 | "file_extension": ".py", 1692 | "mimetype": "text/x-python", 1693 | "name": "python", 1694 | "nbconvert_exporter": "python", 1695 | "pygments_lexer": "ipython3", 1696 | "version": "3.6.4" 1697 | } 1698 | }, 1699 | "nbformat": 4, 1700 | "nbformat_minor": 2 1701 | } 1702 | -------------------------------------------------------------------------------- /pyspark-RDD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "RDD(Resilient Distributed Dataset)叫做弹性分布式数据集,是Spark中最基本的数据抽象,它代表一个不可变、可分区、里面的元素可并行计算的集合。RDD具有数据流模型的特点:自动容错、位置感知性调度和可伸缩性。RDD允许用户在执行多个查询时显式地将工作集缓存在内存中,后续的查询能够重用工作集,这极大地提升了查询速度。\n", 8 | "\n", 9 | "(1)一组分片(Partition),即数据集的基本组成单位。对于RDD来说,每个分片都会被一个计算任务处理,并决定并行计算的粒度。用户可以在创建RDD时指定RDD的分片个数,如果没有指定,那么就会采用默认值。默认值就是程序所分配到的CPU Core的数目。\n", 10 | "\n", 11 | "(2)一个计算每个分区的函数。Spark中RDD的计算是以分片为单位的,每个RDD都会实现compute函数以达到这个目的。compute函数会对迭代器进行复合,不需要保存每次计算的结果。\n", 12 | "\n", 13 | "(3)RDD之间的依赖关系。RDD的每次转换都会生成一个新的RDD,所以RDD之间就会形成类似于流水线一样的前后依赖关系。在部分分区数据丢失时,Spark可以通过这个依赖关系重新计算丢失的分区数据,而不是对RDD的所有分区进行重新计算。\n", 14 | "\n", 15 | "(4)一个Partitioner,即RDD的分片函数。当前Spark中实现了两种类型的分片函数,一个是基于哈希的HashPartitioner,另外一个是基于范围的RangePartitioner。只有对于于key-value的RDD,才会有Partitioner,非key-value的RDD的Parititioner的值是None。Partitioner函数不但决定了RDD本身的分片数量,也决定了parent RDD Shuffle输出时的分片数量。\n", 16 | "\n", 17 | "(5)一个列表,存储存取每个Partition的优先位置(preferred location)。对于一个HDFS文件来说,这个列表保存的就是每个Partition所在的块的位置。按照“移动数据不如移动计算”的理念,Spark在进行任务调度的时候,会尽可能地将计算任务分配到其所要处理数据块的存储位置。\n", 18 | "\n", 19 | "使用手册 \n", 20 | "http://spark.apache.org/docs/latest/api/python/pyspark.html\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "#pyspark.SparkContext()是spark应用的入口,也可以称为驱动\n", 30 | "from pyspark import SparkConf, SparkContext" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "conf = SparkConf().setAppName(\"sparkApp1\").setMaster(\"local\")\n", 40 | "sc = SparkContext.getOrCreate(conf)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "[0, 2, 3, 4, 6]\n", 53 | "[[0], [2], [3], [4], [6]]\n", 54 | "[0, 2, 4]\n", 55 | "[[], [0], [], [2], [4]]\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "#parallelize(c,numSlices=None)分发本地Python集合以形成RDD。如果输入表示性能范围,则建议使用xrange。\n", 61 | "#glom()通过将每个分区内的所有元素合并到一个列表中返回一个RDD。\n", 62 | "rdd1 = sc.parallelize([0,2,3,4,6], 5)\n", 63 | "rdd2 = sc.parallelize(range(0, 6, 2), 5)\n", 64 | 
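# A small check of the partition notes above, assuming the SparkContext `sc` and the
# rdd1/rdd2 created in this cell; it only inspects partition counts.
print(rdd1.getNumPartitions())   # 5, the numSlices passed to parallelize above
print(sc.defaultParallelism)     # the default used when numSlices is omitted, tied to the available cores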
"print(rdd1.collect())\n", 65 | "print(rdd1.glom().collect())\n", 66 | "print(rdd2.collect())\n", 67 | "print(rdd2.glom().collect())" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "[0, 9]" 79 | ] 80 | }, 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "#runJob(rdd, partitionFunc, partitions=None, allowLocal=False)\n", 88 | "#在指定的分区集合上执行给定的分区,将结果作为元素的数组返回。如果没有指定分区,那么它将在所有分区上运行。\n", 89 | "sc.runJob(rdd1, lambda part: [x * x for x in part], [0, 2], True)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "1528077753028\n", 102 | "ffzs\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "print(sc.startTime)\n", 108 | "print(sc.sparkUser())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# rdd.glom()\n", 118 | "# glom()定义了将原rdd相同分区的元素放在一个列表中构成新的rdd的转换操作。\n", 119 | "# rdd.collect()\n", 120 | "# 返回由rdd元素组成的列表\n", 121 | "# rdd.collectAsMap()\n", 122 | "# 将键值对形式的RDD以字典的形式返回给master " 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 37, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# cache()\n", 132 | "# 将RDD持久化为MEMORY_ONLY" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 34, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "[('a', 'aa', 1), ('b', 'bb', 1), ('c', 'cc', 1)]" 144 | ] 145 | }, 146 | "execution_count": 34, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "# map(f, preservesPartitioning=False)\n", 153 | "# 通过对这个RDD的每个元素应用一个函数来返回一个新的RDD\n", 154 | "rdd = sc.parallelize([\"b\", \"a\", \"c\"])\n", 155 | "sorted(rdd.map(lambda x:(x, x*2, 1)).collect())" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 38, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "[1, 1, 1, 2, 2, 3]\n", 168 | "[[(2, 2), (2, 2)], [(3, 3), (3, 3)], [(4, 4), (4, 4)]]\n", 169 | "[(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "#flatMap(f, preservesPartitioning=False)\n", 175 | "#首先将一个函数应用到这个RDD的所有元素上,然后将结果全部展开,返回一个新的RDD\n", 176 | "rdd = sc.parallelize([2, 3, 4])\n", 177 | "print(sorted(rdd.flatMap(lambda x: range(1, x)).collect()))\n", 178 | "print(sorted(rdd.map(lambda x: [(x, x), (x, x)]).collect()))\n", 179 | "print(sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect()))\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 39, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "[('a', 3), ('b', 1)]" 191 | ] 192 | }, 193 | "execution_count": 39, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "# mapValues(f)\n", 200 | "#通过map函数对RDD中的每个key传递value,而不改变键;同时保留了原始的RDD分区。\n", 201 | "x = sc.parallelize([(\"a\", [\"apple\", \"banana\", \"lemon\"]), (\"b\", [\"grapes\"])])\n", 202 | "def f(x): return len(x)\n", 203 | "x.mapValues(f).collect()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 52, 209 | "metadata": {}, 210 | 
"outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]\n", 216 | "[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "#flatMapValues(f)\n", 222 | "#通过flatMap函数传递键值对RDD中的每个值,而不改变键;这也保留了原始的RDD分区。\n", 223 | "x = sc.parallelize([(\"a\", [\"x\", \"y\", \"z\"]), (\"b\", [\"p\", \"r\"])])\n", 224 | "def f(x): return x\n", 225 | "print(x.flatMapValues(f).collect())\n", 226 | "print(x.mapValues(f).collect())" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 53, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "[3, 7]" 238 | ] 239 | }, 240 | "execution_count": 53, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "# mapPartitions(f, preservesPartitioning=False)\n", 247 | "# 与map不同,map是对每一个元素用函数作用;而mapPartitions是对每一个分区用一个函数去作用,每一个分区的元素先构成一个迭代器iterator,iterator是一个像列表,但里面的元素又保持分布式特点的一类对象;输入的参数就是这个iterator,然后对iterator进行运算,iterator支持的函数不是太多,sum,count等一些spark定义的基本函数应该都是支持的。但如果要进行更为复杂的一些个性化函数运算,可以就用不了。实践中发生可以通过[x for i in iterator]的方式,将iterator转换为列表,然后就可以进行各种操作。但是这样在分区内部或分组内部就失去了分布式运算的特点。\n", 248 | "# yield是生成的意思,但是在python中则是作为生成器理解,生成器的用处主要可以迭代,这样简化了很多运算模型。\n", 249 | "rdd = sc.parallelize([1, 2, 3, 4], 2)\n", 250 | "def f(iterator): yield sum(iterator)\n", 251 | "rdd.mapPartitions(f).collect()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 55, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "6" 263 | ] 264 | }, 265 | "execution_count": 55, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "# mapPartitionsWithIndex(f, preservesPartitioning=False)\n", 272 | "# 通过在这个RDD的每个分区上应用一个函数来返回一个新的RDD,同时跟踪原始分区的索引。为对索引进行操作提供可能\n", 273 | "rdd = sc.parallelize([1, 2, 3, 4], 4)\n", 274 | "def f(splitIndex, iterator): yield splitIndex\n", 275 | "rdd.mapPartitionsWithIndex(f).sum()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 57, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "0" 287 | ] 288 | }, 289 | "execution_count": 57, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "# partitionBy(numPartitions, partitionFunc=)\n", 296 | "# 返回使用指定的分区器分区的RDD的副本\n", 297 | "# set().intersection 取交集\n", 298 | "pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))\n", 299 | "sets = pairs.partitionBy(2).glom().collect()\n", 300 | "len(set(sets[0]).intersection(set(sets[1])))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 63, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "[[1], [2, 3], [4, 5]]\n", 313 | "[[], [1], [4, 5], [2, 3], []]\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "# coalesce(numPartitions, shuffle=False)\n", 319 | "# 返回一个新的RDD,将RDD重新分区,减少分区不适用shuffle ,正加分区数的话要shuffle为true 同repartition\n", 320 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect())\n", 321 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(5,True).glom().collect())" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 64, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "2\n", 334 | "10\n" 335 | ] 336 | 
} 337 | ], 338 | "source": [ 339 | "# repartition(numPartitions)\n", 340 | "# 重新分区,默认shuffle 减少分区用coalesce\n", 341 | "rdd = sc.parallelize([1,2,3,4,5,6,7], 4)\n", 342 | "print(len(rdd.repartition(2).glom().collect()))\n", 343 | "print(len(rdd.repartition(10).glom().collect()))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 65, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]" 355 | ] 356 | }, 357 | "execution_count": 65, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "# zip(other)\n", 364 | "# 一个RDD作为key,另一个让RDD作为value\n", 365 | "x = sc.parallelize(range(0,5))\n", 366 | "y = sc.parallelize(range(1000, 1005))\n", 367 | "x.zip(y).collect()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 68, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "[('a', 0), ('b', 1), ('c', 2), ('d', 3)]" 379 | ] 380 | }, 381 | "execution_count": 68, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "# rdd.zipWithIndex()\n", 388 | "# RDD为key 排序位置索引作为value\n", 389 | "sc.parallelize([\"a\", \"b\", \"c\", \"d\"], 2).zipWithIndex().collect()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 73, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "['a', 'b', 'c', 'd', 'e']\n", 402 | "[['a'], ['b', 'c'], ['d', 'e']]\n", 403 | "[('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]\n" 404 | ] 405 | } 406 | ], 407 | "source": [ 408 | "# zipWithUniqueId()\n", 409 | "# 根据分区k 按公式k,n+k,2*n+k产生value,RDD为key\n", 410 | "rdd = sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"], 3)\n", 411 | "print(rdd.collect())\n", 412 | "print(rdd.glom().collect())\n", 413 | "print(rdd.zipWithUniqueId().collect())" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 4, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "name": "stdout", 423 | "output_type": "stream", 424 | "text": [ 425 | "[(0, 0), (1, 1), (4, 2)]\n", 426 | "[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]\n" 427 | ] 428 | }, 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "[(0, [[0], [0]]),\n", 433 | " (1, [[1], [1]]),\n", 434 | " (2, [[], [2]]),\n", 435 | " (3, [[], [3]]),\n", 436 | " (4, [[2], [4]])]" 437 | ] 438 | }, 439 | "execution_count": 4, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "# rdd.keyBy()\n", 446 | "# RDD通过函数创建元组\n", 447 | "x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)\n", 448 | "y = sc.parallelize(zip(range(0,5), range(0,5)))\n", 449 | "print(x.collect())\n", 450 | "print(y.collect())\n", 451 | "[(x, list(map(list, y))) for x, y in sorted(x.cogroup(y).collect())]" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 6, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "# foreach(f)\n", 461 | "# 是一个公式作用于rdd所有元素,生成非rdd\n", 462 | "def fun(x): \n", 463 | " print(x)\n", 464 | "sc.parallelize([1, 2, 3, 4, 5]).foreach(fun)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 7, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "# foreachPartition(f)\n", 474 | "# 使一个函数作用于RDD上每一个分区\n", 475 | "def fun(iterator):\n", 476 | " for x in iterator:\n", 477 | " print(x)\n", 478 | "sc.parallelize([1, 2, 3, 4, 
5]).foreachPartition(fun)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 8, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "None\n", 491 | "1\n", 492 | "2\n", 493 | "3\n", 494 | "\n" 495 | ] 496 | }, 497 | { 498 | "data": { 499 | "text/plain": [ 500 | "'\\n1\\n2\\n3\\n'" 501 | ] 502 | }, 503 | "execution_count": 8, 504 | "metadata": {}, 505 | "output_type": "execute_result" 506 | } 507 | ], 508 | "source": [ 509 | "inputData=sc.parallelize([1,2,3])\n", 510 | "def f(x):#定义一个将内容追加于文件末尾的函数\n", 511 | " with open('./example.txt','a+') as fl:\n", 512 | " print(x,file=fl)\n", 513 | "\n", 514 | "open('./example.txt','w').close()#操作之前先关闭之前可能存在的对该文件的写操作\n", 515 | "y=inputData.foreach(f)\n", 516 | "print(y)\n", 517 | "#结果为:None,因为函数f没有返回值\n", 518 | "#查看写文件的结果\n", 519 | "with open('./example.txt') as fl:\n", 520 | " print(fl.read())" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 9, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "[(0, [2, 8]), (1, [1, 1, 3, 5])]" 532 | ] 533 | }, 534 | "execution_count": 9, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "# groupBy(f, numPartitions=None, partitionFunc=)\n", 541 | "# 根据函数符合条件与否进行分组返回分组项目的RDD\n", 542 | "rdd = sc.parallelize([1, 1, 2, 3, 5, 8])\n", 543 | "result = rdd.groupBy(lambda x: x % 2).collect()\n", 544 | "sorted([(x, sorted(y)) for (x, y) in result])" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 11, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "[('a', 2), ('b', 1)]\n", 557 | "[('a', [1, 1]), ('b', [1])]\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "#groupByKey(numPartitions=None, partitionFunc=)\n", 563 | "#原rdd为键值对,groupByKey()则将原rdd的元素相同键的值编进一个sequence\n", 564 | "#如果您正在进行分组以执行每个密钥的聚合(例如总计或平均值),则使用reduceByKey或aggregateByKey将提供更好的性能。\n", 565 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", 566 | "print(sorted(rdd.groupByKey().mapValues(len).collect()))\n", 567 | "print(sorted(rdd.groupByKey().mapValues(list).collect()))" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 20, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "[('a', ([1], [2])), ('b', ([4], []))]\n", 580 | "([4], [])\n" 581 | ] 582 | } 583 | ], 584 | "source": [ 585 | "# cogroup(other, numPartitions=None)\n", 586 | "# 对于self或other中的每个关键字k,返回一个包含一个元组的结果RDD,以及该关键字在自身和其他关键字中的值列表。\n", 587 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 588 | "y = sc.parallelize([(\"a\", 2)])\n", 589 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(x.cogroup(y).collect()))])\n", 590 | "print(tuple(map(list,list(x.cogroup(y).collect()[0][1]))))\n" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 21, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "name": "stdout", 600 | "output_type": "stream", 601 | "text": [ 602 | "[('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]\n" 603 | ] 604 | } 605 | ], 606 | "source": [ 607 | "# groupWith(other, *others)\n", 608 | "# cogroup的别名,但支持多个RDD\n", 609 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n", 610 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 611 | "y = sc.parallelize([(\"a\", 2)])\n", 612 | "z = sc.parallelize([(\"b\", 42)])\n", 
613 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(w.groupWith(x, y, z).collect()))])" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 24, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "name": "stdout", 623 | "output_type": "stream", 624 | "text": [ 625 | "15\n", 626 | "abcde\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "# reduce(f)\n", 632 | "# reduce函数是将rdd中的每个元素两两之间按函数f进行操作,然后再结果再两两之间按f进行操作,一直进行下去,\n", 633 | "# 即所谓的shuffle过程。reduce得到的结果是普通的python对象,而不是rdd.\n", 634 | "# operator 操作函数 https://docs.python.org/3/library/operator.html\n", 635 | "from operator import *\n", 636 | "print(sc.parallelize([1, 2, 3, 4, 5]).reduce(add))\n", 637 | "print(sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"]).reduce(concat))" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 29, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "[('a', 2), ('b', 1)]" 649 | ] 650 | }, 651 | "execution_count": 29, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "# reduceByKey(func, numPartitions=None, partitionFunc=)\n", 658 | "# 按key分组 组内进行reduce处理\n", 659 | "from operator import *\n", 660 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", 661 | "sorted(rdd.reduceByKey(add).collect())" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 36, 667 | "metadata": {}, 668 | "outputs": [ 669 | { 670 | "name": "stdout", 671 | "output_type": "stream", 672 | "text": [ 673 | "{'a': 2, 'b': 1}\n", 674 | "[('a', 2), ('b', 1)]\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "# reduceByKeyLocally(func)\n", 680 | "# 其他与reduceByKey一样,只不过聚合后立即将键,值对以字典的形式传给到集群master,即输出为字典\n", 681 | "# 这还将在将结果发送到reducer之前在每个映射器上进行本地合并,类似于“合并器”中的MapReduce的\n", 682 | "from operator import *\n", 683 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", 684 | "print(rdd.reduceByKeyLocally(add))\n", 685 | "print(sorted(rdd.reduceByKeyLocally(add).items()))" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 38, 691 | "metadata": {}, 692 | "outputs": [ 693 | { 694 | "data": { 695 | "text/plain": [ 696 | "-5" 697 | ] 698 | }, 699 | "execution_count": 38, 700 | "metadata": {}, 701 | "output_type": "execute_result" 702 | } 703 | ], 704 | "source": [ 705 | "# treeReduce(f, depth=2)\n", 706 | "# 分区间多次进行reduce\n", 707 | "# depth 树的深度(执行次数?)\n", 708 | "add = lambda x, y: x + y\n", 709 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n", 710 | "rdd.treeReduce(add, 2)" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 28, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "[5, 6]\n", 723 | "['a', 'b']\n" 724 | ] 725 | } 726 | ], 727 | "source": [ 728 | "# rdd.keys()\n", 729 | "# 原rdd的元素为键值对,返回原rdd元素的键为元素的rdd\n", 730 | "# rdd.values()\n", 731 | "# 原rdd的元素为键值对,返回原rdd元素的值为元素的rdd\n", 732 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n", 733 | "print(w.keys().collect())\n", 734 | "print(w.values().collect())" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "`aggregate函数`\n", 742 | "\n", 743 | "将每个分区里面的元素进行聚合,然后用combine函数将每个分区的结果和初始值(zeroValue)进行combine操作。这个函数最终返回的类型不需要和RDD中元素类型一致。\n", 744 | "\n", 745 | "seqOp操作会聚合各分区中的元素,然后combOp操作把所有分区的聚合结果再次聚合,两个操作的初始值都是zeroValue. 
seqOp的操作是遍历分区中的所有元素(T),第一个T跟zeroValue做操作,结果再作为与第二个T做操作的zeroValue,直到遍历完整个分区。combOp操作是把各分区聚合的结果,再聚合。aggregate函数返回一个跟RDD不同类型的值。因此,需要一个操作seqOp来把分区中的元素T合并成一个U,另外一个操作combOp把所有U聚合。\n" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 38, 751 | "metadata": {}, 752 | "outputs": [ 753 | { 754 | "name": "stdout", 755 | "output_type": "stream", 756 | "text": [ 757 | "(10, 4)\n", 758 | "(10, 4)\n", 759 | "(10, 28)\n" 760 | ] 761 | } 762 | ], 763 | "source": [ 764 | "#aggregate(zeroValue, seqOp, combOp)\n", 765 | "seqOp = (lambda x, y : (x[0] + y, x[1] + 1))\n", 766 | "combOp = (lambda x, y : (x[0] + y[0], x[1] + y[1]))\n", 767 | "print(sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp))\n", 768 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 0), seqOp, combOp))\n", 769 | "# 三个分区多加了4个6 ?\n", 770 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 6), seqOp, combOp))" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "# aggregateByKey(zeroValue, seqFunc, combFunc, numPartitions=None, partitionFunc=)\n", 780 | "# 跟aggregate逻辑相同,bykey顾名思义 按照key分区 ,而aggregate按区分配;\n", 781 | "# 但是zeroValue与aggregate中的用法很不一样,这里的zeroValue是一个值,它即可以跟这样键聚合,也可以跟那个键聚合,而且zeroValue必须与键内聚合时定义的形式一致。" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 37, 787 | "metadata": {}, 788 | "outputs": [ 789 | { 790 | "data": { 791 | "text/plain": [ 792 | "-5" 793 | ] 794 | }, 795 | "execution_count": 37, 796 | "metadata": {}, 797 | "output_type": "execute_result" 798 | } 799 | ], 800 | "source": [ 801 | "# treeAggregate(zeroValue, seqOp, combOp, depth=2)\n", 802 | "# 与aggregate不同的地方是:在每个分区,会做两次或者多次combOp,避免将所有局部的值传给driver端.另外,经过测验初始值zeroValue不会参与combOp.\n", 803 | "# depth:树的深度\n", 804 | "add = lambda x, y: x + y\n", 805 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n", 806 | "rdd.treeAggregate(0, add, add, 2)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": 39, 812 | "metadata": {}, 813 | "outputs": [ 814 | { 815 | "data": { 816 | "text/plain": [ 817 | "15" 818 | ] 819 | }, 820 | "execution_count": 39, 821 | "metadata": {}, 822 | "output_type": "execute_result" 823 | } 824 | ], 825 | "source": [ 826 | "# fold(zeroValue, op)\n", 827 | "# partitionBy的简易版,初始一个值,分区内部执行函数和汇总函数为同一个函数\n", 828 | "from operator import add\n", 829 | "sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 40, 835 | "metadata": {}, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/plain": [ 840 | "[('a', 2), ('b', 1)]" 841 | ] 842 | }, 843 | "execution_count": 40, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "# foldByKey(zeroValue, func, numPartitions=None, partitionFunc=)\n", 850 | "# 跟fold逻辑相同,只不过是按照key进行分组\n", 851 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", 852 | "from operator import add\n", 853 | "sorted(rdd.foldByKey(0, add).collect())" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 42, 859 | "metadata": {}, 860 | "outputs": [ 861 | { 862 | "name": "stdout", 863 | "output_type": "stream", 864 | "text": [ 865 | "[('a', 1), ('a', 2), ('b', 1), ('b', 3), ('c', 5), ('c', 6)]\n", 866 | "[('a', [1, 2]), ('b', [1, 3]), ('c', [5, 6])]\n" 867 | ] 868 | } 869 | ], 870 | "source": [ 871 | "# combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions=None, partitionFunc=)\n", 
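# A minimal aggregateByKey sketch for the comment-only cell above, assuming the SparkContext
# `sc` from earlier cells: the zeroValue (0, 0) accumulates (sum, count) per key.
data = sc.parallelize([("a", 1), ("a", 3), ("b", 5)])
seq = lambda acc, v: (acc[0] + v, acc[1] + 1)     # fold one value into the per-key accumulator
comb = lambda a, b: (a[0] + b[0], a[1] + b[1])    # merge accumulators coming from different partitions
print(sorted(data.aggregateByKey((0, 0), seq, comb).collect()))   # [('a', (4, 2)), ('b', (5, 1))]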
872 | "# 将RDD [(K,V)]转换为RDD [(K,C)]类型的结果,通过三个函数进行转换聚合的目的,\n", 873 | "# createcombiner函数 rdd值、类型转换\n", 874 | "# 根据key对值进行合并\n", 875 | "# 将合并列表,将连个c合并成一个\n", 876 | "x=sc.parallelize([('a',1),('a',2),('b',1),('b',3),('c',5),('c',6)])\n", 877 | "def to_list(a):\n", 878 | " return [a]\n", 879 | "def append(a,b):\n", 880 | " a.append(b)\n", 881 | " return a\n", 882 | "def extend(a,b):\n", 883 | " a.extend(b)\n", 884 | " return a\n", 885 | "print(x.collect())\n", 886 | "print(x.combineByKey(to_list,append,extend).collect())" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 43, 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "name": "stdout", 896 | "output_type": "stream", 897 | "text": [ 898 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n", 899 | "[('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n" 900 | ] 901 | } 902 | ], 903 | "source": [ 904 | "# rdd.sortBy(keyfunc, ascending=True, numPartitions=None)\n", 905 | "# 根据key对应的函数进行排序\n", 906 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n", 907 | "print(sc.parallelize(tmp).sortBy(lambda x: x[0]).collect())\n", 908 | "print(sc.parallelize(tmp).sortBy(lambda x: x[1]).collect())" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 45, 914 | "metadata": {}, 915 | "outputs": [ 916 | { 917 | "name": "stdout", 918 | "output_type": "stream", 919 | "text": [ 920 | "('1', 3)\n", 921 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n", 922 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n", 923 | "[('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5), ('little', 4), ('Mary', 1), ('was', 8), ('white', 9), ('whose', 6)]\n" 924 | ] 925 | } 926 | ], 927 | "source": [ 928 | "# sortByKey(ascending=True, numPartitions=None, keyfunc=>)\n", 929 | "# 对此RDD进行排序,假定它由(键,值)对组成\n", 930 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n", 931 | "print(sc.parallelize(tmp).sortByKey().first())\n", 932 | "print(sc.parallelize(tmp).sortByKey(True, 1).collect())\n", 933 | "print(sc.parallelize(tmp).sortByKey(True, 2).collect())\n", 934 | "tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]\n", 935 | "tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])\n", 936 | "print(sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect())" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": 47, 942 | "metadata": {}, 943 | "outputs": [ 944 | { 945 | "name": "stdout", 946 | "output_type": "stream", 947 | "text": [ 948 | "(count: 4, mean: 2.5, stdev: 1.118033988749895, max: 4.0, min: 1.0)\n" 949 | ] 950 | } 951 | ], 952 | "source": [ 953 | "# stats()\n", 954 | "# 计算rdd中全体元素的均值、方差、最大值、最小值和个数的信息\n", 955 | "samp=sc.parallelize([1,2,3,4]).stats()\n", 956 | "print(samp)" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": 48, 962 | "metadata": {}, 963 | "outputs": [ 964 | { 965 | "data": { 966 | "text/plain": [ 967 | "3" 968 | ] 969 | }, 970 | "execution_count": 48, 971 | "metadata": {}, 972 | "output_type": "execute_result" 973 | } 974 | ], 975 | "source": [ 976 | "# rdd.count()\n", 977 | "# 计算rdd所有元素个数\n", 978 | "sc.parallelize([2, 3, 4]).count()" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": 3, 984 | "metadata": {}, 985 | "outputs": [ 986 | { 987 | "data": { 988 | "text/plain": [ 989 | "10000" 990 | ] 991 | }, 992 | "execution_count": 3, 993 | "metadata": {}, 994 | "output_type": "execute_result" 995 | } 996 | ], 997 | "source": [ 998 | "# countApprox(timeout, 
confidence=0.95)\n", 999 | "# 在限定时间内做出有可能的结果,即使任务没有完成\n", 1000 | "rdd = sc.parallelize(range(10000), 10)\n", 1001 | "rdd.countApprox(1000, 1.0)" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": 5, 1007 | "metadata": {}, 1008 | "outputs": [ 1009 | { 1010 | "name": "stdout", 1011 | "output_type": "stream", 1012 | "text": [ 1013 | "1060\n", 1014 | "19\n" 1015 | ] 1016 | } 1017 | ], 1018 | "source": [ 1019 | "# countApproxDistinct(relativeSD=0.05)\n", 1020 | "# 返回RDD中不同值数的近似值\n", 1021 | "# relativeSD 相对准确度。较小的值创建需要更多空间的计数器。它必须大于0.000017。\n", 1022 | "n = sc.parallelize(range(1000)).map(str).countApproxDistinct()\n", 1023 | "print(n)\n", 1024 | "n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()\n", 1025 | "print(n)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 9, 1031 | "metadata": {}, 1032 | "outputs": [ 1033 | { 1034 | "name": "stdout", 1035 | "output_type": "stream", 1036 | "text": [ 1037 | "[('a', 2), ('b', 1)]\n", 1038 | "defaultdict(, {'a': 2, 'b': 1})\n" 1039 | ] 1040 | } 1041 | ], 1042 | "source": [ 1043 | "# countByKey()\n", 1044 | "# 计算每个键的元素数量,并将结果作为字典返回给主数据。\n", 1045 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", 1046 | "print(sorted(rdd.countByKey().items()))\n", 1047 | "print(rdd.countByKey())" 1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": 21, 1053 | "metadata": {}, 1054 | "outputs": [ 1055 | { 1056 | "name": "stdout", 1057 | "output_type": "stream", 1058 | "text": [ 1059 | "[[1, 2], [1, 2, 2]]\n", 1060 | "[(1, 2), (2, 3)]\n" 1061 | ] 1062 | } 1063 | ], 1064 | "source": [ 1065 | "# countByValue()\n", 1066 | "# 将此RDD中每个唯一值的计数返回为(值,计数)对的字典。\n", 1067 | "print(sc.parallelize([1, 2, 1, 2, 2], 2).glom().collect())\n", 1068 | "print(sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items()))" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 13, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "# first() 返回第一个元素\n", 1078 | "# max()返回最大值\n", 1079 | "# take(num) 返回开始num个值\n", 1080 | "# top(num, key=None) 计算rdd所有元素按降序排列后最顶部的几个元素\n", 1081 | "# min() rdd中的最小值\n", 1082 | "# mean() 计算rdd所有元素均值\n", 1083 | "# variance() 方差\n", 1084 | "# stdev() 标准差\n", 1085 | "# sum() 和" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 19, 1091 | "metadata": {}, 1092 | "outputs": [ 1093 | { 1094 | "name": "stdout", 1095 | "output_type": "stream", 1096 | "text": [ 1097 | "([0, 25, 50], [25, 26])\n", 1098 | "([0, 5, 25, 50], [5, 20, 26])\n", 1099 | "([0, 15, 30, 45, 60], [15, 15, 15, 6])\n", 1100 | "(('a', 'b', 'c'), [2, 2])\n" 1101 | ] 1102 | } 1103 | ], 1104 | "source": [ 1105 | "# histogram(buckets)\n", 1106 | "# 对rdd中的元素进行频数统计,统计区间有两种,一种是给出段数,一种是直接给出区间。返回为元组\n", 1107 | "rdd = sc.parallelize(range(51))\n", 1108 | "print(rdd.histogram(2))\n", 1109 | "print(rdd.histogram([0, 5, 25, 50]))\n", 1110 | "print(rdd.histogram([0, 15, 30, 45, 60]))\n", 1111 | "rdd = sc.parallelize([\"ab\", \"ac\", \"b\", \"bd\", \"ef\"])\n", 1112 | "print(rdd.histogram((\"a\", \"b\", \"c\")))" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 20, 1118 | "metadata": {}, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/plain": [ 1123 | "['1', '2', '', '3']" 1124 | ] 1125 | }, 1126 | "execution_count": 20, 1127 | "metadata": {}, 1128 | "output_type": "execute_result" 1129 | } 1130 | ], 1131 | "source": [ 1132 | "# pipe(command, env=None, checkCode=False)\n", 1133 | 
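# A quick sketch exercising the actions listed in the comment-only cell above (first, take,
# top, min, max, mean, stdev, sum), assuming the SparkContext `sc` from earlier cells.
nums = sc.parallelize([3, 1, 4, 1, 5])
print(nums.first())            # 3
print(nums.take(2))            # [3, 1]
print(nums.top(2))             # [5, 4]
print(nums.min(), nums.max())  # 1 5
print(nums.sum())              # 14
print(nums.mean())             # 2.8
print(nums.stdev())            # population standard deviation of the five values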
"# 通过管道向后面环节输出command处理过的结果,具体功能就体现在command,command为linux命令。 \n", 1134 | "# pipe函数中的'cat'为linux命令,表示打印内容。\n", 1135 | "sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 21, 1141 | "metadata": {}, 1142 | "outputs": [ 1143 | { 1144 | "data": { 1145 | "text/plain": [ 1146 | "[2, 4]" 1147 | ] 1148 | }, 1149 | "execution_count": 21, 1150 | "metadata": {}, 1151 | "output_type": "execute_result" 1152 | } 1153 | ], 1154 | "source": [ 1155 | "# filter(f)\n", 1156 | "# 返回满足条件的新RDD\n", 1157 | "rdd = sc.parallelize([1, 2, 3, 4, 5])\n", 1158 | "rdd.filter(lambda x: x % 2 == 0).collect()" 1159 | ] 1160 | }, 1161 | { 1162 | "cell_type": "code", 1163 | "execution_count": 10, 1164 | "metadata": {}, 1165 | "outputs": [ 1166 | { 1167 | "data": { 1168 | "text/plain": [ 1169 | "[1, 2, 3]" 1170 | ] 1171 | }, 1172 | "execution_count": 10, 1173 | "metadata": {}, 1174 | "output_type": "execute_result" 1175 | } 1176 | ], 1177 | "source": [ 1178 | "# distinct(numPartitions=None)\n", 1179 | "# 返回一个没有重复元素的新RDD,就是去重处理\n", 1180 | "sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": 3, 1186 | "metadata": {}, 1187 | "outputs": [ 1188 | { 1189 | "name": "stdout", 1190 | "output_type": "stream", 1191 | "text": [ 1192 | "11\n", 1193 | "20\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "# sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())\n", 1199 | "# 返回此RDD的采样子集\n", 1200 | "# withReplacement:是否重复采样\n", 1201 | "# fraction:样本预期占RDD的大小,每一个元素被取到的概率一样,是一个【0,1】的数\n", 1202 | "# seed 随机模式的种子\n", 1203 | "rdd = sc.parallelize(range(100), 4)\n", 1204 | "print(rdd.sample(False, 0.1, 81).count())\n", 1205 | "print(rdd.sample(False, 0.2, 81).count())" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": 28, 1211 | "metadata": {}, 1212 | "outputs": [ 1213 | { 1214 | "name": "stdout", 1215 | "output_type": "stream", 1216 | "text": [ 1217 | "[('a', 0), ('b', 0), ('a', 1), ('a', 2), ('b', 1), ('b', 2), ('a', 3), ('a', 4), ('a', 5), ('a', 6)]\n", 1218 | "209 98\n" 1219 | ] 1220 | }, 1221 | { 1222 | "ename": "AttributeError", 1223 | "evalue": "'ResultIterable' object has no attribute 'takeSample'", 1224 | "traceback": [ 1225 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1226 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 1227 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtakeSample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1228 | "\u001b[0;31mAttributeError\u001b[0m: 'ResultIterable' object has no attribute 'takeSample'" 1229 | ], 1230 | "output_type": "error" 1231 | } 1232 | ], 1233 | "source": [ 1234 | "# sampleByKey(withReplacement, fractions, seed=None)\n", 1235 | "# 返回按键取样的RDD的子集(通过分层抽样)。用分数指定的不同键的变量采样率来创建这个RDD的样本,这是抽样速率图的关键。\n", 1236 | "# 多个key的fractions 以字典方式传递\n", 1237 | "fractions = {\"a\": 0.2, \"b\": 0.1}\n", 1238 | "rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))\n", 1239 | "sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())\n", 1240 | "print(rdd.take(10))\n", 1241 | "print(len(sample['a']), len(sample['b']))\n", 1242 | "print(sorted(sample['a'])[:10])" 1243 | ] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "execution_count": 26, 1248 | "metadata": {}, 1249 | "outputs": [ 1250 | { 1251 | "data": { 1252 | "text/plain": [ 1253 | "1.0" 1254 | ] 1255 | }, 1256 | "execution_count": 26, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "# sampleStdev()\n", 1263 | "# 计算这个RDD元素的样本标准差(通过除以N-1而不是N)来修正估计标准差的偏差。\n", 1264 | "sc.parallelize([1, 2, 3]).sampleStdev()" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 27, 1270 | "metadata": {}, 1271 | "outputs": [ 1272 | { 1273 | "data": { 1274 | "text/plain": [ 1275 | "1.0" 1276 | ] 1277 | }, 1278 | "execution_count": 27, 1279 | "metadata": {}, 1280 | "output_type": "execute_result" 1281 | } 1282 | ], 1283 | "source": [ 1284 | "# sampleVariance()\n", 1285 | "# 计算这个RDD元素的样本方差(它纠正了通过除以N-1而不是N来估计方差的偏差)。\n", 1286 | "sc.parallelize([1, 2, 3]).sampleVariance()" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "code", 1291 | "execution_count": 31, 1292 | "metadata": {}, 1293 | "outputs": [ 1294 | { 1295 | "name": "stdout", 1296 | "output_type": "stream", 1297 | "text": [ 1298 | "[6, 9, 9, 8, 0, 7, 0, 8, 3, 6, 7, 8]\n", 1299 | "5\n" 1300 | ] 1301 | } 1302 | ], 1303 | "source": [ 1304 | "# takeSample(withReplacement, num, seed=None)\n", 1305 | "# 返回这个RDD的一个固定大小的采样子集。\n", 1306 | "# 只有当结果数组被认为是很小的时候,才应该使用这个方法,因为所有的数据都被加载到驱动程序的内存中。\n", 1307 | "rdd = sc.parallelize(range(0, 10))\n", 1308 | "print(rdd.takeSample(True, 12, 1))\n", 1309 | "print(len(rdd.takeSample(False, 5, 2)))" 1310 | ] 1311 | }, 1312 | { 1313 | "cell_type": "code", 1314 | "execution_count": 32, 1315 | "metadata": {}, 1316 | "outputs": [ 1317 | { 1318 | "data": { 1319 | "text/plain": [ 1320 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]" 1321 | ] 1322 | }, 1323 | "execution_count": 32, 1324 | "metadata": {}, 1325 | "output_type": "execute_result" 1326 | } 1327 | ], 1328 | "source": [ 1329 | "# toLocalIterator()\n", 1330 | "# 返回包含这个RDD中所有元素的迭代器。迭代器将消耗与此RDD中最大分区相同的内存。\n", 1331 | "rdd = sc.parallelize(range(10))\n", 1332 | "[x for x in rdd.toLocalIterator()]" 1333 | ] 1334 | }, 1335 | { 1336 | 
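# The sampleByKey cell above fails because groupByKey() returns ResultIterable objects, which
# are plain Python iterables rather than RDDs and therefore have no takeSample(); one possible
# fix is to materialise the grouped values and sample them with the standard library (a sketch
# reusing the `sample` dict from that cell).
import random
b_values = list(sample['b'])          # the grouped values for key 'b' as an ordinary list
print(random.sample(b_values, 6))     # 6 values drawn without replacement
print(sorted(sample['a'])[:10])       # the last line the failing cell intended to run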
"cell_type": "code", 1337 | "execution_count": 39, 1338 | "metadata": {}, 1339 | "outputs": [ 1340 | { 1341 | "data": { 1342 | "text/plain": [ 1343 | "[1, 1, 2, 3, 1, 1, 2, 3]" 1344 | ] 1345 | }, 1346 | "execution_count": 39, 1347 | "metadata": {}, 1348 | "output_type": "execute_result" 1349 | } 1350 | ], 1351 | "source": [ 1352 | "# union(other)\n", 1353 | "# 返回这个RDD和另一个的结合。不去重\n", 1354 | "rdd = sc.parallelize([1, 1, 2, 3])\n", 1355 | "rdd.union(rdd).collect()" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "code", 1360 | "execution_count": 40, 1361 | "metadata": {}, 1362 | "outputs": [ 1363 | { 1364 | "data": { 1365 | "text/plain": [ 1366 | "[2, 1, 3]" 1367 | ] 1368 | }, 1369 | "execution_count": 40, 1370 | "metadata": {}, 1371 | "output_type": "execute_result" 1372 | } 1373 | ], 1374 | "source": [ 1375 | "# intersection(other)\n", 1376 | "# 返回这个RDD和另一个的交集。即使输入RDDs完成了,输出也不会包含任何重复的元素。\n", 1377 | "# 该方法在内部执行洗牌。\n", 1378 | "rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])\n", 1379 | "rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])\n", 1380 | "rdd1.intersection(rdd2).collect()" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": 41, 1386 | "metadata": {}, 1387 | "outputs": [ 1388 | { 1389 | "data": { 1390 | "text/plain": [ 1391 | "[('a', 1), ('b', 4), ('b', 5)]" 1392 | ] 1393 | }, 1394 | "execution_count": 41, 1395 | "metadata": {}, 1396 | "output_type": "execute_result" 1397 | } 1398 | ], 1399 | "source": [ 1400 | "# subtract(other, numPartitions=None)\n", 1401 | "# 返回自己有其他没有的元素的值\n", 1402 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 3)])\n", 1403 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n", 1404 | "sorted(x.subtract(y).collect())" 1405 | ] 1406 | }, 1407 | { 1408 | "cell_type": "code", 1409 | "execution_count": 42, 1410 | "metadata": {}, 1411 | "outputs": [ 1412 | { 1413 | "data": { 1414 | "text/plain": [ 1415 | "[('b', 4), ('b', 5)]" 1416 | ] 1417 | }, 1418 | "execution_count": 42, 1419 | "metadata": {}, 1420 | "output_type": "execute_result" 1421 | } 1422 | ], 1423 | "source": [ 1424 | "# subtractByKey(other, numPartitions=None)\n", 1425 | "# 返回每一个(键,值)对,在另一个没有成对的匹配键。\n", 1426 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 2)])\n", 1427 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n", 1428 | "sorted(x.subtractByKey(y).collect())" 1429 | ] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "execution_count": 43, 1434 | "metadata": {}, 1435 | "outputs": [ 1436 | { 1437 | "data": { 1438 | "text/plain": [ 1439 | "[(1, 1), (1, 2), (2, 1), (2, 2)]" 1440 | ] 1441 | }, 1442 | "execution_count": 43, 1443 | "metadata": {}, 1444 | "output_type": "execute_result" 1445 | } 1446 | ], 1447 | "source": [ 1448 | "# cartesian(other)\n", 1449 | "# 返回这个RDD和另一个RDD的笛卡尔积,也就是所有成对的元素(a,b)的RDD,a为本身RDD,b为其他RDD\n", 1450 | "rdd = sc.parallelize([1, 2])\n", 1451 | "sorted(rdd.cartesian(rdd).collect())" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": 44, 1457 | "metadata": {}, 1458 | "outputs": [ 1459 | { 1460 | "data": { 1461 | "text/plain": [ 1462 | "[('a', (1, 2)), ('a', (1, 3))]" 1463 | ] 1464 | }, 1465 | "execution_count": 44, 1466 | "metadata": {}, 1467 | "output_type": "execute_result" 1468 | } 1469 | ], 1470 | "source": [ 1471 | "# join(other, numPartitions=None)\n", 1472 | "# 返回一个包含所有成对元素的RDD,其中包含在self和other中匹配的键。每一对元素都将作为一个(k,(v1,v2))返回,其中(k,v1)为self(k,v2)为other。\n", 1473 | "# 在集群中执行散列连接\n", 1474 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 1475 | "y = sc.parallelize([(\"a\", 2), (\"a\", 
3)])\n", 1476 | "sorted(x.join(y).collect())" 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "code", 1481 | "execution_count": 45, 1482 | "metadata": {}, 1483 | "outputs": [ 1484 | { 1485 | "data": { 1486 | "text/plain": [ 1487 | "[('a', (2, 1)), ('b', (None, 4))]" 1488 | ] 1489 | }, 1490 | "execution_count": 45, 1491 | "metadata": {}, 1492 | "output_type": "execute_result" 1493 | } 1494 | ], 1495 | "source": [ 1496 | "# rightOuterJoin(other, numPartitions=None)\n", 1497 | "# 对于在otherRDD中的每一个(k, w)元素,生成的RDD中有k键的生成(k, (v, w)), 如果没有k键的话也要生成none补位(k,(None, w))\n", 1498 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 1499 | "y = sc.parallelize([(\"a\", 2)])\n", 1500 | "sorted(y.rightOuterJoin(x).collect())" 1501 | ] 1502 | }, 1503 | { 1504 | "cell_type": "code", 1505 | "execution_count": 46, 1506 | "metadata": {}, 1507 | "outputs": [ 1508 | { 1509 | "data": { 1510 | "text/plain": [ 1511 | "[('a', (1, 2)), ('b', (4, None))]" 1512 | ] 1513 | }, 1514 | "execution_count": 46, 1515 | "metadata": {}, 1516 | "output_type": "execute_result" 1517 | } 1518 | ], 1519 | "source": [ 1520 | "# leftOuterJoin(other, numPartitions=None)\n", 1521 | "# 就是用第二个rdd的key去第一个rdd中寻找,在value组合的时候还是第一个rdd的值在前,第二个rdd的值在后。其他与leftOuterJoin完全一样。\n", 1522 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 1523 | "y = sc.parallelize([(\"a\", 2)])\n", 1524 | "sorted(x.leftOuterJoin(y).collect())" 1525 | ] 1526 | }, 1527 | { 1528 | "cell_type": "code", 1529 | "execution_count": 48, 1530 | "metadata": {}, 1531 | "outputs": [ 1532 | { 1533 | "name": "stdout", 1534 | "output_type": "stream", 1535 | "text": [ 1536 | "500\n", 1537 | "192 308\n" 1538 | ] 1539 | } 1540 | ], 1541 | "source": [ 1542 | "# randomSplit(weights, seed=None)\n", 1543 | "# 将RDD按照一定的比例随机分开\n", 1544 | "rdd = sc.parallelize(range(500), 1)\n", 1545 | "rdd1, rdd2 = rdd.randomSplit([2, 3], 17)\n", 1546 | "print(len(rdd1.collect() + rdd2.collect()))\n", 1547 | "print(rdd1.count(), rdd2.count())" 1548 | ] 1549 | }, 1550 | { 1551 | "cell_type": "code", 1552 | "execution_count": null, 1553 | "metadata": {}, 1554 | "outputs": [], 1555 | "source": [] 1556 | } 1557 | ], 1558 | "metadata": { 1559 | "kernelspec": { 1560 | "display_name": "Python 3", 1561 | "language": "python", 1562 | "name": "python3" 1563 | }, 1564 | "language_info": { 1565 | "codemirror_mode": { 1566 | "name": "ipython", 1567 | "version": 3 1568 | }, 1569 | "file_extension": ".py", 1570 | "mimetype": "text/x-python", 1571 | "name": "python", 1572 | "nbconvert_exporter": "python", 1573 | "pygments_lexer": "ipython3", 1574 | "version": "3.6.4" 1575 | } 1576 | }, 1577 | "nbformat": 4, 1578 | "nbformat_minor": 2 1579 | } 1580 | -------------------------------------------------------------------------------- /pyspark.ml.classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 数据准备" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark import SparkConf, SparkContext\n", 18 | "spark = SparkSession.builder.master('local[1]').appName('learn_ml').getOrCreate()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, 
encoding='utf-8')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "scrolled": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "23" 41 | ] 42 | }, 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "len(df0.columns)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "看看分类的类别" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "**查看是否有na值**" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "False" 75 | ] 76 | }, 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "# df0.toPandas().isna().sum()\n", 84 | "df0.toPandas().isna().values.any()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "metadata": { 91 | "scrolled": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", 96 | "old_columns_names = df0.columns\n", 97 | "new_columns_names = [name+'-new' for name in old_columns_names]\n", 98 | "for i in range(len(old_columns_names)):\n", 99 | " indexer = StringIndexer(inputCol=old_columns_names[i], outputCol=new_columns_names[i])\n", 100 | " df0 = indexer.fit(df0).transform(df0)\n", 101 | "vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')\n", 102 | "df0 = vecAss.transform(df0)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": { 109 | "scrolled": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "df0 = df0.withColumnRenamed(new_columns_names[0], 'label')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "dfi = df0.select(['label', 'features'])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 8, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# df0.describe().toPandas().T" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "+-----+------------------------------------------------------------------------------+\n", 144 | "|label|features |\n", 145 | "+-----+------------------------------------------------------------------------------+\n", 146 | "|1.0 |(22,[1,3,4,7,8,9,10,19,20,21],[1.0,1.0,6.0,1.0,7.0,1.0,2.0,2.0,2.0,4.0]) |\n", 147 | "|0.0 |(22,[1,2,3,4,8,9,10,19,20,21],[1.0,3.0,1.0,4.0,7.0,1.0,3.0,1.0,3.0,1.0]) |\n", 148 | "|0.0 |(22,[0,1,2,3,4,8,9,10,19,20,21],[3.0,1.0,4.0,1.0,5.0,3.0,1.0,3.0,1.0,3.0,5.0])|\n", 149 | "|1.0 |(22,[2,3,4,7,8,9,10,19,20,21],[4.0,1.0,6.0,1.0,3.0,1.0,2.0,2.0,2.0,4.0]) |\n", 150 | "|0.0 |(22,[1,2,6,8,10,18,19,20,21],[1.0,1.0,1.0,7.0,2.0,1.0,1.0,4.0,1.0]) |\n", 151 | "+-----+------------------------------------------------------------------------------+\n", 152 | "only showing top 5 rows\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "dfi.show(5, truncate=0)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "# label = df0.rdd.map(lambda row: row[0])\n", 168 
| "# row = df0.rdd.map(lambda row: row[1:])\n", 169 | "# dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','feature'])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "train_data, test_data = dfi.randomSplit([4.0, 1.0], 100)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": { 185 | "scrolled": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# test_data.filter(test_data['label']==1).show(5, truncate=0)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### 评估器\n", 197 | "**分类(classification)**" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### LogisticRegression :逻辑回归,支持多项逻辑(softmax)和二项逻辑回归" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "pyspark.ml.classification.LogisticRegression(self, featuresCol=\"features\", labelCol=\"label\", predictionCol=\"prediction\", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol=\"probability\", rawPredictionCol=\"rawPrediction\", standardization=True, weightCol=None, aggregationDepth=2, family=\"auto\")\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "`\n", 219 | "regParam: 正则化参数(>=0)\n", 220 | "elasticNetParam: ElasticNet混合参数,0-1之间,当alpha为0时,惩罚为L2正则化,当为1时为L1正则化\n", 221 | "fitIntercept: 是否拟合一个截距项\n", 222 | "Standardization: 是否在拟合数据之前对数据进行标准化\n", 223 | "aggregationDepth: 树聚合所建议的深度(>=2)\n", 224 | "`" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 20, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from pyspark.ml.classification import LogisticRegression\n", 234 | "blor = LogisticRegression(regParam=0.01)\n", 235 | "blorModel = blor.fit(train_data)\n", 236 | "result = blorModel.transform(test_data)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 21, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "0.9661954517516902" 248 | ] 249 | }, 250 | "execution_count": 21, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "result.filter(result.label == result.prediction).count()/result.count()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 22, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "+--------------------+--------------------+\n", 271 | "| FPR| TPR|\n", 272 | "+--------------------+--------------------+\n", 273 | "| 0.0| 0.0|\n", 274 | "| 0.0|0.020466901183242726|\n", 275 | "| 0.0| 0.04093380236648545|\n", 276 | "|5.934718100890207E-4|0.060761112887751836|\n", 277 | "|0.001186943620178...| 0.08058842340901823|\n", 278 | "|0.001483679525222552| 0.10073552926127279|\n", 279 | "|0.001780415430267...| 0.12088263511352734|\n", 280 | "|0.002373887240356083| 0.14070994563479372|\n", 281 | "|0.002670623145400...| 0.1608570514870483|\n", 282 | "|0.002670623145400...| 0.18132395267029103|\n", 283 | "|0.002670623145400...| 0.20179085385353374|\n", 284 | "|0.002670623145400...| 0.22225775503677647|\n", 285 | "|0.002670623145400...| 0.24272465622001918|\n", 286 | 
"|0.002670623145400...| 0.2631915574032619|\n", 287 | "|0.002670623145400...| 0.2836584585865046|\n", 288 | "|0.002670623145400...| 0.30412535976974736|\n", 289 | "|0.002670623145400...| 0.3245922609529901|\n", 290 | "|0.002670623145400...| 0.34505916213623283|\n", 291 | "|0.002670623145400...| 0.3655260633194755|\n", 292 | "|0.002670623145400...| 0.38599296450271825|\n", 293 | "+--------------------+--------------------+\n", 294 | "only showing top 20 rows\n", 295 | "\n", 296 | "+--------------------+------------------+\n", 297 | "| recall| precision|\n", 298 | "+--------------------+------------------+\n", 299 | "| 0.0| 1.0|\n", 300 | "|0.020466901183242726| 1.0|\n", 301 | "| 0.04093380236648545| 1.0|\n", 302 | "|0.060761112887751836|0.9895833333333334|\n", 303 | "| 0.08058842340901823| 0.984375|\n", 304 | "| 0.10073552926127279| 0.984375|\n", 305 | "| 0.12088263511352734| 0.984375|\n", 306 | "| 0.14070994563479372|0.9821428571428571|\n", 307 | "| 0.1608570514870483| 0.982421875|\n", 308 | "| 0.18132395267029103| 0.984375|\n", 309 | "| 0.20179085385353374| 0.9859375|\n", 310 | "| 0.22225775503677647|0.9872159090909091|\n", 311 | "| 0.24272465622001918| 0.98828125|\n", 312 | "| 0.2631915574032619|0.9891826923076923|\n", 313 | "| 0.2836584585865046|0.9899553571428571|\n", 314 | "| 0.30412535976974736| 0.990625|\n", 315 | "| 0.3245922609529901| 0.9912109375|\n", 316 | "| 0.34505916213623283|0.9917279411764706|\n", 317 | "| 0.3655260633194755| 0.9921875|\n", 318 | "| 0.38599296450271825|0.9925986842105263|\n", 319 | "+--------------------+------------------+\n", 320 | "only showing top 20 rows\n", 321 | "\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "blorModel.\n", 327 | "blorModel.summary.pr.show()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "### 决策树\n", 335 | "pyspark.ml.classification.DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', seed=None)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "`\n", 343 | "checkpointInterval:设置checkpoint区间(>=1),或宕掉checkpoint(-1),例如10意味着缓冲区(cache)将会每迭代10次获得一次checkpoint\n", 344 | "fit(datasset,params=None)\n", 345 | "impurity: 信息增益计算的准则,选项\"entropy\", \"gini\"\n", 346 | "maxBins:连续特征离散化的最大分箱,必须>=2 并且>=分类特征分类的数量\n", 347 | "maxDepth:树的最大深度\n", 348 | "minInfoGain:分割结点所需的最小的信息增益\n", 349 | "minInstancesPerNode:每个结点最小实例个数\n", 350 | "`" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 13, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "from pyspark.ml.classification import DecisionTreeClassifier\n", 360 | "dt = DecisionTreeClassifier(maxDepth=5)\n", 361 | "dtModel = dt.fit(train_data)\n", 362 | "result = dtModel.transform(test_data)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 14, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "1.0" 374 | ] 375 | }, 376 | "execution_count": 14, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# accuracy\n", 383 | "result.filter(result.label == result.prediction).count()/result.count()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "### 
梯度增强树\n", 391 | "pyspark.ml.classification.GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType='logistic', maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "`\n", 399 | "checkpointInterval: 同DecisionTreeClassifier\n", 400 | "fit(dataset,params=None)方法\n", 401 | "lossType: GBT要最小化的损失函数,选项:logistic\n", 402 | "maxBins: 同DecisionTreeClassifier\n", 403 | "maxDepth: 同DecisionTreeClassifier\n", 404 | "maxIter: 同DecisionTreeClassifier\n", 405 | "minInfoGain: 同DecisionTreeClassifier\n", 406 | "minInstancesPerNode:同DecisionTreeClassifier\n", 407 | "stepSize: 每次迭代优化的步长\n", 408 | "subsamplingRate: 同RandomForesetClassier\n", 409 | "`" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 16, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from pyspark.ml.classification import GBTClassifier\n", 419 | "gbt = GBTClassifier(maxDepth=5)\n", 420 | "gbtModel = gbt.fit(train_data)\n", 421 | "result = gbtModel.transform(test_data)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 17, 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "1.0" 433 | ] 434 | }, 435 | "execution_count": 17, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "result.filter(result.label == result.prediction).count()/result.count()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "### 随机森林\n", 449 | "pyspark.ml.classification.RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', numTrees=20, featureSubsetStrategy='auto', seed=None, subsamplingRate=1.0)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "`\n", 457 | "checkpoint:同DecisionTreeClassifier\n", 458 | "featureSubsetStrategy:每棵树上要分割的特征数目,选项为\"auto\",\"all\", \"onethird\", \"sqrt\", \"log2\", \"(0.0-1.0],\"[1-n]\"\n", 459 | "fit(dataset,params=None)方法\n", 460 | "impurity: 同DecisionTreeClassifier\n", 461 | "maxBins:同DecisionTreeClassifier\n", 462 | "maxDepth:同DecisionTreeClassifier\n", 463 | "minInfoGain: 同DecisionTreeClassifier\n", 464 | "numTrees: 训练树的个数\n", 465 | "subsamplingRate: 用于训练每颗决策树的样本个数,区间(0,1]\n", 466 | "`" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 13, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "from pyspark.ml.classification import RandomForestClassifier\n", 476 | "rf = RandomForestClassifier(numTrees=10, maxDepth=5)\n", 477 | "rfModel = rf.fit(train_data)\n", 478 | "# model.featureImportances\n", 479 | "result = rfModel.transform(test_data)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 19, 485 | "metadata": { 486 | "scrolled": true 487 | }, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "1.0" 493 | ] 494 | }, 495 | "execution_count": 19, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "result.filter(result.label == 
result.prediction).count()/result.count()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "### 朴素贝叶斯\n", 509 | "pyspark.ml.classification.NaiveBayes(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', smoothing=1.0, modelType='multinomial', thresholds=None, weightCol=None)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "`\n", 517 | "modelType: 选项:multinomial(多项式)和bernoulli(伯努利)\n", 518 | "smoothing: 平滑参数,应该>=0,默认为1.0\n", 519 | "`" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 24, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "from pyspark.ml.classification import NaiveBayes\n", 529 | "nb = NaiveBayes()\n", 530 | "nbModel = nb.fit(train_data)\n", 531 | "result = nbModel.transform(test_data)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 25, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "0.9231714812538414" 543 | ] 544 | }, 545 | "execution_count": 25, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | } 549 | ], 550 | "source": [ 551 | "result.filter(result.label == result.prediction).count()/result.count()" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### LinearSVC 支持向量机\n", 559 | "pyspark.ml.classification.LinearSVC(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, tol=1e-06, rawPredictionCol='rawPrediction', fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 17, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "from pyspark.ml.classification import LinearSVC\n", 569 | "svm = LinearSVC(maxIter=10, regParam=0.01)\n", 570 | "svmModel = svm.fit(train_data)\n", 571 | "result = svmModel.transform(test_data)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 18, 577 | "metadata": { 578 | "scrolled": true 579 | }, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "0.9797172710510141" 585 | ] 586 | }, 587 | "execution_count": 18, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "# accuracy\n", 594 | "result.filter(result.label == result.prediction).count()/result.count()" 595 | ] 596 | } 597 | ], 598 | "metadata": { 599 | "kernelspec": { 600 | "display_name": "Python 3", 601 | "language": "python", 602 | "name": "python3" 603 | }, 604 | "language_info": { 605 | "codemirror_mode": { 606 | "name": "ipython", 607 | "version": 3 608 | }, 609 | "file_extension": ".py", 610 | "mimetype": "text/x-python", 611 | "name": "python", 612 | "nbconvert_exporter": "python", 613 | "pygments_lexer": "ipython3", 614 | "version": "3.6.4" 615 | } 616 | }, 617 | "nbformat": 4, 618 | "nbformat_minor": 2 619 | } 620 | -------------------------------------------------------------------------------- /pyspark.ml.feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession \n", 10 | "spark = 
SparkSession.builder.appName('learn_ml').master('local[1]').getOrCreate()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "ml 模块 三个抽象类:\n", 18 | "转换器(Transformer)、评估器(Estimator)和管道(Pipeline)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### pyspark.ml.feature.Binarizer(self, threshold=0.0, inputCol=None, outputCol=None)\n", 26 | "根据指定的阈值将连续变量转换为对应的二进制" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "df = spark.createDataFrame([(0.5,),(1.0,),(1.5,)], ['values'])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "+------+\n", 48 | "|values|\n", 49 | "+------+\n", 50 | "| 0.5|\n", 51 | "| 1.0|\n", 52 | "| 1.5|\n", 53 | "+------+\n", 54 | "\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "df.show()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "+------+--------+\n", 72 | "|values|features|\n", 73 | "+------+--------+\n", 74 | "| 0.5| 0.0|\n", 75 | "| 1.0| 1.0|\n", 76 | "| 1.5| 1.0|\n", 77 | "+------+--------+\n", 78 | "\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "from pyspark.ml.feature import Binarizer\n", 84 | "binarizer = Binarizer(threshold=0.7, inputCol=\"values\", outputCol=\"features\")\n", 85 | "binarizer.transform(df).show()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "+------+-----+\n", 98 | "|values|freqs|\n", 99 | "+------+-----+\n", 100 | "| 0.5| 0.0|\n", 101 | "| 1.0| 1.0|\n", 102 | "| 1.5| 1.0|\n", 103 | "+------+-----+\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# 通过setParams,更改配置\n", 110 | "binarizer.setParams(outputCol=\"freqs\").transform(df).show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "+------+------+\n", 123 | "|values|vector|\n", 124 | "+------+------+\n", 125 | "| 0.5| 1.0|\n", 126 | "| 1.0| 1.0|\n", 127 | "| 1.5| 1.0|\n", 128 | "+------+------+\n", 129 | "\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "# 通过params更改配置\n", 135 | "params = {binarizer.threshold: -0.5, binarizer.outputCol: \"vector\"}\n", 136 | "binarizer.transform(df, params).show()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# 保存配置\n", 146 | "import os\n", 147 | "#temp_path = os.getcwd()\n", 148 | "temp_path = os.path.abspath('.')\n", 149 | "binarizerPath = \"file://{}/binarizer\".format(temp_path)\n", 150 | "binarizer.save(binarizerPath)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "True" 162 | ] 163 | }, 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "# 加载配置\n", 171 | "loadedBinarizer = Binarizer.load(binarizerPath)\n", 172 | "loadedBinarizer.getThreshold() == 
binarizer.getThreshold()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### pyspark.ml.feature.Bucketizer(self, splits=None, inputCol=None, outputCol=None, handleInvalid=\"error\")\n", 180 | "与Binarizer类似,该方法根据阈值列表(分割的参数),将连续变量转换为多项值(连续变量离散化到指定的范围区间)\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "+------+-------+\n", 193 | "|values|buckets|\n", 194 | "+------+-------+\n", 195 | "| 0.1| 0.0|\n", 196 | "| 0.4| 0.0|\n", 197 | "| 1.2| 1.0|\n", 198 | "| 1.5| 2.0|\n", 199 | "| NaN| 3.0|\n", 200 | "| NaN| 3.0|\n", 201 | "+------+-------+\n", 202 | "\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "from pyspark.ml.feature import Bucketizer\n", 208 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n", 209 | "df = spark.createDataFrame(values, [\"values\"])\n", 210 | "# splits 为分类区间\n", 211 | "bucketizer = Bucketizer(splits=[-float(\"inf\"), 0.5, 1.4, float(\"inf\")],inputCol=\"values\", outputCol=\"buckets\")\n", 212 | "# 这里setHandleInvalid是对nan值进行处理,默认是error:有nan则报错;keep:将nan保留为新分类;skip:忽略nan值\n", 213 | "bucketed = bucketizer.setHandleInvalid(\"keep\").transform(df)\n", 214 | "bucketed.show()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 10, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "+------+---+\n", 227 | "|values| b|\n", 228 | "+------+---+\n", 229 | "| 0.1|0.0|\n", 230 | "| 0.4|0.0|\n", 231 | "| 1.2|1.0|\n", 232 | "| 1.5|2.0|\n", 233 | "| NaN|3.0|\n", 234 | "| NaN|3.0|\n", 235 | "+------+---+\n", 236 | "\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# 更改配置\n", 242 | "bucketizer.setParams(outputCol=\"b\").transform(df).show()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "### pyspark.ml.feature.ChiSqSelector(self, numTopFeatures=50, featuresCol=\"features\", outputCol=None, labelCol=\"label\", selectorType=\"numTopFeatures\", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)\n", 250 | "对于分类目标变量(思考分类模型),此功能允许你选择预定义数量的特征(由numTopFeatures参数进行参数化),以便最好地说明目标的变化。该方法需要两部:需要.fit()——可以计算卡方检验,调用.fit()方法,将DataFrame作为参数传入返回一个ChiSqSelectorModel对象,然后可以使用该对象的.transform()方法来转换DataFrame。默认情况下,选择方法是numTopFeatures,默认顶级要素数设置为50。\n", 251 | "percentile 相识于num ,选取百分比的特征\n", 252 | "fpr 选择p-values低于阈值的所有特征,从而控制误差的选择率。\n", 253 | "fdr 使用 Benjamini-Hochberg procedure \n", 254 | "fwe 选择p-values低于阈值的所有特征。阈值按1 / numFeatures缩放" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 11, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "+------------------+-----+----------------+\n", 267 | "| features|label|selectedFeatures|\n", 268 | "+------------------+-----+----------------+\n", 269 | "|[0.0,0.0,18.0,1.0]| 1.0| [18.0,1.0]|\n", 270 | "|[0.0,1.0,12.0,0.0]| 0.0| [12.0,0.0]|\n", 271 | "|[1.0,0.0,15.0,0.1]| 0.0| [15.0,0.1]|\n", 272 | "+------------------+-----+----------------+\n", 273 | "\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "from pyspark.ml.linalg import Vectors\n", 279 | "from pyspark.ml.feature import ChiSqSelector\n", 280 | "df = spark.createDataFrame(\n", 281 | "[(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),\n", 282 | "(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),\n", 283 | 
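补充示例(草稿):上面介绍了 numTopFeatures 之外的几种选择方式,这里给出按百分比选择特征的写法,假设沿用本节构造的 df(features、label 两列),percentile=0.5 只是演示用的假设取值。

```python
from pyspark.ml.feature import ChiSqSelector

# selectorType='percentile' 按比例选择特征,这里假设选取 50% 的特征
pct_selector = ChiSqSelector(selectorType='percentile', percentile=0.5,
                             featuresCol='features', labelCol='label',
                             outputCol='selectedFeatures')
pct_selector.fit(df).transform(df).show()
```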
"(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],\n", 284 | "[\"features\", \"label\"])\n", 285 | "selector = ChiSqSelector(numTopFeatures=2, outputCol=\"selectedFeatures\")\n", 286 | "model = selector.fit(df)\n", 287 | "model.transform(df).show()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### pyspark.ml.feature.CountVectorizer(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, outputCol=None)\n", 295 | "从文档集合中提取词汇表并生成向量" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 12, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "+-----+---------------+-------------------------+\n", 308 | "|label|raw |vectors |\n", 309 | "+-----+---------------+-------------------------+\n", 310 | "|0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|\n", 311 | "|1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|\n", 312 | "+-----+---------------+-------------------------+\n", 313 | "\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "from pyspark.ml.feature import CountVectorizer\n", 319 | "df = spark.createDataFrame(\n", 320 | "[(0, [\"a\", \"b\", \"c\"]), (1, [\"a\", \"b\", \"b\", \"c\", \"a\"])],\n", 321 | "[\"label\", \"raw\"])\n", 322 | "cv = CountVectorizer(inputCol=\"raw\", outputCol=\"vectors\")\n", 323 | "model = cv.fit(df)\n", 324 | "model.transform(df).show(truncate=False)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 13, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "['a', 'b', 'c']" 336 | ] 337 | }, 338 | "execution_count": 13, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "sorted(model.vocabulary) " 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 14, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# 保存model\n", 354 | "import os\n", 355 | "#temp_path = os.getcwd()\n", 356 | "temp_path = os.path.abspath('.')\n", 357 | "modelPath = \"file://{}/count-vectorizer-model\".format(temp_path)\n", 358 | "model.save(modelPath)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "True" 370 | ] 371 | }, 372 | "execution_count": 15, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "# 加载model\n", 379 | "from pyspark.ml.feature import CountVectorizerModel\n", 380 | "loadedModel = CountVectorizerModel.load(modelPath)\n", 381 | "loadedModel.vocabulary == model.vocabulary" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "### pyspark.ml.feature.ElementwiseProduct(scalingVec=None, inputCol=None, outputCol=None)\n", 389 | "使用提供的“权重”向量输出每个输入向量的阿达马乘积(即,逐元素乘积)。换句话说,它通过标量乘数缩放数据集的每一列。" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 16, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "+-------------+-------------+\n", 402 | "| values| eprod|\n", 403 | "+-------------+-------------+\n", 404 | "|[2.0,1.0,3.0]|[2.0,2.0,9.0]|\n", 405 | "+-------------+-------------+\n", 406 | "\n", 407 | "+-------------+--------------+\n", 408 | "| values| eprod|\n", 409 | "+-------------+--------------+\n", 410 | "|[2.0,1.0,3.0]|[4.0,3.0,15.0]|\n", 411 | 
"+-------------+--------------+\n", 412 | "\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "from pyspark.ml.feature import ElementwiseProduct \n", 418 | "from pyspark.ml.linalg import Vectors\n", 419 | "df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], [\"values\"])\n", 420 | "ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),\n", 421 | "inputCol=\"values\", outputCol=\"eprod\")\n", 422 | "ep.transform(df).show()\n", 423 | "ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).show()\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### pyspark.ml.feature.Imputer(*args, **kwargs)\n", 431 | "用于完成缺失值的插补估计器,使用缺失值所在列的平均值或中值。 输入列应该是DoubleType或FloatType。 目前的Imputer不支持分类特征,可能会为分类特征创建不正确的值。\n", 432 | "请注意,平均值/中值是在过滤出缺失值之后计算的。 输入列中的所有Null值都被视为缺失,所以也被归类。 为了计算中位数,使用pyspark.sql.DataFrame.approxQuantile(),相对误差为0.001。\n" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 17, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "+---+---+\n", 445 | "| a| b|\n", 446 | "+---+---+\n", 447 | "|1.0|NaN|\n", 448 | "|2.0|NaN|\n", 449 | "|NaN|3.0|\n", 450 | "|4.0|4.0|\n", 451 | "|5.0|5.0|\n", 452 | "+---+---+\n", 453 | "\n", 454 | "+---+---+\n", 455 | "| a| b|\n", 456 | "+---+---+\n", 457 | "|3.0|4.0|\n", 458 | "+---+---+\n", 459 | "\n", 460 | "+---+---+-----+-----+\n", 461 | "| a| b|out_a|out_b|\n", 462 | "+---+---+-----+-----+\n", 463 | "|1.0|NaN| 1.0| 4.0|\n", 464 | "|2.0|NaN| 2.0| 4.0|\n", 465 | "|NaN|3.0| 3.0| 3.0|\n", 466 | "|4.0|4.0| 4.0| 4.0|\n", 467 | "|5.0|5.0| 5.0| 5.0|\n", 468 | "+---+---+-----+-----+\n", 469 | "\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "from pyspark.ml.feature import Imputer\n", 475 | "df = spark.createDataFrame([(1.0, float(\"nan\")), (2.0, float(\"nan\")), (float(\"nan\"), 3.0),\n", 476 | " (4.0, 4.0), (5.0, 5.0)], [\"a\", \"b\"])\n", 477 | "imputer = Imputer(inputCols=[\"a\", \"b\"], outputCols=[\"out_a\", \"out_b\"])\n", 478 | "model = imputer.fit(df)\n", 479 | "df.show()\n", 480 | "model.surrogateDF.show()\n", 481 | "model.transform(df).show()" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 18, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "+---+---+-----+-----+\n", 494 | "| a| b|out_a|out_b|\n", 495 | "+---+---+-----+-----+\n", 496 | "|1.0|NaN| 1.0| 4.0|\n", 497 | "|2.0|NaN| 2.0| 4.0|\n", 498 | "|NaN|3.0| 2.0| 3.0|\n", 499 | "|4.0|4.0| 4.0| 4.0|\n", 500 | "|5.0|5.0| 5.0| 5.0|\n", 501 | "+---+---+-----+-----+\n", 502 | "\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "imputer.setStrategy(\"median\").setMissingValue(float(\"nan\")).fit(df).transform(df).show()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### pyspark.ml.feature.MaxAbsScaler(self, inputCol=None, outputCol=None)\n", 515 | "通过分割每个特征中的最大绝对值来单独重新缩放每个特征以范围[-1,1]。 它不会移动/居中数据,因此不会破坏任何稀疏性" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 19, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "+-----+------+\n", 528 | "| a|scaled|\n", 529 | "+-----+------+\n", 530 | "|[1.0]| [0.5]|\n", 531 | "|[2.0]| [1.0]|\n", 532 | "+-----+------+\n", 533 | "\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "from pyspark.ml.feature import 
MaxAbsScaler\n", 539 | "from pyspark.ml.linalg import Vectors\n", 540 | "df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n", 541 | "maScaler = MaxAbsScaler(inputCol=\"a\", outputCol=\"scaled\")\n", 542 | "model = maScaler.fit(df)\n", 543 | "model.transform(df).show()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### pyspark.ml.feature.MinMaxScaler(self, min=0.0, max=1.0, inputCol=None, outputCol=None)\n", 551 | "使用列汇总统计信息,将每个特征单独重新标定为一个常用范围[min,max],这也称为最小 - 最大标准化或重新标定(注意由于零值可能会被转换为非零值,因此即使对于稀疏输入,转换器的输出也将是DenseVector)。 特征E的重新缩放的值被计算为,数据将被缩放到[0.0,1.0]范围内。\n", 552 | "Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min\n", 553 | "For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)\n" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 42, 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "[0.0] [2.0]\n", 566 | "+-----+------+\n", 567 | "| a|scaled|\n", 568 | "+-----+------+\n", 569 | "|[0.0]| [0.0]|\n", 570 | "|[2.0]| [1.0]|\n", 571 | "+-----+------+\n", 572 | "\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "from pyspark.ml.feature import MinMaxScaler\n", 578 | "from pyspark.ml.linalg import Vectors\n", 579 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n", 580 | "mmScaler = MinMaxScaler(inputCol=\"a\", outputCol=\"scaled\")\n", 581 | "model = mmScaler.fit(df)\n", 582 | "print(model.originalMin, model.originalMax)\n", 583 | "model.transform(df).show()" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "### pyspark.ml.feature.NGram(n=2, inputCol=None, outputCol=None)\n", 591 | "一种功能转换器,用于将输入的字符串数组转换为n-gram数组。输入数组中的空值将被忽略。它返回一个n-gram数组,其中每个n-gram由一个以空格分隔的单词串表示。当输入为空时,返回一个空数组。当输入数组长度小于n(每n-gram的元素数)时,不返回n-gram。" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 23, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "name": "stdout", 601 | "output_type": "stream", 602 | "text": [ 603 | "+---------------+--------------------+\n", 604 | "| inputTokens| nGrams|\n", 605 | "+---------------+--------------------+\n", 606 | "|[a, b, c, d, e]|[a b, b c, c d, d e]|\n", 607 | "+---------------+--------------------+\n", 608 | "\n" 609 | ] 610 | } 611 | ], 612 | "source": [ 613 | "from pyspark.ml.feature import NGram\n", 614 | "from pyspark.sql import Row\n", 615 | "df = spark.createDataFrame([Row(inputTokens=[\"a\", \"b\", \"c\", \"d\", \"e\"])])\n", 616 | "ngram = NGram(n=2, inputCol=\"inputTokens\", outputCol=\"nGrams\")\n", 617 | "ngram.transform(df).show()" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 24, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "name": "stdout", 627 | "output_type": "stream", 628 | "text": [ 629 | "+---------------+------------------+\n", 630 | "| inputTokens| nGrams|\n", 631 | "+---------------+------------------+\n", 632 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n", 633 | "+---------------+------------------+\n", 634 | "\n" 635 | ] 636 | } 637 | ], 638 | "source": [ 639 | "# 更改 n-gram 长度\n", 640 | "ngram.setParams(n=4).transform(df).show()" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 25, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "+---------------+------------------+\n", 653 
| "| inputTokens| output|\n", 654 | "+---------------+------------------+\n", 655 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n", 656 | "+---------------+------------------+\n", 657 | "\n" 658 | ] 659 | } 660 | ], 661 | "source": [ 662 | "# 临时修改输出列\n", 663 | "ngram.transform(df, {ngram.outputCol: \"output\"}).show()" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "### pyspark.ml.feature.Normalizer(self, p=2.0, inputCol=None, outputCol=None)\n", 671 | "使用给定的p范数标准化矢量以得到单位范数(默认为L2)。" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 26, 677 | "metadata": {}, 678 | "outputs": [ 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "+----------+-------------------+----------+\n", 684 | "| dense| sparse| features|\n", 685 | "+----------+-------------------+----------+\n", 686 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|[0.6,-0.8]|\n", 687 | "+----------+-------------------+----------+\n", 688 | "\n" 689 | ] 690 | } 691 | ], 692 | "source": [ 693 | "from pyspark.ml.feature import Normalizer\n", 694 | "from pyspark.ml.linalg import Vectors\n", 695 | "svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})\n", 696 | "df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], [\"dense\", \"sparse\"])\n", 697 | "normalizer = Normalizer(p=2.0, inputCol=\"dense\", outputCol=\"features\")\n", 698 | "normalizer.transform(df).show()" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 27, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "name": "stdout", 708 | "output_type": "stream", 709 | "text": [ 710 | "+----------+-------------------+-------------------+\n", 711 | "| dense| sparse| freqs|\n", 712 | "+----------+-------------------+-------------------+\n", 713 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|(4,[1,3],[0.8,0.6])|\n", 714 | "+----------+-------------------+-------------------+\n", 715 | "\n" 716 | ] 717 | } 718 | ], 719 | "source": [ 720 | "normalizer.setParams(inputCol=\"sparse\", outputCol=\"freqs\").transform(df).show()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "### pyspark.ml.feature.OneHotEncoderEstimator(inputCols=None, outputCols=None, handleInvalid='error', dropLast=True)\n", 728 | "(分类列编码为二进制向量列)\n", 729 | "一个热门的编码器,将一列类别索引映射到一列二进制向量,每行至多有一个单值,表示输入类别索引。 例如,对于5个类别,输入值2.0将映射到[0.0,0.0,1.0,0.0]的输出向量。 最后一个类别默认不包含(可通过dropLast进行配置),因为它使向量条目总和为1,因此线性相关。 所以一个4.0的输入值映射到[0.0,0.0,0.0,0.0]。这与scikit-learn的OneHotEncoder不同,后者保留所有类别。 输出向量是稀疏的。\n", 730 | "当handleInvalid配置为“keep”时,会添加一个指示无效值的额外“类别”作为最后一个类别。因此,当dropLast为true时,无效值将被编码为全零向量。" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 28, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "+-----+-------------+\n", 743 | "|input| output|\n", 744 | "+-----+-------------+\n", 745 | "| 0.0|(2,[0],[1.0])|\n", 746 | "| 1.0|(2,[1],[1.0])|\n", 747 | "| 2.0| (2,[],[])|\n", 748 | "+-----+-------------+\n", 749 | "\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "from pyspark.ml.feature import OneHotEncoderEstimator\n", 755 | "from pyspark.ml.linalg import Vectors\n", 756 | "df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], [\"input\"])\n", 757 | "ohe = OneHotEncoderEstimator(inputCols=[\"input\"], outputCols=[\"output\"])\n", 758 | "model = ohe.fit(df)\n", 759 | "model.transform(df).show()" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | 
"source": [ 766 | "### pyspark.ml.feature.PCA(self, k=None, inputCol=None, outputCol=None)\n", 767 | "PCA训练一个模型将向量投影到前k个主成分的较低维空间。" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 29, 773 | "metadata": {}, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "+---------------------+----------------------------------------+\n", 780 | "|features |pca_features |\n", 781 | "+---------------------+----------------------------------------+\n", 782 | "|(5,[1,3],[1.0,7.0]) |[1.6485728230883807,-4.013282700516296] |\n", 783 | "|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781534,-1.1167972663619026]|\n", 784 | "|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676489,-5.337951427775355] |\n", 785 | "+---------------------+----------------------------------------+\n", 786 | "\n" 787 | ] 788 | } 789 | ], 790 | "source": [ 791 | "from pyspark.ml.feature import PCA\n", 792 | "from pyspark.ml.linalg import Vectors\n", 793 | "data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),\n", 794 | " (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),\n", 795 | " (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]\n", 796 | "df = spark.createDataFrame(data,[\"features\"])\n", 797 | "pca = PCA(k=2, inputCol=\"features\", outputCol=\"pca_features\")\n", 798 | "model = pca.fit(df)\n", 799 | "model.transform(df).show(truncate=0)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 30, 805 | "metadata": {}, 806 | "outputs": [ 807 | { 808 | "data": { 809 | "text/plain": [ 810 | "DenseVector([0.7944, 0.2056])" 811 | ] 812 | }, 813 | "execution_count": 30, 814 | "metadata": {}, 815 | "output_type": "execute_result" 816 | } 817 | ], 818 | "source": [ 819 | "model.explainedVariance" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": {}, 825 | "source": [ 826 | "### pyspark.ml.feature.QuantileDiscretizer(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, handleInvalid=\"error\")\n", 827 | "与Bucketizer方法类似,但QuantileDiscretizer采用具有连续特征的列,并输出具有分箱分类特征的列。可以使用numBuckets参数设置区域的数量。所使用的桶的数量可能小于该值,例如,如果输入的不同值太少而不能创建足够的不同分位数。nan会占用一个新的分类" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 31, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "name": "stdout", 837 | "output_type": "stream", 838 | "text": [ 839 | "+------+-------+\n", 840 | "|values|buckets|\n", 841 | "+------+-------+\n", 842 | "| 0.1| 0.0|\n", 843 | "| 0.4| 1.0|\n", 844 | "| 1.2| 1.0|\n", 845 | "| 1.5| 1.0|\n", 846 | "| NaN| 2.0|\n", 847 | "| NaN| 2.0|\n", 848 | "+------+-------+\n", 849 | "\n" 850 | ] 851 | } 852 | ], 853 | "source": [ 854 | "from pyspark.ml.feature import QuantileDiscretizer\n", 855 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n", 856 | "df = spark.createDataFrame(values, [\"values\"])\n", 857 | "qds = QuantileDiscretizer(numBuckets=2,\n", 858 | " inputCol=\"values\", outputCol=\"buckets\", relativeError=0.01, handleInvalid=\"error\")\n", 859 | "bucketizer = qds.fit(df)\n", 860 | "qds.setHandleInvalid(\"keep\").fit(df).transform(df).show()" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": {}, 866 | "source": [ 867 | "### pyspark.ml.feature.RegexTokenizer(minTokenLength=1, gaps=True, pattern='\\s+', inputCol=None, outputCol=None, toLowercase=True)\n", 868 | "基于java正则表达式的标记生成器" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 32, 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": 
"stream", 879 | "text": [ 880 | "+------+---------+\n", 881 | "| text| words|\n", 882 | "+------+---------+\n", 883 | "|A B c|[a, b, c]|\n", 884 | "+------+---------+\n", 885 | "\n" 886 | ] 887 | } 888 | ], 889 | "source": [ 890 | "from pyspark.ml.feature import RegexTokenizer\n", 891 | "df = spark.createDataFrame([(\"A B c\",)], [\"text\"])\n", 892 | "reTokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"words\")\n", 893 | "reTokenizer.transform(df).show()" 894 | ] 895 | }, 896 | { 897 | "cell_type": "markdown", 898 | "metadata": {}, 899 | "source": [ 900 | "### pyspark.ml.feature.SQLTransformer(statement=None)\n", 901 | "实现由SQL语句定义的转换。目前我们只支持SQL语法," 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 33, 907 | "metadata": {}, 908 | "outputs": [ 909 | { 910 | "name": "stdout", 911 | "output_type": "stream", 912 | "text": [ 913 | "+---+---+---+\n", 914 | "| id| v1| v2|\n", 915 | "+---+---+---+\n", 916 | "| 0|1.0|3.0|\n", 917 | "| 2|2.0|5.0|\n", 918 | "+---+---+---+\n", 919 | "\n", 920 | "+---+---+---+---+----+\n", 921 | "| id| v1| v2| v3| v4|\n", 922 | "+---+---+---+---+----+\n", 923 | "| 0|1.0|3.0|4.0| 3.0|\n", 924 | "| 2|2.0|5.0|7.0|10.0|\n", 925 | "+---+---+---+---+----+\n", 926 | "\n" 927 | ] 928 | } 929 | ], 930 | "source": [ 931 | "from pyspark.ml.feature import SQLTransformer\n", 932 | "df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], [\"id\", \"v1\", \"v2\"])\n", 933 | "sqlTrans = SQLTransformer(\n", 934 | " statement=\"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__\")\n", 935 | "df.show()\n", 936 | "sqlTrans.transform(df).show()\n" 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "### pyspark.ml.feature.StandardScaler(self, withMean=False, withStd=True, inputCol=None, outputCol=None)\n", 944 | "(标准化列,使其拥有零均值和等于1的标准差)\n", 945 | "通过使用训练集中样本的列汇总统计消除平均值和缩放到单位方差来标准化特征。使用校正后的样本标准偏差计算“单位标准差”,该标准偏差计算为无偏样本方差的平方根。\n" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 34, 951 | "metadata": {}, 952 | "outputs": [ 953 | { 954 | "name": "stdout", 955 | "output_type": "stream", 956 | "text": [ 957 | "[1.0] [1.4142135623730951]\n", 958 | "+-----+-------------------+\n", 959 | "| a| scaled|\n", 960 | "+-----+-------------------+\n", 961 | "|[0.0]| [0.0]|\n", 962 | "|[2.0]|[1.414213562373095]|\n", 963 | "+-----+-------------------+\n", 964 | "\n" 965 | ] 966 | } 967 | ], 968 | "source": [ 969 | "from pyspark.ml.feature import StandardScaler\n", 970 | "from pyspark.ml.linalg import Vectors\n", 971 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n", 972 | "standardScaler = StandardScaler(inputCol=\"a\", outputCol=\"scaled\")\n", 973 | "model = standardScaler.fit(df)\n", 974 | "print(model.mean, model.std)\n", 975 | "model.transform(df).show()" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": {}, 981 | "source": [ 982 | "### pyspark.ml.feature.StopWordsRemover(inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)\n", 983 | "一个特征转换器,用于过滤掉输入中的停用词。" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 35, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "name": "stdout", 993 | "output_type": "stream", 994 | "text": [ 995 | "+---------+------+\n", 996 | "| text| words|\n", 997 | "+---------+------+\n", 998 | "|[a, b, c]|[a, c]|\n", 999 | "+---------+------+\n", 1000 | "\n" 1001 | ] 1002 | } 1003 | ], 1004 | "source": [ 1005 | "from pyspark.ml.feature import 
StopWordsRemover\n", 1006 | "df = spark.createDataFrame([([\"a\", \"b\", \"c\"],)], [\"text\"])\n", 1007 | "remover = StopWordsRemover(inputCol=\"text\", outputCol=\"words\", stopWords=[\"b\"])\n", 1008 | "remover.transform(df).show()" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "markdown", 1013 | "metadata": {}, 1014 | "source": [ 1015 | "### pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None)\n", 1016 | "一个标记生成器,它将输入字符串转换为小写,然后用空格分隔它。" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": 36, 1022 | "metadata": { 1023 | "scrolled": true 1024 | }, 1025 | "outputs": [ 1026 | { 1027 | "name": "stdout", 1028 | "output_type": "stream", 1029 | "text": [ 1030 | "+--------+------------+\n", 1031 | "| text| words|\n", 1032 | "+--------+------------+\n", 1033 | "|ASD VA c|[asd, va, c]|\n", 1034 | "+--------+------------+\n", 1035 | "\n" 1036 | ] 1037 | } 1038 | ], 1039 | "source": [ 1040 | "from pyspark.ml.feature import Tokenizer\n", 1041 | "df = spark.createDataFrame([(\"ASD VA c\",)], [\"text\"])\n", 1042 | "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n", 1043 | "tokenizer.transform(df).show()" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "markdown", 1048 | "metadata": {}, 1049 | "source": [ 1050 | "### pyspark.ml.feature.VectorSlicer(inputCol=None, outputCol=None, indices=None, names=None)\n", 1051 | "此类采用特征向量并输出具有原始特征的子阵列的新特征向量。 可以使用索引(setIndices())或名称(setNames())指定要素子集。必须至少选择一个功能。不允许使用重复的功能,因此所选索引和名称之间不能重叠。 输出向量将首先按所选索引(按给定顺序)排序要素,然后是所选名称(按给定顺序)。" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": 37, 1057 | "metadata": {}, 1058 | "outputs": [ 1059 | { 1060 | "name": "stdout", 1061 | "output_type": "stream", 1062 | "text": [ 1063 | "+-----------------------+----------+\n", 1064 | "|features |sliced |\n", 1065 | "+-----------------------+----------+\n", 1066 | "|[-2.0,2.3,0.0,0.0,1.0] |[2.3,1.0] |\n", 1067 | "|[0.0,0.0,0.0,0.0,0.0] |[0.0,0.0] |\n", 1068 | "|[0.6,-1.1,-3.0,4.5,3.3]|[-1.1,3.3]|\n", 1069 | "+-----------------------+----------+\n", 1070 | "\n" 1071 | ] 1072 | } 1073 | ], 1074 | "source": [ 1075 | "from pyspark.ml.feature import VectorSlicer\n", 1076 | "from pyspark.ml.linalg import Vectors\n", 1077 | "df = spark.createDataFrame([\n", 1078 | " (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),\n", 1079 | " (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),\n", 1080 | " (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], [\"features\"])\n", 1081 | "vs = VectorSlicer(inputCol=\"features\", outputCol=\"sliced\", indices=[1, 4])\n", 1082 | "vs.transform(df).show(truncate=0)" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "### pyspark.ml.feature.VectorAssembler(inputCols=None, outputCol=None)\n", 1090 | "将多个列合并到向量列中的要素转换器。" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": 38, 1096 | "metadata": {}, 1097 | "outputs": [ 1098 | { 1099 | "name": "stdout", 1100 | "output_type": "stream", 1101 | "text": [ 1102 | "+---+---+---+-------------+\n", 1103 | "| a| b| c| features|\n", 1104 | "+---+---+---+-------------+\n", 1105 | "| 1| 0| 3|[1.0,0.0,3.0]|\n", 1106 | "+---+---+---+-------------+\n", 1107 | "\n" 1108 | ] 1109 | } 1110 | ], 1111 | "source": [ 1112 | "from pyspark.ml.feature import VectorAssembler\n", 1113 | "df = spark.createDataFrame([(1, 0, 3)], [\"a\", \"b\", \"c\"])\n", 1114 | "vecAssembler = VectorAssembler(inputCols=[\"a\", \"b\", \"c\"], outputCol=\"features\")\n", 1115 | "vecAssembler.transform(df).show()" 
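补充示例(草稿):本文件开头提到的三个抽象中,Pipeline 在前面一直没有单独演示,下面用 VectorAssembler + MinMaxScaler 串成一个最小的管道,假设沿用上面 VectorAssembler 小节的 df(a、b、c 三列)。

```python
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

assembler = VectorAssembler(inputCols=['a', 'b', 'c'], outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaled')
pipeline = Pipeline(stages=[assembler, scaler])

# fit 返回 PipelineModel,transform 会依次执行各个 stage
pipeline.fit(df).transform(df).show()
```

管道的保存 / 加载方式与上面单个转换器的 save / load 相同。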
1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "markdown", 1120 | "metadata": {}, 1121 | "source": [ 1122 | "### pyspark.ml.feature.Word2Vec(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000)\n", 1123 | "Word2Vec训练Map(String,Vector)模型,即将单词转换为代码以进行进一步的自然语言处理或机器学习过程。" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": 39, 1129 | "metadata": {}, 1130 | "outputs": [ 1131 | { 1132 | "name": "stdout", 1133 | "output_type": "stream", 1134 | "text": [ 1135 | "+----+--------------------+\n", 1136 | "|word| vector|\n", 1137 | "+----+--------------------+\n", 1138 | "| a|[0.09461779892444...|\n", 1139 | "| b|[1.15474212169647...|\n", 1140 | "| c|[-0.3794820010662...|\n", 1141 | "+----+--------------------+\n", 1142 | "\n" 1143 | ] 1144 | } 1145 | ], 1146 | "source": [ 1147 | "from pyspark.ml.feature import Word2Vec\n", 1148 | "sent = (\"a b \" * 100 + \"a c \" * 10).split(\" \")\n", 1149 | "doc = spark.createDataFrame([(sent,), (sent,)], [\"sentence\"])\n", 1150 | "word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol=\"sentence\", outputCol=\"model\")\n", 1151 | "model = word2Vec.fit(doc)\n", 1152 | "model.getVectors().show()" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 40, 1158 | "metadata": {}, 1159 | "outputs": [ 1160 | { 1161 | "name": "stdout", 1162 | "output_type": "stream", 1163 | "text": [ 1164 | "+----+-------------------+\n", 1165 | "|word| similarity|\n", 1166 | "+----+-------------------+\n", 1167 | "| b|0.25053444504737854|\n", 1168 | "+----+-------------------+\n", 1169 | "\n" 1170 | ] 1171 | }, 1172 | { 1173 | "data": { 1174 | "text/plain": [ 1175 | "[('b', 0.25053444504737854)]" 1176 | ] 1177 | }, 1178 | "execution_count": 40, 1179 | "metadata": {}, 1180 | "output_type": "execute_result" 1181 | } 1182 | ], 1183 | "source": [ 1184 | "# 找相似字符\n", 1185 | "model.findSynonyms(\"a\", 1).show()\n", 1186 | "model.findSynonymsArray(\"a\", 1)" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 41, 1192 | "metadata": {}, 1193 | "outputs": [ 1194 | { 1195 | "name": "stdout", 1196 | "output_type": "stream", 1197 | "text": [ 1198 | "+----+----------+\n", 1199 | "|word|similarity|\n", 1200 | "+----+----------+\n", 1201 | "| b| 0.251|\n", 1202 | "| c| -0.698|\n", 1203 | "+----+----------+\n", 1204 | "\n" 1205 | ] 1206 | } 1207 | ], 1208 | "source": [ 1209 | "from pyspark.sql.functions import format_number as fmt\n", 1210 | "model.findSynonyms(\"a\", 2).select(\"word\", fmt(\"similarity\", 3).alias(\"similarity\")).show()" 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "code", 1215 | "execution_count": null, 1216 | "metadata": {}, 1217 | "outputs": [], 1218 | "source": [] 1219 | } 1220 | ], 1221 | "metadata": { 1222 | "kernelspec": { 1223 | "display_name": "Python 3", 1224 | "language": "python", 1225 | "name": "python3" 1226 | }, 1227 | "language_info": { 1228 | "codemirror_mode": { 1229 | "name": "ipython", 1230 | "version": 3 1231 | }, 1232 | "file_extension": ".py", 1233 | "mimetype": "text/x-python", 1234 | "name": "python", 1235 | "nbconvert_exporter": "python", 1236 | "pygments_lexer": "ipython3", 1237 | "version": "3.6.4" 1238 | } 1239 | }, 1240 | "nbformat": 4, 1241 | "nbformat_minor": 2 1242 | } 1243 | -------------------------------------------------------------------------------- /pyspark.ml.regression.ipynb: -------------------------------------------------------------------------------- 1 
| { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "spark = SparkSession.builder.appName('learn_regression').master('local[1]').getOrCreate()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "df_train = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/train.csv', header=True, inferSchema=True, encoding='utf-8')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "df_test = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/test.csv', header=True, inferSchema=True, encoding='utf-8')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": { 35 | "scrolled": true 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n", 43 | "| ID| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| black|lstat| medv|\n", 44 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n", 45 | "| 1|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98| 24.0|\n", 46 | "| 2|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14| 21.6|\n", 47 | "| 3|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|22.77|\n", 48 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n", 49 | "only showing top 3 rows\n", 50 | "\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "from pyspark.sql.functions import lit\n", 56 | "df_test = df_test.withColumn('medv', lit(22.77))\n", 57 | "df0 = df_train.union(df_test).sort('ID')\n", 58 | "df0.show(3)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "`\n", 66 | "CRIM--  城镇人均犯罪率。\n", 67 | "ZN  - 占地面积超过25,000平方英尺的住宅用地比例。\n", 68 | "INDUS  - 每个城镇非零售业务的比例。\n", 69 | "CHAS  - Charles River虚拟变量(如果河流经过则= 1;否则为0)。\n", 70 | "NOX  - 氮氧化物浓度(每千万份)。\n", 71 | "RM  - 每间住宅的平均房间数。\n", 72 | "AGE  - 1940年以前建造的自住单位比例。\n", 73 | "DIS  - 加权平均值到五个波士顿就业中心的距离。\n", 74 | "RAD  - 径向高速公路的可达性指数。\n", 75 | "TAX  - 每10,000美元的全额物业税率。\n", 76 | "PTRATIO  - 城镇的学生与教师比例。\n", 77 | "BLACK  - 1000(Bk - 0.63)²其中Bk是城镇黑人的比例。\n", 78 | "LSTAT  - 人口较低的地位(百分比)。\n", 79 | "MEDV  - 自住房屋的中位数价值1000美元。这是目标变量。\n", 80 | "`" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from pyspark.ml.feature import VectorAssembler\n", 90 | "def feature_converter(df):\n", 91 | " vecAss = VectorAssembler(inputCols=df0.columns[1:-1], outputCol='features')\n", 92 | " df_va = vecAss.transform(df)\n", 93 | " return df_va\n", 94 | "\n", 95 | "train_data, test_data = feature_converter(df0).select(['features', 'medv']).randomSplit([7.0, 3.0], 101)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "354" 107 | ] 108 | }, 109 | "execution_count": 6, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "train_data.count()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 7, 121 | "metadata": {}, 122 | 
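补充示例(草稿):切分数据前可以先对目标列 medv 做个简单检查,确认读入和拼接没有问题,下面假设沿用上面拼接得到的 df0。

```python
# 目标列的基本统计量
df0.select('medv').describe().show()

# 房间数 rm 与房价 medv 的皮尔逊相关系数,粗略判断特征是否有信息量
print(df0.corr('rm', 'medv'))
```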
"outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "152" 127 | ] 128 | }, 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "test_data.count()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## 决策树回归\n", 143 | "`pyspark.ml.regression.DecisionTreeRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', seed=None, varianceCol=None)`" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "`\n", 151 | "fit(dataset, params=None)方法 \n", 152 | "Impurity: 信息增益计算准则,支持选项:variance \n", 153 | "maxBins: 连续特征离散化的最大分箱个数, >=2并且>=任何分类特征的分类个数 \n", 154 | "maxDepth: 最大树深 \n", 155 | "minInfoGain: 分割节点所需最小信息增益 \n", 156 | "minInstancesPerNode: 分割后每个子节点最小实例个数 \n", 157 | "`" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 13, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "from pyspark.ml.regression import DecisionTreeRegressor\n", 167 | "dt = DecisionTreeRegressor(maxDepth=5, varianceCol=\"variance\", labelCol='medv')\n", 168 | "dt_model = dt.fit(train_data)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 14, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "SparseVector(13, {0: 0.0503, 2: 0.011, 4: 0.0622, 5: 0.1441, 6: 0.1852, 7: 0.0262, 8: 0.0022, 9: 0.0886, 10: 0.0142, 12: 0.4159})" 180 | ] 181 | }, 182 | "execution_count": 14, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "dt_model.featureImportances" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 15, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "result = dt_model.transform(test_data)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 16, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "+--------------------+-----+------------------+------------------+\n", 210 | "| features| medv| prediction| variance|\n", 211 | "+--------------------+-----+------------------+------------------+\n", 212 | "|[0.03237,0.0,2.18...| 33.4| 34.12833333333334|29.509013888888756|\n", 213 | "|[0.08829,12.5,7.8...| 22.9|21.195135135135136| 4.446162819576342|\n", 214 | "|[0.14455,12.5,7.8...|22.77|22.425999999999995|0.5578440000003866|\n", 215 | "+--------------------+-----+------------------+------------------+\n", 216 | "only showing top 3 rows\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "result.show(3)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 17, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "测试数据的均方根误差(rmse):6.555920141221407\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 240 | "dt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n", 241 | "rmse = dt_evaluator.evaluate(result)\n", 242 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## 梯度提升树回归 
(Gradient-boosted tree regression)\n", 250 | "pyspark.ml.regression.GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType='squared', maxIter=20, stepSize=0.1, seed=None, impurity='variance')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "`\n", 258 | "fit(dataset,params=None)方法 \n", 259 | "lossType: GBT要最小化的损失函数,可选:squared, absolute \n", 260 | "maxIter: 最大迭代次数 \n", 261 | "stepSize: 每次优化迭代的步长 \n", 262 | "subsamplingRate:用于训练每颗决策树的训练数据集的比例,区间[0,1] \n", 263 | "`" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 8, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "from pyspark.ml.regression import GBTRegressor\n", 273 | "gbt = GBTRegressor(maxIter=10, labelCol='medv', maxDepth=3)\n", 274 | "gbt_model = gbt.fit(train_data)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "SparseVector(13, {0: 0.0219, 1: 0.0364, 2: 0.0305, 3: 0.0114, 4: 0.0032, 5: 0.1372, 6: 0.146, 7: 0.1033, 8: 0.0518, 9: 0.0819, 10: 0.0883, 11: 0.0048, 12: 0.2832})" 286 | ] 287 | }, 288 | "execution_count": 9, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "gbt_model.featureImportances" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 10, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "result = gbt_model.transform(test_data)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 11, 309 | "metadata": { 310 | "scrolled": true 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "+--------------------+-----+------------------+\n", 318 | "| features| medv| prediction|\n", 319 | "+--------------------+-----+------------------+\n", 320 | "|[0.03237,0.0,2.18...| 33.4| 31.98716729056085|\n", 321 | "|[0.08829,12.5,7.8...| 22.9|22.254258637918248|\n", 322 | "|[0.14455,12.5,7.8...|22.77|20.066468254729102|\n", 323 | "+--------------------+-----+------------------+\n", 324 | "only showing top 3 rows\n", 325 | "\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "result.show(3)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 20, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]" 342 | ] 343 | }, 344 | "execution_count": 20, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "gbt_model.treeWeights" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 12, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "测试数据的均方根误差(rmse):5.624145397622545\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 368 | "gbt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n", 369 | "rmse = gbt_evaluator.evaluate(result)\n", 370 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "## 
线性回归(LinearRegression)\n", 378 | "pyspark.ml.regression.LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, solver='auto', weightCol=None, aggregationDepth=2, loss='squaredError', epsilon=1.35)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "\n", 386 | "学习目标是通过正规化最小化指定的损失函数。这支持两种损失:\n", 387 | "+ squaredError (a.k.a 平方损失)\n", 388 | "+ huber (对于相对较小的误差和相对大的误差的绝对误差的平方误差的混合,我们从训练数据估计比例参数)\n", 389 | "\n", 390 | "支持多种类型的正则化: \n", 391 | "+ None:OLS \n", 392 | "+ L2:ridge回归 \n", 393 | "+ L1:Lasso回归 \n", 394 | "+ L1+L2:elastic回归\n", 395 | "\n", 396 | "注意:与huber loss匹配仅支持none和L2正规化。\n" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "`\n", 404 | "aggregationDepth: 树聚合的深度, >=2 \n", 405 | "elasticNtParam: ElasticNet混合参数,在[0,1]范围内,alpha=0为L2, alpha=1为L1 \n", 406 | "fit(dataset,params=None)方法 \n", 407 | "fitIntercept: 是否拟合截距 \n", 408 | "maxIter: 最大迭代次数 \n", 409 | "regParam:正则化参数 >=0 \n", 410 | "solver: 优化算法,没设置或空则使用”auto” \n", 411 | "standardization: 是否对拟合模型的特征进行标准化 \n", 412 | "`" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "`\n", 420 | "Summary属性\n", 421 | "coefficientStandardErrors \n", 422 | "devianceResiduals: 加权残差 \n", 423 | "explainedVariance: 返回解释的方差回归得分,explainedVariance=1−variance(y−(̂ y))/variance(y) \n", 424 | "meanAbsoluteError: 返回均值绝对误差 \n", 425 | "meanSquaredError: 返回均值平方误 \n", 426 | "numInstances: 预测的实例个数 \n", 427 | "pValues: 系数和截距的双边P值,只有用”normal”solver才可用 \n", 428 | "predictions: 模型transform方法返回的预测 \n", 429 | "r2: R方 \n", 430 | "residuals: 残差 \n", 431 | "rootMeanSquaredError: 均方误差平方根 \n", 432 | "tValues: T统计量\n", 433 | "`" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 23, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "from pyspark.ml.regression import LinearRegression\n", 443 | "lr = LinearRegression(maxIter=10, elasticNetParam=0.8, regParam=0.3, labelCol='medv')\n", 444 | "lr_model = lr.fit(train_data)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 26, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "trainingSummary = lr_model.summary" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 27, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "name": "stdout", 463 | "output_type": "stream", 464 | "text": [ 465 | "RMSE: 5.457496\n", 466 | "r2: 0.432071\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "print(\"RMSE: %f\" % trainingSummary.rootMeanSquaredError)\n", 472 | "print(\"r2: %f\" % trainingSummary.r2)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 55, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "+--------------------+-----+------------------+\n", 485 | "| features| medv| prediction|\n", 486 | "+--------------------+-----+------------------+\n", 487 | "|[0.03237,0.0,2.18...| 33.4|27.066314856077966|\n", 488 | "|[0.08829,12.5,7.8...| 22.9|23.721352298735898|\n", 489 | "|[0.14455,12.5,7.8...|22.77|21.388248900632398|\n", 490 | "+--------------------+-----+------------------+\n", 491 | "only showing top 3 rows\n", 492 | "\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "result = lr_model.transform(test_data)\n", 498 | "result.show(3)" 499 | ] 
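补充示例(草稿):除了 RMSE 和 r2,线性回归模型本身的系数、截距以及训练摘要里的残差也值得看一眼,下面假设沿用上面的 lr_model 与 trainingSummary。

```python
# 各特征的回归系数与截距
print(lr_model.coefficients)
print(lr_model.intercept)

# 训练集残差(trainingSummary.residuals 是一个 DataFrame)
trainingSummary.residuals.show(3)
```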
500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 43, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "R平方(r2):0.469\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 516 | "lr_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"r2\", predictionCol='prediction')\n", 517 | "r2 = lr_evaluator.evaluate(result)\n", 518 | "print('R平方(r2):{:.3}'.format(r2))" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 44, 524 | "metadata": { 525 | "scrolled": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "test_evaluation = lr_model.evaluate(test_data)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 42, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "RMSE:5.7\n", 542 | "r2:0.469\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "print('RMSE:{:.3}'.format(test_evaluation.rootMeanSquaredError))\n", 548 | "print('r2:{:.3}'.format(test_evaluation.r2))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "## 随机森林回归\n", 556 | "pyspark.ml.regression.RandomForestRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', subsamplingRate=1.0, seed=None, numTrees=20, featureSubsetStrategy='auto')" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "`\n", 564 | "fit(dataset,params=None)方法 \n", 565 | "featureSubsetStrategy: 每棵树的节点上要分割的特征数量,可选:auto, all, onethird, sqrt, log2,(0.0,1.0],[1-n] \n", 566 | "impurity: 信息增益计算的准则,可选:variance \n", 567 | "maxBins: 连续特征离散化最大分箱个数。 \n", 568 | "maxDepth: 树的最大深度 \n", 569 | "minInfoGain: 树节点分割特征所需最小的信息增益 \n", 570 | "minInstancesPerNode: 每个结点所需最小实例个数 \n", 571 | "numTrees: 训练树的个数 \n", 572 | "subsamplingRate: 学习每颗决策树所需样本比例 \n", 573 | "`" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 47, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "from pyspark.ml.regression import RandomForestRegressor\n", 583 | "rf = RandomForestRegressor(numTrees=10, maxDepth=5, seed=101, labelCol='medv')\n", 584 | "rf_model = rf.fit(train_data)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 54, 590 | "metadata": {}, 591 | "outputs": [ 592 | { 593 | "name": "stdout", 594 | "output_type": "stream", 595 | "text": [ 596 | "+--------------------+-----+------------------+\n", 597 | "| features| medv| prediction|\n", 598 | "+--------------------+-----+------------------+\n", 599 | "|[0.03237,0.0,2.18...| 33.4| 30.12804440796982|\n", 600 | "|[0.08829,12.5,7.8...| 22.9|21.338106353716338|\n", 601 | "|[0.14455,12.5,7.8...|22.77|19.764914032872827|\n", 602 | "+--------------------+-----+------------------+\n", 603 | "only showing top 3 rows\n", 604 | "\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "result = rf_model.transform(test_data)\n", 610 | "result.show(3)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 51, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "data": { 620 | "text/plain": [ 621 | "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]" 622 | ] 623 | }, 624 | "execution_count": 51, 625 | "metadata": {}, 626 | 
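补充示例(草稿):featureImportances 返回的是向量,直接看不出对应哪个特征,下面把它和列名配对后排序,假设沿用上面的 rf_model,特征列为 df0.columns[1:-1](与 feature_converter 中一致)。

```python
# 将特征重要性与列名配对并按重要性降序排列
importances = sorted(zip(df0.columns[1:-1], rf_model.featureImportances.toArray()),
                     key=lambda x: x[1], reverse=True)
for name, score in importances:
    print(name, round(float(score), 4))
```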
"output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "rf_model.treeWeights" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 53, 636 | "metadata": {}, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "测试数据的均方根误差(rmse):5.268739233773331\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 648 | "rf_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n", 649 | "rmse = rf_evaluator.evaluate(result)\n", 650 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))" 651 | ] 652 | } 653 | ], 654 | "metadata": { 655 | "kernelspec": { 656 | "display_name": "Python 3", 657 | "language": "python", 658 | "name": "python3" 659 | }, 660 | "language_info": { 661 | "codemirror_mode": { 662 | "name": "ipython", 663 | "version": 3 664 | }, 665 | "file_extension": ".py", 666 | "mimetype": "text/x-python", 667 | "name": "python", 668 | "nbconvert_exporter": "python", 669 | "pygments_lexer": "ipython3", 670 | "version": "3.6.4" 671 | } 672 | }, 673 | "nbformat": 4, 674 | "nbformat_minor": 2 675 | } 676 | --------------------------------------------------------------------------------