├── README.md
├── mashroom.ipynb
├── pysaprk.ml.clustering 学习.ipynb
├── pyspark-RDD.ipynb
├── pyspark-sql-dataframe.ipynb
├── pyspark-sql-functions.ipynb
├── pyspark.ml.classification.ipynb
├── pyspark.ml.feature.ipynb
└── pyspark.ml.regression.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | ### Learning PySpark
2 |
--------------------------------------------------------------------------------
/mashroom.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession\n",
10 | "spark = SparkSession.builder.appName('mushroom').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "### 导入数据并确定数据类型"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 10,
23 | "metadata": {
24 | "scrolled": false
25 | },
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/plain": [
30 | "23"
31 | ]
32 | },
33 | "execution_count": 10,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')\n",
40 | "len(df0.columns)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 14,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "+---------+\n",
53 | "|cap-shape|\n",
54 | "+---------+\n",
55 | "| x|\n",
56 | "| f|\n",
57 | "| k|\n",
58 | "| c|\n",
59 | "| b|\n",
60 | "| s|\n",
61 | "+---------+\n",
62 | "\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "df0.select('cap-shape').distinct().show()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 5,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "label = df0.rdd.map(lambda row: row[0])\n",
77 | "row = df0.rdd.map(lambda row: row[1:])"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 6,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','row'])"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {
93 | "scrolled": true
94 | },
95 | "outputs": [
96 | {
97 | "data": {
98 | "text/plain": [
99 | "Row(label=0.0, row=['b', 'y', 'y', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 's', 'm'])"
100 | ]
101 | },
102 | "execution_count": 7,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "dfi.first()"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 15,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# from pyspark.ml.feature import VectorAssembler\n",
118 | "# vecAss = VectorAssembler(inputCols=df0.columns[1:], outputCol='feature')\n",
119 | "# df0 = vecAss.transform(df0)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 16,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from pyspark.ml.feature import CountVectorizer\n",
129 | "import numpy as np\n",
130 | "from numpy import allclose\n",
131 | "cv = CountVectorizer(inputCol='row', outputCol='vectors')\n",
132 | "model = cv.fit(dfi)\n",
133 | "tf = model.transform(dfi)"
134 | ]
135 | },
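{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a sketch added for illustration, not part of the original run):\n",
"# CountVectorizer builds a vocabulary over the category letters, so each position of the\n",
"# sparse vector can be mapped back to a letter via the fitted model's vocabulary attribute.\n",
"print(len(model.vocabulary))\n",
"print(model.vocabulary[:10])"
]
},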
136 | {
137 | "cell_type": "code",
138 | "execution_count": 17,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "[Row(label=0.0, row=['x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'], vectors=SparseVector(24, {0: 3.0, 1: 1.0, 2: 3.0, 3: 4.0, 4: 2.0, 6: 2.0, 7: 1.0, 8: 2.0, 9: 1.0, 10: 1.0, 15: 1.0, 20: 1.0}))]"
145 | ]
146 | },
147 | "execution_count": 17,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "tf.take(1)"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 19,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "(train_data, test_data) = tf.randomSplit([0.8, 0.2])"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 20,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "SparseVector(24, {0: 0.0532, 1: 0.0375, 2: 0.0577, 3: 0.0947, 4: 0.064, 5: 0.0519, 6: 0.0436, 7: 0.022, 8: 0.0487, 9: 0.0411, 10: 0.0427, 11: 0.0299, 12: 0.0552, 13: 0.0683, 14: 0.0247, 15: 0.0164, 16: 0.0247, 17: 0.072, 18: 0.0844, 19: 0.0326, 20: 0.0135, 21: 0.0045, 22: 0.0132, 23: 0.0033})"
174 | ]
175 | },
176 | "execution_count": 20,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "from pyspark.ml.classification import RandomForestClassifier\n",
183 | "rf = RandomForestClassifier(numTrees=40, maxDepth=20, labelCol=\"label\", featuresCol='vectors')\n",
184 | "model = rf.fit(train_data)\n",
185 | "model.featureImportances"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 32,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "result = model.transform(test_data)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 43,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "+----------+\n",
207 | "|prediction|\n",
208 | "+----------+\n",
209 | "| 0.0|\n",
210 | "| 0.0|\n",
211 | "| 1.0|\n",
212 | "| 1.0|\n",
213 | "| 1.0|\n",
214 | "+----------+\n",
215 | "only showing top 5 rows\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "result.select('prediction').show(5)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 34,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "name": "stdout",
231 | "output_type": "stream",
232 | "text": [
233 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
234 | "|label| row| vectors| rawPrediction| probability|prediction|\n",
235 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
236 | "| 0.0|[b, e, e, ?, s, s...|(24,[0,1,3,5,6,7,...|[28.4161036920659...|[0.71040259230164...| 0.0|\n",
237 | "| 0.0|[b, f, y, f, f, f...|(24,[0,1,2,5,6,7,...|[37.1750915750915...|[0.92937728937728...| 0.0|\n",
238 | "| 0.0|[b, n, w, f, n, f...|(24,[0,1,2,4,5,6,...|[4.02235172235172...|[0.10055879305879...| 1.0|\n",
239 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
240 | "only showing top 3 rows\n",
241 | "\n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "result.show(3)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 36,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "1287"
258 | ]
259 | },
260 | "execution_count": 36,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 45,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "0.8880822746521476"
278 | ]
279 | },
280 | "execution_count": 45,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()/result.count()"
287 | ]
288 | },
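{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative to the hand-rolled accuracy above (a sketch, assuming a Spark 2.x\n",
"# MulticlassClassificationEvaluator with the 'accuracy' metric is available):\n",
"from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
"evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')\n",
"evaluator.evaluate(result)"
]
},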
289 | {
290 | "cell_type": "code",
291 | "execution_count": 14,
292 | "metadata": {},
293 | "outputs": [
294 | {
295 | "name": "stderr",
296 | "output_type": "stream",
297 | "text": [
298 | "/home/ffzs/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
299 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "from sklearn.ensemble import RandomForestClassifier\n",
305 | "import pandas as pd\n",
306 | "from sklearn import cross_validation\n",
307 | "from sklearn.model_selection import train_test_split\n",
308 | "from sklearn.cross_validation import cross_val_score"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 15,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "dfp = tf.toPandas()"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 16,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "data": {
327 | "text/html": [
328 | "
\n",
329 | "\n",
342 | "
\n",
343 | " \n",
344 | " \n",
345 | " | \n",
346 | " label | \n",
347 | " row | \n",
348 | " vectors | \n",
349 | "
\n",
350 | " \n",
351 | " \n",
352 | " \n",
353 | " 0 | \n",
354 | " 0.0 | \n",
355 | " [\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ... | \n",
356 | " (0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ... | \n",
357 | "
\n",
358 | " \n",
359 | " 1 | \n",
360 | " 1.0 | \n",
361 | " [x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ... | \n",
362 | " (3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ... | \n",
363 | "
\n",
364 | " \n",
365 | "
\n",
366 | "
"
367 | ],
368 | "text/plain": [
369 | " label row \\\n",
370 | "0 0.0 [\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ... \n",
371 | "1 1.0 [x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ... \n",
372 | "\n",
373 | " vectors \n",
374 | "0 (0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ... \n",
375 | "1 (3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ... "
376 | ]
377 | },
378 | "execution_count": 16,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "dfp.head(2)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 17,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "clf = RandomForestClassifier(random_state=22, n_estimators = 30, min_samples_split=3, min_samples_leaf=2)"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 18,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "X = dfp['vectors'].tolist()"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 19,
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "y = dfp['label'].tolist()"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 20,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 21,
426 | "metadata": {},
427 | "outputs": [
428 | {
429 | "data": {
430 | "text/plain": [
431 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
432 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
433 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
434 | " min_samples_leaf=2, min_samples_split=3,\n",
435 | " min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,\n",
436 | " oob_score=False, random_state=22, verbose=0, warm_start=False)"
437 | ]
438 | },
439 | "execution_count": 21,
440 | "metadata": {},
441 | "output_type": "execute_result"
442 | }
443 | ],
444 | "source": [
445 | "clf.fit(X_train, y_train)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 23,
451 | "metadata": {
452 | "scrolled": true
453 | },
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "0.9218461538461539\n"
460 | ]
461 | }
462 | ],
463 | "source": [
464 | "print(clf.score(X_test, y_test))"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 32,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "scores = cross_val_score(clf, X, y, cv=10)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 33,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "0.8905588981998195"
485 | ]
486 | },
487 | "execution_count": 33,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "scores.mean()"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": []
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": []
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": []
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 1,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "from pyspark.sql import SparkSession\n",
524 | "spark = SparkSession.builder.appName('mushroom').getOrCreate()"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 32,
530 | "metadata": {},
531 | "outputs": [],
532 | "source": [
533 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/stock.csv',encoding='gbk',header=True, inferSchema=True)"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 33,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "data": {
543 | "text/plain": [
544 | "[('日期', 'timestamp'),\n",
545 | " ('股票代码', 'string'),\n",
546 | " ('名称', 'string'),\n",
547 | " ('收盘价', 'double'),\n",
548 | " ('最高价', 'double'),\n",
549 | " ('最低价', 'double'),\n",
550 | " ('开盘价', 'double'),\n",
551 | " ('前收盘', 'double'),\n",
552 | " ('涨跌额', 'string'),\n",
553 | " ('涨跌幅', 'string'),\n",
554 | " ('换手率', 'double'),\n",
555 | " ('成交量', 'int'),\n",
556 | " ('成交金额', 'double'),\n",
557 | " ('总市值', 'double'),\n",
558 | " ('流通市值', 'double')]"
559 | ]
560 | },
561 | "execution_count": 33,
562 | "metadata": {},
563 | "output_type": "execute_result"
564 | }
565 | ],
566 | "source": [
567 | "df.dtypes"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 34,
573 | "metadata": {},
574 | "outputs": [],
575 | "source": [
576 | "# from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType ,DoubleType # 导入类型\n",
577 | "# schema = StructType([\n",
578 | "# StructField(\"日期\", DateType(), True),\n",
579 | "# StructField(\"收盘价\", DoubleType(), True),\n",
580 | "# StructField(\"成交量\", LongType(), True),\n",
581 | "# StructField(\"名称\", StringType(), True)\n",
582 | "# ])"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 35,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "df.write.csv(path='hdfs:///user/csv/stock.csv', header=True, sep=\",\", mode='overwrite')"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 49,
597 | "metadata": {
598 | "scrolled": true
599 | },
600 | "outputs": [
601 | {
602 | "data": {
603 | "text/plain": [
604 | "'股票代码'"
605 | ]
606 | },
607 | "execution_count": 49,
608 | "metadata": {},
609 | "output_type": "execute_result"
610 | }
611 | ],
612 | "source": [
613 | "df.columns[1]"
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 61,
619 | "metadata": {},
620 | "outputs": [],
621 | "source": [
622 | "df0 = spark.read.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666\", table=\"mashroom\")"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": 63,
628 | "metadata": {},
629 | "outputs": [
630 | {
631 | "data": {
632 | "text/plain": [
633 | "8124"
634 | ]
635 | },
636 | "execution_count": 63,
637 | "metadata": {},
638 | "output_type": "execute_result"
639 | }
640 | ],
641 | "source": [
642 | "df0.count()"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 64,
648 | "metadata": {},
649 | "outputs": [],
650 | "source": [
651 | "df0.write.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666&useUnicode=true&characterEncoding=GBK\",\n",
652 | " mode=\"overwrite\",\n",
653 | " table=\"test\",\n",
654 | " properties={\"driver\":'com.mysql.jdbc.Driver'})"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 65,
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "spark.stop()"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 68,
669 | "metadata": {},
670 | "outputs": [
671 | {
672 | "name": "stdout",
673 | "output_type": "stream",
674 | "text": [
675 | "+--------------+------+\n",
676 | "| country|median|\n",
677 | "+--------------+------+\n",
678 | "| New Zealand| 39.0|\n",
679 | "| Spain| 37.0|\n",
680 | "| Ireland| 35.0|\n",
681 | "| Sweden| 34.0|\n",
682 | "| Italy| 34.0|\n",
683 | "| Norway| 34.0|\n",
684 | "| Denmark| 34.0|\n",
685 | "| Israel| 34.0|\n",
686 | "| Australia| 34.0|\n",
687 | "| Netherlands| 34.0|\n",
688 | "| Argentina| 33.5|\n",
689 | "| Canada| 33.5|\n",
690 | "| Belgium| 33.0|\n",
691 | "| Switzerland| 33.0|\n",
692 | "| Japan| 33.0|\n",
693 | "|United Kingdom| 33.0|\n",
694 | "| United States| 32.0|\n",
695 | "| Portugal| 32.0|\n",
696 | "| Romania| 32.0|\n",
697 | "| Germany| 31.0|\n",
698 | "+--------------+------+\n",
699 | "only showing top 20 rows\n",
700 | "\n"
701 | ]
702 | }
703 | ],
704 | "source": [
705 | "spark = SparkSession.builder.enableHiveSupport().master(\"local[*]\").appName(\"read_hive\").getOrCreate()\n",
706 | "\n",
707 | "df=spark.sql(\"select * from age\")\n",
708 | "df.show()"
709 | ]
710 | },
711 | {
712 | "cell_type": "code",
713 | "execution_count": 87,
714 | "metadata": {},
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/plain": [
719 | "DataFrame[]"
720 | ]
721 | },
722 | "execution_count": 87,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "spark.sql('create table if not exists age2(name string, num int)')\n",
729 | "#df0.write.mode(\"overwrite\").insertInto(\"age2\")"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 80,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "name": "stdout",
739 | "output_type": "stream",
740 | "text": [
741 | "+--------+---------+-----------+\n",
742 | "|database|tableName|isTemporary|\n",
743 | "+--------+---------+-----------+\n",
744 | "| default| age| false|\n",
745 | "| default| age2| false|\n",
746 | "| default| country| false|\n",
747 | "| default| qn| false|\n",
748 | "+--------+---------+-----------+\n",
749 | "\n"
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "spark.sql('show tables').show()"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 81,
760 | "metadata": {},
761 | "outputs": [],
762 | "source": [
763 | "df.write.mode(\"overwrite\").insertInto(\"age2\")"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 86,
769 | "metadata": {},
770 | "outputs": [
771 | {
772 | "name": "stdout",
773 | "output_type": "stream",
774 | "text": [
775 | "+-----------+---+\n",
776 | "| name|num|\n",
777 | "+-----------+---+\n",
778 | "|New Zealand| 39|\n",
779 | "| Spain| 37|\n",
780 | "| Ireland| 35|\n",
781 | "| Sweden| 34|\n",
782 | "| Italy| 34|\n",
783 | "| Norway| 34|\n",
784 | "| Denmark| 34|\n",
785 | "| Israel| 34|\n",
786 | "| Australia| 34|\n",
787 | "|Netherlands| 34|\n",
788 | "+-----------+---+\n",
789 | "\n"
790 | ]
791 | }
792 | ],
793 | "source": [
794 | "spark.sql('select * from age2 sort by num limit 10 ').show()"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 18,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "spark.stop()"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": []
812 | }
813 | ],
814 | "metadata": {
815 | "kernelspec": {
816 | "display_name": "Python 3",
817 | "language": "python",
818 | "name": "python3"
819 | },
820 | "language_info": {
821 | "codemirror_mode": {
822 | "name": "ipython",
823 | "version": 3
824 | },
825 | "file_extension": ".py",
826 | "mimetype": "text/x-python",
827 | "name": "python",
828 | "nbconvert_exporter": "python",
829 | "pygments_lexer": "ipython3",
830 | "version": "3.6.4"
831 | }
832 | },
833 | "nbformat": 4,
834 | "nbformat_minor": 2
835 | }
836 |
--------------------------------------------------------------------------------
/pysaprk.ml.clustering 学习.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "data:https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from pyspark.sql import SparkSession\n",
17 | "spark = SparkSession.builder.master('local[1]').appName('learn_cluster').getOrCreate()"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/Mall_Customers.csv', header=True, inferSchema=True)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "scrolled": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "df = df.withColumnRenamed('Annual Income (k$)', 'Income').withColumnRenamed('Spending Score (1-100)', 'Spend')"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "+----------+------+---+------+-----+\n",
50 | "|CustomerID|Gender|Age|Income|Spend|\n",
51 | "+----------+------+---+------+-----+\n",
52 | "| 1| Male| 19| 15| 39|\n",
53 | "| 2| Male| 21| 15| 81|\n",
54 | "| 3|Female| 20| 16| 6|\n",
55 | "+----------+------+---+------+-----+\n",
56 | "only showing top 3 rows\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "df.show(3)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 5,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "CustomerID 0\n",
74 | "Gender 0\n",
75 | "Age 0\n",
76 | "Income 0\n",
77 | "Spend 0\n",
78 | "dtype: int64"
79 | ]
80 | },
81 | "execution_count": 5,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 | "# 查看是否有缺失值\n",
88 | "df.toPandas().isna().sum()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 6,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "from pyspark.ml.feature import VectorAssembler\n",
98 | "vecAss = VectorAssembler(inputCols = df.columns[3:], outputCol = 'features')\n",
99 | "df_km = vecAss.transform(df).select('CustomerID', 'features')"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 8,
105 | "metadata": {
106 | "scrolled": true
107 | },
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "+----------+-----------+\n",
114 | "|CustomerID| features|\n",
115 | "+----------+-----------+\n",
116 | "| 1|[15.0,39.0]|\n",
117 | "| 2|[15.0,81.0]|\n",
118 | "| 3| [16.0,6.0]|\n",
119 | "+----------+-----------+\n",
120 | "only showing top 3 rows\n",
121 | "\n"
122 | ]
123 | }
124 | ],
125 | "source": [
126 | "df_km.show(3)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 9,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "pd_df = df.toPandas()"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 10,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "\n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " | \n",
164 | " CustomerID | \n",
165 | " Gender | \n",
166 | " Age | \n",
167 | " Income | \n",
168 | " Spend | \n",
169 | "
\n",
170 | " \n",
171 | " \n",
172 | " \n",
173 | " 0 | \n",
174 | " 1 | \n",
175 | " Male | \n",
176 | " 19 | \n",
177 | " 15 | \n",
178 | " 39 | \n",
179 | "
\n",
180 | " \n",
181 | " 1 | \n",
182 | " 2 | \n",
183 | " Male | \n",
184 | " 21 | \n",
185 | " 15 | \n",
186 | " 81 | \n",
187 | "
\n",
188 | " \n",
189 | " 2 | \n",
190 | " 3 | \n",
191 | " Female | \n",
192 | " 20 | \n",
193 | " 16 | \n",
194 | " 6 | \n",
195 | "
\n",
196 | " \n",
197 | " 3 | \n",
198 | " 4 | \n",
199 | " Female | \n",
200 | " 23 | \n",
201 | " 16 | \n",
202 | " 77 | \n",
203 | "
\n",
204 | " \n",
205 | " 4 | \n",
206 | " 5 | \n",
207 | " Female | \n",
208 | " 31 | \n",
209 | " 17 | \n",
210 | " 40 | \n",
211 | "
\n",
212 | " \n",
213 | "
\n",
214 | "
"
215 | ],
216 | "text/plain": [
217 | " CustomerID Gender Age Income Spend\n",
218 | "0 1 Male 19 15 39\n",
219 | "1 2 Male 21 15 81\n",
220 | "2 3 Female 20 16 6\n",
221 | "3 4 Female 23 16 77\n",
222 | "4 5 Female 31 17 40"
223 | ]
224 | },
225 | "execution_count": 10,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "pd_df.head()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 18,
237 | "metadata": {
238 | "scrolled": false
239 | },
240 | "outputs": [
241 | {
242 | "data": {
243 | "text/html": [
244 | ""
245 | ],
246 | "text/vnd.plotly.v1+html": [
247 | ""
248 | ]
249 | },
250 | "metadata": {},
251 | "output_type": "display_data"
252 | },
253 | {
254 | "data": {
255 | "application/vnd.plotly.v1+json": {
256 | "data": [
257 | {
258 | "marker": {
259 | "size": 6
260 | },
261 | "mode": "markers",
262 | "type": "scatter",
263 | "x": [
264 | 15,
265 | 15,
266 | 16,
267 | 16,
268 | 17,
269 | 17,
270 | 18,
271 | 18,
272 | 19,
273 | 19,
274 | 19,
275 | 19,
276 | 20,
277 | 20,
278 | 20,
279 | 20,
280 | 21,
281 | 21,
282 | 23,
283 | 23,
284 | 24,
285 | 24,
286 | 25,
287 | 25,
288 | 28,
289 | 28,
290 | 28,
291 | 28,
292 | 29,
293 | 29,
294 | 30,
295 | 30,
296 | 33,
297 | 33,
298 | 33,
299 | 33,
300 | 34,
301 | 34,
302 | 37,
303 | 37,
304 | 38,
305 | 38,
306 | 39,
307 | 39,
308 | 39,
309 | 39,
310 | 40,
311 | 40,
312 | 40,
313 | 40,
314 | 42,
315 | 42,
316 | 43,
317 | 43,
318 | 43,
319 | 43,
320 | 44,
321 | 44,
322 | 46,
323 | 46,
324 | 46,
325 | 46,
326 | 47,
327 | 47,
328 | 48,
329 | 48,
330 | 48,
331 | 48,
332 | 48,
333 | 48,
334 | 49,
335 | 49,
336 | 50,
337 | 50,
338 | 54,
339 | 54,
340 | 54,
341 | 54,
342 | 54,
343 | 54,
344 | 54,
345 | 54,
346 | 54,
347 | 54,
348 | 54,
349 | 54,
350 | 57,
351 | 57,
352 | 58,
353 | 58,
354 | 59,
355 | 59,
356 | 60,
357 | 60,
358 | 60,
359 | 60,
360 | 60,
361 | 60,
362 | 61,
363 | 61,
364 | 62,
365 | 62,
366 | 62,
367 | 62,
368 | 62,
369 | 62,
370 | 63,
371 | 63,
372 | 63,
373 | 63,
374 | 63,
375 | 63,
376 | 64,
377 | 64,
378 | 65,
379 | 65,
380 | 65,
381 | 65,
382 | 67,
383 | 67,
384 | 67,
385 | 67,
386 | 69,
387 | 69,
388 | 70,
389 | 70,
390 | 71,
391 | 71,
392 | 71,
393 | 71,
394 | 71,
395 | 71,
396 | 72,
397 | 72,
398 | 73,
399 | 73,
400 | 73,
401 | 73,
402 | 74,
403 | 74,
404 | 75,
405 | 75,
406 | 76,
407 | 76,
408 | 77,
409 | 77,
410 | 77,
411 | 77,
412 | 78,
413 | 78,
414 | 78,
415 | 78,
416 | 78,
417 | 78,
418 | 78,
419 | 78,
420 | 78,
421 | 78,
422 | 78,
423 | 78,
424 | 79,
425 | 79,
426 | 81,
427 | 81,
428 | 85,
429 | 85,
430 | 86,
431 | 86,
432 | 87,
433 | 87,
434 | 87,
435 | 87,
436 | 87,
437 | 87,
438 | 88,
439 | 88,
440 | 88,
441 | 88,
442 | 93,
443 | 93,
444 | 97,
445 | 97,
446 | 98,
447 | 98,
448 | 99,
449 | 99,
450 | 101,
451 | 101,
452 | 103,
453 | 103,
454 | 103,
455 | 103,
456 | 113,
457 | 113,
458 | 120,
459 | 120,
460 | 126,
461 | 126,
462 | 137,
463 | 137
464 | ],
465 | "y": [
466 | 39,
467 | 81,
468 | 6,
469 | 77,
470 | 40,
471 | 76,
472 | 6,
473 | 94,
474 | 3,
475 | 72,
476 | 14,
477 | 99,
478 | 15,
479 | 77,
480 | 13,
481 | 79,
482 | 35,
483 | 66,
484 | 29,
485 | 98,
486 | 35,
487 | 73,
488 | 5,
489 | 73,
490 | 14,
491 | 82,
492 | 32,
493 | 61,
494 | 31,
495 | 87,
496 | 4,
497 | 73,
498 | 4,
499 | 92,
500 | 14,
501 | 81,
502 | 17,
503 | 73,
504 | 26,
505 | 75,
506 | 35,
507 | 92,
508 | 36,
509 | 61,
510 | 28,
511 | 65,
512 | 55,
513 | 47,
514 | 42,
515 | 42,
516 | 52,
517 | 60,
518 | 54,
519 | 60,
520 | 45,
521 | 41,
522 | 50,
523 | 46,
524 | 51,
525 | 46,
526 | 56,
527 | 55,
528 | 52,
529 | 59,
530 | 51,
531 | 59,
532 | 50,
533 | 48,
534 | 59,
535 | 47,
536 | 55,
537 | 42,
538 | 49,
539 | 56,
540 | 47,
541 | 54,
542 | 53,
543 | 48,
544 | 52,
545 | 42,
546 | 51,
547 | 55,
548 | 41,
549 | 44,
550 | 57,
551 | 46,
552 | 58,
553 | 55,
554 | 60,
555 | 46,
556 | 55,
557 | 41,
558 | 49,
559 | 40,
560 | 42,
561 | 52,
562 | 47,
563 | 50,
564 | 42,
565 | 49,
566 | 41,
567 | 48,
568 | 59,
569 | 55,
570 | 56,
571 | 42,
572 | 50,
573 | 46,
574 | 43,
575 | 48,
576 | 52,
577 | 54,
578 | 42,
579 | 46,
580 | 48,
581 | 50,
582 | 43,
583 | 59,
584 | 43,
585 | 57,
586 | 56,
587 | 40,
588 | 58,
589 | 91,
590 | 29,
591 | 77,
592 | 35,
593 | 95,
594 | 11,
595 | 75,
596 | 9,
597 | 75,
598 | 34,
599 | 71,
600 | 5,
601 | 88,
602 | 7,
603 | 73,
604 | 10,
605 | 72,
606 | 5,
607 | 93,
608 | 40,
609 | 87,
610 | 12,
611 | 97,
612 | 36,
613 | 74,
614 | 22,
615 | 90,
616 | 17,
617 | 88,
618 | 20,
619 | 76,
620 | 16,
621 | 89,
622 | 1,
623 | 78,
624 | 1,
625 | 73,
626 | 35,
627 | 83,
628 | 5,
629 | 93,
630 | 26,
631 | 75,
632 | 20,
633 | 95,
634 | 27,
635 | 63,
636 | 13,
637 | 75,
638 | 10,
639 | 92,
640 | 13,
641 | 86,
642 | 15,
643 | 69,
644 | 14,
645 | 90,
646 | 32,
647 | 86,
648 | 15,
649 | 88,
650 | 39,
651 | 97,
652 | 24,
653 | 68,
654 | 17,
655 | 85,
656 | 23,
657 | 69,
658 | 8,
659 | 91,
660 | 16,
661 | 79,
662 | 28,
663 | 74,
664 | 18,
665 | 83
666 | ]
667 | }
668 | ],
669 | "layout": {}
670 | },
671 | "text/html": [
672 | ""
673 | ],
674 | "text/vnd.plotly.v1+html": [
675 | ""
676 | ]
677 | },
678 | "metadata": {},
679 | "output_type": "display_data"
680 | }
681 | ],
682 | "source": [
683 | "from plotly.offline import iplot, init_notebook_mode\n",
684 | "import plotly.graph_objs as go\n",
685 | "init_notebook_mode(connected=True)\n",
686 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend , \n",
687 | " mode='markers',\n",
688 | " marker = {'size':6})\n",
689 | "iplot([trace])"
690 | ]
691 | },
692 | {
693 | "cell_type": "markdown",
694 | "metadata": {},
695 | "source": [
696 | "## KMeans\n",
697 | "`class pyspark.ml.clustering.KMeans(self, featuresCol=\"features\", predictionCol=\"prediction\", k=2, initMode=\"k-means||\", initSteps=2, tol=1e-4, maxIter=20, seed=None)\n",
698 | "`"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "**参数解释**"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 | "`\n",
713 | "initMode: 初始化算法,可以使随机的“random\",也可以是”k-means||\"\n",
714 | "initSteps: k-means||初始化的步数,需>0\n",
715 | "fit(datast,params=None)方法\n",
716 | "`"
717 | ]
718 | },
719 | {
720 | "cell_type": "markdown",
721 | "metadata": {},
722 | "source": [
723 | "`\n",
724 | "cluster: 每个训练数据点预测的聚类中心数据框\n",
725 | "clusterSize: 每个簇的大小(簇内数据点的个数)\n",
726 | "k: 模型训练的簇个数\n",
727 | "predictions: 由模型transform方法产生的数据框\n",
728 | "`"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 20,
734 | "metadata": {},
735 | "outputs": [],
736 | "source": [
737 | "from pyspark.ml.clustering import KMeans\n",
738 | "\n",
739 | "cost = list(range(2,20))\n",
740 | "for k in range(2, 20):\n",
741 | " kmeans = KMeans(k=k, seed=1)\n",
742 | " km_model = kmeans.fit(df_km)\n",
743 | " # computeCost:计算输入点与其对应的聚类中心之间的平方距离之和。\n",
744 | " cost[k-2] = km_model.computeCost(df_km)"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 21,
750 | "metadata": {},
751 | "outputs": [],
752 | "source": [
753 | "import matplotlib.pyplot as plt\n",
754 | "%matplotlib inline"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 22,
760 | "metadata": {
761 | "scrolled": false
762 | },
763 | "outputs": [
764 | {
765 | "data": {
766 | "text/plain": [
767 | "Text(0,0.5,'cost')"
768 | ]
769 | },
770 | "execution_count": 22,
771 | "metadata": {},
772 | "output_type": "execute_result"
773 | },
774 | {
775 | "data": {
776 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgEAAAFzCAYAAACn5No2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl0XOWd5vHnV6VdqpIlWbLKq7yALbM4JDIYcLCBNiRpIIekk5COg6GTJkv3pM/Qk+ltOJlepqe7p3s6OZMJabrTDQEmSSeEsKQTwuoE8IJY7IBtjGxkS7ZsyZKtfa93/qjrRbZsl1BV3Vq+n3Pq6NZ7r1y/l0ud++i9977XnHMCAAC5J+B3AQAAwB+EAAAAchQhAACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHJUnt8FJNvMmTNdXV2d32UAAJAyr7766hHnXPX5tsv6EFBXV6fGxka/ywAAIGXMbF8823E6AACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEgCnoHhjVMzsOq3do1O9SAACYNkLAFGw/cEyf/26jtrd2+10KAADTRgiYgvpIWJK0s63H50oAAJg+QsAUzCwrVHWoUDsIAQCALEAImKL6SFg723r9LgMAgGkjBExRfSSkpvZejYxF/S4FAIBpIQRM0fJIWKPjTns6+vwuBQCAaSEETNFyLg4EAGQJQsAULZxZqoK8ACEAAJDxkhICzCzfzJ7wltea2Yveq8XMNpjZSjNrPaV9qZkVmdmTZrbNzB60mLjaktGHs8kLBrR0VoiLAwEAGS/hIcDMiiW9KmmdJDnnXnDOrXbOrZa0XdLrkiok3Xu83Tn3tqT1klqdcyu89eum0JZS9ZGQdrb1yDmX6o8GACBhEh4CnHODzrlLJbWe2m5mJZKWOOe2K3bw/riZbTWzR7y/5q+T9LS3+XOSrp1CW0rVR8Lq7B9Re+9wqj8aAICESeU1AeskPestN0m6xzl3uaSIpDWSqiQdn4+3R1LlFNomMLO7zKzRzBo7OjoS3pHjMwcyaRAAIJOlMgTcLOlJb7lZ0jOnLNdIOiKp3Gsr997H2zaBc+4+51yDc66huro6oZ2QpPpa7hAAAGS+lIQAb7h/rWLD95J0t6TbzCwg6WJJbyo2SnCDt/46Sc9PoS2lykvyNWdGMRcHAgAyWqpGAlZK2uGcG/Lef1PSnZK2SHrUObdD0sOS5pjZdkldih3s421LueMXBwIAkKnykvUPO+eWnLK8VdItp7xvU2xk4NTthyXddNo/E29bytVHwnpuV7uGRsdVlB/0uxwAAKaMyYLeo+WRsKJO2n2YUwIAgMxECHiP6pk+GACQ4QgB79H8yhKVFgS5OBAAkLEIAe9RIGBaWhtirgAAQMYiBExDfSTM9MEAgIxFCJiG+khYvUNjaj066HcpAABMGSFgGrg4EACQyQgB07CsNiQzcXEgACAjEQKmobQwTwsqSxgJAABkJELANNVHwtp5iBAAAMg8hIBpqo+Eta9zQH3DY36XAgDAlBACpmm5d3Hg24wGAAAyDCFgmupnx0LADi4OBABkGELANM0uL1K4KI+LAwEAGYcQME1mdmLmQAAAMgkhIAHqI2HtauvVeJTpgwEAmYMQkADLI2ENjo5rX2e/36UAABA3QkACnJw+mIsDAQCZgxCQABfMKlMwYFwXAADIKISABCjKD2rRzFJCAAAgoxACEoQ7BAAAmYYQkCD1kbAOdg/p2MCI36UAABAXQkCCLJ/NxYEAgMxCCEiQ+khIkjglAADIGISABKkJFWlmWQEhAACQMQgBCVQfCWsnTxMEAGQIQkAC1UfC2n2oT6PjUb9LAQDgvAgBCVQfCWlkPKq9HUwfDABIf4SABDo5fTCnBAAA6Y8QkECLq8tUEAwQAgAAGYEQkED5wYCW1JRpByEAAJABCAEJFps+mAmDAADpjxCQYPWRkI70Daujd9jvUgAAOCdCQIKdnD6YUwIAgPRGCEiw5dwhAADIEEkJAWaWb2ZPeMsrzazVzF70XkvNrMjMnjSzbWb2oMW857Zk9OG9mlFSoEh5ESEAAJD2Eh4CzKxY0quS1nlNFZLudc6t9l5vS1ovqdU5t8Jbv26abWmFiwMBAJkg4SHAOTfonLtUUqvXVCHp42a21cwe8f5yv07S09765yRdO822tFIfCampo09Do+N+lwIAwFml4pqAJkn3OOculxSRtEZSlaRub32PpMpptqWV+khY41GnpvY+v0sBAOCsUhECmiU9c8pyjaQjksq9tnLv/XTaJjCzu8ys0cwaOzo6EtiV+ByfPphJgwAA6SwVIeBuSbeZWUDSxZLelPSspBu89ddJen6abRM45+5zzjU45xqqq6sT3qHzqasqVVE+0wcDANJbKkLANyXdKWmLpEedczskPSxpjpltl9Sl2IF9Om1pJRgwLa0NEwIAAGktL1n/sHNuifezTdLa09YNS7rptF+ZTlvaWR4J6T9+fUjOOaXZXYwAAEhisqCkWR4Jq3twVG3dQ36XAgDApAgBSVLPzIEAgDRHCEiSZYQAAECaIwQkSVlhnuZXljBzIAAgbRECkqg+EmKuAABA2iIEJFF9JKzmzn4NjIz5XQoAAGcgBCRRfSQs56RdhzglAABIP4SAJFrOxYEAgDRGCEiiuRXFChXmEQIAAGmJEJBEZqZlkRB3CAAA0hIhIMnqI2HtautRNOr8LgUAgAkIAUm2PBJW/8i4Wo4O+F0KAAATEAKSjOmDAQDpihCQZEtrQwqYtIPrAgAAaYYQkGRF+UEtnFnKSAAAIO0QAlKgPhLWjoOEAABAeiEEpEB9JKwDxwbVPTjqdykAAJxACEiB4zMH7uKUAAAgjRACUoA7BAAA6YgQkAKzwoWqKMln5kAAQFohBKSAmak+EtbOQ4wEAADSByEgReojYb19qFdj41G/SwEAQBIhIGWWR8IaHouqubPf71IAAJBECEiZ4xcHMnMgACBdEAJSZElNmfKDxh0CAIC0QQhIkYK8gBZXlxECAABpgxCQQsuZPhgAkEYIASlUHwmrvXdYnX3DfpcCAAAhIJVOzhzIxYEAAP8RAlKoPhKSxPTBAID0QAhIoaqyQtWECgkBAIC0QAhIsfpIWDsIAQCANEAISLH6SFh7Ovo0Msb0wQAAfxECUmz57LBGx52a2vv8LgUAkOMIASm2nIsDAQBpghCQYnVVpSrMCxACAAC+S0oIMLN8M3vilPcPmNlmM3vczPLMbKWZtZrZi95rqZkVmdmTZrbNzB60mLjaktGHZMkLBrS0NqSdhwgBAAB/JTwEmFmxpFclrfPer5aU55xbJSks6QZJFZLudc6t9l5vS1ovqdU5t8Jbv24KbRmlvjY2fbBzzu9SAAA5LOEhwDk36Jy7VFKr13RY0jdO+7wKSR83s61m9oj31/x1kp721j8n6doptGWU+khIRwd
GdbiH6YMBAP5J+jUBzrl3nHNbzexWSVFJv5DUJOke59zlkiKS1kiqktTt/VqPpMoptE1gZneZWaOZNXZ0dCSnY9NwcvpgTgkAAPyTkgsDzewWSV+RdLNzbkxSs6RnvNXNkmokHZFU7rWVe+/jbZvAOXefc67BOddQXV2d6O5M2zIvBDBpEADAT0kPAWZWK+mrkm5yzh1/cs7dkm4zs4CkiyW9KelZxa4XkGJD/s9PoS2jlBfna86MYkYCAAC+SsVIwAbFhvyf8u4E+B1J35R0p6Qtkh51zu2Q9LCkOWa2XVKXYgf7eNsyTn0kTAgAAPgqL1n/sHNuiffzbyX97SSbrD1t+2FJN522TbxtGWf57LCe23VYQ6PjKsoP+l0OACAHMVmQT5ZHQoo66e1DveffGACAJCAE+IQ7BAAAfiME+GReRYlKC4KEAACAbwgBPgkETMsiYW4TBAD4hhDgo/pISLvaepk+GADgC0KAj+ojYfUOj6n16KDfpQAAchAhwEf1zBwIAPARIcBHy2pDMuMOAQCAPwgBPiopyFNdVSkhAADgC0KAz+ojIe1sY8IgAEDqEQJ8tjwS1v6uAfUOjfpdCgAgxxACfHb84kCmDwYApBohwGdMHwwA8AshwGeR8iKVF+drB9cFAABSjBDgMzNTfSTEXAEAgJQjBKSB+khYbx/q0XiU6YMBAKlDCEgD9ZGwhkajau7s97sUAEAOIQSkgeVcHAgA8AEhIA0sqSlTMGCEAABAShEC0kBRflCLq0uZORAAkFKEgDRRHwkzEgAASClCQJpYHgmrrXtIxwZG/C4FAJAjCAFp4vjMgcwXAABIFUJAmjg5fTDXBQAAUoMQkCaqQ4WaWVbIdQEAgJQhBKSR+khIOw4SAgAAqUEISCPLI2E1tfdpdDzqdykAgBxACEgjl82v0Mh4VL96p8PvUgAAOYAQkEaur69RTahQD7y8z+9SAAA5gBCQRvKDAX3migXauLtDezv6/C4HAJDlCAFp5tNXzFN+0PTdTYwGAACSixCQZmpCRfrIJRH96NVW9Q2P+V0OACCLEQLS0Iar6tQ3PKYfv9bqdykAgCxGCEhDl82boUvnluuBl5vlnPO7HABAliIEpCEz04Yr67Sno18vNXX6XQ4AIEslJQSYWb6ZPeEtF5nZk2a2zcwetJiEtiWjD377zUsjqiot0P0vN/tdCgAgSyU8BJhZsaRXJa3zmtZLanXOrZBU4bUnui3rFOUHddvl8/TsrsNq6RrwuxwAQBZKeAhwzg065y6VdPyqtuskPe0tPyfp2iS0ZaX1qxYoYKYHN3O7IAAg8d5TCDCzq6eweZWkbm+5R1JlEtpOr+8uM2s0s8aOjsydgjdSXqwbL5qlH7zSosGRcb/LAQBkmbhCgJk9d1rT303hM45IKveWy733iW6bwDl3n3OuwTnXUF1dPYVS08+GK+vUPTiqx9444HcpAIAsc84QYGaXmtkGSbPN7Hbv9WVJQ1P4jGcl3eAtXyfp+SS0Za3LF1ZqWW1I93O7IAAgwc43EmCT/Dwi6bem8BkPS5pjZtsldSl2EE90W9YyM224qk67DvVq67tdfpcDAMgiFs9fl2b2N865P05BPQnX0NDgGhsb/S5jWgZHxrXqfz6rq5dU6Vuf+YDf5QAA0pyZveqcazjfdvFeGPinZhY2s6CZXWtmoWnWhykoLgjqUyvn6am3Dqute9DvcgAAWSLeEPADSddI+ntJn5P0k6RVhEl9dtUCRZ3Tw5v3+10KACBLxBsCIs65JyUtcs6tl1SWxJowiXmVJbp+2Sx9b+t+DY1yuyAAYPriDQFdZvYTSb82s5skHUtiTTiLO66qU2f/iH66vc3vUgAAWSDeEPAJSX/hnPtvis0E+MnklYSzuXpJlRZXl+q7m5r9LgUAkAXiDQHjkj5gZv8oqUFSf/JKwtkcv11wW2u3Xt9/1O9yAAAZLt4QcL+kOZJ+7v28P0n14Dw+9v65KivM0wM8XRAAME3xhoA659x/d8495Zz7c0l1SawJ51BWmKff+sBc/fTXbWrvncrEjQAATBRvCNhvZn9mZteZ2Z9J4j41H91+5QKNjjt9b0uL36UAADJYvCHgi5KCik0X3CPpC0mrCOe1qLpM11xYrYe37NPoeNTvcgAAGSreEPBdxe4K+D3Fntz3b0mrCHG546oFau8d1s/fPOR3KQCADBVvCKhxzv2ri/krSbOSWRTOb+2FNVpQVcIFggCA9yzeELDPzP7Ie27AH0s6mMyicH6BgOmzqxaocd9RvXmg2+9yAAAZKN4QcIekAcWuCeiXtCFZBSF+n2iYp+L8IKMBAID3JK4Q4Jwbds79H+fc73k/uTctDZQX5+vW98/RY9sO6mj/iN/lAAAyTLwjAUhTG66s08hYVN9/hdsFAQBTQwjIcEtrQ7pyUZUe2rxPY9wuCACYAkJAFthwVZ0OHBvUMzvb/S4FAJBBCAFZ4DfqazRnRjEXCAIApoQQkAXyggGtX7VAm/Z2avfhXr/LAQBkCEJAlvjUynkqyAswGgAAiBshIEtUlhbooytm68evHVD34Kjf5QAAMgAhIItsuKpOg6Pj+mEjtwsCAM6PEJBFLp5TroYFFXpw8z5Fo87vcgAAaY4QkGU2XFWnfZ0D2ri7w+9SAABpjhCQZT50ca1qQoW6nwsEAQDnQQjIMvnBgD5zxQJt3N2hvR19fpcDAEhjhIAs9Okr5ik/aPrupn1+lwIASGOEgCxUEyrSb14S0Y9ebVXf8Jjf5QAA0hQhIEttuKpOfcNjevS1Vr9LAQCkKUJAlnrfvBm6dG65Hti0T85xuyAA4EyEgCxlZtpwZZ2a2vv0UlOn3+UAANIQISCL3bQioqrSAm4XBABMihCQxQrzgvr05fP17K7Dauka8LscAECaIQRkuc+smq+AmR7azO2CAICJUhICzGytmb3ovVrM7Gtm1npK21IzKzKzJ81sm5k9aDFxtaWiD5kqUl6sD11Uq++/0qLBkXG/ywEApJGUhADn3AvOudXOudWStks6Kune423OubclrZfU6pxbIalC0roptOEcbr9ygboHR/XYGwf8LgUAkEZSejrAzEokLZF0WNLHzWyrmT3i/TV/naSnvU2fk3TtFNpwDpcvrNSy2pDuf7mZ2wUBACek+pqAdZKeldQk6R7n3OWSIpLWSKqS1O1t1yOpcgptE5jZXWbWaGaNHR08Tc/MdMdVddp1qFdb3+3yuxwAQJpIdQi4WdKTkpolPeO1NUuqkXREUrnXVu69j7dtAufcfc65BudcQ3V1dcI7kYk++r45Ki/O53kCAIATUhYCvCH/tYoN4d8t6TYzC0i6WNKbio0Q3OBtfp2k56fQhvMoLgjqtpXz9PO3DnG7IABAUmpHAlZK2uGcG5L0TUl3Stoi6VHn3A5JD0uaY2bbJXUpdrCPtw1xuPPqhQqY9C+/2ut3KQCANJCXqg9yzm2VdIu33KbYqMCp64cl3XTar8XbhjjUlhfpY5fN1fdfadF/uv4CzSwr9LskAICPmCwox9y1ZpFGxqN6gKmEASDnEQJyzO
LqMn3oolo98HKz+obH/C4HAOAjQkAO+uKaxeoZGtP3tuz3uxQAgI8IATloxbwZunpJlf7lxb0aHmMqYQDIVYSAHPWlNUt0uGdYP3mdqYQBIFcRAnLU1UuqdMmccn17416NR5lKGAByESEgR5mZvrR2sd490q+n3jrkdzkAAB8QAnLYjRfVauHMUt37wh4eLAQAOYgQkMOCAdMXrlmkXx/o1ktNnX6XAwBIMUJAjrv1/XNUEyrUvRub/C4FAJBihIAcV5gX1Oc/uFAvNXVqW8sxv8sBAKQQIQD69OXzFS7K07c37vG7FABAChECoFBRvm6/sk4/f+uQ9nT0+V0OACBFCAGQJN1xdZ0KggHdt5HHDANAriAEQJI0s6xQn1o5Tz9+vVWHuof8LgcAkAKEAJzwux9cpKiTvvMiowEAkAsIAThhXmWJbr40ov+3Zb+ODYz4XQ4AIMkIAZjgi2sXq39kXA9u2ud3KQCAJCMEYIJltWFdt6xG//ZyswZHeMwwAGQzQgDO8KW1i9XVP6J/b2zxuxQAQBIRAnCGlXWValhQoft+uVej41G/ywEAJAkhAJP60trFOnBsUE9uP+h3KQCAJCEEYFLXLq3R0lkh3fvCHkWjPGYYALIRIQCTCgRMX1y7SLsP9+n5t9v9LgcAkASEAJzVTZfO1pwZxbr3BR4sBADZiBCAs8oPBnTXNYvUuO+oXmnu8rscAECCEQJwTp9smKfK0gJGAwAgCxECcE7FBUHdeVWdntvVrp1tPX6XAwBIIEIAzuv2K+tUWhDUP21kNAAAsgkhAOdVXpKv375ivp7Y3qaWrgG/ywEAJAghAHH53OpFCpj0z7/iMcMAkC0IAYhLbXmRPnbZXP3glRYd6Rv2uxwAQAIQAhC3u9Ys0sh4VPe/1Ox3KQCABCAEIG6Lq8v0oYtq9d1NzeodGvW7HADANBECMCVfXLNYPUNj+t7W/X6XAgCYppSEADNbaWatZvai91phZk+a2TYze9Biit5rWyr6gJgV82bo6iVV+pdfvavhsXG/ywEATEOqRgIqJN3rnFvtnFstaaWkVufcCm/dOknrp9GGFPrSmiVq7x3Wj1874HcpAIBpSGUI+LiZbTWzRyRdL+lpb91zkq6VdN002pBCVy+p0iVzyvVPG/donMcMA0DGSlUIaJJ0j3PuckkRSR+T1O2t65FUKalqGm1IITPTl9YuVnPngH7+5iG/ywEAvEepCgHNkp45ZTkqqdx7Xy7piPd6r20TmNldZtZoZo0dHR2J7Ac8N15Uq4UzS3XvxiY5x2gAAGSiVIWAuyXdZmYBSRdL+kNJN3jrrpP0vKRnp9E2gXPuPudcg3Ouobq6OvG9gYIB0xeuWaQ3D/ToxaYzchgAIAOkKgR8U9KdkrZIelTSdyTNMbPtkroUO7A/PI02+ODW989RTaiQxwwDQIbKS8WHOOfaJK09rfmm094PT6MNPijMC+rzH1yov/6PXXqj5ZjeN2+G3yUBAKaAyYIwLZ++fL7CRXn6NqMBAJBxCAGYllBRvm6/sk5P7TikpvY+v8sBAEwBIQDTdsfVdSoIBnTfLxkNAIBMQgjAtM0sK9RtK+fp0dcPqK170O9yAABxIgQgIT7/wUWKOukbz7yjkbGo3+UAAOJACEBCzKss0Scb5ur7r7Toqr95Tv/rqV1qPTrgd1kAgHOwbJ/traGhwTU2NvpdRk6IRp02vtOhhzfv03O72iVJ1y6t0fpVC3TNhdUKBnjgIwCkgpm96pxrOO92hAAkQ+vRAX1/a4u+/0qLjvQNa25FsX77ivn6ZMM8zSwr9Ls8AMhqhAAPIcBfI2NR/WLHIT20eZ827+1SftD04YsjWr9qgVbWVciM0QEASDRCgIcQkD6a2nv10Ob9euS1VvUOjenCWWVav2qBbr1sjkJF+X6XBwBZgxDgIQSkn4GRMT2x7aAe2rxfvz7QrZKCoD76vjlav2q+Lppdfv5/AABwToQADyEgvW1rOaaHNu/TE9sPamg0qsvmz9D6KxboNy+NqCg/6Hd5AJCRCAEeQkBm6B4Y1SOvteqhLfu0t6NfM0ry9YkPzNVvX7FAC2eW+l0eAGQUQoCHEJBZnHPatLdTD2/er6feOqSxqNMHL5ipz1wxX79RP0t5Qaa2AIDzIQR4CAGZq71nSD94pUXf27pfB7uHNCtcqM+vXqQ7rq5TPmEAAM6KEOAhBGS+sfGonn+7Qw+83KwXm45o6ayQ/urWi7WyrtLv0gAgLcUbAvhzCmkvLxjQuuWz9NDnr9A/396gvuExfeLbm/RHP9quo/0jfpcHABmLEICMsm75LD199zX6wjWL9KPXWnX9/96oHza2KNtHtAAgGQgByDglBXn6k4/U66dfWa2FM0v11R9t16fu26x3Dvf6XRoAZBRCADLWstqwfviFK/W3H79Euw/36sPf+JX+7ue7NDgy7ndpAJARCAHIaIGA6VMr5+vZu9foo++bo2+9sEc3fH2jnveeYggAODtCALJCVVmh/uGTK/S9312lgmBAd97/ir788Ks61D3kd2kAkLYIAcgqVy6u0s/+4Bp99calenZnu67/hxf0ry++q7HxqN+lAUDaIQQg6xTkBfR71y7R0/95jRrqKvUXT+7QR//vS3qj5ZjfpQFAWiEEIGvNryrR/Xeu1Lc+834d6RvWrd96Sff85E11D476XRoApAVCALKamekjl0T0zN1rdMdVdXp4yz5d/w8b9dgbB5hbAEDOIwQgJ4SK8vW1my/S47+/WrNnFOkPvv+GPvudrXr3SL/fpQGAbwgByCkXzynXo1++Wn/50Yu0reWYbvz6L/X1Z3ZraJS5BQDkHkIAck4wYPrslXV69g/X6MaLavX1Z97Rh7/xK734zhG/SwOAlOIpgsh5v9zdoXsee1P7Oge0dFZIS2rKtLimTEtqyrSkukyLqktVlB/0u0wAiBuPEvYQAhCPodFx3f9ys155t0tNHX1q6RpQ1PtqmElzK4q1pLpMi6u9cFATW64oLfC3cACYBCHAQwjAezE0Oq7mzn41tfdpT3u/mjr61NTep70dfRoeOznxUFVpgRbXnB4OSjW7vFiBgPnYAwC5LN4QkJeKYoBMU5Qf1LLasJbVhie0j0edDh4bjIUDLxg0tffpZ2+26djAyfkHivODWlxTGgsHp40eEA4ApAtCADAFwYBpXmWJ5lWW6NplNRPWdfYNe+EgNoLQ1NGnxuajeuyNgye2qQ0X6aZLI7rlfbN1yZxymREIAPiH0wFAkg2MjGlvR792tvXoqbcOa+Pudo2OOy2cWaqbvUCwpCbkd5kAsgjXBHgIAUg33QOj+vlbbXp820Ft2tOpqJPqI2HdsmK2bl4R0dyKEr9LBJDh0i4EmNkDkpZKapf0l5IeldTsrf6cpH2SfiRpnqTtkm6XVBhPmztHJwgBSGftvUP66fZYIHh9f+wBRx9YUKFbVszWRy6JqDpU6HOFADJRvCEgJZMFmdlqSXnOuVWSwpIiku51zq32Xm9LWi+p1Tm3QlKFpHVTaAMyUk2oSHdevVCPfvlq/eq/Xquv3rhU/
cNj+trjb+mKv35Gn/3OFv17YwsPPQKQFCkZCTCzCyRVOOe2mtkvJX1H0h9IGpPUIum3JD0s6RHn3CNmdrekakkL4mlzzv3J2T6bkQBkot2He/X4Gwf1+LaD2t81oIJgQGuXVuuW983W9ctmqbiAyYsAnF1a3SLonHtHkszsVklRSbsk3eOc+6mZvSxpjaQqSd3er/Qoduog3rYJzOwuSXdJ0vz585PQIyC5LpwV0n+5can+8IYLtb21W49vO6gntx/UL3YcVklBUOuWz9ItK2brgxdUqyAvMQN6I2NRHRsYUdfAiI72j+rYwIiODozq6MCIjvaPaHgsqvcvmKEPXlCtmWWcpgCyQcpuETSzWyR9RdLNkgokveGtapZUI+mIpHKvrdx7XxZn2wTOufsk3SfFRgIS2xMgdcxMK+bN0Ip5M/SnH6nX1ne79Pi2g/rZm2167I2DmlGSrw9fXKubV8zWFQuotJkrAAALJElEQVSrFAyYnHPqHxnX0f4RHTt+EPcO5EcHTju4n3LA7x85+0OUivODCgZMD27eJ0m6ZE651lxYrTVLq3XZvBnKC/IYEiATpep0QK2kH0r6kHOu38z+h6Tdkh5ULAzcJmmVpCucc18ws59K+kdJ8+Npc849c7bP5nQAstHIWFQvNnXo8TdiowMDI+OqLC1QXsB0bGBUI+PRs/5uuChPlaUFmlFSoIqSfFWUxJYrS/O9tlh7rK1AM0ryVZQfVDTq9ObBbm18u0O/fKdDr+0/pvGoU6gwT1cvmak1S6t1zYXVmjOjOIX/JQBMJq3uDjCzP5L0u5IOeU0/U+yCvlJJ/+Gc+5qZFUp6RLGD/DbF7gQoiKeNuwOQywZHxvXsrsN6fleH8gKmitJTD+75Ew745cX5CfurvXtwVC83HdHG3R3auLtDbd1DkqQLasq05sJYILh8YSUPXwJ8kFYhwE+EACD5nHNqau87EQi2vNulkbGoivIDWrWoKnbq4MJqLZxZyiyJQAoQAjyEACD1BkfGtfndztipg90d2nukX5I0r7JY11wQCwRXLZmpskJmLgeSgRDgIQQA/tvfOaCN73Ro49sd2rTniPpHxpUXMDXUVWjNhTVac2G16iMhRgmABCEEeAgBQHoZGYvq1X1HT5w62NnWIyn2WObls8Oqj4RVHwlpWW1Yi6vLEnYLJJBLCAEeQgCQ3tp7hvTLd45o895O7TrUo92H+zQyFru7IT9oWlxdpuWRWDhYFgmpPhJmngLgPAgBHkIAkFnGxqN690i/drT1aNehXu1s69HOth4d7hk+sc3MskLVe4GAUQPgTGk1YyAAxCsvGNAFs0K6YFZIHz2lvat/RLvaerTTCwa7DvXo/pebJ4waLKkJqb42xKgBECdCAICMUFlaoKuWzNRVS2aeaJts1OClPUf049cPnNjm+KjB4uoyVYcKVVVaoKqyQlWVFZxYLi0IclEichIhAEDGmsqowQ8bW846NXJhXkAzywpVWVrghYNCzSwr8N5PDAxVpQVMgISsQQgAkHUmGzWQpKHRcXX2j6izb9j7edpy/7A6+0b0zuE+dfQNnzjVcLqywrwzAsOMkgKFivJUVhh7hYryVFaUp1Bhfuynt64wL8CoA9IGIQBAzijKD2rOjOK4nm9w/EFM5wsMB44NanvrsfM+s+G4/KApVJR/IiyUFeUpXHRyuawwX6FTQkMsUOSrIC+ggEkBMwXMZN6ynWiLPXDq9G3slPfHt5nsd8oK8wgnOYgQAACTMLMTB+EFVaVx/c7w2Lj6hsbUNzym3qHYq294TH3DoxPfD42pd2j0xHZt3UMn1vUOjWp0PPV3bVWHCrVqUZWuXFSlVYsqmeI5RxACACBBCvOCKiwLqmqadyQMj43HQoEXDHqGRjUyFpVTbIQiGpWizinqvPdOcjr1/clt3CnrJvxO1MlJirrYBZY72nq0aU+nnth2UJI0K3xqKKjSgqoSQkEWIgQAQJo5HiZSfXujc07vHunXpr2d2ry3Sy81deqxN2KhIFJedCIQXLm4SnMrigkFWYDJggAAk3LOaU+HFwr2dGrz3k519o9IkubMKNYq79RBLBSU+FwtTsWMgR5CAAAkhnNO77T3afPeTm3yQsHRgVFJsSdErloYGyVYtahKs+O4+BLJQwjwEAIAIDmiUafd7b3avKdTm/Z2asu7XTrmhYIFVSUTQkFteZHP1eYWQoCHEAAAqRGNOu061OtdU9CpLXs71TM0JkkqL87XvMpiza8s0bzKEs2rKNH8ythr9oxinvuQYIQADyEAAPwxHnXa2dajre926d0j/drfNaCWowNq7RqcMKdCwKRIebHmVRafDAdVJZrrLc8sK+AixCniAUIAAF8FA6aL55Tr4jnlE9qjUafDvUNq6RrU/q6BWDjwXht3d6i9d3jC9sX5wclHEapKNLeiWCUFmXsoc85paDSqonx/ZpLM3P9yAICMFAiYIuXFipQX6/KFlWesHxodV+vRWDjY3zmglqODJ4LCpj2dZzwDoqq0QOXF+SopDKqkIE+lBUGVFOapJD+o0sI8lRSc8rMgTyWF3s+CM9eXFOQpGDj3wdg5p4GRcfUPH58M6uQkUP0jY+objk0ader6M5a9OSD6R8Y1HnV6689vVGlh6g/JhAAAQFopyg9qSU1IS2pCZ6xzzqmrf8Q7tTColq4BtR4dVN/wmAaGYwfhTm/98QP18QNt/J8fUEnBydBQmB84edD3DvTx/HMBk0oLT07/fHx5VqjIWw6qrCjWHvDpdAchAACQMczMe7JjoS6bXxHX7zjnNDIe1cDwuPpHxk4c0AdHxtU/Mq6BkTH1D5/2c2TsxPZDo1HNrYgFgrKiiQf0M5eDJ7Ypzk//R1QTAgAAWc3MYrMw5gVVUVrgdzlphXsyAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEAAAAchQhAACAHEUIAAAgRxECAADIUeZc/M9YzkRm1iFpn48lzJR0xMfPTzX6m71yqa8S/c122d7fBc656vNtlPUhwG9m1uica/C7jlShv9krl/oq0d9sl2v9PRtOBwAAkKMIAQAA5ChCQPLd53cBKUZ/s1cu9VWiv9ku1/o7Ka4JAAAgRzESAABAjiIEJIiZPWBmm83scTPLm2T9SjNrNbMXvddSP+pMhHj6YmZFZvakmW0zswfNzPyoNRHMbO0pfW0xsw2TbJPx+9fM8s3sCW85rv2Xyfv51P5678/5Hfa2ydj9fNr+jasfmbp/T+vreb+/3nYZu2+ngxCQAGa2WlKec26VpLCkGybZrELSvc651d7r7ZQWmVjx9GW9pFbn3Apv+3UprTCBnHMvHO+rpO2SXp9ks4zev2ZWLOlVndxP8e6/jNzPp/c3zu+wlKH7eZL9G28/Mm7/nt7XOL+/Uobu2+kiBCTGYUnf8JbP9t+0QtLHzWyrmT2SKYn6LOLpy3WSnvaWn5N0bcqqSxIzK5G0xDm3fZLVGb1/nXODzrlLJbV6TfHuv4zcz5P0N57vsJSh+3mS/sbbj4zb
v5P0VdJ5v79Shu7b6SIEJIBz7h3n3FYzu1VSVNIvJtmsSdI9zrnLJUUkrUlljQkWT1+qJHV7yz2SKlNUWzKtk/TsWdZl0/6V4t9/WbGf4/wOS9mzn+PtR1bsX8+5vr9S9uzbKZn0vBemzsxukfQVSTc758Ym2aRZ0punLNekprKkaNb5+3JEUrm3XK7smJ7zZkk/Psu6ZmXP/pXi339Zs5/j+A5L2bOfmxVfP7Jm/+rc318pe/btlDASkABmVivpq5Jucs71nmWzuyXdZmYBSRfr5P9smSievjyrk+dVr5P0fIpqSwpvaHCtYkOik8mm/SvFv/+yYj/H+R2Wsmc/x9uPbNm/5/v+Stmzb6eEEJAYGxQbPnrKu6r0c2b296dt801Jd0raIulR59yOVBeZQBP6Imlwkv4+LGmOmW2X1KVzD8NlgpWSdjjnhsxsYZbvX2mS/XeWfmfLfj79O/w7Wb6fz+hHlu/fE99fScryfTslTBYEAECOYiQAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQBAwpnZHWZ2h991ADg3QgAAADmKEAAgaczsIjN73sxCftcC4ExMGwwgWSKKTTbzofPMwgfAJ4wEAEiW31fsSW4L/C4EwOQIAQCS5S8lfcn7CSANEQIAJMuQc65F0i7vCX0A0gzPDgAAIEcxEgAAQI4iBAAAkKMIAQAA5ChCAAAAOYoQAABAjiIEAACQowgBAADkqP8P9dzuqF7ZCiIAAAAASUVORK5CYII=\n",
777 | "text/plain": [
778 | ""
779 | ]
780 | },
781 | "metadata": {},
782 | "output_type": "display_data"
783 | }
784 | ],
785 | "source": [
786 | "fig, ax = plt.subplots(1,1, figsize=(8,6))\n",
787 | "ax.plot(range(2,20), cost)\n",
788 | "ax.set_xlabel('k')\n",
789 | "ax.set_ylabel('cost')"
790 | ]
791 | },
792 | {
793 | "cell_type": "markdown",
794 | "metadata": {},
795 | "source": [
796 | "可以见到在k=5时,出现了拐角,我们取k=5"
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "execution_count": 23,
802 | "metadata": {},
803 | "outputs": [],
804 | "source": [
805 | "kmeans = KMeans(k=5, seed=1)\n",
806 | "km_model = kmeans.fit(df_km)\n",
807 | "centers = km_model.clusterCenters()"
808 | ]
809 | },
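{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check (a sketch, assuming Spark >= 2.3): besides the elbow on computeCost,\n",
"# the silhouette score from ClusteringEvaluator gives another view of how well k=5 separates the data.\n",
"from pyspark.ml.evaluation import ClusteringEvaluator\n",
"evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='prediction')\n",
"evaluator.evaluate(km_model.transform(df_km))"
]
},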
810 | {
811 | "cell_type": "code",
812 | "execution_count": 24,
813 | "metadata": {},
814 | "outputs": [
815 | {
816 | "data": {
817 | "text/plain": [
818 | "[array([55.2962963 , 49.51851852]),\n",
819 | " array([25.72727273, 79.36363636]),\n",
820 | " array([86.53846154, 82.12820513]),\n",
821 | " array([88.2 , 17.11428571]),\n",
822 | " array([26.30434783, 20.91304348])]"
823 | ]
824 | },
825 | "execution_count": 24,
826 | "metadata": {},
827 | "output_type": "execute_result"
828 | }
829 | ],
830 | "source": [
831 | "centers"
832 | ]
833 | },
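{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The fitted model also exposes the summary attributes listed above (a sketch,\n",
"# assuming Spark >= 2.1): clusterSizes shows how many customers fall into each cluster.\n",
"if km_model.hasSummary:\n",
"    print(km_model.summary.k)\n",
"    print(km_model.summary.clusterSizes)"
]
},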
834 | {
835 | "cell_type": "code",
836 | "execution_count": 32,
837 | "metadata": {},
838 | "outputs": [],
839 | "source": [
840 | "transformed = km_model.transform(df_km).select('CustomerID', 'prediction')"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 33,
846 | "metadata": {},
847 | "outputs": [
848 | {
849 | "name": "stdout",
850 | "output_type": "stream",
851 | "text": [
852 | "+----------+----------+\n",
853 | "|CustomerID|prediction|\n",
854 | "+----------+----------+\n",
855 | "| 1| 4|\n",
856 | "| 2| 1|\n",
857 | "| 3| 4|\n",
858 | "+----------+----------+\n",
859 | "only showing top 3 rows\n",
860 | "\n"
861 | ]
862 | }
863 | ],
864 | "source": [
865 | "transformed.show(3)"
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 35,
871 | "metadata": {},
872 | "outputs": [],
873 | "source": [
874 | "df_pred = df.join(transformed, 'CustomerID')"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": 36,
880 | "metadata": {},
881 | "outputs": [
882 | {
883 | "name": "stdout",
884 | "output_type": "stream",
885 | "text": [
886 | "+----------+------+---+------+-----+----------+\n",
887 | "|CustomerID|Gender|Age|Income|Spend|prediction|\n",
888 | "+----------+------+---+------+-----+----------+\n",
889 | "| 1| Male| 19| 15| 39| 4|\n",
890 | "| 2| Male| 21| 15| 81| 1|\n",
891 | "| 3|Female| 20| 16| 6| 4|\n",
892 | "+----------+------+---+------+-----+----------+\n",
893 | "only showing top 3 rows\n",
894 | "\n"
895 | ]
896 | }
897 | ],
898 | "source": [
899 | "df_pred.show(3)"
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": 39,
905 | "metadata": {},
906 | "outputs": [
907 | {
908 | "data": {
909 | "application/vnd.plotly.v1+json": {
910 | "data": [
911 | {
912 | "marker": {
913 | "color": [
914 | 4,
915 | 1,
916 | 4,
917 | 1,
918 | 4,
919 | 1,
920 | 4,
921 | 1,
922 | 4,
923 | 1,
924 | 4,
925 | 1,
926 | 4,
927 | 1,
928 | 4,
929 | 1,
930 | 4,
931 | 1,
932 | 4,
933 | 1,
934 | 4,
935 | 1,
936 | 4,
937 | 1,
938 | 4,
939 | 1,
940 | 4,
941 | 1,
942 | 4,
943 | 1,
944 | 4,
945 | 1,
946 | 4,
947 | 1,
948 | 4,
949 | 1,
950 | 4,
951 | 1,
952 | 4,
953 | 1,
954 | 4,
955 | 1,
956 | 4,
957 | 0,
958 | 4,
959 | 1,
960 | 0,
961 | 0,
962 | 0,
963 | 0,
964 | 0,
965 | 0,
966 | 0,
967 | 0,
968 | 0,
969 | 0,
970 | 0,
971 | 0,
972 | 0,
973 | 0,
974 | 0,
975 | 0,
976 | 0,
977 | 0,
978 | 0,
979 | 0,
980 | 0,
981 | 0,
982 | 0,
983 | 0,
984 | 0,
985 | 0,
986 | 0,
987 | 0,
988 | 0,
989 | 0,
990 | 0,
991 | 0,
992 | 0,
993 | 0,
994 | 0,
995 | 0,
996 | 0,
997 | 0,
998 | 0,
999 | 0,
1000 | 0,
1001 | 0,
1002 | 0,
1003 | 0,
1004 | 0,
1005 | 0,
1006 | 0,
1007 | 0,
1008 | 0,
1009 | 0,
1010 | 0,
1011 | 0,
1012 | 0,
1013 | 0,
1014 | 0,
1015 | 0,
1016 | 0,
1017 | 0,
1018 | 0,
1019 | 0,
1020 | 0,
1021 | 0,
1022 | 0,
1023 | 0,
1024 | 0,
1025 | 0,
1026 | 0,
1027 | 0,
1028 | 0,
1029 | 0,
1030 | 0,
1031 | 0,
1032 | 0,
1033 | 0,
1034 | 0,
1035 | 0,
1036 | 0,
1037 | 2,
1038 | 3,
1039 | 2,
1040 | 0,
1041 | 2,
1042 | 3,
1043 | 2,
1044 | 3,
1045 | 2,
1046 | 0,
1047 | 2,
1048 | 3,
1049 | 2,
1050 | 3,
1051 | 2,
1052 | 3,
1053 | 2,
1054 | 3,
1055 | 2,
1056 | 0,
1057 | 2,
1058 | 3,
1059 | 2,
1060 | 3,
1061 | 2,
1062 | 3,
1063 | 2,
1064 | 3,
1065 | 2,
1066 | 3,
1067 | 2,
1068 | 3,
1069 | 2,
1070 | 3,
1071 | 2,
1072 | 3,
1073 | 2,
1074 | 3,
1075 | 2,
1076 | 3,
1077 | 2,
1078 | 3,
1079 | 2,
1080 | 3,
1081 | 2,
1082 | 3,
1083 | 2,
1084 | 3,
1085 | 2,
1086 | 3,
1087 | 2,
1088 | 3,
1089 | 2,
1090 | 3,
1091 | 2,
1092 | 3,
1093 | 2,
1094 | 3,
1095 | 2,
1096 | 3,
1097 | 2,
1098 | 3,
1099 | 2,
1100 | 3,
1101 | 2,
1102 | 3,
1103 | 2,
1104 | 3,
1105 | 2,
1106 | 3,
1107 | 2,
1108 | 3,
1109 | 2,
1110 | 3,
1111 | 2,
1112 | 3,
1113 | 2
1114 | ],
1115 | "colorscale": "Viridis",
1116 | "size": 10
1117 | },
1118 | "mode": "markers",
1119 | "type": "scatter",
1120 | "x": [
1121 | 15,
1122 | 15,
1123 | 16,
1124 | 16,
1125 | 17,
1126 | 17,
1127 | 18,
1128 | 18,
1129 | 19,
1130 | 19,
1131 | 19,
1132 | 19,
1133 | 20,
1134 | 20,
1135 | 20,
1136 | 20,
1137 | 21,
1138 | 21,
1139 | 23,
1140 | 23,
1141 | 24,
1142 | 24,
1143 | 25,
1144 | 25,
1145 | 28,
1146 | 28,
1147 | 28,
1148 | 28,
1149 | 29,
1150 | 29,
1151 | 30,
1152 | 30,
1153 | 33,
1154 | 33,
1155 | 33,
1156 | 33,
1157 | 34,
1158 | 34,
1159 | 37,
1160 | 37,
1161 | 38,
1162 | 38,
... [remaining plotly figure data omitted: a scatter of Income (x) vs. Spend (y) with markers colored by the cluster 'prediction' column; see the source cell below] ...
1534 | },
1535 | "metadata": {},
1536 | "output_type": "display_data"
1537 | }
1538 | ],
1539 | "source": [
1540 | "pd_df = df_pred.toPandas()\n",
1541 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend, \n",
1542 | " mode='markers',\n",
1543 | " marker = {'size':10,'color':pd_df.prediction,'colorscale':'Viridis'})\n",
1544 | "iplot([trace])"
1545 | ]
1546 | },
1547 | {
1548 | "cell_type": "markdown",
1549 | "metadata": {},
1550 | "source": [
1551 |     "## BisectingKMeans (bisecting k-means)\n",
1552 |     "`pyspark.ml.clustering.BisectingKMeans(featuresCol='features', predictionCol='prediction', maxIter=20, seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure='euclidean')`"
1553 | ]
1554 | },
1555 | {
1556 | "cell_type": "markdown",
1557 | "metadata": {},
1558 | "source": [
1559 |     "The main idea of bisecting k-means: start with all points in a single cluster and split that cluster in two. Then repeatedly pick the cluster whose split most reduces the clustering cost (the sum of squared errors, SSE) and split it, continuing until the number of clusters reaches the user-specified k.\n",
1560 |     "\n",
1561 |     "The implicit principle: SSE measures clustering quality, and the smaller it is, the closer the points are to their centroids and the better the clustering. So the cluster with the largest SSE is split next, since a large SSE suggests that cluster is poorly formed and may actually contain several clusters lumped together."
1562 | ]
1563 | },
1564 | {
1565 | "cell_type": "markdown",
1566 | "metadata": {},
1567 | "source": [
1568 |     "**Parameters**\n",
1569 |     "\n",
1570 |     "`maxIter: maximum number of iterations\n",
1571 |     "k: number of clusters\n",
1572 |     "minDivisibleClusterSize: minimum number of points (if >= 1) or minimum proportion of points (if between 0 and 1) for a cluster to be divisible\n",
1573 |     "fit(dataset, params=None): fits the model`"
1574 | ]
1575 | },
1576 | {
1577 | "cell_type": "markdown",
1578 | "metadata": {},
1579 | "source": [
1580 |     "**Model attributes**\n",
1581 |     "\n",
1582 |     "`\n",
1583 |     "clusterCenters(): returns the cluster centers as numpy arrays\n",
1584 |     "computeCost(dataset): sum of squared distances between points and their nearest center\n",
1585 |     "transform(dataset): assigns a cluster prediction to each row\n",
1586 |     "hasSummary: whether the trained model has a summary\n",
1587 |     "summary: returns the training summary\n",
1588 |     "`"
1589 | ]
1590 | },
1591 | {
1592 | "cell_type": "markdown",
1593 | "metadata": {},
1594 | "source": [
1595 |     "**Summary attributes**\n",
1596 |     "\n",
1597 |     "`\n",
1598 |     "cluster: DataFrame of the predicted cluster for each training data point\n",
1599 |     "clusterSizes: size of each cluster\n",
1600 |     "k: number of clusters\n",
1601 |     "predictions: DataFrame produced by the model's transform method\n",
1602 |     "`"
1603 | ]
1604 | },
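  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (added for illustration, not part of the original run): it assumes a DataFrame `dataset` with a `features` vector column, like the one assembled for KMeans above; `k`, `maxIter` and `seed` values are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.clustering import BisectingKMeans\n",
    "# assumption: 'dataset' is a DataFrame with a 'features' vector column (as used for KMeans above)\n",
    "bkm = BisectingKMeans(k=5, maxIter=20, seed=1)\n",
    "bkm_model = bkm.fit(dataset)\n",
    "print(bkm_model.clusterCenters())        # list of numpy arrays, one per cluster\n",
    "print(bkm_model.summary.clusterSizes)    # number of points assigned to each cluster\n",
    "bkm_model.transform(dataset).show(5)     # adds a 'prediction' column"
   ]
  },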
1605 | {
1606 | "cell_type": "markdown",
1607 | "metadata": {},
1608 | "source": [
1609 |     "## GaussianMixture (Gaussian mixture model)\n",
1610 | "`pyspark.ml.clustering.GaussianMixture(featuresCol='features', predictionCol='prediction', k=2, probabilityCol='probability', tol=0.01, maxIter=100, seed=None)`"
1611 | ]
1612 | },
1613 | {
1614 | "cell_type": "markdown",
1615 | "metadata": {},
1616 | "source": [
1617 |     "GaussianMixture fits a mixture-of-Gaussians model with the expectation-maximization (EM) algorithm. Gaussian mixture models can also be used to draw confidence ellipsoids for multivariate data and to compute the BIC (Bayesian Information Criterion) to assess the number of clusters, though those conveniences come from other toolkits (e.g. scikit-learn) rather than from Spark's estimator."
1618 | ]
1619 | },
1620 | {
1621 | "cell_type": "markdown",
1622 | "metadata": {},
1623 | "source": [
1624 |     "Pros: instead of a hard cluster label, each sample gets a probability for every component, which is valuable information, and GMMs can be used for density estimation as well as clustering. Cons: when a component has too few points, estimating its covariance becomes difficult and the algorithm can diverge toward solutions with infinite likelihood unless the covariances are regularized; each iteration is more expensive than k-means; and because fitting relies on EM, it can get stuck in local optima, so the result depends heavily on initialization.\n",
1625 |     "\n"
1626 | ]
1627 | },
1628 | {
1629 | "cell_type": "markdown",
1630 | "metadata": {},
1631 | "source": [
1632 |     "Note: this algorithm may perform poorly on high-dimensional data (many features), because high dimensionality (a) makes clustering hard in general (for statistical/theoretical reasons) and (b) causes numerical issues when fitting Gaussian distributions."
1633 | ]
1634 | },
1635 | {
1636 | "cell_type": "markdown",
1637 | "metadata": {},
1638 | "source": [
1639 |     "**Parameters**\n",
1640 |     "\n",
1641 |     "`fit(dataset, params=None): fits the model\n",
1642 |     "k: number of independent Gaussian components, > 1\n",
1643 |     "maxIter: maximum number of iterations, >= 0\n",
1644 |     "tol: convergence tolerance of the iterative algorithm, >= 0\n",
1645 |     "plus the usual setter and getter methods`"
1646 | ]
1647 | },
1648 | {
1649 | "cell_type": "markdown",
1650 | "metadata": {},
1651 | "source": [
1652 |     "**Model attributes**\n",
1653 |     "\n",
1654 |     "`\n",
1655 |     "gaussiansDF: the fitted Gaussians as a DataFrame, one row per component with two columns: mean (vector) and cov (matrix)\n",
1656 |     "hasSummary: whether the trained model has a summary\n",
1657 |     "summary: returns the training summary\n",
1658 |     "transform(dataset, params=None): adds prediction and probability columns\n",
1659 |     "weights: mixture weights, summing to 1\n",
1660 |     "`"
1661 | ]
1662 | },
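  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (added for illustration, not part of the original run): it again assumes a DataFrame `dataset` with a `features` vector column; `k`, `tol` and `seed` are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.clustering import GaussianMixture\n",
    "# assumption: 'dataset' has a 'features' vector column; parameter values are illustrative only\n",
    "gm = GaussianMixture(k=3, tol=0.001, maxIter=100, seed=10)\n",
    "gm_model = gm.fit(dataset)\n",
    "gm_model.gaussiansDF.show(truncate=False)   # one row per component: mean vector and covariance matrix\n",
    "print(gm_model.weights)                     # mixing weights, sum to 1\n",
    "gm_model.transform(dataset).select('prediction', 'probability').show(5)"
   ]
  },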
1663 | {
1664 | "cell_type": "code",
1665 | "execution_count": 45,
1666 | "metadata": {},
1667 | "outputs": [],
1668 | "source": [
1669 | "spark.stop()"
1670 | ]
1671 | },
1672 | {
1673 | "cell_type": "code",
1674 | "execution_count": null,
1675 | "metadata": {},
1676 | "outputs": [],
1677 | "source": []
1678 | }
1679 | ],
1680 | "metadata": {
1681 | "kernelspec": {
1682 | "display_name": "Python 3",
1683 | "language": "python",
1684 | "name": "python3"
1685 | },
1686 | "language_info": {
1687 | "codemirror_mode": {
1688 | "name": "ipython",
1689 | "version": 3
1690 | },
1691 | "file_extension": ".py",
1692 | "mimetype": "text/x-python",
1693 | "name": "python",
1694 | "nbconvert_exporter": "python",
1695 | "pygments_lexer": "ipython3",
1696 | "version": "3.6.4"
1697 | }
1698 | },
1699 | "nbformat": 4,
1700 | "nbformat_minor": 2
1701 | }
1702 |
--------------------------------------------------------------------------------
/pyspark-RDD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "RDD(Resilient Distributed Dataset)叫做弹性分布式数据集,是Spark中最基本的数据抽象,它代表一个不可变、可分区、里面的元素可并行计算的集合。RDD具有数据流模型的特点:自动容错、位置感知性调度和可伸缩性。RDD允许用户在执行多个查询时显式地将工作集缓存在内存中,后续的查询能够重用工作集,这极大地提升了查询速度。\n",
8 | "\n",
9 | "(1)一组分片(Partition),即数据集的基本组成单位。对于RDD来说,每个分片都会被一个计算任务处理,并决定并行计算的粒度。用户可以在创建RDD时指定RDD的分片个数,如果没有指定,那么就会采用默认值。默认值就是程序所分配到的CPU Core的数目。\n",
10 | "\n",
11 | "(2)一个计算每个分区的函数。Spark中RDD的计算是以分片为单位的,每个RDD都会实现compute函数以达到这个目的。compute函数会对迭代器进行复合,不需要保存每次计算的结果。\n",
12 | "\n",
13 | "(3)RDD之间的依赖关系。RDD的每次转换都会生成一个新的RDD,所以RDD之间就会形成类似于流水线一样的前后依赖关系。在部分分区数据丢失时,Spark可以通过这个依赖关系重新计算丢失的分区数据,而不是对RDD的所有分区进行重新计算。\n",
14 | "\n",
15 | "(4)一个Partitioner,即RDD的分片函数。当前Spark中实现了两种类型的分片函数,一个是基于哈希的HashPartitioner,另外一个是基于范围的RangePartitioner。只有对于于key-value的RDD,才会有Partitioner,非key-value的RDD的Parititioner的值是None。Partitioner函数不但决定了RDD本身的分片数量,也决定了parent RDD Shuffle输出时的分片数量。\n",
16 | "\n",
17 | "(5)一个列表,存储存取每个Partition的优先位置(preferred location)。对于一个HDFS文件来说,这个列表保存的就是每个Partition所在的块的位置。按照“移动数据不如移动计算”的理念,Spark在进行任务调度的时候,会尽可能地将计算任务分配到其所要处理数据块的存储位置。\n",
18 | "\n",
19 | "使用手册 \n",
20 | "http://spark.apache.org/docs/latest/api/python/pyspark.html\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "#pyspark.SparkContext()是spark应用的入口,也可以称为驱动\n",
30 | "from pyspark import SparkConf, SparkContext"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "conf = SparkConf().setAppName(\"sparkApp1\").setMaster(\"local\")\n",
40 | "sc = SparkContext.getOrCreate(conf)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "[0, 2, 3, 4, 6]\n",
53 | "[[0], [2], [3], [4], [6]]\n",
54 | "[0, 2, 4]\n",
55 | "[[], [0], [], [2], [4]]\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 |     "# parallelize(c, numSlices=None): distribute a local Python collection to form an RDD. If the input is a range, using range/xrange is recommended for performance.\n",
61 |     "# glom(): return an RDD created by coalescing all elements within each partition into a list.\n",
62 | "rdd1 = sc.parallelize([0,2,3,4,6], 5)\n",
63 | "rdd2 = sc.parallelize(range(0, 6, 2), 5)\n",
64 | "print(rdd1.collect())\n",
65 | "print(rdd1.glom().collect())\n",
66 | "print(rdd2.collect())\n",
67 | "print(rdd2.glom().collect())"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/plain": [
78 | "[0, 9]"
79 | ]
80 | },
81 | "execution_count": 4,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 |     "# runJob(rdd, partitionFunc, partitions=None, allowLocal=False)\n",
88 |     "# Executes the given partitionFunc on the specified set of partitions and returns the result as an array of elements. If no partitions are specified, it runs on all partitions.\n",
89 | "sc.runJob(rdd1, lambda part: [x * x for x in part], [0, 2], True)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 6,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "1528077753028\n",
102 | "ffzs\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "print(sc.startTime)\n",
108 | "print(sc.sparkUser())"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# rdd.glom()\n",
118 | "# glom()定义了将原rdd相同分区的元素放在一个列表中构成新的rdd的转换操作。\n",
119 | "# rdd.collect()\n",
120 | "# 返回由rdd元素组成的列表\n",
121 | "# rdd.collectAsMap()\n",
122 | "# 将键值对形式的RDD以字典的形式返回给master "
123 | ]
124 | },
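  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick illustrative example of `collectAsMap()` (added sketch; the key/value pairs are arbitrary)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collectAsMap(): returns a pair RDD to the driver as a plain dict\n",
    "m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()\n",
    "print(m[1], m[3])   # 2 4"
   ]
  },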
125 | {
126 | "cell_type": "code",
127 | "execution_count": 37,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# cache()\n",
132 | "# 将RDD持久化为MEMORY_ONLY"
133 | ]
134 | },
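  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch showing that `cache()` only marks the RDD lazily; the data is kept in memory once the first action computes it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cache() marks the RDD to be kept in memory (MEMORY_ONLY) after the first action materializes it\n",
    "rdd = sc.parallelize(range(10))\n",
    "rdd.cache()\n",
    "print(rdd.is_cached)          # True\n",
    "print(rdd.getStorageLevel())  # shows the storage level set by cache()"
   ]
  },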
135 | {
136 | "cell_type": "code",
137 | "execution_count": 34,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "data": {
142 | "text/plain": [
143 | "[('a', 'aa', 1), ('b', 'bb', 1), ('c', 'cc', 1)]"
144 | ]
145 | },
146 | "execution_count": 34,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "# map(f, preservesPartitioning=False)\n",
153 | "# 通过对这个RDD的每个元素应用一个函数来返回一个新的RDD\n",
154 | "rdd = sc.parallelize([\"b\", \"a\", \"c\"])\n",
155 | "sorted(rdd.map(lambda x:(x, x*2, 1)).collect())"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 38,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "[1, 1, 1, 2, 2, 3]\n",
168 | "[[(2, 2), (2, 2)], [(3, 3), (3, 3)], [(4, 4), (4, 4)]]\n",
169 | "[(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "#flatMap(f, preservesPartitioning=False)\n",
175 | "#首先将一个函数应用到这个RDD的所有元素上,然后将结果全部展开,返回一个新的RDD\n",
176 | "rdd = sc.parallelize([2, 3, 4])\n",
177 | "print(sorted(rdd.flatMap(lambda x: range(1, x)).collect()))\n",
178 | "print(sorted(rdd.map(lambda x: [(x, x), (x, x)]).collect()))\n",
179 | "print(sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect()))\n"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 39,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "[('a', 3), ('b', 1)]"
191 | ]
192 | },
193 | "execution_count": 39,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 |     "# mapValues(f)\n",
200 |     "# Passes each value of the pair RDD through f without changing the keys; the original partitioning is preserved.\n",
201 | "x = sc.parallelize([(\"a\", [\"apple\", \"banana\", \"lemon\"]), (\"b\", [\"grapes\"])])\n",
202 | "def f(x): return len(x)\n",
203 | "x.mapValues(f).collect()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 52,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]\n",
216 | "[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "#flatMapValues(f)\n",
222 | "#通过flatMap函数传递键值对RDD中的每个值,而不改变键;这也保留了原始的RDD分区。\n",
223 | "x = sc.parallelize([(\"a\", [\"x\", \"y\", \"z\"]), (\"b\", [\"p\", \"r\"])])\n",
224 | "def f(x): return x\n",
225 | "print(x.flatMapValues(f).collect())\n",
226 | "print(x.mapValues(f).collect())"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 53,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "data": {
236 | "text/plain": [
237 | "[3, 7]"
238 | ]
239 | },
240 | "execution_count": 53,
241 | "metadata": {},
242 | "output_type": "execute_result"
243 | }
244 | ],
245 | "source": [
246 |     "# mapPartitions(f, preservesPartitioning=False)\n",
247 |     "# Unlike map, which applies the function to each element, mapPartitions applies the function once per partition: the elements of a partition are handed to f as an iterator. The iterator supports only generic iteration (sum, loops, etc.); for more involved per-partition logic you can materialize it with [x for x in iterator], at the cost of losing the streaming behaviour within that partition.\n",
248 |     "# The function below uses yield, i.e. it is a generator, which is a convenient way to produce the per-partition result lazily.\n",
249 | "rdd = sc.parallelize([1, 2, 3, 4], 2)\n",
250 | "def f(iterator): yield sum(iterator)\n",
251 | "rdd.mapPartitions(f).collect()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 55,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/plain": [
262 | "6"
263 | ]
264 | },
265 | "execution_count": 55,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "# mapPartitionsWithIndex(f, preservesPartitioning=False)\n",
272 | "# 通过在这个RDD的每个分区上应用一个函数来返回一个新的RDD,同时跟踪原始分区的索引。为对索引进行操作提供可能\n",
273 | "rdd = sc.parallelize([1, 2, 3, 4], 4)\n",
274 | "def f(splitIndex, iterator): yield splitIndex\n",
275 | "rdd.mapPartitionsWithIndex(f).sum()"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 57,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/plain": [
286 | "0"
287 | ]
288 | },
289 | "execution_count": 57,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 |     "# partitionBy(numPartitions, partitionFunc=portable_hash)\n",
296 |     "# Returns a copy of the RDD partitioned using the specified partitioner.\n",
297 |     "# set().intersection computes the intersection, used here to check that no key ends up in both partitions.\n",
298 | "pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))\n",
299 | "sets = pairs.partitionBy(2).glom().collect()\n",
300 | "len(set(sets[0]).intersection(set(sets[1])))"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 63,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "[[1], [2, 3], [4, 5]]\n",
313 | "[[], [1], [4, 5], [2, 3], []]\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 |     "# coalesce(numPartitions, shuffle=False)\n",
319 |     "# Returns a new RDD reduced to numPartitions partitions; shrinking the partition count does not need a shuffle. To increase the number of partitions, set shuffle=True (which is what repartition does).\n",
320 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect())\n",
321 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(5,True).glom().collect())"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 64,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "name": "stdout",
331 | "output_type": "stream",
332 | "text": [
333 | "2\n",
334 | "10\n"
335 | ]
336 | }
337 | ],
338 | "source": [
339 | "# repartition(numPartitions)\n",
340 | "# 重新分区,默认shuffle 减少分区用coalesce\n",
341 | "rdd = sc.parallelize([1,2,3,4,5,6,7], 4)\n",
342 | "print(len(rdd.repartition(2).glom().collect()))\n",
343 | "print(len(rdd.repartition(10).glom().collect()))"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 65,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]"
355 | ]
356 | },
357 | "execution_count": 65,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 |     "# zip(other)\n",
364 |     "# Pairs the elements of this RDD (as keys) with the elements of the other RDD (as values), position by position.\n",
365 | "x = sc.parallelize(range(0,5))\n",
366 | "y = sc.parallelize(range(1000, 1005))\n",
367 | "x.zip(y).collect()"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 68,
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/plain": [
378 | "[('a', 0), ('b', 1), ('c', 2), ('d', 3)]"
379 | ]
380 | },
381 | "execution_count": 68,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "# rdd.zipWithIndex()\n",
388 | "# RDD为key 排序位置索引作为value\n",
389 | "sc.parallelize([\"a\", \"b\", \"c\", \"d\"], 2).zipWithIndex().collect()"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 73,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "['a', 'b', 'c', 'd', 'e']\n",
402 | "[['a'], ['b', 'c'], ['d', 'e']]\n",
403 | "[('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]\n"
404 | ]
405 | }
406 | ],
407 | "source": [
408 |     "# zipWithUniqueId()\n",
409 |     "# Items in the k-th partition get the ids k, n+k, 2*n+k, ... (where n is the number of partitions); the RDD elements become the keys and the ids the values.\n",
410 | "rdd = sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"], 3)\n",
411 | "print(rdd.collect())\n",
412 | "print(rdd.glom().collect())\n",
413 | "print(rdd.zipWithUniqueId().collect())"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 4,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "name": "stdout",
423 | "output_type": "stream",
424 | "text": [
425 | "[(0, 0), (1, 1), (4, 2)]\n",
426 | "[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]\n"
427 | ]
428 | },
429 | {
430 | "data": {
431 | "text/plain": [
432 | "[(0, [[0], [0]]),\n",
433 | " (1, [[1], [1]]),\n",
434 | " (2, [[], [2]]),\n",
435 | " (3, [[], [3]]),\n",
436 | " (4, [[2], [4]])]"
437 | ]
438 | },
439 | "execution_count": 4,
440 | "metadata": {},
441 | "output_type": "execute_result"
442 | }
443 | ],
444 | "source": [
445 |     "# rdd.keyBy(f)\n",
446 |     "# Creates tuples (f(x), x) from the elements of the RDD; the example below then cogroups the result with another pair RDD.\n",
447 | "x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)\n",
448 | "y = sc.parallelize(zip(range(0,5), range(0,5)))\n",
449 | "print(x.collect())\n",
450 | "print(y.collect())\n",
451 | "[(x, list(map(list, y))) for x, y in sorted(x.cogroup(y).collect())]"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 6,
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 |     "# foreach(f)\n",
461 |     "# Applies a function to every element of the RDD for its side effects; it returns None, not a new RDD.\n",
462 | "def fun(x): \n",
463 | " print(x)\n",
464 | "sc.parallelize([1, 2, 3, 4, 5]).foreach(fun)"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 7,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "# foreachPartition(f)\n",
474 | "# 使一个函数作用于RDD上每一个分区\n",
475 | "def fun(iterator):\n",
476 | " for x in iterator:\n",
477 | " print(x)\n",
478 | "sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(fun)"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 8,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "None\n",
491 | "1\n",
492 | "2\n",
493 | "3\n",
494 | "\n"
495 | ]
496 | },
497 | {
498 | "data": {
499 | "text/plain": [
500 | "'\\n1\\n2\\n3\\n'"
501 | ]
502 | },
503 | "execution_count": 8,
504 | "metadata": {},
505 | "output_type": "execute_result"
506 | }
507 | ],
508 | "source": [
509 | "inputData=sc.parallelize([1,2,3])\n",
510 | "def f(x):#定义一个将内容追加于文件末尾的函数\n",
511 | " with open('./example.txt','a+') as fl:\n",
512 | " print(x,file=fl)\n",
513 | "\n",
514 |     "open('./example.txt','w').close()  # truncate the file first so the example starts from an empty file\n",
515 | "y=inputData.foreach(f)\n",
516 | "print(y)\n",
517 | "#结果为:None,因为函数f没有返回值\n",
518 | "#查看写文件的结果\n",
519 | "with open('./example.txt') as fl:\n",
520 | " print(fl.read())"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 9,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/plain": [
531 | "[(0, [2, 8]), (1, [1, 1, 3, 5])]"
532 | ]
533 | },
534 | "execution_count": 9,
535 | "metadata": {},
536 | "output_type": "execute_result"
537 | }
538 | ],
539 | "source": [
540 |     "# groupBy(f, numPartitions=None, partitionFunc=portable_hash)\n",
541 |     "# Groups the elements by the value returned by f and returns an RDD of (group key, grouped items).\n",
542 | "rdd = sc.parallelize([1, 1, 2, 3, 5, 8])\n",
543 | "result = rdd.groupBy(lambda x: x % 2).collect()\n",
544 | "sorted([(x, sorted(y)) for (x, y) in result])"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 11,
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "[('a', 2), ('b', 1)]\n",
557 | "[('a', [1, 1]), ('b', [1])]\n"
558 | ]
559 | }
560 | ],
561 | "source": [
562 |     "# groupByKey(numPartitions=None, partitionFunc=portable_hash)\n",
563 |     "# For a pair RDD, groupByKey() gathers the values that share a key into one sequence.\n",
564 |     "# If you are grouping in order to aggregate per key (e.g. a sum or an average), reduceByKey or aggregateByKey will give much better performance.\n",
565 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
566 | "print(sorted(rdd.groupByKey().mapValues(len).collect()))\n",
567 | "print(sorted(rdd.groupByKey().mapValues(list).collect()))"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 20,
573 | "metadata": {},
574 | "outputs": [
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "[('a', ([1], [2])), ('b', ([4], []))]\n",
580 | "([4], [])\n"
581 | ]
582 | }
583 | ],
584 | "source": [
585 |     "# cogroup(other, numPartitions=None)\n",
586 |     "# For each key k in self or other, the resulting RDD contains (k, (values of k in self, values of k in other)).\n",
587 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
588 | "y = sc.parallelize([(\"a\", 2)])\n",
589 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(x.cogroup(y).collect()))])\n",
590 | "print(tuple(map(list,list(x.cogroup(y).collect()[0][1]))))\n"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 21,
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "name": "stdout",
600 | "output_type": "stream",
601 | "text": [
602 | "[('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]\n"
603 | ]
604 | }
605 | ],
606 | "source": [
607 | "# groupWith(other, *others)\n",
608 | "# cogroup的别名,但支持多个RDD\n",
609 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n",
610 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
611 | "y = sc.parallelize([(\"a\", 2)])\n",
612 | "z = sc.parallelize([(\"b\", 42)])\n",
613 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(w.groupWith(x, y, z).collect()))])"
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 24,
619 | "metadata": {},
620 | "outputs": [
621 | {
622 | "name": "stdout",
623 | "output_type": "stream",
624 | "text": [
625 | "15\n",
626 | "abcde\n"
627 | ]
628 | }
629 | ],
630 | "source": [
631 |     "# reduce(f)\n",
632 |     "# reduce combines the elements of the RDD pairwise with f, then keeps combining the partial results pairwise\n",
633 |     "# until a single value remains (first within each partition, then across partitions). The result is a plain Python object, not an RDD.\n",
634 | "# operator 操作函数 https://docs.python.org/3/library/operator.html\n",
635 | "from operator import *\n",
636 | "print(sc.parallelize([1, 2, 3, 4, 5]).reduce(add))\n",
637 | "print(sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"]).reduce(concat))"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 29,
643 | "metadata": {},
644 | "outputs": [
645 | {
646 | "data": {
647 | "text/plain": [
648 | "[('a', 2), ('b', 1)]"
649 | ]
650 | },
651 | "execution_count": 29,
652 | "metadata": {},
653 | "output_type": "execute_result"
654 | }
655 | ],
656 | "source": [
657 |     "# reduceByKey(func, numPartitions=None, partitionFunc=portable_hash)\n",
658 |     "# Groups by key and reduces the values of each key with func.\n",
659 | "from operator import *\n",
660 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
661 | "sorted(rdd.reduceByKey(add).collect())"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 36,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stdout",
671 | "output_type": "stream",
672 | "text": [
673 | "{'a': 2, 'b': 1}\n",
674 | "[('a', 2), ('b', 1)]\n"
675 | ]
676 | }
677 | ],
678 | "source": [
679 |     "# reduceByKeyLocally(func)\n",
680 |     "# Like reduceByKey, except the aggregated key/value pairs are returned to the master immediately as a dict.\n",
681 |     "# The merging is also done locally on each mapper before results are sent to a reducer, similar to a 'combiner' in MapReduce.\n",
682 | "from operator import *\n",
683 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
684 | "print(rdd.reduceByKeyLocally(add))\n",
685 | "print(sorted(rdd.reduceByKeyLocally(add).items()))"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 38,
691 | "metadata": {},
692 | "outputs": [
693 | {
694 | "data": {
695 | "text/plain": [
696 | "-5"
697 | ]
698 | },
699 | "execution_count": 38,
700 | "metadata": {},
701 | "output_type": "execute_result"
702 | }
703 | ],
704 | "source": [
705 |     "# treeReduce(f, depth=2)\n",
706 |     "# Reduces in a multi-level tree pattern across partitions instead of pulling all partial results to the driver at once.\n",
707 |     "# depth: suggested depth of the aggregation tree (>= 1).\n",
708 | "add = lambda x, y: x + y\n",
709 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n",
710 | "rdd.treeReduce(add, 2)"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 28,
716 | "metadata": {},
717 | "outputs": [
718 | {
719 | "name": "stdout",
720 | "output_type": "stream",
721 | "text": [
722 | "[5, 6]\n",
723 | "['a', 'b']\n"
724 | ]
725 | }
726 | ],
727 | "source": [
728 | "# rdd.keys()\n",
729 | "# 原rdd的元素为键值对,返回原rdd元素的键为元素的rdd\n",
730 | "# rdd.values()\n",
731 | "# 原rdd的元素为键值对,返回原rdd元素的值为元素的rdd\n",
732 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n",
733 | "print(w.keys().collect())\n",
734 | "print(w.values().collect())"
735 | ]
736 | },
737 | {
738 | "cell_type": "markdown",
739 | "metadata": {},
740 | "source": [
741 | "`aggregate函数`\n",
742 | "\n",
743 | "将每个分区里面的元素进行聚合,然后用combine函数将每个分区的结果和初始值(zeroValue)进行combine操作。这个函数最终返回的类型不需要和RDD中元素类型一致。\n",
744 | "\n",
745 | "seqOp操作会聚合各分区中的元素,然后combOp操作把所有分区的聚合结果再次聚合,两个操作的初始值都是zeroValue. seqOp的操作是遍历分区中的所有元素(T),第一个T跟zeroValue做操作,结果再作为与第二个T做操作的zeroValue,直到遍历完整个分区。combOp操作是把各分区聚合的结果,再聚合。aggregate函数返回一个跟RDD不同类型的值。因此,需要一个操作seqOp来把分区中的元素T合并成一个U,另外一个操作combOp把所有U聚合。\n"
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": 38,
751 | "metadata": {},
752 | "outputs": [
753 | {
754 | "name": "stdout",
755 | "output_type": "stream",
756 | "text": [
757 | "(10, 4)\n",
758 | "(10, 4)\n",
759 | "(10, 28)\n"
760 | ]
761 | }
762 | ],
763 | "source": [
764 | "#aggregate(zeroValue, seqOp, combOp)\n",
765 | "seqOp = (lambda x, y : (x[0] + y, x[1] + 1))\n",
766 | "combOp = (lambda x, y : (x[0] + y[0], x[1] + y[1]))\n",
767 | "print(sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp))\n",
768 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 0), seqOp, combOp))\n",
769 |     "# With zeroValue (0, 6) and 3 partitions, the 6 is added once per partition by seqOp and once more by combOp, i.e. 4 times in total: 4 + 4*6 = 28.\n",
770 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 6), seqOp, combOp))"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": null,
776 | "metadata": {},
777 | "outputs": [],
778 | "source": [
779 |     "# aggregateByKey(zeroValue, seqFunc, combFunc, numPartitions=None, partitionFunc=portable_hash)\n",
780 |     "# Same logic as aggregate, but the aggregation is done per key rather than over each partition as a whole.\n",
781 |     "# zeroValue is the neutral starting value for every key's accumulator, and it must have the same form as the accumulator used inside the per-key aggregation (see the sketch below)."
782 | ]
783 | },
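  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch of `aggregateByKey`, computing a (sum, count) pair per key so that `zeroValue` has the same shape as the per-key accumulator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# aggregateByKey: per-key (sum, count); zeroValue must match the accumulator's form\n",
    "rdd = sc.parallelize([('a', 1), ('b', 1), ('a', 2)])\n",
    "seqFunc = lambda acc, v: (acc[0] + v, acc[1] + 1)       # merge one value into the per-key accumulator\n",
    "combFunc = lambda a, b: (a[0] + b[0], a[1] + b[1])      # merge accumulators from different partitions\n",
    "sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect())\n",
    "# [('a', (3, 2)), ('b', (1, 1))]"
   ]
  },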
784 | {
785 | "cell_type": "code",
786 | "execution_count": 37,
787 | "metadata": {},
788 | "outputs": [
789 | {
790 | "data": {
791 | "text/plain": [
792 | "-5"
793 | ]
794 | },
795 | "execution_count": 37,
796 | "metadata": {},
797 | "output_type": "execute_result"
798 | }
799 | ],
800 | "source": [
801 | "# treeAggregate(zeroValue, seqOp, combOp, depth=2)\n",
802 | "# 与aggregate不同的地方是:在每个分区,会做两次或者多次combOp,避免将所有局部的值传给driver端.另外,经过测验初始值zeroValue不会参与combOp.\n",
803 | "# depth:树的深度\n",
804 | "add = lambda x, y: x + y\n",
805 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n",
806 | "rdd.treeAggregate(0, add, add, 2)"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": 39,
812 | "metadata": {},
813 | "outputs": [
814 | {
815 | "data": {
816 | "text/plain": [
817 | "15"
818 | ]
819 | },
820 | "execution_count": 39,
821 | "metadata": {},
822 | "output_type": "execute_result"
823 | }
824 | ],
825 | "source": [
826 |     "# fold(zeroValue, op)\n",
827 |     "# A simplified aggregate: one initial value, and the same function op is used both within each partition and to combine the partition results.\n",
828 | "from operator import add\n",
829 | "sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 40,
835 | "metadata": {},
836 | "outputs": [
837 | {
838 | "data": {
839 | "text/plain": [
840 | "[('a', 2), ('b', 1)]"
841 | ]
842 | },
843 | "execution_count": 40,
844 | "metadata": {},
845 | "output_type": "execute_result"
846 | }
847 | ],
848 | "source": [
849 |     "# foldByKey(zeroValue, func, numPartitions=None, partitionFunc=portable_hash)\n",
850 |     "# Same logic as fold, but grouped by key.\n",
851 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
852 | "from operator import add\n",
853 | "sorted(rdd.foldByKey(0, add).collect())"
854 | ]
855 | },
856 | {
857 | "cell_type": "code",
858 | "execution_count": 42,
859 | "metadata": {},
860 | "outputs": [
861 | {
862 | "name": "stdout",
863 | "output_type": "stream",
864 | "text": [
865 | "[('a', 1), ('a', 2), ('b', 1), ('b', 3), ('c', 5), ('c', 6)]\n",
866 | "[('a', [1, 2]), ('b', [1, 3]), ('c', [5, 6])]\n"
867 | ]
868 | }
869 | ],
870 | "source": [
871 |     "# combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions=None, partitionFunc=portable_hash)\n",
872 |     "# Turns an RDD[(K, V)] into an RDD[(K, C)] using three functions:\n",
873 |     "# createCombiner: turns a single value V into the combined type C (here, wraps it in a list)\n",
874 |     "# mergeValue: merges one more V into an existing C for the same key\n",
875 |     "# mergeCombiners: merges two C values (from different partitions) into one\n",
876 | "x=sc.parallelize([('a',1),('a',2),('b',1),('b',3),('c',5),('c',6)])\n",
877 | "def to_list(a):\n",
878 | " return [a]\n",
879 | "def append(a,b):\n",
880 | " a.append(b)\n",
881 | " return a\n",
882 | "def extend(a,b):\n",
883 | " a.extend(b)\n",
884 | " return a\n",
885 | "print(x.collect())\n",
886 | "print(x.combineByKey(to_list,append,extend).collect())"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 43,
892 | "metadata": {},
893 | "outputs": [
894 | {
895 | "name": "stdout",
896 | "output_type": "stream",
897 | "text": [
898 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
899 | "[('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n"
900 | ]
901 | }
902 | ],
903 | "source": [
904 |     "# rdd.sortBy(keyfunc, ascending=True, numPartitions=None)\n",
905 |     "# Sorts the RDD by the sort key computed by keyfunc for each element.\n",
906 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n",
907 | "print(sc.parallelize(tmp).sortBy(lambda x: x[0]).collect())\n",
908 | "print(sc.parallelize(tmp).sortBy(lambda x: x[1]).collect())"
909 | ]
910 | },
911 | {
912 | "cell_type": "code",
913 | "execution_count": 45,
914 | "metadata": {},
915 | "outputs": [
916 | {
917 | "name": "stdout",
918 | "output_type": "stream",
919 | "text": [
920 | "('1', 3)\n",
921 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
922 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
923 | "[('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5), ('little', 4), ('Mary', 1), ('was', 8), ('white', 9), ('whose', 6)]\n"
924 | ]
925 | }
926 | ],
927 | "source": [
928 |     "# sortByKey(ascending=True, numPartitions=None, keyfunc=lambda x: x)\n",
929 |     "# Sorts this RDD, which is assumed to consist of (key, value) pairs, by key.\n",
930 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n",
931 | "print(sc.parallelize(tmp).sortByKey().first())\n",
932 | "print(sc.parallelize(tmp).sortByKey(True, 1).collect())\n",
933 | "print(sc.parallelize(tmp).sortByKey(True, 2).collect())\n",
934 | "tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]\n",
935 | "tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])\n",
936 | "print(sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect())"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 47,
942 | "metadata": {},
943 | "outputs": [
944 | {
945 | "name": "stdout",
946 | "output_type": "stream",
947 | "text": [
948 | "(count: 4, mean: 2.5, stdev: 1.118033988749895, max: 4.0, min: 1.0)\n"
949 | ]
950 | }
951 | ],
952 | "source": [
953 | "# stats()\n",
954 | "# 计算rdd中全体元素的均值、方差、最大值、最小值和个数的信息\n",
955 | "samp=sc.parallelize([1,2,3,4]).stats()\n",
956 | "print(samp)"
957 | ]
958 | },
959 | {
960 | "cell_type": "code",
961 | "execution_count": 48,
962 | "metadata": {},
963 | "outputs": [
964 | {
965 | "data": {
966 | "text/plain": [
967 | "3"
968 | ]
969 | },
970 | "execution_count": 48,
971 | "metadata": {},
972 | "output_type": "execute_result"
973 | }
974 | ],
975 | "source": [
976 | "# rdd.count()\n",
977 | "# 计算rdd所有元素个数\n",
978 | "sc.parallelize([2, 3, 4]).count()"
979 | ]
980 | },
981 | {
982 | "cell_type": "code",
983 | "execution_count": 3,
984 | "metadata": {},
985 | "outputs": [
986 | {
987 | "data": {
988 | "text/plain": [
989 | "10000"
990 | ]
991 | },
992 | "execution_count": 3,
993 | "metadata": {},
994 | "output_type": "execute_result"
995 | }
996 | ],
997 | "source": [
998 |     "# countApprox(timeout, confidence=0.95)\n",
999 |     "# Approximate version of count() that returns a possibly incomplete result within the given timeout, even if not all tasks have finished.\n",
1000 | "rdd = sc.parallelize(range(10000), 10)\n",
1001 | "rdd.countApprox(1000, 1.0)"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": 5,
1007 | "metadata": {},
1008 | "outputs": [
1009 | {
1010 | "name": "stdout",
1011 | "output_type": "stream",
1012 | "text": [
1013 | "1060\n",
1014 | "19\n"
1015 | ]
1016 | }
1017 | ],
1018 | "source": [
1019 | "# countApproxDistinct(relativeSD=0.05)\n",
1020 | "# 返回RDD中不同值数的近似值\n",
1021 | "# relativeSD 相对准确度。较小的值创建需要更多空间的计数器。它必须大于0.000017。\n",
1022 | "n = sc.parallelize(range(1000)).map(str).countApproxDistinct()\n",
1023 | "print(n)\n",
1024 | "n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()\n",
1025 | "print(n)"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 9,
1031 | "metadata": {},
1032 | "outputs": [
1033 | {
1034 | "name": "stdout",
1035 | "output_type": "stream",
1036 | "text": [
1037 | "[('a', 2), ('b', 1)]\n",
1038 |     "defaultdict(<class 'int'>, {'a': 2, 'b': 1})\n"
1039 | ]
1040 | }
1041 | ],
1042 | "source": [
1043 |     "# countByKey()\n",
1044 |     "# Counts the number of elements for each key and returns the result to the master as a dictionary.\n",
1045 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
1046 | "print(sorted(rdd.countByKey().items()))\n",
1047 | "print(rdd.countByKey())"
1048 | ]
1049 | },
1050 | {
1051 | "cell_type": "code",
1052 | "execution_count": 21,
1053 | "metadata": {},
1054 | "outputs": [
1055 | {
1056 | "name": "stdout",
1057 | "output_type": "stream",
1058 | "text": [
1059 | "[[1, 2], [1, 2, 2]]\n",
1060 | "[(1, 2), (2, 3)]\n"
1061 | ]
1062 | }
1063 | ],
1064 | "source": [
1065 | "# countByValue()\n",
1066 | "# 将此RDD中每个唯一值的计数返回为(值,计数)对的字典。\n",
1067 | "print(sc.parallelize([1, 2, 1, 2, 2], 2).glom().collect())\n",
1068 | "print(sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items()))"
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "code",
1073 | "execution_count": 13,
1074 | "metadata": {},
1075 | "outputs": [],
1076 | "source": [
1077 | "# first() 返回第一个元素\n",
1078 | "# max()返回最大值\n",
1079 | "# take(num) 返回开始num个值\n",
1080 | "# top(num, key=None) 计算rdd所有元素按降序排列后最顶部的几个元素\n",
1081 | "# min() rdd中的最小值\n",
1082 | "# mean() 计算rdd所有元素均值\n",
1083 | "# variance() 方差\n",
1084 | "# stdev() 标准差\n",
1085 | "# sum() 和"
1086 | ]
1087 | },
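  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch exercising the basic actions listed above on a small RDD."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd = sc.parallelize([5, 1, 4, 2, 3])\n",
    "print(rdd.first(), rdd.take(2), rdd.top(2), rdd.min(), rdd.max())\n",
    "print(rdd.mean(), rdd.sum(), rdd.variance(), rdd.stdev())\n",
    "# 5 [5, 1] [5, 4] 1 5\n",
    "# 3.0 15 2.0 1.4142135623730951"
   ]
  },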
1088 | {
1089 | "cell_type": "code",
1090 | "execution_count": 19,
1091 | "metadata": {},
1092 | "outputs": [
1093 | {
1094 | "name": "stdout",
1095 | "output_type": "stream",
1096 | "text": [
1097 | "([0, 25, 50], [25, 26])\n",
1098 | "([0, 5, 25, 50], [5, 20, 26])\n",
1099 | "([0, 15, 30, 45, 60], [15, 15, 15, 6])\n",
1100 | "(('a', 'b', 'c'), [2, 2])\n"
1101 | ]
1102 | }
1103 | ],
1104 | "source": [
1105 | "# histogram(buckets)\n",
1106 | "# 对rdd中的元素进行频数统计,统计区间有两种,一种是给出段数,一种是直接给出区间。返回为元组\n",
1107 | "rdd = sc.parallelize(range(51))\n",
1108 | "print(rdd.histogram(2))\n",
1109 | "print(rdd.histogram([0, 5, 25, 50]))\n",
1110 | "print(rdd.histogram([0, 15, 30, 45, 60]))\n",
1111 | "rdd = sc.parallelize([\"ab\", \"ac\", \"b\", \"bd\", \"ef\"])\n",
1112 | "print(rdd.histogram((\"a\", \"b\", \"c\")))"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": 20,
1118 | "metadata": {},
1119 | "outputs": [
1120 | {
1121 | "data": {
1122 | "text/plain": [
1123 | "['1', '2', '', '3']"
1124 | ]
1125 | },
1126 | "execution_count": 20,
1127 | "metadata": {},
1128 | "output_type": "execute_result"
1129 | }
1130 | ],
1131 | "source": [
1132 | "# pipe(command, env=None, checkCode=False)\n",
1133 | "# 通过管道向后面环节输出command处理过的结果,具体功能就体现在command,command为linux命令。 \n",
1134 | "# pipe函数中的'cat'为linux命令,表示打印内容。\n",
1135 | "sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "code",
1140 | "execution_count": 21,
1141 | "metadata": {},
1142 | "outputs": [
1143 | {
1144 | "data": {
1145 | "text/plain": [
1146 | "[2, 4]"
1147 | ]
1148 | },
1149 | "execution_count": 21,
1150 | "metadata": {},
1151 | "output_type": "execute_result"
1152 | }
1153 | ],
1154 | "source": [
1155 | "# filter(f)\n",
1156 | "# 返回满足条件的新RDD\n",
1157 | "rdd = sc.parallelize([1, 2, 3, 4, 5])\n",
1158 | "rdd.filter(lambda x: x % 2 == 0).collect()"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "code",
1163 | "execution_count": 10,
1164 | "metadata": {},
1165 | "outputs": [
1166 | {
1167 | "data": {
1168 | "text/plain": [
1169 | "[1, 2, 3]"
1170 | ]
1171 | },
1172 | "execution_count": 10,
1173 | "metadata": {},
1174 | "output_type": "execute_result"
1175 | }
1176 | ],
1177 | "source": [
1178 | "# distinct(numPartitions=None)\n",
1179 | "# 返回一个没有重复元素的新RDD,就是去重处理\n",
1180 | "sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "execution_count": 3,
1186 | "metadata": {},
1187 | "outputs": [
1188 | {
1189 | "name": "stdout",
1190 | "output_type": "stream",
1191 | "text": [
1192 | "11\n",
1193 | "20\n"
1194 | ]
1195 | }
1196 | ],
1197 | "source": [
1198 |     "# sample(withReplacement, fraction, seed=None)\n",
1199 |     "# Returns a sampled subset of this RDD.\n",
1200 |     "# withReplacement: whether to sample with replacement\n",
1201 |     "# fraction: expected size of the sample as a fraction of the RDD; each element has the same probability of being chosen, a number in [0, 1]\n",
1202 |     "# seed: seed for the random number generator\n",
1203 | "rdd = sc.parallelize(range(100), 4)\n",
1204 | "print(rdd.sample(False, 0.1, 81).count())\n",
1205 | "print(rdd.sample(False, 0.2, 81).count())"
1206 | ]
1207 | },
1208 | {
1209 | "cell_type": "code",
1210 | "execution_count": 28,
1211 | "metadata": {},
1212 | "outputs": [
1213 | {
1214 | "name": "stdout",
1215 | "output_type": "stream",
1216 | "text": [
1217 | "[('a', 0), ('b', 0), ('a', 1), ('a', 2), ('b', 1), ('b', 2), ('a', 3), ('a', 4), ('a', 5), ('a', 6)]\n",
1218 | "209 98\n"
1219 | ]
1220 | },
1221 | {
1222 | "ename": "AttributeError",
1223 | "evalue": "'ResultIterable' object has no attribute 'takeSample'",
1224 | "traceback": [
1225 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1226 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
1227 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtakeSample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1228 | "\u001b[0;31mAttributeError\u001b[0m: 'ResultIterable' object has no attribute 'takeSample'"
1229 | ],
1230 | "output_type": "error"
1231 | }
1232 | ],
1233 | "source": [
1234 |     "# sampleByKey(withReplacement, fractions, seed=None)\n",
1235 |     "# Returns a subset of this RDD sampled by key (stratified sampling), using a different sampling rate for each key as specified by fractions.\n",
1236 |     "# fractions is passed as a dict mapping each key to its sampling rate.\n",
1237 | "fractions = {\"a\": 0.2, \"b\": 0.1}\n",
1238 | "rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))\n",
1239 | "sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())\n",
1240 | "print(rdd.take(10))\n",
1241 | "print(len(sample['a']), len(sample['b']))\n",
1242 | "print(sorted(sample['a'])[:10])"
1243 | ]
1244 | },
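  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell above fails because `groupByKey()` yields `ResultIterable` values, which are plain iterables rather than RDDs. Below is an added sketch of one way around it, assuming `sample` from the previous cell is still defined."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ResultIterable has no RDD methods; convert it to a list (or re-parallelize it) before sampling\n",
    "import random\n",
    "b_values = list(sample['b'])\n",
    "print(random.sample(b_values, 6))   # 6 values drawn without replacement from key 'b'\n",
    "print(sorted(sample['a'])[:10])"
   ]
  },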
1245 | {
1246 | "cell_type": "code",
1247 | "execution_count": 26,
1248 | "metadata": {},
1249 | "outputs": [
1250 | {
1251 | "data": {
1252 | "text/plain": [
1253 | "1.0"
1254 | ]
1255 | },
1256 | "execution_count": 26,
1257 | "metadata": {},
1258 | "output_type": "execute_result"
1259 | }
1260 | ],
1261 | "source": [
1262 | "# sampleStdev()\n",
1263 | "# 计算这个RDD元素的样本标准差(通过除以N-1而不是N)来修正估计标准差的偏差。\n",
1264 | "sc.parallelize([1, 2, 3]).sampleStdev()"
1265 | ]
1266 | },
1267 | {
1268 | "cell_type": "code",
1269 | "execution_count": 27,
1270 | "metadata": {},
1271 | "outputs": [
1272 | {
1273 | "data": {
1274 | "text/plain": [
1275 | "1.0"
1276 | ]
1277 | },
1278 | "execution_count": 27,
1279 | "metadata": {},
1280 | "output_type": "execute_result"
1281 | }
1282 | ],
1283 | "source": [
1284 | "# sampleVariance()\n",
1285 | "# 计算这个RDD元素的样本方差(它纠正了通过除以N-1而不是N来估计方差的偏差)。\n",
1286 | "sc.parallelize([1, 2, 3]).sampleVariance()"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": 31,
1292 | "metadata": {},
1293 | "outputs": [
1294 | {
1295 | "name": "stdout",
1296 | "output_type": "stream",
1297 | "text": [
1298 | "[6, 9, 9, 8, 0, 7, 0, 8, 3, 6, 7, 8]\n",
1299 | "5\n"
1300 | ]
1301 | }
1302 | ],
1303 | "source": [
1304 | "# takeSample(withReplacement, num, seed=None)\n",
1305 | "# 返回这个RDD的一个固定大小的采样子集。\n",
1306 | "# 只有当结果数组被认为是很小的时候,才应该使用这个方法,因为所有的数据都被加载到驱动程序的内存中。\n",
1307 | "rdd = sc.parallelize(range(0, 10))\n",
1308 | "print(rdd.takeSample(True, 12, 1))\n",
1309 | "print(len(rdd.takeSample(False, 5, 2)))"
1310 | ]
1311 | },
1312 | {
1313 | "cell_type": "code",
1314 | "execution_count": 32,
1315 | "metadata": {},
1316 | "outputs": [
1317 | {
1318 | "data": {
1319 | "text/plain": [
1320 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1321 | ]
1322 | },
1323 | "execution_count": 32,
1324 | "metadata": {},
1325 | "output_type": "execute_result"
1326 | }
1327 | ],
1328 | "source": [
1329 | "# toLocalIterator()\n",
1330 | "# 返回包含这个RDD中所有元素的迭代器。迭代器将消耗与此RDD中最大分区相同的内存。\n",
1331 | "rdd = sc.parallelize(range(10))\n",
1332 | "[x for x in rdd.toLocalIterator()]"
1333 | ]
1334 | },
1335 | {
1336 | "cell_type": "code",
1337 | "execution_count": 39,
1338 | "metadata": {},
1339 | "outputs": [
1340 | {
1341 | "data": {
1342 | "text/plain": [
1343 | "[1, 1, 2, 3, 1, 1, 2, 3]"
1344 | ]
1345 | },
1346 | "execution_count": 39,
1347 | "metadata": {},
1348 | "output_type": "execute_result"
1349 | }
1350 | ],
1351 | "source": [
1352 | "# union(other)\n",
1353 | "# 返回这个RDD和另一个的结合。不去重\n",
1354 | "rdd = sc.parallelize([1, 1, 2, 3])\n",
1355 | "rdd.union(rdd).collect()"
1356 | ]
1357 | },
1358 | {
1359 | "cell_type": "code",
1360 | "execution_count": 40,
1361 | "metadata": {},
1362 | "outputs": [
1363 | {
1364 | "data": {
1365 | "text/plain": [
1366 | "[2, 1, 3]"
1367 | ]
1368 | },
1369 | "execution_count": 40,
1370 | "metadata": {},
1371 | "output_type": "execute_result"
1372 | }
1373 | ],
1374 | "source": [
1375 |     "# intersection(other)\n",
1376 |     "# Returns the intersection of this RDD and the other one. The output will not contain any duplicate elements, even if the input RDDs did.\n",
1377 |     "# This method performs a shuffle internally.\n",
1378 | "rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])\n",
1379 | "rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])\n",
1380 | "rdd1.intersection(rdd2).collect()"
1381 | ]
1382 | },
1383 | {
1384 | "cell_type": "code",
1385 | "execution_count": 41,
1386 | "metadata": {},
1387 | "outputs": [
1388 | {
1389 | "data": {
1390 | "text/plain": [
1391 | "[('a', 1), ('b', 4), ('b', 5)]"
1392 | ]
1393 | },
1394 | "execution_count": 41,
1395 | "metadata": {},
1396 | "output_type": "execute_result"
1397 | }
1398 | ],
1399 | "source": [
1400 |     "# subtract(other, numPartitions=None)\n",
1401 |     "# Returns the elements of this RDD that are not present in the other RDD.\n",
1402 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 3)])\n",
1403 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n",
1404 | "sorted(x.subtract(y).collect())"
1405 | ]
1406 | },
1407 | {
1408 | "cell_type": "code",
1409 | "execution_count": 42,
1410 | "metadata": {},
1411 | "outputs": [
1412 | {
1413 | "data": {
1414 | "text/plain": [
1415 | "[('b', 4), ('b', 5)]"
1416 | ]
1417 | },
1418 | "execution_count": 42,
1419 | "metadata": {},
1420 | "output_type": "execute_result"
1421 | }
1422 | ],
1423 | "source": [
1424 |     "# subtractByKey(other, numPartitions=None)\n",
1425 |     "# Returns each (key, value) pair of this RDD whose key has no matching pair in the other RDD.\n",
1426 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 2)])\n",
1427 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n",
1428 | "sorted(x.subtractByKey(y).collect())"
1429 | ]
1430 | },
1431 | {
1432 | "cell_type": "code",
1433 | "execution_count": 43,
1434 | "metadata": {},
1435 | "outputs": [
1436 | {
1437 | "data": {
1438 | "text/plain": [
1439 | "[(1, 1), (1, 2), (2, 1), (2, 2)]"
1440 | ]
1441 | },
1442 | "execution_count": 43,
1443 | "metadata": {},
1444 | "output_type": "execute_result"
1445 | }
1446 | ],
1447 | "source": [
1448 | "# cartesian(other)\n",
1449 | "# 返回这个RDD和另一个RDD的笛卡尔积,也就是所有成对的元素(a,b)的RDD,a为本身RDD,b为其他RDD\n",
1450 | "rdd = sc.parallelize([1, 2])\n",
1451 | "sorted(rdd.cartesian(rdd).collect())"
1452 | ]
1453 | },
1454 | {
1455 | "cell_type": "code",
1456 | "execution_count": 44,
1457 | "metadata": {},
1458 | "outputs": [
1459 | {
1460 | "data": {
1461 | "text/plain": [
1462 | "[('a', (1, 2)), ('a', (1, 3))]"
1463 | ]
1464 | },
1465 | "execution_count": 44,
1466 | "metadata": {},
1467 | "output_type": "execute_result"
1468 | }
1469 | ],
1470 | "source": [
1471 | "# join(other, numPartitions=None)\n",
1472 | "# 返回一个包含所有成对元素的RDD,其中包含在self和other中匹配的键。每一对元素都将作为一个(k,(v1,v2))返回,其中(k,v1)为self(k,v2)为other。\n",
1473 | "# 在集群中执行散列连接\n",
1474 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1475 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n",
1476 | "sorted(x.join(y).collect())"
1477 | ]
1478 | },
1479 | {
1480 | "cell_type": "code",
1481 | "execution_count": 45,
1482 | "metadata": {},
1483 | "outputs": [
1484 | {
1485 | "data": {
1486 | "text/plain": [
1487 | "[('a', (2, 1)), ('b', (None, 4))]"
1488 | ]
1489 | },
1490 | "execution_count": 45,
1491 | "metadata": {},
1492 | "output_type": "execute_result"
1493 | }
1494 | ],
1495 | "source": [
1496 | "# rightOuterJoin(other, numPartitions=None)\n",
1497 | "# 对于在otherRDD中的每一个(k, w)元素,生成的RDD中有k键的生成(k, (v, w)), 如果没有k键的话也要生成none补位(k,(None, w))\n",
1498 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1499 | "y = sc.parallelize([(\"a\", 2)])\n",
1500 | "sorted(y.rightOuterJoin(x).collect())"
1501 | ]
1502 | },
1503 | {
1504 | "cell_type": "code",
1505 | "execution_count": 46,
1506 | "metadata": {},
1507 | "outputs": [
1508 | {
1509 | "data": {
1510 | "text/plain": [
1511 | "[('a', (1, 2)), ('b', (4, None))]"
1512 | ]
1513 | },
1514 | "execution_count": 46,
1515 | "metadata": {},
1516 | "output_type": "execute_result"
1517 | }
1518 | ],
1519 | "source": [
1520 |     "# leftOuterJoin(other, numPartitions=None)\n",
1521 |     "# For each (k, v) in self, the result contains (k, (v, w)) for every w with key k in other, or (k, (v, None)) if other has no such key; the left value always comes first. Otherwise it mirrors rightOuterJoin.\n",
1522 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1523 | "y = sc.parallelize([(\"a\", 2)])\n",
1524 | "sorted(x.leftOuterJoin(y).collect())"
1525 | ]
1526 | },
1527 | {
1528 | "cell_type": "code",
1529 | "execution_count": 48,
1530 | "metadata": {},
1531 | "outputs": [
1532 | {
1533 | "name": "stdout",
1534 | "output_type": "stream",
1535 | "text": [
1536 | "500\n",
1537 | "192 308\n"
1538 | ]
1539 | }
1540 | ],
1541 | "source": [
1542 | "# randomSplit(weights, seed=None)\n",
1543 | "# 将RDD按照一定的比例随机分开\n",
1544 | "rdd = sc.parallelize(range(500), 1)\n",
1545 | "rdd1, rdd2 = rdd.randomSplit([2, 3], 17)\n",
1546 | "print(len(rdd1.collect() + rdd2.collect()))\n",
1547 | "print(rdd1.count(), rdd2.count())"
1548 | ]
1549 | },
1550 | {
1551 | "cell_type": "code",
1552 | "execution_count": null,
1553 | "metadata": {},
1554 | "outputs": [],
1555 | "source": []
1556 | }
1557 | ],
1558 | "metadata": {
1559 | "kernelspec": {
1560 | "display_name": "Python 3",
1561 | "language": "python",
1562 | "name": "python3"
1563 | },
1564 | "language_info": {
1565 | "codemirror_mode": {
1566 | "name": "ipython",
1567 | "version": 3
1568 | },
1569 | "file_extension": ".py",
1570 | "mimetype": "text/x-python",
1571 | "name": "python",
1572 | "nbconvert_exporter": "python",
1573 | "pygments_lexer": "ipython3",
1574 | "version": "3.6.4"
1575 | }
1576 | },
1577 | "nbformat": 4,
1578 | "nbformat_minor": 2
1579 | }
1580 |
--------------------------------------------------------------------------------
/pyspark.ml.classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 数据准备"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from pyspark.sql import SparkSession\n",
17 | "from pyspark import SparkConf, SparkContext\n",
18 | "spark = SparkSession.builder.master('local[1]').appName('learn_ml').getOrCreate()"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {
34 | "scrolled": false
35 | },
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/plain": [
40 | "23"
41 | ]
42 | },
43 | "execution_count": 3,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "len(df0.columns)"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "看看分类的类别"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "**查看是否有na值**"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "False"
75 | ]
76 | },
77 | "execution_count": 4,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "# df0.toPandas().isna().sum()\n",
84 | "df0.toPandas().isna().values.any()"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 5,
90 | "metadata": {
91 | "scrolled": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
96 | "old_columns_names = df0.columns\n",
97 | "new_columns_names = [name+'-new' for name in old_columns_names]\n",
98 | "for i in range(len(old_columns_names)):\n",
99 | " indexer = StringIndexer(inputCol=old_columns_names[i], outputCol=new_columns_names[i])\n",
100 | " df0 = indexer.fit(df0).transform(df0)\n",
101 | "vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')\n",
102 | "df0 = vecAss.transform(df0)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 6,
108 | "metadata": {
109 | "scrolled": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "df0 = df0.withColumnRenamed(new_columns_names[0], 'label')"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 7,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "dfi = df0.select(['label', 'features'])"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 8,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# df0.describe().toPandas().T"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 9,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "+-----+------------------------------------------------------------------------------+\n",
144 | "|label|features |\n",
145 | "+-----+------------------------------------------------------------------------------+\n",
146 | "|1.0 |(22,[1,3,4,7,8,9,10,19,20,21],[1.0,1.0,6.0,1.0,7.0,1.0,2.0,2.0,2.0,4.0]) |\n",
147 | "|0.0 |(22,[1,2,3,4,8,9,10,19,20,21],[1.0,3.0,1.0,4.0,7.0,1.0,3.0,1.0,3.0,1.0]) |\n",
148 | "|0.0 |(22,[0,1,2,3,4,8,9,10,19,20,21],[3.0,1.0,4.0,1.0,5.0,3.0,1.0,3.0,1.0,3.0,5.0])|\n",
149 | "|1.0 |(22,[2,3,4,7,8,9,10,19,20,21],[4.0,1.0,6.0,1.0,3.0,1.0,2.0,2.0,2.0,4.0]) |\n",
150 | "|0.0 |(22,[1,2,6,8,10,18,19,20,21],[1.0,1.0,1.0,7.0,2.0,1.0,1.0,4.0,1.0]) |\n",
151 | "+-----+------------------------------------------------------------------------------+\n",
152 | "only showing top 5 rows\n",
153 | "\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "dfi.show(5, truncate=0)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 10,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "# label = df0.rdd.map(lambda row: row[0])\n",
168 | "# row = df0.rdd.map(lambda row: row[1:])\n",
169 | "# dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','feature'])"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 11,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "train_data, test_data = dfi.randomSplit([4.0, 1.0], 100)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 12,
184 | "metadata": {
185 | "scrolled": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "# test_data.filter(test_data['label']==1).show(5, truncate=0)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "### 评估器\n",
197 | "**分类(classification)**"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### LogisticRegression :逻辑回归,支持多项逻辑(softmax)和二项逻辑回归"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "pyspark.ml.classification.LogisticRegression(self, featuresCol=\"features\", labelCol=\"label\", predictionCol=\"prediction\", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol=\"probability\", rawPredictionCol=\"rawPrediction\", standardization=True, weightCol=None, aggregationDepth=2, family=\"auto\")\n"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "`\n",
219 | "regParam: 正则化参数(>=0)\n",
220 | "elasticNetParam: ElasticNet混合参数,0-1之间,当alpha为0时,惩罚为L2正则化,当为1时为L1正则化\n",
221 | "fitIntercept: 是否拟合一个截距项\n",
222 | "Standardization: 是否在拟合数据之前对数据进行标准化\n",
223 | "aggregationDepth: 树聚合所建议的深度(>=2)\n",
224 | "`"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 20,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "from pyspark.ml.classification import LogisticRegression\n",
234 | "blor = LogisticRegression(regParam=0.01)\n",
235 | "blorModel = blor.fit(train_data)\n",
236 | "result = blorModel.transform(test_data)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 21,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/plain": [
247 | "0.9661954517516902"
248 | ]
249 | },
250 | "execution_count": 21,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "result.filter(result.label == result.prediction).count()/result.count()"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 22,
262 | "metadata": {
263 | "collapsed": true
264 | },
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "+--------------------+--------------------+\n",
271 | "| FPR| TPR|\n",
272 | "+--------------------+--------------------+\n",
273 | "| 0.0| 0.0|\n",
274 | "| 0.0|0.020466901183242726|\n",
275 | "| 0.0| 0.04093380236648545|\n",
276 | "|5.934718100890207E-4|0.060761112887751836|\n",
277 | "|0.001186943620178...| 0.08058842340901823|\n",
278 | "|0.001483679525222552| 0.10073552926127279|\n",
279 | "|0.001780415430267...| 0.12088263511352734|\n",
280 | "|0.002373887240356083| 0.14070994563479372|\n",
281 | "|0.002670623145400...| 0.1608570514870483|\n",
282 | "|0.002670623145400...| 0.18132395267029103|\n",
283 | "|0.002670623145400...| 0.20179085385353374|\n",
284 | "|0.002670623145400...| 0.22225775503677647|\n",
285 | "|0.002670623145400...| 0.24272465622001918|\n",
286 | "|0.002670623145400...| 0.2631915574032619|\n",
287 | "|0.002670623145400...| 0.2836584585865046|\n",
288 | "|0.002670623145400...| 0.30412535976974736|\n",
289 | "|0.002670623145400...| 0.3245922609529901|\n",
290 | "|0.002670623145400...| 0.34505916213623283|\n",
291 | "|0.002670623145400...| 0.3655260633194755|\n",
292 | "|0.002670623145400...| 0.38599296450271825|\n",
293 | "+--------------------+--------------------+\n",
294 | "only showing top 20 rows\n",
295 | "\n",
296 | "+--------------------+------------------+\n",
297 | "| recall| precision|\n",
298 | "+--------------------+------------------+\n",
299 | "| 0.0| 1.0|\n",
300 | "|0.020466901183242726| 1.0|\n",
301 | "| 0.04093380236648545| 1.0|\n",
302 | "|0.060761112887751836|0.9895833333333334|\n",
303 | "| 0.08058842340901823| 0.984375|\n",
304 | "| 0.10073552926127279| 0.984375|\n",
305 | "| 0.12088263511352734| 0.984375|\n",
306 | "| 0.14070994563479372|0.9821428571428571|\n",
307 | "| 0.1608570514870483| 0.982421875|\n",
308 | "| 0.18132395267029103| 0.984375|\n",
309 | "| 0.20179085385353374| 0.9859375|\n",
310 | "| 0.22225775503677647|0.9872159090909091|\n",
311 | "| 0.24272465622001918| 0.98828125|\n",
312 | "| 0.2631915574032619|0.9891826923076923|\n",
313 | "| 0.2836584585865046|0.9899553571428571|\n",
314 | "| 0.30412535976974736| 0.990625|\n",
315 | "| 0.3245922609529901| 0.9912109375|\n",
316 | "| 0.34505916213623283|0.9917279411764706|\n",
317 | "| 0.3655260633194755| 0.9921875|\n",
318 | "| 0.38599296450271825|0.9925986842105263|\n",
319 | "+--------------------+------------------+\n",
320 | "only showing top 20 rows\n",
321 | "\n"
322 | ]
323 | }
324 | ],
325 | "source": [
326 | "blorModel.\n",
327 | "blorModel.summary.pr.show()"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "### 决策树\n",
335 | "pyspark.ml.classification.DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', seed=None)"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "`\n",
343 | "checkpointInterval:设置checkpoint区间(>=1),或宕掉checkpoint(-1),例如10意味着缓冲区(cache)将会每迭代10次获得一次checkpoint\n",
344 | "fit(datasset,params=None)\n",
345 | "impurity: 信息增益计算的准则,选项\"entropy\", \"gini\"\n",
346 | "maxBins:连续特征离散化的最大分箱,必须>=2 并且>=分类特征分类的数量\n",
347 | "maxDepth:树的最大深度\n",
348 | "minInfoGain:分割结点所需的最小的信息增益\n",
349 | "minInstancesPerNode:每个结点最小实例个数\n",
350 | "`"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 13,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "from pyspark.ml.classification import DecisionTreeClassifier\n",
360 | "dt = DecisionTreeClassifier(maxDepth=5)\n",
361 | "dtModel = dt.fit(train_data)\n",
362 | "result = dtModel.transform(test_data)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 14,
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "1.0"
374 | ]
375 | },
376 | "execution_count": 14,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# accuracy\n",
383 | "result.filter(result.label == result.prediction).count()/result.count()"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "### 梯度增强树\n",
391 | "pyspark.ml.classification.GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType='logistic', maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)"
392 | ]
393 | },
394 | {
395 | "cell_type": "markdown",
396 | "metadata": {},
397 | "source": [
398 | "`\n",
399 | "checkpointInterval: 同DecisionTreeClassifier\n",
400 | "fit(dataset,params=None)方法\n",
401 | "lossType: GBT要最小化的损失函数,选项:logistic\n",
402 | "maxBins: 同DecisionTreeClassifier\n",
403 | "maxDepth: 同DecisionTreeClassifier\n",
404 | "maxIter: 同DecisionTreeClassifier\n",
405 | "minInfoGain: 同DecisionTreeClassifier\n",
406 | "minInstancesPerNode:同DecisionTreeClassifier\n",
407 | "stepSize: 每次迭代优化的步长\n",
408 | "subsamplingRate: 同RandomForesetClassier\n",
409 | "`"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 16,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "from pyspark.ml.classification import GBTClassifier\n",
419 | "gbt = GBTClassifier(maxDepth=5)\n",
420 | "gbtModel = gbt.fit(train_data)\n",
421 | "result = gbtModel.transform(test_data)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 17,
427 | "metadata": {},
428 | "outputs": [
429 | {
430 | "data": {
431 | "text/plain": [
432 | "1.0"
433 | ]
434 | },
435 | "execution_count": 17,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "result.filter(result.label == result.prediction).count()/result.count()"
442 | ]
443 | },
444 | {
445 | "cell_type": "markdown",
446 | "metadata": {},
447 | "source": [
448 | "### 随机森林\n",
449 | "pyspark.ml.classification.RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', numTrees=20, featureSubsetStrategy='auto', seed=None, subsamplingRate=1.0)"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "`\n",
457 | "checkpoint:同DecisionTreeClassifier\n",
458 | "featureSubsetStrategy:每棵树上要分割的特征数目,选项为\"auto\",\"all\", \"onethird\", \"sqrt\", \"log2\", \"(0.0-1.0],\"[1-n]\"\n",
459 | "fit(dataset,params=None)方法\n",
460 | "impurity: 同DecisionTreeClassifier\n",
461 | "maxBins:同DecisionTreeClassifier\n",
462 | "maxDepth:同DecisionTreeClassifier\n",
463 | "minInfoGain: 同DecisionTreeClassifier\n",
464 | "numTrees: 训练树的个数\n",
465 | "subsamplingRate: 用于训练每颗决策树的样本个数,区间(0,1]\n",
466 | "`"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 13,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "from pyspark.ml.classification import RandomForestClassifier\n",
476 | "rf = RandomForestClassifier(numTrees=10, maxDepth=5)\n",
477 | "rfModel = rf.fit(train_data)\n",
478 | "# model.featureImportances\n",
479 | "result = rfModel.transform(test_data)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 19,
485 | "metadata": {
486 | "scrolled": true
487 | },
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/plain": [
492 | "1.0"
493 | ]
494 | },
495 | "execution_count": 19,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "result.filter(result.label == result.prediction).count()/result.count()"
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "metadata": {},
507 | "source": [
508 | "### 朴素贝叶斯\n",
509 | "pyspark.ml.classification.NaiveBayes(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', smoothing=1.0, modelType='multinomial', thresholds=None, weightCol=None)"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "`\n",
517 | "modelType: 选项:multinomial(多项式)和bernoulli(伯努利)\n",
518 | "smoothing: 平滑参数,应该>=0,默认为1.0\n",
519 | "`"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 24,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "from pyspark.ml.classification import NaiveBayes\n",
529 | "nb = NaiveBayes()\n",
530 | "nbModel = nb.fit(train_data)\n",
531 | "result = nbModel.transform(test_data)"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 25,
537 | "metadata": {},
538 | "outputs": [
539 | {
540 | "data": {
541 | "text/plain": [
542 | "0.9231714812538414"
543 | ]
544 | },
545 | "execution_count": 25,
546 | "metadata": {},
547 | "output_type": "execute_result"
548 | }
549 | ],
550 | "source": [
551 | "result.filter(result.label == result.prediction).count()/result.count()"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "### LinearSVC 支持向量机\n",
559 | "pyspark.ml.classification.LinearSVC(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, tol=1e-06, rawPredictionCol='rawPrediction', fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 17,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "from pyspark.ml.classification import LinearSVC\n",
569 | "svm = LinearSVC(maxIter=10, regPcaram=0.01)\n",
570 | "svmModel = svm.fit(train_data)\n",
571 | "result = svmModel.transform(test_data)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 18,
577 | "metadata": {
578 | "scrolled": true
579 | },
580 | "outputs": [
581 | {
582 | "data": {
583 | "text/plain": [
584 | "0.9797172710510141"
585 | ]
586 | },
587 | "execution_count": 18,
588 | "metadata": {},
589 | "output_type": "execute_result"
590 | }
591 | ],
592 | "source": [
593 | "# accuracy\n",
594 | "result.filter(result.label == result.prediction).count()/result.count()"
595 | ]
596 | }
597 | ],
598 | "metadata": {
599 | "kernelspec": {
600 | "display_name": "Python 3",
601 | "language": "python",
602 | "name": "python3"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 3
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython3",
614 | "version": "3.6.4"
615 | }
616 | },
617 | "nbformat": 4,
618 | "nbformat_minor": 2
619 | }
620 |
--------------------------------------------------------------------------------
/pyspark.ml.feature.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession \n",
10 | "spark = SparkSession.builder.appName('learn_ml').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "ml 模块 三个抽象类:\n",
18 | "转换器(Transformer)、评估器(Estimator)和管道(Pipeline)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### pyspark.ml.feature.Binarizer(self, threshold=0.0, inputCol=None, outputCol=None)\n",
26 | "根据指定的阈值将连续变量转换为对应的二进制"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "df = spark.createDataFrame([(0.5,),(1.0,),(1.5,)], ['values'])"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "+------+\n",
48 | "|values|\n",
49 | "+------+\n",
50 | "| 0.5|\n",
51 | "| 1.0|\n",
52 | "| 1.5|\n",
53 | "+------+\n",
54 | "\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "df.show()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "+------+--------+\n",
72 | "|values|features|\n",
73 | "+------+--------+\n",
74 | "| 0.5| 0.0|\n",
75 | "| 1.0| 1.0|\n",
76 | "| 1.5| 1.0|\n",
77 | "+------+--------+\n",
78 | "\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "from pyspark.ml.feature import Binarizer\n",
84 | "binarizer = Binarizer(threshold=0.7, inputCol=\"values\", outputCol=\"features\")\n",
85 | "binarizer.transform(df).show()"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "+------+-----+\n",
98 | "|values|freqs|\n",
99 | "+------+-----+\n",
100 | "| 0.5| 0.0|\n",
101 | "| 1.0| 1.0|\n",
102 | "| 1.5| 1.0|\n",
103 | "+------+-----+\n",
104 | "\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# 通过setParams,更改配置\n",
110 | "binarizer.setParams(outputCol=\"freqs\").transform(df).show()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 6,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "name": "stdout",
120 | "output_type": "stream",
121 | "text": [
122 | "+------+------+\n",
123 | "|values|vector|\n",
124 | "+------+------+\n",
125 | "| 0.5| 1.0|\n",
126 | "| 1.0| 1.0|\n",
127 | "| 1.5| 1.0|\n",
128 | "+------+------+\n",
129 | "\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "# 通过params更改配置\n",
135 | "params = {binarizer.threshold: -0.5, binarizer.outputCol: \"vector\"}\n",
136 | "binarizer.transform(df, params).show()"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 7,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "# 保存配置\n",
146 | "import os\n",
147 | "#temp_path = os.getcwd()\n",
148 | "temp_path = os.path.abspath('.')\n",
149 | "binarizerPath = \"file://{}/binarizer\".format(temp_path)\n",
150 | "binarizer.save(binarizerPath)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 8,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "True"
162 | ]
163 | },
164 | "execution_count": 8,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "# 加载配置\n",
171 | "loadedBinarizer = Binarizer.load(binarizerPath)\n",
172 | "loadedBinarizer.getThreshold() == binarizer.getThreshold()"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### pyspark.ml.feature.Bucketizer(self, splits=None, inputCol=None, outputCol=None, handleInvalid=\"error\")\n",
180 | "与Binarizer类似,该方法根据阈值列表(分割的参数),将连续变量转换为多项值(连续变量离散化到指定的范围区间)\n"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 9,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "+------+-------+\n",
193 | "|values|buckets|\n",
194 | "+------+-------+\n",
195 | "| 0.1| 0.0|\n",
196 | "| 0.4| 0.0|\n",
197 | "| 1.2| 1.0|\n",
198 | "| 1.5| 2.0|\n",
199 | "| NaN| 3.0|\n",
200 | "| NaN| 3.0|\n",
201 | "+------+-------+\n",
202 | "\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "from pyspark.ml.feature import Bucketizer\n",
208 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n",
209 | "df = spark.createDataFrame(values, [\"values\"])\n",
210 | "# splits 为分类区间\n",
211 | "bucketizer = Bucketizer(splits=[-float(\"inf\"), 0.5, 1.4, float(\"inf\")],inputCol=\"values\", outputCol=\"buckets\")\n",
212 | "# 这里setHandleInvalid是对nan值进行处理,默认是error:有nan则报错;keep:将nan保留为新分类;skip:忽略nan值\n",
213 | "bucketed = bucketizer.setHandleInvalid(\"keep\").transform(df)\n",
214 | "bucketed.show()"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "+------+---+\n",
227 | "|values| b|\n",
228 | "+------+---+\n",
229 | "| 0.1|0.0|\n",
230 | "| 0.4|0.0|\n",
231 | "| 1.2|1.0|\n",
232 | "| 1.5|2.0|\n",
233 | "| NaN|3.0|\n",
234 | "| NaN|3.0|\n",
235 | "+------+---+\n",
236 | "\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "# 更改配置\n",
242 | "bucketizer.setParams(outputCol=\"b\").transform(df).show()"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "### pyspark.ml.feature.ChiSqSelector(self, numTopFeatures=50, featuresCol=\"features\", outputCol=None, labelCol=\"label\", selectorType=\"numTopFeatures\", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)\n",
250 | "对于分类目标变量(思考分类模型),此功能允许你选择预定义数量的特征(由numTopFeatures参数进行参数化),以便最好地说明目标的变化。该方法需要两部:需要.fit()——可以计算卡方检验,调用.fit()方法,将DataFrame作为参数传入返回一个ChiSqSelectorModel对象,然后可以使用该对象的.transform()方法来转换DataFrame。默认情况下,选择方法是numTopFeatures,默认顶级要素数设置为50。\n",
251 | "percentile 相识于num ,选取百分比的特征\n",
252 | "fpr 选择p-values低于阈值的所有特征,从而控制误差的选择率。\n",
253 | "fdr 使用 Benjamini-Hochberg procedure \n",
254 | "fwe 选择p-values低于阈值的所有特征。阈值按1 / numFeatures缩放"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 11,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stdout",
264 | "output_type": "stream",
265 | "text": [
266 | "+------------------+-----+----------------+\n",
267 | "| features|label|selectedFeatures|\n",
268 | "+------------------+-----+----------------+\n",
269 | "|[0.0,0.0,18.0,1.0]| 1.0| [18.0,1.0]|\n",
270 | "|[0.0,1.0,12.0,0.0]| 0.0| [12.0,0.0]|\n",
271 | "|[1.0,0.0,15.0,0.1]| 0.0| [15.0,0.1]|\n",
272 | "+------------------+-----+----------------+\n",
273 | "\n"
274 | ]
275 | }
276 | ],
277 | "source": [
278 | "from pyspark.ml.linalg import Vectors\n",
279 | "from pyspark.ml.feature import ChiSqSelector\n",
280 | "df = spark.createDataFrame(\n",
281 | "[(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),\n",
282 | "(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),\n",
283 | "(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],\n",
284 | "[\"features\", \"label\"])\n",
285 | "selector = ChiSqSelector(numTopFeatures=2, outputCol=\"selectedFeatures\")\n",
286 | "model = selector.fit(df)\n",
287 | "model.transform(df).show()"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "### pyspark.ml.feature.CountVectorizer(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, outputCol=None)\n",
295 | "从文档集合中提取词汇表并生成向量"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 12,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "+-----+---------------+-------------------------+\n",
308 | "|label|raw |vectors |\n",
309 | "+-----+---------------+-------------------------+\n",
310 | "|0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|\n",
311 | "|1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|\n",
312 | "+-----+---------------+-------------------------+\n",
313 | "\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "from pyspark.ml.feature import CountVectorizer\n",
319 | "df = spark.createDataFrame(\n",
320 | "[(0, [\"a\", \"b\", \"c\"]), (1, [\"a\", \"b\", \"b\", \"c\", \"a\"])],\n",
321 | "[\"label\", \"raw\"])\n",
322 | "cv = CountVectorizer(inputCol=\"raw\", outputCol=\"vectors\")\n",
323 | "model = cv.fit(df)\n",
324 | "model.transform(df).show(truncate=False)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 13,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/plain": [
335 | "['a', 'b', 'c']"
336 | ]
337 | },
338 | "execution_count": 13,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "sorted(model.vocabulary) "
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 14,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "# 保存model\n",
354 | "import os\n",
355 | "#temp_path = os.getcwd()\n",
356 | "temp_path = os.path.abspath('.')\n",
357 | "modelPath = \"file://{}/count-vectorizer-model\".format(temp_path)\n",
358 | "model.save(modelPath)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 15,
364 | "metadata": {},
365 | "outputs": [
366 | {
367 | "data": {
368 | "text/plain": [
369 | "True"
370 | ]
371 | },
372 | "execution_count": 15,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "# 加载model\n",
379 | "from pyspark.ml.feature import CountVectorizerModel\n",
380 | "loadedModel = CountVectorizerModel.load(modelPath)\n",
381 | "loadedModel.vocabulary == model.vocabulary"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "### pyspark.ml.feature.ElementwiseProduct(scalingVec=None, inputCol=None, outputCol=None)\n",
389 | "使用提供的“权重”向量输出每个输入向量的阿达马乘积(即,逐元素乘积)。换句话说,它通过标量乘数缩放数据集的每一列。"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 16,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "+-------------+-------------+\n",
402 | "| values| eprod|\n",
403 | "+-------------+-------------+\n",
404 | "|[2.0,1.0,3.0]|[2.0,2.0,9.0]|\n",
405 | "+-------------+-------------+\n",
406 | "\n",
407 | "+-------------+--------------+\n",
408 | "| values| eprod|\n",
409 | "+-------------+--------------+\n",
410 | "|[2.0,1.0,3.0]|[4.0,3.0,15.0]|\n",
411 | "+-------------+--------------+\n",
412 | "\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "from pyspark.ml.feature import ElementwiseProduct \n",
418 | "from pyspark.ml.linalg import Vectors\n",
419 | "df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], [\"values\"])\n",
420 | "ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),\n",
421 | "inputCol=\"values\", outputCol=\"eprod\")\n",
422 | "ep.transform(df).show()\n",
423 | "ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).show()\n"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "### pyspark.ml.feature.Imputer(*args, **kwargs)\n",
431 | "用于完成缺失值的插补估计器,使用缺失值所在列的平均值或中值。 输入列应该是DoubleType或FloatType。 目前的Imputer不支持分类特征,可能会为分类特征创建不正确的值。\n",
432 | "请注意,平均值/中值是在过滤出缺失值之后计算的。 输入列中的所有Null值都被视为缺失,所以也被归类。 为了计算中位数,使用pyspark.sql.DataFrame.approxQuantile(),相对误差为0.001。\n"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 17,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "+---+---+\n",
445 | "| a| b|\n",
446 | "+---+---+\n",
447 | "|1.0|NaN|\n",
448 | "|2.0|NaN|\n",
449 | "|NaN|3.0|\n",
450 | "|4.0|4.0|\n",
451 | "|5.0|5.0|\n",
452 | "+---+---+\n",
453 | "\n",
454 | "+---+---+\n",
455 | "| a| b|\n",
456 | "+---+---+\n",
457 | "|3.0|4.0|\n",
458 | "+---+---+\n",
459 | "\n",
460 | "+---+---+-----+-----+\n",
461 | "| a| b|out_a|out_b|\n",
462 | "+---+---+-----+-----+\n",
463 | "|1.0|NaN| 1.0| 4.0|\n",
464 | "|2.0|NaN| 2.0| 4.0|\n",
465 | "|NaN|3.0| 3.0| 3.0|\n",
466 | "|4.0|4.0| 4.0| 4.0|\n",
467 | "|5.0|5.0| 5.0| 5.0|\n",
468 | "+---+---+-----+-----+\n",
469 | "\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "from pyspark.ml.feature import Imputer\n",
475 | "df = spark.createDataFrame([(1.0, float(\"nan\")), (2.0, float(\"nan\")), (float(\"nan\"), 3.0),\n",
476 | " (4.0, 4.0), (5.0, 5.0)], [\"a\", \"b\"])\n",
477 | "imputer = Imputer(inputCols=[\"a\", \"b\"], outputCols=[\"out_a\", \"out_b\"])\n",
478 | "model = imputer.fit(df)\n",
479 | "df.show()\n",
480 | "model.surrogateDF.show()\n",
481 | "model.transform(df).show()"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 18,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "name": "stdout",
491 | "output_type": "stream",
492 | "text": [
493 | "+---+---+-----+-----+\n",
494 | "| a| b|out_a|out_b|\n",
495 | "+---+---+-----+-----+\n",
496 | "|1.0|NaN| 1.0| 4.0|\n",
497 | "|2.0|NaN| 2.0| 4.0|\n",
498 | "|NaN|3.0| 2.0| 3.0|\n",
499 | "|4.0|4.0| 4.0| 4.0|\n",
500 | "|5.0|5.0| 5.0| 5.0|\n",
501 | "+---+---+-----+-----+\n",
502 | "\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "imputer.setStrategy(\"median\").setMissingValue(float(\"nan\")).fit(df).transform(df).show()"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### pyspark.ml.feature.MaxAbsScaler(self, inputCol=None, outputCol=None)\n",
515 | "通过分割每个特征中的最大绝对值来单独重新缩放每个特征以范围[-1,1]。 它不会移动/居中数据,因此不会破坏任何稀疏性"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 19,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "name": "stdout",
525 | "output_type": "stream",
526 | "text": [
527 | "+-----+------+\n",
528 | "| a|scaled|\n",
529 | "+-----+------+\n",
530 | "|[1.0]| [0.5]|\n",
531 | "|[2.0]| [1.0]|\n",
532 | "+-----+------+\n",
533 | "\n"
534 | ]
535 | }
536 | ],
537 | "source": [
538 | "from pyspark.ml.feature import MaxAbsScaler\n",
539 | "from pyspark.ml.linalg import Vectors\n",
540 | "df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
541 | "maScaler = MaxAbsScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
542 | "model = maScaler.fit(df)\n",
543 | "model.transform(df).show()"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "### pyspark.ml.feature.MinMaxScaler(self, min=0.0, max=1.0, inputCol=None, outputCol=None)\n",
551 | "使用列汇总统计信息,将每个特征单独重新标定为一个常用范围[min,max],这也称为最小 - 最大标准化或重新标定(注意由于零值可能会被转换为非零值,因此即使对于稀疏输入,转换器的输出也将是DenseVector)。 特征E的重新缩放的值被计算为,数据将被缩放到[0.0,1.0]范围内。\n",
552 | "Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min\n",
553 | "For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)\n"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 42,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "[0.0] [2.0]\n",
566 | "+-----+------+\n",
567 | "| a|scaled|\n",
568 | "+-----+------+\n",
569 | "|[0.0]| [0.0]|\n",
570 | "|[2.0]| [1.0]|\n",
571 | "+-----+------+\n",
572 | "\n"
573 | ]
574 | }
575 | ],
576 | "source": [
577 | "from pyspark.ml.feature import MinMaxScaler\n",
578 | "from pyspark.ml.linalg import Vectors\n",
579 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
580 | "mmScaler = MinMaxScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
581 | "model = mmScaler.fit(df)\n",
582 | "print(model.originalMin, model.originalMax)\n",
583 | "model.transform(df).show()"
584 | ]
585 | },
586 | {
587 | "cell_type": "markdown",
588 | "metadata": {},
589 | "source": [
590 | "### pyspark.ml.feature.NGram(n=2, inputCol=None, outputCol=None)\n",
591 | "一种功能转换器,用于将输入的字符串数组转换为n-gram数组。输入数组中的空值将被忽略。它返回一个n-gram数组,其中每个n-gram由一个以空格分隔的单词串表示。当输入为空时,返回一个空数组。当输入数组长度小于n(每n-gram的元素数)时,不返回n-gram。"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 23,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "+---------------+--------------------+\n",
604 | "| inputTokens| nGrams|\n",
605 | "+---------------+--------------------+\n",
606 | "|[a, b, c, d, e]|[a b, b c, c d, d e]|\n",
607 | "+---------------+--------------------+\n",
608 | "\n"
609 | ]
610 | }
611 | ],
612 | "source": [
613 | "from pyspark.ml.feature import NGram\n",
614 | "from pyspark.sql import Row\n",
615 | "df = spark.createDataFrame([Row(inputTokens=[\"a\", \"b\", \"c\", \"d\", \"e\"])])\n",
616 | "ngram = NGram(n=2, inputCol=\"inputTokens\", outputCol=\"nGrams\")\n",
617 | "ngram.transform(df).show()"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 24,
623 | "metadata": {},
624 | "outputs": [
625 | {
626 | "name": "stdout",
627 | "output_type": "stream",
628 | "text": [
629 | "+---------------+------------------+\n",
630 | "| inputTokens| nGrams|\n",
631 | "+---------------+------------------+\n",
632 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n",
633 | "+---------------+------------------+\n",
634 | "\n"
635 | ]
636 | }
637 | ],
638 | "source": [
639 | "# 更改 n-gram 长度\n",
640 | "ngram.setParams(n=4).transform(df).show()"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 25,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "+---------------+------------------+\n",
653 | "| inputTokens| output|\n",
654 | "+---------------+------------------+\n",
655 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n",
656 | "+---------------+------------------+\n",
657 | "\n"
658 | ]
659 | }
660 | ],
661 | "source": [
662 | "# 临时修改输出列\n",
663 | "ngram.transform(df, {ngram.outputCol: \"output\"}).show()"
664 | ]
665 | },
666 | {
667 | "cell_type": "markdown",
668 | "metadata": {},
669 | "source": [
670 | "### pyspark.ml.feature.Normalizer(self, p=2.0, inputCol=None, outputCol=None)\n",
671 | "使用给定的p范数标准化矢量以得到单位范数(默认为L2)。"
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": 26,
677 | "metadata": {},
678 | "outputs": [
679 | {
680 | "name": "stdout",
681 | "output_type": "stream",
682 | "text": [
683 | "+----------+-------------------+----------+\n",
684 | "| dense| sparse| features|\n",
685 | "+----------+-------------------+----------+\n",
686 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|[0.6,-0.8]|\n",
687 | "+----------+-------------------+----------+\n",
688 | "\n"
689 | ]
690 | }
691 | ],
692 | "source": [
693 | "from pyspark.ml.feature import Normalizer\n",
694 | "from pyspark.ml.linalg import Vectors\n",
695 | "svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})\n",
696 | "df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], [\"dense\", \"sparse\"])\n",
697 | "normalizer = Normalizer(p=2.0, inputCol=\"dense\", outputCol=\"features\")\n",
698 | "normalizer.transform(df).show()"
699 | ]
700 | },
701 | {
702 | "cell_type": "code",
703 | "execution_count": 27,
704 | "metadata": {},
705 | "outputs": [
706 | {
707 | "name": "stdout",
708 | "output_type": "stream",
709 | "text": [
710 | "+----------+-------------------+-------------------+\n",
711 | "| dense| sparse| freqs|\n",
712 | "+----------+-------------------+-------------------+\n",
713 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|(4,[1,3],[0.8,0.6])|\n",
714 | "+----------+-------------------+-------------------+\n",
715 | "\n"
716 | ]
717 | }
718 | ],
719 | "source": [
720 | "normalizer.setParams(inputCol=\"sparse\", outputCol=\"freqs\").transform(df).show()"
721 | ]
722 | },
723 | {
724 | "cell_type": "markdown",
725 | "metadata": {},
726 | "source": [
727 | "### pyspark.ml.feature.OneHotEncoderEstimator(inputCols=None, outputCols=None, handleInvalid='error', dropLast=True)\n",
728 | "(分类列编码为二进制向量列)\n",
729 | "一个热门的编码器,将一列类别索引映射到一列二进制向量,每行至多有一个单值,表示输入类别索引。 例如,对于5个类别,输入值2.0将映射到[0.0,0.0,1.0,0.0]的输出向量。 最后一个类别默认不包含(可通过dropLast进行配置),因为它使向量条目总和为1,因此线性相关。 所以一个4.0的输入值映射到[0.0,0.0,0.0,0.0]。这与scikit-learn的OneHotEncoder不同,后者保留所有类别。 输出向量是稀疏的。\n",
730 | "当handleInvalid配置为“keep”时,会添加一个指示无效值的额外“类别”作为最后一个类别。因此,当dropLast为true时,无效值将被编码为全零向量。"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 28,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "name": "stdout",
740 | "output_type": "stream",
741 | "text": [
742 | "+-----+-------------+\n",
743 | "|input| output|\n",
744 | "+-----+-------------+\n",
745 | "| 0.0|(2,[0],[1.0])|\n",
746 | "| 1.0|(2,[1],[1.0])|\n",
747 | "| 2.0| (2,[],[])|\n",
748 | "+-----+-------------+\n",
749 | "\n"
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "from pyspark.ml.feature import OneHotEncoderEstimator\n",
755 | "from pyspark.ml.linalg import Vectors\n",
756 | "df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], [\"input\"])\n",
757 | "ohe = OneHotEncoderEstimator(inputCols=[\"input\"], outputCols=[\"output\"])\n",
758 | "model = ohe.fit(df)\n",
759 | "model.transform(df).show()"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "metadata": {},
765 | "source": [
766 | "### pyspark.ml.feature.PCA(self, k=None, inputCol=None, outputCol=None)\n",
767 | "PCA训练一个模型将向量投影到前k个主成分的较低维空间。"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": 29,
773 | "metadata": {},
774 | "outputs": [
775 | {
776 | "name": "stdout",
777 | "output_type": "stream",
778 | "text": [
779 | "+---------------------+----------------------------------------+\n",
780 | "|features |pca_features |\n",
781 | "+---------------------+----------------------------------------+\n",
782 | "|(5,[1,3],[1.0,7.0]) |[1.6485728230883807,-4.013282700516296] |\n",
783 | "|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781534,-1.1167972663619026]|\n",
784 | "|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676489,-5.337951427775355] |\n",
785 | "+---------------------+----------------------------------------+\n",
786 | "\n"
787 | ]
788 | }
789 | ],
790 | "source": [
791 | "from pyspark.ml.feature import PCA\n",
792 | "from pyspark.ml.linalg import Vectors\n",
793 | "data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),\n",
794 | " (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),\n",
795 | " (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]\n",
796 | "df = spark.createDataFrame(data,[\"features\"])\n",
797 | "pca = PCA(k=2, inputCol=\"features\", outputCol=\"pca_features\")\n",
798 | "model = pca.fit(df)\n",
799 | "model.transform(df).show(truncate=0)"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 30,
805 | "metadata": {},
806 | "outputs": [
807 | {
808 | "data": {
809 | "text/plain": [
810 | "DenseVector([0.7944, 0.2056])"
811 | ]
812 | },
813 | "execution_count": 30,
814 | "metadata": {},
815 | "output_type": "execute_result"
816 | }
817 | ],
818 | "source": [
819 | "model.explainedVariance"
820 | ]
821 | },
822 | {
823 | "cell_type": "markdown",
824 | "metadata": {},
825 | "source": [
826 | "### pyspark.ml.feature.QuantileDiscretizer(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, handleInvalid=\"error\")\n",
827 | "与Bucketizer方法类似,但QuantileDiscretizer采用具有连续特征的列,并输出具有分箱分类特征的列。可以使用numBuckets参数设置区域的数量。所使用的桶的数量可能小于该值,例如,如果输入的不同值太少而不能创建足够的不同分位数。nan会占用一个新的分类"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 31,
833 | "metadata": {},
834 | "outputs": [
835 | {
836 | "name": "stdout",
837 | "output_type": "stream",
838 | "text": [
839 | "+------+-------+\n",
840 | "|values|buckets|\n",
841 | "+------+-------+\n",
842 | "| 0.1| 0.0|\n",
843 | "| 0.4| 1.0|\n",
844 | "| 1.2| 1.0|\n",
845 | "| 1.5| 1.0|\n",
846 | "| NaN| 2.0|\n",
847 | "| NaN| 2.0|\n",
848 | "+------+-------+\n",
849 | "\n"
850 | ]
851 | }
852 | ],
853 | "source": [
854 | "from pyspark.ml.feature import QuantileDiscretizer\n",
855 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n",
856 | "df = spark.createDataFrame(values, [\"values\"])\n",
857 | "qds = QuantileDiscretizer(numBuckets=2,\n",
858 | " inputCol=\"values\", outputCol=\"buckets\", relativeError=0.01, handleInvalid=\"error\")\n",
859 | "bucketizer = qds.fit(df)\n",
860 | "qds.setHandleInvalid(\"keep\").fit(df).transform(df).show()"
861 | ]
862 | },
863 | {
864 | "cell_type": "markdown",
865 | "metadata": {},
866 | "source": [
867 | "### pyspark.ml.feature.RegexTokenizer(minTokenLength=1, gaps=True, pattern='\\s+', inputCol=None, outputCol=None, toLowercase=True)\n",
868 | "基于java正则表达式的标记生成器"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": 32,
874 | "metadata": {},
875 | "outputs": [
876 | {
877 | "name": "stdout",
878 | "output_type": "stream",
879 | "text": [
880 | "+------+---------+\n",
881 | "| text| words|\n",
882 | "+------+---------+\n",
883 | "|A B c|[a, b, c]|\n",
884 | "+------+---------+\n",
885 | "\n"
886 | ]
887 | }
888 | ],
889 | "source": [
890 | "from pyspark.ml.feature import RegexTokenizer\n",
891 | "df = spark.createDataFrame([(\"A B c\",)], [\"text\"])\n",
892 | "reTokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"words\")\n",
893 | "reTokenizer.transform(df).show()"
894 | ]
895 | },
896 | {
897 | "cell_type": "markdown",
898 | "metadata": {},
899 | "source": [
900 | "### pyspark.ml.feature.SQLTransformer(statement=None)\n",
901 | "实现由SQL语句定义的转换。目前我们只支持SQL语法,"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 33,
907 | "metadata": {},
908 | "outputs": [
909 | {
910 | "name": "stdout",
911 | "output_type": "stream",
912 | "text": [
913 | "+---+---+---+\n",
914 | "| id| v1| v2|\n",
915 | "+---+---+---+\n",
916 | "| 0|1.0|3.0|\n",
917 | "| 2|2.0|5.0|\n",
918 | "+---+---+---+\n",
919 | "\n",
920 | "+---+---+---+---+----+\n",
921 | "| id| v1| v2| v3| v4|\n",
922 | "+---+---+---+---+----+\n",
923 | "| 0|1.0|3.0|4.0| 3.0|\n",
924 | "| 2|2.0|5.0|7.0|10.0|\n",
925 | "+---+---+---+---+----+\n",
926 | "\n"
927 | ]
928 | }
929 | ],
930 | "source": [
931 | "from pyspark.ml.feature import SQLTransformer\n",
932 | "df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], [\"id\", \"v1\", \"v2\"])\n",
933 | "sqlTrans = SQLTransformer(\n",
934 | " statement=\"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__\")\n",
935 | "df.show()\n",
936 | "sqlTrans.transform(df).show()\n"
937 | ]
938 | },
939 | {
940 | "cell_type": "markdown",
941 | "metadata": {},
942 | "source": [
943 | "### pyspark.ml.feature.StandardScaler(self, withMean=False, withStd=True, inputCol=None, outputCol=None)\n",
944 | "(标准化列,使其拥有零均值和等于1的标准差)\n",
945 | "通过使用训练集中样本的列汇总统计消除平均值和缩放到单位方差来标准化特征。使用校正后的样本标准偏差计算“单位标准差”,该标准偏差计算为无偏样本方差的平方根。\n"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 34,
951 | "metadata": {},
952 | "outputs": [
953 | {
954 | "name": "stdout",
955 | "output_type": "stream",
956 | "text": [
957 | "[1.0] [1.4142135623730951]\n",
958 | "+-----+-------------------+\n",
959 | "| a| scaled|\n",
960 | "+-----+-------------------+\n",
961 | "|[0.0]| [0.0]|\n",
962 | "|[2.0]|[1.414213562373095]|\n",
963 | "+-----+-------------------+\n",
964 | "\n"
965 | ]
966 | }
967 | ],
968 | "source": [
969 | "from pyspark.ml.feature import StandardScaler\n",
970 | "from pyspark.ml.linalg import Vectors\n",
971 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
972 | "standardScaler = StandardScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
973 | "model = standardScaler.fit(df)\n",
974 | "print(model.mean, model.std)\n",
975 | "model.transform(df).show()"
976 | ]
977 | },
978 | {
979 | "cell_type": "markdown",
980 | "metadata": {},
981 | "source": [
982 | "### pyspark.ml.feature.StopWordsRemover(inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)\n",
983 | "一个特征转换器,用于过滤掉输入中的停用词。"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 35,
989 | "metadata": {},
990 | "outputs": [
991 | {
992 | "name": "stdout",
993 | "output_type": "stream",
994 | "text": [
995 | "+---------+------+\n",
996 | "| text| words|\n",
997 | "+---------+------+\n",
998 | "|[a, b, c]|[a, c]|\n",
999 | "+---------+------+\n",
1000 | "\n"
1001 | ]
1002 | }
1003 | ],
1004 | "source": [
1005 | "from pyspark.ml.feature import StopWordsRemover\n",
1006 | "df = spark.createDataFrame([([\"a\", \"b\", \"c\"],)], [\"text\"])\n",
1007 | "remover = StopWordsRemover(inputCol=\"text\", outputCol=\"words\", stopWords=[\"b\"])\n",
1008 | "remover.transform(df).show()"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "markdown",
1013 | "metadata": {},
1014 | "source": [
1015 | "### pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None)\n",
1016 | "一个标记生成器,它将输入字符串转换为小写,然后用空格分隔它。"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "code",
1021 | "execution_count": 36,
1022 | "metadata": {
1023 | "scrolled": true
1024 | },
1025 | "outputs": [
1026 | {
1027 | "name": "stdout",
1028 | "output_type": "stream",
1029 | "text": [
1030 | "+--------+------------+\n",
1031 | "| text| words|\n",
1032 | "+--------+------------+\n",
1033 | "|ASD VA c|[asd, va, c]|\n",
1034 | "+--------+------------+\n",
1035 | "\n"
1036 | ]
1037 | }
1038 | ],
1039 | "source": [
1040 | "from pyspark.ml.feature import Tokenizer\n",
1041 | "df = spark.createDataFrame([(\"ASD VA c\",)], [\"text\"])\n",
1042 | "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n",
1043 | "tokenizer.transform(df).show()"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "metadata": {},
1049 | "source": [
1050 | "### pyspark.ml.feature.VectorSlicer(inputCol=None, outputCol=None, indices=None, names=None)\n",
1051 | "此类采用特征向量并输出具有原始特征的子阵列的新特征向量。 可以使用索引(setIndices())或名称(setNames())指定要素子集。必须至少选择一个功能。不允许使用重复的功能,因此所选索引和名称之间不能重叠。 输出向量将首先按所选索引(按给定顺序)排序要素,然后是所选名称(按给定顺序)。"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": 37,
1057 | "metadata": {},
1058 | "outputs": [
1059 | {
1060 | "name": "stdout",
1061 | "output_type": "stream",
1062 | "text": [
1063 | "+-----------------------+----------+\n",
1064 | "|features |sliced |\n",
1065 | "+-----------------------+----------+\n",
1066 | "|[-2.0,2.3,0.0,0.0,1.0] |[2.3,1.0] |\n",
1067 | "|[0.0,0.0,0.0,0.0,0.0] |[0.0,0.0] |\n",
1068 | "|[0.6,-1.1,-3.0,4.5,3.3]|[-1.1,3.3]|\n",
1069 | "+-----------------------+----------+\n",
1070 | "\n"
1071 | ]
1072 | }
1073 | ],
1074 | "source": [
1075 | "from pyspark.ml.feature import VectorSlicer\n",
1076 | "from pyspark.ml.linalg import Vectors\n",
1077 | "df = spark.createDataFrame([\n",
1078 | " (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),\n",
1079 | " (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),\n",
1080 | " (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], [\"features\"])\n",
1081 | "vs = VectorSlicer(inputCol=\"features\", outputCol=\"sliced\", indices=[1, 4])\n",
1082 | "vs.transform(df).show(truncate=0)"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "markdown",
1087 | "metadata": {},
1088 | "source": [
1089 | "### pyspark.ml.feature.VectorAssembler(inputCols=None, outputCol=None)\n",
1090 | "将多个列合并到向量列中的要素转换器。"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "code",
1095 | "execution_count": 38,
1096 | "metadata": {},
1097 | "outputs": [
1098 | {
1099 | "name": "stdout",
1100 | "output_type": "stream",
1101 | "text": [
1102 | "+---+---+---+-------------+\n",
1103 | "| a| b| c| features|\n",
1104 | "+---+---+---+-------------+\n",
1105 | "| 1| 0| 3|[1.0,0.0,3.0]|\n",
1106 | "+---+---+---+-------------+\n",
1107 | "\n"
1108 | ]
1109 | }
1110 | ],
1111 | "source": [
1112 | "from pyspark.ml.feature import VectorAssembler\n",
1113 | "df = spark.createDataFrame([(1, 0, 3)], [\"a\", \"b\", \"c\"])\n",
1114 | "vecAssembler = VectorAssembler(inputCols=[\"a\", \"b\", \"c\"], outputCol=\"features\")\n",
1115 | "vecAssembler.transform(df).show()"
1116 | ]
1117 | },
1118 | {
1119 | "cell_type": "markdown",
1120 | "metadata": {},
1121 | "source": [
1122 | "### pyspark.ml.feature.Word2Vec(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000)\n",
1123 | "Word2Vec训练Map(String,Vector)模型,即将单词转换为代码以进行进一步的自然语言处理或机器学习过程。"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 39,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "name": "stdout",
1133 | "output_type": "stream",
1134 | "text": [
1135 | "+----+--------------------+\n",
1136 | "|word| vector|\n",
1137 | "+----+--------------------+\n",
1138 | "| a|[0.09461779892444...|\n",
1139 | "| b|[1.15474212169647...|\n",
1140 | "| c|[-0.3794820010662...|\n",
1141 | "+----+--------------------+\n",
1142 | "\n"
1143 | ]
1144 | }
1145 | ],
1146 | "source": [
1147 | "from pyspark.ml.feature import Word2Vec\n",
1148 | "sent = (\"a b \" * 100 + \"a c \" * 10).split(\" \")\n",
1149 | "doc = spark.createDataFrame([(sent,), (sent,)], [\"sentence\"])\n",
1150 | "word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol=\"sentence\", outputCol=\"model\")\n",
1151 | "model = word2Vec.fit(doc)\n",
1152 | "model.getVectors().show()"
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": 40,
1158 | "metadata": {},
1159 | "outputs": [
1160 | {
1161 | "name": "stdout",
1162 | "output_type": "stream",
1163 | "text": [
1164 | "+----+-------------------+\n",
1165 | "|word| similarity|\n",
1166 | "+----+-------------------+\n",
1167 | "| b|0.25053444504737854|\n",
1168 | "+----+-------------------+\n",
1169 | "\n"
1170 | ]
1171 | },
1172 | {
1173 | "data": {
1174 | "text/plain": [
1175 | "[('b', 0.25053444504737854)]"
1176 | ]
1177 | },
1178 | "execution_count": 40,
1179 | "metadata": {},
1180 | "output_type": "execute_result"
1181 | }
1182 | ],
1183 | "source": [
1184 | "# 找相似字符\n",
1185 | "model.findSynonyms(\"a\", 1).show()\n",
1186 | "model.findSynonymsArray(\"a\", 1)"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 41,
1192 | "metadata": {},
1193 | "outputs": [
1194 | {
1195 | "name": "stdout",
1196 | "output_type": "stream",
1197 | "text": [
1198 | "+----+----------+\n",
1199 | "|word|similarity|\n",
1200 | "+----+----------+\n",
1201 | "| b| 0.251|\n",
1202 | "| c| -0.698|\n",
1203 | "+----+----------+\n",
1204 | "\n"
1205 | ]
1206 | }
1207 | ],
1208 | "source": [
1209 | "from pyspark.sql.functions import format_number as fmt\n",
1210 | "model.findSynonyms(\"a\", 2).select(\"word\", fmt(\"similarity\", 3).alias(\"similarity\")).show()"
1211 | ]
1212 | },
1213 | {
1214 | "cell_type": "code",
1215 | "execution_count": null,
1216 | "metadata": {},
1217 | "outputs": [],
1218 | "source": []
1219 | }
1220 | ],
1221 | "metadata": {
1222 | "kernelspec": {
1223 | "display_name": "Python 3",
1224 | "language": "python",
1225 | "name": "python3"
1226 | },
1227 | "language_info": {
1228 | "codemirror_mode": {
1229 | "name": "ipython",
1230 | "version": 3
1231 | },
1232 | "file_extension": ".py",
1233 | "mimetype": "text/x-python",
1234 | "name": "python",
1235 | "nbconvert_exporter": "python",
1236 | "pygments_lexer": "ipython3",
1237 | "version": "3.6.4"
1238 | }
1239 | },
1240 | "nbformat": 4,
1241 | "nbformat_minor": 2
1242 | }
1243 |
--------------------------------------------------------------------------------
/pyspark.ml.regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession\n",
10 | "spark = SparkSession.builder.appName('learn_regression').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "df_train = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/train.csv', header=True, inferSchema=True, encoding='utf-8')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "df_test = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/test.csv', header=True, inferSchema=True, encoding='utf-8')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 4,
34 | "metadata": {
35 | "scrolled": true
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
43 | "| ID| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| black|lstat| medv|\n",
44 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
45 | "| 1|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98| 24.0|\n",
46 | "| 2|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14| 21.6|\n",
47 | "| 3|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|22.77|\n",
48 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
49 | "only showing top 3 rows\n",
50 | "\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "from pyspark.sql.functions import lit\n",
56 | "df_test = df_test.withColumn('medv', lit(22.77))\n",
57 | "df0 = df_train.union(df_test).sort('ID')\n",
58 | "df0.show(3)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "`\n",
66 | "CRIM-- 城镇人均犯罪率。\n",
67 | "ZN - 占地面积超过25,000平方英尺的住宅用地比例。\n",
68 | "INDUS - 每个城镇非零售业务的比例。\n",
69 | "CHAS - Charles River虚拟变量(如果河流经过则= 1;否则为0)。\n",
70 | "NOX - 氮氧化物浓度(每千万份)。\n",
71 | "RM - 每间住宅的平均房间数。\n",
72 | "AGE - 1940年以前建造的自住单位比例。\n",
73 | "DIS - 加权平均值到五个波士顿就业中心的距离。\n",
74 | "RAD - 径向高速公路的可达性指数。\n",
75 | "TAX - 每10,000美元的全额物业税率。\n",
76 | "PTRATIO - 城镇的学生与教师比例。\n",
77 | "BLACK - 1000(Bk - 0.63)²其中Bk是城镇黑人的比例。\n",
78 | "LSTAT - 人口较低的地位(百分比)。\n",
79 | "MEDV - 自住房屋的中位数价值1000美元。这是目标变量。\n",
80 | "`"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from pyspark.ml.feature import VectorAssembler\n",
90 | "def feature_converter(df):\n",
91 | " vecAss = VectorAssembler(inputCols=df0.columns[1:-1], outputCol='features')\n",
92 | " df_va = vecAss.transform(df)\n",
93 | " return df_va\n",
94 | "\n",
95 | "train_data, test_data = feature_converter(df0).select(['features', 'medv']).randomSplit([7.0, 3.0], 101)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "354"
107 | ]
108 | },
109 | "execution_count": 6,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "train_data.count()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 7,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/plain": [
126 | "152"
127 | ]
128 | },
129 | "execution_count": 7,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "test_data.count()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "## 决策树回归\n",
143 | "`pyspark.ml.regression.DecisionTreeRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', seed=None, varianceCol=None)`"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "`\n",
151 | "fit(dataset, params=None)方法 \n",
152 | "Impurity: 信息增益计算准则,支持选项:variance \n",
153 | "maxBins: 连续特征离散化的最大分箱个数, >=2并且>=任何分类特征的分类个数 \n",
154 | "maxDepth: 最大树深 \n",
155 | "minInfoGain: 分割节点所需最小信息增益 \n",
156 | "minInstancesPerNode: 分割后每个子节点最小实例个数 \n",
157 | "`"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 13,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "from pyspark.ml.regression import DecisionTreeRegressor\n",
167 | "dt = DecisionTreeRegressor(maxDepth=5, varianceCol=\"variance\", labelCol='medv')\n",
168 | "dt_model = dt.fit(train_data)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 14,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "SparseVector(13, {0: 0.0503, 2: 0.011, 4: 0.0622, 5: 0.1441, 6: 0.1852, 7: 0.0262, 8: 0.0022, 9: 0.0886, 10: 0.0142, 12: 0.4159})"
180 | ]
181 | },
182 | "execution_count": 14,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "dt_model.featureImportances"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 15,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "result = dt_model.transform(test_data)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 16,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "+--------------------+-----+------------------+------------------+\n",
210 | "| features| medv| prediction| variance|\n",
211 | "+--------------------+-----+------------------+------------------+\n",
212 | "|[0.03237,0.0,2.18...| 33.4| 34.12833333333334|29.509013888888756|\n",
213 | "|[0.08829,12.5,7.8...| 22.9|21.195135135135136| 4.446162819576342|\n",
214 | "|[0.14455,12.5,7.8...|22.77|22.425999999999995|0.5578440000003866|\n",
215 | "+--------------------+-----+------------------+------------------+\n",
216 | "only showing top 3 rows\n",
217 | "\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "result.show(3)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 17,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "测试数据的均方根误差(rmse):6.555920141221407\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
240 | "dt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
241 | "rmse = dt_evaluator.evaluate(result)\n",
242 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "## 梯度提升树回归 (Gradient-boosted tree regression)\n",
250 | "pyspark.ml.regression.GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType='squared', maxIter=20, stepSize=0.1, seed=None, impurity='variance')"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "`\n",
258 | "fit(dataset,params=None)方法 \n",
259 | "lossType: GBT要最小化的损失函数,可选:squared, absolute \n",
260 | "maxIter: 最大迭代次数 \n",
261 | "stepSize: 每次优化迭代的步长 \n",
262 | "subsamplingRate:用于训练每颗决策树的训练数据集的比例,区间[0,1] \n",
263 | "`"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 8,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "from pyspark.ml.regression import GBTRegressor\n",
273 | "gbt = GBTRegressor(maxIter=10, labelCol='medv', maxDepth=3)\n",
274 | "gbt_model = gbt.fit(train_data)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 9,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/plain": [
285 | "SparseVector(13, {0: 0.0219, 1: 0.0364, 2: 0.0305, 3: 0.0114, 4: 0.0032, 5: 0.1372, 6: 0.146, 7: 0.1033, 8: 0.0518, 9: 0.0819, 10: 0.0883, 11: 0.0048, 12: 0.2832})"
286 | ]
287 | },
288 | "execution_count": 9,
289 | "metadata": {},
290 | "output_type": "execute_result"
291 | }
292 | ],
293 | "source": [
294 | "gbt_model.featureImportances"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 10,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "result = gbt_model.transform(test_data)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 11,
309 | "metadata": {
310 | "scrolled": true
311 | },
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "+--------------------+-----+------------------+\n",
318 | "| features| medv| prediction|\n",
319 | "+--------------------+-----+------------------+\n",
320 | "|[0.03237,0.0,2.18...| 33.4| 31.98716729056085|\n",
321 | "|[0.08829,12.5,7.8...| 22.9|22.254258637918248|\n",
322 | "|[0.14455,12.5,7.8...|22.77|20.066468254729102|\n",
323 | "+--------------------+-----+------------------+\n",
324 | "only showing top 3 rows\n",
325 | "\n"
326 | ]
327 | }
328 | ],
329 | "source": [
330 | "result.show(3)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 20,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]"
342 | ]
343 | },
344 | "execution_count": 20,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "gbt_model.treeWeights"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 12,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "name": "stdout",
360 | "output_type": "stream",
361 | "text": [
362 | "测试数据的均方根误差(rmse):5.624145397622545\n"
363 | ]
364 | }
365 | ],
366 | "source": [
367 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
368 | "gbt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
369 | "rmse = gbt_evaluator.evaluate(result)\n",
370 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## 线性回归(LinearRegression)\n",
378 | "pyspark.ml.regression.LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, solver='auto', weightCol=None, aggregationDepth=2, loss='squaredError', epsilon=1.35)"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "\n",
386 | "学习目标是通过正规化最小化指定的损失函数。这支持两种损失:\n",
387 | "+ squaredError (a.k.a 平方损失)\n",
388 | "+ huber (对于相对较小的误差和相对大的误差的绝对误差的平方误差的混合,我们从训练数据估计比例参数)\n",
389 | "\n",
390 | "支持多种类型的正则化: \n",
391 | "+ None:OLS \n",
392 | "+ L2:ridge回归 \n",
393 | "+ L1:Lasso回归 \n",
394 | "+ L1+L2:elastic回归\n",
395 | "\n",
396 | "注意:与huber loss匹配仅支持none和L2正规化。\n"
397 | ]
398 | },
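    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "A minimal sketch of the huber option under the constraint just noted (no L1 term, so `elasticNetParam` stays at 0.0); the estimator is only constructed here, and `lr_huber` is an illustrative name."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# huber loss is less sensitive to outliers than squared error;\n",
    | "# with loss='huber' only None/L2 regularization is allowed, so elasticNetParam must be 0.0\n",
    | "from pyspark.ml.regression import LinearRegression\n",
    | "lr_huber = LinearRegression(labelCol='medv', loss='huber', epsilon=1.35,\n",
    | "                            regParam=0.1, elasticNetParam=0.0)"
    | ]
    | },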
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "`\n",
404 | "aggregationDepth: 树聚合的深度, >=2 \n",
405 | "elasticNtParam: ElasticNet混合参数,在[0,1]范围内,alpha=0为L2, alpha=1为L1 \n",
406 | "fit(dataset,params=None)方法 \n",
407 | "fitIntercept: 是否拟合截距 \n",
408 | "maxIter: 最大迭代次数 \n",
409 | "regParam:正则化参数 >=0 \n",
410 | "solver: 优化算法,没设置或空则使用”auto” \n",
411 | "standardization: 是否对拟合模型的特征进行标准化 \n",
412 | "`"
413 | ]
414 | },
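    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "A minimal sketch of how `regParam` and `elasticNetParam` map onto the regularization types listed above; the estimators are only constructed, and the variable names are illustrative."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# regParam sets the overall penalty strength; elasticNetParam mixes L1 and L2\n",
    | "from pyspark.ml.regression import LinearRegression\n",
    | "ols_lr     = LinearRegression(labelCol='medv', regParam=0.0)                       # no regularization (OLS)\n",
    | "ridge_lr   = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=0.0)  # pure L2 (ridge)\n",
    | "lasso_lr   = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=1.0)  # pure L1 (lasso)\n",
    | "elastic_lr = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=0.5,  # L1 + L2 mix\n",
    | "                              fitIntercept=True, standardization=True)"
    | ]
    | },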
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "`\n",
420 | "Summary属性\n",
421 | "coefficientStandardErrors \n",
422 | "devianceResiduals: 加权残差 \n",
423 | "explainedVariance: 返回解释的方差回归得分,explainedVariance=1−variance(y−(̂ y))/variance(y) \n",
424 | "meanAbsoluteError: 返回均值绝对误差 \n",
425 | "meanSquaredError: 返回均值平方误 \n",
426 | "numInstances: 预测的实例个数 \n",
427 | "pValues: 系数和截距的双边P值,只有用”normal”solver才可用 \n",
428 | "predictions: 模型transform方法返回的预测 \n",
429 | "r2: R方 \n",
430 | "residuals: 残差 \n",
431 | "rootMeanSquaredError: 均方误差平方根 \n",
432 | "tValues: T统计量\n",
433 | "`"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 23,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "from pyspark.ml.regression import LinearRegression\n",
443 | "lr = LinearRegression(maxIter=10, elasticNetParam=0.8, regParam=0.3, labelCol='medv')\n",
444 | "lr_model = lr.fit(train_data)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 26,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "trainingSummary = lr_model.summary"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 27,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "RMSE: 5.457496\n",
466 | "r2: 0.432071\n"
467 | ]
468 | }
469 | ],
470 | "source": [
471 | "print(\"RMSE: %f\" % trainingSummary.rootMeanSquaredError)\n",
472 | "print(\"r2: %f\" % trainingSummary.r2)"
473 | ]
474 | },
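    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "As referenced in the attribute list above, a minimal sketch of reading a few more fields from `trainingSummary`; `residuals` is a DataFrame, so it is shown rather than printed."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# a few more of the Summary attributes listed earlier\n",
    | "print('MAE: {:.4f}'.format(trainingSummary.meanAbsoluteError))\n",
    | "print('explainedVariance: {:.4f}'.format(trainingSummary.explainedVariance))\n",
    | "print('numInstances: {}'.format(trainingSummary.numInstances))\n",
    | "trainingSummary.residuals.show(3)"
    | ]
    | },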
475 | {
476 | "cell_type": "code",
477 | "execution_count": 55,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "name": "stdout",
482 | "output_type": "stream",
483 | "text": [
484 | "+--------------------+-----+------------------+\n",
485 | "| features| medv| prediction|\n",
486 | "+--------------------+-----+------------------+\n",
487 | "|[0.03237,0.0,2.18...| 33.4|27.066314856077966|\n",
488 | "|[0.08829,12.5,7.8...| 22.9|23.721352298735898|\n",
489 | "|[0.14455,12.5,7.8...|22.77|21.388248900632398|\n",
490 | "+--------------------+-----+------------------+\n",
491 | "only showing top 3 rows\n",
492 | "\n"
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "result = lr_model.transform(test_data)\n",
498 | "result.show(3)"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 43,
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "R平方(r2):0.469\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
516 | "lr_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"r2\", predictionCol='prediction')\n",
517 | "r2 = lr_evaluator.evaluate(result)\n",
518 | "print('R平方(r2):{:.3}'.format(r2))"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": 44,
524 | "metadata": {
525 | "scrolled": true
526 | },
527 | "outputs": [],
528 | "source": [
529 | "test_evaluation = lr_model.evaluate(test_data)"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 42,
535 | "metadata": {},
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "RMSE:5.7\n",
542 | "r2:0.469\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "print('RMSE:{:.3}'.format(test_evaluation.rootMeanSquaredError))\n",
548 | "print('r2:{:.3}'.format(test_evaluation.r2))"
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {},
554 | "source": [
555 | "## 随机森林回归\n",
556 | "pyspark.ml.regression.RandomForestRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', subsamplingRate=1.0, seed=None, numTrees=20, featureSubsetStrategy='auto')"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "metadata": {},
562 | "source": [
563 | "`\n",
564 | "fit(dataset,params=None)方法 \n",
565 | "featureSubsetStrategy: 每棵树的节点上要分割的特征数量,可选:auto, all, onethird, sqrt, log2,(0.0,1.0],[1-n] \n",
566 | "impurity: 信息增益计算的准则,可选:variance \n",
567 | "maxBins: 连续特征离散化最大分箱个数。 \n",
568 | "maxDepth: 树的最大深度 \n",
569 | "minInfoGain: 树节点分割特征所需最小的信息增益 \n",
570 | "minInstancesPerNode: 每个结点所需最小实例个数 \n",
571 | "numTrees: 训练树的个数 \n",
572 | "subsamplingRate: 学习每颗决策树所需样本比例 \n",
573 | "`"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 47,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "from pyspark.ml.regression import RandomForestRegressor\n",
583 | "rf = RandomForestRegressor(numTrees=10, maxDepth=5, seed=101, labelCol='medv')\n",
584 | "rf_model = rf.fit(train_data)"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 54,
590 | "metadata": {},
591 | "outputs": [
592 | {
593 | "name": "stdout",
594 | "output_type": "stream",
595 | "text": [
596 | "+--------------------+-----+------------------+\n",
597 | "| features| medv| prediction|\n",
598 | "+--------------------+-----+------------------+\n",
599 | "|[0.03237,0.0,2.18...| 33.4| 30.12804440796982|\n",
600 | "|[0.08829,12.5,7.8...| 22.9|21.338106353716338|\n",
601 | "|[0.14455,12.5,7.8...|22.77|19.764914032872827|\n",
602 | "+--------------------+-----+------------------+\n",
603 | "only showing top 3 rows\n",
604 | "\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "result = rf_model.transform(test_data)\n",
610 | "result.show(3)"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 51,
616 | "metadata": {},
617 | "outputs": [
618 | {
619 | "data": {
620 | "text/plain": [
621 | "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
622 | ]
623 | },
624 | "execution_count": 51,
625 | "metadata": {},
626 | "output_type": "execute_result"
627 | }
628 | ],
629 | "source": [
630 | "rf_model.treeWeights"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": 53,
636 | "metadata": {},
637 | "outputs": [
638 | {
639 | "name": "stdout",
640 | "output_type": "stream",
641 | "text": [
642 | "测试数据的均方根误差(rmse):5.268739233773331\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
648 | "rf_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
649 | "rmse = rf_evaluator.evaluate(result)\n",
650 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
651 | ]
652 | }
653 | ],
654 | "metadata": {
655 | "kernelspec": {
656 | "display_name": "Python 3",
657 | "language": "python",
658 | "name": "python3"
659 | },
660 | "language_info": {
661 | "codemirror_mode": {
662 | "name": "ipython",
663 | "version": 3
664 | },
665 | "file_extension": ".py",
666 | "mimetype": "text/x-python",
667 | "name": "python",
668 | "nbconvert_exporter": "python",
669 | "pygments_lexer": "ipython3",
670 | "version": "3.6.4"
671 | }
672 | },
673 | "nbformat": 4,
674 | "nbformat_minor": 2
675 | }
676 |
--------------------------------------------------------------------------------