├── README.md
├── mashroom.ipynb
├── pysaprk.ml.clustering 学习.ipynb
├── pyspark-RDD.ipynb
├── pyspark-sql-dataframe.ipynb
├── pyspark-sql-functions.ipynb
├── pyspark.ml.classification.ipynb
├── pyspark.ml.feature.ipynb
└── pyspark.ml.regression.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | ### Learning PySpark
2 |
--------------------------------------------------------------------------------
/mashroom.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession\n",
10 | "spark = SparkSession.builder.appName('mushroom').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "### 导入数据并确定数据类型"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 10,
23 | "metadata": {
24 | "scrolled": false
25 | },
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/plain": [
30 | "23"
31 | ]
32 | },
33 | "execution_count": 10,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')\n",
40 | "len(df0.columns)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 14,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "+---------+\n",
53 | "|cap-shape|\n",
54 | "+---------+\n",
55 | "| x|\n",
56 | "| f|\n",
57 | "| k|\n",
58 | "| c|\n",
59 | "| b|\n",
60 | "| s|\n",
61 | "+---------+\n",
62 | "\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "df0.select('cap-shape').distinct().show()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 5,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "label = df0.rdd.map(lambda row: row[0])\n",
77 | "row = df0.rdd.map(lambda row: row[1:])"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 6,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','row'])"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {
93 | "scrolled": true
94 | },
95 | "outputs": [
96 | {
97 | "data": {
98 | "text/plain": [
99 | "Row(label=0.0, row=['b', 'y', 'y', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 's', 'm'])"
100 | ]
101 | },
102 | "execution_count": 7,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "dfi.first()"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 15,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# from pyspark.ml.feature import VectorAssembler\n",
118 | "# vecAss = VectorAssembler(inputCols=df0.columns[1:], outputCol='feature')\n",
119 | "# df0 = vecAss.transform(df0)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 16,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from pyspark.ml.feature import CountVectorizer\n",
129 | "import numpy as np\n",
130 | "from numpy import allclose\n",
131 | "cv = CountVectorizer(inputCol='row', outputCol='vectors')\n",
132 | "model = cv.fit(dfi)\n",
133 | "tf = model.transform(dfi)"
134 | ]
135 | },
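{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a sketch added for illustration, not part of the original run):\n",
"# CountVectorizer builds a vocabulary over the category letters, so each position of the\n",
"# sparse vector can be mapped back to a letter via the fitted model's vocabulary attribute.\n",
"print(len(model.vocabulary))\n",
"print(model.vocabulary[:10])"
]
},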
136 | {
137 | "cell_type": "code",
138 | "execution_count": 17,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "[Row(label=0.0, row=['x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'], vectors=SparseVector(24, {0: 3.0, 1: 1.0, 2: 3.0, 3: 4.0, 4: 2.0, 6: 2.0, 7: 1.0, 8: 2.0, 9: 1.0, 10: 1.0, 15: 1.0, 20: 1.0}))]"
145 | ]
146 | },
147 | "execution_count": 17,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "tf.take(1)"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 19,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "(train_data, test_data) = tf.randomSplit([0.8, 0.2])"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 20,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "SparseVector(24, {0: 0.0532, 1: 0.0375, 2: 0.0577, 3: 0.0947, 4: 0.064, 5: 0.0519, 6: 0.0436, 7: 0.022, 8: 0.0487, 9: 0.0411, 10: 0.0427, 11: 0.0299, 12: 0.0552, 13: 0.0683, 14: 0.0247, 15: 0.0164, 16: 0.0247, 17: 0.072, 18: 0.0844, 19: 0.0326, 20: 0.0135, 21: 0.0045, 22: 0.0132, 23: 0.0033})"
174 | ]
175 | },
176 | "execution_count": 20,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "from pyspark.ml.classification import RandomForestClassifier\n",
183 | "rf = RandomForestClassifier(numTrees=40, maxDepth=20, labelCol=\"label\", featuresCol='vectors')\n",
184 | "model = rf.fit(train_data)\n",
185 | "model.featureImportances"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 32,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "result = model.transform(test_data)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 43,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "+----------+\n",
207 | "|prediction|\n",
208 | "+----------+\n",
209 | "| 0.0|\n",
210 | "| 0.0|\n",
211 | "| 1.0|\n",
212 | "| 1.0|\n",
213 | "| 1.0|\n",
214 | "+----------+\n",
215 | "only showing top 5 rows\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "result.select('prediction').show(5)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 34,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "name": "stdout",
231 | "output_type": "stream",
232 | "text": [
233 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
234 | "|label| row| vectors| rawPrediction| probability|prediction|\n",
235 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
236 | "| 0.0|[b, e, e, ?, s, s...|(24,[0,1,3,5,6,7,...|[28.4161036920659...|[0.71040259230164...| 0.0|\n",
237 | "| 0.0|[b, f, y, f, f, f...|(24,[0,1,2,5,6,7,...|[37.1750915750915...|[0.92937728937728...| 0.0|\n",
238 | "| 0.0|[b, n, w, f, n, f...|(24,[0,1,2,4,5,6,...|[4.02235172235172...|[0.10055879305879...| 1.0|\n",
239 | "+-----+--------------------+--------------------+--------------------+--------------------+----------+\n",
240 | "only showing top 3 rows\n",
241 | "\n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "result.show(3)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 36,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "1287"
258 | ]
259 | },
260 | "execution_count": 36,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 45,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "0.8880822746521476"
278 | ]
279 | },
280 | "execution_count": 45,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "result.rdd.map(lambda row:1 if row.label == row.prediction else 0).sum()/result.count()"
287 | ]
288 | },
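{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative to the hand-rolled accuracy above (a sketch, assuming a Spark 2.x\n",
"# MulticlassClassificationEvaluator with the 'accuracy' metric is available):\n",
"from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
"evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')\n",
"evaluator.evaluate(result)"
]
},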
289 | {
290 | "cell_type": "code",
291 | "execution_count": 14,
292 | "metadata": {},
293 | "outputs": [
294 | {
295 | "name": "stderr",
296 | "output_type": "stream",
297 | "text": [
298 | "/home/ffzs/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
299 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "from sklearn.ensemble import RandomForestClassifier\n",
305 | "import pandas as pd\n",
306 | "from sklearn import cross_validation\n",
307 | "from sklearn.model_selection import train_test_split\n",
308 | "from sklearn.cross_validation import cross_val_score"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 15,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "dfp = tf.toPandas()"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 16,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "data": {
327 | "text/html": [
328 | "
\n",
329 | "\n",
342 | "
\n",
343 | " \n",
344 | " \n",
345 | " | \n",
346 | " label | \n",
347 | " row | \n",
348 | " vectors | \n",
349 | "
\n",
350 | " \n",
351 | " \n",
352 | " \n",
353 | " 0 | \n",
354 | " 0.0 | \n",
355 | " [\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ... | \n",
356 | " (0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ... | \n",
357 | "
\n",
358 | " \n",
359 | " 1 | \n",
360 | " 1.0 | \n",
361 | " [x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ... | \n",
362 | " (3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ... | \n",
363 | "
\n",
364 | " \n",
365 | "
\n",
366 | "
"
367 | ],
368 | "text/plain": [
369 | " label row \\\n",
370 | "0 0.0 [\u0000, s, \u0000, t, \u0000, f, c, n, \u0000, e, \u0000, \u0000, s, \u0000, \u0000, ... \n",
371 | "1 1.0 [x, s, y, t, a, f, c, b, k, e, c, s, s, w, w, ... \n",
372 | "\n",
373 | " vectors \n",
374 | "0 (0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 0.0, ... \n",
375 | "1 (3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, ... "
376 | ]
377 | },
378 | "execution_count": 16,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "dfp.head(2)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 17,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "clf = RandomForestClassifier(random_state=22, n_estimators = 30, min_samples_split=3, min_samples_leaf=2)"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 18,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "X = dfp['vectors'].tolist()"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 19,
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "y = dfp['label'].tolist()"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 20,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 21,
426 | "metadata": {},
427 | "outputs": [
428 | {
429 | "data": {
430 | "text/plain": [
431 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
432 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
433 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
434 | " min_samples_leaf=2, min_samples_split=3,\n",
435 | " min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,\n",
436 | " oob_score=False, random_state=22, verbose=0, warm_start=False)"
437 | ]
438 | },
439 | "execution_count": 21,
440 | "metadata": {},
441 | "output_type": "execute_result"
442 | }
443 | ],
444 | "source": [
445 | "clf.fit(X_train, y_train)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 23,
451 | "metadata": {
452 | "scrolled": true
453 | },
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "0.9218461538461539\n"
460 | ]
461 | }
462 | ],
463 | "source": [
464 | "print(clf.score(X_test, y_test))"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 32,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "scores = cross_val_score(clf, X, y, cv=10)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 33,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "0.8905588981998195"
485 | ]
486 | },
487 | "execution_count": 33,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "scores.mean()"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": []
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": []
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": []
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 1,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "from pyspark.sql import SparkSession\n",
524 | "spark = SparkSession.builder.appName('mushroom').getOrCreate()"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 32,
530 | "metadata": {},
531 | "outputs": [],
532 | "source": [
533 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/stock.csv',encoding='gbk',header=True, inferSchema=True)"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 33,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "data": {
543 | "text/plain": [
544 | "[('日期', 'timestamp'),\n",
545 | " ('股票代码', 'string'),\n",
546 | " ('名称', 'string'),\n",
547 | " ('收盘价', 'double'),\n",
548 | " ('最高价', 'double'),\n",
549 | " ('最低价', 'double'),\n",
550 | " ('开盘价', 'double'),\n",
551 | " ('前收盘', 'double'),\n",
552 | " ('涨跌额', 'string'),\n",
553 | " ('涨跌幅', 'string'),\n",
554 | " ('换手率', 'double'),\n",
555 | " ('成交量', 'int'),\n",
556 | " ('成交金额', 'double'),\n",
557 | " ('总市值', 'double'),\n",
558 | " ('流通市值', 'double')]"
559 | ]
560 | },
561 | "execution_count": 33,
562 | "metadata": {},
563 | "output_type": "execute_result"
564 | }
565 | ],
566 | "source": [
567 | "df.dtypes"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 34,
573 | "metadata": {},
574 | "outputs": [],
575 | "source": [
576 | "# from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType ,DoubleType # 导入类型\n",
577 | "# schema = StructType([\n",
578 | "# StructField(\"日期\", DateType(), True),\n",
579 | "# StructField(\"收盘价\", DoubleType(), True),\n",
580 | "# StructField(\"成交量\", LongType(), True),\n",
581 | "# StructField(\"名称\", StringType(), True)\n",
582 | "# ])"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 35,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "df.write.csv(path='hdfs:///user/csv/stock.csv', header=True, sep=\",\", mode='overwrite')"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 49,
597 | "metadata": {
598 | "scrolled": true
599 | },
600 | "outputs": [
601 | {
602 | "data": {
603 | "text/plain": [
604 | "'股票代码'"
605 | ]
606 | },
607 | "execution_count": 49,
608 | "metadata": {},
609 | "output_type": "execute_result"
610 | }
611 | ],
612 | "source": [
613 | "df.columns[1]"
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 61,
619 | "metadata": {},
620 | "outputs": [],
621 | "source": [
622 | "df0 = spark.read.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666\", table=\"mashroom\")"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": 63,
628 | "metadata": {},
629 | "outputs": [
630 | {
631 | "data": {
632 | "text/plain": [
633 | "8124"
634 | ]
635 | },
636 | "execution_count": 63,
637 | "metadata": {},
638 | "output_type": "execute_result"
639 | }
640 | ],
641 | "source": [
642 | "df0.count()"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 64,
648 | "metadata": {},
649 | "outputs": [],
650 | "source": [
651 | "df0.write.jdbc(url=\"jdbc:mysql://localhost:3306/test?user=root&password=666666&useUnicode=true&characterEncoding=GBK\",\n",
652 | " mode=\"overwrite\",\n",
653 | " table=\"test\",\n",
654 | " properties={\"driver\":'com.mysql.jdbc.Driver'})"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 65,
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "spark.stop()"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 68,
669 | "metadata": {},
670 | "outputs": [
671 | {
672 | "name": "stdout",
673 | "output_type": "stream",
674 | "text": [
675 | "+--------------+------+\n",
676 | "| country|median|\n",
677 | "+--------------+------+\n",
678 | "| New Zealand| 39.0|\n",
679 | "| Spain| 37.0|\n",
680 | "| Ireland| 35.0|\n",
681 | "| Sweden| 34.0|\n",
682 | "| Italy| 34.0|\n",
683 | "| Norway| 34.0|\n",
684 | "| Denmark| 34.0|\n",
685 | "| Israel| 34.0|\n",
686 | "| Australia| 34.0|\n",
687 | "| Netherlands| 34.0|\n",
688 | "| Argentina| 33.5|\n",
689 | "| Canada| 33.5|\n",
690 | "| Belgium| 33.0|\n",
691 | "| Switzerland| 33.0|\n",
692 | "| Japan| 33.0|\n",
693 | "|United Kingdom| 33.0|\n",
694 | "| United States| 32.0|\n",
695 | "| Portugal| 32.0|\n",
696 | "| Romania| 32.0|\n",
697 | "| Germany| 31.0|\n",
698 | "+--------------+------+\n",
699 | "only showing top 20 rows\n",
700 | "\n"
701 | ]
702 | }
703 | ],
704 | "source": [
705 | "spark = SparkSession.builder.enableHiveSupport().master(\"local[*]\").appName(\"read_hive\").getOrCreate()\n",
706 | "\n",
707 | "df=spark.sql(\"select * from age\")\n",
708 | "df.show()"
709 | ]
710 | },
711 | {
712 | "cell_type": "code",
713 | "execution_count": 87,
714 | "metadata": {},
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/plain": [
719 | "DataFrame[]"
720 | ]
721 | },
722 | "execution_count": 87,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "spark.sql('create table if not exists age2(name string, num int)')\n",
729 | "#df0.write.mode(\"overwrite\").insertInto(\"age2\")"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 80,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "name": "stdout",
739 | "output_type": "stream",
740 | "text": [
741 | "+--------+---------+-----------+\n",
742 | "|database|tableName|isTemporary|\n",
743 | "+--------+---------+-----------+\n",
744 | "| default| age| false|\n",
745 | "| default| age2| false|\n",
746 | "| default| country| false|\n",
747 | "| default| qn| false|\n",
748 | "+--------+---------+-----------+\n",
749 | "\n"
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "spark.sql('show tables').show()"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 81,
760 | "metadata": {},
761 | "outputs": [],
762 | "source": [
763 | "df.write.mode(\"overwrite\").insertInto(\"age2\")"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 86,
769 | "metadata": {},
770 | "outputs": [
771 | {
772 | "name": "stdout",
773 | "output_type": "stream",
774 | "text": [
775 | "+-----------+---+\n",
776 | "| name|num|\n",
777 | "+-----------+---+\n",
778 | "|New Zealand| 39|\n",
779 | "| Spain| 37|\n",
780 | "| Ireland| 35|\n",
781 | "| Sweden| 34|\n",
782 | "| Italy| 34|\n",
783 | "| Norway| 34|\n",
784 | "| Denmark| 34|\n",
785 | "| Israel| 34|\n",
786 | "| Australia| 34|\n",
787 | "|Netherlands| 34|\n",
788 | "+-----------+---+\n",
789 | "\n"
790 | ]
791 | }
792 | ],
793 | "source": [
794 | "spark.sql('select * from age2 sort by num limit 10 ').show()"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 18,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "spark.stop()"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": []
812 | }
813 | ],
814 | "metadata": {
815 | "kernelspec": {
816 | "display_name": "Python 3",
817 | "language": "python",
818 | "name": "python3"
819 | },
820 | "language_info": {
821 | "codemirror_mode": {
822 | "name": "ipython",
823 | "version": 3
824 | },
825 | "file_extension": ".py",
826 | "mimetype": "text/x-python",
827 | "name": "python",
828 | "nbconvert_exporter": "python",
829 | "pygments_lexer": "ipython3",
830 | "version": "3.6.4"
831 | }
832 | },
833 | "nbformat": 4,
834 | "nbformat_minor": 2
835 | }
836 |
--------------------------------------------------------------------------------
/pysaprk.ml.clustering 学习.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "data:https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from pyspark.sql import SparkSession\n",
17 | "spark = SparkSession.builder.master('local[1]').appName('learn_cluster').getOrCreate()"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "df = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/Mall_Customers.csv', header=True, inferSchema=True)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "scrolled": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "df = df.withColumnRenamed('Annual Income (k$)', 'Income').withColumnRenamed('Spending Score (1-100)', 'Spend')"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "+----------+------+---+------+-----+\n",
50 | "|CustomerID|Gender|Age|Income|Spend|\n",
51 | "+----------+------+---+------+-----+\n",
52 | "| 1| Male| 19| 15| 39|\n",
53 | "| 2| Male| 21| 15| 81|\n",
54 | "| 3|Female| 20| 16| 6|\n",
55 | "+----------+------+---+------+-----+\n",
56 | "only showing top 3 rows\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "df.show(3)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 5,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "CustomerID 0\n",
74 | "Gender 0\n",
75 | "Age 0\n",
76 | "Income 0\n",
77 | "Spend 0\n",
78 | "dtype: int64"
79 | ]
80 | },
81 | "execution_count": 5,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 | "# 查看是否有缺失值\n",
88 | "df.toPandas().isna().sum()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 6,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "from pyspark.ml.feature import VectorAssembler\n",
98 | "vecAss = VectorAssembler(inputCols = df.columns[3:], outputCol = 'features')\n",
99 | "df_km = vecAss.transform(df).select('CustomerID', 'features')"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 8,
105 | "metadata": {
106 | "scrolled": true
107 | },
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "+----------+-----------+\n",
114 | "|CustomerID| features|\n",
115 | "+----------+-----------+\n",
116 | "| 1|[15.0,39.0]|\n",
117 | "| 2|[15.0,81.0]|\n",
118 | "| 3| [16.0,6.0]|\n",
119 | "+----------+-----------+\n",
120 | "only showing top 3 rows\n",
121 | "\n"
122 | ]
123 | }
124 | ],
125 | "source": [
126 | "df_km.show(3)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 9,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "pd_df = df.toPandas()"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 10,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "\n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " | \n",
164 | " CustomerID | \n",
165 | " Gender | \n",
166 | " Age | \n",
167 | " Income | \n",
168 | " Spend | \n",
169 | "
\n",
170 | " \n",
171 | " \n",
172 | " \n",
173 | " 0 | \n",
174 | " 1 | \n",
175 | " Male | \n",
176 | " 19 | \n",
177 | " 15 | \n",
178 | " 39 | \n",
179 | "
\n",
180 | " \n",
181 | " 1 | \n",
182 | " 2 | \n",
183 | " Male | \n",
184 | " 21 | \n",
185 | " 15 | \n",
186 | " 81 | \n",
187 | "
\n",
188 | " \n",
189 | " 2 | \n",
190 | " 3 | \n",
191 | " Female | \n",
192 | " 20 | \n",
193 | " 16 | \n",
194 | " 6 | \n",
195 | "
\n",
196 | " \n",
197 | " 3 | \n",
198 | " 4 | \n",
199 | " Female | \n",
200 | " 23 | \n",
201 | " 16 | \n",
202 | " 77 | \n",
203 | "
\n",
204 | " \n",
205 | " 4 | \n",
206 | " 5 | \n",
207 | " Female | \n",
208 | " 31 | \n",
209 | " 17 | \n",
210 | " 40 | \n",
211 | "
\n",
212 | " \n",
213 | "
\n",
214 | "
"
215 | ],
216 | "text/plain": [
217 | " CustomerID Gender Age Income Spend\n",
218 | "0 1 Male 19 15 39\n",
219 | "1 2 Male 21 15 81\n",
220 | "2 3 Female 20 16 6\n",
221 | "3 4 Female 23 16 77\n",
222 | "4 5 Female 31 17 40"
223 | ]
224 | },
225 | "execution_count": 10,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "pd_df.head()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 18,
237 | "metadata": {
238 | "scrolled": false
239 | },
240 | "outputs": [
241 | {
242 | "data": {
243 | "text/html": [
244 | ""
245 | ],
246 | "text/vnd.plotly.v1+html": [
247 | ""
248 | ]
249 | },
250 | "metadata": {},
251 | "output_type": "display_data"
252 | },
253 | {
254 | "data": {
255 | "application/vnd.plotly.v1+json": {
256 | "data": [
257 | {
258 | "marker": {
259 | "size": 6
260 | },
261 | "mode": "markers",
262 | "type": "scatter",
263 | "x": [
264 | 15,
265 | 15,
266 | 16,
267 | 16,
268 | 17,
269 | 17,
270 | 18,
271 | 18,
272 | 19,
273 | 19,
274 | 19,
275 | 19,
276 | 20,
277 | 20,
278 | 20,
279 | 20,
280 | 21,
281 | 21,
282 | 23,
283 | 23,
284 | 24,
285 | 24,
286 | 25,
287 | 25,
288 | 28,
289 | 28,
290 | 28,
291 | 28,
292 | 29,
293 | 29,
294 | 30,
295 | 30,
296 | 33,
297 | 33,
298 | 33,
299 | 33,
300 | 34,
301 | 34,
302 | 37,
303 | 37,
304 | 38,
305 | 38,
306 | 39,
307 | 39,
308 | 39,
309 | 39,
310 | 40,
311 | 40,
312 | 40,
313 | 40,
314 | 42,
315 | 42,
316 | 43,
317 | 43,
318 | 43,
319 | 43,
320 | 44,
321 | 44,
322 | 46,
323 | 46,
324 | 46,
325 | 46,
326 | 47,
327 | 47,
328 | 48,
329 | 48,
330 | 48,
331 | 48,
332 | 48,
333 | 48,
334 | 49,
335 | 49,
336 | 50,
337 | 50,
338 | 54,
339 | 54,
340 | 54,
341 | 54,
342 | 54,
343 | 54,
344 | 54,
345 | 54,
346 | 54,
347 | 54,
348 | 54,
349 | 54,
350 | 57,
351 | 57,
352 | 58,
353 | 58,
354 | 59,
355 | 59,
356 | 60,
357 | 60,
358 | 60,
359 | 60,
360 | 60,
361 | 60,
362 | 61,
363 | 61,
364 | 62,
365 | 62,
366 | 62,
367 | 62,
368 | 62,
369 | 62,
370 | 63,
371 | 63,
372 | 63,
373 | 63,
374 | 63,
375 | 63,
376 | 64,
377 | 64,
378 | 65,
379 | 65,
380 | 65,
381 | 65,
382 | 67,
383 | 67,
384 | 67,
385 | 67,
386 | 69,
387 | 69,
388 | 70,
389 | 70,
390 | 71,
391 | 71,
392 | 71,
393 | 71,
394 | 71,
395 | 71,
396 | 72,
397 | 72,
398 | 73,
399 | 73,
400 | 73,
401 | 73,
402 | 74,
403 | 74,
404 | 75,
405 | 75,
406 | 76,
407 | 76,
408 | 77,
409 | 77,
410 | 77,
411 | 77,
412 | 78,
413 | 78,
414 | 78,
415 | 78,
416 | 78,
417 | 78,
418 | 78,
419 | 78,
420 | 78,
421 | 78,
422 | 78,
423 | 78,
424 | 79,
425 | 79,
426 | 81,
427 | 81,
428 | 85,
429 | 85,
430 | 86,
431 | 86,
432 | 87,
433 | 87,
434 | 87,
435 | 87,
436 | 87,
437 | 87,
438 | 88,
439 | 88,
440 | 88,
441 | 88,
442 | 93,
443 | 93,
444 | 97,
445 | 97,
446 | 98,
447 | 98,
448 | 99,
449 | 99,
450 | 101,
451 | 101,
452 | 103,
453 | 103,
454 | 103,
455 | 103,
456 | 113,
457 | 113,
458 | 120,
459 | 120,
460 | 126,
461 | 126,
462 | 137,
463 | 137
464 | ],
465 | "y": [
466 | 39,
467 | 81,
468 | 6,
469 | 77,
470 | 40,
471 | 76,
472 | 6,
473 | 94,
474 | 3,
475 | 72,
476 | 14,
477 | 99,
478 | 15,
479 | 77,
480 | 13,
481 | 79,
482 | 35,
483 | 66,
484 | 29,
485 | 98,
486 | 35,
487 | 73,
488 | 5,
489 | 73,
490 | 14,
491 | 82,
492 | 32,
493 | 61,
494 | 31,
495 | 87,
496 | 4,
497 | 73,
498 | 4,
499 | 92,
500 | 14,
501 | 81,
502 | 17,
503 | 73,
504 | 26,
505 | 75,
506 | 35,
507 | 92,
508 | 36,
509 | 61,
510 | 28,
511 | 65,
512 | 55,
513 | 47,
514 | 42,
515 | 42,
516 | 52,
517 | 60,
518 | 54,
519 | 60,
520 | 45,
521 | 41,
522 | 50,
523 | 46,
524 | 51,
525 | 46,
526 | 56,
527 | 55,
528 | 52,
529 | 59,
530 | 51,
531 | 59,
532 | 50,
533 | 48,
534 | 59,
535 | 47,
536 | 55,
537 | 42,
538 | 49,
539 | 56,
540 | 47,
541 | 54,
542 | 53,
543 | 48,
544 | 52,
545 | 42,
546 | 51,
547 | 55,
548 | 41,
549 | 44,
550 | 57,
551 | 46,
552 | 58,
553 | 55,
554 | 60,
555 | 46,
556 | 55,
557 | 41,
558 | 49,
559 | 40,
560 | 42,
561 | 52,
562 | 47,
563 | 50,
564 | 42,
565 | 49,
566 | 41,
567 | 48,
568 | 59,
569 | 55,
570 | 56,
571 | 42,
572 | 50,
573 | 46,
574 | 43,
575 | 48,
576 | 52,
577 | 54,
578 | 42,
579 | 46,
580 | 48,
581 | 50,
582 | 43,
583 | 59,
584 | 43,
585 | 57,
586 | 56,
587 | 40,
588 | 58,
589 | 91,
590 | 29,
591 | 77,
592 | 35,
593 | 95,
594 | 11,
595 | 75,
596 | 9,
597 | 75,
598 | 34,
599 | 71,
600 | 5,
601 | 88,
602 | 7,
603 | 73,
604 | 10,
605 | 72,
606 | 5,
607 | 93,
608 | 40,
609 | 87,
610 | 12,
611 | 97,
612 | 36,
613 | 74,
614 | 22,
615 | 90,
616 | 17,
617 | 88,
618 | 20,
619 | 76,
620 | 16,
621 | 89,
622 | 1,
623 | 78,
624 | 1,
625 | 73,
626 | 35,
627 | 83,
628 | 5,
629 | 93,
630 | 26,
631 | 75,
632 | 20,
633 | 95,
634 | 27,
635 | 63,
636 | 13,
637 | 75,
638 | 10,
639 | 92,
640 | 13,
641 | 86,
642 | 15,
643 | 69,
644 | 14,
645 | 90,
646 | 32,
647 | 86,
648 | 15,
649 | 88,
650 | 39,
651 | 97,
652 | 24,
653 | 68,
654 | 17,
655 | 85,
656 | 23,
657 | 69,
658 | 8,
659 | 91,
660 | 16,
661 | 79,
662 | 28,
663 | 74,
664 | 18,
665 | 83
666 | ]
667 | }
668 | ],
669 | "layout": {}
670 | },
671 | "text/html": [
672 | ""
673 | ],
674 | "text/vnd.plotly.v1+html": [
675 | ""
676 | ]
677 | },
678 | "metadata": {},
679 | "output_type": "display_data"
680 | }
681 | ],
682 | "source": [
683 | "from plotly.offline import iplot, init_notebook_mode\n",
684 | "import plotly.graph_objs as go\n",
685 | "init_notebook_mode(connected=True)\n",
686 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend , \n",
687 | " mode='markers',\n",
688 | " marker = {'size':6})\n",
689 | "iplot([trace])"
690 | ]
691 | },
692 | {
693 | "cell_type": "markdown",
694 | "metadata": {},
695 | "source": [
696 | "## KMeans\n",
697 | "`class pyspark.ml.clustering.KMeans(self, featuresCol=\"features\", predictionCol=\"prediction\", k=2, initMode=\"k-means||\", initSteps=2, tol=1e-4, maxIter=20, seed=None)\n",
698 | "`"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "**参数解释**"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 | "`\n",
713 | "initMode: 初始化算法,可以使随机的“random\",也可以是”k-means||\"\n",
714 | "initSteps: k-means||初始化的步数,需>0\n",
715 | "fit(datast,params=None)方法\n",
716 | "`"
717 | ]
718 | },
719 | {
720 | "cell_type": "markdown",
721 | "metadata": {},
722 | "source": [
723 | "`\n",
724 | "cluster: 每个训练数据点预测的聚类中心数据框\n",
725 | "clusterSize: 每个簇的大小(簇内数据点的个数)\n",
726 | "k: 模型训练的簇个数\n",
727 | "predictions: 由模型transform方法产生的数据框\n",
728 | "`"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 20,
734 | "metadata": {},
735 | "outputs": [],
736 | "source": [
737 | "from pyspark.ml.clustering import KMeans\n",
738 | "\n",
739 | "cost = list(range(2,20))\n",
740 | "for k in range(2, 20):\n",
741 | " kmeans = KMeans(k=k, seed=1)\n",
742 | " km_model = kmeans.fit(df_km)\n",
743 | " # computeCost:计算输入点与其对应的聚类中心之间的平方距离之和。\n",
744 | " cost[k-2] = km_model.computeCost(df_km)"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 21,
750 | "metadata": {},
751 | "outputs": [],
752 | "source": [
753 | "import matplotlib.pyplot as plt\n",
754 | "%matplotlib inline"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 22,
760 | "metadata": {
761 | "scrolled": false
762 | },
763 | "outputs": [
764 | {
765 | "data": {
766 | "text/plain": [
767 | "Text(0,0.5,'cost')"
768 | ]
769 | },
770 | "execution_count": 22,
771 | "metadata": {},
772 | "output_type": "execute_result"
773 | },
774 | {
775 | "data": {
776 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgEAAAFzCAYAAACn5No2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl0XOWd5vHnV6VdqpIlWbLKq7yALbM4JDIYcLCBNiRpIIekk5COg6GTJkv3pM/Qk+ltOJlepqe7p3s6OZMJabrTDQEmSSeEsKQTwuoE8IJY7IBtjGxkS7ZsyZKtfa93/qjrRbZsl1BV3Vq+n3Pq6NZ7r1y/l0ud++i9977XnHMCAAC5J+B3AQAAwB+EAAAAchQhAACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHJUnt8FJNvMmTNdXV2d32UAAJAyr7766hHnXPX5tsv6EFBXV6fGxka/ywAAIGXMbF8823E6AACAHEUIAAAgRxECAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEgCnoHhjVMzsOq3do1O9SAACYNkLAFGw/cEyf/26jtrd2+10KAADTRgiYgvpIWJK0s63H50oAAJg+QsAUzCwrVHWoUDsIAQCALEAImKL6SFg723r9LgMAgGkjBExRfSSkpvZejYxF/S4FAIBpIQRM0fJIWKPjTns6+vwuBQCAaSEETNFyLg4EAGQJQsAULZxZqoK8ACEAAJDxkhICzCzfzJ7wltea2Yveq8XMNpjZSjNrPaV9qZkVmdmTZrbNzB60mLjaktGHs8kLBrR0VoiLAwEAGS/hIcDMiiW9KmmdJDnnXnDOrXbOrZa0XdLrkiok3Xu83Tn3tqT1klqdcyu89eum0JZS9ZGQdrb1yDmX6o8GACBhEh4CnHODzrlLJbWe2m5mJZKWOOe2K3bw/riZbTWzR7y/5q+T9LS3+XOSrp1CW0rVR8Lq7B9Re+9wqj8aAICESeU1AeskPestN0m6xzl3uaSIpDWSqiQdn4+3R1LlFNomMLO7zKzRzBo7OjoS3pHjMwcyaRAAIJOlMgTcLOlJb7lZ0jOnLNdIOiKp3Gsr997H2zaBc+4+51yDc66huro6oZ2QpPpa7hAAAGS+lIQAb7h/rWLD95J0t6TbzCwg6WJJbyo2SnCDt/46Sc9PoS2lykvyNWdGMRcHAgAyWqpGAlZK2uGcG/Lef1PSnZK2SHrUObdD0sOS5pjZdkldih3s421LueMXBwIAkKnykvUPO+eWnLK8VdItp7xvU2xk4NTthyXddNo/E29bytVHwnpuV7uGRsdVlB/0uxwAAKaMyYLeo+WRsKJO2n2YUwIAgMxECHiP6pk+GACQ4QgB79H8yhKVFgS5OBAAkLEIAe9RIGBaWhtirgAAQMYiBExDfSTM9MEAgIxFCJiG+khYvUNjaj066HcpAABMGSFgGrg4EACQyQgB07CsNiQzcXEgACAjEQKmobQwTwsqSxgJAABkJELANNVHwtp5iBAAAMg8hIBpqo+Eta9zQH3DY36XAgDAlBACpmm5d3Hg24wGAAAyDCFgmupnx0LADi4OBABkGELANM0uL1K4KI+LAwEAGYcQME1mdmLmQAAAMgkhIAHqI2HtauvVeJTpgwEAmYMQkADLI2ENjo5rX2e/36UAABA3QkACnJw+mIsDAQCZgxCQABfMKlMwYFwXAADIKISABCjKD2rRzFJCAAAgoxACEoQ7BAAAmYYQkCD1kbAOdg/p2MCI36UAABAXQkCCLJ/NxYEAgMxCCEiQ+khIkjglAADIGISABKkJFWlmWQEhAACQMQgBCVQfCWsnTxMEAGQIQkAC1UfC2n2oT6PjUb9LAQDgvAgBCVQfCWlkPKq9HUwfDABIf4SABDo5fTCnBAAA6Y8QkECLq8tUEAwQAgAAGYEQkED5wYCW1JRpByEAAJABCAEJFps+mAmDAADpjxCQYPWRkI70Daujd9jvUgAAOCdCQIKdnD6YUwIAgPRGCEiw5dwhAADIEEkJAWaWb2ZPeMsrzazVzF70XkvNrMjMnjSzbWb2oMW857Zk9OG9mlFSoEh5ESEAAJD2Eh4CzKxY0quS1nlNFZLudc6t9l5vS1ovqdU5t8Jbv26abWmFiwMBAJkg4SHAOTfonLtUUqvXVCHp42a21cwe8f5yv07S09765yRdO822tFIfCampo09Do+N+lwIAwFml4pqAJkn3OOculxSRtEZSlaRub32PpMpptqWV+khY41GnpvY+v0sBAOCsUhECmiU9c8pyjaQjksq9tnLv/XTaJjCzu8ys0cwaOzo6EtiV+ByfPphJgwAA6SwVIeBuSbeZWUDSxZLelPSspBu89ddJen6abRM45+5zzjU45xqqq6sT3qHzqasqVVE+0wcDANJbKkLANyXdKWmLpEedczskPSxpjpltl9Sl2IF9Om1pJRgwLa0NEwIAAGktL1n/sHNuifezTdLa09YNS7rptF+ZTlvaWR4J6T9+fUjOOaXZXYwAAEhisqCkWR4Jq3twVG3dQ36XAgDApAgBSVLPzIEAgDRHCEiSZYQAAECaIwQkSVlhnuZXljBzIAAgbRECkqg+EmKuAABA2iIEJFF9JKzmzn4NjIz5XQoAAGcgBCRRfSQs56RdhzglAABIP4SAJFrOxYEAgDRGCEiiuRXFChXmEQIAAGmJEJBEZqZlkRB3CAAA0hIhIMnqI2HtautRNOr8LgUAgAkIAUm2PBJW/8i4Wo4O+F0KAAATEAKSjOmDAQDpihCQZEtrQwqYtIPrAgAAaYYQkGRF+UEtnFnKSAAAIO0QAlKgPhLWjoOEAABAeiEEpEB9JKwDxwbVPTjqdykAAJxACEiB4zMH7uKUAAAgjRACUoA7BAAA6YgQkAKzwoWqKMln5kAAQFohBKSAmak+EtbOQ4wEAADSByEgReojYb19qFdj41G/SwEAQBIhIGWWR8IaHouqubPf71IAAJBECEiZ4xcHMnMgACBdEAJSZElNmfKDxh0CAIC0QQhIkYK8gBZXlxECAABpgxCQQsuZPhgAkEYIASlUHwmrvXdYnX3DfpcCAAAhIJVOzhzIxYEAAP8RAlKoPhKSxPTBAID0QAhIoaqyQtWECgkBAIC0QAhIsfpIWDsIAQCANEAISLH6SFh7Ovo0Msb0wQAAfxECUmz57LBGx52a2vv8LgUAkOMIASm2nIsDAQBpghCQYnVVpSrMCxACAAC+S0oIMLN8M3vilPcPmNlmM3vczPLMbKWZtZrZi95rqZkVmdmTZrbNzB60mLjaktGHZMkLBrS0NqSdhwgBAAB/JTwEmFmxpFclrfPer5aU55xbJSks6QZJFZLudc6t9l5vS1ovqdU5t8Jbv24KbRmlvjY2fbBzzu9SAAA5LOEhwDk36Jy7VFKr13RY0jdO+7wKSR83s61m9oj31/x1kp721j8n6doptGWU+khIRwd
GdbiH6YMBAP5J+jUBzrl3nHNbzexWSVFJv5DUJOke59zlkiKS1kiqktTt/VqPpMoptE1gZneZWaOZNXZ0dCSnY9NwcvpgTgkAAPyTkgsDzewWSV+RdLNzbkxSs6RnvNXNkmokHZFU7rWVe+/jbZvAOXefc67BOddQXV2d6O5M2zIvBDBpEADAT0kPAWZWK+mrkm5yzh1/cs7dkm4zs4CkiyW9KelZxa4XkGJD/s9PoS2jlBfna86MYkYCAAC+SsVIwAbFhvyf8u4E+B1J35R0p6Qtkh51zu2Q9LCkOWa2XVKXYgf7eNsyTn0kTAgAAPgqL1n/sHNuiffzbyX97SSbrD1t+2FJN522TbxtGWf57LCe23VYQ6PjKsoP+l0OACAHMVmQT5ZHQoo66e1DveffGACAJCAE+IQ7BAAAfiME+GReRYlKC4KEAACAbwgBPgkETMsiYW4TBAD4hhDgo/pISLvaepk+GADgC0KAj+ojYfUOj6n16KDfpQAAchAhwEf1zBwIAPARIcBHy2pDMuMOAQCAPwgBPiopyFNdVSkhAADgC0KAz+ojIe1sY8IgAEDqEQJ8tjwS1v6uAfUOjfpdCgAgxxACfHb84kCmDwYApBohwGdMHwwA8AshwGeR8iKVF+drB9cFAABSjBDgMzNTfSTEXAEAgJQjBKSB+khYbx/q0XiU6YMBAKlDCEgD9ZGwhkajau7s97sUAEAOIQSkgeVcHAgA8AEhIA0sqSlTMGCEAABAShEC0kBRflCLq0uZORAAkFKEgDRRHwkzEgAASClCQJpYHgmrrXtIxwZG/C4FAJAjCAFp4vjMgcwXAABIFUJAmjg5fTDXBQAAUoMQkCaqQ4WaWVbIdQEAgJQhBKSR+khIOw4SAgAAqUEISCPLI2E1tfdpdDzqdykAgBxACEgjl82v0Mh4VL96p8PvUgAAOYAQkEaur69RTahQD7y8z+9SAAA5gBCQRvKDAX3migXauLtDezv6/C4HAJDlCAFp5tNXzFN+0PTdTYwGAACSixCQZmpCRfrIJRH96NVW9Q2P+V0OACCLEQLS0Iar6tQ3PKYfv9bqdykAgCxGCEhDl82boUvnluuBl5vlnPO7HABAliIEpCEz04Yr67Sno18vNXX6XQ4AIEslJQSYWb6ZPeEtF5nZk2a2zcwetJiEtiWjD377zUsjqiot0P0vN/tdCgAgSyU8BJhZsaRXJa3zmtZLanXOrZBU4bUnui3rFOUHddvl8/TsrsNq6RrwuxwAQBZKeAhwzg065y6VdPyqtuskPe0tPyfp2iS0ZaX1qxYoYKYHN3O7IAAg8d5TCDCzq6eweZWkbm+5R1JlEtpOr+8uM2s0s8aOjsydgjdSXqwbL5qlH7zSosGRcb/LAQBkmbhCgJk9d1rT303hM45IKveWy733iW6bwDl3n3OuwTnXUF1dPYVS08+GK+vUPTiqx9444HcpAIAsc84QYGaXmtkGSbPN7Hbv9WVJQ1P4jGcl3eAtXyfp+SS0Za3LF1ZqWW1I93O7IAAgwc43EmCT/Dwi6bem8BkPS5pjZtsldSl2EE90W9YyM224qk67DvVq67tdfpcDAMgiFs9fl2b2N865P05BPQnX0NDgGhsb/S5jWgZHxrXqfz6rq5dU6Vuf+YDf5QAA0pyZveqcazjfdvFeGPinZhY2s6CZXWtmoWnWhykoLgjqUyvn6am3Dqute9DvcgAAWSLeEPADSddI+ntJn5P0k6RVhEl9dtUCRZ3Tw5v3+10KACBLxBsCIs65JyUtcs6tl1SWxJowiXmVJbp+2Sx9b+t+DY1yuyAAYPriDQFdZvYTSb82s5skHUtiTTiLO66qU2f/iH66vc3vUgAAWSDeEPAJSX/hnPtvis0E+MnklYSzuXpJlRZXl+q7m5r9LgUAkAXiDQHjkj5gZv8oqUFSf/JKwtkcv11wW2u3Xt9/1O9yAAAZLt4QcL+kOZJ+7v28P0n14Dw+9v65KivM0wM8XRAAME3xhoA659x/d8495Zz7c0l1SawJ51BWmKff+sBc/fTXbWrvncrEjQAATBRvCNhvZn9mZteZ2Z9J4j41H91+5QKNjjt9b0uL36UAADJYvCHgi5KCik0X3CPpC0mrCOe1qLpM11xYrYe37NPoeNTvcgAAGSreEPBdxe4K+D3Fntz3b0mrCHG546oFau8d1s/fPOR3KQCADBVvCKhxzv2ri/krSbOSWRTOb+2FNVpQVcIFggCA9yzeELDPzP7Ie27AH0s6mMyicH6BgOmzqxaocd9RvXmg2+9yAAAZKN4QcIekAcWuCeiXtCFZBSF+n2iYp+L8IKMBAID3JK4Q4Jwbds79H+fc73k/uTctDZQX5+vW98/RY9sO6mj/iN/lAAAyTLwjAUhTG66s08hYVN9/hdsFAQBTQwjIcEtrQ7pyUZUe2rxPY9wuCACYAkJAFthwVZ0OHBvUMzvb/S4FAJBBCAFZ4DfqazRnRjEXCAIApoQQkAXyggGtX7VAm/Z2avfhXr/LAQBkCEJAlvjUynkqyAswGgAAiBshIEtUlhbooytm68evHVD34Kjf5QAAMgAhIItsuKpOg6Pj+mEjtwsCAM6PEJBFLp5TroYFFXpw8z5Fo87vcgAAaY4QkGU2XFWnfZ0D2ri7w+9SAABpjhCQZT50ca1qQoW6nwsEAQDnQQjIMvnBgD5zxQJt3N2hvR19fpcDAEhjhIAs9Okr5ik/aPrupn1+lwIASGOEgCxUEyrSb14S0Y9ebVXf8Jjf5QAA0hQhIEttuKpOfcNjevS1Vr9LAQCkKUJAlnrfvBm6dG65Hti0T85xuyAA4EyEgCxlZtpwZZ2a2vv0UlOn3+UAANIQISCL3bQioqrSAm4XBABMihCQxQrzgvr05fP17K7Dauka8LscAECaIQRkuc+smq+AmR7azO2CAICJUhICzGytmb3ovVrM7Gtm1npK21IzKzKzJ81sm5k9aDFxtaWiD5kqUl6sD11Uq++/0qLBkXG/ywEApJGUhADn3AvOudXOudWStks6Kune423OubclrZfU6pxbIalC0roptOEcbr9ygboHR/XYGwf8LgUAkEZSejrAzEokLZF0WNLHzWyrmT3i/TV/naSnvU2fk3TtFNpwDpcvrNSy2pDuf7mZ2wUBACek+pqAdZKeldQk6R7n3OWSIpLWSKqS1O1t1yOpcgptE5jZXWbWaGaNHR08Tc/MdMdVddp1qFdb3+3yuxwAQJpIdQi4WdKTkpolPeO1NUuqkXREUrnXVu69j7dtAufcfc65BudcQ3V1dcI7kYk++r45Ki/O53kCAIATUhYCvCH/tYoN4d8t6TYzC0i6WNKbio0Q3OBtfp2k56fQhvMoLgjqtpXz9PO3DnG7IABAUmpHAlZK2uGcG5L0TUl3Stoi6VHn3A5JD0uaY2bbJXUpdrCPtw1xuPPqhQqY9C+/2ut3KQCANJCXqg9yzm2VdIu33KbYqMCp64cl3XTar8XbhjjUlhfpY5fN1fdfadF/uv4CzSwr9LskAICPmCwox9y1ZpFGxqN6gKmEASDnEQJyzO
LqMn3oolo98HKz+obH/C4HAOAjQkAO+uKaxeoZGtP3tuz3uxQAgI8IATloxbwZunpJlf7lxb0aHmMqYQDIVYSAHPWlNUt0uGdYP3mdqYQBIFcRAnLU1UuqdMmccn17416NR5lKGAByESEgR5mZvrR2sd490q+n3jrkdzkAAB8QAnLYjRfVauHMUt37wh4eLAQAOYgQkMOCAdMXrlmkXx/o1ktNnX6XAwBIMUJAjrv1/XNUEyrUvRub/C4FAJBihIAcV5gX1Oc/uFAvNXVqW8sxv8sBAKQQIQD69OXzFS7K07c37vG7FABAChECoFBRvm6/sk4/f+uQ9nT0+V0OACBFCAGQJN1xdZ0KggHdt5HHDANAriAEQJI0s6xQn1o5Tz9+vVWHuof8LgcAkAKEAJzwux9cpKiTvvMiowEAkAsIAThhXmWJbr40ov+3Zb+ODYz4XQ4AIMkIAZjgi2sXq39kXA9u2ud3KQCAJCMEYIJltWFdt6xG//ZyswZHeMwwAGQzQgDO8KW1i9XVP6J/b2zxuxQAQBIRAnCGlXWValhQoft+uVej41G/ywEAJAkhAJP60trFOnBsUE9uP+h3KQCAJCEEYFLXLq3R0lkh3fvCHkWjPGYYALIRIQCTCgRMX1y7SLsP9+n5t9v9LgcAkASEAJzVTZfO1pwZxbr3BR4sBADZiBCAs8oPBnTXNYvUuO+oXmnu8rscAECCEQJwTp9smKfK0gJGAwAgCxECcE7FBUHdeVWdntvVrp1tPX6XAwBIIEIAzuv2K+tUWhDUP21kNAAAsgkhAOdVXpKv375ivp7Y3qaWrgG/ywEAJAghAHH53OpFCpj0z7/iMcMAkC0IAYhLbXmRPnbZXP3glRYd6Rv2uxwAQAIQAhC3u9Ys0sh4VPe/1Ox3KQCABCAEIG6Lq8v0oYtq9d1NzeodGvW7HADANBECMCVfXLNYPUNj+t7W/X6XAgCYppSEADNbaWatZvai91phZk+a2TYze9Biit5rWyr6gJgV82bo6iVV+pdfvavhsXG/ywEATEOqRgIqJN3rnFvtnFstaaWkVufcCm/dOknrp9GGFPrSmiVq7x3Wj1874HcpAIBpSGUI+LiZbTWzRyRdL+lpb91zkq6VdN002pBCVy+p0iVzyvVPG/donMcMA0DGSlUIaJJ0j3PuckkRSR+T1O2t65FUKalqGm1IITPTl9YuVnPngH7+5iG/ywEAvEepCgHNkp45ZTkqqdx7Xy7piPd6r20TmNldZtZoZo0dHR2J7Ac8N15Uq4UzS3XvxiY5x2gAAGSiVIWAuyXdZmYBSRdL+kNJN3jrrpP0vKRnp9E2gXPuPudcg3Ouobq6OvG9gYIB0xeuWaQ3D/ToxaYzchgAIAOkKgR8U9KdkrZIelTSdyTNMbPtkroUO7A/PI02+ODW989RTaiQxwwDQIbKS8WHOOfaJK09rfmm094PT6MNPijMC+rzH1yov/6PXXqj5ZjeN2+G3yUBAKaAyYIwLZ++fL7CRXn6NqMBAJBxCAGYllBRvm6/sk5P7TikpvY+v8sBAEwBIQDTdsfVdSoIBnTfLxkNAIBMQgjAtM0sK9RtK+fp0dcPqK170O9yAABxIgQgIT7/wUWKOukbz7yjkbGo3+UAAOJACEBCzKss0Scb5ur7r7Toqr95Tv/rqV1qPTrgd1kAgHOwbJ/traGhwTU2NvpdRk6IRp02vtOhhzfv03O72iVJ1y6t0fpVC3TNhdUKBnjgIwCkgpm96pxrOO92hAAkQ+vRAX1/a4u+/0qLjvQNa25FsX77ivn6ZMM8zSwr9Ls8AMhqhAAPIcBfI2NR/WLHIT20eZ827+1SftD04YsjWr9qgVbWVciM0QEASDRCgIcQkD6a2nv10Ob9euS1VvUOjenCWWVav2qBbr1sjkJF+X6XBwBZgxDgIQSkn4GRMT2x7aAe2rxfvz7QrZKCoD76vjlav2q+Lppdfv5/AABwToQADyEgvW1rOaaHNu/TE9sPamg0qsvmz9D6KxboNy+NqCg/6Hd5AJCRCAEeQkBm6B4Y1SOvteqhLfu0t6NfM0ry9YkPzNVvX7FAC2eW+l0eAGQUQoCHEJBZnHPatLdTD2/er6feOqSxqNMHL5ipz1wxX79RP0t5Qaa2AIDzIQR4CAGZq71nSD94pUXf27pfB7uHNCtcqM+vXqQ7rq5TPmEAAM6KEOAhBGS+sfGonn+7Qw+83KwXm45o6ayQ/urWi7WyrtLv0gAgLcUbAvhzCmkvLxjQuuWz9NDnr9A/396gvuExfeLbm/RHP9quo/0jfpcHABmLEICMsm75LD199zX6wjWL9KPXWnX9/96oHza2KNtHtAAgGQgByDglBXn6k4/U66dfWa2FM0v11R9t16fu26x3Dvf6XRoAZBRCADLWstqwfviFK/W3H79Euw/36sPf+JX+7ue7NDgy7ndpAJARCAHIaIGA6VMr5+vZu9foo++bo2+9sEc3fH2jnveeYggAODtCALJCVVmh/uGTK/S9312lgmBAd97/ir788Ks61D3kd2kAkLYIAcgqVy6u0s/+4Bp99calenZnu67/hxf0ry++q7HxqN+lAUDaIQQg6xTkBfR71y7R0/95jRrqKvUXT+7QR//vS3qj5ZjfpQFAWiEEIGvNryrR/Xeu1Lc+834d6RvWrd96Sff85E11D476XRoApAVCALKamekjl0T0zN1rdMdVdXp4yz5d/w8b9dgbB5hbAEDOIwQgJ4SK8vW1my/S47+/WrNnFOkPvv+GPvudrXr3SL/fpQGAbwgByCkXzynXo1++Wn/50Yu0reWYbvz6L/X1Z3ZraJS5BQDkHkIAck4wYPrslXV69g/X6MaLavX1Z97Rh7/xK734zhG/SwOAlOIpgsh5v9zdoXsee1P7Oge0dFZIS2rKtLimTEtqyrSkukyLqktVlB/0u0wAiBuPEvYQAhCPodFx3f9ys155t0tNHX1q6RpQ1PtqmElzK4q1pLpMi6u9cFATW64oLfC3cACYBCHAQwjAezE0Oq7mzn41tfdpT3u/mjr61NTep70dfRoeOznxUFVpgRbXnB4OSjW7vFiBgPnYAwC5LN4QkJeKYoBMU5Qf1LLasJbVhie0j0edDh4bjIUDLxg0tffpZ2+26djAyfkHivODWlxTGgsHp40eEA4ApAtCADAFwYBpXmWJ5lWW6NplNRPWdfYNe+EgNoLQ1NGnxuajeuyNgye2qQ0X6aZLI7rlfbN1yZxymREIAPiH0wFAkg2MjGlvR792tvXoqbcOa+Pudo2OOy2cWaqbvUCwpCbkd5kAsgjXBHgIAUg33QOj+vlbbXp820Ft2tOpqJPqI2HdsmK2bl4R0dyKEr9LBJDh0i4EmNkDkpZKapf0l5IeldTsrf6cpH2SfiRpnqTtkm6XVBhPmztHJwgBSGftvUP66fZYIHh9f+wBRx9YUKFbVszWRy6JqDpU6HOFADJRvCEgJZMFmdlqSXnOuVWSwpIiku51zq32Xm9LWi+p1Tm3QlKFpHVTaAMyUk2oSHdevVCPfvlq/eq/Xquv3rhU/
cNj+trjb+mKv35Gn/3OFv17YwsPPQKQFCkZCTCzCyRVOOe2mtkvJX1H0h9IGpPUIum3JD0s6RHn3CNmdrekakkL4mlzzv3J2T6bkQBkot2He/X4Gwf1+LaD2t81oIJgQGuXVuuW983W9ctmqbiAyYsAnF1a3SLonHtHkszsVklRSbsk3eOc+6mZvSxpjaQqSd3er/Qoduog3rYJzOwuSXdJ0vz585PQIyC5LpwV0n+5can+8IYLtb21W49vO6gntx/UL3YcVklBUOuWz9ItK2brgxdUqyAvMQN6I2NRHRsYUdfAiI72j+rYwIiODozq6MCIjvaPaHgsqvcvmKEPXlCtmWWcpgCyQcpuETSzWyR9RdLNkgokveGtapZUI+mIpHKvrdx7XxZn2wTOufsk3SfFRgIS2xMgdcxMK+bN0Ip5M/SnH6nX1ne79Pi2g/rZm2167I2DmlGSrw9fXKubV8zWFQuotJkrAAALJElEQVSrFAyYnHPqHxnX0f4RHTt+EPcO5EcHTju4n3LA7x85+0OUivODCgZMD27eJ0m6ZE651lxYrTVLq3XZvBnKC/IYEiATpep0QK2kH0r6kHOu38z+h6Tdkh5ULAzcJmmVpCucc18ws59K+kdJ8+Npc849c7bP5nQAstHIWFQvNnXo8TdiowMDI+OqLC1QXsB0bGBUI+PRs/5uuChPlaUFmlFSoIqSfFWUxJYrS/O9tlh7rK1AM0ryVZQfVDTq9ObBbm18u0O/fKdDr+0/pvGoU6gwT1cvmak1S6t1zYXVmjOjOIX/JQBMJq3uDjCzP5L0u5IOeU0/U+yCvlJJ/+Gc+5qZFUp6RLGD/DbF7gQoiKeNuwOQywZHxvXsrsN6fleH8gKmitJTD+75Ew745cX5CfurvXtwVC83HdHG3R3auLtDbd1DkqQLasq05sJYILh8YSUPXwJ8kFYhwE+EACD5nHNqau87EQi2vNulkbGoivIDWrWoKnbq4MJqLZxZyiyJQAoQAjyEACD1BkfGtfndztipg90d2nukX5I0r7JY11wQCwRXLZmpskJmLgeSgRDgIQQA/tvfOaCN73Ro49sd2rTniPpHxpUXMDXUVWjNhTVac2G16iMhRgmABCEEeAgBQHoZGYvq1X1HT5w62NnWIyn2WObls8Oqj4RVHwlpWW1Yi6vLEnYLJJBLCAEeQgCQ3tp7hvTLd45o895O7TrUo92H+zQyFru7IT9oWlxdpuWRWDhYFgmpPhJmngLgPAgBHkIAkFnGxqN690i/drT1aNehXu1s69HOth4d7hk+sc3MskLVe4GAUQPgTGk1YyAAxCsvGNAFs0K6YFZIHz2lvat/RLvaerTTCwa7DvXo/pebJ4waLKkJqb42xKgBECdCAICMUFlaoKuWzNRVS2aeaJts1OClPUf049cPnNjm+KjB4uoyVYcKVVVaoKqyQlWVFZxYLi0IclEichIhAEDGmsqowQ8bW846NXJhXkAzywpVWVrghYNCzSwr8N5PDAxVpQVMgISsQQgAkHUmGzWQpKHRcXX2j6izb9j7edpy/7A6+0b0zuE+dfQNnzjVcLqywrwzAsOMkgKFivJUVhh7hYryVFaUp1Bhfuynt64wL8CoA9IGIQBAzijKD2rOjOK4nm9w/EFM5wsMB44NanvrsfM+s+G4/KApVJR/IiyUFeUpXHRyuawwX6FTQkMsUOSrIC+ggEkBMwXMZN6ynWiLPXDq9G3slPfHt5nsd8oK8wgnOYgQAACTMLMTB+EFVaVx/c7w2Lj6hsbUNzym3qHYq294TH3DoxPfD42pd2j0xHZt3UMn1vUOjWp0PPV3bVWHCrVqUZWuXFSlVYsqmeI5RxACACBBCvOCKiwLqmqadyQMj43HQoEXDHqGRjUyFpVTbIQiGpWizinqvPdOcjr1/clt3CnrJvxO1MlJirrYBZY72nq0aU+nnth2UJI0K3xqKKjSgqoSQkEWIgQAQJo5HiZSfXujc07vHunXpr2d2ry3Sy81deqxN2KhIFJedCIQXLm4SnMrigkFWYDJggAAk3LOaU+HFwr2dGrz3k519o9IkubMKNYq79RBLBSU+FwtTsWMgR5CAAAkhnNO77T3afPeTm3yQsHRgVFJsSdErloYGyVYtahKs+O4+BLJQwjwEAIAIDmiUafd7b3avKdTm/Z2asu7XTrmhYIFVSUTQkFteZHP1eYWQoCHEAAAqRGNOu061OtdU9CpLXs71TM0JkkqL87XvMpiza8s0bzKEs2rKNH8ythr9oxinvuQYIQADyEAAPwxHnXa2dajre926d0j/drfNaCWowNq7RqcMKdCwKRIebHmVRafDAdVJZrrLc8sK+AixCniAUIAAF8FA6aL55Tr4jnlE9qjUafDvUNq6RrU/q6BWDjwXht3d6i9d3jC9sX5wclHEapKNLeiWCUFmXsoc85paDSqonx/ZpLM3P9yAICMFAiYIuXFipQX6/KFlWesHxodV+vRWDjY3zmglqODJ4LCpj2dZzwDoqq0QOXF+SopDKqkIE+lBUGVFOapJD+o0sI8lRSc8rMgTyWF3s+CM9eXFOQpGDj3wdg5p4GRcfUPH58M6uQkUP0jY+objk0ader6M5a9OSD6R8Y1HnV6689vVGlh6g/JhAAAQFopyg9qSU1IS2pCZ6xzzqmrf8Q7tTColq4BtR4dVN/wmAaGYwfhTm/98QP18QNt/J8fUEnBydBQmB84edD3DvTx/HMBk0oLT07/fHx5VqjIWw6qrCjWHvDpdAchAACQMczMe7JjoS6bXxHX7zjnNDIe1cDwuPpHxk4c0AdHxtU/Mq6BkTH1D5/2c2TsxPZDo1HNrYgFgrKiiQf0M5eDJ7Ypzk//R1QTAgAAWc3MYrMw5gVVUVrgdzlphXsyAADIUYQAAAByFCEAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQAAgBxFCAAAIEcRAgAAyFGEAAAAchQhAACAHEUIAAAgRxECAADIUeZc/M9YzkRm1iFpn48lzJR0xMfPTzX6m71yqa8S/c122d7fBc656vNtlPUhwG9m1uica/C7jlShv9krl/oq0d9sl2v9PRtOBwAAkKMIAQAA5ChCQPLd53cBKUZ/s1cu9VWiv9ku1/o7Ka4JAAAgRzESAABAjiIEJIiZPWBmm83scTPLm2T9SjNrNbMXvddSP+pMhHj6YmZFZvakmW0zswfNzPyoNRHMbO0pfW0xsw2TbJPx+9fM8s3sCW85rv2Xyfv51P5678/5Hfa2ydj9fNr+jasfmbp/T+vreb+/3nYZu2+ngxCQAGa2WlKec26VpLCkGybZrELSvc651d7r7ZQWmVjx9GW9pFbn3Apv+3UprTCBnHMvHO+rpO2SXp9ks4zev2ZWLOlVndxP8e6/jNzPp/c3zu+wlKH7eZL9G28/Mm7/nt7XOL+/Uobu2+kiBCTGYUnf8JbP9t+0QtLHzWyrmT2SKYn6LOLpy3WSnvaWn5N0bcqqSxIzK5G0xDm3fZLVGb1/nXODzrlLJbV6TfHuv4zcz5P0N57vsJSh+3mS/sbbj4zb
v5P0VdJ5v79Shu7b6SIEJIBz7h3n3FYzu1VSVNIvJtmsSdI9zrnLJUUkrUlljQkWT1+qJHV7yz2SKlNUWzKtk/TsWdZl0/6V4t9/WbGf4/wOS9mzn+PtR1bsX8+5vr9S9uzbKZn0vBemzsxukfQVSTc758Ym2aRZ0punLNekprKkaNb5+3JEUrm3XK7smJ7zZkk/Psu6ZmXP/pXi339Zs5/j+A5L2bOfmxVfP7Jm/+rc318pe/btlDASkABmVivpq5Jucs71nmWzuyXdZmYBSRfr5P9smSievjyrk+dVr5P0fIpqSwpvaHCtYkOik8mm/SvFv/+yYj/H+R2Wsmc/x9uPbNm/5/v+Stmzb6eEEJAYGxQbPnrKu6r0c2b296dt801Jd0raIulR59yOVBeZQBP6Imlwkv4+LGmOmW2X1KVzD8NlgpWSdjjnhsxsYZbvX2mS/XeWfmfLfj79O/w7Wb6fz+hHlu/fE99fScryfTslTBYEAECOYiQAAIAcRQgAACBHEQIAAMhRhAAAAHIUIQBAwpnZHWZ2h991ADg3QgAAADmKEAAgaczsIjN73sxCftcC4ExMGwwgWSKKTTbzofPMwgfAJ4wEAEiW31fsSW4L/C4EwOQIAQCS5S8lfcn7CSANEQIAJMuQc65F0i7vCX0A0gzPDgAAIEcxEgAAQI4iBAAAkKMIAQAA5ChCAAAAOYoQAABAjiIEAACQowgBAADkqP8P9dzuqF7ZCiIAAAAASUVORK5CYII=\n",
777 | "text/plain": [
778 | ""
779 | ]
780 | },
781 | "metadata": {},
782 | "output_type": "display_data"
783 | }
784 | ],
785 | "source": [
786 | "fig, ax = plt.subplots(1,1, figsize=(8,6))\n",
787 | "ax.plot(range(2,20), cost)\n",
788 | "ax.set_xlabel('k')\n",
789 | "ax.set_ylabel('cost')"
790 | ]
791 | },
792 | {
793 | "cell_type": "markdown",
794 | "metadata": {},
795 | "source": [
796 | "可以见到在k=5时,出现了拐角,我们取k=5"
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "execution_count": 23,
802 | "metadata": {},
803 | "outputs": [],
804 | "source": [
805 | "kmeans = KMeans(k=5, seed=1)\n",
806 | "km_model = kmeans.fit(df_km)\n",
807 | "centers = km_model.clusterCenters()"
808 | ]
809 | },
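{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check (a sketch, assuming Spark >= 2.3): besides the elbow on computeCost,\n",
"# the silhouette score from ClusteringEvaluator gives another view of how well k=5 separates the data.\n",
"from pyspark.ml.evaluation import ClusteringEvaluator\n",
"evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='prediction')\n",
"evaluator.evaluate(km_model.transform(df_km))"
]
},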
810 | {
811 | "cell_type": "code",
812 | "execution_count": 24,
813 | "metadata": {},
814 | "outputs": [
815 | {
816 | "data": {
817 | "text/plain": [
818 | "[array([55.2962963 , 49.51851852]),\n",
819 | " array([25.72727273, 79.36363636]),\n",
820 | " array([86.53846154, 82.12820513]),\n",
821 | " array([88.2 , 17.11428571]),\n",
822 | " array([26.30434783, 20.91304348])]"
823 | ]
824 | },
825 | "execution_count": 24,
826 | "metadata": {},
827 | "output_type": "execute_result"
828 | }
829 | ],
830 | "source": [
831 | "centers"
832 | ]
833 | },
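{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The fitted model also exposes the summary attributes listed above (a sketch,\n",
"# assuming Spark >= 2.1): clusterSizes shows how many customers fall into each cluster.\n",
"if km_model.hasSummary:\n",
"    print(km_model.summary.k)\n",
"    print(km_model.summary.clusterSizes)"
]
},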
834 | {
835 | "cell_type": "code",
836 | "execution_count": 32,
837 | "metadata": {},
838 | "outputs": [],
839 | "source": [
840 | "transformed = km_model.transform(df_km).select('CustomerID', 'prediction')"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 33,
846 | "metadata": {},
847 | "outputs": [
848 | {
849 | "name": "stdout",
850 | "output_type": "stream",
851 | "text": [
852 | "+----------+----------+\n",
853 | "|CustomerID|prediction|\n",
854 | "+----------+----------+\n",
855 | "| 1| 4|\n",
856 | "| 2| 1|\n",
857 | "| 3| 4|\n",
858 | "+----------+----------+\n",
859 | "only showing top 3 rows\n",
860 | "\n"
861 | ]
862 | }
863 | ],
864 | "source": [
865 | "transformed.show(3)"
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 35,
871 | "metadata": {},
872 | "outputs": [],
873 | "source": [
874 | "df_pred = df.join(transformed, 'CustomerID')"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": 36,
880 | "metadata": {},
881 | "outputs": [
882 | {
883 | "name": "stdout",
884 | "output_type": "stream",
885 | "text": [
886 | "+----------+------+---+------+-----+----------+\n",
887 | "|CustomerID|Gender|Age|Income|Spend|prediction|\n",
888 | "+----------+------+---+------+-----+----------+\n",
889 | "| 1| Male| 19| 15| 39| 4|\n",
890 | "| 2| Male| 21| 15| 81| 1|\n",
891 | "| 3|Female| 20| 16| 6| 4|\n",
892 | "+----------+------+---+------+-----+----------+\n",
893 | "only showing top 3 rows\n",
894 | "\n"
895 | ]
896 | }
897 | ],
898 | "source": [
899 | "df_pred.show(3)"
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": 39,
905 | "metadata": {},
906 | "outputs": [
907 | {
908 | "data": {
909 | "application/vnd.plotly.v1+json": {
910 | "data": [
911 | {
912 | "marker": {
913 | "color": [
914 | 4,
915 | 1,
916 | 4,
917 | 1,
918 | 4,
919 | 1,
920 | 4,
921 | 1,
922 | 4,
923 | 1,
924 | 4,
925 | 1,
926 | 4,
927 | 1,
928 | 4,
929 | 1,
930 | 4,
931 | 1,
932 | 4,
933 | 1,
934 | 4,
935 | 1,
936 | 4,
937 | 1,
938 | 4,
939 | 1,
940 | 4,
941 | 1,
942 | 4,
943 | 1,
944 | 4,
945 | 1,
946 | 4,
947 | 1,
948 | 4,
949 | 1,
950 | 4,
951 | 1,
952 | 4,
953 | 1,
954 | 4,
955 | 1,
956 | 4,
957 | 0,
958 | 4,
959 | 1,
960 | 0,
961 | 0,
962 | 0,
963 | 0,
964 | 0,
965 | 0,
966 | 0,
967 | 0,
968 | 0,
969 | 0,
970 | 0,
971 | 0,
972 | 0,
973 | 0,
974 | 0,
975 | 0,
976 | 0,
977 | 0,
978 | 0,
979 | 0,
980 | 0,
981 | 0,
982 | 0,
983 | 0,
984 | 0,
985 | 0,
986 | 0,
987 | 0,
988 | 0,
989 | 0,
990 | 0,
991 | 0,
992 | 0,
993 | 0,
994 | 0,
995 | 0,
996 | 0,
997 | 0,
998 | 0,
999 | 0,
1000 | 0,
1001 | 0,
1002 | 0,
1003 | 0,
1004 | 0,
1005 | 0,
1006 | 0,
1007 | 0,
1008 | 0,
1009 | 0,
1010 | 0,
1011 | 0,
1012 | 0,
1013 | 0,
1014 | 0,
1015 | 0,
1016 | 0,
1017 | 0,
1018 | 0,
1019 | 0,
1020 | 0,
1021 | 0,
1022 | 0,
1023 | 0,
1024 | 0,
1025 | 0,
1026 | 0,
1027 | 0,
1028 | 0,
1029 | 0,
1030 | 0,
1031 | 0,
1032 | 0,
1033 | 0,
1034 | 0,
1035 | 0,
1036 | 0,
1037 | 2,
1038 | 3,
1039 | 2,
1040 | 0,
1041 | 2,
1042 | 3,
1043 | 2,
1044 | 3,
1045 | 2,
1046 | 0,
1047 | 2,
1048 | 3,
1049 | 2,
1050 | 3,
1051 | 2,
1052 | 3,
1053 | 2,
1054 | 3,
1055 | 2,
1056 | 0,
1057 | 2,
1058 | 3,
1059 | 2,
1060 | 3,
1061 | 2,
1062 | 3,
1063 | 2,
1064 | 3,
1065 | 2,
1066 | 3,
1067 | 2,
1068 | 3,
1069 | 2,
1070 | 3,
1071 | 2,
1072 | 3,
1073 | 2,
1074 | 3,
1075 | 2,
1076 | 3,
1077 | 2,
1078 | 3,
1079 | 2,
1080 | 3,
1081 | 2,
1082 | 3,
1083 | 2,
1084 | 3,
1085 | 2,
1086 | 3,
1087 | 2,
1088 | 3,
1089 | 2,
1090 | 3,
1091 | 2,
1092 | 3,
1093 | 2,
1094 | 3,
1095 | 2,
1096 | 3,
1097 | 2,
1098 | 3,
1099 | 2,
1100 | 3,
1101 | 2,
1102 | 3,
1103 | 2,
1104 | 3,
1105 | 2,
1106 | 3,
1107 | 2,
1108 | 3,
1109 | 2,
1110 | 3,
1111 | 2,
1112 | 3,
1113 | 2
1114 | ],
1115 | "colorscale": "Viridis",
1116 | "size": 10
1117 | },
1118 | "mode": "markers",
1119 | "type": "scatter",
1120 | "x": [
1121 | 15,
1122 | 15,
1123 | 16,
1124 | 16,
1125 | 17,
1126 | 17,
1127 | 18,
1128 | 18,
1129 | 19,
1130 | 19,
1131 | 19,
1132 | 19,
1133 | 20,
1134 | 20,
1135 | 20,
1136 | 20,
1137 | 21,
1138 | 21,
1139 | 23,
1140 | 23,
1141 | 24,
1142 | 24,
1143 | 25,
1144 | 25,
1145 | 28,
1146 | 28,
1147 | 28,
1148 | 28,
1149 | 29,
1150 | 29,
1151 | 30,
1152 | 30,
1153 | 33,
1154 | 33,
1155 | 33,
1156 | 33,
1157 | 34,
1158 | 34,
1159 | 37,
1160 | 37,
1161 | 38,
1162 | 38,
... [remaining plotly figure data omitted: a scatter of Income (x) vs. Spend (y) with markers colored by the cluster 'prediction' column; see the source cell below] ...
1534 | },
1535 | "metadata": {},
1536 | "output_type": "display_data"
1537 | }
1538 | ],
1539 | "source": [
1540 | "pd_df = df_pred.toPandas()\n",
1541 | "trace = go.Scatter(x=pd_df.Income, y=pd_df.Spend, \n",
1542 | " mode='markers',\n",
1543 | " marker = {'size':10,'color':pd_df.prediction,'colorscale':'Viridis'})\n",
1544 | "iplot([trace])"
1545 | ]
1546 | },
1547 | {
1548 | "cell_type": "markdown",
1549 | "metadata": {},
1550 | "source": [
1551 |     "## BisectingKMeans (bisecting k-means)\n",
1552 |     "`pyspark.ml.clustering.BisectingKMeans(featuresCol='features', predictionCol='prediction', maxIter=20, seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure='euclidean')`"
1553 | ]
1554 | },
1555 | {
1556 | "cell_type": "markdown",
1557 | "metadata": {},
1558 | "source": [
1559 |     "The main idea of bisecting k-means: start with all points in a single cluster and split that cluster in two. Then repeatedly pick the cluster whose split most reduces the clustering cost (the sum of squared errors, SSE) and split it, continuing until the number of clusters reaches the user-specified k.\n",
1560 |     "\n",
1561 |     "The implicit principle: SSE measures clustering quality, and the smaller it is, the closer the points are to their centroids and the better the clustering. So the cluster with the largest SSE is split next, since a large SSE suggests that cluster is poorly formed and may actually contain several clusters lumped together."
1562 | ]
1563 | },
1564 | {
1565 | "cell_type": "markdown",
1566 | "metadata": {},
1567 | "source": [
1568 |     "**Parameters**\n",
1569 |     "\n",
1570 |     "`maxIter: maximum number of iterations\n",
1571 |     "k: number of clusters\n",
1572 |     "minDivisibleClusterSize: minimum number of points (if >= 1) or minimum proportion of points (if between 0 and 1) for a cluster to be divisible\n",
1573 |     "fit(dataset, params=None): fits the model`"
1574 | ]
1575 | },
1576 | {
1577 | "cell_type": "markdown",
1578 | "metadata": {},
1579 | "source": [
1580 |     "**Model attributes**\n",
1581 |     "\n",
1582 |     "`\n",
1583 |     "clusterCenters(): returns the cluster centers as numpy arrays\n",
1584 |     "computeCost(dataset): sum of squared distances between points and their nearest center\n",
1585 |     "transform(dataset): assigns a cluster prediction to each row\n",
1586 |     "hasSummary: whether the trained model has a summary\n",
1587 |     "summary: returns the training summary\n",
1588 |     "`"
1589 | ]
1590 | },
1591 | {
1592 | "cell_type": "markdown",
1593 | "metadata": {},
1594 | "source": [
1595 |     "**Summary attributes**\n",
1596 |     "\n",
1597 |     "`\n",
1598 |     "cluster: DataFrame of the predicted cluster for each training data point\n",
1599 |     "clusterSizes: size of each cluster\n",
1600 |     "k: number of clusters\n",
1601 |     "predictions: DataFrame produced by the model's transform method\n",
1602 |     "`"
1603 | ]
1604 | },
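  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (added for illustration, not part of the original run): it assumes a DataFrame `dataset` with a `features` vector column, like the one assembled for KMeans above; `k`, `maxIter` and `seed` values are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.clustering import BisectingKMeans\n",
    "# assumption: 'dataset' is a DataFrame with a 'features' vector column (as used for KMeans above)\n",
    "bkm = BisectingKMeans(k=5, maxIter=20, seed=1)\n",
    "bkm_model = bkm.fit(dataset)\n",
    "print(bkm_model.clusterCenters())        # list of numpy arrays, one per cluster\n",
    "print(bkm_model.summary.clusterSizes)    # number of points assigned to each cluster\n",
    "bkm_model.transform(dataset).show(5)     # adds a 'prediction' column"
   ]
  },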
1605 | {
1606 | "cell_type": "markdown",
1607 | "metadata": {},
1608 | "source": [
1609 |     "## GaussianMixture (Gaussian mixture model)\n",
1610 | "`pyspark.ml.clustering.GaussianMixture(featuresCol='features', predictionCol='prediction', k=2, probabilityCol='probability', tol=0.01, maxIter=100, seed=None)`"
1611 | ]
1612 | },
1613 | {
1614 | "cell_type": "markdown",
1615 | "metadata": {},
1616 | "source": [
1617 |     "GaussianMixture fits a mixture-of-Gaussians model with the expectation-maximization (EM) algorithm. Gaussian mixture models can also be used to draw confidence ellipsoids for multivariate data and to compute the BIC (Bayesian Information Criterion) to assess the number of clusters, though those conveniences come from other toolkits (e.g. scikit-learn) rather than from Spark's estimator."
1618 | ]
1619 | },
1620 | {
1621 | "cell_type": "markdown",
1622 | "metadata": {},
1623 | "source": [
1624 |     "Pros: instead of a hard cluster label, each sample gets a probability for every component, which is valuable information, and GMMs can be used for density estimation as well as clustering. Cons: when a component has too few points, estimating its covariance becomes difficult and the algorithm can diverge toward solutions with infinite likelihood unless the covariances are regularized; each iteration is more expensive than k-means; and because fitting relies on EM, it can get stuck in local optima, so the result depends heavily on initialization.\n",
1625 |     "\n"
1626 | ]
1627 | },
1628 | {
1629 | "cell_type": "markdown",
1630 | "metadata": {},
1631 | "source": [
1632 |     "Note: this algorithm may perform poorly on high-dimensional data (many features), because high dimensionality (a) makes clustering hard in general (for statistical/theoretical reasons) and (b) causes numerical issues when fitting Gaussian distributions."
1633 | ]
1634 | },
1635 | {
1636 | "cell_type": "markdown",
1637 | "metadata": {},
1638 | "source": [
1639 |     "**Parameters**\n",
1640 |     "\n",
1641 |     "`fit(dataset, params=None): fits the model\n",
1642 |     "k: number of independent Gaussian components, > 1\n",
1643 |     "maxIter: maximum number of iterations, >= 0\n",
1644 |     "tol: convergence tolerance of the iterative algorithm, >= 0\n",
1645 |     "plus the usual setter and getter methods`"
1646 | ]
1647 | },
1648 | {
1649 | "cell_type": "markdown",
1650 | "metadata": {},
1651 | "source": [
1652 |     "**Model attributes**\n",
1653 |     "\n",
1654 |     "`\n",
1655 |     "gaussiansDF: the fitted Gaussians as a DataFrame, one row per component with two columns: mean (vector) and cov (matrix)\n",
1656 |     "hasSummary: whether the trained model has a summary\n",
1657 |     "summary: returns the training summary\n",
1658 |     "transform(dataset, params=None): adds prediction and probability columns\n",
1659 |     "weights: mixture weights, summing to 1\n",
1660 |     "`"
1661 | ]
1662 | },
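  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (added for illustration, not part of the original run): it again assumes a DataFrame `dataset` with a `features` vector column; `k`, `tol` and `seed` are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.clustering import GaussianMixture\n",
    "# assumption: 'dataset' has a 'features' vector column; parameter values are illustrative only\n",
    "gm = GaussianMixture(k=3, tol=0.001, maxIter=100, seed=10)\n",
    "gm_model = gm.fit(dataset)\n",
    "gm_model.gaussiansDF.show(truncate=False)   # one row per component: mean vector and covariance matrix\n",
    "print(gm_model.weights)                     # mixing weights, sum to 1\n",
    "gm_model.transform(dataset).select('prediction', 'probability').show(5)"
   ]
  },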
1663 | {
1664 | "cell_type": "code",
1665 | "execution_count": 45,
1666 | "metadata": {},
1667 | "outputs": [],
1668 | "source": [
1669 | "spark.stop()"
1670 | ]
1671 | },
1672 | {
1673 | "cell_type": "code",
1674 | "execution_count": null,
1675 | "metadata": {},
1676 | "outputs": [],
1677 | "source": []
1678 | }
1679 | ],
1680 | "metadata": {
1681 | "kernelspec": {
1682 | "display_name": "Python 3",
1683 | "language": "python",
1684 | "name": "python3"
1685 | },
1686 | "language_info": {
1687 | "codemirror_mode": {
1688 | "name": "ipython",
1689 | "version": 3
1690 | },
1691 | "file_extension": ".py",
1692 | "mimetype": "text/x-python",
1693 | "name": "python",
1694 | "nbconvert_exporter": "python",
1695 | "pygments_lexer": "ipython3",
1696 | "version": "3.6.4"
1697 | }
1698 | },
1699 | "nbformat": 4,
1700 | "nbformat_minor": 2
1701 | }
1702 |
--------------------------------------------------------------------------------
/pyspark-RDD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "RDD(Resilient Distributed Dataset)叫做弹性分布式数据集,是Spark中最基本的数据抽象,它代表一个不可变、可分区、里面的元素可并行计算的集合。RDD具有数据流模型的特点:自动容错、位置感知性调度和可伸缩性。RDD允许用户在执行多个查询时显式地将工作集缓存在内存中,后续的查询能够重用工作集,这极大地提升了查询速度。\n",
8 | "\n",
9 | "(1)一组分片(Partition),即数据集的基本组成单位。对于RDD来说,每个分片都会被一个计算任务处理,并决定并行计算的粒度。用户可以在创建RDD时指定RDD的分片个数,如果没有指定,那么就会采用默认值。默认值就是程序所分配到的CPU Core的数目。\n",
10 | "\n",
11 | "(2)一个计算每个分区的函数。Spark中RDD的计算是以分片为单位的,每个RDD都会实现compute函数以达到这个目的。compute函数会对迭代器进行复合,不需要保存每次计算的结果。\n",
12 | "\n",
13 | "(3)RDD之间的依赖关系。RDD的每次转换都会生成一个新的RDD,所以RDD之间就会形成类似于流水线一样的前后依赖关系。在部分分区数据丢失时,Spark可以通过这个依赖关系重新计算丢失的分区数据,而不是对RDD的所有分区进行重新计算。\n",
14 | "\n",
15 | "(4)一个Partitioner,即RDD的分片函数。当前Spark中实现了两种类型的分片函数,一个是基于哈希的HashPartitioner,另外一个是基于范围的RangePartitioner。只有对于于key-value的RDD,才会有Partitioner,非key-value的RDD的Parititioner的值是None。Partitioner函数不但决定了RDD本身的分片数量,也决定了parent RDD Shuffle输出时的分片数量。\n",
16 | "\n",
17 | "(5)一个列表,存储存取每个Partition的优先位置(preferred location)。对于一个HDFS文件来说,这个列表保存的就是每个Partition所在的块的位置。按照“移动数据不如移动计算”的理念,Spark在进行任务调度的时候,会尽可能地将计算任务分配到其所要处理数据块的存储位置。\n",
18 | "\n",
19 | "使用手册 \n",
20 | "http://spark.apache.org/docs/latest/api/python/pyspark.html\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "#pyspark.SparkContext()是spark应用的入口,也可以称为驱动\n",
30 | "from pyspark import SparkConf, SparkContext"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "conf = SparkConf().setAppName(\"sparkApp1\").setMaster(\"local\")\n",
40 | "sc = SparkContext.getOrCreate(conf)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "[0, 2, 3, 4, 6]\n",
53 | "[[0], [2], [3], [4], [6]]\n",
54 | "[0, 2, 4]\n",
55 | "[[], [0], [], [2], [4]]\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 |     "# parallelize(c, numSlices=None): distribute a local Python collection to form an RDD. If the input is a range, using range/xrange is recommended for performance.\n",
61 |     "# glom(): return an RDD created by coalescing all elements within each partition into a list.\n",
62 | "rdd1 = sc.parallelize([0,2,3,4,6], 5)\n",
63 | "rdd2 = sc.parallelize(range(0, 6, 2), 5)\n",
64 | "print(rdd1.collect())\n",
65 | "print(rdd1.glom().collect())\n",
66 | "print(rdd2.collect())\n",
67 | "print(rdd2.glom().collect())"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/plain": [
78 | "[0, 9]"
79 | ]
80 | },
81 | "execution_count": 4,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 |     "# runJob(rdd, partitionFunc, partitions=None, allowLocal=False)\n",
88 |     "# Executes the given partitionFunc on the specified set of partitions and returns the result as an array of elements. If no partitions are specified, it runs on all partitions.\n",
89 | "sc.runJob(rdd1, lambda part: [x * x for x in part], [0, 2], True)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 6,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "1528077753028\n",
102 | "ffzs\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "print(sc.startTime)\n",
108 | "print(sc.sparkUser())"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# rdd.glom()\n",
118 | "# glom()定义了将原rdd相同分区的元素放在一个列表中构成新的rdd的转换操作。\n",
119 | "# rdd.collect()\n",
120 | "# 返回由rdd元素组成的列表\n",
121 | "# rdd.collectAsMap()\n",
122 | "# 将键值对形式的RDD以字典的形式返回给master "
123 | ]
124 | },
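  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick illustrative example of `collectAsMap()` (added sketch; the key/value pairs are arbitrary)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collectAsMap(): returns a pair RDD to the driver as a plain dict\n",
    "m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()\n",
    "print(m[1], m[3])   # 2 4"
   ]
  },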
125 | {
126 | "cell_type": "code",
127 | "execution_count": 37,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# cache()\n",
132 | "# 将RDD持久化为MEMORY_ONLY"
133 | ]
134 | },
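  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch showing that `cache()` only marks the RDD lazily; the data is kept in memory once the first action computes it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cache() marks the RDD to be kept in memory (MEMORY_ONLY) after the first action materializes it\n",
    "rdd = sc.parallelize(range(10))\n",
    "rdd.cache()\n",
    "print(rdd.is_cached)          # True\n",
    "print(rdd.getStorageLevel())  # shows the storage level set by cache()"
   ]
  },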
135 | {
136 | "cell_type": "code",
137 | "execution_count": 34,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "data": {
142 | "text/plain": [
143 | "[('a', 'aa', 1), ('b', 'bb', 1), ('c', 'cc', 1)]"
144 | ]
145 | },
146 | "execution_count": 34,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "# map(f, preservesPartitioning=False)\n",
153 | "# 通过对这个RDD的每个元素应用一个函数来返回一个新的RDD\n",
154 | "rdd = sc.parallelize([\"b\", \"a\", \"c\"])\n",
155 | "sorted(rdd.map(lambda x:(x, x*2, 1)).collect())"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 38,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "[1, 1, 1, 2, 2, 3]\n",
168 | "[[(2, 2), (2, 2)], [(3, 3), (3, 3)], [(4, 4), (4, 4)]]\n",
169 | "[(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "#flatMap(f, preservesPartitioning=False)\n",
175 | "#首先将一个函数应用到这个RDD的所有元素上,然后将结果全部展开,返回一个新的RDD\n",
176 | "rdd = sc.parallelize([2, 3, 4])\n",
177 | "print(sorted(rdd.flatMap(lambda x: range(1, x)).collect()))\n",
178 | "print(sorted(rdd.map(lambda x: [(x, x), (x, x)]).collect()))\n",
179 | "print(sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect()))\n"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 39,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "[('a', 3), ('b', 1)]"
191 | ]
192 | },
193 | "execution_count": 39,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 |     "# mapValues(f)\n",
200 |     "# Passes each value of the pair RDD through f without changing the keys; the original partitioning is preserved.\n",
201 | "x = sc.parallelize([(\"a\", [\"apple\", \"banana\", \"lemon\"]), (\"b\", [\"grapes\"])])\n",
202 | "def f(x): return len(x)\n",
203 | "x.mapValues(f).collect()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 52,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]\n",
216 | "[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "#flatMapValues(f)\n",
222 | "#通过flatMap函数传递键值对RDD中的每个值,而不改变键;这也保留了原始的RDD分区。\n",
223 | "x = sc.parallelize([(\"a\", [\"x\", \"y\", \"z\"]), (\"b\", [\"p\", \"r\"])])\n",
224 | "def f(x): return x\n",
225 | "print(x.flatMapValues(f).collect())\n",
226 | "print(x.mapValues(f).collect())"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 53,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "data": {
236 | "text/plain": [
237 | "[3, 7]"
238 | ]
239 | },
240 | "execution_count": 53,
241 | "metadata": {},
242 | "output_type": "execute_result"
243 | }
244 | ],
245 | "source": [
246 |     "# mapPartitions(f, preservesPartitioning=False)\n",
247 |     "# Unlike map, which applies the function to each element, mapPartitions applies the function once per partition: the elements of a partition are handed to f as an iterator. The iterator supports only generic iteration (sum, loops, etc.); for more involved per-partition logic you can materialize it with [x for x in iterator], at the cost of losing the streaming behaviour within that partition.\n",
248 |     "# The function below uses yield, i.e. it is a generator, which is a convenient way to produce the per-partition result lazily.\n",
249 | "rdd = sc.parallelize([1, 2, 3, 4], 2)\n",
250 | "def f(iterator): yield sum(iterator)\n",
251 | "rdd.mapPartitions(f).collect()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 55,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/plain": [
262 | "6"
263 | ]
264 | },
265 | "execution_count": 55,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "# mapPartitionsWithIndex(f, preservesPartitioning=False)\n",
272 | "# 通过在这个RDD的每个分区上应用一个函数来返回一个新的RDD,同时跟踪原始分区的索引。为对索引进行操作提供可能\n",
273 | "rdd = sc.parallelize([1, 2, 3, 4], 4)\n",
274 | "def f(splitIndex, iterator): yield splitIndex\n",
275 | "rdd.mapPartitionsWithIndex(f).sum()"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 57,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/plain": [
286 | "0"
287 | ]
288 | },
289 | "execution_count": 57,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 |     "# partitionBy(numPartitions, partitionFunc=portable_hash)\n",
296 |     "# Returns a copy of the RDD partitioned using the specified partitioner.\n",
297 |     "# set().intersection computes the intersection, used here to check that no key ends up in both partitions.\n",
298 | "pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))\n",
299 | "sets = pairs.partitionBy(2).glom().collect()\n",
300 | "len(set(sets[0]).intersection(set(sets[1])))"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 63,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "[[1], [2, 3], [4, 5]]\n",
313 | "[[], [1], [4, 5], [2, 3], []]\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 |     "# coalesce(numPartitions, shuffle=False)\n",
319 |     "# Returns a new RDD reduced to numPartitions partitions; shrinking the partition count does not need a shuffle. To increase the number of partitions, set shuffle=True (which is what repartition does).\n",
320 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect())\n",
321 | "print(sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(5,True).glom().collect())"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 64,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "name": "stdout",
331 | "output_type": "stream",
332 | "text": [
333 | "2\n",
334 | "10\n"
335 | ]
336 | }
337 | ],
338 | "source": [
339 | "# repartition(numPartitions)\n",
340 | "# 重新分区,默认shuffle 减少分区用coalesce\n",
341 | "rdd = sc.parallelize([1,2,3,4,5,6,7], 4)\n",
342 | "print(len(rdd.repartition(2).glom().collect()))\n",
343 | "print(len(rdd.repartition(10).glom().collect()))"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 65,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]"
355 | ]
356 | },
357 | "execution_count": 65,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 |     "# zip(other)\n",
364 |     "# Pairs the elements of this RDD (as keys) with the elements of the other RDD (as values), position by position.\n",
365 | "x = sc.parallelize(range(0,5))\n",
366 | "y = sc.parallelize(range(1000, 1005))\n",
367 | "x.zip(y).collect()"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 68,
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/plain": [
378 | "[('a', 0), ('b', 1), ('c', 2), ('d', 3)]"
379 | ]
380 | },
381 | "execution_count": 68,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "# rdd.zipWithIndex()\n",
388 | "# RDD为key 排序位置索引作为value\n",
389 | "sc.parallelize([\"a\", \"b\", \"c\", \"d\"], 2).zipWithIndex().collect()"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 73,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "['a', 'b', 'c', 'd', 'e']\n",
402 | "[['a'], ['b', 'c'], ['d', 'e']]\n",
403 | "[('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]\n"
404 | ]
405 | }
406 | ],
407 | "source": [
408 |     "# zipWithUniqueId()\n",
409 |     "# Items in the k-th partition get the ids k, n+k, 2*n+k, ... (where n is the number of partitions); the RDD elements become the keys and the ids the values.\n",
410 | "rdd = sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"], 3)\n",
411 | "print(rdd.collect())\n",
412 | "print(rdd.glom().collect())\n",
413 | "print(rdd.zipWithUniqueId().collect())"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 4,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "name": "stdout",
423 | "output_type": "stream",
424 | "text": [
425 | "[(0, 0), (1, 1), (4, 2)]\n",
426 | "[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]\n"
427 | ]
428 | },
429 | {
430 | "data": {
431 | "text/plain": [
432 | "[(0, [[0], [0]]),\n",
433 | " (1, [[1], [1]]),\n",
434 | " (2, [[], [2]]),\n",
435 | " (3, [[], [3]]),\n",
436 | " (4, [[2], [4]])]"
437 | ]
438 | },
439 | "execution_count": 4,
440 | "metadata": {},
441 | "output_type": "execute_result"
442 | }
443 | ],
444 | "source": [
445 |     "# rdd.keyBy(f)\n",
446 |     "# Creates tuples (f(x), x) from the elements of the RDD; the example below then cogroups the result with another pair RDD.\n",
447 | "x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)\n",
448 | "y = sc.parallelize(zip(range(0,5), range(0,5)))\n",
449 | "print(x.collect())\n",
450 | "print(y.collect())\n",
451 | "[(x, list(map(list, y))) for x, y in sorted(x.cogroup(y).collect())]"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 6,
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 |     "# foreach(f)\n",
461 |     "# Applies a function to every element of the RDD for its side effects; it returns None, not a new RDD.\n",
462 | "def fun(x): \n",
463 | " print(x)\n",
464 | "sc.parallelize([1, 2, 3, 4, 5]).foreach(fun)"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 7,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "# foreachPartition(f)\n",
474 | "# 使一个函数作用于RDD上每一个分区\n",
475 | "def fun(iterator):\n",
476 | " for x in iterator:\n",
477 | " print(x)\n",
478 | "sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(fun)"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 8,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "None\n",
491 | "1\n",
492 | "2\n",
493 | "3\n",
494 | "\n"
495 | ]
496 | },
497 | {
498 | "data": {
499 | "text/plain": [
500 | "'\\n1\\n2\\n3\\n'"
501 | ]
502 | },
503 | "execution_count": 8,
504 | "metadata": {},
505 | "output_type": "execute_result"
506 | }
507 | ],
508 | "source": [
509 | "inputData=sc.parallelize([1,2,3])\n",
510 | "def f(x):#定义一个将内容追加于文件末尾的函数\n",
511 | " with open('./example.txt','a+') as fl:\n",
512 | " print(x,file=fl)\n",
513 | "\n",
514 |     "open('./example.txt','w').close()  # truncate the file first so the example starts from an empty file\n",
515 | "y=inputData.foreach(f)\n",
516 | "print(y)\n",
517 | "#结果为:None,因为函数f没有返回值\n",
518 | "#查看写文件的结果\n",
519 | "with open('./example.txt') as fl:\n",
520 | " print(fl.read())"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 9,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/plain": [
531 | "[(0, [2, 8]), (1, [1, 1, 3, 5])]"
532 | ]
533 | },
534 | "execution_count": 9,
535 | "metadata": {},
536 | "output_type": "execute_result"
537 | }
538 | ],
539 | "source": [
540 |     "# groupBy(f, numPartitions=None, partitionFunc=portable_hash)\n",
541 |     "# Groups the elements by the value returned by f and returns an RDD of (group key, grouped items).\n",
542 | "rdd = sc.parallelize([1, 1, 2, 3, 5, 8])\n",
543 | "result = rdd.groupBy(lambda x: x % 2).collect()\n",
544 | "sorted([(x, sorted(y)) for (x, y) in result])"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 11,
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "[('a', 2), ('b', 1)]\n",
557 | "[('a', [1, 1]), ('b', [1])]\n"
558 | ]
559 | }
560 | ],
561 | "source": [
562 |     "# groupByKey(numPartitions=None, partitionFunc=portable_hash)\n",
563 |     "# For a pair RDD, groupByKey() gathers the values that share a key into one sequence.\n",
564 |     "# If you are grouping in order to aggregate per key (e.g. a sum or an average), reduceByKey or aggregateByKey will give much better performance.\n",
565 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
566 | "print(sorted(rdd.groupByKey().mapValues(len).collect()))\n",
567 | "print(sorted(rdd.groupByKey().mapValues(list).collect()))"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 20,
573 | "metadata": {},
574 | "outputs": [
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "[('a', ([1], [2])), ('b', ([4], []))]\n",
580 | "([4], [])\n"
581 | ]
582 | }
583 | ],
584 | "source": [
585 |     "# cogroup(other, numPartitions=None)\n",
586 |     "# For each key k in self or other, the resulting RDD contains (k, (values of k in self, values of k in other)).\n",
587 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
588 | "y = sc.parallelize([(\"a\", 2)])\n",
589 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(x.cogroup(y).collect()))])\n",
590 | "print(tuple(map(list,list(x.cogroup(y).collect()[0][1]))))\n"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 21,
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "name": "stdout",
600 | "output_type": "stream",
601 | "text": [
602 | "[('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]\n"
603 | ]
604 | }
605 | ],
606 | "source": [
607 | "# groupWith(other, *others)\n",
608 | "# cogroup的别名,但支持多个RDD\n",
609 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n",
610 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
611 | "y = sc.parallelize([(\"a\", 2)])\n",
612 | "z = sc.parallelize([(\"b\", 42)])\n",
613 | "print([(x, tuple(map(list, y))) for x, y in sorted(list(w.groupWith(x, y, z).collect()))])"
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 24,
619 | "metadata": {},
620 | "outputs": [
621 | {
622 | "name": "stdout",
623 | "output_type": "stream",
624 | "text": [
625 | "15\n",
626 | "abcde\n"
627 | ]
628 | }
629 | ],
630 | "source": [
631 |     "# reduce(f)\n",
632 |     "# reduce combines the elements of the RDD pairwise with f, then keeps combining the partial results pairwise\n",
633 |     "# until a single value remains (first within each partition, then across partitions). The result is a plain Python object, not an RDD.\n",
634 | "# operator 操作函数 https://docs.python.org/3/library/operator.html\n",
635 | "from operator import *\n",
636 | "print(sc.parallelize([1, 2, 3, 4, 5]).reduce(add))\n",
637 | "print(sc.parallelize([\"a\", \"b\", \"c\", \"d\", \"e\"]).reduce(concat))"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 29,
643 | "metadata": {},
644 | "outputs": [
645 | {
646 | "data": {
647 | "text/plain": [
648 | "[('a', 2), ('b', 1)]"
649 | ]
650 | },
651 | "execution_count": 29,
652 | "metadata": {},
653 | "output_type": "execute_result"
654 | }
655 | ],
656 | "source": [
657 |     "# reduceByKey(func, numPartitions=None, partitionFunc=portable_hash)\n",
658 |     "# Groups by key and reduces the values of each key with func.\n",
659 | "from operator import *\n",
660 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
661 | "sorted(rdd.reduceByKey(add).collect())"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 36,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stdout",
671 | "output_type": "stream",
672 | "text": [
673 | "{'a': 2, 'b': 1}\n",
674 | "[('a', 2), ('b', 1)]\n"
675 | ]
676 | }
677 | ],
678 | "source": [
679 |     "# reduceByKeyLocally(func)\n",
680 |     "# Like reduceByKey, except the aggregated key/value pairs are returned to the master immediately as a dict.\n",
681 |     "# The merging is also done locally on each mapper before results are sent to a reducer, similar to a 'combiner' in MapReduce.\n",
682 | "from operator import *\n",
683 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
684 | "print(rdd.reduceByKeyLocally(add))\n",
685 | "print(sorted(rdd.reduceByKeyLocally(add).items()))"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 38,
691 | "metadata": {},
692 | "outputs": [
693 | {
694 | "data": {
695 | "text/plain": [
696 | "-5"
697 | ]
698 | },
699 | "execution_count": 38,
700 | "metadata": {},
701 | "output_type": "execute_result"
702 | }
703 | ],
704 | "source": [
705 |     "# treeReduce(f, depth=2)\n",
706 |     "# Reduces in a multi-level tree pattern across partitions instead of pulling all partial results to the driver at once.\n",
707 |     "# depth: suggested depth of the aggregation tree (>= 1).\n",
708 | "add = lambda x, y: x + y\n",
709 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n",
710 | "rdd.treeReduce(add, 2)"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 28,
716 | "metadata": {},
717 | "outputs": [
718 | {
719 | "name": "stdout",
720 | "output_type": "stream",
721 | "text": [
722 | "[5, 6]\n",
723 | "['a', 'b']\n"
724 | ]
725 | }
726 | ],
727 | "source": [
728 | "# rdd.keys()\n",
729 | "# 原rdd的元素为键值对,返回原rdd元素的键为元素的rdd\n",
730 | "# rdd.values()\n",
731 | "# 原rdd的元素为键值对,返回原rdd元素的值为元素的rdd\n",
732 | "w = sc.parallelize([(\"a\", 5), (\"b\", 6)])\n",
733 | "print(w.keys().collect())\n",
734 | "print(w.values().collect())"
735 | ]
736 | },
737 | {
738 | "cell_type": "markdown",
739 | "metadata": {},
740 | "source": [
741 | "`aggregate函数`\n",
742 | "\n",
743 | "将每个分区里面的元素进行聚合,然后用combine函数将每个分区的结果和初始值(zeroValue)进行combine操作。这个函数最终返回的类型不需要和RDD中元素类型一致。\n",
744 | "\n",
745 | "seqOp操作会聚合各分区中的元素,然后combOp操作把所有分区的聚合结果再次聚合,两个操作的初始值都是zeroValue. seqOp的操作是遍历分区中的所有元素(T),第一个T跟zeroValue做操作,结果再作为与第二个T做操作的zeroValue,直到遍历完整个分区。combOp操作是把各分区聚合的结果,再聚合。aggregate函数返回一个跟RDD不同类型的值。因此,需要一个操作seqOp来把分区中的元素T合并成一个U,另外一个操作combOp把所有U聚合。\n"
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": 38,
751 | "metadata": {},
752 | "outputs": [
753 | {
754 | "name": "stdout",
755 | "output_type": "stream",
756 | "text": [
757 | "(10, 4)\n",
758 | "(10, 4)\n",
759 | "(10, 28)\n"
760 | ]
761 | }
762 | ],
763 | "source": [
764 | "#aggregate(zeroValue, seqOp, combOp)\n",
765 | "seqOp = (lambda x, y : (x[0] + y, x[1] + 1))\n",
766 | "combOp = (lambda x, y : (x[0] + y[0], x[1] + y[1]))\n",
767 | "print(sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp))\n",
768 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 0), seqOp, combOp))\n",
769 |     "# With zeroValue (0, 6) and 3 partitions, the 6 is added once per partition by seqOp and once more by combOp, i.e. 4 times in total: 4 + 4*6 = 28.\n",
770 | "print(sc.parallelize([1, 2, 3, 4],3).aggregate((0, 6), seqOp, combOp))"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": null,
776 | "metadata": {},
777 | "outputs": [],
778 | "source": [
779 |     "# aggregateByKey(zeroValue, seqFunc, combFunc, numPartitions=None, partitionFunc=portable_hash)\n",
780 |     "# Same logic as aggregate, but the aggregation is done per key rather than over each partition as a whole.\n",
781 |     "# zeroValue is the neutral starting value for every key's accumulator, and it must have the same form as the accumulator used inside the per-key aggregation (see the sketch below)."
782 | ]
783 | },
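  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch of `aggregateByKey`, computing a (sum, count) pair per key so that `zeroValue` has the same shape as the per-key accumulator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# aggregateByKey: per-key (sum, count); zeroValue must match the accumulator's form\n",
    "rdd = sc.parallelize([('a', 1), ('b', 1), ('a', 2)])\n",
    "seqFunc = lambda acc, v: (acc[0] + v, acc[1] + 1)       # merge one value into the per-key accumulator\n",
    "combFunc = lambda a, b: (a[0] + b[0], a[1] + b[1])      # merge accumulators from different partitions\n",
    "sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect())\n",
    "# [('a', (3, 2)), ('b', (1, 1))]"
   ]
  },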
784 | {
785 | "cell_type": "code",
786 | "execution_count": 37,
787 | "metadata": {},
788 | "outputs": [
789 | {
790 | "data": {
791 | "text/plain": [
792 | "-5"
793 | ]
794 | },
795 | "execution_count": 37,
796 | "metadata": {},
797 | "output_type": "execute_result"
798 | }
799 | ],
800 | "source": [
801 | "# treeAggregate(zeroValue, seqOp, combOp, depth=2)\n",
802 | "# 与aggregate不同的地方是:在每个分区,会做两次或者多次combOp,避免将所有局部的值传给driver端.另外,经过测验初始值zeroValue不会参与combOp.\n",
803 | "# depth:树的深度\n",
804 | "add = lambda x, y: x + y\n",
805 | "rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)\n",
806 | "rdd.treeAggregate(0, add, add, 2)"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": 39,
812 | "metadata": {},
813 | "outputs": [
814 | {
815 | "data": {
816 | "text/plain": [
817 | "15"
818 | ]
819 | },
820 | "execution_count": 39,
821 | "metadata": {},
822 | "output_type": "execute_result"
823 | }
824 | ],
825 | "source": [
826 |     "# fold(zeroValue, op)\n",
827 |     "# A simplified aggregate: one initial value, and the same function op is used both within each partition and to combine the partition results.\n",
828 | "from operator import add\n",
829 | "sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 40,
835 | "metadata": {},
836 | "outputs": [
837 | {
838 | "data": {
839 | "text/plain": [
840 | "[('a', 2), ('b', 1)]"
841 | ]
842 | },
843 | "execution_count": 40,
844 | "metadata": {},
845 | "output_type": "execute_result"
846 | }
847 | ],
848 | "source": [
849 |     "# foldByKey(zeroValue, func, numPartitions=None, partitionFunc=portable_hash)\n",
850 |     "# Same logic as fold, but grouped by key.\n",
851 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
852 | "from operator import add\n",
853 | "sorted(rdd.foldByKey(0, add).collect())"
854 | ]
855 | },
856 | {
857 | "cell_type": "code",
858 | "execution_count": 42,
859 | "metadata": {},
860 | "outputs": [
861 | {
862 | "name": "stdout",
863 | "output_type": "stream",
864 | "text": [
865 | "[('a', 1), ('a', 2), ('b', 1), ('b', 3), ('c', 5), ('c', 6)]\n",
866 | "[('a', [1, 2]), ('b', [1, 3]), ('c', [5, 6])]\n"
867 | ]
868 | }
869 | ],
870 | "source": [
871 |     "# combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions=None, partitionFunc=portable_hash)\n",
872 |     "# Turns an RDD[(K, V)] into an RDD[(K, C)] using three functions:\n",
873 |     "# createCombiner: turns a single value V into the combined type C (here, wraps it in a list)\n",
874 |     "# mergeValue: merges one more V into an existing C for the same key\n",
875 |     "# mergeCombiners: merges two C values (from different partitions) into one\n",
876 | "x=sc.parallelize([('a',1),('a',2),('b',1),('b',3),('c',5),('c',6)])\n",
877 | "def to_list(a):\n",
878 | " return [a]\n",
879 | "def append(a,b):\n",
880 | " a.append(b)\n",
881 | " return a\n",
882 | "def extend(a,b):\n",
883 | " a.extend(b)\n",
884 | " return a\n",
885 | "print(x.collect())\n",
886 | "print(x.combineByKey(to_list,append,extend).collect())"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 43,
892 | "metadata": {},
893 | "outputs": [
894 | {
895 | "name": "stdout",
896 | "output_type": "stream",
897 | "text": [
898 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
899 | "[('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n"
900 | ]
901 | }
902 | ],
903 | "source": [
904 |     "# rdd.sortBy(keyfunc, ascending=True, numPartitions=None)\n",
905 |     "# Sorts the RDD by the sort key computed by keyfunc for each element.\n",
906 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n",
907 | "print(sc.parallelize(tmp).sortBy(lambda x: x[0]).collect())\n",
908 | "print(sc.parallelize(tmp).sortBy(lambda x: x[1]).collect())"
909 | ]
910 | },
911 | {
912 | "cell_type": "code",
913 | "execution_count": 45,
914 | "metadata": {},
915 | "outputs": [
916 | {
917 | "name": "stdout",
918 | "output_type": "stream",
919 | "text": [
920 | "('1', 3)\n",
921 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
922 | "[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n",
923 | "[('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5), ('little', 4), ('Mary', 1), ('was', 8), ('white', 9), ('whose', 6)]\n"
924 | ]
925 | }
926 | ],
927 | "source": [
928 |     "# sortByKey(ascending=True, numPartitions=None, keyfunc=lambda x: x)\n",
929 |     "# Sorts this RDD, which is assumed to consist of (key, value) pairs, by key.\n",
930 | "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n",
931 | "print(sc.parallelize(tmp).sortByKey().first())\n",
932 | "print(sc.parallelize(tmp).sortByKey(True, 1).collect())\n",
933 | "print(sc.parallelize(tmp).sortByKey(True, 2).collect())\n",
934 | "tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]\n",
935 | "tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])\n",
936 | "print(sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect())"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 47,
942 | "metadata": {},
943 | "outputs": [
944 | {
945 | "name": "stdout",
946 | "output_type": "stream",
947 | "text": [
948 | "(count: 4, mean: 2.5, stdev: 1.118033988749895, max: 4.0, min: 1.0)\n"
949 | ]
950 | }
951 | ],
952 | "source": [
953 | "# stats()\n",
954 | "# 计算rdd中全体元素的均值、方差、最大值、最小值和个数的信息\n",
955 | "samp=sc.parallelize([1,2,3,4]).stats()\n",
956 | "print(samp)"
957 | ]
958 | },
959 | {
960 | "cell_type": "code",
961 | "execution_count": 48,
962 | "metadata": {},
963 | "outputs": [
964 | {
965 | "data": {
966 | "text/plain": [
967 | "3"
968 | ]
969 | },
970 | "execution_count": 48,
971 | "metadata": {},
972 | "output_type": "execute_result"
973 | }
974 | ],
975 | "source": [
976 | "# rdd.count()\n",
977 | "# 计算rdd所有元素个数\n",
978 | "sc.parallelize([2, 3, 4]).count()"
979 | ]
980 | },
981 | {
982 | "cell_type": "code",
983 | "execution_count": 3,
984 | "metadata": {},
985 | "outputs": [
986 | {
987 | "data": {
988 | "text/plain": [
989 | "10000"
990 | ]
991 | },
992 | "execution_count": 3,
993 | "metadata": {},
994 | "output_type": "execute_result"
995 | }
996 | ],
997 | "source": [
998 |     "# countApprox(timeout, confidence=0.95)\n",
999 |     "# Approximate version of count() that returns a possibly incomplete result within the given timeout, even if not all tasks have finished.\n",
1000 | "rdd = sc.parallelize(range(10000), 10)\n",
1001 | "rdd.countApprox(1000, 1.0)"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": 5,
1007 | "metadata": {},
1008 | "outputs": [
1009 | {
1010 | "name": "stdout",
1011 | "output_type": "stream",
1012 | "text": [
1013 | "1060\n",
1014 | "19\n"
1015 | ]
1016 | }
1017 | ],
1018 | "source": [
1019 | "# countApproxDistinct(relativeSD=0.05)\n",
1020 | "# 返回RDD中不同值数的近似值\n",
1021 | "# relativeSD 相对准确度。较小的值创建需要更多空间的计数器。它必须大于0.000017。\n",
1022 | "n = sc.parallelize(range(1000)).map(str).countApproxDistinct()\n",
1023 | "print(n)\n",
1024 | "n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()\n",
1025 | "print(n)"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 9,
1031 | "metadata": {},
1032 | "outputs": [
1033 | {
1034 | "name": "stdout",
1035 | "output_type": "stream",
1036 | "text": [
1037 | "[('a', 2), ('b', 1)]\n",
1038 |     "defaultdict(<class 'int'>, {'a': 2, 'b': 1})\n"
1039 | ]
1040 | }
1041 | ],
1042 | "source": [
1043 |     "# countByKey()\n",
1044 |     "# Counts the number of elements for each key and returns the result to the master as a dictionary.\n",
1045 | "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n",
1046 | "print(sorted(rdd.countByKey().items()))\n",
1047 | "print(rdd.countByKey())"
1048 | ]
1049 | },
1050 | {
1051 | "cell_type": "code",
1052 | "execution_count": 21,
1053 | "metadata": {},
1054 | "outputs": [
1055 | {
1056 | "name": "stdout",
1057 | "output_type": "stream",
1058 | "text": [
1059 | "[[1, 2], [1, 2, 2]]\n",
1060 | "[(1, 2), (2, 3)]\n"
1061 | ]
1062 | }
1063 | ],
1064 | "source": [
1065 | "# countByValue()\n",
1066 | "# 将此RDD中每个唯一值的计数返回为(值,计数)对的字典。\n",
1067 | "print(sc.parallelize([1, 2, 1, 2, 2], 2).glom().collect())\n",
1068 | "print(sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items()))"
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "code",
1073 | "execution_count": 13,
1074 | "metadata": {},
1075 | "outputs": [],
1076 | "source": [
1077 | "# first() 返回第一个元素\n",
1078 | "# max()返回最大值\n",
1079 | "# take(num) 返回开始num个值\n",
1080 | "# top(num, key=None) 计算rdd所有元素按降序排列后最顶部的几个元素\n",
1081 | "# min() rdd中的最小值\n",
1082 | "# mean() 计算rdd所有元素均值\n",
1083 | "# variance() 方差\n",
1084 | "# stdev() 标准差\n",
1085 | "# sum() 和"
1086 | ]
1087 | },
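  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An added sketch exercising the basic actions listed above on a small RDD."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd = sc.parallelize([5, 1, 4, 2, 3])\n",
    "print(rdd.first(), rdd.take(2), rdd.top(2), rdd.min(), rdd.max())\n",
    "print(rdd.mean(), rdd.sum(), rdd.variance(), rdd.stdev())\n",
    "# 5 [5, 1] [5, 4] 1 5\n",
    "# 3.0 15 2.0 1.4142135623730951"
   ]
  },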
1088 | {
1089 | "cell_type": "code",
1090 | "execution_count": 19,
1091 | "metadata": {},
1092 | "outputs": [
1093 | {
1094 | "name": "stdout",
1095 | "output_type": "stream",
1096 | "text": [
1097 | "([0, 25, 50], [25, 26])\n",
1098 | "([0, 5, 25, 50], [5, 20, 26])\n",
1099 | "([0, 15, 30, 45, 60], [15, 15, 15, 6])\n",
1100 | "(('a', 'b', 'c'), [2, 2])\n"
1101 | ]
1102 | }
1103 | ],
1104 | "source": [
1105 | "# histogram(buckets)\n",
1106 | "# 对rdd中的元素进行频数统计,统计区间有两种,一种是给出段数,一种是直接给出区间。返回为元组\n",
1107 | "rdd = sc.parallelize(range(51))\n",
1108 | "print(rdd.histogram(2))\n",
1109 | "print(rdd.histogram([0, 5, 25, 50]))\n",
1110 | "print(rdd.histogram([0, 15, 30, 45, 60]))\n",
1111 | "rdd = sc.parallelize([\"ab\", \"ac\", \"b\", \"bd\", \"ef\"])\n",
1112 | "print(rdd.histogram((\"a\", \"b\", \"c\")))"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": 20,
1118 | "metadata": {},
1119 | "outputs": [
1120 | {
1121 | "data": {
1122 | "text/plain": [
1123 | "['1', '2', '', '3']"
1124 | ]
1125 | },
1126 | "execution_count": 20,
1127 | "metadata": {},
1128 | "output_type": "execute_result"
1129 | }
1130 | ],
1131 | "source": [
1132 | "# pipe(command, env=None, checkCode=False)\n",
1133 | "# 通过管道向后面环节输出command处理过的结果,具体功能就体现在command,command为linux命令。 \n",
1134 | "# pipe函数中的'cat'为linux命令,表示打印内容。\n",
1135 | "sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "code",
1140 | "execution_count": 21,
1141 | "metadata": {},
1142 | "outputs": [
1143 | {
1144 | "data": {
1145 | "text/plain": [
1146 | "[2, 4]"
1147 | ]
1148 | },
1149 | "execution_count": 21,
1150 | "metadata": {},
1151 | "output_type": "execute_result"
1152 | }
1153 | ],
1154 | "source": [
1155 | "# filter(f)\n",
1156 | "# 返回满足条件的新RDD\n",
1157 | "rdd = sc.parallelize([1, 2, 3, 4, 5])\n",
1158 | "rdd.filter(lambda x: x % 2 == 0).collect()"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "code",
1163 | "execution_count": 10,
1164 | "metadata": {},
1165 | "outputs": [
1166 | {
1167 | "data": {
1168 | "text/plain": [
1169 | "[1, 2, 3]"
1170 | ]
1171 | },
1172 | "execution_count": 10,
1173 | "metadata": {},
1174 | "output_type": "execute_result"
1175 | }
1176 | ],
1177 | "source": [
1178 | "# distinct(numPartitions=None)\n",
1179 | "# 返回一个没有重复元素的新RDD,就是去重处理\n",
1180 | "sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "execution_count": 3,
1186 | "metadata": {},
1187 | "outputs": [
1188 | {
1189 | "name": "stdout",
1190 | "output_type": "stream",
1191 | "text": [
1192 | "11\n",
1193 | "20\n"
1194 | ]
1195 | }
1196 | ],
1197 | "source": [
1198 |     "# sample(withReplacement, fraction, seed=None)\n",
1199 |     "# Returns a sampled subset of this RDD.\n",
1200 |     "# withReplacement: whether to sample with replacement\n",
1201 |     "# fraction: expected size of the sample as a fraction of the RDD; each element has the same probability of being chosen, a number in [0, 1]\n",
1202 |     "# seed: seed for the random number generator\n",
1203 | "rdd = sc.parallelize(range(100), 4)\n",
1204 | "print(rdd.sample(False, 0.1, 81).count())\n",
1205 | "print(rdd.sample(False, 0.2, 81).count())"
1206 | ]
1207 | },
1208 | {
1209 | "cell_type": "code",
1210 | "execution_count": 28,
1211 | "metadata": {},
1212 | "outputs": [
1213 | {
1214 | "name": "stdout",
1215 | "output_type": "stream",
1216 | "text": [
1217 | "[('a', 0), ('b', 0), ('a', 1), ('a', 2), ('b', 1), ('b', 2), ('a', 3), ('a', 4), ('a', 5), ('a', 6)]\n",
1218 | "209 98\n"
1219 | ]
1220 | },
1221 | {
1222 | "ename": "AttributeError",
1223 | "evalue": "'ResultIterable' object has no attribute 'takeSample'",
1224 | "traceback": [
1225 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1226 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
1227 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtakeSample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1228 | "\u001b[0;31mAttributeError\u001b[0m: 'ResultIterable' object has no attribute 'takeSample'"
1229 | ],
1230 | "output_type": "error"
1231 | }
1232 | ],
1233 | "source": [
1234 |     "# sampleByKey(withReplacement, fractions, seed=None)\n",
1235 |     "# Returns a subset of this RDD sampled by key (stratified sampling), using a different sampling rate for each key as specified by fractions.\n",
1236 |     "# fractions is passed as a dict mapping each key to its sampling rate.\n",
1237 | "fractions = {\"a\": 0.2, \"b\": 0.1}\n",
1238 | "rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))\n",
1239 | "sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())\n",
1240 | "print(rdd.take(10))\n",
1241 | "print(len(sample['a']), len(sample['b']))\n",
1242 | "print(sorted(sample['a'])[:10])"
1243 | ]
1244 | },
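  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell above fails because `groupByKey()` yields `ResultIterable` values, which are plain iterables rather than RDDs. Below is an added sketch of one way around it, assuming `sample` from the previous cell is still defined."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ResultIterable has no RDD methods; convert it to a list (or re-parallelize it) before sampling\n",
    "import random\n",
    "b_values = list(sample['b'])\n",
    "print(random.sample(b_values, 6))   # 6 values drawn without replacement from key 'b'\n",
    "print(sorted(sample['a'])[:10])"
   ]
  },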
1245 | {
1246 | "cell_type": "code",
1247 | "execution_count": 26,
1248 | "metadata": {},
1249 | "outputs": [
1250 | {
1251 | "data": {
1252 | "text/plain": [
1253 | "1.0"
1254 | ]
1255 | },
1256 | "execution_count": 26,
1257 | "metadata": {},
1258 | "output_type": "execute_result"
1259 | }
1260 | ],
1261 | "source": [
1262 | "# sampleStdev()\n",
1263 | "# 计算这个RDD元素的样本标准差(通过除以N-1而不是N)来修正估计标准差的偏差。\n",
1264 | "sc.parallelize([1, 2, 3]).sampleStdev()"
1265 | ]
1266 | },
1267 | {
1268 | "cell_type": "code",
1269 | "execution_count": 27,
1270 | "metadata": {},
1271 | "outputs": [
1272 | {
1273 | "data": {
1274 | "text/plain": [
1275 | "1.0"
1276 | ]
1277 | },
1278 | "execution_count": 27,
1279 | "metadata": {},
1280 | "output_type": "execute_result"
1281 | }
1282 | ],
1283 | "source": [
1284 | "# sampleVariance()\n",
1285 | "# 计算这个RDD元素的样本方差(它纠正了通过除以N-1而不是N来估计方差的偏差)。\n",
1286 | "sc.parallelize([1, 2, 3]).sampleVariance()"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": 31,
1292 | "metadata": {},
1293 | "outputs": [
1294 | {
1295 | "name": "stdout",
1296 | "output_type": "stream",
1297 | "text": [
1298 | "[6, 9, 9, 8, 0, 7, 0, 8, 3, 6, 7, 8]\n",
1299 | "5\n"
1300 | ]
1301 | }
1302 | ],
1303 | "source": [
1304 | "# takeSample(withReplacement, num, seed=None)\n",
1305 | "# 返回这个RDD的一个固定大小的采样子集。\n",
1306 | "# 只有当结果数组被认为是很小的时候,才应该使用这个方法,因为所有的数据都被加载到驱动程序的内存中。\n",
1307 | "rdd = sc.parallelize(range(0, 10))\n",
1308 | "print(rdd.takeSample(True, 12, 1))\n",
1309 | "print(len(rdd.takeSample(False, 5, 2)))"
1310 | ]
1311 | },
1312 | {
1313 | "cell_type": "code",
1314 | "execution_count": 32,
1315 | "metadata": {},
1316 | "outputs": [
1317 | {
1318 | "data": {
1319 | "text/plain": [
1320 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1321 | ]
1322 | },
1323 | "execution_count": 32,
1324 | "metadata": {},
1325 | "output_type": "execute_result"
1326 | }
1327 | ],
1328 | "source": [
1329 | "# toLocalIterator()\n",
1330 | "# 返回包含这个RDD中所有元素的迭代器。迭代器将消耗与此RDD中最大分区相同的内存。\n",
1331 | "rdd = sc.parallelize(range(10))\n",
1332 | "[x for x in rdd.toLocalIterator()]"
1333 | ]
1334 | },
1335 | {
1336 | "cell_type": "code",
1337 | "execution_count": 39,
1338 | "metadata": {},
1339 | "outputs": [
1340 | {
1341 | "data": {
1342 | "text/plain": [
1343 | "[1, 1, 2, 3, 1, 1, 2, 3]"
1344 | ]
1345 | },
1346 | "execution_count": 39,
1347 | "metadata": {},
1348 | "output_type": "execute_result"
1349 | }
1350 | ],
1351 | "source": [
1352 | "# union(other)\n",
1353 | "# 返回这个RDD和另一个的结合。不去重\n",
1354 | "rdd = sc.parallelize([1, 1, 2, 3])\n",
1355 | "rdd.union(rdd).collect()"
1356 | ]
1357 | },
1358 | {
1359 | "cell_type": "code",
1360 | "execution_count": 40,
1361 | "metadata": {},
1362 | "outputs": [
1363 | {
1364 | "data": {
1365 | "text/plain": [
1366 | "[2, 1, 3]"
1367 | ]
1368 | },
1369 | "execution_count": 40,
1370 | "metadata": {},
1371 | "output_type": "execute_result"
1372 | }
1373 | ],
1374 | "source": [
1375 |     "# intersection(other)\n",
1376 |     "# Returns the intersection of this RDD and the other one. The output will not contain any duplicate elements, even if the input RDDs did.\n",
1377 |     "# This method performs a shuffle internally.\n",
1378 | "rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])\n",
1379 | "rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])\n",
1380 | "rdd1.intersection(rdd2).collect()"
1381 | ]
1382 | },
1383 | {
1384 | "cell_type": "code",
1385 | "execution_count": 41,
1386 | "metadata": {},
1387 | "outputs": [
1388 | {
1389 | "data": {
1390 | "text/plain": [
1391 | "[('a', 1), ('b', 4), ('b', 5)]"
1392 | ]
1393 | },
1394 | "execution_count": 41,
1395 | "metadata": {},
1396 | "output_type": "execute_result"
1397 | }
1398 | ],
1399 | "source": [
1400 |     "# subtract(other, numPartitions=None)\n",
1401 |     "# Returns the elements of this RDD that are not present in the other RDD.\n",
1402 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 3)])\n",
1403 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n",
1404 | "sorted(x.subtract(y).collect())"
1405 | ]
1406 | },
1407 | {
1408 | "cell_type": "code",
1409 | "execution_count": 42,
1410 | "metadata": {},
1411 | "outputs": [
1412 | {
1413 | "data": {
1414 | "text/plain": [
1415 | "[('b', 4), ('b', 5)]"
1416 | ]
1417 | },
1418 | "execution_count": 42,
1419 | "metadata": {},
1420 | "output_type": "execute_result"
1421 | }
1422 | ],
1423 | "source": [
1424 |     "# subtractByKey(other, numPartitions=None)\n",
1425 |     "# Returns each (key, value) pair of this RDD whose key has no matching pair in the other RDD.\n",
1426 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 2)])\n",
1427 | "y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n",
1428 | "sorted(x.subtractByKey(y).collect())"
1429 | ]
1430 | },
1431 | {
1432 | "cell_type": "code",
1433 | "execution_count": 43,
1434 | "metadata": {},
1435 | "outputs": [
1436 | {
1437 | "data": {
1438 | "text/plain": [
1439 | "[(1, 1), (1, 2), (2, 1), (2, 2)]"
1440 | ]
1441 | },
1442 | "execution_count": 43,
1443 | "metadata": {},
1444 | "output_type": "execute_result"
1445 | }
1446 | ],
1447 | "source": [
1448 | "# cartesian(other)\n",
1449 | "# 返回这个RDD和另一个RDD的笛卡尔积,也就是所有成对的元素(a,b)的RDD,a为本身RDD,b为其他RDD\n",
1450 | "rdd = sc.parallelize([1, 2])\n",
1451 | "sorted(rdd.cartesian(rdd).collect())"
1452 | ]
1453 | },
1454 | {
1455 | "cell_type": "code",
1456 | "execution_count": 44,
1457 | "metadata": {},
1458 | "outputs": [
1459 | {
1460 | "data": {
1461 | "text/plain": [
1462 | "[('a', (1, 2)), ('a', (1, 3))]"
1463 | ]
1464 | },
1465 | "execution_count": 44,
1466 | "metadata": {},
1467 | "output_type": "execute_result"
1468 | }
1469 | ],
1470 | "source": [
1471 | "# join(other, numPartitions=None)\n",
1472 | "# 返回一个包含所有成对元素的RDD,其中包含在self和other中匹配的键。每一对元素都将作为一个(k,(v1,v2))返回,其中(k,v1)为self(k,v2)为other。\n",
1473 | "# 在集群中执行散列连接\n",
1474 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1475 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n",
1476 | "sorted(x.join(y).collect())"
1477 | ]
1478 | },
1479 | {
1480 | "cell_type": "code",
1481 | "execution_count": 45,
1482 | "metadata": {},
1483 | "outputs": [
1484 | {
1485 | "data": {
1486 | "text/plain": [
1487 | "[('a', (2, 1)), ('b', (None, 4))]"
1488 | ]
1489 | },
1490 | "execution_count": 45,
1491 | "metadata": {},
1492 | "output_type": "execute_result"
1493 | }
1494 | ],
1495 | "source": [
1496 | "# rightOuterJoin(other, numPartitions=None)\n",
1497 | "# 对于在otherRDD中的每一个(k, w)元素,生成的RDD中有k键的生成(k, (v, w)), 如果没有k键的话也要生成none补位(k,(None, w))\n",
1498 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1499 | "y = sc.parallelize([(\"a\", 2)])\n",
1500 | "sorted(y.rightOuterJoin(x).collect())"
1501 | ]
1502 | },
1503 | {
1504 | "cell_type": "code",
1505 | "execution_count": 46,
1506 | "metadata": {},
1507 | "outputs": [
1508 | {
1509 | "data": {
1510 | "text/plain": [
1511 | "[('a', (1, 2)), ('b', (4, None))]"
1512 | ]
1513 | },
1514 | "execution_count": 46,
1515 | "metadata": {},
1516 | "output_type": "execute_result"
1517 | }
1518 | ],
1519 | "source": [
1520 |     "# leftOuterJoin(other, numPartitions=None)\n",
1521 |     "# For each (k, v) in self, the result contains (k, (v, w)) for every w with key k in other, or (k, (v, None)) if other has no such key; the left value always comes first. Otherwise it mirrors rightOuterJoin.\n",
1522 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n",
1523 | "y = sc.parallelize([(\"a\", 2)])\n",
1524 | "sorted(x.leftOuterJoin(y).collect())"
1525 | ]
1526 | },
1527 | {
1528 | "cell_type": "code",
1529 | "execution_count": 48,
1530 | "metadata": {},
1531 | "outputs": [
1532 | {
1533 | "name": "stdout",
1534 | "output_type": "stream",
1535 | "text": [
1536 | "500\n",
1537 | "192 308\n"
1538 | ]
1539 | }
1540 | ],
1541 | "source": [
1542 | "# randomSplit(weights, seed=None)\n",
1543 | "# 将RDD按照一定的比例随机分开\n",
1544 | "rdd = sc.parallelize(range(500), 1)\n",
1545 | "rdd1, rdd2 = rdd.randomSplit([2, 3], 17)\n",
1546 | "print(len(rdd1.collect() + rdd2.collect()))\n",
1547 | "print(rdd1.count(), rdd2.count())"
1548 | ]
1549 | },
1550 | {
1551 | "cell_type": "code",
1552 | "execution_count": null,
1553 | "metadata": {},
1554 | "outputs": [],
1555 | "source": []
1556 | }
1557 | ],
1558 | "metadata": {
1559 | "kernelspec": {
1560 | "display_name": "Python 3",
1561 | "language": "python",
1562 | "name": "python3"
1563 | },
1564 | "language_info": {
1565 | "codemirror_mode": {
1566 | "name": "ipython",
1567 | "version": 3
1568 | },
1569 | "file_extension": ".py",
1570 | "mimetype": "text/x-python",
1571 | "name": "python",
1572 | "nbconvert_exporter": "python",
1573 | "pygments_lexer": "ipython3",
1574 | "version": "3.6.4"
1575 | }
1576 | },
1577 | "nbformat": 4,
1578 | "nbformat_minor": 2
1579 | }
1580 |
--------------------------------------------------------------------------------
/pyspark.ml.classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 数据准备"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from pyspark.sql import SparkSession\n",
17 | "from pyspark import SparkConf, SparkContext\n",
18 | "spark = SparkSession.builder.master('local[1]').appName('learn_ml').getOrCreate()"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "df0 = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/mushrooms.csv', header=True, inferSchema=True, encoding='utf-8')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {
34 | "scrolled": false
35 | },
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/plain": [
40 | "23"
41 | ]
42 | },
43 | "execution_count": 3,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "len(df0.columns)"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "看看分类的类别"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "**查看是否有na值**"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "False"
75 | ]
76 | },
77 | "execution_count": 4,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "# df0.toPandas().isna().sum()\n",
84 | "df0.toPandas().isna().values.any()"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 5,
90 | "metadata": {
91 | "scrolled": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
96 | "old_columns_names = df0.columns\n",
97 | "new_columns_names = [name+'-new' for name in old_columns_names]\n",
98 | "for i in range(len(old_columns_names)):\n",
99 | " indexer = StringIndexer(inputCol=old_columns_names[i], outputCol=new_columns_names[i])\n",
100 | " df0 = indexer.fit(df0).transform(df0)\n",
101 | "vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')\n",
102 | "df0 = vecAss.transform(df0)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 6,
108 | "metadata": {
109 | "scrolled": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "df0 = df0.withColumnRenamed(new_columns_names[0], 'label')"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 7,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "dfi = df0.select(['label', 'features'])"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 8,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# df0.describe().toPandas().T"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 9,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "+-----+------------------------------------------------------------------------------+\n",
144 | "|label|features |\n",
145 | "+-----+------------------------------------------------------------------------------+\n",
146 | "|1.0 |(22,[1,3,4,7,8,9,10,19,20,21],[1.0,1.0,6.0,1.0,7.0,1.0,2.0,2.0,2.0,4.0]) |\n",
147 | "|0.0 |(22,[1,2,3,4,8,9,10,19,20,21],[1.0,3.0,1.0,4.0,7.0,1.0,3.0,1.0,3.0,1.0]) |\n",
148 | "|0.0 |(22,[0,1,2,3,4,8,9,10,19,20,21],[3.0,1.0,4.0,1.0,5.0,3.0,1.0,3.0,1.0,3.0,5.0])|\n",
149 | "|1.0 |(22,[2,3,4,7,8,9,10,19,20,21],[4.0,1.0,6.0,1.0,3.0,1.0,2.0,2.0,2.0,4.0]) |\n",
150 | "|0.0 |(22,[1,2,6,8,10,18,19,20,21],[1.0,1.0,1.0,7.0,2.0,1.0,1.0,4.0,1.0]) |\n",
151 | "+-----+------------------------------------------------------------------------------+\n",
152 | "only showing top 5 rows\n",
153 | "\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "dfi.show(5, truncate=0)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 10,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "# label = df0.rdd.map(lambda row: row[0])\n",
168 | "# row = df0.rdd.map(lambda row: row[1:])\n",
169 | "# dfi = label.map(lambda m: 0.0 if m=='p' else 1.0).zip(row.map(lambda x: list(x))).toDF(schema=['label','feature'])"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 11,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "train_data, test_data = dfi.randomSplit([4.0, 1.0], 100)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 12,
184 | "metadata": {
185 | "scrolled": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "# test_data.filter(test_data['label']==1).show(5, truncate=0)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "### 评估器\n",
197 | "**分类(classification)**"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### LogisticRegression :逻辑回归,支持多项逻辑(softmax)和二项逻辑回归"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "pyspark.ml.classification.LogisticRegression(self, featuresCol=\"features\", labelCol=\"label\", predictionCol=\"prediction\", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol=\"probability\", rawPredictionCol=\"rawPrediction\", standardization=True, weightCol=None, aggregationDepth=2, family=\"auto\")\n"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "`\n",
219 | "regParam: 正则化参数(>=0)\n",
220 | "elasticNetParam: ElasticNet混合参数,0-1之间,当alpha为0时,惩罚为L2正则化,当为1时为L1正则化\n",
221 | "fitIntercept: 是否拟合一个截距项\n",
222 | "Standardization: 是否在拟合数据之前对数据进行标准化\n",
223 | "aggregationDepth: 树聚合所建议的深度(>=2)\n",
224 | "`"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 20,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "from pyspark.ml.classification import LogisticRegression\n",
234 | "blor = LogisticRegression(regParam=0.01)\n",
235 | "blorModel = blor.fit(train_data)\n",
236 | "result = blorModel.transform(test_data)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 21,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/plain": [
247 | "0.9661954517516902"
248 | ]
249 | },
250 | "execution_count": 21,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "result.filter(result.label == result.prediction).count()/result.count()"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 22,
262 | "metadata": {
263 | "collapsed": true
264 | },
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "+--------------------+--------------------+\n",
271 | "| FPR| TPR|\n",
272 | "+--------------------+--------------------+\n",
273 | "| 0.0| 0.0|\n",
274 | "| 0.0|0.020466901183242726|\n",
275 | "| 0.0| 0.04093380236648545|\n",
276 | "|5.934718100890207E-4|0.060761112887751836|\n",
277 | "|0.001186943620178...| 0.08058842340901823|\n",
278 | "|0.001483679525222552| 0.10073552926127279|\n",
279 | "|0.001780415430267...| 0.12088263511352734|\n",
280 | "|0.002373887240356083| 0.14070994563479372|\n",
281 | "|0.002670623145400...| 0.1608570514870483|\n",
282 | "|0.002670623145400...| 0.18132395267029103|\n",
283 | "|0.002670623145400...| 0.20179085385353374|\n",
284 | "|0.002670623145400...| 0.22225775503677647|\n",
285 | "|0.002670623145400...| 0.24272465622001918|\n",
286 | "|0.002670623145400...| 0.2631915574032619|\n",
287 | "|0.002670623145400...| 0.2836584585865046|\n",
288 | "|0.002670623145400...| 0.30412535976974736|\n",
289 | "|0.002670623145400...| 0.3245922609529901|\n",
290 | "|0.002670623145400...| 0.34505916213623283|\n",
291 | "|0.002670623145400...| 0.3655260633194755|\n",
292 | "|0.002670623145400...| 0.38599296450271825|\n",
293 | "+--------------------+--------------------+\n",
294 | "only showing top 20 rows\n",
295 | "\n",
296 | "+--------------------+------------------+\n",
297 | "| recall| precision|\n",
298 | "+--------------------+------------------+\n",
299 | "| 0.0| 1.0|\n",
300 | "|0.020466901183242726| 1.0|\n",
301 | "| 0.04093380236648545| 1.0|\n",
302 | "|0.060761112887751836|0.9895833333333334|\n",
303 | "| 0.08058842340901823| 0.984375|\n",
304 | "| 0.10073552926127279| 0.984375|\n",
305 | "| 0.12088263511352734| 0.984375|\n",
306 | "| 0.14070994563479372|0.9821428571428571|\n",
307 | "| 0.1608570514870483| 0.982421875|\n",
308 | "| 0.18132395267029103| 0.984375|\n",
309 | "| 0.20179085385353374| 0.9859375|\n",
310 | "| 0.22225775503677647|0.9872159090909091|\n",
311 | "| 0.24272465622001918| 0.98828125|\n",
312 | "| 0.2631915574032619|0.9891826923076923|\n",
313 | "| 0.2836584585865046|0.9899553571428571|\n",
314 | "| 0.30412535976974736| 0.990625|\n",
315 | "| 0.3245922609529901| 0.9912109375|\n",
316 | "| 0.34505916213623283|0.9917279411764706|\n",
317 | "| 0.3655260633194755| 0.9921875|\n",
318 | "| 0.38599296450271825|0.9925986842105263|\n",
319 | "+--------------------+------------------+\n",
320 | "only showing top 20 rows\n",
321 | "\n"
322 | ]
323 | }
324 | ],
325 | "source": [
326 | "blorModel.\n",
327 | "blorModel.summary.pr.show()"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "### 决策树\n",
335 | "pyspark.ml.classification.DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', seed=None)"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "`\n",
343 | "checkpointInterval:设置checkpoint区间(>=1),或宕掉checkpoint(-1),例如10意味着缓冲区(cache)将会每迭代10次获得一次checkpoint\n",
344 | "fit(datasset,params=None)\n",
345 | "impurity: 信息增益计算的准则,选项\"entropy\", \"gini\"\n",
346 | "maxBins:连续特征离散化的最大分箱,必须>=2 并且>=分类特征分类的数量\n",
347 | "maxDepth:树的最大深度\n",
348 | "minInfoGain:分割结点所需的最小的信息增益\n",
349 | "minInstancesPerNode:每个结点最小实例个数\n",
350 | "`"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 13,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "from pyspark.ml.classification import DecisionTreeClassifier\n",
360 | "dt = DecisionTreeClassifier(maxDepth=5)\n",
361 | "dtModel = dt.fit(train_data)\n",
362 | "result = dtModel.transform(test_data)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 14,
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "1.0"
374 | ]
375 | },
376 | "execution_count": 14,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# accuracy\n",
383 | "result.filter(result.label == result.prediction).count()/result.count()"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "### 梯度增强树\n",
391 | "pyspark.ml.classification.GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType='logistic', maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)"
392 | ]
393 | },
394 | {
395 | "cell_type": "markdown",
396 | "metadata": {},
397 | "source": [
398 | "`\n",
399 | "checkpointInterval: 同DecisionTreeClassifier\n",
400 | "fit(dataset,params=None)方法\n",
401 | "lossType: GBT要最小化的损失函数,选项:logistic\n",
402 | "maxBins: 同DecisionTreeClassifier\n",
403 | "maxDepth: 同DecisionTreeClassifier\n",
404 | "maxIter: 同DecisionTreeClassifier\n",
405 | "minInfoGain: 同DecisionTreeClassifier\n",
406 | "minInstancesPerNode:同DecisionTreeClassifier\n",
407 | "stepSize: 每次迭代优化的步长\n",
408 | "subsamplingRate: 同RandomForesetClassier\n",
409 | "`"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 16,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "from pyspark.ml.classification import GBTClassifier\n",
419 | "gbt = GBTClassifier(maxDepth=5)\n",
420 | "gbtModel = gbt.fit(train_data)\n",
421 | "result = gbtModel.transform(test_data)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 17,
427 | "metadata": {},
428 | "outputs": [
429 | {
430 | "data": {
431 | "text/plain": [
432 | "1.0"
433 | ]
434 | },
435 | "execution_count": 17,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "result.filter(result.label == result.prediction).count()/result.count()"
442 | ]
443 | },
444 | {
445 | "cell_type": "markdown",
446 | "metadata": {},
447 | "source": [
448 | "### 随机森林\n",
449 | "pyspark.ml.classification.RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='gini', numTrees=20, featureSubsetStrategy='auto', seed=None, subsamplingRate=1.0)"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "`\n",
457 | "checkpoint:同DecisionTreeClassifier\n",
458 | "featureSubsetStrategy:每棵树上要分割的特征数目,选项为\"auto\",\"all\", \"onethird\", \"sqrt\", \"log2\", \"(0.0-1.0],\"[1-n]\"\n",
459 | "fit(dataset,params=None)方法\n",
460 | "impurity: 同DecisionTreeClassifier\n",
461 | "maxBins:同DecisionTreeClassifier\n",
462 | "maxDepth:同DecisionTreeClassifier\n",
463 | "minInfoGain: 同DecisionTreeClassifier\n",
464 | "numTrees: 训练树的个数\n",
465 | "subsamplingRate: 用于训练每颗决策树的样本个数,区间(0,1]\n",
466 | "`"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 13,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "from pyspark.ml.classification import RandomForestClassifier\n",
476 | "rf = RandomForestClassifier(numTrees=10, maxDepth=5)\n",
477 | "rfModel = rf.fit(train_data)\n",
478 | "# model.featureImportances\n",
479 | "result = rfModel.transform(test_data)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 19,
485 | "metadata": {
486 | "scrolled": true
487 | },
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/plain": [
492 | "1.0"
493 | ]
494 | },
495 | "execution_count": 19,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "result.filter(result.label == result.prediction).count()/result.count()"
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "metadata": {},
507 | "source": [
508 | "### 朴素贝叶斯\n",
509 | "pyspark.ml.classification.NaiveBayes(featuresCol='features', labelCol='label', predictionCol='prediction', probabilityCol='probability', rawPredictionCol='rawPrediction', smoothing=1.0, modelType='multinomial', thresholds=None, weightCol=None)"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "`\n",
517 | "modelType: 选项:multinomial(多项式)和bernoulli(伯努利)\n",
518 | "smoothing: 平滑参数,应该>=0,默认为1.0\n",
519 | "`"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 24,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "from pyspark.ml.classification import NaiveBayes\n",
529 | "nb = NaiveBayes()\n",
530 | "nbModel = nb.fit(train_data)\n",
531 | "result = nbModel.transform(test_data)"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 25,
537 | "metadata": {},
538 | "outputs": [
539 | {
540 | "data": {
541 | "text/plain": [
542 | "0.9231714812538414"
543 | ]
544 | },
545 | "execution_count": 25,
546 | "metadata": {},
547 | "output_type": "execute_result"
548 | }
549 | ],
550 | "source": [
551 | "result.filter(result.label == result.prediction).count()/result.count()"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "### LinearSVC 支持向量机\n",
559 | "pyspark.ml.classification.LinearSVC(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, tol=1e-06, rawPredictionCol='rawPrediction', fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 17,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "from pyspark.ml.classification import LinearSVC\n",
569 | "svm = LinearSVC(maxIter=10, regPcaram=0.01)\n",
570 | "svmModel = svm.fit(train_data)\n",
571 | "result = svmModel.transform(test_data)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 18,
577 | "metadata": {
578 | "scrolled": true
579 | },
580 | "outputs": [
581 | {
582 | "data": {
583 | "text/plain": [
584 | "0.9797172710510141"
585 | ]
586 | },
587 | "execution_count": 18,
588 | "metadata": {},
589 | "output_type": "execute_result"
590 | }
591 | ],
592 | "source": [
593 | "# accuracy\n",
594 | "result.filter(result.label == result.prediction).count()/result.count()"
595 | ]
596 | }
597 | ],
598 | "metadata": {
599 | "kernelspec": {
600 | "display_name": "Python 3",
601 | "language": "python",
602 | "name": "python3"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 3
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython3",
614 | "version": "3.6.4"
615 | }
616 | },
617 | "nbformat": 4,
618 | "nbformat_minor": 2
619 | }
620 |
--------------------------------------------------------------------------------
/pyspark.ml.feature.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession \n",
10 | "spark = SparkSession.builder.appName('learn_ml').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "ml 模块 三个抽象类:\n",
18 | "转换器(Transformer)、评估器(Estimator)和管道(Pipeline)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### pyspark.ml.feature.Binarizer(self, threshold=0.0, inputCol=None, outputCol=None)\n",
26 | "根据指定的阈值将连续变量转换为对应的二进制"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "df = spark.createDataFrame([(0.5,),(1.0,),(1.5,)], ['values'])"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "+------+\n",
48 | "|values|\n",
49 | "+------+\n",
50 | "| 0.5|\n",
51 | "| 1.0|\n",
52 | "| 1.5|\n",
53 | "+------+\n",
54 | "\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "df.show()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "+------+--------+\n",
72 | "|values|features|\n",
73 | "+------+--------+\n",
74 | "| 0.5| 0.0|\n",
75 | "| 1.0| 1.0|\n",
76 | "| 1.5| 1.0|\n",
77 | "+------+--------+\n",
78 | "\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "from pyspark.ml.feature import Binarizer\n",
84 | "binarizer = Binarizer(threshold=0.7, inputCol=\"values\", outputCol=\"features\")\n",
85 | "binarizer.transform(df).show()"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "+------+-----+\n",
98 | "|values|freqs|\n",
99 | "+------+-----+\n",
100 | "| 0.5| 0.0|\n",
101 | "| 1.0| 1.0|\n",
102 | "| 1.5| 1.0|\n",
103 | "+------+-----+\n",
104 | "\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# 通过setParams,更改配置\n",
110 | "binarizer.setParams(outputCol=\"freqs\").transform(df).show()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 6,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "name": "stdout",
120 | "output_type": "stream",
121 | "text": [
122 | "+------+------+\n",
123 | "|values|vector|\n",
124 | "+------+------+\n",
125 | "| 0.5| 1.0|\n",
126 | "| 1.0| 1.0|\n",
127 | "| 1.5| 1.0|\n",
128 | "+------+------+\n",
129 | "\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "# 通过params更改配置\n",
135 | "params = {binarizer.threshold: -0.5, binarizer.outputCol: \"vector\"}\n",
136 | "binarizer.transform(df, params).show()"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 7,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "# 保存配置\n",
146 | "import os\n",
147 | "#temp_path = os.getcwd()\n",
148 | "temp_path = os.path.abspath('.')\n",
149 | "binarizerPath = \"file://{}/binarizer\".format(temp_path)\n",
150 | "binarizer.save(binarizerPath)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 8,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "True"
162 | ]
163 | },
164 | "execution_count": 8,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "# 加载配置\n",
171 | "loadedBinarizer = Binarizer.load(binarizerPath)\n",
172 | "loadedBinarizer.getThreshold() == binarizer.getThreshold()"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### pyspark.ml.feature.Bucketizer(self, splits=None, inputCol=None, outputCol=None, handleInvalid=\"error\")\n",
180 | "与Binarizer类似,该方法根据阈值列表(分割的参数),将连续变量转换为多项值(连续变量离散化到指定的范围区间)\n"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 9,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "+------+-------+\n",
193 | "|values|buckets|\n",
194 | "+------+-------+\n",
195 | "| 0.1| 0.0|\n",
196 | "| 0.4| 0.0|\n",
197 | "| 1.2| 1.0|\n",
198 | "| 1.5| 2.0|\n",
199 | "| NaN| 3.0|\n",
200 | "| NaN| 3.0|\n",
201 | "+------+-------+\n",
202 | "\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "from pyspark.ml.feature import Bucketizer\n",
208 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n",
209 | "df = spark.createDataFrame(values, [\"values\"])\n",
210 | "# splits 为分类区间\n",
211 | "bucketizer = Bucketizer(splits=[-float(\"inf\"), 0.5, 1.4, float(\"inf\")],inputCol=\"values\", outputCol=\"buckets\")\n",
212 | "# 这里setHandleInvalid是对nan值进行处理,默认是error:有nan则报错;keep:将nan保留为新分类;skip:忽略nan值\n",
213 | "bucketed = bucketizer.setHandleInvalid(\"keep\").transform(df)\n",
214 | "bucketed.show()"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "+------+---+\n",
227 | "|values| b|\n",
228 | "+------+---+\n",
229 | "| 0.1|0.0|\n",
230 | "| 0.4|0.0|\n",
231 | "| 1.2|1.0|\n",
232 | "| 1.5|2.0|\n",
233 | "| NaN|3.0|\n",
234 | "| NaN|3.0|\n",
235 | "+------+---+\n",
236 | "\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "# 更改配置\n",
242 | "bucketizer.setParams(outputCol=\"b\").transform(df).show()"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "### pyspark.ml.feature.ChiSqSelector(self, numTopFeatures=50, featuresCol=\"features\", outputCol=None, labelCol=\"label\", selectorType=\"numTopFeatures\", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)\n",
250 | "对于分类目标变量(思考分类模型),此功能允许你选择预定义数量的特征(由numTopFeatures参数进行参数化),以便最好地说明目标的变化。该方法需要两部:需要.fit()——可以计算卡方检验,调用.fit()方法,将DataFrame作为参数传入返回一个ChiSqSelectorModel对象,然后可以使用该对象的.transform()方法来转换DataFrame。默认情况下,选择方法是numTopFeatures,默认顶级要素数设置为50。\n",
251 | "percentile 相识于num ,选取百分比的特征\n",
252 | "fpr 选择p-values低于阈值的所有特征,从而控制误差的选择率。\n",
253 | "fdr 使用 Benjamini-Hochberg procedure \n",
254 | "fwe 选择p-values低于阈值的所有特征。阈值按1 / numFeatures缩放"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 11,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stdout",
264 | "output_type": "stream",
265 | "text": [
266 | "+------------------+-----+----------------+\n",
267 | "| features|label|selectedFeatures|\n",
268 | "+------------------+-----+----------------+\n",
269 | "|[0.0,0.0,18.0,1.0]| 1.0| [18.0,1.0]|\n",
270 | "|[0.0,1.0,12.0,0.0]| 0.0| [12.0,0.0]|\n",
271 | "|[1.0,0.0,15.0,0.1]| 0.0| [15.0,0.1]|\n",
272 | "+------------------+-----+----------------+\n",
273 | "\n"
274 | ]
275 | }
276 | ],
277 | "source": [
278 | "from pyspark.ml.linalg import Vectors\n",
279 | "from pyspark.ml.feature import ChiSqSelector\n",
280 | "df = spark.createDataFrame(\n",
281 | "[(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),\n",
282 | "(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),\n",
283 | "(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],\n",
284 | "[\"features\", \"label\"])\n",
285 | "selector = ChiSqSelector(numTopFeatures=2, outputCol=\"selectedFeatures\")\n",
286 | "model = selector.fit(df)\n",
287 | "model.transform(df).show()"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "### pyspark.ml.feature.CountVectorizer(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, outputCol=None)\n",
295 | "从文档集合中提取词汇表并生成向量"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 12,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "+-----+---------------+-------------------------+\n",
308 | "|label|raw |vectors |\n",
309 | "+-----+---------------+-------------------------+\n",
310 | "|0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|\n",
311 | "|1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|\n",
312 | "+-----+---------------+-------------------------+\n",
313 | "\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "from pyspark.ml.feature import CountVectorizer\n",
319 | "df = spark.createDataFrame(\n",
320 | "[(0, [\"a\", \"b\", \"c\"]), (1, [\"a\", \"b\", \"b\", \"c\", \"a\"])],\n",
321 | "[\"label\", \"raw\"])\n",
322 | "cv = CountVectorizer(inputCol=\"raw\", outputCol=\"vectors\")\n",
323 | "model = cv.fit(df)\n",
324 | "model.transform(df).show(truncate=False)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 13,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/plain": [
335 | "['a', 'b', 'c']"
336 | ]
337 | },
338 | "execution_count": 13,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "sorted(model.vocabulary) "
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 14,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "# 保存model\n",
354 | "import os\n",
355 | "#temp_path = os.getcwd()\n",
356 | "temp_path = os.path.abspath('.')\n",
357 | "modelPath = \"file://{}/count-vectorizer-model\".format(temp_path)\n",
358 | "model.save(modelPath)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 15,
364 | "metadata": {},
365 | "outputs": [
366 | {
367 | "data": {
368 | "text/plain": [
369 | "True"
370 | ]
371 | },
372 | "execution_count": 15,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "# 加载model\n",
379 | "from pyspark.ml.feature import CountVectorizerModel\n",
380 | "loadedModel = CountVectorizerModel.load(modelPath)\n",
381 | "loadedModel.vocabulary == model.vocabulary"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "### pyspark.ml.feature.ElementwiseProduct(scalingVec=None, inputCol=None, outputCol=None)\n",
389 | "使用提供的“权重”向量输出每个输入向量的阿达马乘积(即,逐元素乘积)。换句话说,它通过标量乘数缩放数据集的每一列。"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 16,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "+-------------+-------------+\n",
402 | "| values| eprod|\n",
403 | "+-------------+-------------+\n",
404 | "|[2.0,1.0,3.0]|[2.0,2.0,9.0]|\n",
405 | "+-------------+-------------+\n",
406 | "\n",
407 | "+-------------+--------------+\n",
408 | "| values| eprod|\n",
409 | "+-------------+--------------+\n",
410 | "|[2.0,1.0,3.0]|[4.0,3.0,15.0]|\n",
411 | "+-------------+--------------+\n",
412 | "\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "from pyspark.ml.feature import ElementwiseProduct \n",
418 | "from pyspark.ml.linalg import Vectors\n",
419 | "df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], [\"values\"])\n",
420 | "ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),\n",
421 | "inputCol=\"values\", outputCol=\"eprod\")\n",
422 | "ep.transform(df).show()\n",
423 | "ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).show()\n"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "### pyspark.ml.feature.Imputer(*args, **kwargs)\n",
431 | "用于完成缺失值的插补估计器,使用缺失值所在列的平均值或中值。 输入列应该是DoubleType或FloatType。 目前的Imputer不支持分类特征,可能会为分类特征创建不正确的值。\n",
432 | "请注意,平均值/中值是在过滤出缺失值之后计算的。 输入列中的所有Null值都被视为缺失,所以也被归类。 为了计算中位数,使用pyspark.sql.DataFrame.approxQuantile(),相对误差为0.001。\n"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 17,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "+---+---+\n",
445 | "| a| b|\n",
446 | "+---+---+\n",
447 | "|1.0|NaN|\n",
448 | "|2.0|NaN|\n",
449 | "|NaN|3.0|\n",
450 | "|4.0|4.0|\n",
451 | "|5.0|5.0|\n",
452 | "+---+---+\n",
453 | "\n",
454 | "+---+---+\n",
455 | "| a| b|\n",
456 | "+---+---+\n",
457 | "|3.0|4.0|\n",
458 | "+---+---+\n",
459 | "\n",
460 | "+---+---+-----+-----+\n",
461 | "| a| b|out_a|out_b|\n",
462 | "+---+---+-----+-----+\n",
463 | "|1.0|NaN| 1.0| 4.0|\n",
464 | "|2.0|NaN| 2.0| 4.0|\n",
465 | "|NaN|3.0| 3.0| 3.0|\n",
466 | "|4.0|4.0| 4.0| 4.0|\n",
467 | "|5.0|5.0| 5.0| 5.0|\n",
468 | "+---+---+-----+-----+\n",
469 | "\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "from pyspark.ml.feature import Imputer\n",
475 | "df = spark.createDataFrame([(1.0, float(\"nan\")), (2.0, float(\"nan\")), (float(\"nan\"), 3.0),\n",
476 | " (4.0, 4.0), (5.0, 5.0)], [\"a\", \"b\"])\n",
477 | "imputer = Imputer(inputCols=[\"a\", \"b\"], outputCols=[\"out_a\", \"out_b\"])\n",
478 | "model = imputer.fit(df)\n",
479 | "df.show()\n",
480 | "model.surrogateDF.show()\n",
481 | "model.transform(df).show()"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 18,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "name": "stdout",
491 | "output_type": "stream",
492 | "text": [
493 | "+---+---+-----+-----+\n",
494 | "| a| b|out_a|out_b|\n",
495 | "+---+---+-----+-----+\n",
496 | "|1.0|NaN| 1.0| 4.0|\n",
497 | "|2.0|NaN| 2.0| 4.0|\n",
498 | "|NaN|3.0| 2.0| 3.0|\n",
499 | "|4.0|4.0| 4.0| 4.0|\n",
500 | "|5.0|5.0| 5.0| 5.0|\n",
501 | "+---+---+-----+-----+\n",
502 | "\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "imputer.setStrategy(\"median\").setMissingValue(float(\"nan\")).fit(df).transform(df).show()"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### pyspark.ml.feature.MaxAbsScaler(self, inputCol=None, outputCol=None)\n",
515 | "通过分割每个特征中的最大绝对值来单独重新缩放每个特征以范围[-1,1]。 它不会移动/居中数据,因此不会破坏任何稀疏性"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 19,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "name": "stdout",
525 | "output_type": "stream",
526 | "text": [
527 | "+-----+------+\n",
528 | "| a|scaled|\n",
529 | "+-----+------+\n",
530 | "|[1.0]| [0.5]|\n",
531 | "|[2.0]| [1.0]|\n",
532 | "+-----+------+\n",
533 | "\n"
534 | ]
535 | }
536 | ],
537 | "source": [
538 | "from pyspark.ml.feature import MaxAbsScaler\n",
539 | "from pyspark.ml.linalg import Vectors\n",
540 | "df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
541 | "maScaler = MaxAbsScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
542 | "model = maScaler.fit(df)\n",
543 | "model.transform(df).show()"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "### pyspark.ml.feature.MinMaxScaler(self, min=0.0, max=1.0, inputCol=None, outputCol=None)\n",
551 | "使用列汇总统计信息,将每个特征单独重新标定为一个常用范围[min,max],这也称为最小 - 最大标准化或重新标定(注意由于零值可能会被转换为非零值,因此即使对于稀疏输入,转换器的输出也将是DenseVector)。 特征E的重新缩放的值被计算为,数据将被缩放到[0.0,1.0]范围内。\n",
552 | "Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min\n",
553 | "For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)\n"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 42,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "[0.0] [2.0]\n",
566 | "+-----+------+\n",
567 | "| a|scaled|\n",
568 | "+-----+------+\n",
569 | "|[0.0]| [0.0]|\n",
570 | "|[2.0]| [1.0]|\n",
571 | "+-----+------+\n",
572 | "\n"
573 | ]
574 | }
575 | ],
576 | "source": [
577 | "from pyspark.ml.feature import MinMaxScaler\n",
578 | "from pyspark.ml.linalg import Vectors\n",
579 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
580 | "mmScaler = MinMaxScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
581 | "model = mmScaler.fit(df)\n",
582 | "print(model.originalMin, model.originalMax)\n",
583 | "model.transform(df).show()"
584 | ]
585 | },
586 | {
587 | "cell_type": "markdown",
588 | "metadata": {},
589 | "source": [
590 | "### pyspark.ml.feature.NGram(n=2, inputCol=None, outputCol=None)\n",
591 | "一种功能转换器,用于将输入的字符串数组转换为n-gram数组。输入数组中的空值将被忽略。它返回一个n-gram数组,其中每个n-gram由一个以空格分隔的单词串表示。当输入为空时,返回一个空数组。当输入数组长度小于n(每n-gram的元素数)时,不返回n-gram。"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 23,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "+---------------+--------------------+\n",
604 | "| inputTokens| nGrams|\n",
605 | "+---------------+--------------------+\n",
606 | "|[a, b, c, d, e]|[a b, b c, c d, d e]|\n",
607 | "+---------------+--------------------+\n",
608 | "\n"
609 | ]
610 | }
611 | ],
612 | "source": [
613 | "from pyspark.ml.feature import NGram\n",
614 | "from pyspark.sql import Row\n",
615 | "df = spark.createDataFrame([Row(inputTokens=[\"a\", \"b\", \"c\", \"d\", \"e\"])])\n",
616 | "ngram = NGram(n=2, inputCol=\"inputTokens\", outputCol=\"nGrams\")\n",
617 | "ngram.transform(df).show()"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 24,
623 | "metadata": {},
624 | "outputs": [
625 | {
626 | "name": "stdout",
627 | "output_type": "stream",
628 | "text": [
629 | "+---------------+------------------+\n",
630 | "| inputTokens| nGrams|\n",
631 | "+---------------+------------------+\n",
632 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n",
633 | "+---------------+------------------+\n",
634 | "\n"
635 | ]
636 | }
637 | ],
638 | "source": [
639 | "# 更改 n-gram 长度\n",
640 | "ngram.setParams(n=4).transform(df).show()"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 25,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "+---------------+------------------+\n",
653 | "| inputTokens| output|\n",
654 | "+---------------+------------------+\n",
655 | "|[a, b, c, d, e]|[a b c d, b c d e]|\n",
656 | "+---------------+------------------+\n",
657 | "\n"
658 | ]
659 | }
660 | ],
661 | "source": [
662 | "# 临时修改输出列\n",
663 | "ngram.transform(df, {ngram.outputCol: \"output\"}).show()"
664 | ]
665 | },
666 | {
667 | "cell_type": "markdown",
668 | "metadata": {},
669 | "source": [
670 | "### pyspark.ml.feature.Normalizer(self, p=2.0, inputCol=None, outputCol=None)\n",
671 | "使用给定的p范数标准化矢量以得到单位范数(默认为L2)。"
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": 26,
677 | "metadata": {},
678 | "outputs": [
679 | {
680 | "name": "stdout",
681 | "output_type": "stream",
682 | "text": [
683 | "+----------+-------------------+----------+\n",
684 | "| dense| sparse| features|\n",
685 | "+----------+-------------------+----------+\n",
686 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|[0.6,-0.8]|\n",
687 | "+----------+-------------------+----------+\n",
688 | "\n"
689 | ]
690 | }
691 | ],
692 | "source": [
693 | "from pyspark.ml.feature import Normalizer\n",
694 | "from pyspark.ml.linalg import Vectors\n",
695 | "svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})\n",
696 | "df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], [\"dense\", \"sparse\"])\n",
697 | "normalizer = Normalizer(p=2.0, inputCol=\"dense\", outputCol=\"features\")\n",
698 | "normalizer.transform(df).show()"
699 | ]
700 | },
701 | {
702 | "cell_type": "code",
703 | "execution_count": 27,
704 | "metadata": {},
705 | "outputs": [
706 | {
707 | "name": "stdout",
708 | "output_type": "stream",
709 | "text": [
710 | "+----------+-------------------+-------------------+\n",
711 | "| dense| sparse| freqs|\n",
712 | "+----------+-------------------+-------------------+\n",
713 | "|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|(4,[1,3],[0.8,0.6])|\n",
714 | "+----------+-------------------+-------------------+\n",
715 | "\n"
716 | ]
717 | }
718 | ],
719 | "source": [
720 | "normalizer.setParams(inputCol=\"sparse\", outputCol=\"freqs\").transform(df).show()"
721 | ]
722 | },
723 | {
724 | "cell_type": "markdown",
725 | "metadata": {},
726 | "source": [
727 | "### pyspark.ml.feature.OneHotEncoderEstimator(inputCols=None, outputCols=None, handleInvalid='error', dropLast=True)\n",
728 | "(分类列编码为二进制向量列)\n",
729 | "一个热门的编码器,将一列类别索引映射到一列二进制向量,每行至多有一个单值,表示输入类别索引。 例如,对于5个类别,输入值2.0将映射到[0.0,0.0,1.0,0.0]的输出向量。 最后一个类别默认不包含(可通过dropLast进行配置),因为它使向量条目总和为1,因此线性相关。 所以一个4.0的输入值映射到[0.0,0.0,0.0,0.0]。这与scikit-learn的OneHotEncoder不同,后者保留所有类别。 输出向量是稀疏的。\n",
730 | "当handleInvalid配置为“keep”时,会添加一个指示无效值的额外“类别”作为最后一个类别。因此,当dropLast为true时,无效值将被编码为全零向量。"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 28,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "name": "stdout",
740 | "output_type": "stream",
741 | "text": [
742 | "+-----+-------------+\n",
743 | "|input| output|\n",
744 | "+-----+-------------+\n",
745 | "| 0.0|(2,[0],[1.0])|\n",
746 | "| 1.0|(2,[1],[1.0])|\n",
747 | "| 2.0| (2,[],[])|\n",
748 | "+-----+-------------+\n",
749 | "\n"
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "from pyspark.ml.feature import OneHotEncoderEstimator\n",
755 | "from pyspark.ml.linalg import Vectors\n",
756 | "df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], [\"input\"])\n",
757 | "ohe = OneHotEncoderEstimator(inputCols=[\"input\"], outputCols=[\"output\"])\n",
758 | "model = ohe.fit(df)\n",
759 | "model.transform(df).show()"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "metadata": {},
765 | "source": [
766 | "### pyspark.ml.feature.PCA(self, k=None, inputCol=None, outputCol=None)\n",
767 | "PCA训练一个模型将向量投影到前k个主成分的较低维空间。"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": 29,
773 | "metadata": {},
774 | "outputs": [
775 | {
776 | "name": "stdout",
777 | "output_type": "stream",
778 | "text": [
779 | "+---------------------+----------------------------------------+\n",
780 | "|features |pca_features |\n",
781 | "+---------------------+----------------------------------------+\n",
782 | "|(5,[1,3],[1.0,7.0]) |[1.6485728230883807,-4.013282700516296] |\n",
783 | "|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781534,-1.1167972663619026]|\n",
784 | "|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676489,-5.337951427775355] |\n",
785 | "+---------------------+----------------------------------------+\n",
786 | "\n"
787 | ]
788 | }
789 | ],
790 | "source": [
791 | "from pyspark.ml.feature import PCA\n",
792 | "from pyspark.ml.linalg import Vectors\n",
793 | "data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),\n",
794 | " (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),\n",
795 | " (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]\n",
796 | "df = spark.createDataFrame(data,[\"features\"])\n",
797 | "pca = PCA(k=2, inputCol=\"features\", outputCol=\"pca_features\")\n",
798 | "model = pca.fit(df)\n",
799 | "model.transform(df).show(truncate=0)"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 30,
805 | "metadata": {},
806 | "outputs": [
807 | {
808 | "data": {
809 | "text/plain": [
810 | "DenseVector([0.7944, 0.2056])"
811 | ]
812 | },
813 | "execution_count": 30,
814 | "metadata": {},
815 | "output_type": "execute_result"
816 | }
817 | ],
818 | "source": [
819 | "model.explainedVariance"
820 | ]
821 | },
822 | {
823 | "cell_type": "markdown",
824 | "metadata": {},
825 | "source": [
826 | "### pyspark.ml.feature.QuantileDiscretizer(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, handleInvalid=\"error\")\n",
827 | "与Bucketizer方法类似,但QuantileDiscretizer采用具有连续特征的列,并输出具有分箱分类特征的列。可以使用numBuckets参数设置区域的数量。所使用的桶的数量可能小于该值,例如,如果输入的不同值太少而不能创建足够的不同分位数。nan会占用一个新的分类"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 31,
833 | "metadata": {},
834 | "outputs": [
835 | {
836 | "name": "stdout",
837 | "output_type": "stream",
838 | "text": [
839 | "+------+-------+\n",
840 | "|values|buckets|\n",
841 | "+------+-------+\n",
842 | "| 0.1| 0.0|\n",
843 | "| 0.4| 1.0|\n",
844 | "| 1.2| 1.0|\n",
845 | "| 1.5| 1.0|\n",
846 | "| NaN| 2.0|\n",
847 | "| NaN| 2.0|\n",
848 | "+------+-------+\n",
849 | "\n"
850 | ]
851 | }
852 | ],
853 | "source": [
854 | "from pyspark.ml.feature import QuantileDiscretizer\n",
855 | "values = [(0.1,), (0.4,), (1.2,), (1.5,), (float(\"nan\"),), (float(\"nan\"),)]\n",
856 | "df = spark.createDataFrame(values, [\"values\"])\n",
857 | "qds = QuantileDiscretizer(numBuckets=2,\n",
858 | " inputCol=\"values\", outputCol=\"buckets\", relativeError=0.01, handleInvalid=\"error\")\n",
859 | "bucketizer = qds.fit(df)\n",
860 | "qds.setHandleInvalid(\"keep\").fit(df).transform(df).show()"
861 | ]
862 | },
863 | {
864 | "cell_type": "markdown",
865 | "metadata": {},
866 | "source": [
867 | "### pyspark.ml.feature.RegexTokenizer(minTokenLength=1, gaps=True, pattern='\\s+', inputCol=None, outputCol=None, toLowercase=True)\n",
868 | "基于java正则表达式的标记生成器"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": 32,
874 | "metadata": {},
875 | "outputs": [
876 | {
877 | "name": "stdout",
878 | "output_type": "stream",
879 | "text": [
880 | "+------+---------+\n",
881 | "| text| words|\n",
882 | "+------+---------+\n",
883 | "|A B c|[a, b, c]|\n",
884 | "+------+---------+\n",
885 | "\n"
886 | ]
887 | }
888 | ],
889 | "source": [
890 | "from pyspark.ml.feature import RegexTokenizer\n",
891 | "df = spark.createDataFrame([(\"A B c\",)], [\"text\"])\n",
892 | "reTokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"words\")\n",
893 | "reTokenizer.transform(df).show()"
894 | ]
895 | },
896 | {
897 | "cell_type": "markdown",
898 | "metadata": {},
899 | "source": [
900 | "### pyspark.ml.feature.SQLTransformer(statement=None)\n",
901 | "实现由SQL语句定义的转换。目前我们只支持SQL语法,"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 33,
907 | "metadata": {},
908 | "outputs": [
909 | {
910 | "name": "stdout",
911 | "output_type": "stream",
912 | "text": [
913 | "+---+---+---+\n",
914 | "| id| v1| v2|\n",
915 | "+---+---+---+\n",
916 | "| 0|1.0|3.0|\n",
917 | "| 2|2.0|5.0|\n",
918 | "+---+---+---+\n",
919 | "\n",
920 | "+---+---+---+---+----+\n",
921 | "| id| v1| v2| v3| v4|\n",
922 | "+---+---+---+---+----+\n",
923 | "| 0|1.0|3.0|4.0| 3.0|\n",
924 | "| 2|2.0|5.0|7.0|10.0|\n",
925 | "+---+---+---+---+----+\n",
926 | "\n"
927 | ]
928 | }
929 | ],
930 | "source": [
931 | "from pyspark.ml.feature import SQLTransformer\n",
932 | "df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], [\"id\", \"v1\", \"v2\"])\n",
933 | "sqlTrans = SQLTransformer(\n",
934 | " statement=\"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__\")\n",
935 | "df.show()\n",
936 | "sqlTrans.transform(df).show()\n"
937 | ]
938 | },
939 | {
940 | "cell_type": "markdown",
941 | "metadata": {},
942 | "source": [
943 | "### pyspark.ml.feature.StandardScaler(self, withMean=False, withStd=True, inputCol=None, outputCol=None)\n",
944 | "(标准化列,使其拥有零均值和等于1的标准差)\n",
945 | "通过使用训练集中样本的列汇总统计消除平均值和缩放到单位方差来标准化特征。使用校正后的样本标准偏差计算“单位标准差”,该标准偏差计算为无偏样本方差的平方根。\n"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 34,
951 | "metadata": {},
952 | "outputs": [
953 | {
954 | "name": "stdout",
955 | "output_type": "stream",
956 | "text": [
957 | "[1.0] [1.4142135623730951]\n",
958 | "+-----+-------------------+\n",
959 | "| a| scaled|\n",
960 | "+-----+-------------------+\n",
961 | "|[0.0]| [0.0]|\n",
962 | "|[2.0]|[1.414213562373095]|\n",
963 | "+-----+-------------------+\n",
964 | "\n"
965 | ]
966 | }
967 | ],
968 | "source": [
969 | "from pyspark.ml.feature import StandardScaler\n",
970 | "from pyspark.ml.linalg import Vectors\n",
971 | "df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], [\"a\"])\n",
972 | "standardScaler = StandardScaler(inputCol=\"a\", outputCol=\"scaled\")\n",
973 | "model = standardScaler.fit(df)\n",
974 | "print(model.mean, model.std)\n",
975 | "model.transform(df).show()"
976 | ]
977 | },
978 | {
979 | "cell_type": "markdown",
980 | "metadata": {},
981 | "source": [
982 | "### pyspark.ml.feature.StopWordsRemover(inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)\n",
983 | "一个特征转换器,用于过滤掉输入中的停用词。"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 35,
989 | "metadata": {},
990 | "outputs": [
991 | {
992 | "name": "stdout",
993 | "output_type": "stream",
994 | "text": [
995 | "+---------+------+\n",
996 | "| text| words|\n",
997 | "+---------+------+\n",
998 | "|[a, b, c]|[a, c]|\n",
999 | "+---------+------+\n",
1000 | "\n"
1001 | ]
1002 | }
1003 | ],
1004 | "source": [
1005 | "from pyspark.ml.feature import StopWordsRemover\n",
1006 | "df = spark.createDataFrame([([\"a\", \"b\", \"c\"],)], [\"text\"])\n",
1007 | "remover = StopWordsRemover(inputCol=\"text\", outputCol=\"words\", stopWords=[\"b\"])\n",
1008 | "remover.transform(df).show()"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "markdown",
1013 | "metadata": {},
1014 | "source": [
1015 | "### pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None)\n",
1016 | "一个标记生成器,它将输入字符串转换为小写,然后用空格分隔它。"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "code",
1021 | "execution_count": 36,
1022 | "metadata": {
1023 | "scrolled": true
1024 | },
1025 | "outputs": [
1026 | {
1027 | "name": "stdout",
1028 | "output_type": "stream",
1029 | "text": [
1030 | "+--------+------------+\n",
1031 | "| text| words|\n",
1032 | "+--------+------------+\n",
1033 | "|ASD VA c|[asd, va, c]|\n",
1034 | "+--------+------------+\n",
1035 | "\n"
1036 | ]
1037 | }
1038 | ],
1039 | "source": [
1040 | "from pyspark.ml.feature import Tokenizer\n",
1041 | "df = spark.createDataFrame([(\"ASD VA c\",)], [\"text\"])\n",
1042 | "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n",
1043 | "tokenizer.transform(df).show()"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "metadata": {},
1049 | "source": [
1050 | "### pyspark.ml.feature.VectorSlicer(inputCol=None, outputCol=None, indices=None, names=None)\n",
1051 | "此类采用特征向量并输出具有原始特征的子阵列的新特征向量。 可以使用索引(setIndices())或名称(setNames())指定要素子集。必须至少选择一个功能。不允许使用重复的功能,因此所选索引和名称之间不能重叠。 输出向量将首先按所选索引(按给定顺序)排序要素,然后是所选名称(按给定顺序)。"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": 37,
1057 | "metadata": {},
1058 | "outputs": [
1059 | {
1060 | "name": "stdout",
1061 | "output_type": "stream",
1062 | "text": [
1063 | "+-----------------------+----------+\n",
1064 | "|features |sliced |\n",
1065 | "+-----------------------+----------+\n",
1066 | "|[-2.0,2.3,0.0,0.0,1.0] |[2.3,1.0] |\n",
1067 | "|[0.0,0.0,0.0,0.0,0.0] |[0.0,0.0] |\n",
1068 | "|[0.6,-1.1,-3.0,4.5,3.3]|[-1.1,3.3]|\n",
1069 | "+-----------------------+----------+\n",
1070 | "\n"
1071 | ]
1072 | }
1073 | ],
1074 | "source": [
1075 | "from pyspark.ml.feature import VectorSlicer\n",
1076 | "from pyspark.ml.linalg import Vectors\n",
1077 | "df = spark.createDataFrame([\n",
1078 | " (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),\n",
1079 | " (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),\n",
1080 | " (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], [\"features\"])\n",
1081 | "vs = VectorSlicer(inputCol=\"features\", outputCol=\"sliced\", indices=[1, 4])\n",
1082 | "vs.transform(df).show(truncate=0)"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "markdown",
1087 | "metadata": {},
1088 | "source": [
1089 | "### pyspark.ml.feature.VectorAssembler(inputCols=None, outputCol=None)\n",
1090 | "将多个列合并到向量列中的要素转换器。"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "code",
1095 | "execution_count": 38,
1096 | "metadata": {},
1097 | "outputs": [
1098 | {
1099 | "name": "stdout",
1100 | "output_type": "stream",
1101 | "text": [
1102 | "+---+---+---+-------------+\n",
1103 | "| a| b| c| features|\n",
1104 | "+---+---+---+-------------+\n",
1105 | "| 1| 0| 3|[1.0,0.0,3.0]|\n",
1106 | "+---+---+---+-------------+\n",
1107 | "\n"
1108 | ]
1109 | }
1110 | ],
1111 | "source": [
1112 | "from pyspark.ml.feature import VectorAssembler\n",
1113 | "df = spark.createDataFrame([(1, 0, 3)], [\"a\", \"b\", \"c\"])\n",
1114 | "vecAssembler = VectorAssembler(inputCols=[\"a\", \"b\", \"c\"], outputCol=\"features\")\n",
1115 | "vecAssembler.transform(df).show()"
1116 | ]
1117 | },
1118 | {
1119 | "cell_type": "markdown",
1120 | "metadata": {},
1121 | "source": [
1122 | "### pyspark.ml.feature.Word2Vec(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000)\n",
1123 | "Word2Vec训练Map(String,Vector)模型,即将单词转换为代码以进行进一步的自然语言处理或机器学习过程。"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 39,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "name": "stdout",
1133 | "output_type": "stream",
1134 | "text": [
1135 | "+----+--------------------+\n",
1136 | "|word| vector|\n",
1137 | "+----+--------------------+\n",
1138 | "| a|[0.09461779892444...|\n",
1139 | "| b|[1.15474212169647...|\n",
1140 | "| c|[-0.3794820010662...|\n",
1141 | "+----+--------------------+\n",
1142 | "\n"
1143 | ]
1144 | }
1145 | ],
1146 | "source": [
1147 | "from pyspark.ml.feature import Word2Vec\n",
1148 | "sent = (\"a b \" * 100 + \"a c \" * 10).split(\" \")\n",
1149 | "doc = spark.createDataFrame([(sent,), (sent,)], [\"sentence\"])\n",
1150 | "word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol=\"sentence\", outputCol=\"model\")\n",
1151 | "model = word2Vec.fit(doc)\n",
1152 | "model.getVectors().show()"
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": 40,
1158 | "metadata": {},
1159 | "outputs": [
1160 | {
1161 | "name": "stdout",
1162 | "output_type": "stream",
1163 | "text": [
1164 | "+----+-------------------+\n",
1165 | "|word| similarity|\n",
1166 | "+----+-------------------+\n",
1167 | "| b|0.25053444504737854|\n",
1168 | "+----+-------------------+\n",
1169 | "\n"
1170 | ]
1171 | },
1172 | {
1173 | "data": {
1174 | "text/plain": [
1175 | "[('b', 0.25053444504737854)]"
1176 | ]
1177 | },
1178 | "execution_count": 40,
1179 | "metadata": {},
1180 | "output_type": "execute_result"
1181 | }
1182 | ],
1183 | "source": [
1184 | "# 找相似字符\n",
1185 | "model.findSynonyms(\"a\", 1).show()\n",
1186 | "model.findSynonymsArray(\"a\", 1)"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 41,
1192 | "metadata": {},
1193 | "outputs": [
1194 | {
1195 | "name": "stdout",
1196 | "output_type": "stream",
1197 | "text": [
1198 | "+----+----------+\n",
1199 | "|word|similarity|\n",
1200 | "+----+----------+\n",
1201 | "| b| 0.251|\n",
1202 | "| c| -0.698|\n",
1203 | "+----+----------+\n",
1204 | "\n"
1205 | ]
1206 | }
1207 | ],
1208 | "source": [
1209 | "from pyspark.sql.functions import format_number as fmt\n",
1210 | "model.findSynonyms(\"a\", 2).select(\"word\", fmt(\"similarity\", 3).alias(\"similarity\")).show()"
1211 | ]
1212 | },
1213 | {
1214 | "cell_type": "code",
1215 | "execution_count": null,
1216 | "metadata": {},
1217 | "outputs": [],
1218 | "source": []
1219 | }
1220 | ],
1221 | "metadata": {
1222 | "kernelspec": {
1223 | "display_name": "Python 3",
1224 | "language": "python",
1225 | "name": "python3"
1226 | },
1227 | "language_info": {
1228 | "codemirror_mode": {
1229 | "name": "ipython",
1230 | "version": 3
1231 | },
1232 | "file_extension": ".py",
1233 | "mimetype": "text/x-python",
1234 | "name": "python",
1235 | "nbconvert_exporter": "python",
1236 | "pygments_lexer": "ipython3",
1237 | "version": "3.6.4"
1238 | }
1239 | },
1240 | "nbformat": 4,
1241 | "nbformat_minor": 2
1242 | }
1243 |
--------------------------------------------------------------------------------
/pyspark.ml.regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyspark.sql import SparkSession\n",
10 | "spark = SparkSession.builder.appName('learn_regression').master('local[1]').getOrCreate()"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "df_train = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/train.csv', header=True, inferSchema=True, encoding='utf-8')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "df_test = spark.read.csv('file:///home/ffzs/python-projects/learn_spark/boston/test.csv', header=True, inferSchema=True, encoding='utf-8')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 4,
34 | "metadata": {
35 | "scrolled": true
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
43 | "| ID| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| black|lstat| medv|\n",
44 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
45 | "| 1|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98| 24.0|\n",
46 | "| 2|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14| 21.6|\n",
47 | "| 3|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|22.77|\n",
48 | "+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+-----+\n",
49 | "only showing top 3 rows\n",
50 | "\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "from pyspark.sql.functions import lit\n",
56 | "df_test = df_test.withColumn('medv', lit(22.77))\n",
57 | "df0 = df_train.union(df_test).sort('ID')\n",
58 | "df0.show(3)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "`\n",
66 | "CRIM-- 城镇人均犯罪率。\n",
67 | "ZN - 占地面积超过25,000平方英尺的住宅用地比例。\n",
68 | "INDUS - 每个城镇非零售业务的比例。\n",
69 | "CHAS - Charles River虚拟变量(如果河流经过则= 1;否则为0)。\n",
70 | "NOX - 氮氧化物浓度(每千万份)。\n",
71 | "RM - 每间住宅的平均房间数。\n",
72 | "AGE - 1940年以前建造的自住单位比例。\n",
73 | "DIS - 加权平均值到五个波士顿就业中心的距离。\n",
74 | "RAD - 径向高速公路的可达性指数。\n",
75 | "TAX - 每10,000美元的全额物业税率。\n",
76 | "PTRATIO - 城镇的学生与教师比例。\n",
77 | "BLACK - 1000(Bk - 0.63)²其中Bk是城镇黑人的比例。\n",
78 | "LSTAT - 人口较低的地位(百分比)。\n",
79 | "MEDV - 自住房屋的中位数价值1000美元。这是目标变量。\n",
80 | "`"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from pyspark.ml.feature import VectorAssembler\n",
90 | "def feature_converter(df):\n",
91 | " vecAss = VectorAssembler(inputCols=df0.columns[1:-1], outputCol='features')\n",
92 | " df_va = vecAss.transform(df)\n",
93 | " return df_va\n",
94 | "\n",
95 | "train_data, test_data = feature_converter(df0).select(['features', 'medv']).randomSplit([7.0, 3.0], 101)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "354"
107 | ]
108 | },
109 | "execution_count": 6,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "train_data.count()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 7,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/plain": [
126 | "152"
127 | ]
128 | },
129 | "execution_count": 7,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "test_data.count()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "## 决策树回归\n",
143 | "`pyspark.ml.regression.DecisionTreeRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', seed=None, varianceCol=None)`"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "`\n",
151 | "fit(dataset, params=None)方法 \n",
152 | "Impurity: 信息增益计算准则,支持选项:variance \n",
153 | "maxBins: 连续特征离散化的最大分箱个数, >=2并且>=任何分类特征的分类个数 \n",
154 | "maxDepth: 最大树深 \n",
155 | "minInfoGain: 分割节点所需最小信息增益 \n",
156 | "minInstancesPerNode: 分割后每个子节点最小实例个数 \n",
157 | "`"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 13,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "from pyspark.ml.regression import DecisionTreeRegressor\n",
167 | "dt = DecisionTreeRegressor(maxDepth=5, varianceCol=\"variance\", labelCol='medv')\n",
168 | "dt_model = dt.fit(train_data)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 14,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "SparseVector(13, {0: 0.0503, 2: 0.011, 4: 0.0622, 5: 0.1441, 6: 0.1852, 7: 0.0262, 8: 0.0022, 9: 0.0886, 10: 0.0142, 12: 0.4159})"
180 | ]
181 | },
182 | "execution_count": 14,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "dt_model.featureImportances"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 15,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "result = dt_model.transform(test_data)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 16,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "+--------------------+-----+------------------+------------------+\n",
210 | "| features| medv| prediction| variance|\n",
211 | "+--------------------+-----+------------------+------------------+\n",
212 | "|[0.03237,0.0,2.18...| 33.4| 34.12833333333334|29.509013888888756|\n",
213 | "|[0.08829,12.5,7.8...| 22.9|21.195135135135136| 4.446162819576342|\n",
214 | "|[0.14455,12.5,7.8...|22.77|22.425999999999995|0.5578440000003866|\n",
215 | "+--------------------+-----+------------------+------------------+\n",
216 | "only showing top 3 rows\n",
217 | "\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "result.show(3)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 17,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "测试数据的均方根误差(rmse):6.555920141221407\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
240 | "dt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
241 | "rmse = dt_evaluator.evaluate(result)\n",
242 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "## 梯度提升树回归 (Gradient-boosted tree regression)\n",
250 | "pyspark.ml.regression.GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType='squared', maxIter=20, stepSize=0.1, seed=None, impurity='variance')"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "`\n",
258 | "fit(dataset,params=None)方法 \n",
259 | "lossType: GBT要最小化的损失函数,可选:squared, absolute \n",
260 | "maxIter: 最大迭代次数 \n",
261 | "stepSize: 每次优化迭代的步长 \n",
262 | "subsamplingRate:用于训练每颗决策树的训练数据集的比例,区间[0,1] \n",
263 | "`"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 8,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "from pyspark.ml.regression import GBTRegressor\n",
273 | "gbt = GBTRegressor(maxIter=10, labelCol='medv', maxDepth=3)\n",
274 | "gbt_model = gbt.fit(train_data)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 9,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/plain": [
285 | "SparseVector(13, {0: 0.0219, 1: 0.0364, 2: 0.0305, 3: 0.0114, 4: 0.0032, 5: 0.1372, 6: 0.146, 7: 0.1033, 8: 0.0518, 9: 0.0819, 10: 0.0883, 11: 0.0048, 12: 0.2832})"
286 | ]
287 | },
288 | "execution_count": 9,
289 | "metadata": {},
290 | "output_type": "execute_result"
291 | }
292 | ],
293 | "source": [
294 | "gbt_model.featureImportances"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 10,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "result = gbt_model.transform(test_data)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 11,
309 | "metadata": {
310 | "scrolled": true
311 | },
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "+--------------------+-----+------------------+\n",
318 | "| features| medv| prediction|\n",
319 | "+--------------------+-----+------------------+\n",
320 | "|[0.03237,0.0,2.18...| 33.4| 31.98716729056085|\n",
321 | "|[0.08829,12.5,7.8...| 22.9|22.254258637918248|\n",
322 | "|[0.14455,12.5,7.8...|22.77|20.066468254729102|\n",
323 | "+--------------------+-----+------------------+\n",
324 | "only showing top 3 rows\n",
325 | "\n"
326 | ]
327 | }
328 | ],
329 | "source": [
330 | "result.show(3)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 20,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]"
342 | ]
343 | },
344 | "execution_count": 20,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "gbt_model.treeWeights"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 12,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "name": "stdout",
360 | "output_type": "stream",
361 | "text": [
362 | "测试数据的均方根误差(rmse):5.624145397622545\n"
363 | ]
364 | }
365 | ],
366 | "source": [
367 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
368 | "gbt_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
369 | "rmse = gbt_evaluator.evaluate(result)\n",
370 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## 线性回归(LinearRegression)\n",
378 | "pyspark.ml.regression.LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, solver='auto', weightCol=None, aggregationDepth=2, loss='squaredError', epsilon=1.35)"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "\n",
386 | "学习目标是通过正规化最小化指定的损失函数。这支持两种损失:\n",
387 | "+ squaredError (a.k.a 平方损失)\n",
388 | "+ huber (对于相对较小的误差和相对大的误差的绝对误差的平方误差的混合,我们从训练数据估计比例参数)\n",
389 | "\n",
390 | "支持多种类型的正则化: \n",
391 | "+ None:OLS \n",
392 | "+ L2:ridge回归 \n",
393 | "+ L1:Lasso回归 \n",
394 | "+ L1+L2:elastic回归\n",
395 | "\n",
396 | "注意:与huber loss匹配仅支持none和L2正规化。\n"
397 | ]
398 | },
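    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "A minimal sketch of the huber option under the constraint just noted (no L1 term, so `elasticNetParam` stays at 0.0); the estimator is only constructed here, and `lr_huber` is an illustrative name."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# huber loss is less sensitive to outliers than squared error;\n",
    | "# with loss='huber' only None/L2 regularization is allowed, so elasticNetParam must be 0.0\n",
    | "from pyspark.ml.regression import LinearRegression\n",
    | "lr_huber = LinearRegression(labelCol='medv', loss='huber', epsilon=1.35,\n",
    | "                            regParam=0.1, elasticNetParam=0.0)"
    | ]
    | },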
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "`\n",
404 | "aggregationDepth: 树聚合的深度, >=2 \n",
405 | "elasticNtParam: ElasticNet混合参数,在[0,1]范围内,alpha=0为L2, alpha=1为L1 \n",
406 | "fit(dataset,params=None)方法 \n",
407 | "fitIntercept: 是否拟合截距 \n",
408 | "maxIter: 最大迭代次数 \n",
409 | "regParam:正则化参数 >=0 \n",
410 | "solver: 优化算法,没设置或空则使用”auto” \n",
411 | "standardization: 是否对拟合模型的特征进行标准化 \n",
412 | "`"
413 | ]
414 | },
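    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "A minimal sketch of how `regParam` and `elasticNetParam` map onto the regularization types listed above; the estimators are only constructed, and the variable names are illustrative."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# regParam sets the overall penalty strength; elasticNetParam mixes L1 and L2\n",
    | "from pyspark.ml.regression import LinearRegression\n",
    | "ols_lr     = LinearRegression(labelCol='medv', regParam=0.0)                       # no regularization (OLS)\n",
    | "ridge_lr   = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=0.0)  # pure L2 (ridge)\n",
    | "lasso_lr   = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=1.0)  # pure L1 (lasso)\n",
    | "elastic_lr = LinearRegression(labelCol='medv', regParam=0.3, elasticNetParam=0.5,  # L1 + L2 mix\n",
    | "                              fitIntercept=True, standardization=True)"
    | ]
    | },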
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "`\n",
420 | "Summary属性\n",
421 | "coefficientStandardErrors \n",
422 | "devianceResiduals: 加权残差 \n",
423 | "explainedVariance: 返回解释的方差回归得分,explainedVariance=1−variance(y−(̂ y))/variance(y) \n",
424 | "meanAbsoluteError: 返回均值绝对误差 \n",
425 | "meanSquaredError: 返回均值平方误 \n",
426 | "numInstances: 预测的实例个数 \n",
427 | "pValues: 系数和截距的双边P值,只有用”normal”solver才可用 \n",
428 | "predictions: 模型transform方法返回的预测 \n",
429 | "r2: R方 \n",
430 | "residuals: 残差 \n",
431 | "rootMeanSquaredError: 均方误差平方根 \n",
432 | "tValues: T统计量\n",
433 | "`"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 23,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "from pyspark.ml.regression import LinearRegression\n",
443 | "lr = LinearRegression(maxIter=10, elasticNetParam=0.8, regParam=0.3, labelCol='medv')\n",
444 | "lr_model = lr.fit(train_data)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 26,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "trainingSummary = lr_model.summary"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 27,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "RMSE: 5.457496\n",
466 | "r2: 0.432071\n"
467 | ]
468 | }
469 | ],
470 | "source": [
471 | "print(\"RMSE: %f\" % trainingSummary.rootMeanSquaredError)\n",
472 | "print(\"r2: %f\" % trainingSummary.r2)"
473 | ]
474 | },
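    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "As referenced in the attribute list above, a minimal sketch of reading a few more fields from `trainingSummary`; `residuals` is a DataFrame, so it is shown rather than printed."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# a few more of the Summary attributes listed earlier\n",
    | "print('MAE: {:.4f}'.format(trainingSummary.meanAbsoluteError))\n",
    | "print('explainedVariance: {:.4f}'.format(trainingSummary.explainedVariance))\n",
    | "print('numInstances: {}'.format(trainingSummary.numInstances))\n",
    | "trainingSummary.residuals.show(3)"
    | ]
    | },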
475 | {
476 | "cell_type": "code",
477 | "execution_count": 55,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "name": "stdout",
482 | "output_type": "stream",
483 | "text": [
484 | "+--------------------+-----+------------------+\n",
485 | "| features| medv| prediction|\n",
486 | "+--------------------+-----+------------------+\n",
487 | "|[0.03237,0.0,2.18...| 33.4|27.066314856077966|\n",
488 | "|[0.08829,12.5,7.8...| 22.9|23.721352298735898|\n",
489 | "|[0.14455,12.5,7.8...|22.77|21.388248900632398|\n",
490 | "+--------------------+-----+------------------+\n",
491 | "only showing top 3 rows\n",
492 | "\n"
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "result = lr_model.transform(test_data)\n",
498 | "result.show(3)"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 43,
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "R平方(r2):0.469\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
516 | "lr_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"r2\", predictionCol='prediction')\n",
517 | "r2 = lr_evaluator.evaluate(result)\n",
518 | "print('R平方(r2):{:.3}'.format(r2))"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": 44,
524 | "metadata": {
525 | "scrolled": true
526 | },
527 | "outputs": [],
528 | "source": [
529 | "test_evaluation = lr_model.evaluate(test_data)"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 42,
535 | "metadata": {},
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "RMSE:5.7\n",
542 | "r2:0.469\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "print('RMSE:{:.3}'.format(test_evaluation.rootMeanSquaredError))\n",
548 | "print('r2:{:.3}'.format(test_evaluation.r2))"
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {},
554 | "source": [
555 | "## 随机森林回归\n",
556 | "pyspark.ml.regression.RandomForestRegressor(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity='variance', subsamplingRate=1.0, seed=None, numTrees=20, featureSubsetStrategy='auto')"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "metadata": {},
562 | "source": [
563 | "`\n",
564 | "fit(dataset,params=None)方法 \n",
565 | "featureSubsetStrategy: 每棵树的节点上要分割的特征数量,可选:auto, all, onethird, sqrt, log2,(0.0,1.0],[1-n] \n",
566 | "impurity: 信息增益计算的准则,可选:variance \n",
567 | "maxBins: 连续特征离散化最大分箱个数。 \n",
568 | "maxDepth: 树的最大深度 \n",
569 | "minInfoGain: 树节点分割特征所需最小的信息增益 \n",
570 | "minInstancesPerNode: 每个结点所需最小实例个数 \n",
571 | "numTrees: 训练树的个数 \n",
572 | "subsamplingRate: 学习每颗决策树所需样本比例 \n",
573 | "`"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 47,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "from pyspark.ml.regression import RandomForestRegressor\n",
583 | "rf = RandomForestRegressor(numTrees=10, maxDepth=5, seed=101, labelCol='medv')\n",
584 | "rf_model = rf.fit(train_data)"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 54,
590 | "metadata": {},
591 | "outputs": [
592 | {
593 | "name": "stdout",
594 | "output_type": "stream",
595 | "text": [
596 | "+--------------------+-----+------------------+\n",
597 | "| features| medv| prediction|\n",
598 | "+--------------------+-----+------------------+\n",
599 | "|[0.03237,0.0,2.18...| 33.4| 30.12804440796982|\n",
600 | "|[0.08829,12.5,7.8...| 22.9|21.338106353716338|\n",
601 | "|[0.14455,12.5,7.8...|22.77|19.764914032872827|\n",
602 | "+--------------------+-----+------------------+\n",
603 | "only showing top 3 rows\n",
604 | "\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "result = rf_model.transform(test_data)\n",
610 | "result.show(3)"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 51,
616 | "metadata": {},
617 | "outputs": [
618 | {
619 | "data": {
620 | "text/plain": [
621 | "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
622 | ]
623 | },
624 | "execution_count": 51,
625 | "metadata": {},
626 | "output_type": "execute_result"
627 | }
628 | ],
629 | "source": [
630 | "rf_model.treeWeights"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": 53,
636 | "metadata": {},
637 | "outputs": [
638 | {
639 | "name": "stdout",
640 | "output_type": "stream",
641 | "text": [
642 | "测试数据的均方根误差(rmse):5.268739233773331\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
648 | "rf_evaluator = RegressionEvaluator(labelCol='medv', metricName=\"rmse\", predictionCol='prediction')\n",
649 | "rmse = rf_evaluator.evaluate(result)\n",
650 | "print('测试数据的均方根误差(rmse):{}'.format(rmse))"
651 | ]
652 | }
653 | ],
654 | "metadata": {
655 | "kernelspec": {
656 | "display_name": "Python 3",
657 | "language": "python",
658 | "name": "python3"
659 | },
660 | "language_info": {
661 | "codemirror_mode": {
662 | "name": "ipython",
663 | "version": 3
664 | },
665 | "file_extension": ".py",
666 | "mimetype": "text/x-python",
667 | "name": "python",
668 | "nbconvert_exporter": "python",
669 | "pygments_lexer": "ipython3",
670 | "version": "3.6.4"
671 | }
672 | },
673 | "nbformat": 4,
674 | "nbformat_minor": 2
675 | }
676 |
--------------------------------------------------------------------------------