├── LICENSE.md
├── README.md
├── data
│   └── foxdata.txt
├── howto
│   ├── README.md
│   ├── download_install_run_spark.md
│   └── minimize_verbosity.md
├── images
│   ├── Data-Algorithms-with-Spark_mech2.pdf
│   ├── Data-Algorithms-with-Spark_mech2.png
│   ├── Data_Algorithms_with_Spark_COVER_9781492082385.png
│   ├── data_algorithms_image.jpg
│   ├── data_algorithms_with_spark.jpg
│   └── pyspark_algorithms2.jpg
└── tutorial
    ├── .DS_Store
    ├── add-indices
    │   └── add-indices.txt
    ├── basic-average
    │   └── basic-average.txt
    ├── basic-filter
    │   └── basic-filter.txt
    ├── basic-join
    │   └── basicjoin.txt
    ├── basic-map
    │   └── basic-map.txt
    ├── basic-multiply
    │   └── basic-multiply.txt
    ├── basic-sort
    │   └── sort-by-key.txt
    ├── basic-sum
    │   └── basic-sum.txt
    ├── basic-union
    │   └── basic-union.txt
    ├── bigrams
    │   └── bigrams.txt
    ├── cartesian
    │   └── cartesian.txt
    ├── combine-by-key
    │   ├── README.md
    │   ├── combine-by-key.txt
    │   ├── distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf
    │   ├── spark-combineByKey.md
    │   ├── spark-combineByKey.txt
    │   └── standard_deviation_by_combineByKey.md
    ├── dna-basecount
    │   ├── README.md
    │   ├── basemapper.py
    │   ├── dna-basecount.md
    │   ├── dna-basecount2.md
    │   ├── dna-basecount3.md
    │   └── dna_seq.txt
    ├── map-partitions
    │   └── README.md
    ├── pyspark-examples
    │   ├── dataframes
    │   │   ├── VIDEO-DataFrames.txt
    │   │   ├── dataframe-examples.md
    │   │   ├── dataframe-session-2018-04-26.txt
    │   │   ├── dataframe-session-2018-05-15.txt
    │   │   ├── dataframe-session-2018-10-30.txt
    │   │   ├── dataframe-session-2019-02-14.txt
    │   │   ├── dataframe-session-2020-11-04.txt
    │   │   ├── dataframe-session-2021-05-12-intro.txt
    │   │   ├── dataframe-session-2022-05-12.txt
    │   │   └── dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt
    │   └── rdds
    │       ├── combineByKey_example.py
    │       ├── count_min_max.py
    │       ├── groupbykey_and_reducebykey_example.ipynb
    │       ├── pyspark-session-2015-02-23.txt
    │       ├── pyspark-session-2015-03-13.txt
    │       ├── pyspark-session-2015-04-10.txt
    │       ├── pyspark-session-2018-01-18.txt
    │       ├── pyspark-session-2018-04-12.txt
    │       ├── pyspark-session-2018-10-02.txt
    │       ├── pyspark-session-2018-10-09.txt
    │       ├── pyspark-session-2019-01-22.txt
    │       ├── pyspark-session-2019-01-30.txt
    │       ├── pyspark-session-2019-04-16.txt
    │       ├── pyspark-session-2019-04-18.txt
    │       ├── pyspark-session-2019-04-26.txt
    │       ├── pyspark-session-2019-05-09.txt
    │       ├── pyspark-session-2019-10-09.txt
    │       ├── pyspark-session-2019-10-16.txt
    │       ├── pyspark-session-2020-01-22.txt
    │       ├── pyspark-session-2020-01-24.txt
    │       ├── pyspark-session-2020-02-03.txt
    │       ├── pyspark-session-2020-04-16.txt
    │       ├── pyspark-session-2020-04-23.txt
    │       ├── pyspark-session-2020-07-06-word-count.txt
    │       ├── pyspark-session-2020-10-05.txt
    │       ├── pyspark-session-2020-10-07.txt
    │       ├── pyspark-session-2020-10-12.txt
    │       ├── pyspark-session-2020-10-15.txt
    │       ├── pyspark-session-2020-10-19.txt
    │       ├── pyspark-session-2021-01-19.txt
    │       ├── pyspark-session-2021-01-21.ipynb
    │       ├── pyspark-session-2021-01-26.txt
    │       ├── pyspark-session-2021-04-12.txt
    │       ├── pyspark-session-2021-04-14.txt
    │       ├── pyspark-session-2021-04-19.txt
    │       ├── pyspark-session-2021-04-21-mapPartitions.txt
    │       ├── pyspark-session-2021-04-29-min-max-avg.txt
    │       ├── pyspark-session-2021-05-05-join.txt
    │       ├── pyspark-session-2021-10-06.txt
    │       ├── pyspark-session-2021-10-11-filter-map-flatMap.txt
    │       ├── pyspark-session-2021-10-20-understanding-partitions.txt
    │       ├── pyspark-session-2021-10-25-RDD-join.txt
    │       ├── pyspark-session-2022-04-12.txt
    │       ├── pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt
    │       ├── pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt
    │       ├── pyspark-session_2019-10-07.txt
    │       ├── pyspark-session_2020-07-01.txt
    │       └── understanding_partitions.txt
    ├── pyspark-udf
    │   └── pyspark_udf_maptype.txt
    ├── ranking
    │   ├── README.md
    │   └── ranking_functions_in_pyspark.md
    ├── split-function
    │   └── README.md
    ├── top-N
    │   └── top-N.txt
    └── wordcount
        ├── README.md
        ├── run_word_count.sh
        ├── run_word_count_ver2.sh
        ├── word_count.py
        ├── word_count_ver2.py
        ├── wordcount-shorthand.txt
        └── wordcount.txt
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright [2019] [Mahmoud Parsian]
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PySpark Tutorial
2 |
3 | * PySpark is the Python API for Spark.
4 |
5 | * The purpose of this PySpark tutorial is to present
6 |   basic distributed algorithms using PySpark.
7 |
8 | * PySpark supports two types of Data Abstractions:
9 | * RDDs
10 | * DataFrames
11 |
12 | * **PySpark Interactive Mode**: PySpark has an interactive shell
13 |   (`$SPARK_HOME/bin/pyspark`) for basic testing
14 |   and debugging; it is not intended for
15 |   production environments.
16 |
17 | * **PySpark Batch Mode**: you may use the `$SPARK_HOME/bin/spark-submit`
18 |   command for running PySpark programs (may be used for
19 |   testing and production environments); a minimal sketch is given below.
20 |
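As a minimal batch-mode sketch (the script name `word_count_example.py` and the input path are placeholders, not files of this repository):

````
# word_count_example.py -- a self-contained PySpark batch job
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # in batch mode there is no pre-built `spark`/`sc` object: create them
    spark = SparkSession.builder.appName("word-count-example").getOrCreate()
    sc = spark.sparkContext

    # read a text file, split lines into words, and count each unique word
    counts = (sc.textFile("file:///tmp/foxdata.txt")
                .flatMap(lambda line: line.split())
                .map(lambda word: (word, 1))
                .reduceByKey(lambda x, y: x + y))

    print(counts.collect())
    spark.stop()
````

Then submit it with:

````
$SPARK_HOME/bin/spark-submit word_count_example.py
````
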
21 | ------
22 |
23 | # [Glossary: big data, MapReduce, Spark](https://github.com/mahmoudparsian/big-data-mapreduce-course/blob/master/slides/glossary/README.md)
24 |
25 | ------
26 |
27 | # [Basics of PySpark with Examples](./howto/README.md)
28 |
29 | ------
30 |
31 | # PySpark Examples and Tutorials
32 |
33 | * [PySpark Examples: RDDs](./tutorial/pyspark-examples/rdds/)
34 | * [PySpark Examples: DataFrames](./tutorial/pyspark-examples/dataframes/)
35 | * [DNA Base Counting](./tutorial/dna-basecount/README.md)
36 | * [Classic Word Count](./tutorial/wordcount)
37 | * [Find Frequency of Bigrams](./tutorial/bigrams)
38 | * [Join of Two Relations R(K, V1), S(K, V2)](./tutorial/basic-join)
39 | * [Basic Mapping of RDD Elements](./tutorial/basic-map)
40 | * [How to add all RDD elements together](./tutorial/basic-sum)
41 | * [How to multiply all RDD elements together](./tutorial/basic-multiply)
42 | * [Find Top-N and Bottom-N](./tutorial/top-N)
43 | * [Find average by using combineByKey()](./tutorial/combine-by-key)
44 | * [How to filter RDD elements](./tutorial/basic-filter)
45 | * [How to find average](./tutorial/basic-average)
46 | * [Cartesian Product: rdd1.cartesian(rdd2)](./tutorial/cartesian)
47 | * [Sort By Key: sortByKey() ascending/descending](./tutorial/basic-sort)
48 | * [How to Add Indices](./tutorial/add-indices)
49 | * [Map Partitions: mapPartitions() by Examples](./tutorial/map-partitions/README.md)
50 | * [Monoid: Design Principle](https://github.com/mahmoudparsian/data-algorithms-with-spark/blob/master/wiki-spark/docs/monoid/README.md)
51 | * [Ranking Functions by Examples](./tutorial/ranking/README.md)
52 |
53 | ------
54 |
55 | # Books
56 |
57 | ### [Data Algorithms with Spark](https://github.com/mahmoudparsian/data-algorithms-with-spark/)
58 |
59 | ### [Data Algorithms](https://github.com/mahmoudparsian/data-algorithms-book/)
60 |
61 | ### [PySpark Algorithms](https://github.com/mahmoudparsian/pyspark-algorithms/)
62 |
63 | -----
64 |
65 | # Miscellaneous
66 |
67 | ### [Download, Install Spark and Run PySpark](./howto/download_install_run_spark.md)
68 |
69 | ### [How to Minimize the Verbosity of Spark](./howto/minimize_verbosity.md)
70 |
71 | -------
72 |
73 | # PySpark Tutorial and References...
74 | * [Getting started with PySpark - Part 1](http://www.mccarroll.net/blog/pyspark/)
75 | * [Getting started with PySpark - Part 2](http://www.mccarroll.net/blog/pyspark2/index.html)
76 | * [A really really fast introduction to PySpark](http://www.slideshare.net/hkarau/a-really-really-fast-introduction-to-py-spark-lightning-fast-cluster-computing-with-python-1)
77 | * [PySpark](http://www.slideshare.net/thegiivee/pysaprk?qid=81cf1b31-8b19-4570-89a5-21d03cad6ecd&v=default&b=&from_search=9)
78 | * [Basic Big Data Manipulation with PySpark](http://bigdatasciencebootcamp.com/posts/Part_3/basic_big_data.html)
79 | * [Working in Pyspark: Basics of Working with Data and RDDs](http://www.learnbymarketing.com/618/pyspark-rdd-basics-examples/)
80 |
81 | -------
82 |
83 | # Questions/Comments
84 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian)
85 | * Please send me an email: mahmoud.parsian@yahoo.com
86 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian)
87 |
88 | Thank you!
89 |
90 | ````
91 | best regards,
92 | Mahmoud Parsian
93 | ````
94 |
95 | -----
96 |
122 | ------
123 |
124 | [//]: # (metadata:)
125 | [//]: # (Spark, PySpark, Python)
126 | [//]: # (MapReduce, Distributed Algorithms, mappers, reducers, partitioners)
127 | [//]: # (Transformations, Actions, RDDs, DataFrames, SQL)
128 |
--------------------------------------------------------------------------------
/data/foxdata.txt:
--------------------------------------------------------------------------------
1 | red fox jumped high
2 | fox jumped over high fence
3 | red fox jumped
4 |
--------------------------------------------------------------------------------
/howto/README.md:
--------------------------------------------------------------------------------
1 | # PySpark Tutorial
2 |
3 | * Spark is a multi-language engine for executing data engineering,
4 | data science, and machine learning on single-node machines or clusters.
5 |
6 | * PySpark is the Python API for Spark.
7 |
8 | # Start PySpark
9 |
10 | First make sure that you have started the Spark cluster.
11 | To start Spark, execute the following. Note: if you
12 | are going to run the PySpark shell on your laptop/macbook,
13 | then you do not need to start any cluster -- your
14 | laptop/macbook acts as a single-node cluster:
15 |
16 | export SPARK_HOME=
17 | cd $SPARK_HOME
18 | ./sbin/start-all.sh
19 |
20 |
21 | To start PySpark, execute the following:
22 |
23 |
24 | cd $SPARK_HOME
25 | ./bin/pyspark
26 |
27 |
28 | Successful execution will give you the PySpark prompt:
29 |
30 |
31 | ~ % ./spark-3.3.0/bin/pyspark
32 | Python 3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)] on darwin
33 | Type "help", "copyright", "credits" or "license" for more information.
34 | Setting default log level to "WARN".
35 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
36 | Welcome to
37 | ____ __
38 | / __/__ ___ _____/ /__
39 | _\ \/ _ \/ _ `/ __/ '_/
40 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0
41 | /_/
42 |
43 | Using Python version 3.10.5 (v3.10.5:f377153967, Jun 6 2022 12:36:10)
44 | Spark context Web UI available at http://10.0.0.232:4040
45 | Spark context available as 'sc' (master = local[*], app id = local-1656268371486).
46 | SparkSession available as 'spark'.
47 | >>>
48 |
49 |
50 | Note that the shell has already created two objects:
51 | * a SparkContext (`sc`) object, which you may use to create RDDs.
52 | * a SparkSession (`spark`) object, which you may use to create DataFrames (see the sketch below).
53 |
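For example, a minimal sketch of creating a DataFrame from a Python list with the `spark` object (the column names `name` and `age` are made up for illustration):

    >>> rows = [("alex", 30), ("jane", 40), ("bob", 50)]
    >>> df = spark.createDataFrame(rows, ["name", "age"])
    >>> df.show()
    +----+---+
    |name|age|
    +----+---+
    |alex| 30|
    |jane| 40|
    | bob| 50|
    +----+---+
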
54 | # Creating RDDs
55 |
56 | You may create RDDs from:
57 | * text files,
58 | * Python collections and data structures,
59 | * the local file system,
60 | * S3 and HDFS,
61 | * and other data sources.
62 |
63 |
64 | ## Create RDD from a Data Structure (or Collection)
65 |
66 | * Example-1
67 |
68 | >>> data = [1, 2, 3, 4, 5, 8, 9]
69 | >>> data
70 | [1, 2, 3, 4, 5, 8, 9]
71 | >>> myRDD = sc.parallelize(data)
72 | >>> myRDD.collect()
73 | [1, 2, 3, 4, 5, 8, 9]
74 | >>> myRDD.count()
75 | 7
76 | >>>
77 |
78 |
79 | * Example-2
80 |
81 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)]
82 | >>> kv
83 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]
84 | >>> rdd2 = sc.parallelize(kv)
85 | >>> rdd2.collect()
86 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]
87 | >>>
88 | >>> rdd3 = rdd2.reduceByKey(lambda x, y : x+y)
89 | >>> rdd3.collect()
90 | [('a', 9), ('c', 10), ('b', 6)]
91 | >>>
92 |
93 |
94 | * Example-3
95 |
96 |
97 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)]
98 | >>> kv
99 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]
100 | >>> rdd2 = sc.parallelize(kv)
101 | >>> rdd2.collect()
102 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]
103 |
104 | >>> rdd3 = rdd2.groupByKey()
105 | >>> rdd3.collect()
106 | [
107 | ('a', <pyspark.resultiterable.ResultIterable object at 0x...>),
108 | ('c', <pyspark.resultiterable.ResultIterable object at 0x...>),
109 | ('b', <pyspark.resultiterable.ResultIterable object at 0x...>)
110 | ]
111 |
112 | >>> rdd3.map(lambda x : (x[0], list(x[1]))).collect()
113 | [
114 | ('a', [7, 2]),
115 | ('c', [1, 2, 3, 4]),
116 | ('b', [2, 4])
117 | ]
118 | >>>
119 |
120 |
121 |
122 | # Create RDD from a Local File System (Java Example)
123 |
124 | import org.apache.spark.api.java.JavaRDD;
125 | import org.apache.spark.api.java.JavaSparkContext;
126 | ...
127 | JavaSparkContext context = new JavaSparkContext();
128 | ...
129 | final String inputPath ="file:///dir1/dir2/myinputfile.txt";
130 | JavaRDD<String> rdd = context.textFile(inputPath);
131 | ...
132 |
133 |
134 | # Create RDD from HDFS (Java Example)
135 |
136 | * Example-1:
137 |
138 | import org.apache.spark.api.java.JavaRDD;
139 | import org.apache.spark.api.java.JavaSparkContext;
140 | ...
141 | JavaSparkContext context = new JavaSparkContext();
142 | ...
143 | final String inputPath ="hdfs://myhadoopserver:9000/dir1/dir2/myinputfile.txt";
144 | JavaRDD<String> rdd = context.textFile(inputPath);
145 | ...
146 |
147 | * Example-2:
148 |
149 |
150 | import org.apache.spark.api.java.JavaRDD;
151 | import org.apache.spark.api.java.JavaSparkContext;
152 | ...
153 | JavaSparkContext context = new JavaSparkContext();
154 | ...
155 | final String inputPath ="/dir1/dir2/myinputfile.txt";
156 | JavaRDD<String> rdd = context.textFile(inputPath);
157 | ...
158 |
159 |
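# Create RDD from a Text File (PySpark Example)

For completeness, the PySpark equivalent of the Java examples above is a one-liner
with `sc.textFile()` (a minimal sketch; the paths are the same placeholder paths
used above):

    >>> # from the local file system
    >>> rdd = sc.textFile("file:///dir1/dir2/myinputfile.txt")

    >>> # from HDFS
    >>> rdd = sc.textFile("hdfs://myhadoopserver:9000/dir1/dir2/myinputfile.txt")
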
160 | # Questions/Comments
161 |
162 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian)
163 | * Please send me an email: mahmoud.parsian@yahoo.com
164 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian)
165 |
166 |
167 | Thank you!
168 |
169 | ````
170 | best regards,
171 | Mahmoud Parsian
172 | ````
173 |
174 | -----
175 |
176 |
177 |
178 |
182 |
183 |
184 |
188 |
189 |
190 |
194 |
195 |
196 |
200 |
--------------------------------------------------------------------------------
/howto/download_install_run_spark.md:
--------------------------------------------------------------------------------
1 | # Download, Install, and Run PySpark
2 |
3 | # 1. For macbook users: Enable "Remote Login"
4 |
5 |
6 | System Preferences --> Sharing --> enable "Remote Login" service
7 |
8 |
9 |
10 | # 2. Make Sure Java 8 is Installed Properly
11 |
12 | java -version
13 | java version "1.8.0_72"
14 | Java(TM) SE Runtime Environment (build 1.8.0_72-b15)
15 | Java HotSpot(TM) 64-Bit Server VM (build 25.72-b15, mixed mode)
16 |
17 |
18 | # 3. Download
19 |
20 | Download the latest binary Spark from the following URL:
21 |
22 | https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
23 |
24 |
25 | # 4. Open the Downloaded File
26 |
27 | Assuming that I have downloaded my file in
28 | `/home/mparsian/spark-3.3.0-bin-hadoop3.tgz`
29 |
30 |
31 | cd /home/mparsian
32 |
33 | tar zvfx spark-3.3.0-bin-hadoop3.tgz
34 | x spark-3.3.0-bin-hadoop3/
35 | x spark-3.3.0-bin-hadoop3/NOTICE
36 | x spark-3.3.0-bin-hadoop3/CHANGES.txt
37 | ...
38 |
39 |
40 | # 5. Start the Spark Cluster
41 |
42 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/
43 |
44 | ./sbin/start-all.sh
45 |
46 | NOTE: If you are going to run Spark on your pc/macbook/windows machine,
47 | then you do NOT need to start a cluster at all. When you invoke
48 | ./bin/pyspark, your laptop acts as a single-node cluster.
49 |
50 |
51 | # 6. Check Master and Worker
52 |
53 | Make sure that Master and Worker processes are running:
54 |
55 |
56 | jps
57 | 1347 Master
58 | 1390 Worker
59 |
60 |
61 | # 7. Check The Spark URL
62 |
63 | http://localhost:8080
64 |
65 |
66 | # 8. Define Very Basic Python Program
67 |
68 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test.py`
69 |
70 | #!/usr/bin/python
71 | import sys
72 |
73 | for line in sys.stdin:
74 |     print("hello " + line)
75 |
76 |
77 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py`
78 |
79 | #!/usr/bin/python
80 | def fun2(str):
81 | str2 = str + " zaza"
82 | return str2
83 |
84 |
85 | # 9. Start and Run pyspark
86 |
87 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/
88 | ./bin/pyspark
89 | ...
90 | ...
91 | Welcome to
92 | ____ __
93 | / __/__ ___ _____/ /__
94 | _\ \/ _ \/ _ `/ __/ '_/
95 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0
96 | /_/
97 |
98 | >>> data = ["john","paul","george","ringo"]
99 | >>> data
100 | ['john', 'paul', 'george', 'ringo']
101 |
102 | >>> rdd = sc.parallelize(data)
103 | >>> rdd.collect()
104 | ['john', 'paul', 'george', 'ringo']
105 |
106 |
107 | >>> test = "/home/mparsian/spark-3.3.0-bin-hadoop3/test.py"
108 | >>> test2 = "/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py"
109 | >>> import test
110 | >>> import test2
111 |
112 |
113 | >>> pipeRDD = rdd.pipe(test)
114 | >>> pipeRDD.collect()
115 | [u'hello john', u'', u'hello paul', u'', u'hello george', u'', u'hello ringo', u'']
116 |
117 |
118 | >>> rdd.collect()
119 | ['john', 'paul', 'george', 'ringo']
120 |
121 |
122 | >>> rdd2 = rdd.map(lambda x : test2.fun2(x))
123 | >>> rdd2.collect()
124 | ['john zaza', 'paul zaza', 'george zaza', 'ringo zaza']
125 | >>>
126 |
127 |
--------------------------------------------------------------------------------
/howto/minimize_verbosity.md:
--------------------------------------------------------------------------------
1 | How to Minimize the Verbosity of Spark
2 | ======================================
3 | * Step-1: create a log4j.properties file
4 | ````
5 | cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties
6 | ````
7 | * Step-2: Edit $SPARK_HOME/conf/log4j.properties file: replace "INFO" with "WARN"
8 |
9 | * Now your file should look like:
10 | ````
11 | cat $SPARK_HOME/conf/log4j.properties
12 | # Set everything to be logged to the console
13 | log4j.rootCategory=WARN, console
14 | log4j.appender.console=org.apache.log4j.ConsoleAppender
15 | log4j.appender.console.target=System.err
16 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
17 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
18 |
19 | # Settings to quiet third party logs that are too verbose
20 | log4j.logger.org.eclipse.jetty=WARN
21 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
22 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
23 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN
24 | ````
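* Alternative (per session): you can also lower the log level from inside the
  PySpark shell or program, without editing any configuration file; a minimal sketch:
````
>>> sc.setLogLevel("WARN")   # or "ERROR" for even less output
````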
--------------------------------------------------------------------------------
/images/Data-Algorithms-with-Spark_mech2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.pdf
--------------------------------------------------------------------------------
/images/Data-Algorithms-with-Spark_mech2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.png
--------------------------------------------------------------------------------
/images/Data_Algorithms_with_Spark_COVER_9781492082385.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data_Algorithms_with_Spark_COVER_9781492082385.png
--------------------------------------------------------------------------------
/images/data_algorithms_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_image.jpg
--------------------------------------------------------------------------------
/images/data_algorithms_with_spark.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_with_spark.jpg
--------------------------------------------------------------------------------
/images/pyspark_algorithms2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/pyspark_algorithms2.jpg
--------------------------------------------------------------------------------
/tutorial/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/.DS_Store
--------------------------------------------------------------------------------
/tutorial/add-indices/add-indices.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Welcome to
3 | ____ __
4 | / __/__ ___ _____/ /__
5 | _\ \/ _ \/ _ `/ __/ '_/
6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0
7 | /_/
8 |
9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
10 | SparkContext available as sc, SQLContext available as sqlContext.
11 | >>> a = [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]
12 | >>> a
13 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]
14 |
15 | >>> rdd = sc.parallelize(a);
16 | >>> rdd.collect()
17 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]
18 |
19 | >>> sorted = rdd.sortByKey()
20 | >>> sorted.collect()
21 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]
22 |
23 |
24 | >>> rdd2 = rdd.map(lambda (x,y) : (y,x))
25 | >>> rdd2.collect()
26 | [(2, 'g1'), (4, 'g2'), (3, 'g3'), (8, 'g4')]
27 |
28 | >>> sorted = rdd2.sortByKey()
29 | >>> sorted.collect()
30 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')]
31 |
32 |
33 | >>> sorted = rdd2.sortByKey(False)
34 | >>> sorted.collect()
35 | [(8, 'g4'), (4, 'g2'), (3, 'g3'), (2, 'g1')]
36 |
37 | >>> sorted = rdd2.sortByKey()
38 | >>> sorted.collect()
39 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')]
40 | >>>
41 | >>> list
42 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')]
43 |
44 | >>>
45 | >>> sorted.collect()
46 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')]
47 |
48 | >>> indices = sorted.zipWithIndex()
49 | >>> indices.collect()
50 | [((2, 'g1'), 0), ((3, 'g3'), 1), ((4, 'g2'), 2), ((8, 'g4'), 3)]
51 | >>>
--------------------------------------------------------------------------------
/tutorial/basic-average/basic-average.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
4 | Welcome to
5 | ____ __
6 | / __/__ ___ _____/ /__
7 | _\ \/ _ \/ _ `/ __/ '_/
8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
9 | /_/
10 |
11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
12 | SparkContext available as sc.
13 | >>> sc
14 | <pyspark.context.SparkContext object at 0x...>
15 | >>>
16 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20])
17 | >>> nums.collect()
18 | [1, 2, 3, 4, 5, 6, 7, 8, 20]
19 | >>> sumAndCount = nums.map(lambda x: (x, 1)).fold((0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1])))
20 | >>> sumAndCount
21 | (56, 9)
22 | >>>
23 | >>> avg = float(sumAndCount[0]) / float(sumAndCount[1])
24 | >>> avg
25 | 6.2222222222222223
26 | >>>
27 |
--------------------------------------------------------------------------------
/tutorial/basic-filter/basic-filter.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
4 | Welcome to
5 | ____ __
6 | / __/__ ___ _____/ /__
7 | _\ \/ _ \/ _ `/ __/ '_/
8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
9 | /_/
10 |
11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
12 | SparkContext available as sc.
13 | >>> sc
14 | <pyspark.context.SparkContext object at 0x...>
15 |
16 | >>>
17 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7])
18 | >>> nums.collect()
19 | [1, 2, 3, 4, 5, 6, 7]
20 |
21 | >>> filtered1 = nums.filter(lambda x : x % 2 == 1)
22 | >>> filtered1.collect()
23 | [1, 3, 5, 7]
24 | >>>
25 | >>> filtered2 = nums.filter(lambda x : x % 2 == 0)
26 | >>> filtered2.collect()
27 | [2, 4, 6]
28 | >>>
29 |
--------------------------------------------------------------------------------
/tutorial/basic-join/basicjoin.txt:
--------------------------------------------------------------------------------
1 | # cat > R.txt
2 | k1,v1
3 | k1,v2
4 | k2,v3
5 | k2,v4
6 | k3,v7
7 | k3,v8
8 | k3,v9
9 |
10 | # cat > S.txt
11 | k1,v11
12 | k1,v22
13 | k1,v33
14 | k2,v55
15 | k4,v77
16 | k5,v88
17 |
18 | # ./pyspark
19 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
20 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
21 | Type "help", "copyright", "credits" or "license" for more information.
22 | Welcome to
23 | ____ __
24 | / __/__ ___ _____/ /__
25 | _\ \/ _ \/ _ `/ __/ '_/
26 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
27 | /_/
28 |
29 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
30 | SparkContext available as sc.
31 | >>> R = sc.textFile("R.txt");
32 | >>> R.collect()
33 | [u'k1,v1',
34 | u'k1,v2',
35 | u'k2,v3',
36 | u'k2,v4',
37 | u'k3,v7',
38 | u'k3,v8',
39 | u'k3,v9']
40 |
41 | >>> S = sc.textFile("S.txt");
42 | >>> S.collect()
43 | [u'k1,v11',
44 | u'k1,v22',
45 | u'k1,v33',
46 | u'k2,v55',
47 | u'k4,v77',
48 | u'k5,v88'
49 | ]
50 |
51 | >>> r1 = R.map(lambda s: s.split(","))
52 | >>> r1.collect()
53 | [
54 | [u'k1', u'v1'],
55 | [u'k1', u'v2'],
56 | [u'k2', u'v3'],
57 | [u'k2', u'v4'],
58 | [u'k3', u'v7'],
59 | [u'k3', u'v8'],
60 | [u'k3', u'v9']
61 | ]
62 | >>> r2 = r1.flatMap(lambda s: [(s[0], s[1])])
63 | >>> r2.collect()
64 | [
65 | (u'k1', u'v1'),
66 | (u'k1', u'v2'),
67 | (u'k2', u'v3'),
68 | (u'k2', u'v4'),
69 | (u'k3', u'v7'),
70 | (u'k3', u'v8'),
71 | (u'k3', u'v9')
72 | ]
73 | >>>
74 | >>> s1 = S.map(lambda s: s.split(","))
75 | >>> s1.collect()
76 | [
77 | [u'k1', u'v11'],
78 | [u'k1', u'v22'],
79 | [u'k1', u'v33'],
80 | [u'k2', u'v55'],
81 | [u'k4', u'v77'],
82 | [u'k5', u'v88']
83 | ]
84 | >>> s2 = s1.flatMap(lambda s: [(s[0], s[1])])
85 | >>> s2.collect()
86 | [
87 | (u'k1', u'v11'),
88 | (u'k1', u'v22'),
89 | (u'k1', u'v33'),
90 | (u'k2', u'v55'),
91 | (u'k4', u'v77'),
92 | (u'k5', u'v88')
93 | ]
94 | >>> RjoinedS = r2.join(s2)
95 | >>> RjoinedS.collect()
96 | [
97 | (u'k2', (u'v3', u'v55')),
98 | (u'k2', (u'v4', u'v55')),
99 | (u'k1', (u'v1', u'v11')),
100 | (u'k1', (u'v1', u'v22')),
101 | (u'k1', (u'v1', u'v33')),
102 | (u'k1', (u'v2', u'v11')),
103 | (u'k1', (u'v2', u'v22')),
104 | (u'k1', (u'v2', u'v33'))
105 | ]
106 | >>>
--------------------------------------------------------------------------------
/tutorial/basic-map/basic-map.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | Welcome to
4 | ____ __
5 | / __/__ ___ _____/ /__
6 | _\ \/ _ \/ _ `/ __/ '_/
7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
8 | /_/
9 |
10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
11 | SparkContext available as sc.
12 | >>> sc
13 | <pyspark.context.SparkContext object at 0x...>
14 | >>>
15 | >>> nums = sc.parallelize([1, 2, 3, 4, 5])
16 | >>> nums.collect()
17 | [1, 2, 3, 4, 5]
18 | >>>
19 | >>> bytwo = nums.map(lambda x: x + 2)
20 | >>> bytwo.collect()
21 | [3, 4, 5, 6, 7]
22 | >>>
23 | >>> squared = nums.map(lambda x: x * x)
24 | >>> squared.collect()
25 | [1, 4, 9, 16, 25]
26 | >>>
27 |
--------------------------------------------------------------------------------
/tutorial/basic-multiply/basic-multiply.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | Welcome to
6 | ____ __
7 | / __/__ ___ _____/ /__
8 | _\ \/ _ \/ _ `/ __/ '_/
9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
10 | /_/
11 |
12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
13 | SparkContext available as sc.
14 | >>> sc
15 | <pyspark.context.SparkContext object at 0x...>
16 | >>> numbers = sc.parallelize([1, 2, 3, 4])
17 | >>> mult = numbers.fold(1, (lambda x, y: x * y))
18 |
19 | >>> mult
20 | 24
21 |
--------------------------------------------------------------------------------
/tutorial/basic-sort/sort-by-key.txt:
--------------------------------------------------------------------------------
1 | # cat data.txt
2 | crazy crazy fox jumped
3 | crazy fox jumped
4 | fox is fast
5 | fox is smart
6 | dog is smart
7 |
8 | # ./bin/pyspark
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0
14 | /_/
15 |
16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
17 | SparkContext available as sc, SQLContext available as sqlContext.
18 | >>>
19 | >>> lines = sc.textFile('data.txt', 1);
20 | >>> lines.collect()
21 | [
22 | u'crazy crazy fox jumped',
23 | u'crazy fox jumped',
24 | u'fox is fast',
25 | u'fox is smart',
26 | u'dog is smart'
27 | ]
28 |
29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
30 | >>> frequencies.collect()
31 | [
32 | (u'crazy', 3),
33 | (u'jumped', 2),
34 | (u'is', 3),
35 | (u'fox', 4),
36 | (u'dog', 1),
37 | (u'fast', 1),
38 | (u'smart', 2)
39 | ]
40 |
41 | >>> frequencies.count()
42 | 7
43 |
44 | >>> sorted = frequencies.sortByKey()
45 | >>> sorted.collect()
46 | [
47 | (u'crazy', 3),
48 | (u'dog', 1),
49 | (u'fast', 1),
50 | (u'fox', 4),
51 | (u'is', 3),
52 | (u'jumped', 2),
53 | (u'smart', 2)
54 | ]
55 | >>>
56 | >>> sortedDescending = frequencies.sortByKey(False)
57 | >>> sortedDescending.collect()
58 | [
59 | (u'smart', 2),
60 | (u'jumped', 2),
61 | (u'is', 3),
62 | (u'fox', 4),
63 | (u'fast', 1),
64 | (u'dog', 1),
65 | (u'crazy', 3)
66 | ]
67 |
--------------------------------------------------------------------------------
/tutorial/basic-sum/basic-sum.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | Welcome to
6 | ____ __
7 | / __/__ ___ _____/ /__
8 | _\ \/ _ \/ _ `/ __/ '_/
9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
10 | /_/
11 |
12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
13 | SparkContext available as sc.
14 | >>> sc
15 | <pyspark.context.SparkContext object at 0x...>
16 | >>> numbers = sc.parallelize([1, 2, 3, 4])
17 | >>> sum = numbers.fold(0, (lambda x, y: x + y))
18 |
19 | >>> sum
20 | 10
21 |
--------------------------------------------------------------------------------
/tutorial/basic-union/basic-union.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Welcome to
3 | ____ __
4 | / __/__ ___ _____/ /__
5 | _\ \/ _ \/ _ `/ __/ '_/
6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0
7 | /_/
8 |
9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
10 | SparkContext available as sc, SQLContext available as sqlContext.
11 |
12 | >>> d1= [('k1', 1), ('k2', 2), ('k3', 5)]
13 | >>> d1
14 | [('k1', 1), ('k2', 2), ('k3', 5)]
15 |
16 | >>> d2= [('k1', 3), ('k2',4), ('k4', 8)]
17 | >>> d2
18 | [('k1', 3), ('k2', 4), ('k4', 8)]
19 |
20 | >>> rdd1 = sc.parallelize(d1)
21 | >>> rdd1.collect()
22 | [('k1', 1), ('k2', 2), ('k3', 5)]
23 |
24 | >>> rdd2 = sc.parallelize(d2)
25 | >>> rdd2.collect();
26 | [('k1', 3), ('k2', 4), ('k4', 8)]
27 |
28 | >>> rdd3 = rdd1.union(rdd2)
29 | >>> rdd3.collect()
30 | [('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)]
31 |
32 | >>> rdd4 = rdd3.reduceByKey(lambda x,y: x+y)
33 | >>> rdd4.collect()
34 | [('k3', 5), ('k2', 6), ('k1', 4), ('k4', 8)]
--------------------------------------------------------------------------------
/tutorial/bigrams/bigrams.txt:
--------------------------------------------------------------------------------
1 | 1. Prepare Input
2 |
3 | # cat data.txt
4 | crazy crazy fox jumped over the fence
5 | crazy fox jumped
6 | the fence is high for fox
7 | crazy fox is smart
8 | fox jumped very high
9 |
10 | 2. Invoke pyspark
11 |
12 | # export SPARK_HOME=...
13 | # SPARK_HOME/bin/pyspark
14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
16 | Type "help", "copyright", "credits" or "license" for more information.
17 | Welcome to
18 | ____ __
19 | / __/__ ___ _____/ /__
20 | _\ \/ _ \/ _ `/ __/ '_/
21 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
22 | /_/
23 |
24 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
25 | SparkContext available as sc.
26 | >>> sc
27 | <pyspark.context.SparkContext object at 0x...>
28 | >>> lines = sc.textFile("data.txt")
29 | >>> lines.collect()
30 |
31 | [u'crazy crazy fox jumped over the fence',
32 | u'crazy fox jumped',
33 | u'the fence is high for fox',
34 | u'crazy fox is smart',
35 | u'fox jumped very high'
36 | ]
37 | >>> bigrams = lines.map(lambda s : s.split(" ")).flatMap(lambda s: [((s[i],s[i+1]),1) for i in range (0, len(s)-1)])
38 | >>> bigrams.collect()
39 | [((u'crazy', u'crazy'), 1),
40 | ((u'crazy', u'fox'), 1),
41 | ((u'fox', u'jumped'), 1),
42 | ((u'jumped', u'over'), 1),
43 | ((u'over', u'the'), 1),
44 | ((u'the', u'fence'), 1),
45 | ((u'crazy', u'fox'), 1),
46 | ((u'fox', u'jumped'), 1),
47 | ((u'the', u'fence'), 1),
48 | ((u'fence', u'is'), 1),
49 | ((u'is', u'high'), 1),
50 | ((u'high', u'for'), 1),
51 | ((u'for', u'fox'), 1),
52 | ((u'crazy', u'fox'), 1),
53 | ((u'fox', u'is'), 1),
54 | ((u'is', u'smart'), 1),
55 | ((u'fox', u'jumped'), 1),
56 | ((u'jumped', u'very'), 1),
57 | ((u'very', u'high'), 1)
58 | ]
59 | >>>
60 | >>> counts = bigrams.reduceByKey(lambda x, y : x+y)
61 | >>> counts.collect()
62 | [
63 | ((u'high', u'for'), 1),
64 | ((u'fox', u'is'), 1),
65 | ((u'is', u'smart'), 1),
66 | ((u'is', u'high'), 1),
67 | ((u'fence', u'is'), 1),
68 | ((u'very', u'high'), 1),
69 | ((u'crazy', u'fox'), 3),
70 | ((u'over', u'the'), 1),
71 | ((u'for', u'fox'), 1),
72 | ((u'the', u'fence'), 2),
73 | ((u'crazy', u'crazy'), 1),
74 | ((u'jumped', u'over'), 1),
75 | ((u'jumped', u'very'), 1),
76 | ((u'fox', u'jumped'), 3)
77 | ]
78 |
--------------------------------------------------------------------------------
/tutorial/cartesian/cartesian.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | ...
4 | Welcome to
5 | ____ __
6 | / __/__ ___ _____/ /__
7 | _\ \/ _ \/ _ `/ __/ '_/
8 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0
9 | /_/
10 |
11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
12 | SparkContext available as sc, SQLContext available as sqlCtx.
13 | >>> a = [('k1','v1'), ('k2', 'v2')]
14 | >>> a
15 | [('k1', 'v1'), ('k2', 'v2')]
16 | >>> b = [('k3','v3'), ('k4', 'v4'), ('k5', 'v5') ]
17 | >>> b
18 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')]
19 | >>> rdd1= sc.parallelize(a)
20 | >>> rdd1.collect()
21 | [('k1', 'v1'), ('k2', 'v2')]
22 | >>> rdd2= sc.parallelize(b)
23 | >>> rdd2.collect()
24 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')]
25 | >>> rdd3 = rdd1.cartesian(rdd2)
26 | >>> rdd3.collect()
27 | [
28 | (('k1', 'v1'), ('k3', 'v3')),
29 | (('k1', 'v1'), ('k4', 'v4')),
30 | (('k1', 'v1'), ('k5', 'v5')),
31 | (('k2', 'v2'), ('k3', 'v3')),
32 | (('k2', 'v2'), ('k4', 'v4')),
33 | (('k2', 'v2'), ('k5', 'v5'))
34 | ]
35 | >>>
36 |
--------------------------------------------------------------------------------
/tutorial/combine-by-key/README.md:
--------------------------------------------------------------------------------
1 | Spark's combineByKey() Examples and Tutorial
2 | ============================================
3 |
4 | * [Mean Calculation by combineByKey()](./spark-combineByKey.md)
5 | * [Standard Deviation and Mean Calculation by combineByKey()](./standard_deviation_by_combineByKey.md)
6 |
7 |
8 | [Data Algorithms book](http://shop.oreilly.com/product/0636920033950.do)
9 |
--------------------------------------------------------------------------------
/tutorial/combine-by-key/combine-by-key.txt:
--------------------------------------------------------------------------------
1 | # export SPARK_HOME=...
2 | # SPARK_HOME/bin/pyspark
3 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
4 | Welcome to
5 | ____ __
6 | / __/__ ___ _____/ /__
7 | _\ \/ _ \/ _ `/ __/ '_/
8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
9 | /_/
10 |
11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
12 | SparkContext available as sc.
13 | >>> sc
14 | <pyspark.context.SparkContext object at 0x...>
15 |
16 | >>> input = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5),
17 | ("k2", 6), ("k2", 7), ("k2", 8),
18 | ("k3", 10), ("k3", 12)]
19 | >>> rdd = sc.parallelize(input)
20 | >>> sumCount = rdd.combineByKey(
21 | (lambda x: (x, 1)),
22 | (lambda x, y: (x[0] + y, x[1] + 1)),
23 | (lambda x, y: (x[0] + y[0], x[1] + y[1]))
24 | )
25 | >>> sumCount.collect()
26 | [('k3', (22, 2)), ('k2', (21, 3)), ('k1', (15, 5))]
27 | >>>
28 | >>> avg = sumCount.mapValues( lambda v : v[0] / v[1])
29 | >>> avg.collect()
30 | [('k3', 11), ('k2', 7), ('k1', 3)]
31 | >>>
--------------------------------------------------------------------------------
/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf
--------------------------------------------------------------------------------
/tutorial/combine-by-key/standard_deviation_by_combineByKey.md:
--------------------------------------------------------------------------------
1 | Mean and Standard Deviation by Spark's combineByKey()
2 | =====================================================
3 |
4 | ````
5 | # ./bin/pyspark
6 | Python 2.7.10 (default, Oct 23 2015, 19:19:21)
7 | ...
8 | Welcome to
9 | ____ __
10 | / __/__ ___ _____/ /__
11 | _\ \/ _ \/ _ `/ __/ '_/
12 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1
13 | /_/
14 |
15 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21)
16 | SparkContext available as sc, HiveContext available as sqlContext.
17 | >>> data = [
18 | ... ("A", 2.), ("A", 4.), ("A", 9.),
19 | ... ("B", 10.), ("B", 20.),
20 | ... ("Z", 3.), ("Z", 5.), ("Z", 8.), ("Z", 12.)
21 | ... ]
22 | >>> data
23 | [
24 | ('A', 2.0),
25 | ('A', 4.0),
26 | ('A', 9.0),
27 | ('B', 10.0),
28 | ('B', 20.0),
29 | ('Z', 3.0),
30 | ('Z', 5.0),
31 | ('Z', 8.0),
32 | ('Z', 12.0)
33 | ]
34 | >>> rdd = sc.parallelize( data )
35 | >>> rdd.collect()
36 | [
37 | ('A', 2.0),
38 | ('A', 4.0),
39 | ('A', 9.0),
40 | ('B', 10.0),
41 | ('B', 20.0),
42 | ('Z', 3.0),
43 | ('Z', 5.0),
44 | ('Z', 8.0),
45 | ('Z', 12.0)
46 | ]
47 | >>> rdd.count()
48 | 9
49 | >>> sumCount = rdd.combineByKey(lambda value: (value, value*value, 1),
50 | ... lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1),
51 | ... lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])
52 | ... )
53 |
54 | >>> sumCount.collect()
55 | [
56 | ('A', (15.0, 101.0, 3)),
57 | ('Z', (28.0, 242.0, 4)),
58 | ('B', (30.0, 500.0, 2))
59 | ]
60 |
61 | >>> import math
62 | >>> def stdDev( sumX, sumSquared, n ):
63 | ... mean = sumX / n
64 | ... stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n)
65 | ... return (mean, stdDeviation)
66 | ... ^D
67 |
68 | >>> meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2]))
69 | >>> meanAndStdDev.collect()
70 | [
71 | ('A', (5.0, 2.943920288775949)),
72 | ('Z', (7.0, 3.391164991562634)),
73 | ('B', (15.0, 5.0))
74 | ]
75 | >>>
76 | ````
--------------------------------------------------------------------------------
/tutorial/dna-basecount/README.md:
--------------------------------------------------------------------------------
1 | DNA Base Counting
2 | =================
3 |
4 | The following examples demonstrate the usage of PySpark to count DNA bases.
5 | In a nutshell, ````DNA Base Counting```` counts the number of A's, T's, C's, G's,
6 | and N's (where N denotes an undefined base).
7 |
8 |
9 | * [DNA Base Counting Without In-Mapper Combiner](./dna-basecount.md)
10 |
11 | * [DNA Base Counting With In-Mapper Combiner](./dna-basecount2.md)
12 |
13 | * [DNA Base Counting With External Python Function](./dna-basecount3.md)
14 |
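As a quick preview, here is a minimal sketch of the idea in the PySpark shell
(assuming `sc` is available; the linked pages above walk through complete sessions):

````
>>> seqs = sc.parallelize(["ATATCCCCGGGAT", "ATCGATCGATAT"])
>>> base_counts = seqs.flatMap(lambda seq: [(base, 1) for base in seq]) \
...                   .reduceByKey(lambda x, y: x + y)
>>> base_counts.collect()
[('A', 7), ('C', 6), ('G', 5), ('T', 7)]
````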
15 |
16 | [Data Algorithms book](http://shop.oreilly.com/product/0636920033950.do)
17 |
--------------------------------------------------------------------------------
/tutorial/dna-basecount/basemapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | def mapper(seq):
4 | freq = dict()
5 | for x in list(seq):
6 | if x in freq:
7 | freq[x] +=1
8 | else:
9 | freq[x] = 1
10 | #
11 | kv = [(x, freq[x]) for x in freq]
12 | return kv
13 | #
14 | #print mapper("ATCGATCGATAT")
15 |
--------------------------------------------------------------------------------
/tutorial/dna-basecount/dna-basecount.md:
--------------------------------------------------------------------------------
1 | DNA Base Counting using PySpark
2 | ===============================
3 |
4 | DNA Base Count Definition
5 | -------------------------
6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html)
7 |
8 | Solution in PySpark
9 | -------------------
10 | This solution assumes that each record is a DNA sequence.
11 | This solution emits a ````(base, 1)```` for every base in
12 | a given sequence and then aggregates all frequencies for
13 | unique bases.
14 |
15 |
16 | ````
17 | $ cat /home/mparsian/dna_seq.txt
18 | ATATCCCCGGGAT
19 | ATCGATCGATAT
20 |
21 |
22 | # ./bin/pyspark
23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39)
24 | Welcome to
25 | ____ __
26 | / __/__ ___ _____/ /__
27 | _\ \/ _ \/ _ `/ __/ '_/
28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0
29 | /_/
30 |
31 | SparkContext available as sc, HiveContext available as sqlContext.
32 | >>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt')
33 |
34 | >>> recs.collect()
35 | [
36 | u'ATATCCCCGGGAT',
37 | u'ATCGATCGATAT'
38 | ]
39 |
40 | >>> rdd = recs.flatMap(lambda x : [(c,1) for c in list(x)])
41 | >>> rdd.collect()
42 | [
43 | (u'A', 1),
44 | (u'T', 1),
45 | (u'A', 1),
46 | (u'T', 1),
47 | (u'C', 1),
48 | (u'C', 1),
49 | (u'C', 1),
50 | (u'C', 1),
51 | (u'G', 1),
52 | (u'G', 1),
53 | (u'G', 1),
54 | (u'A', 1),
55 | (u'T', 1),
56 | (u'A', 1),
57 | (u'T', 1),
58 | (u'C', 1),
59 | (u'G', 1),
60 | (u'A', 1),
61 | (u'T', 1),
62 | (u'C', 1),
63 | (u'G', 1),
64 | (u'A', 1),
65 | (u'T', 1),
66 | (u'A', 1),
67 | (u'T', 1)
68 | ]
69 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y)
70 | >>> baseCount.collect()
71 | [
72 | (u'A', 7),
73 | (u'C', 6),
74 | (u'G', 5),
75 | (u'T', 7)
76 | ]
77 | >>>
78 | ````
79 |
80 |
81 |
--------------------------------------------------------------------------------
/tutorial/dna-basecount/dna-basecount2.md:
--------------------------------------------------------------------------------
1 | DNA Base Counting using PySpark Using In-Mapper Combiner
2 | ========================================================
3 |
4 | DNA Base Count Definition
5 | -------------------------
6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html)
7 |
8 | Solution in PySpark
9 | -------------------
10 | This solution assumes that each record is a DNA sequence.
11 | This solution uses "In-Mapper Combiner" design pattern
12 | and aggregates bases for each sequence before full
13 | aggregation of all frequencies for unique bases.
14 |
15 |
16 | ````
17 | $ cat /home/mparsian/dna_seq.txt
18 | ATATCCCCGGGAT
19 | ATCGATCGATAT
20 |
21 |
22 | # ./bin/pyspark
23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39)
24 | Welcome to
25 | ____ __
26 | / __/__ ___ _____/ /__
27 | _\ \/ _ \/ _ `/ __/ '_/
28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0
29 | /_/
30 |
31 | SparkContext available as sc, HiveContext available as sqlContext.
32 | >>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt')
33 |
34 | >>> recs.collect()
35 | [
36 | u'ATATCCCCGGGAT',
37 | u'ATCGATCGATAT'
38 | ]
39 |
40 | >>> def mapper(seq):
41 | ... freq = dict()
42 | ... for x in list(seq):
43 | ... if x in freq:
44 | ... freq[x] +=1
45 | ... else:
46 | ... freq[x] = 1
47 | ... #
48 | ... kv = [(x, freq[x]) for x in freq]
49 | ... return kv
50 | ... ^D
51 |
52 |
53 | >>> rdd = recs.flatMap(mapper)
54 | >>> rdd.collect()
55 | [
56 | (u'A', 3),
57 | (u'C', 4),
58 | (u'T', 3),
59 | (u'G', 3),
60 | (u'A', 4),
61 | (u'C', 2),
62 | (u'T', 4),
63 | (u'G', 2)
64 | ]
65 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y)
66 | >>> baseCount.collect()
67 | [
68 | (u'A', 7),
69 | (u'C', 6),
70 | (u'G', 5),
71 | (u'T', 7)
72 | ]
73 | >>>
74 | ````
75 |
76 |
77 |
--------------------------------------------------------------------------------
/tutorial/dna-basecount/dna-basecount3.md:
--------------------------------------------------------------------------------
1 | DNA Base Counting using PySpark
2 | ===============================
3 |
4 | DNA Base Count Definition
5 | -------------------------
6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html)
7 |
8 | Solution in PySpark
9 | -------------------
10 | This solution assumes that each record is a DNA sequence.
11 | This solution emits a ````(base, 1)```` for every base in
12 | a given sequence and then aggregates all frequencies for
13 | unique bases. For this solution we use an external Python
14 | function defined in ````basemapper.py````
15 |
16 | * Define Python Function
17 |
18 | ````
19 | $ export SPARK_HOME=/home/mparsian/spark-1.6.1-bin-hadoop2.6
20 | $ cat $SPARK_HOME/basemapper.py
21 | #!/usr/bin/python
22 |
23 | def mapper(seq):
24 | freq = dict()
25 | for x in list(seq):
26 | if x in freq:
27 | freq[x] +=1
28 | else:
29 | freq[x] = 1
30 | #
31 | kv = [(x, freq[x]) for x in freq]
32 | return kv
33 | #
34 | #for testing:
35 | #print mapper("ATCGATCGATAT")
36 | ````
37 | * Define Very Basic Sample Input
38 |
39 | ````
40 | $ cat /home/mparsian/dna_seq.txt
41 | ATATCCCCGGGAT
42 | ATCGATCGATAT
43 | ````
44 |
45 | * Sample PySpark Run
46 |
47 | ````
48 | # ./bin/pyspark
49 | Welcome to
50 | ____ __
51 | / __/__ ___ _____/ /__
52 | _\ \/ _ \/ _ `/ __/ '_/
53 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1
54 | /_/
55 |
56 | SparkContext available as sc, HiveContext available as sqlContext.
57 | >>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt')
58 |
59 | >>> recs.collect()
60 | [
61 | u'ATATCCCCGGGAT',
62 | u'ATCGATCGATAT'
63 | ]
64 |
65 | >>> basemapper = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/basemapper.py"
66 | >>> import basemapper
67 | >>> basemapper
68 | <module 'basemapper' from '/Users/mparsian/spark-1.6.1-bin-hadoop2.6/basemapper.py'>
69 | >>>
70 | >>> recs = sc.textFile('file:////Users/mparsian/zmp/github/pyspark-tutorial/tutorial/dna-basecount/dna_seq.txt')
71 | >>> rdd = recs.flatMap(basemapper.mapper)
72 | >>> rdd.collect()
73 | [(u'A', 3), (u'C', 4), (u'T', 3), (u'G', 3), (u'A', 4), (u'C', 2), (u'T', 4), (u'G', 2)]
74 |
75 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y)
76 | >>> baseCount.collect()
77 | [(u'A', 7), (u'C', 6), (u'G', 5), (u'T', 7)]
78 | >>>
79 | ````
--------------------------------------------------------------------------------
/tutorial/dna-basecount/dna_seq.txt:
--------------------------------------------------------------------------------
1 | ATATCCCCGGGAT
2 | ATCGATCGATAT
3 |
--------------------------------------------------------------------------------
/tutorial/map-partitions/README.md:
--------------------------------------------------------------------------------
1 | Spark's mapPartitions()
2 | =======================
3 |
4 | According to the Spark API, the ````mapPartitions(func)```` transformation is
5 | similar to ````map()````, but runs separately on each partition (block)
6 | of the RDD, so ````func```` must be of type ````Iterator<T> => Iterator<U>````
7 | when running on an RDD of type T.
8 |
9 |
10 | The ````mapPartitions()```` transformation should be used when you want to
11 | extract some condensed information (such as finding the minimum and maximum
12 | of numbers) from each partition. For example, if you want to find the minimum
13 | and maximum of all numbers in your input, then using ````map()```` can be
14 | pretty inefficient, since you will be generating tons of intermediate
15 | (K,V) pairs, but the bottom line is you just want to find two numbers: the
16 | minimum and maximum of all numbers in your input. Another example is when
17 | you want to find the top-10 (or bottom-10) for your input: mapPartitions()
18 | can work very well: find the top-10 (or bottom-10) per partition, then find
19 | the top-10 (or bottom-10) over all partitions; this way you limit the number
20 | of intermediate (K,V) pairs that are emitted (a sketch of this idea follows).
21 |
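For instance, here is a minimal sketch of the top-N idea (top-3, using Python's
`heapq.nlargest()`; it assumes the PySpark shell where `sc` is available):

````
>>> import heapq
>>> numbers = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10, 7, 99]
>>> rdd = sc.parallelize(numbers, 3)
>>> # step 1: top-3 within each partition (a small list per partition)
>>> per_partition = rdd.mapPartitions(lambda it: heapq.nlargest(3, it))
>>> # step 2: top-3 over the per-partition winners
>>> heapq.nlargest(3, per_partition.collect())
[99, 20, 20]
````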
22 |
23 | Example-1: Sum Each Partition
24 | =============================
25 | ````
26 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
27 | >>> numbers
28 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
29 |
30 | >>> rdd = sc.parallelize(numbers, 3)
31 |
32 | >>> rdd.collect()
33 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
34 |
35 | >>> rdd.getNumPartitions()
36 | 3
37 |
38 | >>> def f(iterator):
39 | ...     for x in iterator:
40 | ...         print(x)
41 | ...     print("===")
42 | ...
43 | >>> rdd.foreachPartition(f)
44 | 1
45 | 2
46 | 3
47 | ===
48 | 7
49 | 8
50 | 9
51 | 10
52 | ===
53 | 4
54 | 5
55 | 6
56 | ===
57 |
58 | >>> def adder(iterator):
59 | ... yield sum(iterator)
60 | ...
61 | >>> rdd.mapPartitions(adder).collect()
62 | [6, 15, 34]
63 |
64 | ````
65 |
66 |
67 | Example-2: Find Minimum and Maximum
68 | ===================================
69 | Use ````mapPartitions()```` and find the minimum and maximum from each partition.
70 |
71 | To make it a cleaner solution, we define a Python function that returns the minimum and maximum
72 | for a given iterator.
73 |
74 | ````
75 | $ cat minmax.py
76 | #!/usr/bin/python
77 |
78 | def minmax(iterator):
79 | firsttime = 0
80 | #min = 0;
81 | #max = 0;
82 | for x in iterator:
83 | if (firsttime == 0):
84 | min = x;
85 | max = x;
86 | firsttime = 1
87 | else:
88 | if x > max:
89 | max = x
90 | if x < min:
91 | min = x
92 | #
93 | return [(min, max)]
94 | #
95 | #data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
96 | #print minmax(data)
97 | ````
98 | Then we use the minmax function for the ````mapPartitions()````:
99 | ````
100 | >>> rdd = spark.sparkContext.parallelize(data, 3)
101 | >>> mapped = rdd.mapPartitions(minmax)
102 | >>> mapped.collect()
103 | [(3, 20), (2, 5), (2, 20)]
104 | >>> minmax_list = mapped.collect()
105 | >>> minimum = min(minmax_list[0])
106 | >>> minimum
107 | 3
108 | >>> maximum = max(minmax_list[0])
109 | >>> maximum
110 | 20
111 | ````
112 | ````
113 | ### NOTE: data can be huge, but for understanding
114 | ### the mapPartitions() we use a very small data set
115 |
116 | >>> data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
117 | >>> rdd = sc.parallelize(data, 3)
118 |
119 | >>> rdd.getNumPartitions()
120 | 3
121 |
122 | >>> rdd.collect()
123 | [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
124 |
125 | >>> def f(iterator):
126 | ...     for x in iterator:
127 | ...         print(x)
128 | ...     print("===")
129 | ... ^D
130 |
131 | >>> rdd.foreachPartition(f)
132 | 10
133 | 20
134 | 3
135 | ===
136 | 4
137 | 5
138 | 2
139 | ===
140 | 2
141 | 20
142 | 20
143 | 10
144 | ===
145 | >>>
146 |
147 | >>> minmax = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/minmax.py"
148 | >>> import minmax
149 |
150 | ### NOTE: the minmaxlist is a small list of numbers
151 | ### two numbers (min and max) are generated per partition
152 | >>> minmaxlist = rdd.mapPartitions(minmax.minmax).collect()
153 | >>> minmaxlist
154 | [3, 20, 2, 5, 2, 20]
155 |
156 | >>> min(minmaxlist)
157 | 2
158 | >>> max(minmaxlist)
159 | 20
160 | ````
161 |
162 | Questions/Comments
163 | ==================
164 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian)
165 | * Please send me an email: mahmoud.parsian@yahoo.com
166 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian)
167 |
168 | Thank you!
169 |
170 | ````
171 | best regards,
172 | Mahmoud Parsian
173 | ````
174 |
175 | [Data Algorithms book](http://shop.oreilly.com/product/0636920033950.do)
176 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/VIDEO-DataFrames.txt:
--------------------------------------------------------------------------------
1 | Structuring Apache Spark 2.0: SQL, DataFrames, Datasets And Streaming - by Michael Armbrust
2 | https://www.youtube.com/watch?v=1a4pgYzeFwE
3 | 28 mins
4 |
5 | AWS Tutorial - AWS Athena + S3
6 | 20 mins
7 | https://www.youtube.com/watch?v=SiUDN95sJIo
8 |
9 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-examples.md:
--------------------------------------------------------------------------------
1 | ## Spark DataFrame Examples (using PySpark):
2 |
3 | 1. [Introduction to PySpark DataFrames (slides)](https://projector-video-pdf-converter.datacamp.com/13023/chapter3.pdf)
4 |
5 | 2. [Apache Spark's DataFrame Examples](http://spark.apache.org/examples.html)
6 |
7 | 3. [PySpark Dataframe Basics](https://changhsinlee.com/pyspark-dataframe-basics/)
8 |
9 | 4. [PySpark Dataframe Basics -- notebook](https://github.com/changhsinlee/changhsinlee.github.io/blob/master/notebook/2018-03-04-pyspark-dataframe-basics/dataframe-basics.ipynb)
10 |
11 | 5. [My Tutorial/Spark SQL Tutorial (PySpark)](https://www.zepl.com/viewer/notebooks/bm90ZTovL3pqZmZkdS8wN2M3YmI0MmJjMWI0YmE0OTc1M2IzMzZkMjA2MTk4Ny9ub3RlLmpzb24)
12 |
13 | 6. [Complete Guide on DataFrame Operations in PySpark](https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/)
14 |
15 | 7. [Introduction to DataFrame Operations in PySpark](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html)
16 |
17 | 8. [PySpark DataFrame Tutorial: Introduction to DataFrames](https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra)
18 |
19 | 9. [Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html)
20 |
21 | 10. [How to use Spark SQL: A hands-on tutorial](https://opensource.com/article/19/3/apache-spark-and-dataframes-tutorial)
22 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-session-2019-02-14.txt:
--------------------------------------------------------------------------------
1 | $ cat /Users/mparsian/tmp/emps_no_header.txt
2 | 1001,alex,67000,SALES
3 | 1002,bob,24000,SALES
4 | 1003,boby,24000,SALES
5 | 1004,jane,69000,SOFTWARE
6 | 1005,betty,55000,SOFTWARE
7 | 1006,jeff,59000,SOFTWARE
8 | 1007,dara,72000,SOFTWARE
9 |
10 |
11 | $ ./bin/pyspark
12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
13 | [Clang 6.0 (clang-600.0.57)] on darwin
14 | Type "help", "copyright", "credits" or "license" for more information.
15 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
16 | Welcome to
17 | ____ __
18 | / __/__ ___ _____/ /__
19 | _\ \/ _ \/ _ `/ __/ '_/
20 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
21 | /_/
22 |
23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
24 | SparkSession available as 'spark'.
25 | >>>
26 | >>>
27 | >>>
28 | >>>
29 | >>>
30 | >>>
31 | >>>
32 | >>> input_path = "/Users/mparsian/tmp/emps_no_header.txt"
33 | >>> df = spark.read.csv(input_path)
34 | >>> df.show()
35 | +----+-----+-----+--------+
36 | | _c0| _c1| _c2| _c3|
37 | +----+-----+-----+--------+
38 | |1001| alex|67000| SALES|
39 | |1002| bob|24000| SALES|
40 | |1003| boby|24000| SALES|
41 | |1004| jane|69000|SOFTWARE|
42 | |1005|betty|55000|SOFTWARE|
43 | |1006| jeff|59000|SOFTWARE|
44 | |1007| dara|72000|SOFTWARE|
45 | +----+-----+-----+--------+
46 |
47 | >>> df.collect()
48 | [
49 | Row(_c0='1001', _c1='alex', _c2='67000', _c3='SALES'),
50 | Row(_c0='1002', _c1='bob', _c2='24000', _c3='SALES'),
51 | Row(_c0='1003', _c1='boby', _c2='24000', _c3='SALES'),
52 | Row(_c0='1004', _c1='jane', _c2='69000', _c3='SOFTWARE'),
53 | Row(_c0='1005', _c1='betty', _c2='55000', _c3='SOFTWARE'),
54 | Row(_c0='1006', _c1='jeff', _c2='59000', _c3='SOFTWARE'),
55 | Row(_c0='1007', _c1='dara', _c2='72000', _c3='SOFTWARE')
56 | ]
57 | >>>
58 | >>>
59 |
60 | >>>
61 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept")
62 | >>> df2.show()
63 | +----+-----+------+--------+
64 | | id| name|salary| dept|
65 | +----+-----+------+--------+
66 | |1001| alex| 67000| SALES|
67 | |1002| bob| 24000| SALES|
68 | |1003| boby| 24000| SALES|
69 | |1004| jane| 69000|SOFTWARE|
70 | |1005|betty| 55000|SOFTWARE|
71 | |1006| jeff| 59000|SOFTWARE|
72 | |1007| dara| 72000|SOFTWARE|
73 | +----+-----+------+--------+
74 |
75 | >>> df2.printSchema()
76 | root
77 | |-- id: string (nullable = true)
78 | |-- name: string (nullable = true)
79 | |-- salary: string (nullable = true)
80 | |-- dept: string (nullable = true)
81 |
82 | >>> df2.createOrReplaceTempView("emp_table")
83 | >>>
84 | >>>
85 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002")
86 | >>> df3.show()
87 | +----+-----+------+--------+
88 | | id| name|salary| dept|
89 | +----+-----+------+--------+
90 | |1003| boby| 24000| SALES|
91 | |1004| jane| 69000|SOFTWARE|
92 | |1005|betty| 55000|SOFTWARE|
93 | |1006| jeff| 59000|SOFTWARE|
94 | |1007| dara| 72000|SOFTWARE|
95 | +----+-----+------+--------+
96 |
97 | >>>
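
Note: read with spark.read.csv() and no schema, every column (including salary) comes
back as a string, as the printSchema() output above shows. A minimal sketch of the same
load with an explicit schema, assuming the session's input_path, so that id and salary
become integers (typed_df is just an illustrative name):

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("dept", StringType(), True)
])

typed_df = spark.read.csv(input_path, schema=schema)
typed_df.printSchema()                          # id and salary are now integers
typed_df.createOrReplaceTempView("emp_table")
spark.sql("SELECT * FROM emp_table WHERE salary > 60000").show()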
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-session-2020-11-04.txt:
--------------------------------------------------------------------------------
1 | $ cat /tmp/emps_no_header.txt
2 | 1001,alex,67000,SALES
3 | 1002,bob,24000,SALES
4 | 1003,boby,24000,SALES
5 | 1004,jane,69000,SOFTWARE
6 | 1005,betty,55000,SOFTWARE
7 | 1006,jeff,59000,SOFTWARE
8 | 1007,dara,72000,SOFTWARE
9 | 1001,al,69000,SALES
10 | 1002,bobby,24900,BUSINESS
11 |
12 | $ ./bin/pyspark
13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
14 | [Clang 6.0 (clang-600.0.57)] on darwin
15 | Type "help", "copyright", "credits" or "license" for more information.
16 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
17 | Setting default log level to "WARN".
18 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
19 | Welcome to
20 | ____ __
21 | / __/__ ___ _____/ /__
22 | _\ \/ _ \/ _ `/ __/ '_/
23 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
24 | /_/
25 |
26 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
27 | SparkSession available as 'spark'.
28 | >>> input_path = '/tmp/emps_no_header.txt'
29 | >>> df = spark.read.csv(input_path)
30 | >>> df.show()
31 | +----+-----+-----+--------+
32 | | _c0| _c1| _c2| _c3|
33 | +----+-----+-----+--------+
34 | |1001| alex|67000| SALES|
35 | |1002| bob|24000| SALES|
36 | |1003| boby|24000| SALES|
37 | |1004| jane|69000|SOFTWARE|
38 | |1005|betty|55000|SOFTWARE|
39 | |1006| jeff|59000|SOFTWARE|
40 | |1007| dara|72000|SOFTWARE|
41 | |1001| al|69000| SALES|
42 | |1002|bobby|24900|BUSINESS|
43 | +----+-----+-----+--------+
44 |
45 | >>> df.count()
46 | 9
47 | >>> df.printSchema()
48 | root
49 | |-- _c0: string (nullable = true)
50 | |-- _c1: string (nullable = true)
51 | |-- _c2: string (nullable = true)
52 | |-- _c3: string (nullable = true)
53 |
54 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept")
55 | >>> df2.show()
56 | +----+-----+------+--------+
57 | | id| name|salary| dept|
58 | +----+-----+------+--------+
59 | |1001| alex| 67000| SALES|
60 | |1002| bob| 24000| SALES|
61 | |1003| boby| 24000| SALES|
62 | |1004| jane| 69000|SOFTWARE|
63 | |1005|betty| 55000|SOFTWARE|
64 | |1006| jeff| 59000|SOFTWARE|
65 | |1007| dara| 72000|SOFTWARE|
66 | |1001| al| 69000| SALES|
67 | |1002|bobby| 24900|BUSINESS|
68 | +----+-----+------+--------+
69 |
70 | >>> df2.createOrReplaceTempView("emp_table")
71 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002")
72 | >>> df3.show()
73 | +----+-----+------+--------+
74 | | id| name|salary| dept|
75 | +----+-----+------+--------+
76 | |1003| boby| 24000| SALES|
77 | |1004| jane| 69000|SOFTWARE|
78 | |1005|betty| 55000|SOFTWARE|
79 | |1006| jeff| 59000|SOFTWARE|
80 | |1007| dara| 72000|SOFTWARE|
81 | +----+-----+------+--------+
82 |
83 | >>> df3.printSchema()
84 | root
85 | |-- id: string (nullable = true)
86 | |-- name: string (nullable = true)
87 | |-- salary: string (nullable = true)
88 | |-- dept: string (nullable = true)
89 |
90 | >>> df4 = df2.filter(df2.id > 1002)
91 | >>> df4.show()
92 | +----+-----+------+--------+
93 | | id| name|salary| dept|
94 | +----+-----+------+--------+
95 | |1003| boby| 24000| SALES|
96 | |1004| jane| 69000|SOFTWARE|
97 | |1005|betty| 55000|SOFTWARE|
98 | |1006| jeff| 59000|SOFTWARE|
99 | |1007| dara| 72000|SOFTWARE|
100 | +----+-----+------+--------+
101 |
102 | >>> df5 = spark.sql("SELECT id, salary FROM emp_table WHERE id > 1002")
103 | >>> df5.show()
104 | +----+------+
105 | | id|salary|
106 | +----+------+
107 | |1003| 24000|
108 | |1004| 69000|
109 | |1005| 55000|
110 | |1006| 59000|
111 | |1007| 72000|
112 | +----+------+
113 |
114 | >>>
115 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary")
116 | >>> df6.show()
117 | +----+------+
118 | |name|salary|
119 | +----+------+
120 | |jeff| 59000|
121 | |alex| 67000|
122 | |jane| 69000|
123 | | al| 69000|
124 | |dara| 72000|
125 | +----+------+
126 |
127 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary DESC")
128 | >>> df6.show()
129 | +----+------+
130 | |name|salary|
131 | +----+------+
132 | |dara| 72000|
133 | | al| 69000|
134 | |jane| 69000|
135 | |alex| 67000|
136 | |jeff| 59000|
137 | +----+------+
138 |
139 | >>> df7 = spark.sql("SELECT dept, COUNT(*) as count FROM emp_table GROUP BY dept")
140 | >>> df7.show()
141 | +--------+-----+
142 | | dept|count|
143 | +--------+-----+
144 | | SALES| 4|
145 | |BUSINESS| 1|
146 | |SOFTWARE| 4|
147 | +--------+-----+
148 |
149 | >>>
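
The last GROUP BY query can also be written with the DataFrame API instead of SQL.
A minimal sketch, assuming df2 as built above (salary is still a string here, so it is
cast before averaging; dept_stats is just an illustrative name):

from pyspark.sql.functions import count, avg, col

dept_stats = df2.groupBy("dept").agg(
    count("*").alias("count"),
    avg(col("salary").cast("double")).alias("avg_salary")
)
dept_stats.show()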
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-session-2021-05-12-intro.txt:
--------------------------------------------------------------------------------
1 | #--------------------
2 | # DataFrame Tutorial:
3 | #--------------------
4 | https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra
5 |
6 |
7 | #---------------------
8 | # Demo of DataFrames
9 | #---------------------
10 |
11 | $ cat /tmp/cats.csv
12 | name,age,gender,weight
13 | cuttie,2,female,6
14 | mono,3,male,9
15 | pishi,2,female,4
16 | zazo,1,male,4
17 | fuzzy,1,female,4
18 |
19 | $ ./bin/pyspark
20 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
21 | Welcome to
22 | ____ __
23 | / __/__ ___ _____/ /__
24 | _\ \/ _ \/ _ `/ __/ '_/
25 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
26 | /_/
27 |
28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
29 | Spark context Web UI available at http://10.0.0.93:4040
30 | Spark context available as 'sc' (master = local[*], app id = local-1620755686906).
31 | SparkSession available as 'spark'.
32 |
33 | >>>
34 | >>> input_path = '/tmp/cats.csv'
35 | >>> input_path
36 | '/tmp/cats.csv'
37 | >>> cats = spark.read.csv(input_path, inferSchema = True, header = True)
38 |
39 | >>>
40 | >>> cats.show(truncate=False)
41 | +------+---+------+------+
42 | |name |age|gender|weight|
43 | +------+---+------+------+
44 | |cuttie|2 |female|6 |
45 | |mono |3 |male |9 |
46 | |pishi |2 |female|4 |
47 | |zazo |1 |male |4 |
48 | |fuzzy |1 |female|4 |
49 | +------+---+------+------+
50 |
51 | >>> cats.printSchema()
52 | root
53 | |-- name: string (nullable = true)
54 | |-- age: integer (nullable = true)
55 | |-- gender: string (nullable = true)
56 | |-- weight: integer (nullable = true)
57 |
58 | >>> cats.count()
59 | 5
60 | >>> cats.columns
61 | ['name', 'age', 'gender', 'weight']
62 | >>> cats.describe('weight').show()
63 | +-------+------------------+
64 | |summary| weight|
65 | +-------+------------------+
66 | | count| 5|
67 | | mean| 5.4|
68 | | stddev|2.1908902300206643|
69 | | min| 4|
70 | | max| 9|
71 | +-------+------------------+
72 |
73 | >>> name_age = cats.select("name", "age")
74 | >>> name_age.show(truncate=False)
75 | +------+---+
76 | |name |age|
77 | +------+---+
78 | |cuttie|2 |
79 | |mono |3 |
80 | |pishi |2 |
81 | |zazo |1 |
82 | |fuzzy |1 |
83 | +------+---+
84 |
85 | >>> name_age.printSchema()
86 | root
87 | |-- name: string (nullable = true)
88 | |-- age: integer (nullable = true)
89 |
90 | >>> cats.select('age').distinct().show()
91 | +---+
92 | |age|
93 | +---+
94 | | 1|
95 | | 3|
96 | | 2|
97 | +---+
98 |
99 | >>> cats.select('name', 'age').distinct().show()
100 | +------+---+
101 | | name|age|
102 | +------+---+
103 | | zazo| 1|
104 | |cuttie| 2|
105 | | fuzzy| 1|
106 | | mono| 3|
107 | | pishi| 2|
108 | +------+---+
109 |
110 | >>> cats.filter(cats.age > 1).show()
111 | +------+---+------+------+
112 | | name|age|gender|weight|
113 | +------+---+------+------+
114 | |cuttie| 2|female| 6|
115 | | mono| 3| male| 9|
116 | | pishi| 2|female| 4|
117 | +------+---+------+------+
118 |
119 |
120 | >>> cats.orderBy(cats.age).show()
121 | +------+---+------+------+
122 | | name|age|gender|weight|
123 | +------+---+------+------+
124 | | zazo| 1| male| 4|
125 | | fuzzy| 1|female| 4|
126 | |cuttie| 2|female| 6|
127 | | pishi| 2|female| 4|
128 | | mono| 3| male| 9|
129 | +------+---+------+------+
130 |
131 | >>> age_df = cats.groupby("age").count()
132 | >>> age_df.show()
133 | +---+-----+
134 | |age|count|
135 | +---+-----+
136 | | 1| 2|
137 | | 3| 1|
138 | | 2| 2|
139 | +---+-----+
140 |
141 | >>> cats.show()
142 | +------+---+------+------+
143 | | name|age|gender|weight|
144 | +------+---+------+------+
145 | |cuttie| 2|female| 6|
146 | | mono| 3| male| 9|
147 | | pishi| 2|female| 4|
148 | | zazo| 1| male| 4|
149 | | fuzzy| 1|female| 4|
150 | +------+---+------+------+
151 |
152 | >>> cats.registerTempTable('cats_table')
153 | >>> spark.sql("select * from cats_table").show()
154 | +------+---+------+------+
155 | | name|age|gender|weight|
156 | +------+---+------+------+
157 | |cuttie| 2|female| 6|
158 | | mono| 3| male| 9|
159 | | pishi| 2|female| 4|
160 | | zazo| 1| male| 4|
161 | | fuzzy| 1|female| 4|
162 | +------+---+------+------+
163 |
164 | >>> spark.sql("select * from cats_table where age > 1").show()
165 | +------+---+------+------+
166 | | name|age|gender|weight|
167 | +------+---+------+------+
168 | |cuttie| 2|female| 6|
169 | | mono| 3| male| 9|
170 | | pishi| 2|female| 4|
171 | +------+---+------+------+
172 |
173 | >>> spark.sql("select age, count(*) from cats_table group by age").show()
174 | +---+--------+
175 | |age|count(1)|
176 | +---+--------+
177 | | 1| 2|
178 | | 3| 1|
179 | | 2| 2|
180 | +---+--------+
181 |
182 | >>> def exec_sql(query):
183 | ... spark.sql(query).show()
184 | ...
185 | >>>
186 | >>> exec_sql("select age, count(*) from cats_table group by age")
187 | +---+--------+
188 | |age|count(1)|
189 | +---+--------+
190 | | 1| 2|
191 | | 3| 1|
192 | | 2| 2|
193 | +---+--------+
194 |
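
Note: registerTempTable() is deprecated in recent Spark releases; createOrReplaceTempView()
is the current equivalent. A minimal sketch using the cats DataFrame and the exec_sql()
helper defined above:

cats.createOrReplaceTempView("cats_table")
exec_sql("select gender, avg(weight) as avg_weight from cats_table group by gender")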
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-12.txt:
--------------------------------------------------------------------------------
1 | >>> spark
2 |
3 |
4 | >>> spark.version
5 | '3.2.0'
6 |
7 | >>> # create a Python collection as data
8 | >>> data =
9 | [
10 | ('alex', 20, 12000),
11 | ('jane', 30, 45000),
12 | ('rafa', 40, 56000),
13 | ('ted', 30, 145000),
14 | ('xo2', 10, 1332000),
15 | ('mary', 44, 555000)
16 | ]
17 |
18 | >>> data
19 | [
20 | ('alex', 20, 12000),
21 | ('jane', 30, 45000),
22 | ('rafa', 40, 56000),
23 | ('ted', 30, 145000),
24 | ('xo2', 10, 1332000),
25 | ('mary', 44, 555000)
26 | ]
27 |
28 | >>> #define column names
29 | >>> column_names = ['name', 'age', 'salary']
30 | >>> column_names
31 | ['name', 'age', 'salary']
32 |
33 | >>> # create a DataFrame as df
34 | >>> df = spark.createDataFrame(data, column_names)
35 | >>>
36 | >>> # inspect created DataFrame
37 | >>> df
38 | DataFrame[name: string, age: bigint, salary: bigint]
39 |
40 | >>> # inspect created DataFrame's Schema
41 | >>> df.printSchema()
42 | root
43 | |-- name: string (nullable = true)
44 | |-- age: long (nullable = true)
45 | |-- salary: long (nullable = true)
46 |
47 | >>> # display the first 20 rows of a DataFrame
48 | >>> df.show()
49 | +----+---+-------+
50 | |name|age| salary|
51 | +----+---+-------+
52 | |alex| 20| 12000|
53 | |jane| 30| 45000|
54 | |rafa| 40| 56000|
55 | | ted| 30| 145000|
56 | | xo2| 10|1332000|
57 | |mary| 44| 555000|
58 | +----+---+-------+
59 |
60 | >>> # count the number of rows
61 | >>> df.count()
62 | 6
63 |
64 |
65 | >>> # Creates or replaces a local temporary view with this DataFrame
66 | >>> df.createOrReplaceTempView("people")
67 |
68 | >>> df2 = spark.sql("select * from people where salary > 67000")
69 | >>> df2.show()
70 | +----+---+-------+
71 | |name|age| salary|
72 | +----+---+-------+
73 | | ted| 30| 145000|
74 | | xo2| 10|1332000|
75 | |mary| 44| 555000|
76 | +----+---+-------+
77 |
78 | >>> df3 = spark.sql("select * from people where salary > 67000 and age > 11")
79 | >>> df3.show()
80 | +----+---+------+
81 | |name|age|salary|
82 | +----+---+------+
83 | | ted| 30|145000|
84 | |mary| 44|555000|
85 | +----+---+------+
86 |
87 | >>> df.show()
88 | +----+---+-------+
89 | |name|age| salary|
90 | +----+---+-------+
91 | |alex| 20| 12000|
92 | |jane| 30| 45000|
93 | |rafa| 40| 56000|
94 | | ted| 30| 145000|
95 | | xo2| 10|1332000|
96 | |mary| 44| 555000|
97 | +----+---+-------+
98 |
99 | >>> df4 = spark.sql("select * from people")
100 | >>> df4.show()
101 | +----+---+-------+
102 | |name|age| salary|
103 | +----+---+-------+
104 | |alex| 20| 12000|
105 | |jane| 30| 45000|
106 | |rafa| 40| 56000|
107 | | ted| 30| 145000|
108 | | xo2| 10|1332000|
109 | |mary| 44| 555000|
110 | +----+---+-------+
111 |
112 | >>> cart = spark.sql("select * from people p1, people p2")
113 | >>> cart.show()
114 | +----+---+------+----+---+-------+
115 | |name|age|salary|name|age| salary|
116 | +----+---+------+----+---+-------+
117 | |alex| 20| 12000|alex| 20| 12000|
118 | |alex| 20| 12000|jane| 30| 45000|
119 | |alex| 20| 12000|rafa| 40| 56000|
120 | |alex| 20| 12000| ted| 30| 145000|
121 | |alex| 20| 12000| xo2| 10|1332000|
122 | |alex| 20| 12000|mary| 44| 555000|
123 | |jane| 30| 45000|alex| 20| 12000|
124 | |jane| 30| 45000|jane| 30| 45000|
125 | |jane| 30| 45000|rafa| 40| 56000|
126 | |jane| 30| 45000| ted| 30| 145000|
127 | |jane| 30| 45000| xo2| 10|1332000|
128 | |jane| 30| 45000|mary| 44| 555000|
129 | |rafa| 40| 56000|alex| 20| 12000|
130 | |rafa| 40| 56000|jane| 30| 45000|
131 | |rafa| 40| 56000|rafa| 40| 56000|
132 | |rafa| 40| 56000| ted| 30| 145000|
133 | |rafa| 40| 56000| xo2| 10|1332000|
134 | |rafa| 40| 56000|mary| 44| 555000|
135 | | ted| 30|145000|alex| 20| 12000|
136 | | ted| 30|145000|jane| 30| 45000|
137 | +----+---+------+----+---+-------+
138 | only showing top 20 rows
139 |
140 | >>> cart
141 | DataFrame[name: string, age: bigint, salary: bigint, name: string, age: bigint, salary: bigint]
142 | >>>
143 |
144 | >>> cart2 = spark.sql("select p1.name as name, p2.age as age, p1.salary as salary, p2.name as name2, p2.age as age2, p2.salary as salary2 from people p1, people p2")
145 | >>> cart2.show()
146 | +----+---+------+-----+----+-------+
147 | |name|age|salary|name2|age2|salary2|
148 | +----+---+------+-----+----+-------+
149 | |alex| 20| 12000| alex| 20| 12000|
150 | |alex| 30| 12000| jane| 30| 45000|
151 | |alex| 40| 12000| rafa| 40| 56000|
152 | |alex| 30| 12000| ted| 30| 145000|
153 | |alex| 10| 12000| xo2| 10|1332000|
154 | |alex| 44| 12000| mary| 44| 555000|
155 | |jane| 20| 45000| alex| 20| 12000|
156 | |jane| 30| 45000| jane| 30| 45000|
157 | |jane| 40| 45000| rafa| 40| 56000|
158 | |jane| 30| 45000| ted| 30| 145000|
159 | |jane| 10| 45000| xo2| 10|1332000|
160 | |jane| 44| 45000| mary| 44| 555000|
161 | |rafa| 20| 56000| alex| 20| 12000|
162 | |rafa| 30| 56000| jane| 30| 45000|
163 | |rafa| 40| 56000| rafa| 40| 56000|
164 | |rafa| 30| 56000| ted| 30| 145000|
165 | |rafa| 10| 56000| xo2| 10|1332000|
166 | |rafa| 44| 56000| mary| 44| 555000|
167 | | ted| 20|145000| alex| 20| 12000|
168 | | ted| 30|145000| jane| 30| 45000|
169 | +----+---+------+-----+----+-------+
170 | only showing top 20 rows
171 |
172 | >>>
173 | >>> cart2
174 | DataFrame[name: string, age: bigint, salary: bigint, name2: string, age2: bigint, salary2: bigint]
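
The duplicated column names in cart come from the implicit cross join. A minimal sketch
of the same result with the DataFrame API, using explicit aliases (cart3 is just an
illustrative name):

from pyspark.sql.functions import col

p1 = df.alias("p1")
p2 = df.alias("p2")
cart3 = p1.crossJoin(p2).select(
    col("p1.name"), col("p1.age"), col("p1.salary"),
    col("p2.name").alias("name2"),
    col("p2.age").alias("age2"),
    col("p2.salary").alias("salary2")
)
cart3.show()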
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt:
--------------------------------------------------------------------------------
1 | This demo shows how to convert
2 | 1. a DataFrame to an RDD
3 | 2. an RDD to a DataFrame
4 |
5 |
6 | ~ % /Users/mparsian/spark-3.2.1/bin/pyspark
7 | Python 3.8.9 (default, Jul 19 2021, 09:37:32)
8 | Welcome to Spark version 3.2.1
9 |
10 | Spark context Web UI available at http://10.0.0.234:4041
11 | Spark context available as 'sc' (master = local[*], app id = local-1653016254174).
12 | SparkSession available as 'spark'.
13 | >>> data = [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000),('mary', 'HR', 93000)]
14 | >>> data
15 | [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000), ('mary', 'HR', 93000)]
16 | >>> df = spark.createDataFrame(data, ['name', 'dept', 'salary'])
17 | >>> df.show()
18 | +----+-----+------+
19 | |name| dept|salary|
20 | +----+-----+------+
21 | |alex|sales| 23000|
22 | |jane| HR| 29000|
23 | | bob|sales| 43000|
24 | |mary| HR| 93000|
25 | +----+-----+------+
26 |
27 | >>> df.printSchema()
28 | root
29 | |-- name: string (nullable = true)
30 | |-- dept: string (nullable = true)
31 | |-- salary: long (nullable = true)
32 |
33 | >>> rdd5 = df.rdd
34 | >>> rdd5.collect()
35 | [
36 | Row(name='alex', dept='sales', salary=23000),
37 | Row(name='jane', dept='HR', salary=29000),
38 | Row(name='bob', dept='sales', salary=43000),
39 | Row(name='mary', dept='HR', salary=93000)
40 | ]
41 | >>>
42 | >>> df2 = rdd5.toDF()
43 | >>> df2.show()
44 | +----+-----+------+
45 | |name| dept|salary|
46 | +----+-----+------+
47 | |alex|sales| 23000|
48 | |jane| HR| 29000|
49 | | bob|sales| 43000|
50 | |mary| HR| 93000|
51 | +----+-----+------+
52 |
53 | >>> from pyspark.sql import Row
54 | >>> # NOTE: to convert an RDD into a DataFrame,
55 | >>> # each Row() must have the same column names:
56 | >>> rows =
57 | [
58 | Row(name='alex', dept='sales', salary=23000),
59 | Row(name='jane', dept='HR', salary=29000, address='123 main street')
60 | ]
61 | >>> rdd = sc.parallelize(rows)
62 | >>> rdd.collect()
63 | [Row(name='alex', dept='sales', salary=23000), Row(name='jane', dept='HR', salary=29000, address='123 main street')]
64 | >>> df44 = rdd.toDF()
65 | >>> df44.show()
66 | 22/05/19 20:21:51 ERROR Executor: Exception in task 10.0 in stage 15.0 (TID 100)
67 | java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 3 fields are required while 4 values are provided.
68 | ...
69 | >>> # create Row()'s which have the same columns
70 | >>> rows =
71 | [
72 | Row(name='alex', dept='sales', salary=23000, address=None),
73 | Row(name='jane', dept='HR', salary=29000, address='123 main street')
74 | ]
75 | >>> rdd = sc.parallelize(rows)
76 | >>> df44 = rdd.toDF()
77 | >>> df44.show()
78 | +----+-----+------+---------------+
79 | |name| dept|salary| address|
80 | +----+-----+------+---------------+
81 | |alex|sales| 23000| null|
82 | |jane| HR| 29000|123 main street|
83 | +----+-----+------+---------------+
84 |
85 | >>>
86 | >>> some_data = [('alex', 10), ('jane', 20)]
87 | >>> rdd3 = sc.parallelize(some_data)
88 | >>> rdd3.collect()
89 | [('alex', 10), ('jane', 20)]
90 | >>> rdd3_with_rows = rdd3.map(lambda x: Row(name=x[0], age=x[1]))
91 | >>> rdd3_with_rows.collect()
92 | [Row(name='alex', age=10), Row(name='jane', age=20)]
93 | >>> df3 = rdd3_with_rows.toDF()
94 | >>> df3.show()
95 | +----+---+
96 | |name|age|
97 | +----+---+
98 | |alex| 10|
99 | |jane| 20|
100 | +----+---+
101 |
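
An alternative to wrapping each tuple in a Row() is to hand the column names (or a full
schema) to Spark directly. A minimal sketch using rdd3 from the session above (df5 and
df6 are just illustrative names):

# give toDF() the column names instead of building Row objects
df5 = rdd3.toDF(["name", "age"])
df5.show()

# or build the DataFrame straight from the RDD of tuples
df6 = spark.createDataFrame(rdd3, ["name", "age"])
df6.printSchema()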
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/combineByKey_example.py:
--------------------------------------------------------------------------------
1 | Problem: Given a set of (K, V) pairs,
2 | find (sum, count, min, max) per key using
3 | the combineByKey() transformation.
4 |
5 | ~/spark-2.4.4 $ ./bin/pyspark
6 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
7 | [Clang 6.0 (clang-600.0.57)] on darwin
8 | Type "help", "copyright", "credits" or "license" for more information.
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
14 | /_/
15 |
16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
17 | SparkSession available as 'spark'.
18 | >>>
19 |
20 | >>>
21 | >>> spark
22 |
23 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8) ]
24 | >>> data
25 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)]
26 | >>> rdd = spark.sparkContext.parallelize(data)
27 | >>>
28 | >>>
29 | >>> rdd.count()
30 | 7
31 | >>> rdd.collect()
32 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)]
33 | >>> # (K, (sum, count, min, max))
34 | ...
35 | >>> def single(v):
36 | ... return (v, 1, v, v)
37 | ...
38 | >>> def merge(C, v):
39 | ... return (C[0]+v, C[1]+1, min(C[2],v), max(C[3],v))
40 | ...
41 | >>> def combine(C1, C2):
42 | ... return (C1[0]+C2[0], C1[1]+C2[1], min(C1[2], C2[2]), max(C1[3], C2[3]) )
43 | ...
44 | >>> rdd2 = rdd.combineByKey(single, merge, combine)
45 | >>> rdd2.collect()
46 | [
47 | ('B', (21, 3, 6, 8)),
48 | ('A', (14, 4, 2, 5))
49 | ]
50 |
51 |
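
Since rdd2 now holds (sum, count, min, max) per key, the per-key average follows from one
more mapValues(). A small follow-up sketch (avg_min_max is just an illustrative name):

# (K, (sum, count, min, max)) -> (K, (avg, min, max))
avg_min_max = rdd2.mapValues(lambda t: (float(t[0]) / t[1], t[2], t[3]))
avg_min_max.collect()
# [('B', (7.0, 6, 8)), ('A', (3.5, 2, 5))]   (order may vary)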
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/count_min_max.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import sys
4 |
5 | from pyspark.sql import SparkSession
6 |
7 | #
8 | print ("This is the name of the script: ", sys.argv[0])
9 | print ("Number of arguments: ", len(sys.argv))
10 | print ("The arguments are: " , str(sys.argv))
11 | #
12 |
13 | # DEFINE your input path
14 | input_path = sys.argv[1]
15 | print("input_path: ", input_path)
16 |
17 |
18 | # CREATE an instance of a SparkSession object
19 | spark = SparkSession\
20 |     .builder\
21 |     .appName("PythonCountMinMax")\
22 |     .getOrCreate()
23 |
24 | # CREATE a new RDD[String]
25 | #lines = spark.sparkContext.textFile(input_path)
26 | # APPLY a SET of TRANSFORMATIONS...
27 |
28 | #-------------------------------------------
29 | def minmax(partition):
30 |     # compute (count, min, max) over one partition
31 |     count = 0
32 |     min2 = None
33 |     max2 = None
34 |     for x in partition:
35 |         if count == 0:
36 |             min2 = x
37 |             max2 = x
38 |         else:
39 |             max2 = max(x, max2)
40 |             min2 = min(x, min2)
41 |         count = count + 1
42 |     #end-for
43 |     # an empty partition contributes nothing
44 |     if count == 0:
45 |         return []
46 |     return [(count, min2, max2)]
47 | #end-def
48 | #---------------------
49 | def iterate_partition(partition):
50 | elements = []
51 | for x in partition:
52 | elements.append(x)
53 | print("elements=", elements)
54 | #print ("==================")
55 | #end-def
56 | #-------------------------
57 | def add3(t1, t2):
58 | count = t1[0] + t2[0]
59 | min2 = min(t1[1], t2[1])
60 | max2 = max(t1[2], t2[2])
61 | return (count, min2, max2)
62 | #end-def
63 |
64 | data = [10, 20, 30, 44, 55, 3, 4, 60, 50, 5, 2, 2, 20, 20, 10, 30, 70]
65 | print("data=", data)
66 | print("==============")
67 |
68 | #
69 | rdd = spark.sparkContext.parallelize(data, 4)
70 | print("rdd.collect()=", rdd.collect())
71 | print("==============")
72 | #
73 | rdd.foreachPartition(iterate_partition)
74 | print("==============")
75 | #
76 |
77 | count_min_max_rdd = rdd.mapPartitions(minmax)
78 | print("minmax_rdd.collect()=", count_min_max_rdd.collect())
79 |
80 | final_triplet = count_min_max_rdd.reduce(add3)
81 | print("final_triplet=", final_triplet)
82 |
83 | spark.stop()
84 |
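
The same (count, min, max) triplet can also be computed in a single pass with aggregate(),
without the mapPartitions()/reduce() pair. A minimal sketch over the same rdd (seq_op and
comb_op are just illustrative names):

def seq_op(acc, x):
    # fold one element into a (count, min, max) accumulator
    count, mn, mx = acc
    if count == 0:
        return (1, x, x)
    return (count + 1, min(mn, x), max(mx, x))

def comb_op(a, b):
    # merge two partial (count, min, max) accumulators
    if a[0] == 0:
        return b
    if b[0] == 0:
        return a
    return (a[0] + b[0], min(a[1], b[1]), max(a[2], b[2]))

triplet = rdd.aggregate((0, None, None), seq_op, comb_op)
print("triplet=", triplet)   # (17, 2, 70) for the data above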
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2015-03-13.txt:
--------------------------------------------------------------------------------
1 | pyspark-tutorial
2 | pyspark-tutorial provides basic algorithms using PySpark
3 |
4 | interactive session: valid and tested: Feb. 23, 2015
5 |
6 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat data.txt
7 | crazy crazy fox jumped
8 | crazy fox jumped
9 | fox is fast
10 | fox is smart
11 | dog is smart
12 |
13 | SPARK_HOME=~/zmp/zs/spark-1.2.0
14 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# ~/zmp/zs/spark-1.2.0/bin/pyspark
15 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
16 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
17 | Type "help", "copyright", "credits" or "license" for more information.
18 |
19 | Welcome to
20 | ____ __
21 | / __/__ ___ _____/ /__
22 | _\ \/ _ \/ _ `/ __/ '_/
23 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
24 | /_/
25 |
26 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
27 | SparkContext available as sc.
28 | >>> sc
29 |
30 | >>> lines = sc.textFile("data.txt", 1)
31 | >>> debuglines = lines.collect();
32 | >>> debuglines
33 | [u'crazy crazy fox jumped', u'crazy fox jumped', u'fox is fast', u'fox is smart', u'dog is smart']
34 | >>> words = lines.flatMap(lambda x: x.split(' '))
35 | >>> debugwords = words.collect();
36 | >>> debugwords
37 | [u'crazy', u'crazy', u'fox', u'jumped', u'crazy', u'fox', u'jumped', u'fox', u'is', u'fast', u'fox', u'is', u'smart', u'dog', u'is', u'smart']
38 | >>> ones = words.map(lambda x: (x, 1))
39 | >>> debugones = ones.collect()
40 | >>> debugones
41 | [(u'crazy', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'fox', 1), (u'is', 1), (u'fast', 1), (u'fox', 1), (u'is', 1), (u'smart', 1), (u'dog', 1), (u'is', 1), (u'smart', 1)]
42 | >>> counts = ones.reduceByKey(lambda x, y: x + y)
43 | >>> debugcounts = counts.collect()
44 | >>> debugcounts
45 | [(u'crazy', 3), (u'jumped', 2), (u'is', 3), (u'fox', 4), (u'dog', 1), (u'fast', 1), (u'smart', 2)]
46 | >>>
47 |
48 | >>> grouped = ones.groupByKey();
49 | >>> debuggrouped = grouped.collect();
50 |
51 | >>> counts.saveAsTextFile("output.txt")
52 |
53 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat output.txt/part*
54 | (u'crazy', 3)
55 | (u'jumped', 2)
56 | (u'is', 3)
57 | (u'fox', 4)
58 | (u'dog', 1)
59 | (u'fast', 1)
60 | (u'smart', 2)
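
The same word count can be written as one chained pipeline. A minimal sketch over the
same data.txt:

counts = sc.textFile("data.txt") \
           .flatMap(lambda line: line.split(" ")) \
           .map(lambda word: (word, 1)) \
           .reduceByKey(lambda x, y: x + y)
counts.collect()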
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2015-04-10.txt:
--------------------------------------------------------------------------------
1 | First session on PySpark
2 |
3 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# cat zfox_data.txt
4 | crazy red fox ran fast
5 | red fox jumped very very high
6 | red fox is very crazy
7 | red fox ran very fast
8 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin#
9 |
10 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# ./pyspark
11 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
12 | Welcome to
13 | ____ __
14 | / __/__ ___ _____/ /__
15 | _\ \/ _ \/ _ `/ __/ '_/
16 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0
17 | /_/
18 |
19 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
20 | SparkContext available as sc, SQLContext available as sqlCtx.
21 | >>>
22 | >>> sc
23 |
24 | >>>
25 | >>> lines = sc.textFile("zfox_data.txt")
26 | >>>
27 | >>> lines.collect()
28 | [u'crazy red fox ran fast', u'red fox jumped very very high', u'red fox is very crazy', u'red fox ran very fast']
29 | >>>
30 | >>> lines.count()
31 | 4
32 | >>>
33 | >>> words = lines.flatMap(lambda x: x.split(' '))
34 | >>>
35 | >>> words.collect()
36 | [u'crazy', u'red', u'fox', u'ran', u'fast', u'red', u'fox', u'jumped', u'very', u'very', u'high', u'red', u'fox', u'is', u'very', u'crazy', u'red', u'fox', u'ran', u'very', u'fast']
37 | >>>
38 | >>> words.count()
39 | 21
40 | >>>
41 | >>> ones = words.map(lambda x: (x, 1))
42 | >>>
43 | >>> ones.collect()
44 | [(u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'fast', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1), (u'very', 1), (u'very', 1), (u'high', 1), (u'red', 1), (u'fox', 1), (u'is', 1), (u'very', 1), (u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'very', 1), (u'fast', 1)]
45 | >>>
46 | >>> counts = ones.reduceByKey(lambda x, y: x + y)
47 | >>>
48 | >>>
49 | >>> counts.collect()
50 | [(u'crazy', 2), (u'ran', 2), (u'is', 1), (u'fox', 4), (u'fast', 2), (u'high', 1), (u'very', 4), (u'red', 4), (u'jumped', 1)]
51 | >>>
52 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2018-01-18.txt:
--------------------------------------------------------------------------------
1 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ source zbin/zenv_setup.sh
2 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ ./bin/pyspark
3 | Python 2.7.10 (default, Feb 7 2017, 00:08:15)
4 | [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
5 | Type "help", "copyright", "credits" or "license" for more information.
6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
7 | Setting default log level to "WARN".
8 | Welcome to
9 | ____ __
10 | / __/__ ___ _____/ /__
11 | _\ \/ _ \/ _ `/ __/ '_/
12 | /__ / .__/\_,_/_/ /_/\_\ version 2.2.1
13 | /_/
14 |
15 | Using Python version 2.7.10 (default, Feb 7 2017 00:08:15)
16 | SparkSession available as 'spark'.
17 | >>> spark
18 |
19 | >>>
20 | >>>
21 | >>>
22 | >>>
23 | >>> spark
24 |
25 | >>>
26 | >>>
27 | >>> sc = spark.sparkContext
28 | >>>
29 | >>> sc
30 |
31 | >>>
32 | >>>
33 | >>> rdd = sc.textFile("file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt")
34 | >>>
35 | >>>
36 | >>> rdd
37 | file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0
38 | >>>
39 | >>>
40 | >>> rdd.count()
41 | 3
42 | >>> rdd.collect()
43 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped']
44 | >>> rdd.take(2)
45 | [u'red fox jumped high', u'fox jumped over high fence']
46 | >>> rdd.take(1)
47 | [u'red fox jumped high']
48 | >>> rdd.collect()
49 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped']
50 | >>>
51 | >>>
52 |
53 | >>> rdd2 = rdd.map(lambda x : (x, len(x)))
54 | >>> rdd2.collect()
55 | [(u'red fox jumped high', 19), (u'fox jumped over high fence', 26), (u'red fox jumped', 14)]
56 | >>> rdd2 = rdd.map(lambda x : (x, len(x), len(x)-2))
57 | >>>
58 | >>> rdd2.collect()
59 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)]
60 | >>> rdd3 = rdd.map(lambda x : (x, len(x), len(x)-2))
61 | >>>
62 | >>>
63 | >>> rdd3.collect()
64 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)]
65 | >>>
66 | >>>
67 | >>> rdd4 = rdd.map(lambda x : (len(x), x, x))
68 | >>> rdd4.collect()
69 | [(19, u'red fox jumped high', u'red fox jumped high'), (26, u'fox jumped over high fence', u'fox jumped over high fence'), (14, u'red fox jumped', u'red fox jumped')]
70 | >>>
71 | >>>
72 | >>>
73 | >>> rdd.collect()
74 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped']
75 | >>> rdd2 = rdd.flatMap(lambda x: x.split(" "))
76 | >>> rdd2.collect()
77 | [u'red', u'fox', u'jumped', u'high', u'fox', u'jumped', u'over', u'high', u'fence', u'red', u'fox', u'jumped']
78 | >>> rdd2.count()
79 | 12
80 | >>>
81 | >>>
82 | >>> pairs = rdd2.map(lambda w : (w, 1))
83 | >>> pairs.count()
84 | 12
85 | >>> pairs.collect()
86 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)]
87 | >>>
88 | >>>
89 |
90 | >>>
91 | >>> pairs.collect()
92 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)]
93 | >>> grouped = pairs.groupByKey()
94 | >>> grouped.collect()
95 | [(u'high', ), (u'over', ), (u'fox', ), (u'red', ), (u'fence', ), (u'jumped', )]
96 | >>> grouped.mapValues(lambda iter : list(iter)).collect()
97 | [(u'high', [1, 1]), (u'over', [1]), (u'fox', [1, 1, 1]), (u'red', [1, 1]), (u'fence', [1]), (u'jumped', [1, 1, 1])]
98 | >>>
99 | >>> freq = grouped.mapValues(lambda iter: sum(iter))
100 | >>> freq.collect()
101 | [(u'high', 2), (u'over', 1), (u'fox', 3), (u'red', 2), (u'fence', 1), (u'jumped', 3)]
102 | >>> freq.collectAsHashMap()
103 | Traceback (most recent call last):
104 |   File "<stdin>", line 1, in <module>
105 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap'
106 | >>> freq.collectAsHashMap
107 | Traceback (most recent call last):
108 |   File "<stdin>", line 1, in <module>
109 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap'
110 | >>> freq.collectAsMap
111 | :1>
112 | >>> freq.collectAsMap()
113 | {u'fence': 1, u'jumped': 3, u'over': 1, u'fox': 3, u'high': 2, u'red': 2}
114 | >>>
115 | >>>
116 | >>>
117 | >>>
118 | >>>
119 | >>> pairs.collect()
120 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)]
121 | >>> freq = pairs.reduceByKey(lambda x, y: x+y)
122 | >>> freq.collectAsMap()
123 | {u'fence': 1, u'jumped': 3, u'over': 1, u'fox': 3, u'high': 2, u'red': 2}
124 | >>>
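
Both paths above produce the same frequencies. reduceByKey() is generally preferred over
groupByKey() followed by a sum, because it pre-aggregates values inside each partition
before the shuffle, while groupByKey() ships every (word, 1) pair across the network.
A minimal recap, assuming the pairs RDD from this session:

# groupByKey: shuffle all pairs, then sum each group
freq_grouped = pairs.groupByKey().mapValues(sum)

# reduceByKey: combine map-side, then sum the partial counts
freq_reduced = pairs.reduceByKey(lambda x, y: x + y)

freq_grouped.collectAsMap() == freq_reduced.collectAsMap()   # True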
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2018-10-02.txt:
--------------------------------------------------------------------------------
1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh
2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark
3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07)
4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin
5 | Type "help", "copyright", "credits" or "license" for more information.
6 | 18/10/02 15:50:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
7 | Welcome to
8 | ____ __
9 | / __/__ ___ _____/ /__
10 | _\ \/ _ \/ _ `/ __/ '_/
11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0
12 | /_/
13 |
14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07)
15 | SparkSession available as 'spark'.
16 | >>>
17 | >>>
18 | >>>
19 | >>>
20 | >>> spark
21 |
22 | >>> spark.sparkContext
23 |
24 | >>>
25 | >>> spark.version
26 | u'2.3.0'
27 | >>>
28 | >>>
29 | >>>
30 | >>>
31 | >>>
32 | >>>
33 | >>> input_path = "/Users/mparsian/spark-2.3.0/myinput.txt"
34 | >>> myrdd = spark.sparkContext.textFile(input_path)
35 | >>> rdd.count()
36 | Traceback (most recent call last):
37 |   File "<stdin>", line 1, in <module>
38 | NameError: name 'rdd' is not defined
39 | >>> myrdd.count()
40 | 3
41 | >>> myrdd.collect()
42 | [u'this is record 1', u'this is record 2', u'this is record 3']
43 | >>>
44 | >>>
45 | >>> def tokenize(rec):
46 | ... tokens = rec.split()
47 | ... return tokens
48 | ...
49 | >>>
50 | >>> rec33 = "this is it"
51 | >>> mytokens = tokenize(rec33)
52 | >>> mytokens
53 | ['this', 'is', 'it']
54 | >>>
55 | >>>
56 | >>> words = myrdd.flatMap(lambda record: tokenize(record))
57 | >>> words.collect()
58 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3']
59 | >>> words.count()
60 | 12
61 | >>>
62 | >>> duplicated = myrdd.map(lambda rec: rec + ";" rec)
63 |   File "<stdin>", line 1
64 | duplicated = myrdd.map(lambda rec: rec + ";" rec)
65 | ^
66 | SyntaxError: invalid syntax
67 | >>> duplicated = myrdd.map(lambda rec: rec + ";" + rec)
68 | >>> duplicated.count()
69 | 3
70 | >>> duplicated.collect()
71 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3']
72 | >>>
73 | >>> def myconcat(rec):
74 | ... return rec + ";" + rec
75 | ...
76 | >>>
77 | >>> z = myconcat("testing")
78 | >>> z
79 | 'testing;testing'
80 | >>> duplicated2 = myrdd.map(myconcat)
81 | >>> duplicated2.count()
82 | 3
83 | >>> duplicated2.collect()
84 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3']
85 | >>>
86 | >>>
87 | >>>
88 | >>> words.collect()
89 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3']
90 | >>> words.count()
91 | 12
92 | >>> pairs = words.map(lambda w: (w, 1))
93 | >>> pairs.collect()
94 | [(u'this', 1), (u'is', 1), (u'record', 1), (u'1', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'2', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'3', 1)]
95 | >>> pairs.count()
96 | 12
97 | >>> freq = pairs.reduceByKey(lambda x, y : x+y)
98 | >>> freq.collect()
99 | [(u'this', 3), (u'1', 1), (u'is', 3), (u'3', 1), (u'record', 3), (u'2', 1)]
100 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2018-10-09.txt:
--------------------------------------------------------------------------------
1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh
2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark
3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07)
4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin
5 | Type "help", "copyright", "credits" or "license" for more information.
6 | 18/10/09 18:04:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
7 | Welcome to
8 | ____ __
9 | / __/__ ___ _____/ /__
10 | _\ \/ _ \/ _ `/ __/ '_/
11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0
12 | /_/
13 |
14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07)
15 | SparkSession available as 'spark'.
16 | >>>
17 | >>>
18 | >>> spark
19 |
20 | >>>
21 | >>>
22 | >>>
23 | >>>
24 | >>> data = [1, -3, 4, 2, -5, 2]
25 | >>> data
26 | [1, -3, 4, 2, -5, 2]
27 | >>> rdd = spark.sparkContext.parallalize(data)
28 | Traceback (most recent call last):
29 |   File "<stdin>", line 1, in <module>
30 | AttributeError: 'SparkContext' object has no attribute 'parallalize'
31 | >>> rdd = spark.sparkContext.parallelize(data)
32 | >>> rdd.count()
33 | 6
34 | >>> rdd.collect()
35 | [1, -3, 4, 2, -5, 2]
36 | >>>
37 | >>> def myfun(n):
38 | ... mylist = []
39 | ... if n > 0:
40 | ... mylist.append(100)
41 | ... mylist.append(200)
42 | ... else:
43 | ... mylist.append(0)
44 | ... #
45 | ... return mylist
46 | ...
47 | >>>
48 | >>> x = myfun(3)
49 | >>> x
50 | [100, 200]
51 | >>> y = myfun(-55)
52 | >>> y
53 | [0]
54 | >>>
55 | >>> rdd2 = rdd.flatMap(myfun)
56 | >>> rdd.collect()
57 | [1, -3, 4, 2, -5, 2]
58 | >>> rdd2.collect()
59 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200]
60 | >>> rdd2.count()
61 | 10
62 | >>>
63 | >>>
64 | >>>
65 | >>> rdd3 = rdd2.filter(lambda x : x > 100)
66 | >>> rdd3.collect()
67 | [200, 200, 200, 200]
68 | >>>
69 | >>> rdd4 = rdd2.filter(lambda x : x > 10)
70 | >>> rdd4.collect()
71 | [100, 200, 100, 200, 100, 200, 100, 200]
72 | >>>
73 | >>>
74 | >>> def keep100(n):
75 | ... if n > 100:
76 | ... return True
77 | ... else:
78 | ... return False
79 | ...
80 | >>>
81 | >>> rdd5 = rdd2.filter(keep100)
82 | >>> rdd5.collect()
83 | [200, 200, 200, 200]
84 | >>>
85 | >>>
86 | >>> rdd2.collect()
87 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200]
88 | >>> rdd6 = rdd.map(lambda x : x+1000)
89 | >>> rdd6.collect()
90 | [1001, 997, 1004, 1002, 995, 1002]
91 | >>>
92 | >>> def myadder(n):
93 | ... if n > 0:
94 | ... return n+1000
95 | ... else:
96 | ... return n
97 | ...
98 | >>>
99 | >>> rdd2.collect()
100 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200]
101 | >>> rdd7 = rdd2.map(myadder)
102 | >>> rdd7.collect()
103 | [1100, 1200, 0, 1100, 1200, 1100, 1200, 0, 1100, 1200]
104 | >>>
105 | >>>
106 | >>>
107 | >>>
108 | >>>
109 | >>>
110 | >>> rdd2.collect()
111 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200]
112 | >>> mysum = rdd2.reduce(lambda x,y: x+y)
113 | >>> mysum
114 | 1200
115 | >>>
116 | >>>
117 | >>>
118 | >>>
119 | >>>
120 | >>> pairs = [("a", 2), ("b", 3), ("a", 3), ("b", 4), ("a", 7), ("b", 10), ("c", 7), ("c", 1)]
121 | >>>
122 | >>> pairs
123 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)]
124 | >>>
125 | >>> pairs_rdd = spark.sparkContext.parallelize(pairs)
126 | >>> pairs_rdd.count()
127 | 8
128 | >>> pairs_rdd.collect()
129 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)]
130 | >>>
131 | >>>
132 | >>> grouped = pairs_rdd.groupByKey()
133 | >>> grouped.collect()
134 | [('a', ), ('c', ), ('b', )]
135 | >>> grouped.mapValues(lambda it: list(it)).collect()
136 | [('a', [2, 3, 7]), ('c', [7, 1]), ('b', [3, 4, 10])]
137 | >>>
138 | >>> incby100 = pairs_rdd.mapValues(lambda x : x+100)
139 | >>> incby100.collect()
140 | [('a', 102), ('b', 103), ('a', 103), ('b', 104), ('a', 107), ('b', 110), ('c', 107), ('c', 101)]
141 | >>> incby1000 = pairs_rdd.map(lambda (k,v) : (k, v+1000))
142 | >>> incby1000.collect()
143 | [('a', 1002), ('b', 1003), ('a', 1003), ('b', 1004), ('a', 1007), ('b', 1010), ('c', 1007), ('c', 1001)]
144 | >>>
145 | >>>
146 | >>> grouped.collect()
147 | [('a', ), ('c', ), ('b', )]
148 | >>>
149 | >>> average = grouped.mapValues(lambda it: sum(it)/len(it))
150 | >>> average.collect()
151 | [('a', 4), ('c', 4), ('b', 5)]
152 | >>> average = grouped.mapValues(lambda it: float(sum(it))/float(len(it)))
153 | >>> average.collect()
154 | [('a', 4.0), ('c', 4.0), ('b', 5.666666666666667)]
155 | >>>
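
Note: the tuple-unpacking lambda used for incby1000, lambda (k, v): ..., is Python 2 only
and raises a SyntaxError under Python 3. A minimal Python 3 equivalent:

incby1000 = pairs_rdd.map(lambda kv: (kv[0], kv[1] + 1000))
# or, leaving the key untouched:
incby1000 = pairs_rdd.mapValues(lambda v: v + 1000)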
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-01-30.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Setting default log level to "WARN".
5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
6 | Welcome to
7 | ____ __
8 | / __/__ ___ _____/ /__
9 | _\ \/ _ \/ _ `/ __/ '_/
10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
11 | /_/
12 |
13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
14 | SparkSession available as 'spark'.
15 | >>>
16 | >>> spark
17 |
18 | >>>
19 | >>>
20 | >>> pairs = [("alex", 100, 1), ("jane", 200, 3), ("ted", 300, 3)]
21 | >>> pairs
22 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)]
23 | >>>
24 | >>> rdd = spark.sparkContext.parallelize(pairs)
25 | >>> rdd.collect()
26 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)]
27 | >>> rdd.count()
28 | 3
29 | >>> def find_average(record):
30 | ... return record[1]/record[2]
31 | ...
32 | >>>
33 | >>> x = ('jane', 200, 3)
34 | >>> y = find_average(x)
35 | >>> y
36 | 66.66666666666667
37 | >>> x = ('ted', 300, 3)
38 | >>> y = find_average(x)
39 | >>> y
40 | 100.0
41 | >>> rdd.collect()
42 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)]
43 | >>> rdd2 = rdd.map(find_average)
44 | >>> rdd2.collect()
45 | [100.0, 66.66666666666667, 100.0]
46 | >>> def find_average(record):
47 | ... return (record[0], record[1]/record[2])
48 | ...
49 | >>>
50 | >>> x = ('jane', 200, 3)
51 | >>> y = find_average(x)
52 | >>> y
53 | ('jane', 66.66666666666667)
54 | >>> rdd2 = rdd.map(find_average)
55 | >>> rdd2.collect()
56 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)]
57 | >>> def find_average22(record):
58 | ... return [(record[0], record[1]/record[2])]
59 | ...
60 | >>> x = ('ted', 300, 3)
61 | >>> y = find_average22(x)
62 | >>> y
63 | [('ted', 100.0)]
64 | >>>
65 | >>>
66 | >>> rdd3 = rdd.flatMap(find_average22)
67 | >>> rdd3.collect()
68 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)]
69 | >>>
70 | >>>
71 | >>>
72 | >>> numbers = [1, 2, 3, 4, 5, 6]
73 | >>> rdd4 = spark.sparkContext.parallelize(numbers)
74 | >>> rdd4.count()
75 | 6
76 | >>> rdd.collect()
77 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)]
78 | >>> rdd4.collect()
79 | [1, 2, 3, 4, 5, 6]
80 | >>>
81 | >>>
82 | >>> mysum = rdd4.reduce(lambda x, y: x+7)
83 | >>> mysum
84 | 36
85 | >>> rdd5 = rdd4.map(lambda x : x +7)
86 | >>> rdd5.collect()
87 | [8, 9, 10, 11, 12, 13]
88 | >>> rdd5
89 | PythonRDD[8] at collect at :1
90 | >>> rdd4
91 | ParallelCollectionRDD[5] at parallelize at PythonRDD.scala:195
92 | >>>
93 |
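
A caution on the reduce above: lambda x, y: x + 7 ignores y, so it is neither commutative
nor associative, and the result (36 here) depends on how the elements are partitioned and
folded. reduce() should only be given commutative and associative functions, for example:

mysum = rdd4.reduce(lambda x, y: x + y)
# mysum == 21 for [1, 2, 3, 4, 5, 6]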
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-04-16.txt:
--------------------------------------------------------------------------------
1 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ pwd
2 | /Users/mparsian/spark-2.4.0
3 |
4 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l
5 | -rw-r--r--@ 1 mparsian 897801646 21357 Oct 28 23:36 LICENSE
6 | -rw-r--r--@ 1 mparsian 897801646 42919 Oct 28 23:36 NOTICE
7 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 R
8 | -rw-r--r--@ 1 mparsian 897801646 3952 Oct 28 23:36 README.md
9 | -rw-r--r--@ 1 mparsian 897801646 156 Oct 28 23:36 RELEASE
10 | drwxr-xr-x@ 29 mparsian 897801646 928 Oct 28 23:36 bin
11 | drwxr-xr-x@ 9 mparsian 897801646 288 Oct 28 23:36 conf
12 | drwxr-xr-x@ 5 mparsian 897801646 160 Oct 28 23:36 data
13 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 examples
14 | drwxr-xr-x@ 227 mparsian 897801646 7264 Oct 28 23:36 jars
15 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 kubernetes
16 | drwxr-xr-x@ 48 mparsian 897801646 1536 Oct 28 23:36 licenses
17 | drwxr-xr-x 16 mparsian 897801646 512 Mar 25 12:29 logs
18 | drwxr-xr-x@ 19 mparsian 897801646 608 Oct 28 23:36 python
19 | drwxr-xr-x@ 24 mparsian 897801646 768 Oct 28 23:36 sbin
20 | drwxr-xr-x 2 mparsian 897801646 64 Jan 8 03:00 work
21 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 yarn
22 |
23 |
24 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l bin
25 | total 224
26 | -rwxr-xr-x@ 1 mparsian 897801646 1089 Oct 28 23:36 beeline
27 | -rw-r--r--@ 1 mparsian 897801646 1064 Oct 28 23:36 beeline.cmd
28 | -rwxr-xr-x@ 1 mparsian 897801646 5427 Oct 28 23:36 docker-image-tool.sh
29 | -rwxr-xr-x@ 1 mparsian 897801646 1933 Oct 28 23:36 find-spark-home
30 | -rw-r--r--@ 1 mparsian 897801646 2681 Oct 28 23:36 find-spark-home.cmd
31 | -rw-r--r--@ 1 mparsian 897801646 1892 Oct 28 23:36 load-spark-env.cmd
32 | -rw-r--r--@ 1 mparsian 897801646 2025 Oct 28 23:36 load-spark-env.sh
33 | -rwxr-xr-x@ 1 mparsian 897801646 2987 Oct 28 23:36 pyspark
34 | ...
35 |
36 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ./bin/pyspark
37 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
38 | [Clang 6.0 (clang-600.0.57)] on darwin
39 | Type "help", "copyright", "credits" or "license" for more information.
40 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
41 | Welcome to
42 | ____ __
43 | / __/__ ___ _____/ /__
44 | _\ \/ _ \/ _ `/ __/ '_/
45 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
46 | /_/
47 |
48 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
49 | SparkSession available as 'spark'.
50 |
51 | >>> spark
52 |
53 | >>>
54 | >>>
55 | >>>
56 | >>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
57 | >>> data
58 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
59 | >>>
60 | >>> rdd = spark.sparkContext.parallelize(data)
61 | >>> rdd.count()
62 | 12
63 | >>> rdd.collect()
64 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
65 | >>> rdd.getNumPartitions()
66 | 8
67 | >>> rdd2 = spark.sparkContext.parallelize(data, 3)
68 | >>> rdd2.collect()
69 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
70 | >>> rdd2.getNumPartitions()
71 | 3
72 | >>> rdd3 = rdd.map(lambda x : x+100)
73 | >>> rdd3.collect()
74 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112]
75 | >>>
76 | >>> def myfun(x):
77 | ... return x+100
78 | ...
79 | >>>
80 | >>>
81 | >>> y = myfun(4)
82 | >>> y
83 | 104
84 | >>> z = myfun(60)
85 | >>> z
86 | 160
87 | >>> rdd4 = rdd.map(myfun)
88 | >>> rdd4.collect()
89 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112]
90 | >>> rdd5 = rdd.map(lambda x: (x, 1))
91 | >>> rdd5.collect()
92 | [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]
93 | >>> rdd2.collect()
94 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
95 | >>> N = rdd.reduce(lambda x, y: x+y)
96 | >>> N
97 | 78
98 | >>> exit()
99 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-04-18.txt:
--------------------------------------------------------------------------------
1 |
2 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ cat > fox.txt
3 | a fox jumped
4 | a red fox jumped and jumped
5 | a blue and red fox jumped
6 | fox is blue red
7 |
8 | $ cat fox.txt
9 | a fox jumped
10 | a red fox jumped and jumped
11 | a blue and red fox jumped
12 | fox is blue red
13 |
14 | ~/spark-2.4.0 $ ./bin/pyspark
15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
16 | [Clang 6.0 (clang-600.0.57)] on darwin
17 | Type "help", "copyright", "credits" or "license" for more information.
18 | 2019-04-18 18:02:14 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19 | Setting default log level to "WARN".
20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21 | Welcome to
22 | ____ __
23 | / __/__ ___ _____/ /__
24 | _\ \/ _ \/ _ `/ __/ '_/
25 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
26 | /_/
27 |
28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
29 | SparkSession available as 'spark'.
30 | >>> spark
31 |
32 |
33 | >>> records = spark.sparkContext.textFile("/Users/mparsian/spark-2.4.0/fox.txt")
34 | >>> records.collect()
35 | [
36 | 'a fox jumped',
37 | 'a red fox jumped and jumped',
38 | 'a blue and red fox jumped',
39 | 'fox is blue red'
40 | ]
41 | >>> records.count()
42 | 4
43 | >>>
44 | >>> def tokenize(record):
45 | ... tokens = record.split(" ")
46 | ... return tokens
47 | ...
48 | >>>
49 | >>> x = "a fox jumped"
50 | >>> x
51 | 'a fox jumped'
52 | >>> tokens = tokenize(x)
53 | >>> tokens
54 | ['a', 'fox', 'jumped']
55 | >>>
56 | >>>
57 | >>> words = records.flatMap(tokenize)
58 | >>> words.collect()
59 | ['a', 'fox', 'jumped', 'a', 'red', 'fox', 'jumped', 'and', 'jumped', 'a', 'blue', 'and', 'red', 'fox', 'jumped', 'fox', 'is', 'blue', 'red']
60 | >>> words.count()
61 | 19
62 | >>> pairs = words.map(lambda x : (x,1))
63 | >>> pairs.collect()
64 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)]
65 | >>> pairs.count()
66 | 19
67 | >>>
68 | >>> frequencies = pairs.reduceByKey(lambda a, b: a+b)
69 | >>> frequencies.collect()
70 | [('is', 1), ('a', 3), ('fox', 4), ('jumped', 4), ('red', 3), ('and', 2), ('blue', 2)]
71 | >>>
72 | >>>
73 | >>> filtered = frequencies.filter(lambda x : x[1] > 2)
74 | >>> filtered.collect()
75 | [('a', 3), ('fox', 4), ('jumped', 4), ('red', 3)]
76 | >>> filtered.count()
77 | 4
78 | >>> a = ("dada", 5)
79 | >>> a[0]
80 | 'dada'
81 | >>> a[1]
82 | 5
83 | >>>
84 | >>>
85 | >>> test = records.map(tokenize)
86 | >>> test.collect()
87 | [['a', 'fox', 'jumped'], ['a', 'red', 'fox', 'jumped', 'and', 'jumped'], ['a', 'blue', 'and', 'red', 'fox', 'jumped'], ['fox', 'is', 'blue', 'red']]
88 | >>> test.count()
89 | 4
90 | >>>
91 | >>>
92 | >>> pairs.collect()
93 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)]
94 | >>>
95 | >>> grouped = pairs.groupByKey()
96 |
97 | >>> grouped.collect()
98 | [
99 | ('is', ),
100 | ('a', ),
101 | ('fox', ),
102 | ('jumped', ),
103 | ('red', ),
104 | ('and', ),
105 | ('blue', )
106 | ]
107 | >>>
108 | >>> grouped = pairs.groupByKey().mapValues(lambda it: list(it))
109 | >>> grouped.collect()
110 | [
111 | ('is', [1]),
112 | ('a', [1, 1, 1]),
113 | ('fox', [1, 1, 1, 1]),
114 | ('jumped', [1, 1, 1, 1]),
115 | ('red', [1, 1, 1]),
116 | ('and', [1, 1]),
117 | ('blue', [1, 1])
118 | ]
119 | >>> grouped = pairs.groupByKey()
120 | >>> grouped.collect()
121 | [('is', ), ('a', ), ('fox', ), ('jumped', ), ('red', ), ('and', ), ('blue', )]
122 | >>> freq2 = grouped.mapValues(lambda it: sum(it))
123 | >>> freq2.collect()
124 | [
125 | ('is', 1),
126 | ('a', 3),
127 | ('fox', 4),
128 | ('jumped', 4),
129 | ('red', 3),
130 | ('and', 2),
131 | ('blue', 2)
132 | ]
133 | >>> freq2.count()
134 | 7
135 | >>> frequencies = records.flatMap(tokenize).map(lambda x: (x,1)).reduceByKey(lambda a, b: a+b)
136 | >>> frequencies.collect()
137 | [
138 | ('is', 1),
139 | ('a', 3),
140 | ('fox', 4),
141 | ('jumped', 4),
142 | ('red', 3),
143 | ('and', 2),
144 | ('blue', 2)
145 | ]
146 | >>>
147 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-04-26.txt:
--------------------------------------------------------------------------------
1 | Finding Average by Key using reduceByKey() Transformation
2 |
3 | $ ./bin/pyspark
4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
5 | [Clang 6.0 (clang-600.0.57)] on darwin
6 | Type "help", "copyright", "credits" or "license" for more information.
7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
8 | Welcome to
9 | ____ __
10 | / __/__ ___ _____/ /__
11 | _\ \/ _ \/ _ `/ __/ '_/
12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
13 | /_/
14 |
15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
16 | SparkSession available as 'spark'.
17 | >>>
18 | >>>
19 | >>>
20 | >>>
21 | >>> data = [('k1', 3), ('k1', 4),('k1', 5),('k2', 7),('k2', 7),('k2', 7),('k3', 30),('k3', 30),('k3', 40),('k3', 50)]
22 | >>> data
23 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)]
24 | >>>
25 | >>> pairs = spark.sparkContext.parallelize(data)
26 | >>> pairs.collect()
27 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)]
28 | >>> pairs.count()
29 | 10
30 | >>> pairs2 = pairs.distinct()
31 | >>> pairs2.count()
32 | 7
33 | >>> pairs2.collect()
34 | [('k1', 5), ('k3', 40), ('k1', 3), ('k3', 50), ('k2', 7), ('k1', 4), ('k3', 30)]
35 | >>>
36 | >>> tuples = pairs.map(lambda x: (x[0], (x[1], 1) ) )
37 | >>> tuples.collect()
38 | [('k1', (3, 1)), ('k1', (4, 1)), ('k1', (5, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k3', (30, 1)), ('k3', (30, 1)), ('k3', (40, 1)), ('k3', (50, 1))]
39 |
40 | >>>
41 | >>> def adder(x, y):
42 | ... sum2 = x[0] + y[0]
43 | ... count = x[1] + y[1]
44 | ... return (sum2, count)
45 | ...
46 | >>>
47 | >>> x = (10, 2)
48 | >>> y = (20, 4)
49 | >>> r = adder(x, y)
50 | >>> r
51 | (30, 6)
52 | >>>
53 | >>> result = tuples.reduceByKey(adder)
54 | >>> result.collect()
55 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))]
56 | >>> result = tuples.reduceByKey(lambda x, y: adder(x, y))
57 | >>> result.collect()
58 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))]
59 | >>> avg = result.mapValues(lambda pair: float(pair[0])/float(pair[1]))
60 | >>> avg.collect()
61 | [('k1', 4.0), ('k3', 37.5), ('k2', 7.0)]
62 | >>>
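
The same per-key average can be computed without first mapping to (value, 1) tuples by
using aggregateByKey(). A minimal sketch on the same pairs RDD (sum_count and avg2 are
just illustrative names):

sum_count = pairs.aggregateByKey(
    (0, 0),                                    # zero value: (sum, count)
    lambda acc, v: (acc[0] + v, acc[1] + 1),   # fold a value into the partition-local accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1])    # merge accumulators across partitions
)
avg2 = sum_count.mapValues(lambda t: float(t[0]) / t[1])
avg2.collect()
# [('k1', 4.0), ('k2', 7.0), ('k3', 37.5)]   (order may vary)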
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-05-09.txt:
--------------------------------------------------------------------------------
1 | Partitioning RDDs and using the mapPartitions() Transformation
2 |
3 | $ ./bin/pyspark
4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
5 | [Clang 6.0 (clang-600.0.57)] on darwin
6 | Type "help", "copyright", "credits" or "license" for more information.
7 | Setting default log level to "WARN".
8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
14 | /_/
15 |
16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
17 | SparkSession available as 'spark'.
18 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
19 | >>>
20 | >>>
21 | >>> numbers
22 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
23 | >>> rdd = spark.sparkContext.parallelize(numbers, 3)
24 | >>> rdd.count()
25 | 10
26 | >>> rdd.collect()
27 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
28 | >>>
29 |
30 | >>> def f(iterator):
31 | ... for x in iterator:
32 | ... print(x)
33 | ... print("===")
34 | ...
35 | >>>
36 | >>> rdd.foreachPartition(f)
37 | 4
38 | 5
39 | 6
40 | ===
41 | 7
42 | 8
43 | 9
44 | 10
45 | ===
46 | 1
47 | 2
48 | 3
49 | ===
50 | >>>
51 | >>>
52 | >>> rdd = spark.sparkContext.parallelize(numbers, 2)
53 | >>> rdd.foreachPartition(f)
54 | 1
55 | 2
56 | 3
57 | 4
58 | 5
59 | ===
60 | 6
61 | 7
62 | 8
63 | 9
64 | 10
65 | ===
66 | >>>
67 | >>> n = rdd.getNumPartitions()
68 | >>> n
69 | 2
70 | >>> rdd = spark.sparkContext.parallelize(numbers, 4)
71 | >>> n = rdd.getNumPartitions()
72 | >>> n
73 | 4
74 | >>> rdd.foreachPartition(f)
75 | 5
76 | 6
77 | ===
78 | 3
79 | 4
80 | ===
81 | 7
82 | 8
83 | 9
84 | 10
85 | ===
86 | 1
87 | 2
88 | ===
89 | >>> rdd = spark.sparkContext.parallelize(numbers, 14)
90 | >>> rdd.foreachPartition(f)
91 | 4
92 | ===
93 | ===
94 | ===
95 | 3
96 | ===
97 | 1
98 | ===
99 | 5
100 | ===
101 | 2
102 | ===
103 | ===
104 | 6
105 | ===
106 | ===
107 | 8
108 | ===
109 | 7
110 | ===
111 | 9
112 | ===
113 | 10
114 | ===
115 | >>> def min_max_count(iterator):
116 | ... firsttime = 1
117 | ... #minimum
118 | ... #maximum
119 | ... #count
120 | ... for x in iterator:
121 | ... if (firsttime == 1):
122 | ... minimum = x
123 | ... maximum = x
124 | ... count = 1
125 | ... firsttime = 0
126 | ... else:
127 | ... count = count + 1
128 | ... minimum = min(x, minimum)
129 | ... maximum = max(x, maximum)
130 | ... #
131 | ... return (minimum, maximum, count)
132 | ...
133 | >>>
134 | >>> data = [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16]
135 | >>> data
136 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16]
137 | >>> rdd = spark.sparkContext.parallelize(numbers, 3)
138 | >>> n = rdd.getNumPartitions()
139 | >>> n
140 | 3
141 | >>> rdd.collect()
142 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
143 | >>> result = rdd.mapPartitions(min_max_count)
144 | >>> result.collect()
145 | [1, 3, 3, 4, 6, 3, 7, 10, 4]
146 | >>> def min_max_count(iterator):
147 | ... firsttime = 1
148 | ... #minimum
149 | ... #maximum
150 | ... #count
151 | ... for x in iterator:
152 | ... if (firsttime == 1):
153 | ... minimum = x
154 | ... maximum = x
155 | ... count = 1
156 | ... firsttime = 0
157 | ... else:
158 | ... count = count + 1
159 | ... minimum = min(x, minimum)
160 | ... maximum = max(x, maximum)
161 | ... #
162 | ... return [minimum, maximum, count]
163 | ...
164 | >>>
165 | >>> result = rdd.mapPartitions(min_max_count)
166 | >>> result.collect()
167 | [1, 3, 3, 4, 6, 3, 7, 10, 4]
168 | >>>
169 | >>>
170 | >>>
171 | >>>
172 | >>> def min_max_count(iterator):
173 | ... firsttime = 1
174 | ... #minimum
175 | ... #maximum
176 | ... #count
177 | ... for x in iterator:
178 | ... if (firsttime == 1):
179 | ... minimum = x
180 | ... maximum = x
181 | ... count = 1
182 | ... firsttime = 0
183 | ... else:
184 | ... count = count + 1
185 | ... minimum = min(x, minimum)
186 | ... maximum = max(x, maximum)
187 | ... #
188 | ... return [[minimum, maximum, count]]
189 | ...
190 | >>> result = rdd.mapPartitions(min_max_count)
191 | >>> result.collect()
192 | [[1, 3, 3], [4, 6, 3], [7, 10, 4]]
193 | >>>
194 |
195 | >>> data
196 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16]
197 | >>> rdd = spark.sparkContext.parallelize(data, 3)
198 | >>>
199 | >>>
200 | >>> result = rdd.mapPartitions(min_max_count)
201 | >>> result.collect()
202 | [[3, 34, 4], [7, 91, 4], [12, 16, 5]]
203 | >>> rdd.foreachPartition(f)
204 | 12
205 | 13
206 | 14
207 | 15
208 | 16
209 | ===
210 | 7
211 | 9
212 | 91
213 | 77
214 | ===
215 | 12
216 | 34
217 | 3
218 | 5
219 | ===
220 | >>>
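
A condensed sketch of the working version above. The main point of the session is that mapPartitions() must return an iterable, so the per-partition summary is wrapped in a list; the None initialization is a small addition to keep the sketch self-contained.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("min-max-count").getOrCreate()

def min_max_count(iterator):
    # summarize one partition as a single [min, max, count] element
    first_time = True
    minimum = maximum = count = None
    for x in iterator:
        if first_time:
            minimum = maximum = x
            count = 1
            first_time = False
        else:
            count += 1
            minimum = min(x, minimum)
            maximum = max(x, maximum)
    # wrap in a list so mapPartitions() yields one element per partition
    return [[minimum, maximum, count]]

data = [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16]
rdd = spark.sparkContext.parallelize(data, 3)
print(rdd.mapPartitions(min_max_count).collect())   # [[3, 34, 4], [7, 91, 4], [12, 16, 5]] in the session above
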
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-10-09.txt:
--------------------------------------------------------------------------------
1 | /spark-2.4.4 $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | 19/10/09 18:57:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
7 | Setting default log level to "WARN".
8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
14 | /_/
15 |
16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
17 | SparkSession available as 'spark'.
18 | >>>
19 | >>>
20 | >>>
21 | >>>
22 | >>> numbers = [1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
23 | >>> numbers
24 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
25 | >>> rdd = spark.sparkContext.parallelize(numbers)
26 | >>> rdd.collect()
27 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
28 | >>> rdd.count()
29 | 10
30 | >>> rdd2 = rdd.filter(lambda x : x > 3)
31 | >>> rdd2.collect()
32 | [4, 4, 5, 6]
33 | >>>
34 | >>>
35 | >>> def custom_filter(x):
36 | ... if x > 3:
37 | ... return True
38 | ... else:
39 | ... return False
40 | ... ^D
41 | >>>
42 | >>> x = custom_filter(10)
43 | >>> x
44 | True
45 | >>> x = custom_filter(2)
46 | >>> x
47 | False
48 | >>> rdd3 = rdd.filter(custom_filter)
49 | >>> rdd3.collect()
50 | [4, 4, 5, 6]
51 | >>> rdd2.collect()
52 | [4, 4, 5, 6]
53 | >>>
54 | >>>
55 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2),('B', 7)]
56 | >>> data
57 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
58 | >>>
59 | >>> rdd = spark.sparkContext.parallelize(data)
60 | >>> rdd.collect()
61 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
62 | >>>
63 | >>>
64 | >>>
65 | >>>
66 | >>> total = rdd.reduceByKey(lambda x, y: x+y)
67 | >>> total.collect()
68 | [('B', 9), ('A', 14)]
69 | >>>
70 | >>>
71 | >>> rdd.collect()
72 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
73 | >>> grouped = rdd.groupByKey()
74 | >>> grouped.collect()
75 | [
76 | ('B', <pyspark.resultiterable.ResultIterable object at 0x...>),
77 | ('A', <pyspark.resultiterable.ResultIterable object at 0x...>)
78 | ]
79 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect()
80 | [('B', [2, 7]), ('A', [2, 3, 4, 5])]
81 | >>> total2 = grouped.map(lambda x: (x[0], sum(x[1])))
82 | >>> total2.collect()
83 | [('B', 9), ('A', 14)]
84 | >>>
85 |
86 | >>>
87 | >>> spark
88 | <pyspark.sql.session.SparkSession object at 0x...>
89 | >>> numbers = [-1, 2, 3, -55, 88, 99, -99, 66, 777]
90 | >>> numbers
91 | [-1, 2, 3, -55, 88, 99, -99, 66, 777]
92 | >>> rdd = spark.sparkContext.parallelize(numbers)
93 | >>> rdd.collect()
94 | [-1, 2, 3, -55, 88, 99, -99, 66, 777]
95 | >>>
96 | >>> positives = rdd.filter(lambda x : x > 0)
97 | >>> positives.collect()
98 | [2, 3, 88, 99, 66, 777]
99 | >>>
100 | >>> negatives = rdd.filter(lambda x : x < 0)
101 | >>> negatives.collect()
102 | [-1, -55, -99]
103 | >>> def keep_positives(n):
104 | ... if (n > 0):
105 | ... return True
106 | ... else:
107 | ... return False
108 | ... ^D
109 | >>>
110 | >>> a = keep_positives(100)
111 | >>> a
112 | True
113 | >>> a = keep_positives(-9)
114 | >>> a
115 | False
116 | >>> pos2 = rdd.filter(keep_positives)
117 | >>> pos2.collect()
118 | [2, 3, 88, 99, 66, 777]
119 | >>> pos2222 = pos2.filter(lambda x : True)
120 | >>> pos2222.collect()
121 | [2, 3, 88, 99, 66, 777]
122 | >>>
123 | >>>
124 | >>> pairs = [('A', 2), ('A', 3), ('A', 4),('A', 5), ('A', 6), ('B', 10), ('B', 2)]
125 | >>> pairs
126 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
127 | >>>
128 | >>>
129 | >>> rdd = spark.sparkContext.parallelize(pairs)
130 | >>> rdd.collect()
131 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
132 | >>> totals = rdd.reduceByKey(lambda a, b : a+b)
133 | >>> result = totals.collect()
134 | >>> result
135 | [('B', 12), ('A', 20)]
136 | >>>
137 | >>>
138 | >>> rdd.collect()
139 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
140 | >>> grouped = rdd.groupByKey()
141 | >>> grouped.collect()
142 | [
143 | ('B', <pyspark.resultiterable.ResultIterable object at 0x...>),
144 | ('A', <pyspark.resultiterable.ResultIterable object at 0x...>)
145 | ]
146 | >>>
147 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect()
148 | [('B', [10, 2]), ('A', [2, 3, 4, 5, 6])]
149 | >>>
150 | >>> sum2 = grouped.map(lambda x: (x[0], sum(x[1])))
151 | >>> sum2.collect()
152 | [('B', 12), ('A', 20)]
153 | >>>
154 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2019-10-16.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
6 | Setting default log level to "WARN".
7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
8 | Welcome to
9 | ____ __
10 | / __/__ ___ _____/ /__
11 | _\ \/ _ \/ _ `/ __/ '_/
12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
13 | /_/
14 |
15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
16 | SparkSession available as 'spark'.
17 | >>>
18 | >>> a =[ 1, 2, 3, 4, 5]
19 | >>> rdd = spark.sparkContext.parallelize(a)
20 | >>> rdd.collect()
21 | [1, 2, 3, 4, 5]
22 | >>> rdd.count()
23 | 5
24 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y)
25 | >>> sumofvalues
26 | 15
27 | >>>
28 | >>> product = rdd.reduce(lambda x, y: x*y)
29 | >>> product
30 | 120
31 | >>> z = [ "1", "2", "3", "4", "5", "6", "7"]
32 | >>> rdd = spark.sparkContext.parallelize(z)
33 | >>> rdd.collect()
34 | ['1', '2', '3', '4', '5', '6', '7']
35 | >>> concat = rdd.reduce(lambda x, y: x+y)
36 | >>> concat
37 | '1234567'
38 | >>>
39 | >>> [ "1", "2", "3", "4", "5", "6", "7"]
40 | ['1', '2', '3', '4', '5', '6', '7']
41 | >>> z = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b"]
42 | >>>
43 | >>>
44 | >>> z
45 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
46 | >>> rdd = spark.sparkContext.parallelize(z, 3)
47 | >>> rdd.collect()
48 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
49 | >>> concat = rdd.reduce(lambda x, y: x+y)
50 | >>> concat
51 | '123456789ab'
52 | >>> rdd = spark.sparkContext.parallelize(z, 10)
53 | >>> concat = rdd.reduce(lambda x, y: x+y)
54 | >>> concat
55 | '123456789ab'
56 |
57 |
58 | >>>
59 | >>> nums = [1, 3, 5, 4, 2, 1, 0, 9, 10]
60 | >>> nums
61 | [1, 3, 5, 4, 2, 1, 0, 9, 10]
62 | >>> rdd = spark.sparkContext.parallelize(nums)
63 | >>> rdd.collect()
64 | [1, 3, 5, 4, 2, 1, 0, 9, 10]
65 | >>> rdd.count()
66 | 9
67 | >>> sumvalues = rdd.reduce(lambda a, b: a+b)
68 | >>> sumvalues
69 | 35
70 |
71 | >>> product = rdd.reduce(lambda a, b: a*b)
72 | >>> product
73 | 0
74 | >>> nums = [1, 3, 5, 4, 2, 1, 30, 9, 10]
75 | >>> rdd = spark.sparkContext.parallelize(nums)
76 | >>> sumvalues = rdd.reduce(lambda a, b: a+b)
77 | >>> sumvalues
78 | 65
79 | >>> product = rdd.reduce(lambda a, b: a*b)
80 | >>> product
81 | 324000
82 | >>> rdd.collect()
83 | [1, 3, 5, 4, 2, 1, 30, 9, 10]
84 |
85 | >>> strs = ["1", "3", "5", "4", "2", "1"]
86 | >>> strs
87 | ['1', '3', '5', '4', '2', '1']
88 | >>> rdd = spark.sparkContext.parallelize(strs)
89 | >>> concat = rdd.reduce(lambda a, b: a+b)
90 | >>> concat
91 | '135421'
92 |
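
A short recap as a sketch: reduce() folds the elements pairwise with the given function, first within each partition and then across partitions, which is why the sum and product above do not depend on the number of partitions (per the Spark docs, the function should be commutative and associative). The app name below is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("reduce-demo").getOrCreate()

nums = [1, 3, 5, 4, 2, 1, 30, 9, 10]
rdd = spark.sparkContext.parallelize(nums, 3)

total = rdd.reduce(lambda a, b: a + b)     # 65, as in the session above
product = rdd.reduce(lambda a, b: a * b)   # 324000, as in the session above
print(total, product)
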
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-01-22.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | Welcome to
6 | ____ __
7 | / __/__ ___ _____/ /__
8 | _\ \/ _ \/ _ `/ __/ '_/
9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
10 | /_/
11 |
12 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
13 | SparkSession available as 'spark'.
14 | >>>
15 | >>>
16 | >>>
17 | >>> spark
18 | <pyspark.sql.session.SparkSession object at 0x...>
19 | >>>
20 | >>> sc = spark.sparkContext
21 | >>>
22 | >>> sc
23 | <SparkContext master=local[*] appName=PySparkShell>
24 | >>>
25 | >>>
26 | >>> numbers = [1, 2, 3, 4, 5, 6, -1, -2]
27 | >>> numbers
28 | [1, 2, 3, 4, 5, 6, -1, -2]
29 | >>> len(numbers)
30 | 8
31 | >>> rdd = sc.parallelize(numbers)
32 | >>> rdd.collect()
33 | [1, 2, 3, 4, 5, 6, -1, -2]
34 | >>> rdd.count()
35 | 8
36 | >>> rdd
37 | ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195
38 | >>>
39 | >>> rdd_pos = rdd.filter(lambda x: x > 0)
40 | >>> rdd_pos.collect()
41 | [1, 2, 3, 4, 5, 6]
42 |
43 | >>> rdd_pos.count()
44 | 6
45 | >>>
46 | >>> rdd_pos.collect()
47 | [1, 2, 3, 4, 5, 6]
48 | >>>
49 | >>> sum_of_all = rdd_pos.reduce(lambda x, y: x+y)
50 | >>> sum_of_all
51 | 21
52 | >>> rdd_pos.take(2)
53 | [1, 2]
54 | >>>
55 | >>>
56 | >>> rdd.collect()
57 | [1, 2, 3, 4, 5, 6, -1, -2]
58 | >>> rdd.count()
59 | 8
60 | >>> rdd4 = rdd.map(lambda x : x+100)
61 | >>> rdd4.collect()
62 | [101, 102, 103, 104, 105, 106, 99, 98]
63 | >>>
64 | >>>
65 | >>>
66 | >>> kv = [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)]
67 | >>> kv
68 | [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)]
69 | >>> len(kv)
70 | 5
71 | >>> key_value_pairs = sc.parallelize(kv)
72 | >>> key_value_pairs.count()
73 | 5
74 | >>> key_value_pairs.collect()
75 | [
76 | ('alex', 2),
77 | ('alex', 20),
78 | ('alex', 40),
79 | ('jane', 100),
80 | ('jane', 400)
81 | ]
82 | >>>
83 | >>>
84 | >>> grouped = key_value_pairs.groupByKey()
85 | >>> grouped.collect()
86 | [
87 | ('alex', <pyspark.resultiterable.ResultIterable object at 0x...>),
88 | ('jane', <pyspark.resultiterable.ResultIterable object at 0x...>)
89 | ]
90 | >>>
91 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect()
92 | [
93 | ('alex', [2, 20, 40]),
94 | ('jane', [100, 400])
95 | ]
96 | >>> grouped_sum = grouped.map(lambda x: (x[0], sum(x[1])))
97 | >>> grouped_sum.collect()
98 | [
99 | ('alex', 62),
100 | ('jane', 500)
101 | ]
102 | >>>
103 | >>>
104 | >>> grouped.collect()
105 | [
106 | ('alex', <pyspark.resultiterable.ResultIterable object at 0x...>),
107 | ('jane', <pyspark.resultiterable.ResultIterable object at 0x...>)
108 | ]
109 | >>> grouped_sum_2 = grouped.mapValues(lambda x: sum(x))
110 | >>> grouped_sum_2.collect()
111 | [
112 | ('alex', 62),
113 | ('jane', 500)
114 | ]
115 | >>>
116 |
117 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-01-24.txt:
--------------------------------------------------------------------------------
1 | How to read a text file and convert it into an RDD[String]
2 |
3 | $ cat /tmp/books.txt
4 | ISBN-100,sales,biology
5 | IS-01235,sales,econ
6 | ISBN-101,sales,econ
7 | ISBN-102,sales,biology
8 | ISBN-109,econ,sales
9 | ISBN-103,CS,sales
10 | ISBN-104,CS,biology
11 | ISBN-105,CS,econ
12 | ISBN-200,CS
13 |
14 | $ ./bin/pyspark
15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
16 | [Clang 6.0 (clang-600.0.57)] on darwin
17 | Welcome to
18 | ____ __
19 | / __/__ ___ _____/ /__
20 | _\ \/ _ \/ _ `/ __/ '_/
21 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
22 | /_/
23 |
24 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
25 | SparkSession available as 'spark'.
26 | >>>
27 | >>>
28 | >>>
29 | >>> spark
30 | <pyspark.sql.session.SparkSession object at 0x...>
31 | >>>
32 | >>>
33 | >>>
34 | >>> input_path = "/tmp/books.txt"
35 | >>>
36 | >>> records = spark.sparkContext.textFile(input_path)
37 | >>> records.collect()
38 | [
39 | 'ISBN-100,sales,biology',
40 | 'IS-01235,sales,econ',
41 | 'ISBN-101,sales,econ',
42 | 'ISBN-102,sales,biology',
43 | 'ISBN-109,econ,sales',
44 | 'ISBN-103,CS,sales',
45 | 'ISBN-104,CS,biology',
46 | 'ISBN-105,CS,econ',
47 | 'ISBN-200,CS'
48 | ]
49 | >>> records.count()
50 | 9
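
A possible next step, sketched here (not part of the session): tokenize each record with split(",") the same way the later sessions do, turning the RDD[String] into an RDD of token lists.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("read-books").getOrCreate()

input_path = "/tmp/books.txt"
records = spark.sparkContext.textFile(input_path)     # RDD[String]

# "ISBN-100,sales,biology" -> ['ISBN-100', 'sales', 'biology']
tokenized = records.map(lambda rec: rec.split(","))
print(tokenized.take(2))
# e.g. [['ISBN-100', 'sales', 'biology'], ['IS-01235', 'sales', 'econ']]
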
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-02-03.txt:
--------------------------------------------------------------------------------
1 | mparsian@Mahmouds-MacBook ~/spark-2.4.4 $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | Welcome to
4 | ____ __
5 | / __/__ ___ _____/ /__
6 | _\ \/ _ \/ _ `/ __/ '_/
7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
8 | /_/
9 |
10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
11 | SparkSession available as 'spark'.
12 | >>>
13 | >>>
14 | >>> numbers = [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
15 | >>>
16 | >>>
17 | >>> numbers
18 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
19 | >>> len(numbers)
20 | 56
21 | >>> rdd = spark.sparkContext.parallelize(numbers)
22 | >>> rdd.count()
23 | 56
24 | >>> rdd.collect()
25 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30]
26 | >>>
27 | >>>
28 | >>> def min_max_count(partition):
29 | ... first_time = True
30 | ... count = 0
31 | ... for n in partition:
32 | ... count += 1
33 | ... if first_time == True:
34 | ... min2 = n
35 | ... max2 = n
36 | ... first_time = False
37 | ... else:
38 | ... min2 = min(min2, n)
39 | ... max2 = max(max2, n)
40 | ... return (min2, max2, count)
41 | ...
42 | >>>
43 | >>> target = rdd.mapPartitions(min_max_count)
44 | >>> target.count()
45 | 24
46 | >>> target.collect()
47 | [-2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7, -2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7]
48 | >>>
49 | >>>
50 | >>> def min_max_count(partition):
51 | ... first_time = True
52 | ... count = 0
53 | ... for n in partition:
54 | ... count += 1
55 | ... if first_time == True:
56 | ... min2 = n
57 | ... max2 = n
58 | ... first_time = False
59 | ... else:
60 | ... min2 = min(min2, n)
61 | ... max2 = max(max2, n)
62 | ... return [(min2, max2, count)]
63 | ...
64 | >>>
65 | >>> target = rdd.mapPartitions(min_max_count)
66 | >>> target.collect()
67 | [(-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7), (-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7)]
68 | >>>
69 | >>> rdd.getNumPartitions()
70 | 8
71 | >>> rdd = spark.sparkContext.parallelize(numbers, 4)
72 | >>> rdd.getNumPartitions()
73 | 4
74 | >>> target = rdd.mapPartitions(min_max_count)
75 | >>> target.collect()
76 | [(-2, 3, 14), (-20, 30, 14), (-2, 3, 14), (-20, 30, 14)]
77 | >>>
78 | >>>
79 | >>>
80 | >>> def add_t3(x, y):
81 | ... count = x[2] + y[2]
82 | ... min2 = min(x[0], y[0])
83 | ... max2 = max(x[1], y[1])
84 | ... return (min2, max2, count)
85 | ...
86 | >>>
87 | >>> add_t3( (2, 5, 40), (7, 50, 60))
88 | (2, 50, 100)
89 | >>> final_result = target.reduce(add_t3)
90 | >>> final_result
91 | (-20, 30, 56)
92 | >>>
93 | >>>
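
The whole pipeline from this session in one sketch: mapPartitions() reduces each partition to a (min, max, count) triple, and reduce() merges the triples. The shorter numbers list below is a stand-in for the session's 56-element list.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("min-max-count-reduce").getOrCreate()

def min_max_count(partition):
    first_time = True
    count = 0
    for n in partition:
        count += 1
        if first_time:
            min2 = n
            max2 = n
            first_time = False
        else:
            min2 = min(min2, n)
            max2 = max(max2, n)
    # wrap the triple in a list so mapPartitions() gets an iterable back
    return [(min2, max2, count)]

def add_t3(x, y):
    # merge two (min, max, count) triples into one
    return (min(x[0], y[0]), max(x[1], y[1]), x[2] + y[2])

numbers = [1, 2, -2, 3] * 10 + [10, 20, -20, 30]   # stand-in data for this sketch
rdd = spark.sparkContext.parallelize(numbers, 4)

per_partition = rdd.mapPartitions(min_max_count)   # one triple per partition
final_result = per_partition.reduce(add_t3)
print(final_result)                                # (-20, 30, 44) for this stand-in list
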
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-04-16.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 |
6 | Welcome to
7 | ____ __
8 | / __/__ ___ _____/ /__
9 | _\ \/ _ \/ _ `/ __/ '_/
10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
11 | /_/
12 |
13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
14 | SparkSession available as 'spark'.
15 | >>>
16 | >>> spark
17 | <pyspark.sql.session.SparkSession object at 0x...>
18 | >>>
19 | >>> input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt'
20 | >>> input_path
21 | '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt'
22 | >>>
23 | >>> recs = spark.sparkContext.textFile(input_path)
24 | >>>
25 | >>> recs.collect()
26 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped']
27 | >>> recs.count()
28 | 3
29 | >>>
30 | >>>
31 | >>> words = recs.map(lambda r: r.split(" "))
32 | >>> words.collect()
33 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence'], ['red', 'fox', 'jumped']]
34 | >>>
35 | >>> words.take(1)
36 | [['red', 'fox', 'jumped', 'high']]
37 | >>> words.take(2)
38 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence']]
39 | >>> # recs : RDD[String]
40 | ...
41 | >>> # words : RDD[[String]]
42 | ...
43 | >>> x = "fox jumped"
44 | >>> y = x.split(" ")
45 | >>> y
46 | ['fox', 'jumped']
47 | >>>
48 | >>>
49 | >>> single_words = words.flatMap(lambda x: x)
50 | >>> single_words.collect()
51 | ['red', 'fox', 'jumped', 'high', 'fox', 'jumped', 'over', 'high', 'fence', 'red', 'fox', 'jumped']
52 | >>> words.count()
53 | 3
54 | >>> single_words.count()
55 | 12
56 | >>> # single_words : RDD[String]
57 | ...
58 | >>>
59 | >>> pairs = single_words.map(lambda x : (x, 1))
60 | >>> pairs.collect()
61 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)]
62 | >>>
63 | >>> pairs.collect()
64 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)]
65 | >>> freq = pairs.reduceByKey(lambda a, b : a+b)
66 | >>> freq.collect()
67 | [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)]
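
The same word count collapsed into one chain; flatMap() does the split-and-flatten in a single step, exactly as in the flatMap sessions above (path as used in this session, app name illustrative).

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("word-count").getOrCreate()

input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt'
recs = spark.sparkContext.textFile(input_path)

freq = recs.flatMap(lambda r: r.split(" ")) \
           .map(lambda w: (w, 1)) \
           .reduceByKey(lambda a, b: a + b)

print(freq.collect())
# [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] in the session above
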
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-04-23.txt:
--------------------------------------------------------------------------------
1 | ~/spark-2.4.4 $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | Welcome to
4 | ____ __
5 | / __/__ ___ _____/ /__
6 | _\ \/ _ \/ _ `/ __/ '_/
7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4
8 | /_/
9 |
10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
11 | SparkSession available as 'spark'.
12 | >>> data = [ [1, 2, 3], [4, 5], [], [10] ]
13 | >>> data
14 | [[1, 2, 3], [4, 5], [], [10]]
15 | >>> len(data)
16 | 4
17 | >>> rdd = spark.sparkContext.parallelize(data)
18 | >>> rdd.collect()
19 | [[1, 2, 3], [4, 5], [], [10]]
20 | >>> rdd.count()
21 | 4
22 | >>> rdd2 = rdd.map(lambda x: x)
23 | >>> rdd2.count()
24 | 4
25 | >>> rdd2.collect()
26 | [[1, 2, 3], [4, 5], [], [10]]
27 | >>>
28 | >>> rdd3 = rdd.flatMap(lambda x: x)
29 | >>> rdd3.count()
30 | 6
31 | >>> rdd3.collect()
32 | [1, 2, 3, 4, 5, 10]
33 | >>>
34 | >>> data2 = [ [1, 2, 3, [44, 55] ], [4, 5], [], [10] ]
35 | >>> rdd4 = spark.sparkContext.parallelize(data2)
36 | >>> rdd4.collect()
37 | [[1, 2, 3, [44, 55]], [4, 5], [], [10]]
38 | >>> rdd5 = rdd4.flatMap(lambda x: x)
39 | >>> rdd5.collect()
40 | [1, 2, 3, [44, 55], 4, 5, 10]
41 | >>>
42 | >>>
43 | >>> data = [1, 2, 3, 4, 5, 6]
44 | >>> rdd = spark.sparkContext.parallelize(data)
45 | >>> rdd.collect()
46 | [1, 2, 3, 4, 5, 6]
47 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y)
48 | >>> sumofvalues
49 | 21
50 | >>> sumofvalues = rdd.reduce(lambda x, y: x*y)
51 | >>> sumofvalues
52 | 720
53 | >>> tuples2 = [(1,20), (3,40), (5,60)]
54 | >>> rdd = spark.sparkContext.parallelize(tuples2)
55 | >>> rdd.collect()
56 | [(1, 20), (3, 40), (5, 60)]
57 | >>> rdd.count()
58 | 3
59 | >>> sum2 = rdd.rduce(lambda x, y: (x[0]+y[0], x[1]+y[1]))
60 | Traceback (most recent call last):
61 |   File "<stdin>", line 1, in <module>
62 | AttributeError: 'RDD' object has no attribute 'rduce'
63 | >>> sum2 = rdd.reduce(lambda x, y: (x[0]+y[0], x[1]+y[1]))
64 | >>> sum2
65 | (9, 120)
66 | >>>
67 | >>>
68 | >>> kv = [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)]
69 | >>> kv
70 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)]
71 | >>> len(kv)
72 | 6
73 | >>> rdd = spark.sparkContext.parallelize(kv)
74 | >>> rdd.collect()
75 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)]
76 | >>> rdd.count()
77 | 6
78 | >>> sum_by_key = rdd.reduceByKey(lambda x, y: x+y)
79 | >>> sum_by_key.collect()
80 | [('B', 30), ('C', 7), ('A', 9)]
81 | >>>
82 | >>>
83 | >>>
84 | >>> grouped = rdd.groupByKey()
85 | >>> grouped.collect()
86 | [('B', <pyspark.resultiterable.ResultIterable object at 0x...>), ('C', <pyspark.resultiterable.ResultIterable object at 0x...>), ('A', <pyspark.resultiterable.ResultIterable object at 0x...>)]
87 | >>> grouped.mapValues(lambda iter: list(iter)).collect()
88 | [('B', [10, 20]), ('C', [7]), ('A', [2, 3, 4])]
89 | >>>
90 | >>> sum_of_values_2 = grouped.mapValues(lambda iter: sum(iter))
91 | >>> sum_of_values_2.collect()
92 | [('B', 30), ('C', 7), ('A', 9)]
93 |
94 | ... # find average of values per key for a given rdd by groupByKey()
95 |
96 | ... # find average of values per key for a given rdd by reduceByKey() (both approaches are sketched below)
97 | ...
98 | >>>
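
For the two exercise comments above, a sketch of both approaches over the same kv data used in this session (app name is illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("avg-per-key").getOrCreate()

kv = [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)]
rdd = spark.sparkContext.parallelize(kv)

# 1) groupByKey(): average the grouped values directly
avg_grouped = rdd.groupByKey().mapValues(lambda values: sum(values) / len(values))

# 2) reduceByKey(): carry (sum, count) pairs, then divide
avg_reduced = (rdd.mapValues(lambda v: (v, 1))
                  .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
                  .mapValues(lambda t: t[0] / t[1]))

print(avg_grouped.collect())   # e.g. [('B', 15.0), ('C', 7.0), ('A', 3.0)]
print(avg_reduced.collect())
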
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-07-06-word-count.txt:
--------------------------------------------------------------------------------
1 | $ cat /tmp/foxy.txt
2 | a Fox jumped high and high and jumped and jumped
3 | fox of red jumped fox of red jumped fox of red jumped
4 | oh no
5 | fox of blue jumped
6 | oh boy
7 | a Fox is a red fox of hen
8 | a fox is a high fox
9 | orange fox is high and blue and blue
10 |
11 | mparsian@usfc-olw-025011 ~/spark-3.0.0 $ ./bin/pyspark
12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
13 | [Clang 6.0 (clang-600.0.57)] on darwin
14 | Type "help", "copyright", "credits" or "license" for more information.
15 | 20/07/06 17:59:22 WARN Utils: Your hostname, Mahmouds-MacBook.local resolves to a loopback address: 127.0.0.1; using 10.0.0.93 instead (on interface en0)
16 | 20/07/06 17:59:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
17 | 20/07/06 17:59:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19 | Setting default log level to "WARN".
20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21 | Welcome to
22 | ____ __
23 | / __/__ ___ _____/ /__
24 | _\ \/ _ \/ _ `/ __/ '_/
25 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
26 | /_/
27 |
28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
29 | SparkSession available as 'spark'.
30 | >>>
31 | >>>
32 | >>> spark
33 | <pyspark.sql.session.SparkSession object at 0x...>
34 | >>>
35 | >>> input_path = '/tmp/foxy.txt'
36 | >>> input_path
37 | '/tmp/foxy.txt'
38 | >>>
39 | >>> recs = spark.sparkContext.textFile(input_path)
40 | >>> recs.count()
41 | 8
42 | >>> recs.collect()
43 | [
44 | 'a Fox jumped high and high and jumped and jumped',
45 | 'fox of red jumped fox of red jumped fox of red jumped',
46 | 'oh no',
47 | 'fox of blue jumped',
48 | 'oh boy',
49 | 'a Fox is a red fox of hen',
50 | 'a fox is a high fox',
51 | 'orange fox is high and blue and blue'
52 | ]
53 | >>>
54 | >>>
55 | >>>
56 | >>> splitted = recs.map(lambda x: x.split(" "))
57 | >>> splitted.count()
58 | 8
59 | >>> splitted.collect()
60 | [
61 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped'],
62 | ['fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped'],
63 | ['oh', 'no'],
64 | ['fox', 'of', 'blue', 'jumped'],
65 | ['oh', 'boy'],
66 | ['a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen'],
67 | ['a', 'fox', 'is', 'a', 'high', 'fox'],
68 | ['orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue']
69 | ]
70 | >>>
71 | >>>
72 | >>> words = splitted.flatMap(lambda x: x)
73 | >>> words.count()
74 | 52
75 | >>> words.collect()
76 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'oh', 'no', 'fox', 'of', 'blue', 'jumped', 'oh', 'boy', 'a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen', 'a', 'fox', 'is', 'a', 'high', 'fox', 'orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue']
77 | >>>
78 | >>>
79 | >>> pairs = words.map(lambda x : (x, 1))
80 | >>> pairs.collect()
81 | [('a', 1), ('Fox', 1), ('jumped', 1), ('high', 1), ('and', 1), ('high', 1), ('and', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('oh', 1), ('no', 1), ('fox', 1), ('of', 1), ('blue', 1), ('jumped', 1), ('oh', 1), ('boy', 1), ('a', 1), ('Fox', 1), ('is', 1), ('a', 1), ('red', 1), ('fox', 1), ('of', 1), ('hen', 1), ('a', 1), ('fox', 1), ('is', 1), ('a', 1), ('high', 1), ('fox', 1), ('orange', 1), ('fox', 1), ('is', 1), ('high', 1), ('and', 1), ('blue', 1), ('and', 1), ('blue', 1)]
82 | >>>
83 | >>>
84 | >>> freq = pairs.reduceByKey(lambda a, b: a+b)
85 | >>>
86 | >>> freq.collect()
87 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)]
88 | >>>
89 | >>>
90 | >>> grouped = pairs.groupByKey()
91 | >>> grouped.collect()
92 | [
93 | ('Fox', <pyspark.resultiterable.ResultIterable object at 0x...>),
94 | ('high', <pyspark.resultiterable.ResultIterable object at 0x...>),
95 | ('of', <pyspark.resultiterable.ResultIterable object at 0x...>),
96 | ('oh', <pyspark.resultiterable.ResultIterable object at 0x...>),
97 | ('no', <pyspark.resultiterable.ResultIterable object at 0x...>),
98 | ('boy', <pyspark.resultiterable.ResultIterable object at 0x...>),
99 | ('is', <pyspark.resultiterable.ResultIterable object at 0x...>),
100 | ('hen', <pyspark.resultiterable.ResultIterable object at 0x...>),
101 | ('orange', <pyspark.resultiterable.ResultIterable object at 0x...>),
102 | ('a', <pyspark.resultiterable.ResultIterable object at 0x...>),
103 | ('jumped', <pyspark.resultiterable.ResultIterable object at 0x...>),
104 | ('and', <pyspark.resultiterable.ResultIterable object at 0x...>),
105 | ('fox', <pyspark.resultiterable.ResultIterable object at 0x...>),
106 | ('red', <pyspark.resultiterable.ResultIterable object at 0x...>),
107 | ('blue', <pyspark.resultiterable.ResultIterable object at 0x...>)
108 | ]
109 | >>>
110 | >>> grouped.mapValues(lambda iter: list(iter)).collect()
111 | [
112 | ('Fox', [1, 1]),
113 | ('high', [1, 1, 1, 1]),
114 | ('of', [1, 1, 1, 1, 1]),
115 | ('oh', [1, 1]),
116 | ('no', [1]),
117 | ('boy', [1]),
118 | ('is', [1, 1, 1]),
119 | ('hen', [1]),
120 | ('orange', [1]),
121 | ('a', [1, 1, 1, 1, 1]),
122 | ('jumped', [1, 1, 1, 1, 1, 1, 1]),
123 | ('and', [1, 1, 1, 1, 1]),
124 | ('fox', [1, 1, 1, 1, 1, 1, 1, 1]),
125 | ('red', [1, 1, 1, 1]),
126 | ('blue', [1, 1, 1])
127 | ]
128 | >>> freq2 = grouped.mapValues(lambda iter: sum(iter))
129 | >>> freq2.collect()
130 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)]
131 | >>>
132 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-10-05.txt:
--------------------------------------------------------------------------------
1 | $ cat /tmp/foxy.txt
2 | a fox jumped and jumped
3 | red fox jumped high
4 | a red high fox jumped and jumped
5 | red fox is red
6 |
7 | $ ./bin/pyspark
8 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
9 | [Clang 6.0 (clang-600.0.57)] on darwin
10 | Type "help", "copyright", "credits" or "license" for more information.
11 |
12 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
13 | Setting default log level to "WARN".
14 | To adjust logging level use sc.setLogLevel(newLevel).
15 | For SparkR, use setLogLevel(newLevel).
16 | Welcome to
17 | ____ __
18 | / __/__ ___ _____/ /__
19 | _\ \/ _ \/ _ `/ __/ '_/
20 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
21 | /_/
22 |
23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
24 | SparkSession available as 'spark'.
25 | >>>
26 | >>>
27 | >>> numbers = [1, 2, 3, 4, 5, 6, 10]
28 | >>> numbers
29 | [1, 2, 3, 4, 5, 6, 10]
30 | >>>
31 | >>>
32 | >>> spark
33 | <pyspark.sql.session.SparkSession object at 0x...>
34 |
35 | >>># create a new RDD from a Python collection named numbers
36 | >>> rdd_numbers = spark.sparkContext.parallelize(numbers)
37 | >>> rdd_numbers.count()
38 | 7
39 |
40 | >>> rdd_numbers.collect()
41 | [1, 2, 3, 4, 5, 6, 10]
42 | >>> # rdd_numbers : RDD[Integer]
43 | ...
44 | >>> total = rdd_numbers.reduce(lambda x, y: x+y)
45 | >>> total
46 | 31
47 |
48 | >>># create a new RDD from rdd_numbers
49 | >>> tuples2 = rdd_numbers.map(lambda x: (x, x+1))
50 | >>> tuples2.count()
51 | 7
52 | >>> tuples2.collect()
53 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (10, 11)]
54 | >>>
55 | >>>
56 | >>> input_path = '/tmp/foxy.txt'
57 | >>># create a new RDD[String] from a given text file
58 | >>> recs = spark.sparkContext.textFile(input_path)
59 | >>> recs.collect()
60 | [
61 | 'a fox jumped and jumped',
62 | 'red fox jumped high',
63 | 'a red high fox jumped and jumped',
64 | 'red fox is red'
65 | ]
66 | >>> recs.count()
67 | 4
68 | >>> # recs : RDD[String]
69 |
70 | >>># create a new RDD[(String, Integer)]
71 | >>> recs_length = recs.map(lambda x : (x, len(x)))
72 | >>> recs_length.collect()
73 | [
74 | ('a fox jumped and jumped', 23),
75 | ('red fox jumped high', 19),
76 | ('a red high fox jumped and jumped', 32),
77 | ('red fox is red', 14)
78 | ]
79 | >>> # recs_length : RDD[(String, Integer)]
80 |
81 | >>># keep only records if their length is greater than 20
82 | >>> recs_gt_20 = recs.filter(lambda x: len(x) > 20)
83 | >>>
84 | >>> recs_gt_20.collect()
85 | [
86 | 'a fox jumped and jumped',
87 | 'a red high fox jumped and jumped'
88 | ]
89 | >>> recs_gt_20.count()
90 | 2
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-10-07.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
6 | Setting default log level to "WARN".
7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
8 | Welcome to
9 | ____ __
10 | / __/__ ___ _____/ /__
11 | _\ \/ _ \/ _ `/ __/ '_/
12 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
13 | /_/
14 |
15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
16 | SparkSession available as 'spark'.
17 | >>> spark
18 | <pyspark.sql.session.SparkSession object at 0x...>
19 | >>>
20 | >>>
21 | >>>
22 | >>> # create RDD[(String, Integer)]
23 | ...
24 | >>> key_value_pairs = [
25 | ...     ('alex', 10),
26 | ...     ('alex', 20),
27 | ...     ('alex', 30),
28 | ...     ('bob', 100),
29 | ...     ('bob', 200),
30 | ...     ('zazo', 7)
31 | ... ]
32 |
33 |
34 | >>> # create an RDD[(String, Integer)] from a python collection
35 | >>> key_value = spark.sparkContext.parallelize(key_value_pairs)
36 | >>> key_value.count()
37 | 6
38 | >>> key_value.collect()
39 | [
40 | ('alex', 10),
41 | ('alex', 20),
42 | ('alex', 30),
43 | ('bob', 100),
44 | ('bob', 200),
45 | ('zazo', 7)
46 | ]
47 | >>>
48 | >>>
49 | >>># use the reduceByKey() transformation
50 | >>> sum_of_values_per_key = key_value.reduceByKey(lambda x, y: x+y)
51 | >>>
52 | >>> sum_of_values_per_key.count()
53 | 3
54 | >>> sum_of_values_per_key.collect()
55 | [
56 | ('bob', 300),
57 | ('alex', 60),
58 | ('zazo', 7)
59 | ]
60 | >>>
61 | >>>
62 | >>>
63 | >>> filtered = sum_of_values_per_key.filter(lambda x: x[1] > 10)
64 | >>> filtered.collect()
65 | [('bob', 300), ('alex', 60)]
66 | >>>
67 | >>>
68 | >>> key_value.collect()
69 | [
70 | ('alex', 10),
71 | ('alex', 20),
72 | ('alex', 30),
73 | ('bob', 100),
74 | ('bob', 200),
75 | ('zazo', 7)
76 | ]
77 | >>>
78 | >>> grouped = key_value.groupByKey()
79 | >>> grouped.collect()
80 | [
81 | ('bob', <pyspark.resultiterable.ResultIterable object at 0x...>),
82 | ('alex', <pyspark.resultiterable.ResultIterable object at 0x...>),
83 | ('zazo', <pyspark.resultiterable.ResultIterable object at 0x...>)
84 | ]
85 | >>> grouped.mapValues(lambda v : list(v)).collect()
86 | [
87 | ('bob', [100, 200]),
88 | ('alex', [10, 20, 30]),
89 | ('zazo', [7])
90 | ]
91 | >>> sum_of_values_per_key_2 = grouped.mapValues(lambda values: sum(values))
92 | >>> sum_of_values_per_key_2.collect()
93 | [
94 | ('bob', 300),
95 | ('alex', 60),
96 | ('zazo', 7)
97 | ]
98 | >>>
99 | >>>
100 | >>> pairs = [('a', 10), ('a', 100), ('a', 200), ('b', 10)]
101 | >>> rdd = spark.sparkContext.parallelize(pairs)
102 | >>>
103 | >>> rdd.collect()
104 | [('a', 10), ('a', 100), ('a', 200), ('b', 10)]
105 | >>> rdd2 = rdd.mapValues(lambda v: v+1000)
106 | >>> rdd2.collect()
107 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)]
108 | >>>
109 | >>> rdd3 = rdd.map(lambda x: x[1]+1000)
110 | >>> rdd3.collect()
111 | [1010, 1100, 1200, 1010]
112 | >>>
113 | >>>
114 | >>> rdd3 = rdd.map(lambda x: (x[0], x[1]+1000))
115 | >>> rdd3.collect()
116 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)]
117 | >>>
118 | >>>
119 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob'] ]
120 | >>> rdd = spark.sparkContext.parallelize(data)
121 | >>> rdd.collect()
122 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']]
123 | >>> rdd.count()
124 | 5
125 | >>> flattened = rdd.flatMap(lambda x: x)
126 | >>> flattened.count()
127 | 6
128 | >>> flattened.collect()
129 | ['a', 'b', 'c', 'z', 'alex', 'bob']
130 | >>> mapped = rdd.map(lambda x: x)
131 | >>> mapped.count()
132 | 5
133 | >>> mapped.collect()
134 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']]
135 | >>>
136 | >>>
137 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ]
138 | >>> flattened2 = rdd.flatMap(lambda x: x)
139 | >>> flattened2.collect()
140 | ['a', 'b', 'c', 'z', 'alex', 'bob']
141 | >>>
142 | >>>
143 | >>>
144 | >>> data2 = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ]
145 | >>> data2
146 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')]
147 | >>> rdd2 = spark.sparkContext.parallelize(data2)
148 | >>>
149 | >>>
150 | >>> rdd2.collect()
151 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')]
152 | >>> rdd2.count()
153 | 5
154 | >>> flattened2 = rdd2.flatMap(lambda x: x)
155 | >>> flattened2.collect()
156 | ['a', 'b', 'c', 'z', 'alex', 'bob']
157 | >>>
158 | >>>
159 | >>> data3 = [ ['a', 'b', 'c'], ['z'], [], [], 'alex', 'bob' ]
160 | >>> rdd3 = spark.sparkContext.parallelize(data3)
161 | >>> flattened3 = rdd3.flatMap(lambda x: x)
162 | >>> flattened3.collect()
163 | ['a', 'b', 'c', 'z', 'a', 'l', 'e', 'x', 'b', 'o', 'b']
164 | >>>
165 |
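The last example above is the usual flatMap() gotcha: a plain string is itself iterable, so flatMap() expands it into characters. A minimal side-by-side sketch with the same data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("flatmap-gotcha").getOrCreate()
sc = spark.sparkContext

lists = sc.parallelize([['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']])
mixed = sc.parallelize([['a', 'b', 'c'], ['z'], [], [], 'alex', 'bob'])

# list elements are flattened element by element
print(lists.flatMap(lambda x: x).collect())
# ['a', 'b', 'c', 'z', 'alex', 'bob']

# string elements are flattened character by character
print(mixed.flatMap(lambda x: x).collect())
# ['a', 'b', 'c', 'z', 'a', 'l', 'e', 'x', 'b', 'o', 'b']
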
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-10-12.txt:
--------------------------------------------------------------------------------
1 | $ ls -l /tmp/data/
2 | -rw-r--r-- 1 mparsian wheel 72 Oct 12 20:00 file1
3 | -rw-r--r-- 1 mparsian wheel 94 Oct 12 20:01 file2
4 | -rw-r--r-- 1 mparsian wheel 35 Oct 12 20:01 file3
5 |
6 | $ cat /tmp/data/file1
7 | file1: this is record 1
8 | file1: this is record 2
9 | file1: this is record 3
10 |
11 | $ cat /tmp/data/file2
12 | file2: this is record 1
13 | file2: this is record 2
14 | file2: this is fox 3
15 | file2: this is it 4
16 |
17 | $ cat /tmp/data/file3
18 | file3: record 1
19 | file3: ewcord 2222
20 |
21 |
22 | $ ./bin/pyspark
23 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
24 | [Clang 6.0 (clang-600.0.57)] on darwin
25 | Type "help", "copyright", "credits" or "license" for more information.
26 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
27 | Setting default log level to "WARN".
28 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
29 | Welcome to
30 | ____ __
31 | / __/__ ___ _____/ /__
32 | _\ \/ _ \/ _ `/ __/ '_/
33 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
34 | /_/
35 |
36 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
37 | SparkSession available as 'spark'.
38 |
39 |
40 | >>> input_path = '/tmp/data'
41 | >>>
42 | >>> recs = spark.sparkContext.textFile(input_path)
43 | >>> recs.count()
44 | 9
45 | >>> recs.collect()
46 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3']
47 | >>>
48 | >>> union2 = recs.union(recs)
49 | >>> union2.count()
50 | 18
51 | >>> union2.collect()
52 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3', 'file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3']
53 |
54 |
55 |
56 | >>> records = [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)]
57 | >>>
58 | >>>
59 | >>> records
60 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)]
61 | >>>
62 | >>> recs_rdd = spark.sparkContext.parallelize(records)
63 | >>> recs_rdd.count()
64 | 6
65 | >>> recs_rdd.collect()
66 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)]
67 | >>> # recs_rdd: RDD[(String, Integer)]
68 | ...
69 | >>> sum_per_key = recs_rdd.reduceByKey(lambda x, y: x+y)
70 | >>> sum_per_key.count()
71 | 2
72 | >>> sum_per_key.collect()
73 | [('B', 90), ('A', 6)]
74 | >>> # avg_by_key: [('B', 30), ('A', 2)]
75 | ...
76 | >>>
77 |
78 | >>> sum_count = recs_rdd.mapValues(lambda v: (v, 1))
79 | >>>
80 | >>> sum_count.collect()
81 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))]
82 | >>>
83 | >>>
84 | >>> sum_count1 = (10, 1)
85 | >>> sum_count2 = (20, 2)
86 | >>> # (10+20, 1+2)
87 | ... # (30, 3)
88 | ...
89 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
90 | >>> sum_count_per_key.collect()
91 | [('B', (90, 3)), ('A', (6, 3))]
92 | >>>
93 | >>> avg_by_key = sum_count_per_key.mapValues(lambda sum_and_count_tuple : sum_and_count_tuple[0] / sum_and_count_tuple[1])
94 | >>> avg_by_key.count()
95 | 2
96 | >>> avg_by_key.collect()
97 | [('B', 30.0), ('A', 2.0)]
98 | >>>
99 |
100 |
101 | >>> sum_count.collect()
102 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))]
103 |
104 | >>> def add_sum_count(x, y):
105 | ... sum2 = x[0] + y[0]
106 | ... count = x[1] + y[1]
107 | ... return (sum2, count)
108 | ...
109 | >>>
110 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: add_sum_count(x, y))
111 | >>> sum_count_per_key.collect()
112 | [('B', (90, 3)), ('A', (6, 3))]
113 | >>> avg_per_key = sum_count_per_key.mapValues(lambda tuple: tuple[0] / tuple[1])
114 | >>> avg_per_key.collect()
115 | [('B', 30.0), ('A', 2.0)]
116 | >>>
117 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2020-10-19.txt:
--------------------------------------------------------------------------------
1 | mapPartitions() Explained.
2 |
3 |
4 | ./bin/pyspark
5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
6 | [Clang 6.0 (clang-600.0.57)] on darwin
7 | Type "help", "copyright", "credits" or "license" for more information.
8 | 20/10/19 20:19:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
9 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
10 | Setting default log level to "WARN".
11 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
12 | Welcome to
13 | ____ __
14 | / __/__ ___ _____/ /__
15 | _\ \/ _ \/ _ `/ __/ '_/
16 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
17 | /_/
18 |
19 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
20 | SparkSession available as 'spark'.
21 | >>> input_path = '/Users/mparsian/numbers'
22 | >>> rdd = spark.sparkContext.textFile(input_path)
23 | >>>
24 | >>> rdd.collect()
25 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45']
26 | >>> num_of_partitions = rdd.numPartitions()
27 | Traceback (most recent call last):
28 |   File "<stdin>", line 1, in <module>
29 | AttributeError: 'RDD' object has no attribute 'numPartitions'
30 | >>> num_of_partitions = rdd.getNumPartitions()
31 | >>>
32 | >>> num_of_partitions
33 | 2
34 | >>> rdd = spark.sparkContext.textFile(input_path, 4)
35 | >>> num_of_partitions = rdd.getNumPartitions()
36 | >>> num_of_partitions
37 | 5
38 | >>> rdd = spark.sparkContext.textFile(input_path, 4)
39 | >>> num_of_partitions = rdd.getNumPartitions()
40 | >>> num_of_partitions
41 | 5
42 | >>> def debug(iterator):
43 | ... elements = []
44 | ... for x in iterator:
45 | ... elements.append(x)
46 | ... print("elements="+ str(elements))
47 | ...
48 | >>>
49 | >>> rdd.foreachPartition(debug)
50 | elements=['78', '79', '60', '56', '45']
51 | elements=[]
52 | elements=['11', '14', '4', '3', '8', '9']
53 | elements=['3', '5', '55', '44', '9', '3', '66']
54 | elements=['77', '88', '34', '23']
55 | >>>
56 | >>>
57 | >>> rdd = spark.sparkContext.textFile(input_path)
58 | >>> rdd.colect()
59 | Traceback (most recent call last):
60 |   File "<stdin>", line 1, in <module>
61 | AttributeError: 'RDD' object has no attribute 'colect'
62 | >>> rdd.collect()
63 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45']
64 | >>> num_of_partitions = rdd.getNumPartitions()
65 | >>> num_of_partitions
66 | 2
67 | >>> rdd.foreachPartition(debug)
68 | elements=['14', '4', '3', '8', '9', '78', '79', '60', '56', '45']
69 | elements=['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11']
70 | >>>
71 | >>>
72 | >>>
73 | >>> def find_min_max(partition):
74 | ... first_time = False
75 | ... for n in partition:
76 | ... if first_time == False:
77 | ... min2 = n
78 | ... max2 = n
79 | ... first_time == True
80 | ... else:
81 | ... min2 = min(n, min2)
82 | ... max2 = max(n, max2)
83 | ... return [(min2, max2)]
84 | ...
85 | >>>
86 | >>> target = rdd.mapPartitions(find_min_max)
87 | >>> target.collect()
88 | [('11', '11'), ('45', '45')]
89 | >>>
90 | >>> rdd_integer = rdd.map(lambda n : int(n))
91 | >>> rdd_integer.collect()
92 | [3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11, 14, 4, 3, 8, 9, 78, 79, 60, 56, 45]
93 | >>> target = rdd.mapPartitions(find_min_max)
94 | >>> target.collect()
95 | [('11', '11'), ('45', '45')]
96 | >>>
97 | >>>
98 | >>> target = rdd_integer.mapPartitions(find_min_max)
99 | >>> target.collect()
100 | [(11, 11), (45, 45)]
101 | >>>
102 | >>>
103 | >>> def find_min_max(partition):
104 | ... first_time = False
105 | ... for n in partition:
106 | ... if first_time == False:
107 | ... min2 = n
108 | ... max2 = n
109 | ... first_time = True
110 | ... else:
111 | ... min2 = min(n, min2)
112 | ... max2 = max(n, max2)
113 | ... return [(min2, max2)]
114 | ...
115 | ...
116 | >>> def debug(iterator):
117 | ... elements = []
118 | ... for x in iterator:
119 | ... elements.append(x)
120 | ... print("elements="+ str(elements))
121 | ...
122 | >>>
123 | >>> target = rdd_integer.mapPartitions(find_min_max)
124 |
125 | >>> target.collect()
126 | [(3, 88), (3, 79)]
127 | >>> rdd_integer.foreachPartition(debug)
128 | elements=[14, 4, 3, 8, 9, 78, 79, 60, 56, 45]
129 | elements=[3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11]
130 | >>> target
131 | PythonRDD[14] at collect at <stdin>:1
132 | >>> final_min_max = target.reduce(lambda x, y: (min(x[0], y[0]), max(x[1], y[1])))
133 | >>> final_min_max
134 | (3, 88)
135 | >>>
136 |
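Two details from this session, captured in a sketch: the values read from the text file are strings, so they are mapped to int before comparing (string comparison is lexicographic, e.g. '9' > '78'), and an empty partition (visible in an elements=[] debug line above) would leave min2/max2 undefined, so the sketch guards for it. The guard is an addition, not part of the session.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("min-max-partitions").getOrCreate()

def find_min_max(partition):
    min2 = None
    max2 = None
    for n in partition:
        if min2 is None:
            min2 = max2 = n
        else:
            min2 = min(n, min2)
            max2 = max(n, max2)
    if min2 is None:
        return []              # empty partition: contribute nothing (guard added here)
    return [(min2, max2)]

# numbers are stored as text, so convert to int before comparing
rdd = spark.sparkContext.textFile('/Users/mparsian/numbers')
rdd_integer = rdd.map(lambda n: int(n))

per_partition = rdd_integer.mapPartitions(find_min_max)
final_min_max = per_partition.reduce(lambda x, y: (min(x[0], y[0]), max(x[1], y[1])))
print(final_min_max)   # (3, 88) in the session above
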
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-01-19.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | 21/01/19 20:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
7 | Setting default log level to "WARN".
8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
14 | /_/
15 |
16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
17 | SparkSession available as 'spark'.
18 | >>>
19 | >>>
20 | >>> tuples2 = [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
21 | >>> tuples2
22 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
23 | >>>
24 | >>>
25 | >>> pairs_rdd = spark.sparkContext.parallelize(tuples2)
26 | >>> pairs_rdd
27 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262
28 |
29 | >>> pairs_rdd.collect()
30 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
31 | >>> pairs_rdd.count()
32 | 5
33 | >>> tuples33 = [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
34 | >>> tuples33
35 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
36 | >>> rdd = spark.sparkContext.parallelize(tuples33)
37 | >>>
38 | >>> rdd.collect()
39 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
40 | >>> rdd.count()
41 | 3
42 | >>>
43 | >>>
44 | >>>
45 | >>> pairs_rdd.collect()
46 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
47 |
48 | >>> new_rdd = pairs_rdd.map(lambda x: (x[0], x[1], 2*int(x[1])))
49 | >>> new_rdd.collect()
50 | [('alex', 4, 8), ('alex', 5, 10), ('bob', 40, 80), ('bob', 50, 100), ('bob', 4, 8)]
51 | >>>
52 | >>> columns = ["name", "age", "salary"]
53 | >>> some_tuples = [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)]
54 | >>> df = spark.createDataFrame(some_tuples, columns)
55 | >>> df.show()
56 | +----+---+-------+
57 | |name|age| salary|
58 | +----+---+-------+
59 | |alex| 40| 80000|
60 | |alex| 50|1000000|
61 | | bob| 40|8000000|
62 | | bob| 50| 10000|
63 | +----+---+-------+
64 |
65 | >>> df.printSchema()
66 | root
67 | |-- name: string (nullable = true)
68 | |-- age: long (nullable = true)
69 | |-- salary: long (nullable = true)
70 |
71 | >>> rdd = spark.sparkContext.parallelize(some_tuples)
72 | >>> rdd.collect()
73 | [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)]
74 | >>> rdd.take(2)
75 | [('alex', 40, 80000), ('alex', 50, 1000000)]
76 | >>>
77 |
78 | >>>
79 | >>> data = ["alex,20", "alex,30", "bob,40", "bob,50", "bob,60"]
80 | >>> data
81 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60']
82 | >>>
83 | >>>
84 | >>> rdd = spark.sparkContext.parallelize(data)
85 | >>> rdd.collect()
86 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60']
87 | >>> rdd.count()
88 | 5
89 |
90 | >>> def create_pairs(rec):
91 | ... tokens = rec.split(",")
92 | ... key = tokens[0]
93 | ... value = tokens[1]
94 | ... return (key, value)
95 | ...
96 | >>>
97 | >>> pairs = rdd.map(lambda x: create_pairs(x))
98 | >>> pairs.collect()
99 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')]
100 | >>> pairs.count()
101 | 5
102 | >>> pairs = rdd.map(create_pairs)
103 | >>> pairs.collect()
104 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')]
105 | >>> pairs.count()
106 | 5
107 | >>>
108 | >>> sum_by_key = pairs.reduceByKey(lambda x, y: x+y)
109 | >>> sum_by_key.collect()
110 | [('bob', '405060'), ('alex', '2030')]
111 | >>>
112 | >>> def create_pair(rec):
113 | ... tokens = rec.split(",")
114 | ... key = tokens[0]
115 | ... value = int(tokens[1])
116 | ... return (key, value)
117 | ...
118 | >>>
119 |
120 | >>> rdd2 = rdd.map(lambda x: create_pair(x))
121 | >>> rdd2.collect()
122 | [('alex', 20), ('alex', 30), ('bob', 40), ('bob', 50), ('bob', 60)]
123 | >>> sum_by_key = rdd2.reduceByKey(lambda x, y: x+y)
124 | >>> sum_by_key.collect()
125 | [('bob', 150), ('alex', 50)]
126 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-01-26.txt:
--------------------------------------------------------------------------------
1 | Spark's Mapper Transformations:
2 |
3 | # map: 1 -> 1
4 |
5 | # flatMap: 1 -> Many
6 |
7 | # mapPartitions: partition -> 1 (Many to 1)
8 |
9 | Many = 0, 1, 2, 3, 4, ...
10 | partition = many elements
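
A minimal sketch of the three mappers on small data; the per-partition sum used for mapPartitions() is an illustrative choice, not taken from this session.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mappers").getOrCreate()
sc = spark.sparkContext

rdd = sc.parallelize([[1, 2, 3], [4, 5, 6, 7]])

# map: 1 -> 1 (every element produces exactly one output element)
print(rdd.map(lambda x: len(x)).collect())                    # [3, 4]

# flatMap: 1 -> Many (every element may produce 0, 1, or more output elements)
print(rdd.flatMap(lambda x: x).collect())                     # [1, 2, 3, 4, 5, 6, 7]

# mapPartitions: partition -> 1 (one output per whole partition; here a per-partition sum)
numbers = sc.parallelize([1, 2, 3, 4, 5, 6, 7], 2)
print(numbers.mapPartitions(lambda it: [sum(it)]).collect())  # e.g. [6, 22]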
11 |
12 | $ ./bin/pyspark
13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
14 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
15 | Welcome to
16 | ____ __
17 | / __/__ ___ _____/ /__
18 | _\ \/ _ \/ _ `/ __/ '_/
19 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
20 | /_/
21 |
22 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
23 | SparkSession available as 'spark'.
24 | >>>
25 | >>>
26 | >>> spark
27 | <pyspark.sql.session.SparkSession object at 0x...>
28 | >>> sc = spark.sparkContext
29 | >>> sc
30 | <SparkContext master=local[*] appName=PySparkShell>
31 | >>>
32 | >>>
33 | >>> data = [ [1, 2, 3], [4, 5, 6, 7] ]
34 | >>> data
35 | [[1, 2, 3], [4, 5, 6, 7]]
36 | >>> data[0]
37 | [1, 2, 3]
38 | >>> data[1]
39 | [4, 5, 6, 7]
40 | >>>
41 | >>> rdd = spark.sparkContext.parallelize(data)
42 | >>> rdd.collect()
43 | [[1, 2, 3], [4, 5, 6, 7]]
44 | >>> rdd.count()
45 | 2
46 | >>>
47 | >>> rdd_mapped = rdd.map(lambda x: x)
48 | >>> rdd_mapped.collect()
49 | [[1, 2, 3], [4, 5, 6, 7]]
50 | >>> rdd_mapped.count()
51 | 2
52 | >>>
53 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x)
54 | >>> rdd_flat_mapped.collect()
55 | [1, 2, 3, 4, 5, 6, 7]
56 | >>> rdd_flat_mapped.count()
57 | 7
58 | >>> data = [ [1, 2, 3], [], [4, 5, 6, 7], [], [9] ]
59 | >>> data
60 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]]
61 | >>> data[0]
62 | [1, 2, 3]
63 | >>> data[1]
64 | []
65 | >>> data[3]
66 | []
67 | >>> data[2]
68 | [4, 5, 6, 7]
69 | >>> data[3]
70 | []
71 | >>> data[4]
72 | [9]
73 | >>> rdd = spark.sparkContext.parallelize(data)
74 | >>> rdd.collect()
75 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]]
76 | >>> rdd.count()
77 | 5
78 | >>> rdd_mapped = rdd.map(lambda x: x)
79 | >>> rdd_mapped.collect()
80 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]]
81 | >>> rdd_mapped.count()
82 | 5
83 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x)
84 | >>> rdd_flat_mapped.collect()
85 | [1, 2, 3, 4, 5, 6, 7, 9]
86 | >>> rdd_flat_mapped.count()
87 | 8
88 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-04-12.txt:
--------------------------------------------------------------------------------
1 | ~/spark-3.1.1 $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 | 21/04/12 20:59:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
7 | Setting default log level to "WARN".
8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
14 | /_/
15 |
16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
17 | Spark context Web UI available at http://10.0.0.93:4040
18 | Spark context available as 'sc' (master = local[*], app id = local-1618286379380).
19 | SparkSession available as 'spark'.
20 | >>> spark
21 | <pyspark.sql.session.SparkSession object at 0x...>
22 | >>>
23 | >>>
24 | >>> numbers = [1, 2, 3, 6, 7, 8, 99, 10, -10, -30]
25 | >>> numbers
26 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30]
27 |
28 | >>># create an RDD[Integer] from a collection
29 | >>># RDD = Resilient Distributed Dataset
30 | >>> rdd = spark.sparkContext.parallelize(numbers)
31 | >>> rdd.collect()
32 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30]
33 | >>> rdd.count()
34 | 10
35 |
36 | >>># find the sum of all numbers in rdd (an RDD[Integer])
37 | >>> total = rdd.reduce(lambda x, y: x+y)
38 | >>> total
39 | 96
40 |
41 | >>>#apply a filter: find all positive numbers
42 | >>> positives = rdd.filter(lambda x : x > 0)
43 | >>> positives.collect()
44 | [1, 2, 3, 6, 7, 8, 99, 10]
45 | >>>
46 | >>># increment every element by 1000
47 | >>> rdd2 = rdd.map(lambda x : x+1000)
48 | >>> rdd2.collect()
49 | [1001, 1002, 1003, 1006, 1007, 1008, 1099, 1010, 990, 970]
50 | >>>
51 | >>># create (key, value) pairs
52 | >>> data = [("m1", 4), ("m1", 5), ("m2", 3), ("m2", 4), ("m2", 5), ("m3", 2), ("m3", 4)]
53 | >>> data
54 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)]
55 |
56 | >>>
57 | >>> pairs = spark.sparkContext.parallelize(data)
58 | >>> pairs.collect()
59 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)]
60 |
61 | >>># keep elements if their associated value is Greater Than 3
62 | >>># x[0] refers to key
63 | >>># x[1] refers to value
64 | >>> rating45 = pairs.filter(lambda x : x[1] > 3)
65 | >>> rating45.collect()
66 | [('m1', 4), ('m1', 5), ('m2', 4), ('m2', 5), ('m3', 4)]
67 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-04-14.txt:
--------------------------------------------------------------------------------
1 | $ cat /tmp/foxdata.txt
2 | a red fox jumped of high
3 | fox jumped over a high fence
4 | red of fox jumped
5 |
6 |
7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
8 | ...
9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
10 | Welcome to
11 | ____ __
12 | / __/__ ___ _____/ /__
13 | _\ \/ _ \/ _ `/ __/ '_/
14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
15 | /_/
16 |
17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
18 | Spark context Web UI available at http://10.0.0.93:4040
19 | Spark context available as 'sc' (master = local[*], app id = local-1618456720582).
20 | SparkSession available as 'spark'.
21 | >>>
22 | >>>
23 | >>>
24 | >>> spark
25 | <pyspark.sql.session.SparkSession object at 0x...>
26 |
27 | >>> input_path = "/tmp/foxdata.txt"
28 | >>> input_path
29 | '/tmp/foxdata.txt'
30 | >>> # Read input path and create an RDD[String]
31 | ...
32 | >>> records = spark.sparkContext.textFile(input_path)
33 | >>> records
34 | /tmp/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0
35 | >>>
36 | >>> records.collect()
37 | [
38 | 'a red fox jumped of high',
39 | 'fox jumped over a high fence',
40 | 'red of fox jumped'
41 | ]
42 | >>> records.count()
43 | 3
44 | >>> # tokenize records and create RDD[ [String] ]
45 | ...
46 | >>> tokenizd = records.map(lambda record: record.split(" "))
47 | >>> tokenizd.collect()
48 | [
49 | ['a', 'red', 'fox', 'jumped', 'of', 'high'],
50 | ['fox', 'jumped', 'over', 'a', 'high', 'fence'],
51 | ['red', 'of', 'fox', 'jumped']
52 | ]
53 | >>> tokenizd.count()
54 | 3
55 | >>> pairs = tokenizd.map(lambda word : (word, 1))
56 | >>> pairs.collect()
57 | [
58 | (['a', 'red', 'fox', 'jumped', 'of', 'high'], 1),
59 | (['fox', 'jumped', 'over', 'a', 'high', 'fence'], 1),
60 | (['red', 'of', 'fox', 'jumped'], 1)
61 | ]
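>>> # Note (added for clarity): each element of tokenizd is a list of words, so
>>> # the map() above pairs an entire list with 1; flatMap() is used below to
>>> # flatten the lists into individual words before pairing.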
62 | >>>
63 | >>> words = tokenizd.flatMap(lambda arr: arr)
64 | >>> words.collect()
65 | ['a', 'red', 'fox', 'jumped', 'of', 'high', 'fox', 'jumped', 'over', 'a', 'high', 'fence', 'red', 'of', 'fox', 'jumped']
66 | >>> words.count()
67 | 16
68 | >>> # words : RDD[String]
69 | ...
70 | >>> key_value_pairs = words.map(lambda word: (word, 1))
71 | >>> key_value_pairs.collect()
72 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)]
73 | >>>
74 | >>> # key_value_pairs: RDD[(String, Integer)]
75 | ...
76 | >>>
77 | >>> grouped = key_value_pairs.groupByKey()
78 | >>> grouped.collect()
79 | [
80 | ('of', <pyspark.resultiterable.ResultIterable object at 0x...>),
81 | ('high', <pyspark.resultiterable.ResultIterable object at 0x...>),
82 | ('fence', <pyspark.resultiterable.ResultIterable object at 0x...>),
83 | ('a', <pyspark.resultiterable.ResultIterable object at 0x...>),
84 | ('red', <pyspark.resultiterable.ResultIterable object at 0x...>),
85 | ('fox', <pyspark.resultiterable.ResultIterable object at 0x...>),
86 | ('jumped', <pyspark.resultiterable.ResultIterable object at 0x...>),
87 | ('over', <pyspark.resultiterable.ResultIterable object at 0x...>)
88 | ]
89 | >>>
90 | >>> debugged = grouped.mapValues(lambda values: list(values))
91 | >>> debugged.collect()
92 | [
93 | ('of', [1, 1]),
94 | ('high', [1, 1]),
95 | ('fence', [1]),
96 | ('a', [1, 1]),
97 | ('red', [1, 1]),
98 | ('fox', [1, 1, 1]),
99 | ('jumped', [1, 1, 1]),
100 | ('over', [1])
101 | ]
102 | >>>
103 | >>>
104 | >>> frequency = grouped.mapValues(lambda values: sum(values))
105 | >>> frequency.collect()
106 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)]
107 | >>>
108 | >>>
109 | >>>
110 | >>> key_value_pairs.collect()
111 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)]
112 | >>>
113 | >>>
114 | >>>
115 | >>> reduced = key_value_pairs.reduceByKey(lambda x, y: x+y)
116 | >>> reduced.collect()
117 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)]
118 | >>>
119 | >>> rdd7 = reduced.mapValues(lambda x: x+100)
120 | >>> rdd7.collect()
121 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)]
122 |
123 | >>> rdd77 = reduced.map(lambda x: x[1]+100)
124 | >>> rdd77.collect()
125 | [102, 102, 101, 102, 102, 103, 103, 101]
126 |
127 | >>> rdd77 = reduced.map(lambda x: (x[0], x[1]+100))
128 | >>> rdd77.collect()
129 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)]
130 | >>>
131 |
132 | >>># get number of partitions for rdd77
133 | >>> rdd77.getNumPartitions()
134 | 2
135 | >>>
136 | >>>
137 | >>> KV = [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)]
138 | >>> KV
139 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)]
140 | >>> rdd = spark.sparkContext.parallelize(KV)
141 | >>>
142 | >>> rdd.collect()
143 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)]
144 | >>> rdd.count()
145 | 7
146 | >>>
147 | >>> filtered1 = rdd.filter(lambda x : x[1] > 10)
148 | >>> filtered1.collect()
149 | [('y', 50), ('y', 60), ('y', 70)]
150 | >>> filtered2 = rdd.filter(lambda x : x[1] < 10)
151 | >>> filtered2.collect()
152 | [('x', 3), ('x', 5), ('x', 8), ('z', 3)]
153 | >>>
154 | >>>
155 | >>> added = rdd.reduceByKey(lambda a, b: a+b)
156 | >>> added.collect()
157 | [('y', 180), ('z', 3), ('x', 16)]
158 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-04-19.txt:
--------------------------------------------------------------------------------
1 | $ ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | [Clang 6.0 (clang-600.0.57)] on darwin
4 | Type "help", "copyright", "credits" or "license" for more information.
5 |
6 | 21/04/19 20:20:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
7 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
8 | Setting default log level to "WARN".
9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
10 | Welcome to
11 | ____ __
12 | / __/__ ___ _____/ /__
13 | _\ \/ _ \/ _ `/ __/ '_/
14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
15 | /_/
16 |
17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
18 | Spark context Web UI available at http://10.0.0.93:4040
19 | Spark context available as 'sc' (master = local[*], app id = local-1618888841845).
20 | SparkSession available as 'spark'.
21 | >>>
22 | >>># Create an RDD[(String, Integer)] as rdd
23 | >>> kv =[('a', 3), ('a', 4), ('a', 5), ('b', 30),('b', 40),('b', 50),('z', 3)]
24 | >>> rdd = spark.sparkContext.parallelize(kv)
25 | >>>
26 | >>>
27 | >>> rdd.count()
28 | 7
29 | >>> rdd.collect()
30 | [('a', 3), ('a', 4), ('a', 5), ('b', 30), ('b', 40), ('b', 50), ('z', 3)]
31 | >>> def mapfun1(e):
32 | ... k = e[0]
33 | ... v = e[1]
34 | ... return (k, (v, v+5))
35 | ...
36 | >>># Create an RDD[(String, (Integer, Integer))] as rdd2
37 | >>># rdd2 has key type of String and value type of (Integer, Integer)
38 | >>> rdd2 = rdd.map(mapfun1)
39 | >>>
40 | >>> rdd2.collect()
41 | [('a', (3, 8)), ('a', (4, 9)), ('a', (5, 10)), ('b', (30, 35)), ('b', (40, 45)), ('b', (50, 55)), ('z', (3, 8))]
42 | >>> rdd2.count()
43 | 7
44 | >>> # rdd: RDD[(String, Integer)]
45 | ...
46 | >>> # rdd2: RDD[(String, (Integer, Integer)]
47 | >>>
48 | >>># Create an RDD[(String, Integer)] as rdd3
49 | >>> rdd3 = rdd2.map(lambda x: (x[0], x[1][0]+x[1][1]))
50 | >>> rdd3.count()
51 | 7
52 | >>> rdd3.collect()
53 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)]
54 | >>>
55 | >>> rdd31 = rdd2.mapValues(lambda v: v[0]+v[1])
56 | >>> rdd31.count()
57 | 7
58 | >>> rdd31.collect()
59 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)]
60 | >>>
61 | >>>
62 | >>>
63 | >>> strings = ["abc", "xyzt", "", "123"]
64 | >>> rdd_strings = spark.sparkContext.parallelize(strings)
65 | >>>
66 | >>> rdd_strings.count()
67 | 4
68 | >>> rdd_strings_2 = rdd_strings.flatMap(lambda v: v)
69 | >>> rdd_strings_2.collect()
70 | ['a', 'b', 'c', 'x', 'y', 'z', 't', '1', '2', '3']
71 | >>>
72 | >>> lists = [ [1, 2, 3], [], [6,7,8,9,10], [] ]
73 | >>> rdd4 = spark.sparkContext.parallelize(lists)
74 | >>> rdd4.collect()
75 | [[1, 2, 3], [], [6, 7, 8, 9, 10], []]
76 | >>> rdd4.count()
77 | 4
78 |
79 | >>> rdd5 = rdd4.flatMap(lambda v: v)
80 | >>> rdd5.collect()
81 | [1, 2, 3, 6, 7, 8, 9, 10]
82 | >>> rdd5.count()
83 | 8
84 | >>>
85 | >>> lists = [ [7, (1,2), (2,4)], ["abc", 99], [6, (7, 7), (8, 8)], [] ]
86 | >>> rdd9 = spark.sparkContext.parallelize(lists)
87 | >>> rdd9.collect()
88 | [[7, (1, 2), (2, 4)], ['abc', 99], [6, (7, 7), (8, 8)], []]
89 | >>> rdd9.count()
90 | 4
91 | >>> rdd10 = rdd9.flatMap(lambda v: v)
92 | >>> rdd10.collect()
93 | [7, (1, 2), (2, 4), 'abc', 99, 6, (7, 7), (8, 8)]
94 | >>>
95 | >>>
96 | >>> rdd11 = rdd10.flatMap(lambda v: v)
97 | >>> rdd11.collect()
98 | 21/04/19 20:43:44 ERROR Executor: Exception in task 5.0 in stage 17.0 (TID 141)
99 | TypeError: 'int' object is not iterable
100 |
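>>> # Note (added for clarity): flatMap() requires every element to be iterable;
>>> # rdd10 mixes plain integers (such as 7 and 99) with tuples, and an integer
>>> # cannot be flattened, hence the TypeError above.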
101 | >>>
102 | >>> mylist = [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)]
103 | >>> rdd = spark.sparkContext.parallelize(mylist)
104 | >>> rdd.collect()
105 | [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)]
106 | >>> rdd2 = rdd.flatMap(lambda x: x)
107 | >>> rdd2.collect()
108 | [7, 1, 2, 2, 4, 'abc', 99, 6, 7, 7, 8, 8]
109 | >>>
110 | >>>
111 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-04-21-mapPartitions.txt:
--------------------------------------------------------------------------------
1 | ./bin/pyspark
2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
3 | ...
4 | Welcome to
5 | ____ __
6 | / __/__ ___ _____/ /__
7 | _\ \/ _ \/ _ `/ __/ '_/
8 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
9 | /_/
10 |
11 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
12 | Spark context Web UI available at http://10.0.0.93:4040
13 | Spark context available as 'sc' (master = local[*], app id = local-1619061713234).
14 | SparkSession available as 'spark'.
15 | >>>
16 | >>>
17 | >>>
18 | >>> nums = [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2]
19 | >>> nums
20 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2]
21 | >>>
22 | >>>
23 |
24 | >>> rdd = sc.parallelize(nums)
25 | >>> rdd.collect()
26 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2]
27 | >>># find the default number of partitions
28 | >>> rdd.getNumPartitions()
29 | 8
30 | >>>
31 | >>># set number of partitions explicitly to 3
32 | >>> rdd = sc.parallelize(nums, 3)
33 | >>> rdd.getNumPartitions()
34 | 3
35 | >>> def debug(partition):
36 | ... elements = []
37 | ... for x in partition:
38 | ... elements.append(x)
39 | ... print("elements=", elements)
40 | ...
41 | >>> rdd.foreachPartition(debug)
42 | elements= [4, 5, 6, 7, -3]
43 | elements= [1, 2, 3, 4, -1]
44 | elements= [-1, 2, 3, 9, -1, -2]
45 | >>>
46 | >>>#define a function which handles a single partition
47 | >>> def min_max_count(partition):
48 | ... first_time = False
49 | ... local_count = 0
50 | ... for n in partition:
51 | ... local_count += 1
52 | ... if (first_time == False):
53 | ... local_min = n
54 | ... local_max = n
55 | ... first_time = True
56 | ... else:
57 | ... local_min = min(n, local_min)
58 | ... local_max = max(n, local_max)
59 | ... return [(local_min, local_max, local_count)]
60 | ...
61 | >>># Test your custom function without Spark
62 | >>> x = [1, 2, 3, -3, -6, 9, 10, 4, 5, 6]
63 | >>> result = min_max_count(x)
64 | >>> result
65 | [(-6, 10, 10)]
66 | >>>
67 | >>> rdd.foreachPartition(debug)
68 | elements= [1, 2, 3, 4, -1]
69 | elements= [-1, 2, 3, 9, -1, -2]
70 | elements= [4, 5, 6, 7, -3]
71 | >>>
72 | >>> rdd2 = rdd.mapPartitions(min_max_count)
73 | >>> rdd2.collect()
74 | [(-1, 4, 5), (-3, 7, 5), (-2, 9, 6)]
75 |
76 | >>> final_answer = rdd2.reduce(lambda x, y: ( min(x[0], y[0]), max(x[1], y[1]), x[2]+y[2]) )
77 | >>> final_answer
78 | (-3, 9, 16)
79 | >>>
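
Note (added, not part of the original session): min_max_count() above assumes
every partition is non-empty; if the number of partitions exceeded the number
of elements, an empty partition would leave local_min/local_max unset. A
minimal guarded sketch:

def min_max_count_safe(partition):
    local_min = None
    local_max = None
    local_count = 0
    for n in partition:
        local_count += 1
        if local_min is None:
            local_min = n
            local_max = n
        else:
            local_min = min(n, local_min)
            local_max = max(n, local_max)
    if local_count == 0:
        # empty partition: contribute nothing to the final reduce
        return []
    return [(local_min, local_max, local_count)]

rdd.mapPartitions(min_max_count_safe) then behaves like the session above, and
also works when some partitions are empty.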
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-04-29-min-max-avg.txt:
--------------------------------------------------------------------------------
1 | Given billions of numbers, find (minimum, maximum, average)
2 | for all numbers.
3 |
4 | I provide two solutions: one using a tuple of 4: (minimum, maximum, sum, count),
5 | and another using a tuple of 3: (minimum, maximum, sum).
6 |
7 |
8 | $ ./bin/pyspark
9 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
10 | Welcome to
11 | ____ __
12 | / __/__ ___ _____/ /__
13 | _\ \/ _ \/ _ `/ __/ '_/
14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
15 | /_/
16 |
17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
18 | Spark context Web UI available at http://10.0.0.93:4040
19 | Spark context available as 'sc' (master = local[*], app id = local-1619727491830).
20 | SparkSession available as 'spark'.
21 | >>>
22 | >>>
23 | >>> nums = [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8]
24 | >>>
25 | >>># Let rdd denote billions of numbers
26 | >>> rdd = spark.sparkContext.parallelize(nums)
27 | >>> rdd.collect()
28 | [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8]
29 | >>>
30 |
31 | >>># Create tuple of 4 elements as: (minimum, maximum, sum, count)
32 | >>> tuple4 = rdd.map(lambda n: (n, n, n, 1))
33 | >>> tuple4.collect()
34 | [(1, 1, 1, 1), (2, 2, 2, 1), (3, 3, 3, 1), (-1, -1, -1, 1), (-2, -2, -2, 1), (-3, -3, -3, 1), (4, 4, 4, 1), (5, 5, 5, 1), (6, 6, 6, 1), (7, 7, 7, 1), (8, 8, 8, 1)]
35 |
36 | >>># Perform a reduction on tuple4
37 | >>> min_max_sum_count = tuple4.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2], x[3]+y[3]) )
38 | >>>
39 | >>># Now, min_max_sum_count represents (minimum, maximum, sum, count)
40 | >>> min_max_sum_count
41 | (-3, 8, 30, 11)
42 | >>> final = (min_max_sum_count[0], min_max_sum_count[1], min_max_sum_count[2] / min_max_sum_count[3])
43 | >>> final
44 | (-3, 8, 2.727272727272727)
45 | >>>
46 |
47 | >>># Solution using tuple of 3
48 | >>> tuple3 = rdd.map(lambda n: (n, n, n))
49 | >>> min_max_sum = tuple3.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2]) )
50 | >>> min_max_sum
51 | (-3, 8, 30)
52 | >>> N = rdd.count()
53 | >>> N
54 | 11
55 | >>> final = (min_max_sum[0], min_max_sum[1], min_max_sum[2] / N)
56 | >>> final
57 | (-3, 8, 2.727272727272727)
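
Note (added, not part of the original session): the same result can be computed
in a single pass with aggregate(), which takes a zero value, a per-partition
function (seqOp), and a cross-partition combiner (combOp). A minimal sketch
using the same rdd as above:

zero = (float('inf'), float('-inf'), 0, 0)   # (min, max, sum, count)
seq_op = lambda acc, n: (min(acc[0], n), max(acc[1], n), acc[2] + n, acc[3] + 1)
comb_op = lambda a, b: (min(a[0], b[0]), max(a[1], b[1]), a[2] + b[2], a[3] + b[3])
mn, mx, total, count = rdd.aggregate(zero, seq_op, comb_op)
# (mn, mx, total / count) evaluates to (-3, 8, 2.727272727272727)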
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-05-05-join.txt:
--------------------------------------------------------------------------------
1 | PySpark Documentation: Join function in PySpark:
2 | http://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.join.html
3 |
4 | $ ./bin/pyspark
5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
6 |
7 | Welcome to
8 | ____ __
9 | / __/__ ___ _____/ /__
10 | _\ \/ _ \/ _ `/ __/ '_/
11 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1
12 | /_/
13 |
14 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
15 | Spark context Web UI available at http://10.0.0.93:4040
16 | Spark context available as 'sc' (master = local[*], app id = local-1620269740798).
17 | SparkSession available as 'spark'.
18 | >>>
19 | >>> x = spark.sparkContext.parallelize([("spark", 1), ("hadoop", 4)])
20 | >>> x.collect()
21 | [
22 | ('spark', 1),
23 | ('hadoop', 4)
24 | ]
25 | >>>
26 | >>> y = spark.sparkContext.parallelize([("spark", 2), ("hadoop", 5)])
27 | >>> y.collect()
28 | [
29 | ('spark', 2),
30 | ('hadoop', 5)
31 | ]
32 | >>>
33 | >>> joined = x.join(y)
34 | >>> joined.collect()
35 | [
36 | ('spark', (1, 2)),
37 | ('hadoop', (4, 5))
38 | ]
39 | >>>
40 | >>>
41 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("c", 4)])
42 | >>> x.collect()
43 | [('a', 1), ('b', 4), ('c', 4)]
44 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("d", 8)])
45 | >>> y.collect()
46 | [('a', 2), ('a', 3), ('a', 7), ('d', 8)]
47 | >>>
48 | >>> joined = x.join(y)
49 | >>> joined.collect()
50 | [('a', (1, 2)), ('a', (1, 3)), ('a', (1, 7))]
51 | >>>
52 | >>>
53 | >>> joined.count()
54 | 3
55 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("b", 5), ("c", 4)]);
56 | >>> x.collect()
57 | [('a', 1), ('b', 4), ('b', 5), ('c', 4)]
58 | >>>
59 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("b", 61), ("b", 71), ("d", 8)])
60 | >>> y.collect()
61 | [('a', 2), ('a', 3), ('a', 7), ('b', 61), ('b', 71), ('d', 8)]
62 | >>> joined = x.join(y)
63 | >>> joined.collect()
64 | [
65 | ('b', (4, 61)),
66 | ('b', (4, 71)),
67 | ('b', (5, 61)),
68 | ('b', (5, 71)),
69 | ('a', (1, 2)),
70 | ('a', (1, 3)),
71 | ('a', (1, 7))
72 | ]
73 | >>>
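>>> # Note (added, not part of the original session): keys present in only one
>>> # RDD ('c' in x, 'd' in y) are dropped by the inner join(); leftOuterJoin()
>>> # would keep them with None for the missing side, e.g. x.leftOuterJoin(y)
>>> # includes ('c', (4, None)).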
74 | >>>#pyspark.RDD.cartesian
75 | >>>#RDD.cartesian(other)
76 | >>>#Return the Cartesian product of this RDD and another one,
77 | >>>#that is, the RDD of all pairs of elements (a, b) where a is
78 | >>>#in self and b is in other.
79 | >>># Examples
80 |
81 | >>>
82 | >>> rdd = spark.sparkContext.parallelize([1, 2])
83 | >>> sorted(rdd.cartesian(rdd).collect())
84 | [(1, 1), (1, 2), (2, 1), (2, 2)]
85 |
86 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-10-11-filter-map-flatMap.txt:
--------------------------------------------------------------------------------
1 | Understand filter(), map(), and flatMap()
2 |
3 | $ ./bin/pyspark
4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
6 | Welcome to
7 | ____ __
8 | / __/__ ___ _____/ /__
9 | _\ \/ _ \/ _ `/ __/ '_/
10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2
11 | /_/
12 |
13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
14 | Spark context Web UI available at http://10.0.0.94:4040
15 | Spark context available as 'sc' (master = local[*], app id = local-1634007457887).
16 | SparkSession available as 'spark'.
17 | >>>
18 | >>>
19 | >>>
20 | >>> records = ["this is fox", "fox", "is", "fox is red", "fox is gone"]
21 | >>> records
22 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone']
23 | >>>
24 | >>>
25 | >>> rdd = sc.parallelize(records)
26 | >>>
27 | >>> rdd
28 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274
29 | >>> rdd.count()
30 | 5
31 | >>> rdd.collect()
32 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone']
33 | >>>
34 | >>>
35 | >>> filtered = rdd.filter(lambda x: len(x) > 3)
36 | >>> filtered.collect()
37 | ['this is fox', 'fox is red', 'fox is gone']
38 | >>>
39 | >>>
40 | >>> def apply_filter(x):
41 | ... if len(x) > 3: return True
42 | ... return False
43 | ...
44 | >>>
45 | >>> b = apply_filter("this is a long one")
46 | >>> b
47 | True
48 | >>> c = apply_filter("one")
49 | >>> c
50 | False
51 | >>>
52 | >>> filtered_recs = rdd.filter(apply_filter)
53 | >>>
54 | >>> filtered_recs.collect()
55 | ['this is fox', 'fox is red', 'fox is gone']
56 | >>>
57 | >>>
58 | >>> rdd.collect()
59 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone']
60 | >>> flattened = rdd.flatMap(lambda x: x.split(" "))
61 | >>>
62 | >>> flattened.collect()
63 | ['this', 'is', 'fox', 'fox', 'is', 'fox', 'is', 'red', 'fox', 'is', 'gone']
64 | >>> flattened.count()
65 | 11
66 | >>> mapped = rdd.map(lambda x: x.split(" "))
67 | >>> mapped.collect()
68 | [['this', 'is', 'fox'], ['fox'], ['is'], ['fox', 'is', 'red'], ['fox', 'is', 'gone']]
69 | >>> mapped.count()
70 | 5
71 | >>>
72 | >>> a = [ ["this", "is"], [], [], ["fox", "is", "red", "jumped"] ]
73 | >>> a
74 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']]
75 | >>> rdd_list = sc.parallelize(a)
76 | >>> rdd_list.collect()
77 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']]
78 | >>> rdd_list.count()
79 | 4
80 | >>> flattened22 = rdd_list.flatMap(lambda L : L)
81 | >>> flattened22.collect()
82 | ['this', 'is', 'fox', 'is', 'red', 'jumped']
83 | >>>
84 | >>>
85 | >>> key_value_pairs = [("a", 10), ("a", 20), ("a", 30), ("a", 40), ("b", 300), ("b", 400)]
86 | >>> key_value_pairs
87 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)]
88 | >>> key_value_rdd = sc.parallelize(key_value_pairs)
89 | >>>
90 | >>> key_value_rdd.collect()
91 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)]
92 | >>>
93 | >>> def custom_func(x):
94 | ... k = x[0]
95 | ... v = x[1]
96 | ... if (v < 30): return []
97 | ... return [(k, v+1000), ("MYKEY", v+4000)]
98 | ...
99 | >>>
100 | >>> y = custom_func(("x", 25))
101 | >>> y
102 | []
103 | >>> y = custom_func(("x", 300))
104 | >>> y
105 | [('x', 1300), ('MYKEY', 4300)]
106 | >>> flattened = key_value_rdd.flatMap(custom_func)
107 | >>> flattened.collect()
108 | [('a', 1030), ('MYKEY', 4030), ('a', 1040), ('MYKEY', 4040), ('b', 1300), ('MYKEY', 4300), ('b', 1400), ('MYKEY', 4400)]
109 | >>> flattened.count()
110 | 8
111 | >>>
112 | >>> mapped = key_value_rdd.map(custom_func)
113 | >>> mapped.collect()
114 | [[], [], [('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]]
115 | >>> mapped.count()
116 | 6
117 | >>> filtered99 = mapped.filter(lambda x: len(x) > 0)
118 | >>> filtered99.collect()
119 | [[('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]]
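>>> # Note (added, not part of the original session): flattening filtered99 one
>>> # more level, e.g. filtered99.flatMap(lambda x: x), yields exactly the 8
>>> # (key, value) pairs shown by flattened.collect() above.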
120 | >>>
121 | >>>
122 | >>>
123 | >>> x = set()
124 | >>> x.add(1)
125 | >>> x
126 | {1}
127 | >>> x.add(1)
128 | >>> x
129 | {1}
130 | >>> x.add(3)
131 | >>> x.add(4)
132 | >>> x
133 | {1, 3, 4}
134 | >>> x.add(4)
135 | >>> x.add(4)
136 | >>> x.add(4)
137 | >>> x.add(4)
138 | >>> x
139 | {1, 3, 4}
140 | >>> x = []
141 | >>> x.append(1)
142 | >>> x
143 | [1]
144 | >>> x.append(1)
145 | >>> x
146 | [1, 1]
147 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-10-20-understanding-partitions.txt:
--------------------------------------------------------------------------------
1 | Understanding Partitions
2 |
3 | $ ./bin/pyspark
4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
6 | Welcome to
7 | ____ __
8 | / __/__ ___ _____/ /__
9 | _\ \/ _ \/ _ `/ __/ '_/
10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2
11 | /_/
12 |
13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
14 | Spark context Web UI available at http://10.0.0.94:4040
15 | Spark context available as 'sc' (master = local[*], app id = local-1634788905125).
16 | SparkSession available as 'spark'.
17 | >>>
18 | >>> nums = [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90]
19 | >>> nums
20 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90]
21 | >>> # rdd : RDD[Integer]
22 | >>> rdd = sc.parallelize(nums)
23 | >>> rdd.count()
24 | 26
25 | >>> rdd.collect()
26 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90]
27 | >>>
28 | >>> # get number of partitions (default, set by cluster manager)
29 | >>> rdd.getNumPartitions()
30 | 8
31 | >>> # set number of partitions explicitly to 4
32 | >>> rdd2 = sc.parallelize(nums, 4)
33 | >>> rdd2.getNumPartitions()
34 | 4
35 | >>> # define a debugger to output all elements of a partition
36 | >>> def debug_partition(partition):
37 | ... print("partition=", list(partition))
38 | ...
39 | >>> rdd.foreachPartition(debug_partition)
40 | partition= [1, 2, 3]
41 | partition= [33, 33, 22]
42 | partition= [22, 11, 123]
43 | partition= [44, 45, 67]
44 | partition= [77, 66, 99]
45 | partition= [44, 55, 99, 80, 90]
46 | partition= [89, 77, 66]
47 | partition= [4, 5, 77]
48 | >>>
49 | >>> rdd2.foreachPartition(debug_partition)
50 | partition= [89, 77, 66, 44, 55, 99, 80, 90]
51 | partition= [1, 2, 3, 4, 5, 77]
52 | partition= [22, 11, 123, 44, 45, 67]
53 | partition= [77, 66, 99, 33, 33, 22]
54 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2021-10-25-RDD-join.txt:
--------------------------------------------------------------------------------
1 | Inner Join Example
2 |
3 | $ pyspark
4 | Python 3.7.10 (default, Jun 3 2021, 00:02:01)
5 | Welcome to
6 | ____ __
7 | / __/__ ___ _____/ /__
8 | _\ \/ _ \/ _ `/ __/ '_/
9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.5-amzn-0
10 | /_/
11 |
12 | Using Python version 3.7.10 (default, Jun 3 2021 00:02:01)
13 | SparkContext available as 'sc'.
14 | SparkSession available as 'spark'.
15 | >>>
16 | >>>
17 | >>> x = sc.parallelize([("a", 1), ("a", 4), ("b", 7), ("b", 8), ("c", 89)])
18 | >>> y = sc.parallelize([("a", 100), ("a", 400), ("b", 700), ("b", 800), ("b", 900), ("d", 890)])
19 | >>> x.collect()
20 | [
21 | ('a', 1), ('a', 4),
22 | ('b', 7), ('b', 8),
23 | ('c', 89)
24 | ]
25 | >>> y.collect()
26 | [
27 | ('a', 100), ('a', 400),
28 | ('b', 700), ('b', 800), ('b', 900),
29 | ('d', 890)
30 | ]
31 |
32 | >>> joined = x.join(y)
33 | >>> joined.collect()
34 | [
35 | ('b', (7, 800)),
36 | ('b', (7, 900)),
37 | ('b', (7, 700)),
38 | ('b', (8, 800)),
39 | ('b', (8, 900)),
40 | ('b', (8, 700)),
41 | ('a', (1, 100)),
42 | ('a', (1, 400)),
43 | ('a', (4, 100)),
44 | ('a', (4, 400))
45 | ]
46 | >>> joined2 = y.join(x)
47 | >>> joined2.collect()
48 | [
49 | ('b', (700, 8)),
50 | ('b', (700, 7)),
51 | ('b', (800, 8)),
52 | ('b', (800, 7)),
53 | ('b', (900, 8)),
54 | ('b', (900, 7)),
55 | ('a', (100, 4)),
56 | ('a', (100, 1)),
57 | ('a', (400, 4)),
58 | ('a', (400, 1))
59 | ]
60 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2022-04-12.txt:
--------------------------------------------------------------------------------
1 | ~ % cd spark-3.2.0
2 | spark-3.2.0 % ls -l
3 | total 192
4 | -rwxrwxrwx@ 1 mparsian staff 22878 Oct 6 2021 LICENSE
5 | -rwxrwxrwx@ 1 mparsian staff 57677 Oct 6 2021 NOTICE
6 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 R
7 | -rwxrwxrwx@ 1 mparsian staff 4512 Oct 6 2021 README.md
8 | -rwxrwxrwx@ 1 mparsian staff 167 Oct 6 2021 RELEASE
9 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 bin
10 | drwxrwxrwx@ 9 mparsian staff 288 Nov 17 18:15 conf
11 | drwxrwxrwx@ 5 mparsian staff 160 Nov 17 18:15 data
12 | drwxrwxrwx@ 4 mparsian staff 128 Oct 6 2021 examples
13 | drwxrwxrwx@ 237 mparsian staff 7584 Nov 17 18:15 jars
14 | drwxrwxrwx@ 4 mparsian staff 128 Nov 17 18:15 kubernetes
15 | drwxrwxrwx@ 60 mparsian staff 1920 Nov 17 18:15 licenses
16 | drwxrwxrwx@ 20 mparsian staff 640 Nov 17 18:15 python
17 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 sbin
18 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 yarn
19 |
20 | spark-3.2.0 % ./bin/pyspark
21 | Python 3.8.9 (default, Mar 30 2022, 13:51:17)
22 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin
23 | Type "help", "copyright", "credits" or "license" for more information.
24 | Welcome to
25 | ____ __
26 | / __/__ ___ _____/ /__
27 | _\ \/ _ \/ _ `/ __/ '_/
28 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0
29 | /_/
30 |
31 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17)
32 | Spark context Web UI available at http://10.0.0.234:4040
33 | Spark context available as 'sc' (master = local[*], app id = local-1649822374103).
34 | SparkSession available as 'spark'.
35 | >>>
36 | >>>
37 | >>> spark.version
38 | '3.2.0'
39 | >>>
40 | >>>
41 | >>> numbers = [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
42 | >>> numbers
43 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
44 | >>> # rdd = Resilient Dist. Dataset
45 | >>> rdd = spark.sparkContext.parallelize(numbers)
46 | >>> rdd.collect()
47 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
48 | >>> # rdd is partitioned, read-only, operates in parallel
49 | >>> rdd.count()
50 | 11
51 | >>> total = rdd.reduce(lambda x, y: x+y)
52 | >>> total
53 | 168
54 | >>> rdd_greater_than_20 = rdd.filter(lambda x : x > 20)
55 | >>> rdd_greater_than_20.collect()
56 | [30, 40, 50]
57 | >>>
58 | >>> rdd_greater_than_20.count()
59 | 3
60 | >>> rdd.take(3)
61 | [1, 2, 5]
62 | >>>
63 | >>> ^D
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt:
--------------------------------------------------------------------------------
1 | # spark : SparkSession
2 |
3 | # create a Python collection
4 | numbers = [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30]
5 |
6 | # create an RDD[Integer] from a Python collection
7 | rdd = spark.sparkContext.parallelize(numbers)
8 |
9 | # get all elements (used for debugging -- do not use this for large RDDs)
10 | rdd.collect()
11 | [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30]
12 |
13 | # count the number of elements
14 | rdd.count()
15 | 11
16 |
17 | # apply a map() transformation to rdd and create a new RDD as rdd2
18 | rdd2 = rdd.map(lambda x : 3 *x)
19 | rdd2.collect()
20 | [3, 6, 9, 12, 15, -3, -6, -9, 30, 36, 90]
21 |
22 | # create a new RDD (as rdd3) from rdd2
23 | rdd3 = rdd2.map(lambda x: (x, 2*x))
24 | rdd3.collect()
25 | [
26 | (3, 6),
27 | (6, 12),
28 | (9, 18),
29 | (12, 24),
30 | (15, 30),
31 | (-3, -6),
32 | (-6, -12),
33 | (-9, -18),
34 | (30, 60),
35 | (36, 72),
36 | (90, 180)
37 | ]
38 |
39 | # find all positive numbers from a given RDD (as rdd)
40 | # filter() is a transformation
41 | positives = rdd.filter(lambda x : x > 0)
42 | positives.collect()
43 | [1, 2, 3, 4, 5, 10, 12, 30]
44 |
45 | # find all negative numbers from a given RDD (as rdd)
46 | # filter() is a transformation
47 | negatives = rdd.filter(lambda x : x < 0)
48 | negatives.collect()
49 | [-1, -2, -3]
50 |
51 | # find the sum of all numbers for a given RDD[Integer]
52 | # reduce() is an action: it returns a non-RDD value
53 | # reduce() is NOT a transformation: it does NOT create an RDD
54 | total = rdd.reduce(lambda x, y: x+y)
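# Note (added): reduce() returns a plain Python integer here, not an RDD:
# total = 1+2+3+4+5-1-2-3+10+12+30 = 61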
55 |
56 |
57 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt:
--------------------------------------------------------------------------------
1 | % cat /tmp/movies.txt
2 | user9,m1,5
3 | user8,m2,4
4 | user1,m1,2
5 | user1,9
6 | user1,m1,2
7 | user2,m2,3
8 | user2,m3,5
9 | user3,m3,4
10 | user6,m3,4
11 | user7,m3,3
12 | user3,king
13 | user4,m1,3
14 | user5,m2,5
15 | user6,m4,5
16 | user7,m5,5
17 | user1
18 | user3,m3,5
19 | user4,m4,1
20 |
21 | % ./bin/pyspark
22 | Python 3.8.9 (default, Mar 30 2022, 13:51:17)
23 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin
24 | Type "help", "copyright", "credits" or "license" for more information.
25 | Welcome to
26 | ____ __
27 | / __/__ ___ _____/ /__
28 | _\ \/ _ \/ _ `/ __/ '_/
29 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0
30 | /_/
31 |
32 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17)
33 | Spark context Web UI available at http://10.0.0.234:4041
34 | Spark context available as 'sc' (master = local[*], app id = local-1650425312842).
35 | SparkSession available as 'spark'.
36 | >>>
37 | >>>
38 | >>>
39 | >>> input_path = "/tmp/movies.txt"
40 | >>> input_path
41 | '/tmp/movies.txt'
42 | >>> # read input and create RDD[String]
43 | >>> records = spark.sparkContext.textFile(input_path)
44 | >>> records.collect()
45 | [
46 | 'user9,m1,5',
47 | 'user8,m2,4',
48 | 'user1,m1,2',
49 | 'user1,9',
50 | 'user1,m1,2',
51 | 'user2,m2,3',
52 | 'user2,m3,5',
53 | 'user3,m3,4',
54 | 'user6,m3,4',
55 | 'user7,m3,3',
56 | 'user3,king',
57 | 'user4,m1,3',
58 | 'user5,m2,5',
59 | 'user6,m4,5',
60 | 'user7,m5,5',
61 | 'user1',
62 | 'user3,m3,5',
63 | 'user4,m4,1'
64 | ]
65 | >>> records.count()
66 | 18
67 | >>>
68 | >>>
69 | >>> records.getNumPartitions()
70 | 2
71 | >>>
72 | >>>
73 | >>>
74 | >>> pairs = [("A", 3), ("A", 4), ("A", 5), ("B", 30), ("B", 40), ("B", 50), ("B", 60), ("C", 100)]
75 | >>> pairs
76 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)]
77 | >>> rdd = spark.sparkContext.parallelize(pairs)
78 | >>> rdd.collect()
79 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)]
80 | >>> rdd.count()
81 | 8
82 | >>> rdd.getNumPartitions()
83 | 16
84 | >>> # NOTE: since the number of partitions is more than
85 | >>> # the number of elements: this implies that
86 | >>> # some of the partitions can be empty (partition
87 | >>> # is created, but has no elements at all).
88 | >>>
89 | >>>
90 | >>> # find average of values per key: A, B, C
91 | >>> # use groupByKey() transformation
92 | >>> grouped = rdd.groupByKey()
93 | >>> grouped.collect()
94 | [
95 | ('B', <pyspark.resultiterable.ResultIterable object at 0x...>),
96 | ('C', <pyspark.resultiterable.ResultIterable object at 0x...>),
97 | ('A', <pyspark.resultiterable.ResultIterable object at 0x...>)
98 | ]
99 |
100 | >>> grouped.mapValues(lambda values: list(values)).collect()
101 | [
102 | ('B', [30, 40, 50, 60]),
103 | ('C', [100]),
104 | ('A', [3, 4, 5])
105 | ]
106 | >>> # similar to SQL's GROUP BY
107 | >>> # values : ResultIterable
108 | >>> avg_by_key = grouped.mapValues(lambda values: sum(values) / len(values))
109 | >>> avg_by_key.collect()
110 | [('B', 45.0), ('C', 100.0), ('A', 4.0)]
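>>> # Note (added, not part of the original session): a common alternative that
>>> # avoids groupByKey() is to carry (sum, count) pairs and combine them with
>>> # reduceByKey():
>>> sum_count = rdd.mapValues(lambda v: (v, 1)).reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1]))
>>> # sum_count.mapValues(lambda t: t[0] / t[1]).collect() gives the same
>>> # averages as above: B -> 45.0, C -> 100.0, A -> 4.0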
111 | >>>
112 | >>>
113 | >>> rdd.collect()
114 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)]
115 | >>> rdd_44 = rdd.mapValues(lambda v : v * 10)
116 | >>> rdd_44.collect()
117 | [('A', 30), ('A', 40), ('A', 50), ('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)]
118 | >>> # v : denotes the value component of (key, value) pair.
119 | >>>
120 | >>>
121 | >>> # apply a filter and keep (key, value) pairs
122 | >>> # if and only if value is greater than 100
123 | >>>
124 | >>> # understand tuple of 2 elements as (key, value) pair:
125 | >>> x = ("K", 2345)
126 | >>> x[0]
127 | 'K'
128 | >>> x[1]
129 | 2345
130 | >>>
131 | >>>
132 | >>> # apply a filter to rdd_44 and keep (key, value)
133 | >>> # pairs if and only if value is greater than 100
134 | >>> # x denotes a single element of source RDD (rdd_44)
135 | >>> rdd5 = rdd_44.filter(lambda x: x[1] > 100)
136 | >>> rdd5.collect()
137 | [('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)]
138 | >>>
139 | >>>
140 | >>> some_lists = [ [1, 2, 3], [7, 8, 9, 10], [], [] ]
141 | >>> len(some_lists)
142 | 4
143 | >>> some_lists[0]
144 | [1, 2, 3]
145 | >>> some_lists[1]
146 | [7, 8, 9, 10]
147 | >>> some_lists[2]
148 | []
149 | >>> some_lists[3]
150 | []
151 | >>> rdd = spark.sparkContext.parallelize(some_lists)
152 | >>> rdd.collect()
153 | [[1, 2, 3], [7, 8, 9, 10], [], []]
154 | >>> rdd.count()
155 | 4
156 | >>> # each rdd element is a list denoted by [...]
157 | >>>
158 | >>> rdd2 = rdd.flatMap(lambda x: x)
159 | >>> rdd2.collect()
160 | [1, 2, 3, 7, 8, 9, 10]
161 | >>> rdd2.count()
162 | 7
163 | >>> rdd3 = rdd.map(lambda x: x)
164 | >>> rdd3.collect()
165 | [[1, 2, 3], [7, 8, 9, 10], [], []]
166 | >>> rdd3.collect()
167 | [[1, 2, 3], [7, 8, 9, 10], [], []]
168 | >>> rdd3.count()
169 | 4
170 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/pyspark-session_2020-07-01.txt:
--------------------------------------------------------------------------------
1 | cat /Users/mparsian/spark-3.0.0/zbin/foxdata.txt
2 | red fox jumped high
3 | fox jumped over high fence
4 | red fox jumped
5 |
6 | mparsian@Mahmouds-MacBook ~/spark-3.0.0 $ ./bin/pyspark
7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
8 | [Clang 6.0 (clang-600.0.57)] on darwin
9 | Type "help", "copyright", "credits" or "license" for more information.
10 | 20/07/01 17:51:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
11 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
12 | Setting default log level to "WARN".
13 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
14 | Welcome to
15 | ____ __
16 | / __/__ ___ _____/ /__
17 | _\ \/ _ \/ _ `/ __/ '_/
18 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0
19 | /_/
20 |
21 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
22 | SparkSession available as 'spark'.
23 | >>>
24 | >>>
25 | >>>
26 | >>>
27 | >>> input_path = '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt'
28 | >>> input_path
29 | '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt'
30 | >>> recs = spark.sparkContext.textFile(input_path)
31 | >>>
32 | >>>
33 | >>>
34 | >>> recs
35 | /Users/mparsian/spark-3.0.0/zbin/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0
36 | >>>
37 | >>>
38 | >>> recs.collect()
39 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped']
40 | >>> recs.count()
41 | 3
42 | >>> rdd_with_len = recs.map(lambda x: (x, len(x)))
43 | >>> rdd_with_len.collect()
44 | [('red fox jumped high', 19), ('fox jumped over high fence', 26), ('red fox jumped', 14)]
45 | >>>
46 | >>>
47 | >>>
48 | >>> upper = recs.map(lambda x: x.upper())
49 | >>> upper.collect()
50 | ['RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED']
51 | >>> spark
52 | <pyspark.sql.session.SparkSession object at 0x...>
53 | >>> lower = recs.map(lambda x: x.lower())
54 | >>> lower.collect()
55 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped']
56 | >>>
57 | >>>
58 | >>>
59 | >>> lower_and_upper = lower.union(upper)
60 | >>> lower_and_upper.collect()
61 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped', 'RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED']
62 | >>> lower_and_upper.count()
63 | 6
64 | >>>
65 | >>>
66 | >>>
67 | >>> counts = recs.map(lambda x : (len(x), 3*len(x)))
68 | >>> counts.collect()
69 | [(19, 57), (26, 78), (14, 42)]
70 | >>>
71 | >>>
72 | >>>
73 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
74 | >>> numbers
75 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
76 |
77 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
78 | >>>
79 | >>>
80 | >>>
81 | >>> numbers
82 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
83 | >>> rdd = spark.sparkContext.parallelize(numbers)
84 | >>> rdd.collect()
85 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
86 | >>> rdd.count()
87 | 14
88 | >>> pos = rdd.filter(lambda x: x > 0)
89 | >>> pos.collect()
90 | [1, 2, 3, 5, 6, 7, 8, 77, 99, 100]
91 |
92 | >>>
93 | >>> squared = rdd.map(lambda x : x*x)
94 | >>> squared.collect()
95 | [1, 4, 9, 25, 36, 49, 64, 1, 16, 5929, 9801, 7569, 10000, 10000]
96 | >>> tuples3 = rdd.map(lambda x : (x, x*x, x*100))
97 | >>> tuples3.collect()
98 | [(1, 1, 100), (2, 4, 200), (3, 9, 300), (5, 25, 500), (6, 36, 600), (7, 49, 700), (8, 64, 800), (-1, 1, -100), (-4, 16, -400), (77, 5929, 7700), (99, 9801, 9900), (-87, 7569, -8700), (-100, 10000, -10000), (100, 10000, 10000)]
99 | >>>
100 | >>>
101 | >>>
102 | >>> rdd.collect()
103 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
104 | >>> gt4 = rdd.filter(lambda x: x > 4)
105 | >>> gt4.collect()
106 | [5, 6, 7, 8, 77, 99, 100]
107 | >>>
108 | >>>
109 | >>>
110 | >>> rdd.collect()
111 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100]
112 | >>> total = rdd.reduce(lambda x, y: x+y)
113 | >>> total
114 | 116
115 |
116 | Assume that rdd has 3 partitions: partition-1, partition-2, partition-3
117 |
118 | partition-1: 1, 2, 3, 5, 6, 7, 8
119 | partition-1 will sum up to: 32
120 |
121 | partition-2: -1, -4, 77, 99
122 | partition-2 will sum up to: 171
123 |
124 | partition-3: -87, -100, 100
125 | partition-3 will sum up to: -87
126 |
127 | ===============
128 | combining partition-1 & partition-2 results in: 32 + 171 = 203
129 | combining 203 with partition-3 results in: 203 + (-87) = 116 (final result)
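
Note (added, not part of the original session): the actual partition layout can
be inspected with glom(), which turns each partition into a single list. A
minimal sketch using the same numbers as above:

rdd3 = spark.sparkContext.parallelize(numbers, 3)
rdd3.glom().collect()               # one list per partition
rdd3.glom().map(sum).collect()      # per-partition sums
rdd3.reduce(lambda x, y: x + y)     # 116, independent of how data is partitioned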
130 |
131 |
--------------------------------------------------------------------------------
/tutorial/pyspark-examples/rdds/understanding_partitions.txt:
--------------------------------------------------------------------------------
1 | understanding_partitions.txt
2 |
3 | $ ./bin/pyspark
4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
5 | Type "help", "copyright", "credits" or "license" for more information.
6 | Welcome to
7 | ____ __
8 | / __/__ ___ _____/ /__
9 | _\ \/ _ \/ _ `/ __/ '_/
10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0
11 | /_/
12 |
13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
14 | SparkSession available as 'spark'.
15 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
16 | >>> rdd = sc.parallelize(numbers, 3)
17 | >>> rdd.collect()
18 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
19 | >>> rdd.count()
20 | 10
21 | >>> rdd.getNumPartitions()
22 | 3
23 | >>> def f(iterator):
24 | ... for x in iterator:
25 | ... print(x)
26 | ... print("=====")
27 | ...
28 | >>>
29 | >>> rdd.foreachPartition(f)
30 | 4
31 | 5
32 | 6
33 | =====
34 | 7
35 | 8
36 | 9
37 | 10
38 | =====
39 | 1
40 | 2
41 | 3
42 | =====
43 | >>> rdd_default = sc.parallelize(numbers)
44 | >>> rdd_default.collect()
45 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
46 | >>> rdd.getNumPartitions()
47 | 3
48 | >>> rdd_default.getNumPartitions()
49 | 8
50 | >>> rdd.foreachPartition(f)
51 | 4
52 | 5
53 | 6
54 | =====
55 | 1
56 | 2
57 | 3
58 | =====
59 | 7
60 | 8
61 | 9
62 | 10
63 | =====
64 | >>> rdd_default.foreachPartition(f)
65 | 6
66 | =====
67 | 7
68 | =====
69 | 3
70 | =====
71 | 2
72 | =====
73 | 8
74 | =====
75 | 4
76 | 5
77 | =====
78 | 9
79 | 10
80 | =====
81 | 1
82 | =====
83 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
84 | >>> rdd_by_4 = sc.parallelize(numbers, 4)
85 | >>> rdd_by_4.collect()
86 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
87 | >>> rdd_by_4.foreachPartition(f)
88 | 1
89 | 2
90 | 3
91 | =====
92 | 10
93 | 11
94 | 12
95 | =====
96 | 4
97 | 5
98 | 6
99 | =====
100 | 7
101 | 8
102 | 9
103 | =====
104 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15]
105 | >>> rdd_by_6 = sc.parallelize(numbers, 6)
106 | >>> rdd_by_6.foreachPartition(f)
107 | 7
108 | 8
109 | =====
110 | 1
111 | 2
112 | =====
113 | 11
114 | 12
115 | 13
116 | 15
117 | =====
118 | 3
119 | 4
120 | =====
121 | 9
122 | 10
123 | =====
124 | 5
125 | 6
126 | =====
127 | >>> numbers = [1, 2, 3, 4, 5, 6]
128 | >>> rdd_empty = sc.parallelize(numbers, 10)
129 | >>> rdd_empty.foreachPartition(f)
130 | 2
131 | =====
132 | 3
133 | =====
134 | =====
135 | =====
136 | 4
137 | =====
138 | =====
139 | 6
140 | =====
141 | 1
142 | =====
143 | 5
144 | =====
145 | =====
146 | >>>
--------------------------------------------------------------------------------
/tutorial/pyspark-udf/pyspark_udf_maptype.txt:
--------------------------------------------------------------------------------
1 | $SPARK_HOME/bin/pyspark
2 | Python 3.8.9 (default, Nov 9 2021, 04:26:29)
3 | Welcome to
4 | ____ __
5 | / __/__ ___ _____/ /__
6 | _\ \/ _ \/ _ `/ __/ '_/
7 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0
8 | /_/
9 |
10 | Using Python version 3.8.9 (default, Nov 9 2021 04:26:29)
11 | Spark context Web UI available at http://10.0.0.232:4040
12 | Spark context available as 'sc' (master = local[*], app id = local-1641011178190).
13 | SparkSession available as 'spark'.
14 |
15 | >>> from pyspark.sql import Row
16 |
17 | >>> data = spark.createDataFrame(
18 | ... [Row(zip_code='94087', city='Sunnyvale'),
19 | ... Row(zip_code='94088', city='Cupertino'),
20 | ... Row(zip_code='95055', city='Santa Clara'),
21 | ... Row(zip_code='95054', city='Palo Alto')])
22 |
23 | >>>
24 | >>> data.show()
25 | +--------+-----------+
26 | |zip_code| city|
27 | +--------+-----------+
28 | | 94087| Sunnyvale|
29 | | 94088| Cupertino|
30 | | 95055|Santa Clara|
31 | | 95054| Palo Alto|
32 | +--------+-----------+
33 |
34 | >>> from pyspark.sql.functions import udf
35 | >>> from pyspark.sql import types as T
36 | >>>
37 | >>> @udf(T.MapType(T.StringType(), T.StringType()))
38 | ... def create_structure(zip_code, city):
39 | ... return {zip_code: city}
40 | ...
41 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).toJSON().collect()
42 | [
43 | '{"zip_code":"94087","city":"Sunnyvale","structure":{"94087":"Sunnyvale"}}',
44 | '{"zip_code":"94088","city":"Cupertino","structure":{"94088":"Cupertino"}}',
45 | '{"zip_code":"95055","city":"Santa Clara","structure":{"95055":"Santa Clara"}}',
46 | '{"zip_code":"95054","city":"Palo Alto","structure":{"95054":"Palo Alto"}}'
47 | ]
48 |
49 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).show(truncate=False)
50 | +--------+-----------+----------------------+
51 | |zip_code|city |structure |
52 | +--------+-----------+----------------------+
53 | |94087 |Sunnyvale |{94087 -> Sunnyvale} |
54 | |94088 |Cupertino |{94088 -> Cupertino} |
55 | |95055 |Santa Clara|{95055 -> Santa Clara}|
56 | |95054 |Palo Alto |{95054 -> Palo Alto} |
57 | +--------+-----------+----------------------+
58 |
--------------------------------------------------------------------------------
/tutorial/split-function/README.md:
--------------------------------------------------------------------------------
1 | How To Use Split Function
2 | =========================
3 |
4 | * Example-1: Split ````RDD```` into Tokens
5 |
6 | ````
7 | # ./bin/pyspark
8 | Python 2.7.10 (default, Oct 23 2015, 19:19:21)
9 |
10 | Welcome to
11 | ____ __
12 | / __/__ ___ _____/ /__
13 | _\ \/ _ \/ _ `/ __/ '_/
14 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1
15 | /_/
16 |
17 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21)
18 | SparkContext available as sc, HiveContext available as sqlContext.
19 |
20 | >>> data = ["abc,de", "abc,de,ze", "abc,de,ze,pe"]
21 | >>> data
22 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe']
23 |
24 | >>> rdd = sc.parallelize(data)
25 | >>> rdd.collect()
26 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe']
27 | >>> rdd.count()
28 | 3
29 |
30 | >>> rdd2 = rdd.flatMap(lambda x : x.split(","))
31 | >>> rdd2.collect()
32 | ['abc', 'de', 'abc', 'de', 'ze', 'abc', 'de', 'ze', 'pe']
33 | >>> rdd2.count()
34 | 9
35 | ````
36 |
37 | * Example-2: Create Key-Value Pairs
38 |
39 | ````
40 | >>> data2 = ["abc,de", "xyz,deeee,ze", "abc,de,ze,pe", "xyz,bababa"]
41 | >>> data2
42 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa']
43 |
44 | >>> rdd4 = sc.parallelize(data2)
45 | >>> rdd4.collect()
46 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa']
47 |
48 | >>> rdd5 = rdd4.map(lambda x : (x.split(",")[0], x.split(",")[1]))
49 | >>> rdd5.collect()
50 | [('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')]
51 | ````
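
* Note (added): ````x.split(",")```` above is evaluated twice per record; the split can also be done once and its tokens reused:

````
>>> rdd6 = rdd4.map(lambda x: x.split(",")).map(lambda tokens: (tokens[0], tokens[1]))
>>> rdd6.collect()
[('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')]
````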
52 |
--------------------------------------------------------------------------------
/tutorial/top-N/top-N.txt:
--------------------------------------------------------------------------------
1 | # ./pyspark
2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
3 | Welcome to
4 | ____ __
5 | / __/__ ___ _____/ /__
6 | _\ \/ _ \/ _ `/ __/ '_/
7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
8 | /_/
9 |
10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
11 | SparkContext available as sc.
12 | >>>
13 | >>> nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
14 | >>> sc.parallelize(nums).takeOrdered(3)
15 | [1, 2, 3]
16 | >>> sc.parallelize(nums).takeOrdered(3, key=lambda x: -x)
17 | [10, 9, 7]
18 | >>>
19 | >>> kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
20 | >>> sc.parallelize(kv).takeOrdered(3)
21 | [(1, 'z2'), (2, 'z3'), (3, 'z5')]
22 | >>>
23 | >>> sc.parallelize(kv).takeOrdered(3, key=lambda x: -x[0])
24 | [(10, 'z1'), (9, 'z4'), (7, 'z9')]
25 |
--------------------------------------------------------------------------------
/tutorial/wordcount/README.md:
--------------------------------------------------------------------------------
1 | * word_count.py
2 |
3 | Word Count solution in PySpark. Note that the input file is
4 | hard-coded, which is not a good practice; the purpose here is
5 | simply to show how to read files in Spark.
6 |
7 | * word_count_ver2.py
8 |
9 | The input file is passed as a parameter (see run_word_count_ver2.sh).
10 |
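For example (the input path below is just a placeholder; see
````run_word_count_ver2.sh````):

````
$SPARK_HOME/bin/spark-submit word_count_ver2.py file:///tmp/sample.txt
````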
11 |
12 | ````
13 | best regards,
14 | Mahmoud Parsian
15 | ````
16 |
--------------------------------------------------------------------------------
/tutorial/wordcount/run_word_count.sh:
--------------------------------------------------------------------------------
1 | # define Spark's installed directory
2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1"
3 | #
4 | # define your input path
5 | #INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt"
6 | #
7 | # define your PySpark program
8 | PROG="/Users/mparsian/zmp/pyspark_book_project/programs/word_count.py"
9 | #
10 | # submit your spark application
11 | $SPARK_HOME/bin/spark-submit $PROG
12 |
--------------------------------------------------------------------------------
/tutorial/wordcount/run_word_count_ver2.sh:
--------------------------------------------------------------------------------
1 | # define Spark's installed directory
2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1"
3 | #
4 | # define your input path
5 | INPUT_PATH="file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
6 | #
7 | # define your PySpark program
8 | PROG="/Users/mparsian/zmp/github/pyspark-tutorial/tutorial/wordcount/word_count_ver2.py"
9 | #
10 | # submit your spark application
11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH
12 |
--------------------------------------------------------------------------------
/tutorial/wordcount/word_count.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import sys
4 |
5 | from pyspark.sql import SparkSession
6 | #-----------------------------------
7 |
8 |
9 | if __name__ == "__main__":
10 |
11 | # create an instance of a SparkSession as spark
12 | spark = SparkSession\
13 | .builder\
14 | .appName("wordcount")\
15 | .getOrCreate()
16 |
17 | inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
18 |
19 | # create SparkContext as sc
20 | sc = spark.sparkContext
21 |
22 | # create RDD from a text file
23 | textfileRDD = sc.textFile(inputPath)
24 | print(textfileRDD.collect())
25 |
26 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
27 | print(wordsRDD.collect())
28 |
29 | pairsRDD = wordsRDD.map(lambda word: (word, 1))
30 | print(pairsRDD.collect())
31 |
32 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
33 | print(frequenciesRDD.collect())
34 |
35 | # done!
36 | spark.stop()
37 |
--------------------------------------------------------------------------------
/tutorial/wordcount/word_count_ver2.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import sys
4 |
5 | from pyspark.sql import SparkSession
6 | #-----------------------------------
7 |
8 |
9 | if __name__ == "__main__":
10 |
11 | # create an instance of a SparkSession as spark
12 | spark = SparkSession\
13 | .builder\
14 | .appName("wordcount")\
15 | .getOrCreate()
16 |
17 | # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
18 | #
19 | # sys.argv[0] is the name of the script.
20 | # sys.argv[1] is the first parameter
21 | inputPath = sys.argv[1] # input file
22 | print("inputPath: {}".format(inputPath))
23 |
24 |
25 | # create SparkContext as sc
26 | sc = spark.sparkContext
27 |
28 | # create RDD from a text file
29 | textfileRDD = sc.textFile(inputPath)
30 | print(textfileRDD.collect())
31 |
32 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
33 | print(wordsRDD.collect())
34 |
35 | pairsRDD = wordsRDD.map(lambda word: (word, 1))
36 | print(pairsRDD.collect())
37 |
38 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
39 | print(frequenciesRDD.collect())
40 |
41 | # done!
42 | spark.stop()
43 |
--------------------------------------------------------------------------------
/tutorial/wordcount/wordcount-shorthand.txt:
--------------------------------------------------------------------------------
1 | # cat data.txt
2 | crazy crazy fox jumped
3 | crazy fox jumped
4 | fox is fast
5 | fox is smart
6 | dog is smart
7 |
8 | # ./bin/pyspark
9 | Welcome to
10 | ____ __
11 | / __/__ ___ _____/ /__
12 | _\ \/ _ \/ _ `/ __/ '_/
13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0
14 | /_/
15 |
16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
17 | SparkContext available as sc, SQLContext available as sqlContext.
18 | >>>
19 | >>> lines = sc.textFile('data.txt', 1);
20 | >>> lines.collect()
21 | [
22 | u'crazy crazy fox jumped',
23 | u'crazy fox jumped',
24 | u'fox is fast',
25 | u'fox is smart',
26 | u'dog is smart'
27 | ]
28 |
29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
30 | >>> frequencies.collect()
31 | [
32 | (u'crazy', 3),
33 | (u'jumped', 2),
34 | (u'is', 3),
35 | (u'fox', 4),
36 | (u'dog', 1),
37 | (u'fast', 1),
38 | (u'smart', 2)
39 | ]
40 |
41 | >>> frequencies.count()
42 | 7
--------------------------------------------------------------------------------
/tutorial/wordcount/wordcount.txt:
--------------------------------------------------------------------------------
1 | 1. Prepare Input
2 |
3 | # cat data.txt
4 | crazy crazy fox jumped
5 | crazy fox jumped
6 | fox is fast
7 | fox is smart
8 | dog is smart
9 |
10 | 2. Invoke pyspark
11 |
12 | # export SPARK_HOME=...
13 | # $SPARK_HOME/bin/pyspark
14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12)
15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin
16 | Type "help", "copyright", "credits" or "license" for more information.
17 |
18 | Welcome to
19 | ____ __
20 | / __/__ ___ _____/ /__
21 | _\ \/ _ \/ _ `/ __/ '_/
22 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0
23 | /_/
24 |
25 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12)
26 | SparkContext available as sc.
27 | >>> sc
28 | <pyspark.context.SparkContext object at 0x...>
29 | >>> lines = sc.textFile("data.txt", 1)
30 | >>> debuglines = lines.collect();
31 | >>> debuglines
32 | [u'crazy crazy fox jumped',
33 | u'crazy fox jumped',
34 | u'fox is fast',
35 | u'fox is smart',
36 | u'dog is smart'
37 | ]
38 | >>> words = lines.flatMap(lambda x: x.split(' '))
39 | >>> debugwords = words.collect();
40 | >>> debugwords
41 | [
42 | u'crazy',
43 | u'crazy',
44 | u'fox',
45 | u'jumped',
46 | u'crazy',
47 | u'fox',
48 | u'jumped',
49 | u'fox',
50 | u'is',
51 | u'fast',
52 | u'fox',
53 | u'is',
54 | u'smart',
55 | u'dog',
56 | u'is',
57 | u'smart'
58 | ]
59 | >>> ones = words.map(lambda x: (x, 1))
60 | >>> debugones = ones.collect()
61 | >>> debugones
62 | [
63 | (u'crazy', 1),
64 | (u'crazy', 1),
65 | (u'fox', 1),
66 | (u'jumped', 1),
67 | (u'crazy', 1),
68 | (u'fox', 1),
69 | (u'jumped', 1),
70 | (u'fox', 1),
71 | (u'is', 1),
72 | (u'fast', 1),
73 | (u'fox', 1),
74 | (u'is', 1),
75 | (u'smart', 1),
76 | (u'dog', 1),
77 | (u'is', 1),
78 | (u'smart', 1)
79 | ]
80 | >>> counts = ones.reduceByKey(lambda x, y: x + y)
81 | >>> debugcounts = counts.collect()
82 | >>> debugcounts
83 | [
84 | (u'crazy', 3),
85 | (u'jumped', 2),
86 | (u'is', 3),
87 | (u'fox', 4),
88 | (u'dog', 1),
89 | (u'fast', 1),
90 | (u'smart', 2)
91 | ]
92 | >>>
93 | >>> counts.saveAsTextFile("output")
94 |
95 | 3. Examine Output
96 |
97 | # cat output/part*
98 | (u'crazy', 3)
99 | (u'jumped', 2)
100 | (u'is', 3)
101 | (u'fox', 4)
102 | (u'dog', 1)
103 | (u'fast', 1)
104 | (u'smart', 2)
105 |
--------------------------------------------------------------------------------