├── LICENSE.md ├── README.md ├── data └── foxdata.txt ├── howto ├── README.md ├── download_install_run_spark.md └── minimize_verbosity.md ├── images ├── Data-Algorithms-with-Spark_mech2.pdf ├── Data-Algorithms-with-Spark_mech2.png ├── Data_Algorithms_with_Spark_COVER_9781492082385.png ├── data_algorithms_image.jpg ├── data_algorithms_with_spark.jpg └── pyspark_algorithms2.jpg └── tutorial ├── .DS_Store ├── add-indices └── add-indices.txt ├── basic-average └── basic-average.txt ├── basic-filter └── basic-filter.txt ├── basic-join └── basicjoin.txt ├── basic-map └── basic-map.txt ├── basic-multiply └── basic-multiply.txt ├── basic-sort └── sort-by-key.txt ├── basic-sum └── basic-sum.txt ├── basic-union └── basic-union.txt ├── bigrams └── bigrams.txt ├── cartesian └── cartesian.txt ├── combine-by-key ├── README.md ├── combine-by-key.txt ├── distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf ├── spark-combineByKey.md ├── spark-combineByKey.txt └── standard_deviation_by_combineByKey.md ├── dna-basecount ├── README.md ├── basemapper.py ├── dna-basecount.md ├── dna-basecount2.md ├── dna-basecount3.md └── dna_seq.txt ├── map-partitions └── README.md ├── pyspark-examples ├── dataframes │ ├── VIDEO-DataFrames.txt │ ├── dataframe-examples.md │ ├── dataframe-session-2018-04-26.txt │ ├── dataframe-session-2018-05-15.txt │ ├── dataframe-session-2018-10-30.txt │ ├── dataframe-session-2019-02-14.txt │ ├── dataframe-session-2020-11-04.txt │ ├── dataframe-session-2021-05-12-intro.txt │ ├── dataframe-session-2022-05-12.txt │ └── dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt └── rdds │ ├── combineByKey_example.py │ ├── count_min_max.py │ ├── groupbykey_and_reducebykey_example.ipynb │ ├── pyspark-session-2015-02-23.txt │ ├── pyspark-session-2015-03-13.txt │ ├── pyspark-session-2015-04-10.txt │ ├── pyspark-session-2018-01-18.txt │ ├── pyspark-session-2018-04-12.txt │ ├── pyspark-session-2018-10-02.txt │ ├── pyspark-session-2018-10-09.txt │ ├── pyspark-session-2019-01-22.txt │ ├── pyspark-session-2019-01-30.txt │ ├── pyspark-session-2019-04-16.txt │ ├── pyspark-session-2019-04-18.txt │ ├── pyspark-session-2019-04-26.txt │ ├── pyspark-session-2019-05-09.txt │ ├── pyspark-session-2019-10-09.txt │ ├── pyspark-session-2019-10-16.txt │ ├── pyspark-session-2020-01-22.txt │ ├── pyspark-session-2020-01-24.txt │ ├── pyspark-session-2020-02-03.txt │ ├── pyspark-session-2020-04-16.txt │ ├── pyspark-session-2020-04-23.txt │ ├── pyspark-session-2020-07-06-word-count.txt │ ├── pyspark-session-2020-10-05.txt │ ├── pyspark-session-2020-10-07.txt │ ├── pyspark-session-2020-10-12.txt │ ├── pyspark-session-2020-10-15.txt │ ├── pyspark-session-2020-10-19.txt │ ├── pyspark-session-2021-01-19.txt │ ├── pyspark-session-2021-01-21.ipynb │ ├── pyspark-session-2021-01-26.txt │ ├── pyspark-session-2021-04-12.txt │ ├── pyspark-session-2021-04-14.txt │ ├── pyspark-session-2021-04-19.txt │ ├── pyspark-session-2021-04-21-mapPartitions.txt │ ├── pyspark-session-2021-04-29-min-max-avg.txt │ ├── pyspark-session-2021-05-05-join.txt │ ├── pyspark-session-2021-10-06.txt │ ├── pyspark-session-2021-10-11-filter-map-flatMap.txt │ ├── pyspark-session-2021-10-20-understanding-partitions.txt │ ├── pyspark-session-2021-10-25-RDD-join.txt │ ├── pyspark-session-2022-04-12.txt │ ├── pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt │ ├── pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt │ ├── pyspark-session_2019-10-07.txt │ ├── pyspark-session_2020-07-01.txt │ └── understanding_partitions.txt ├── 
pyspark-udf └── pyspark_udf_maptype.txt ├── ranking ├── README.md └── ranking_functions_in_pyspark.md ├── split-function └── README.md ├── top-N └── top-N.txt └── wordcount ├── README.md ├── run_word_count.sh ├── run_word_count_ver2.sh ├── word_count.py ├── word_count_ver2.py ├── wordcount-shorthand.txt └── wordcount.txt /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2019] [Mahmoud Parsian] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial 2 | 3 | * PySpark is the Python API for Spark. 4 | 5 | * The purpose of this PySpark tutorial is to present 6 | basic distributed algorithms using PySpark. 7 | 8 | * PySpark supports two types of Data Abstractions: 9 | * RDDs 10 | * DataFrames 11 | 12 | * **PySpark Interactive Mode**: provides an interactive shell 13 | (`$SPARK_HOME/bin/pyspark`) for basic testing 14 | and debugging; it is not intended for 15 | production environments. 16 | 17 | * **PySpark Batch Mode**: you may use the `$SPARK_HOME/bin/spark-submit` 18 | command for running PySpark programs (suitable for both 19 | testing and production environments). 20 | 21 | ------ 22 | 23 | # [Glossary: big data, MapReduce, Spark](https://github.com/mahmoudparsian/big-data-mapreduce-course/blob/master/slides/glossary/README.md) 24 | 25 | ------ 26 | 27 | # [Basics of PySpark with Examples](./howto/README.md) 28 | 29 | ------ 30 | 31 | # PySpark Examples and Tutorials 32 | 33 | * [PySpark Examples: RDDs](./tutorial/pyspark-examples/rdds/) 34 | * [PySpark Examples: DataFrames](./tutorial/pyspark-examples/dataframes/) 35 | * [DNA Base Counting](./tutorial/dna-basecount/README.md) 36 | * [Classic Word Count](./tutorial/wordcount) 37 | * [Find Frequency of Bigrams](./tutorial/bigrams) 38 | * [Join of Two Relations R(K, V1), S(K, V2)](./tutorial/basic-join) 39 | * [Basic Mapping of RDD Elements](./tutorial/basic-map) 40 | * [How to add all RDD elements together](./tutorial/basic-sum) 41 | * [How to multiply all RDD elements together](./tutorial/basic-multiply) 42 | * [Find Top-N and Bottom-N](./tutorial/top-N) 43 | * [Find average by using combineByKey()](./tutorial/combine-by-key) 44 | * [How to filter RDD elements](./tutorial/basic-filter) 45 | * [How to find average](./tutorial/basic-average) 46 | * [Cartesian Product: rdd1.cartesian(rdd2)](./tutorial/cartesian) 47 | * [Sort By Key: sortByKey() ascending/descending](./tutorial/basic-sort) 48 | * [How to Add Indices](./tutorial/add-indices) 49 | * [Map Partitions: mapPartitions() by Examples](./tutorial/map-partitions/README.md) 50 | * [Monoid: Design Principle](https://github.com/mahmoudparsian/data-algorithms-with-spark/blob/master/wiki-spark/docs/monoid/README.md) 51 | * [Ranking Functions by Examples](./tutorial/ranking/README.md) 52 | 53 | ------ 54 | 55 | # Books 56 | 57 | ### [Data 
Algorithms with Spark](https://github.com/mahmoudparsian/data-algorithms-with-spark/) 58 | 59 | ### [Data Algorithms](https://github.com/mahmoudparsian/data-algorithms-book/) 60 | 61 | ### [PySpark Algorithms](https://github.com/mahmoudparsian/pyspark-algorithms/) 62 | 63 | ----- 64 | 65 | # Miscellaneous 66 | 67 | ### [Download, Install Spark and Run PySpark](./howto/download_install_run_spark.md) 68 | 69 | ### [How to Minimize the Verbosity of Spark](./howto/minimize_verbosity.md) 70 | 71 | ------- 72 | 73 | # PySpark Tutorial and References... 74 | * [Getting started with PySpark - Part 1](http://www.mccarroll.net/blog/pyspark/) 75 | * [Getting started with PySpark - Part 2](http://www.mccarroll.net/blog/pyspark2/index.html) 76 | * [A really really fast introduction to PySpark](http://www.slideshare.net/hkarau/a-really-really-fast-introduction-to-py-spark-lightning-fast-cluster-computing-with-python-1) 77 | * [PySpark](http://www.slideshare.net/thegiivee/pysaprk?qid=81cf1b31-8b19-4570-89a5-21d03cad6ecd&v=default&b=&from_search=9) 78 | * [Basic Big Data Manipulation with PySpark](http://bigdatasciencebootcamp.com/posts/Part_3/basic_big_data.html) 79 | * [Working in Pyspark: Basics of Working with Data and RDDs](http://www.learnbymarketing.com/618/pyspark-rdd-basics-examples/) 80 | 81 | ------- 82 | 83 | # Questions/Comments 84 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 85 | * Please send me an email: mahmoud.parsian@yahoo.com 86 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 87 | 88 | Thank you! 89 | 90 | ```` 91 | best regards, 92 | Mahmoud Parsian 93 | ```` 94 | 95 | ----- 96 | 97 | [//]: # (metadata:) 98 | [//]: # (Spark, PySpark, Python) 99 | [//]: # (MapReduce, Distributed Algorithms, mappers, reducers, partitioners) 100 | [//]: # (Transformations, Actions, RDDs, DataFrames, SQL) 101 | -------------------------------------------------------------------------------- /data/foxdata.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /howto/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial 2 | 3 | * Spark is a multi-language engine for executing data engineering, 4 | data science, and machine learning on single-node machines or clusters. 5 | 6 | * PySpark is the Python API for Spark. 7 | 8 | # Start PySpark 9 | 10 | First make sure that you have started the Spark cluster. 11 | To start Spark, execute the following. Note that if you 12 | are going to run the PySpark shell on your laptop/macbook, 13 | then you do not need to start any cluster -- your 14 | laptop/macbook acts as a cluster of a single node: 15 | 16 | export SPARK_HOME=<spark-install-dir> 17 | cd $SPARK_HOME 18 | ./sbin/start-all.sh 19 | 20 | 21 | To start PySpark, execute the following: 22 | 23 | 24 | cd $SPARK_HOME 25 | ./bin/pyspark 26 | 27 | 28 | Successful execution will give you the PySpark prompt: 29 | 30 | 31 | ~ % ./spark-3.3.0/bin/pyspark 32 | Python 3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)] on darwin 33 | Type "help", "copyright", "credits" or "license" for more information. 
34 | Setting default log level to "WARN". 35 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 36 | Welcome to 37 | ____ __ 38 | / __/__ ___ _____/ /__ 39 | _\ \/ _ \/ _ `/ __/ '_/ 40 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0 41 | /_/ 42 | 43 | Using Python version 3.10.5 (v3.10.5:f377153967, Jun 6 2022 12:36:10) 44 | Spark context Web UI available at http://10.0.0.232:4040 45 | Spark context available as 'sc' (master = local[*], app id = local-1656268371486). 46 | SparkSession available as 'spark'. 47 | >>> 48 | 49 | 50 | Note that the shell has already created two objects: 51 | * a SparkContext (`sc`) object, which you may use to create RDDs. 52 | * a SparkSession (`spark`) object, which you may use to create DataFrames. 53 | 54 | # Creating RDDs 55 | 56 | You may create RDDs from: 57 | * text files, 58 | * Python collections and data structures, 59 | * the local file system, 60 | * S3 and HDFS, 61 | * and other data sources. 62 | 63 | 64 | ## Create RDD from a Data Structure (or Collection) 65 | 66 | * Example-1 67 | 68 | >>> data = [1, 2, 3, 4, 5, 8, 9] 69 | >>> data 70 | [1, 2, 3, 4, 5, 8, 9] 71 | >>> myRDD = sc.parallelize(data) 72 | >>> myRDD.collect() 73 | [1, 2, 3, 4, 5, 8, 9] 74 | >>> myRDD.count() 75 | 7 76 | >>> 77 | 78 | 79 | * Example-2 80 | 81 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)] 82 | >>> kv 83 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 84 | >>> rdd2 = sc.parallelize(kv) 85 | >>> rdd2.collect() 86 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 87 | >>> 88 | >>> rdd3 = rdd2.reduceByKey(lambda x, y : x+y) 89 | >>> rdd3.collect() 90 | [('a', 9), ('c', 10), ('b', 6)] 91 | >>> 92 | 93 | 94 | * Example-3 95 | 96 | 97 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)] 98 | >>> kv 99 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 100 | >>> rdd2 = sc.parallelize(kv) 101 | >>> rdd2.collect() 102 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 103 | 104 | >>> rdd3 = rdd2.groupByKey() 105 | >>> rdd3.collect() 106 | [ 107 | ('a', <pyspark.resultiterable.ResultIterable object at 0x...>), 108 | ('c', <pyspark.resultiterable.ResultIterable object at 0x...>), 109 | ('b', <pyspark.resultiterable.ResultIterable object at 0x...>) 110 | ] 111 | 112 | >>> rdd3.map(lambda x : (x[0], list(x[1]))).collect() 113 | [ 114 | ('a', [7, 2]), 115 | ('c', [1, 2, 3, 4]), 116 | ('b', [2, 4]) 117 | ] 118 | >>> 119 | 120 | 121 | 122 | # Create RDD from a Local File System (Java Example) 123 | 124 | import org.apache.spark.api.java.JavaRDD; 125 | import org.apache.spark.api.java.JavaSparkContext; 126 | ... 127 | JavaSparkContext context = new JavaSparkContext(); 128 | ... 129 | final String inputPath ="file:///dir1/dir2/myinputfile.txt"; 130 | JavaRDD<String> rdd = context.textFile(inputPath); 131 | ... 132 | 133 | 134 | # Create RDD from HDFS (Java Example) 135 | 136 | * Example-1: 137 | 138 | import org.apache.spark.api.java.JavaRDD; 139 | import org.apache.spark.api.java.JavaSparkContext; 140 | ... 141 | JavaSparkContext context = new JavaSparkContext(); 142 | ... 143 | final String inputPath ="hdfs://myhadoopserver:9000/dir1/dir2/myinputfile.txt"; 144 | JavaRDD<String> rdd = context.textFile(inputPath); 145 | ... 146 | 147 | * Example-2: 148 | 149 | 150 | import org.apache.spark.api.java.JavaRDD; 151 | import org.apache.spark.api.java.JavaSparkContext; 152 | ... 153 | JavaSparkContext context = new JavaSparkContext(); 154 | ... 
155 | final String inputPath ="/dir1/dir2/myinputfile.txt"; 156 | JavaRDD<String> rdd = context.textFile(inputPath); 157 | ... 158 | 159 | 160 | # Questions/Comments 161 | 162 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 163 | * Please send me an email: mahmoud.parsian@yahoo.com 164 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 165 | 166 | 167 | Thank you! 168 | 169 | ```` 170 | best regards, 171 | Mahmoud Parsian 172 | ```` 173 | 174 | ----- 175 | -------------------------------------------------------------------------------- /howto/download_install_run_spark.md: -------------------------------------------------------------------------------- 1 | # Download, Install, and Run PySpark 2 | 3 | # 1. For macbook users: Enable "Remote Login" 4 | 5 | 6 | System Preferences --> Sharing --> enable "Remote Login" service 7 | 8 | 9 | 10 | # 2. Make Sure Java 8 is Installed Properly 11 | 12 | java -version 13 | java version "1.8.0_72" 14 | Java(TM) SE Runtime Environment (build 1.8.0_72-b15) 15 | Java HotSpot(TM) 64-Bit Server VM (build 25.72-b15, mixed mode) 16 | 17 | 18 | # 3. Download 19 | 20 | Download the latest binary Spark from the following URL: 21 | 22 | https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 23 | 24 | 25 | # 4. Unpack the Downloaded File 26 | 27 | Assuming that the file has been downloaded as 28 | `/home/mparsian/spark-3.3.0-bin-hadoop3.tgz`: 29 | 30 | 31 | cd /home/mparsian 32 | 33 | tar zvfx spark-3.3.0-bin-hadoop3.tgz 34 | x spark-3.3.0-bin-hadoop3/ 35 | x spark-3.3.0-bin-hadoop3/NOTICE 36 | x spark-3.3.0-bin-hadoop3/CHANGES.txt 37 | ... 38 | 39 | 40 | # 5. Start the Spark Cluster 41 | 42 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/ 43 | 44 | ./sbin/start-all.sh 45 | 46 | NOTE: If you are going to run Spark on your pc/macbook/windows machine, 47 | then you do NOT need to start a cluster at all. When you invoke 48 | ./bin/pyspark, your laptop acts as a single-node cluster. 49 | 50 | 51 | # 6. Check Master and Worker 52 | 53 | Make sure that the Master and Worker processes are running: 54 | 55 | 56 | jps 57 | 1347 Master 58 | 1390 Worker 59 | 60 | 61 | # 7. Check The Spark URL 62 | 63 | http://localhost:8080 64 | 65 | 66 | # 8. Define Very Basic Python Programs 67 | 68 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test.py` 69 | 70 | #!/usr/bin/python 71 | import sys 72 | 73 | for line in sys.stdin: 74 | print("hello " + line) 75 | 76 | 77 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py` 78 | 79 | #!/usr/bin/python 80 | def fun2(str): 81 | str2 = str + " zaza" 82 | return str2 83 | 84 | 85 | # 9. Start and Run pyspark 86 | 87 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/ 88 | ./bin/pyspark 89 | ... 90 | ... 
91 | Welcome to 92 | ____ __ 93 | / __/__ ___ _____/ /__ 94 | _\ \/ _ \/ _ `/ __/ '_/ 95 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0 96 | /_/ 97 | 98 | >>> data = ["john","paul","george","ringo"] 99 | >>> data 100 | ['john', 'paul', 'george', 'ringo'] 101 | 102 | >>> rdd = sc.parallelize(data) 103 | >>> rdd.collect() 104 | ['john', 'paul', 'george', 'ringo'] 105 | 106 | 107 | >>> test = "/home/mparsian/spark-3.3.0-bin-hadoop3/test.py" 108 | >>> test2 = "/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py" 109 | >>> import test 110 | >>> import test2 111 | 112 | 113 | >>> pipeRDD = rdd.pipe(test) 114 | >>> pipeRDD.collect() 115 | [u'hello john', u'', u'hello paul', u'', u'hello george', u'', u'hello ringo', u''] 116 | 117 | 118 | >>> rdd.collect() 119 | ['john', 'paul', 'george', 'ringo'] 120 | 121 | 122 | >>> rdd2 = rdd.map(lambda x : test2.fun2(x)) 123 | >>> rdd2.collect() 124 | ['john zaza', 'paul zaza', 'george zaza', 'ringo zaza'] 125 | >>> 126 | 127 | -------------------------------------------------------------------------------- /howto/minimize_verbosity.md: -------------------------------------------------------------------------------- 1 | How to Minimize the Verbosity of Spark 2 | ====================================== 3 | * Step-1: create a log4j.properties file 4 | ```` 5 | cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties 6 | ```` 7 | * Step-2: Edit $SPARK_HOME/conf/log4j.properties file: replace "INFO" with "WARN" 8 | 9 | * Now your file should look like: 10 | ```` 11 | cat $SPARK_HOME/conf/log4j.properties 12 | # Set everything to be logged to the console 13 | log4j.rootCategory=WARN, console 14 | log4j.appender.console=org.apache.log4j.ConsoleAppender 15 | log4j.appender.console.target=System.err 16 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 18 | 19 | # Settings to quiet third party logs that are too verbose 20 | log4j.logger.org.eclipse.jetty=WARN 21 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 22 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN 23 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN 24 | ```` -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.pdf -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.png -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data_Algorithms_with_Spark_COVER_9781492082385.png -------------------------------------------------------------------------------- /images/data_algorithms_image.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_image.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_with_spark.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/pyspark_algorithms2.jpg -------------------------------------------------------------------------------- /tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/.DS_Store -------------------------------------------------------------------------------- /tutorial/add-indices/add-indices.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Welcome to 3 | ____ __ 4 | / __/__ ___ _____/ /__ 5 | _\ \/ _ \/ _ `/ __/ '_/ 6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 7 | /_/ 8 | 9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 10 | SparkContext available as sc, SQLContext available as sqlContext. 11 | >>> a = [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 12 | >>> a 13 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 14 | 15 | >>> rdd = sc.parallelize(a); 16 | >>> rdd.collect() 17 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 18 | 19 | >>> sorted = rdd.sortByKey() 20 | >>> sorted.collect() 21 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 22 | 23 | 24 | >>> rdd2 = rdd.map(lambda (x,y) : (y,x)) 25 | >>> rdd2.collect() 26 | [(2, 'g1'), (4, 'g2'), (3, 'g3'), (8, 'g4')] 27 | 28 | >>> sorted = rdd2.sortByKey() 29 | >>> sorted.collect() 30 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 31 | 32 | 33 | >>> sorted = rdd2.sortByKey(False) 34 | >>> sorted.collect() 35 | [(8, 'g4'), (4, 'g2'), (3, 'g3'), (2, 'g1')] 36 | 37 | >>> sorted = rdd2.sortByKey() 38 | >>> sorted.collect() 39 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 40 | >>> 41 | >>> list 42 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 43 | 44 | >>> 45 | >>> sorted.collect() 46 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 47 | 48 | >>> indices = sorted.zipWithIndex() 49 | >>> indices.collect() 50 | [((2, 'g1'), 0), ((3, 'g3'), 1), ((4, 'g2'), 2), ((8, 'g4'), 3)] 51 | >>> -------------------------------------------------------------------------------- /tutorial/basic-average/basic-average.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 
13 | >>> sc 14 | 15 | >>> 16 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20]) 17 | >>> nums.collect() 18 | [1, 2, 3, 4, 5, 6, 7, 8, 20] 19 | >>> sumAndCount = nums.map(lambda x: (x, 1)).fold((0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 20 | >>> sumAndCount 21 | (56, 9) 22 | >>> 23 | >>> avg = float(sumAndCount[0]) / float(sumAndCount[1]) 24 | >>> avg 25 | 6.2222222222222223 26 | >>> 27 | -------------------------------------------------------------------------------- /tutorial/basic-filter/basic-filter.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 13 | >>> sc 14 | 15 | 16 | >>> 17 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7]) 18 | >>> nums.collect() 19 | [1, 2, 3, 4, 5, 6, 7] 20 | 21 | >>> filtered1 = nums.filter(lambda x : x % 2 == 1) 22 | >>> filtered1.collect() 23 | [1, 3, 5, 7] 24 | >>> 25 | >>> filtered2 = nums.filter(lambda x : x % 2 == 0) 26 | >>> filtered2.collect() 27 | [2, 4, 6] 28 | >>> 29 | -------------------------------------------------------------------------------- /tutorial/basic-join/basicjoin.txt: -------------------------------------------------------------------------------- 1 | # cat > R.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k3,v7 7 | k3,v8 8 | k3,v9 9 | 10 | # cat > S.txt 11 | k1,v11 12 | k1,v22 13 | k1,v33 14 | k2,v55 15 | k4,v77 16 | k5,v88 17 | 18 | # ./pyspark 19 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 20 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 21 | Type "help", "copyright", "credits" or "license" for more information. 22 | Welcome to 23 | ____ __ 24 | / __/__ ___ _____/ /__ 25 | _\ \/ _ \/ _ `/ __/ '_/ 26 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 27 | /_/ 28 | 29 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 30 | SparkContext available as sc. 
31 | >>> R = sc.textFile("R.txt"); 32 | >>> R.collect() 33 | [u'k1,v1', 34 | u'k1,v2', 35 | u'k2,v3', 36 | u'k2,v4', 37 | u'k3,v7', 38 | u'k3,v8', 39 | u'k3,v9'] 40 | 41 | >>> S = sc.textFile("S.txt"); 42 | >>> S.collect() 43 | [u'k1,v11', 44 | u'k1,v22', 45 | u'k1,v33', 46 | u'k2,v55', 47 | u'k4,v77', 48 | u'k5,v88' 49 | ] 50 | 51 | >>> r1 = R.map(lambda s: s.split(",")) 52 | >>> r1.collect() 53 | [ 54 | [u'k1', u'v1'], 55 | [u'k1', u'v2'], 56 | [u'k2', u'v3'], 57 | [u'k2', u'v4'], 58 | [u'k3', u'v7'], 59 | [u'k3', u'v8'], 60 | [u'k3', u'v9'] 61 | ] 62 | >>> r2 = r1.flatMap(lambda s: [(s[0], s[1])]) 63 | >>> r2.collect() 64 | [ 65 | (u'k1', u'v1'), 66 | (u'k1', u'v2'), 67 | (u'k2', u'v3'), 68 | (u'k2', u'v4'), 69 | (u'k3', u'v7'), 70 | (u'k3', u'v8'), 71 | (u'k3', u'v9') 72 | ] 73 | >>> 74 | >>> s1 = S.map(lambda s: s.split(",")) 75 | >>> s1.collect() 76 | [ 77 | [u'k1', u'v11'], 78 | [u'k1', u'v22'], 79 | [u'k1', u'v33'], 80 | [u'k2', u'v55'], 81 | [u'k4', u'v77'], 82 | [u'k5', u'v88'] 83 | ] 84 | >>> s2 = s1.flatMap(lambda s: [(s[0], s[1])]) 85 | >>> s2.collect() 86 | [ 87 | (u'k1', u'v11'), 88 | (u'k1', u'v22'), 89 | (u'k1', u'v33'), 90 | (u'k2', u'v55'), 91 | (u'k4', u'v77'), 92 | (u'k5', u'v88') 93 | ] 94 | >>> RjoinedS = r2.join(s2) 95 | >>> RjoinedS.collect() 96 | [ 97 | (u'k2', (u'v3', u'v55')), 98 | (u'k2', (u'v4', u'v55')), 99 | (u'k1', (u'v1', u'v11')), 100 | (u'k1', (u'v1', u'v22')), 101 | (u'k1', (u'v1', u'v33')), 102 | (u'k1', (u'v2', u'v11')), 103 | (u'k1', (u'v2', u'v22')), 104 | (u'k1', (u'v2', u'v33')) 105 | ] 106 | >>> -------------------------------------------------------------------------------- /tutorial/basic-map/basic-map.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 8 | /_/ 9 | 10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 11 | SparkContext available as sc. 12 | >>> sc 13 | 14 | >>> 15 | >>> nums = sc.parallelize([1, 2, 3, 4, 5]) 16 | >>> nums.collect() 17 | [1, 2, 3, 4, 5] 18 | >>> 19 | >>> bytwo = nums.map(lambda x: x + 2) 20 | >>> bytwo.collect() 21 | [3, 4, 5, 6, 7] 22 | >>> 23 | >>> squared = nums.map(lambda x: x * x) 24 | >>> squared.collect() 25 | [1, 4, 9, 16, 25] 26 | >>> 27 | -------------------------------------------------------------------------------- /tutorial/basic-multiply/basic-multiply.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 10 | /_/ 11 | 12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 13 | SparkContext available as sc. 
14 | >>> sc 15 | 16 | >>> numbers = sc.parallelize([1, 2, 3, 4]) 17 | >>> mult = numbers.fold(1, (lambda x, y: x * y)) 18 | 19 | >>> mult 20 | 24 21 | -------------------------------------------------------------------------------- /tutorial/basic-sort/sort-by-key.txt: -------------------------------------------------------------------------------- 1 | # cat data.txt 2 | crazy crazy fox jumped 3 | crazy fox jumped 4 | fox is fast 5 | fox is smart 6 | dog is smart 7 | 8 | # ./bin/pyspark 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 14 | /_/ 15 | 16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 17 | SparkContext available as sc, SQLContext available as sqlContext. 18 | >>> 19 | >>> lines = sc.textFile('data.txt', 1); 20 | >>> lines.collect() 21 | [ 22 | u'crazy crazy fox jumped', 23 | u'crazy fox jumped', 24 | u'fox is fast', 25 | u'fox is smart', 26 | u'dog is smart' 27 | ] 28 | 29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 30 | >>> frequencies.collect() 31 | [ 32 | (u'crazy', 3), 33 | (u'jumped', 2), 34 | (u'is', 3), 35 | (u'fox', 4), 36 | (u'dog', 1), 37 | (u'fast', 1), 38 | (u'smart', 2) 39 | ] 40 | 41 | >>> frequencies.count() 42 | 7 43 | 44 | >>> sorted = frequencies.sortByKey() 45 | >>> sorted.collect() 46 | [ 47 | (u'crazy', 3), 48 | (u'dog', 1), 49 | (u'fast', 1), 50 | (u'fox', 4), 51 | (u'is', 3), 52 | (u'jumped', 2), 53 | (u'smart', 2) 54 | ] 55 | >>> 56 | >>> sortedDescending = frequencies.sortByKey(False) 57 | >>> sortedDescending.collect() 58 | [ 59 | (u'smart', 2), 60 | (u'jumped', 2), 61 | (u'is', 3), 62 | (u'fox', 4), 63 | (u'fast', 1), 64 | (u'dog', 1), 65 | (u'crazy', 3) 66 | ] 67 | -------------------------------------------------------------------------------- /tutorial/basic-sum/basic-sum.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 10 | /_/ 11 | 12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 13 | SparkContext available as sc. 14 | >>> sc 15 | 16 | >>> numbers = sc.parallelize([1, 2, 3, 4]) 17 | >>> sum = numbers.fold(0, (lambda x, y: x + y)) 18 | 19 | >>> sum 20 | 10 21 | -------------------------------------------------------------------------------- /tutorial/basic-union/basic-union.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Welcome to 3 | ____ __ 4 | / __/__ ___ _____/ /__ 5 | _\ \/ _ \/ _ `/ __/ '_/ 6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 7 | /_/ 8 | 9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 10 | SparkContext available as sc, SQLContext available as sqlContext. 
11 | 12 | >>> d1= [('k1', 1), ('k2', 2), ('k3', 5)] 13 | >>> d1 14 | [('k1', 1), ('k2', 2), ('k3', 5)] 15 | 16 | >>> d2= [('k1', 3), ('k2',4), ('k4', 8)] 17 | >>> d2 18 | [('k1', 3), ('k2', 4), ('k4', 8)] 19 | 20 | >>> rdd1 = sc.parallelize(d1) 21 | >>> rdd1.collect() 22 | [('k1', 1), ('k2', 2), ('k3', 5)] 23 | 24 | >>> rdd2 = sc.parallelize(d2) 25 | >>> rdd2.collect(); 26 | [('k1', 3), ('k2', 4), ('k4', 8)] 27 | 28 | >>> rdd3 = rdd1.union(rdd2) 29 | >>> rdd3.collect() 30 | [('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)] 31 | 32 | >>> rdd4 = rdd3.reduceByKey(lambda x,y: x+y) 33 | >>> rdd4.collect() 34 | [('k3', 5), ('k2', 6), ('k1', 4), ('k4', 8)] -------------------------------------------------------------------------------- /tutorial/bigrams/bigrams.txt: -------------------------------------------------------------------------------- 1 | 1. Prepare Input 2 | 3 | # cat data.txt 4 | crazy crazy fox jumped over the fence 5 | crazy fox jumped 6 | the fence is high for fox 7 | crazy fox is smart 8 | fox jumped very high 9 | 10 | 2. Invoke pyspark 11 | 12 | # export SPARK_HOME=... 13 | # SPARK_HOME/bin/pyspark 14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 16 | Type "help", "copyright", "credits" or "license" for more information. 17 | Welcome to 18 | ____ __ 19 | / __/__ ___ _____/ /__ 20 | _\ \/ _ \/ _ `/ __/ '_/ 21 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 22 | /_/ 23 | 24 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 25 | SparkContext available as sc. 26 | >>> sc 27 | 28 | >>> lines = sc.textFile("data.txt") 29 | >>> lines.collect() 30 | 31 | [u'crazy crazy fox jumped over the fence', 32 | u'crazy fox jumped', 33 | u'the fence is high for fox', 34 | u'crazy fox is smart', 35 | u'fox jumped very high' 36 | ] 37 | >>> bigrams = lines.map(lambda s : s.split(" ")).flatMap(lambda s: [((s[i],s[i+1]),1) for i in range (0, len(s)-1)]) 38 | >>> bigrams.collect() 39 | [((u'crazy', u'crazy'), 1), 40 | ((u'crazy', u'fox'), 1), 41 | ((u'fox', u'jumped'), 1), 42 | ((u'jumped', u'over'), 1), 43 | ((u'over', u'the'), 1), 44 | ((u'the', u'fence'), 1), 45 | ((u'crazy', u'fox'), 1), 46 | ((u'fox', u'jumped'), 1), 47 | ((u'the', u'fence'), 1), 48 | ((u'fence', u'is'), 1), 49 | ((u'is', u'high'), 1), 50 | ((u'high', u'for'), 1), 51 | ((u'for', u'fox'), 1), 52 | ((u'crazy', u'fox'), 1), 53 | ((u'fox', u'is'), 1), 54 | ((u'is', u'smart'), 1), 55 | ((u'fox', u'jumped'), 1), 56 | ((u'jumped', u'very'), 1), 57 | ((u'very', u'high'), 1) 58 | ] 59 | >>> 60 | >>> counts = bigrams.reduceByKey(lambda x, y : x+y) 61 | >>> counts.collect() 62 | [ 63 | ((u'high', u'for'), 1), 64 | ((u'fox', u'is'), 1), 65 | ((u'is', u'smart'), 1), 66 | ((u'is', u'high'), 1), 67 | ((u'fence', u'is'), 1), 68 | ((u'very', u'high'), 1), 69 | ((u'crazy', u'fox'), 3), 70 | ((u'over', u'the'), 1), 71 | ((u'for', u'fox'), 1), 72 | ((u'the', u'fence'), 2), 73 | ((u'crazy', u'crazy'), 1), 74 | ((u'jumped', u'over'), 1), 75 | ((u'jumped', u'very'), 1), 76 | ((u'fox', u'jumped'), 3) 77 | ] 78 | -------------------------------------------------------------------------------- /tutorial/cartesian/cartesian.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | ... 
4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc, SQLContext available as sqlCtx. 13 | >>> a = [('k1','v1'), ('k2', 'v2')] 14 | >>> a 15 | [('k1', 'v1'), ('k2', 'v2')] 16 | >>> b = [('k3','v3'), ('k4', 'v4'), ('k5', 'v5') ] 17 | >>> b 18 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')] 19 | >>> rdd1= sc.parallelize(a) 20 | >>> rdd1.collect() 21 | [('k1', 'v1'), ('k2', 'v2')] 22 | >>> rdd2= sc.parallelize(b) 23 | >>> rdd2.collect() 24 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')] 25 | >>> rdd3 = rdd1.cartesian(rdd2) 26 | >>> rdd3.collect() 27 | [ 28 | (('k1', 'v1'), ('k3', 'v3')), 29 | (('k1', 'v1'), ('k4', 'v4')), 30 | (('k1', 'v1'), ('k5', 'v5')), 31 | (('k2', 'v2'), ('k3', 'v3')), 32 | (('k2', 'v2'), ('k4', 'v4')), 33 | (('k2', 'v2'), ('k5', 'v5')) 34 | ] 35 | >>> 36 | -------------------------------------------------------------------------------- /tutorial/combine-by-key/README.md: -------------------------------------------------------------------------------- 1 | Spark's combineByKey() Examples and Tutorial 2 | ============================================ 3 | 4 | * [Mean Calculation by combineByKey()](./spark-combineByKey.md) 5 | * [Standard Deviation and Mean Calculation by combineByKey()](./standard_deviation_by_combineByKey.md) 6 | 7 | 8 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 9 | -------------------------------------------------------------------------------- /tutorial/combine-by-key/combine-by-key.txt: -------------------------------------------------------------------------------- 1 | # export SPARK_HOME=... 2 | # SPARK_HOME/bin/pyspark 3 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 
13 | >>> sc 14 | 15 | 16 | >>> input = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5), 17 | ("k2", 6), ("k2", 7), ("k2", 8), 18 | ("k3", 10), ("k3", 12)] 19 | >>> rdd = sc.parallelize(input) 20 | >>> sumCount = rdd.combineByKey( 21 | (lambda x: (x, 1)), 22 | (lambda x, y: (x[0] + y, x[1] + 1)), 23 | (lambda x, y: (x[0] + y[0], x[1] + y[1])) 24 | ) 25 | >>> sumCount.collect() 26 | [('k3', (22, 2)), ('k2', (21, 3)), ('k1', (15, 5))] 27 | >>> 28 | >>> avg = sumCount.mapValues( lambda v : v[0] / v[1]) 29 | >>> avg.collect() 30 | [('k3', 11), ('k2', 7), ('k1', 3)] 31 | >>> -------------------------------------------------------------------------------- /tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf -------------------------------------------------------------------------------- /tutorial/combine-by-key/standard_deviation_by_combineByKey.md: -------------------------------------------------------------------------------- 1 | Mean and Standard Deviation by Spark's combineByKey() 2 | ===================================================== 3 | 4 | ```` 5 | # ./bin/pyspark 6 | Python 2.7.10 (default, Oct 23 2015, 19:19:21) 7 | ... 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 13 | /_/ 14 | 15 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) 16 | SparkContext available as sc, HiveContext available as sqlContext. 17 | >>> data = [ 18 | ... ("A", 2.), ("A", 4.), ("A", 9.), 19 | ... ("B", 10.), ("B", 20.), 20 | ... ("Z", 3.), ("Z", 5.), ("Z", 8.), ("Z", 12.) 21 | ... ] 22 | >>> data 23 | [ 24 | ('A', 2.0), 25 | ('A', 4.0), 26 | ('A', 9.0), 27 | ('B', 10.0), 28 | ('B', 20.0), 29 | ('Z', 3.0), 30 | ('Z', 5.0), 31 | ('Z', 8.0), 32 | ('Z', 12.0) 33 | ] 34 | >>> rdd = sc.parallelize( data ) 35 | >>> rdd.collect() 36 | [ 37 | ('A', 2.0), 38 | ('A', 4.0), 39 | ('A', 9.0), 40 | ('B', 10.0), 41 | ('B', 20.0), 42 | ('Z', 3.0), 43 | ('Z', 5.0), 44 | ('Z', 8.0), 45 | ('Z', 12.0) 46 | ] 47 | >>> rdd.count() 48 | 9 49 | >>> sumCount = rdd.combineByKey(lambda value: (value, value*value, 1), 50 | ... lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1), 51 | ... lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]) 52 | ... ) 53 | 54 | >>> sumCount.collect() 55 | [ 56 | ('A', (15.0, 101.0, 3)), 57 | ('Z', (28.0, 242.0, 4)), 58 | ('B', (30.0, 500.0, 2)) 59 | ] 60 | 61 | >>> import math 62 | >>> def stdDev( sumX, sumSquared, n ): 63 | ... mean = sumX / n 64 | ... stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n) 65 | ... return (mean, stdDeviation) 66 | ... ^D 67 | 68 | >>> meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2])) 69 | >>> meanAndStdDev.collect() 70 | [ 71 | ('A', (5.0, 2.943920288775949)), 72 | ('Z', (7.0, 3.391164991562634)), 73 | ('B', (15.0, 5.0)) 74 | ] 75 | >>> 76 | ```` -------------------------------------------------------------------------------- /tutorial/dna-basecount/README.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting 2 | ================= 3 | 4 | The following examples demostrates the usage of PySpark to count DNA bases. 
5 | In a nutshell, ````DNA Base Counting```` counts the number of A's, T's, C's, G's, 6 | and N's (N refers to undefined code). 7 | 8 | 9 | * [DNA Base Counting Without In-Mapper Combiner](./dna-basecount.md) 10 | 11 | * [DNA Base Counting With In-Mapper Combiner](./dna-basecount2.md) 12 | 13 | * [DNA Base Counting With External Python Function](./dna-basecount3.md) 14 | 15 | 16 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 17 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/basemapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | def mapper(seq): 4 | freq = dict() 5 | for x in list(seq): 6 | if x in freq: 7 | freq[x] +=1 8 | else: 9 | freq[x] = 1 10 | # 11 | kv = [(x, freq[x]) for x in freq] 12 | return kv 13 | # 14 | #print mapper("ATCGATCGATAT") 15 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark 2 | =============================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 11 | This solution emits a ````(base, 1)```` for every base in 12 | a given sequence and then aggregates all frequencies for 13 | unique bases. 14 | 15 | 16 | ```` 17 | $ cat /home/mparsian/dna_seq.txt 18 | ATATCCCCGGGAT 19 | ATCGATCGATAT 20 | 21 | 22 | # ./bin/pyspark 23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39) 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0 29 | /_/ 30 | 31 | SparkContext available as sc, HiveContext available as sqlContext. 
32 | >>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt') 33 | 34 | >>> recs.collect() 35 | [ 36 | u'ATATCCCCGGGAT', 37 | u'ATCGATCGATAT' 38 | ] 39 | 40 | >>> rdd = recs.flatMap(lambda x : [(c,1) for c in list(x)]) 41 | >>> rdd.collect() 42 | [ 43 | (u'A', 1), 44 | (u'T', 1), 45 | (u'A', 1), 46 | (u'T', 1), 47 | (u'C', 1), 48 | (u'C', 1), 49 | (u'C', 1), 50 | (u'C', 1), 51 | (u'G', 1), 52 | (u'G', 1), 53 | (u'G', 1), 54 | (u'A', 1), 55 | (u'T', 1), 56 | (u'A', 1), 57 | (u'T', 1), 58 | (u'C', 1), 59 | (u'G', 1), 60 | (u'A', 1), 61 | (u'T', 1), 62 | (u'C', 1), 63 | (u'G', 1), 64 | (u'A', 1), 65 | (u'T', 1), 66 | (u'A', 1), 67 | (u'T', 1) 68 | ] 69 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 70 | >>> baseCount.collect() 71 | [ 72 | (u'A', 7), 73 | (u'C', 6), 74 | (u'G', 5), 75 | (u'T', 7) 76 | ] 77 | >>> 78 | ```` 79 | 80 | 81 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount2.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark Using In-Mapper Combiner 2 | ======================================================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 11 | This solution uses "In-Mapper Combiner" design pattern 12 | and aggregates bases for each sequence before full 13 | aggregation of all frequencies for unique bases. 14 | 15 | 16 | ```` 17 | $ cat /home/mparsian/dna_seq.txt 18 | ATATCCCCGGGAT 19 | ATCGATCGATAT 20 | 21 | 22 | # ./bin/pyspark 23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39) 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0 29 | /_/ 30 | 31 | SparkContext available as sc, HiveContext available as sqlContext. 32 | >>> recs = sc.texFile('file:///home/mparsian/dna_seq.txt') 33 | 34 | >>> recs.collect() 35 | [ 36 | u'ATATCCCCGGGAT', 37 | u'ATCGATCGATAT' 38 | ] 39 | 40 | >>> def mapper(seq): 41 | ... freq = dict() 42 | ... for x in list(seq): 43 | ... if x in freq: 44 | ... freq[x] +=1 45 | ... else: 46 | ... freq[x] = 1 47 | ... # 48 | ... kv = [(x, freq[x]) for x in freq] 49 | ... return kv 50 | ... ^D 51 | 52 | 53 | >>> rdd = recs.flatMap(mapper) 54 | >>> rdd.collect() 55 | [ 56 | (u'A', 3), 57 | (u'C', 4), 58 | (u'T', 3), 59 | (u'G', 3), 60 | (u'A', 4), 61 | (u'C', 2), 62 | (u'T', 4), 63 | (u'G', 2) 64 | ] 65 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 66 | >>> baseCount.collect() 67 | [ 68 | (u'A', 7), 69 | (u'C', 6), 70 | (u'G', 5), 71 | (u'T', 7) 72 | ] 73 | >>> 74 | ```` 75 | 76 | 77 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount3.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark 2 | =============================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 
11 | This solution emits a ````(base, 1)```` for every base in 12 | a given sequence and then aggregates all frequencies for 13 | unique bases. For this solution we use an external Python 14 | function defined in ````basemapper.py```` 15 | 16 | * Define Python Function 17 | 18 | ```` 19 | $ export SPARK_HOME=/home/mparsian/spark-1.6.1-bin-hadoop2.6 20 | $ cat $SPARK_HOME/basemapper.py 21 | #!/usr/bin/python 22 | 23 | def mapper(seq): 24 | freq = dict() 25 | for x in list(seq): 26 | if x in freq: 27 | freq[x] +=1 28 | else: 29 | freq[x] = 1 30 | # 31 | kv = [(x, freq[x]) for x in freq] 32 | return kv 33 | # 34 | #for testing: 35 | #print mapper("ATCGATCGATAT") 36 | ```` 37 | * Define Very Basic Sample Input 38 | 39 | ```` 40 | $ cat /home/mparsian/dna_seq.txt 41 | ATATCCCCGGGAT 42 | ATCGATCGATAT 43 | ```` 44 | 45 | * Sample PySpark Run 46 | 47 | ```` 48 | # ./bin/pyspark 49 | Welcome to 50 | ____ __ 51 | / __/__ ___ _____/ /__ 52 | _\ \/ _ \/ _ `/ __/ '_/ 53 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 54 | /_/ 55 | 56 | SparkContext available as sc, HiveContext available as sqlContext. 57 | >>> recs = sc.texFile('file:///home/mparsian/dna_seq.txt') 58 | 59 | >>> recs.collect() 60 | [ 61 | u'ATATCCCCGGGAT', 62 | u'ATCGATCGATAT' 63 | ] 64 | 65 | >>> basemapper = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/basemapper.py" 66 | >>> import basemapper 67 | >>> basemapper 68 | 69 | >>> 70 | >>> recs = sc.textFile('file:////Users/mparsian/zmp/github/pyspark-tutorial/tutorial/dna-basecount/dna_seq.txt') 71 | >>> rdd = recs.flatMap(basemapper.mapper) 72 | >>> rdd.collect() 73 | [(u'A', 3), (u'C', 4), (u'T', 3), (u'G', 3), (u'A', 4), (u'C', 2), (u'T', 4), (u'G', 2)] 74 | 75 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 76 | >>> baseCount.collect() 77 | [(u'A', 7), (u'C', 6), (u'G', 5), (u'T', 7)] 78 | >>> 79 | ```` -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATATCCCCGGGAT 2 | ATCGATCGATAT 3 | -------------------------------------------------------------------------------- /tutorial/map-partitions/README.md: -------------------------------------------------------------------------------- 1 | Spark's mapPartitions() 2 | ======================= 3 | 4 | According to Spark API: ````mapPartitions(func)```` transformation is 5 | similar to ````map()````, but runs separately on each partition (block) 6 | of the RDD, so ````func```` must be of type ````Iterator => Iterator```` 7 | when running on an RDD of type T. 8 | 9 | 10 | The ````mapPartitions()```` transformation should be used when you want to 11 | extract some condensed information (such as finding the minimum and maximum 12 | of numbers) from each partition. For example, if you want to find the minimum 13 | and maximum of all numbers in your input, then using ````map()```` can be 14 | pretty inefficient, since you will be generating tons of intermediate 15 | (K,V) pairs, but the bottom line is you just want to find two numbers: the 16 | minimum and maximum of all numbers in your input. Another example can be if 17 | you want to find top-10 (or bottom-10) for your input, then mapPartitions() 18 | can work very well: find the top-10 (or bottom-10) per partition, then find 19 | the top-10 (or bottom-10) for all partitions: this way you are limiting 20 | emitting too many intermediate (K,V) pairs. 
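The top-N use case mentioned above is not worked out in the examples that follow, so here is a minimal sketch (an assumed illustration, not one of the original examples) of finding the global top-3 with ````mapPartitions()````: each partition emits only its local top-3, and the driver merges the small per-partition lists. It reuses the small data set of Example-2 below.

````
# minimal sketch (assumed example): global top-3 via per-partition top-3
import heapq

data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
rdd = sc.parallelize(data, 3)

def top3(iterator):
    # emit at most 3 numbers per partition
    yield heapq.nlargest(3, iterator)

# each partition contributes a tiny list; merge the lists in the driver
per_partition = rdd.mapPartitions(top3).collect()
top_3 = heapq.nlargest(3, (x for tops in per_partition for x in tops))
# for this data set, top_3 is [20, 20, 20]
````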
21 | 22 | 23 | Example-1: Sum Each Partition 24 | ============================= 25 | ```` 26 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 27 | >>> numbers 28 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 29 | 30 | >>> rdd = sc.parallelize(numbers, 3) 31 | 32 | >>> rdd.collect() 33 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 34 | 35 | >>> rdd.getNumPartitions() 36 | 3 37 | 38 | >>> def f(iterator): 39 | ... for x in iterator: 40 | ... print(x) 41 | ... print "===" 42 | ... 43 | >>> rdd.mapPartitions(f).collect() 44 | 1 45 | 2 46 | 3 47 | === 48 | 7 49 | 8 50 | 9 51 | 10 52 | === 53 | 4 54 | 5 55 | 6 56 | === 57 | 58 | >>> def adder(iterator): 59 | ... yield sum(iterator) 60 | ... 61 | >>> rdd.mapPartitions(adder).collect() 62 | [6, 15, 34] 63 | 64 | ```` 65 | 66 | 67 | Example-2: Find Minimum and Maximum 68 | =================================== 69 | Use ````mapPartitions()```` and find the minimum and maximum from each partition. 70 | 71 | To make it a cleaner solution, we define a python function to return the minimum and maximum 72 | for a given iteration. 73 | 74 | ```` 75 | $ cat minmax.py 76 | #!/usr/bin/python 77 | 78 | def minmax(iterator): 79 | firsttime = 0 80 | #min = 0; 81 | #max = 0; 82 | for x in iterator: 83 | if (firsttime == 0): 84 | min = x; 85 | max = x; 86 | firsttime = 1 87 | else: 88 | if x > max: 89 | max = x 90 | if x < min: 91 | min = x 92 | # 93 | return [(min, max)] 94 | # 95 | #data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 96 | #print minmax(data) 97 | ```` 98 | Then we use the minmax function for the ````mapPartitions()````: 99 | 100 | >>> rdd = spark.sparkContext.parallelize(data, 3) 101 | >>> mapped = rdd.mapPartitions(minmax) 102 | >>> mapped.collect() 103 | [(3, 20), (2, 5), (2, 20)] 104 | >>> minmax_list = mapped.collect() 105 | >>> minimum = min(minmax_list[0]) 106 | >>> minimum 107 | 3 108 | >>> maximum = max(minmax_list[0]) 109 | >>> maximum 110 | 20 111 | 112 | ```` 113 | ### NOTE: data can be huge, but for understanding 114 | ### the mapPartitions() we use a very small data set 115 | 116 | >>> data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 117 | >>> rdd = sc.parallelize(data, 3) 118 | 119 | >>> rdd.getNumPartitions() 120 | 3 121 | 122 | >>> rdd.collect() 123 | [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 124 | 125 | >>> def f(iterator): 126 | ... for x in iterator: 127 | ... print(x) 128 | ... print "===" 129 | ... ^D 130 | 131 | >>> rdd.foreachPartition(f) 132 | 10 133 | 20 134 | 3 135 | === 136 | 4 137 | 5 138 | 2 139 | === 140 | 2 141 | 20 142 | 20 143 | 10 144 | === 145 | >>> 146 | 147 | >>> minmax = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/minmax.py" 148 | >>> import minmax 149 | 150 | ### NOTE: the minmaxlist is a small list of numbers 151 | ### two mumbers (min and max) are generated per partition 152 | >>> minmaxlist = rdd.mapPartitions(minmax.minmax).collect() 153 | >>> minmaxlist 154 | [3, 20, 2, 5, 2, 20] 155 | 156 | >>> min(minmaxlist) 157 | 2 158 | >>> max(minmaxlist) 159 | 20 160 | ```` 161 | 162 | Questions/Comments 163 | ================== 164 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 165 | * Please send me an email: mahmoud.parsian@yahoo.com 166 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 167 | 168 | Thank you! 
169 | 170 | ```` 171 | best regards, 172 | Mahmoud Parsian 173 | ```` 174 | 175 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 176 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/VIDEO-DataFrames.txt: -------------------------------------------------------------------------------- 1 | Structuring Apache Spark 2.0: SQL, DataFrames, Datasets And Streaming - by Michael Armbrust 2 | https://www.youtube.com/watch?v=1a4pgYzeFwE 3 | 28 mins 4 | 5 | AWS Tutorial - AWS Athena + S3 6 | 20 mins 7 | https://www.youtube.com/watch?v=SiUDN95sJIo 8 | 9 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-examples.md: -------------------------------------------------------------------------------- 1 | ## Spark DataFrame Examples (using PySpark): 2 | 3 | 1. [Introduction to PySpark DataFrames (slides)](https://projector-video-pdf-converter.datacamp.com/13023/chapter3.pdf) 4 | 5 | 2. [Apache Spark's DataFrame Examples](http://spark.apache.org/examples.html) 6 | 7 | 3. [PySpark Dataframe Basics](https://changhsinlee.com/pyspark-dataframe-basics/) 8 | 9 | 4. [PySpark Dataframe Basics -- notebook](https://github.com/changhsinlee/changhsinlee.github.io/blob/master/notebook/2018-03-04-pyspark-dataframe-basics/dataframe-basics.ipynb) 10 | 11 | 5. [My Tutorial/Spark SQL Tutorial (PySpark)](https://www.zepl.com/viewer/notebooks/bm90ZTovL3pqZmZkdS8wN2M3YmI0MmJjMWI0YmE0OTc1M2IzMzZkMjA2MTk4Ny9ub3RlLmpzb24) 12 | 13 | 6. [Complete Guide on DataFrame Operations in PySpark](https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/) 14 | 15 | 7. [Introduction to DataFrame Operations in PySpark](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) 16 | 17 | 8. [PySpark DataFrame Tutorial: Introduction to DataFrames](https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra) 18 | 19 | 9. [Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) 20 | 21 | 10. [How to use Spark SQL: A hands-on tutorial](https://opensource.com/article/19/3/apache-spark-and-dataframes-tutorial) 22 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2019-02-14.txt: -------------------------------------------------------------------------------- 1 | $ cat /Users/mparsian/tmp/emps_no_header.txt 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | 10 | 11 | $ ./bin/pyspark 12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 13 | [Clang 6.0 (clang-600.0.57)] on darwin 14 | Type "help", "copyright", "credits" or "license" for more information. 15 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 16 | Welcome to 17 | ____ __ 18 | / __/__ ___ _____/ /__ 19 | _\ \/ _ \/ _ `/ __/ '_/ 20 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 21 | /_/ 22 | 23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 24 | SparkSession available as 'spark'. 
25 | >>> 26 | >>> 27 | >>> 28 | >>> 29 | >>> 30 | >>> 31 | >>> 32 | >>> input_path = "/Users/mparsian/tmp/emps_no_header.txt" 33 | >>> df = spark.read.csv(input_path) 34 | >>> df.show() 35 | +----+-----+-----+--------+ 36 | | _c0| _c1| _c2| _c3| 37 | +----+-----+-----+--------+ 38 | |1001| alex|67000| SALES| 39 | |1002| bob|24000| SALES| 40 | |1003| boby|24000| SALES| 41 | |1004| jane|69000|SOFTWARE| 42 | |1005|betty|55000|SOFTWARE| 43 | |1006| jeff|59000|SOFTWARE| 44 | |1007| dara|72000|SOFTWARE| 45 | +----+-----+-----+--------+ 46 | 47 | >>> df.collect() 48 | [ 49 | Row(_c0='1001', _c1='alex', _c2='67000', _c3='SALES'), 50 | Row(_c0='1002', _c1='bob', _c2='24000', _c3='SALES'), 51 | Row(_c0='1003', _c1='boby', _c2='24000', _c3='SALES'), 52 | Row(_c0='1004', _c1='jane', _c2='69000', _c3='SOFTWARE'), 53 | Row(_c0='1005', _c1='betty', _c2='55000', _c3='SOFTWARE'), 54 | Row(_c0='1006', _c1='jeff', _c2='59000', _c3='SOFTWARE'), 55 | Row(_c0='1007', _c1='dara', _c2='72000', _c3='SOFTWARE') 56 | ] 57 | >>> 58 | >>> 59 | 60 | >>> 61 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept") 62 | >>> df2.show() 63 | +----+-----+------+--------+ 64 | | id| name|salary| dept| 65 | +----+-----+------+--------+ 66 | |1001| alex| 67000| SALES| 67 | |1002| bob| 24000| SALES| 68 | |1003| boby| 24000| SALES| 69 | |1004| jane| 69000|SOFTWARE| 70 | |1005|betty| 55000|SOFTWARE| 71 | |1006| jeff| 59000|SOFTWARE| 72 | |1007| dara| 72000|SOFTWARE| 73 | +----+-----+------+--------+ 74 | 75 | >>> df2.printSchema() 76 | root 77 | |-- id: string (nullable = true) 78 | |-- name: string (nullable = true) 79 | |-- salary: string (nullable = true) 80 | |-- dept: string (nullable = true) 81 | 82 | >>> df2.createOrReplaceTempView("emp_table") 83 | >>> 84 | >>> 85 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002") 86 | >>> df3.show() 87 | +----+-----+------+--------+ 88 | | id| name|salary| dept| 89 | +----+-----+------+--------+ 90 | |1003| boby| 24000| SALES| 91 | |1004| jane| 69000|SOFTWARE| 92 | |1005|betty| 55000|SOFTWARE| 93 | |1006| jeff| 59000|SOFTWARE| 94 | |1007| dara| 72000|SOFTWARE| 95 | +----+-----+------+--------+ 96 | 97 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2020-11-04.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/emps_no_header.txt 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | 1001,al,69000,SALES 10 | 1002,bobby,24900,BUSINESS 11 | 12 | $ ./bin/pyspark 13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 14 | [Clang 6.0 (clang-600.0.57)] on darwin 15 | Type "help", "copyright", "credits" or "license" for more information. 16 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 17 | Setting default log level to "WARN". 18 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 19 | Welcome to 20 | ____ __ 21 | / __/__ ___ _____/ /__ 22 | _\ \/ _ \/ _ `/ __/ '_/ 23 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 24 | /_/ 25 | 26 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 27 | SparkSession available as 'spark'. 
28 | >>> input_path = '/tmp/emps_no_header.txt' 29 | >>> df = spark.read.csv(input_path) 30 | >>> df.show() 31 | +----+-----+-----+--------+ 32 | | _c0| _c1| _c2| _c3| 33 | +----+-----+-----+--------+ 34 | |1001| alex|67000| SALES| 35 | |1002| bob|24000| SALES| 36 | |1003| boby|24000| SALES| 37 | |1004| jane|69000|SOFTWARE| 38 | |1005|betty|55000|SOFTWARE| 39 | |1006| jeff|59000|SOFTWARE| 40 | |1007| dara|72000|SOFTWARE| 41 | |1001| al|69000| SALES| 42 | |1002|bobby|24900|BUSINESS| 43 | +----+-----+-----+--------+ 44 | 45 | >>> df.count() 46 | 9 47 | >>> df.printSchema() 48 | root 49 | |-- _c0: string (nullable = true) 50 | |-- _c1: string (nullable = true) 51 | |-- _c2: string (nullable = true) 52 | |-- _c3: string (nullable = true) 53 | 54 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept") 55 | >>> df2.show() 56 | +----+-----+------+--------+ 57 | | id| name|salary| dept| 58 | +----+-----+------+--------+ 59 | |1001| alex| 67000| SALES| 60 | |1002| bob| 24000| SALES| 61 | |1003| boby| 24000| SALES| 62 | |1004| jane| 69000|SOFTWARE| 63 | |1005|betty| 55000|SOFTWARE| 64 | |1006| jeff| 59000|SOFTWARE| 65 | |1007| dara| 72000|SOFTWARE| 66 | |1001| al| 69000| SALES| 67 | |1002|bobby| 24900|BUSINESS| 68 | +----+-----+------+--------+ 69 | 70 | >>> df2.createOrReplaceTempView("emp_table") 71 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002") 72 | >>> df3.show() 73 | +----+-----+------+--------+ 74 | | id| name|salary| dept| 75 | +----+-----+------+--------+ 76 | |1003| boby| 24000| SALES| 77 | |1004| jane| 69000|SOFTWARE| 78 | |1005|betty| 55000|SOFTWARE| 79 | |1006| jeff| 59000|SOFTWARE| 80 | |1007| dara| 72000|SOFTWARE| 81 | +----+-----+------+--------+ 82 | 83 | >>> df3.printSchema() 84 | root 85 | |-- id: string (nullable = true) 86 | |-- name: string (nullable = true) 87 | |-- salary: string (nullable = true) 88 | |-- dept: string (nullable = true) 89 | 90 | >>> df4 = df2.filter(df2.id > 1002) 91 | >>> df4.show() 92 | +----+-----+------+--------+ 93 | | id| name|salary| dept| 94 | +----+-----+------+--------+ 95 | |1003| boby| 24000| SALES| 96 | |1004| jane| 69000|SOFTWARE| 97 | |1005|betty| 55000|SOFTWARE| 98 | |1006| jeff| 59000|SOFTWARE| 99 | |1007| dara| 72000|SOFTWARE| 100 | +----+-----+------+--------+ 101 | 102 | >>> df5 = spark.sql("SELECT id, salary FROM emp_table WHERE id > 1002") 103 | >>> df5.show() 104 | +----+------+ 105 | | id|salary| 106 | +----+------+ 107 | |1003| 24000| 108 | |1004| 69000| 109 | |1005| 55000| 110 | |1006| 59000| 111 | |1007| 72000| 112 | +----+------+ 113 | 114 | >>> 115 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary") 116 | >>> df6.show() 117 | +----+------+ 118 | |name|salary| 119 | +----+------+ 120 | |jeff| 59000| 121 | |alex| 67000| 122 | |jane| 69000| 123 | | al| 69000| 124 | |dara| 72000| 125 | +----+------+ 126 | 127 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary DESC") 128 | >>> df6.show() 129 | +----+------+ 130 | |name|salary| 131 | +----+------+ 132 | |dara| 72000| 133 | | al| 69000| 134 | |jane| 69000| 135 | |alex| 67000| 136 | |jeff| 59000| 137 | +----+------+ 138 | 139 | >>> df7 = spark.sql("SELECT dept, COUNT(*) as count FROM emp_table GROUP BY dept") 140 | >>> df7.show() 141 | +--------+-----+ 142 | | dept|count| 143 | +--------+-----+ 144 | | SALES| 4| 145 | |BUSINESS| 1| 146 | |SOFTWARE| 4| 147 | +--------+-----+ 148 | 149 | >>> 
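NOTE: instead of renaming _c0.._c3 with selectExpr(), the same file can be read with an explicit schema.
A minimal sketch, assuming the same /tmp/emps_no_header.txt layout; the column names and types below are illustrative choices:

    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    # id, name, salary, dept -- matching the four CSV columns above
    emp_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("salary", IntegerType(), True),
        StructField("dept", StringType(), True)
    ])

    emps = spark.read.csv(input_path, schema=emp_schema)
    emps.printSchema()                         # id and salary come back as integers, not strings
    emps.createOrReplaceTempView("emp_table")  # SQL queries like the ones above work unchanged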
-------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2021-05-12-intro.txt: -------------------------------------------------------------------------------- 1 | #-------------------- 2 | # DataFrame Tutorial: 3 | #-------------------- 4 | https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra 5 | 6 | 7 | #--------------------- 8 | # Demo of DataFrames 9 | #--------------------- 10 | 11 | $ cat /tmp/cats.csv 12 | name,age,gender,weight 13 | cuttie,2,female,6 14 | mono,3,male,9 15 | pishi,2,female,4 16 | zazo,1,male,4 17 | fuzzy,1,female,4 18 | 19 | $ ./bin/pyspark 20 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | Spark context Web UI available at http://10.0.0.93:4040 30 | Spark context available as 'sc' (master = local[*], app id = local-1620755686906). 31 | SparkSession available as 'spark'. 32 | 33 | >>> 34 | >>> input_path = '/tmp/cats.csv' 35 | >>> input_path 36 | '/tmp/cats.csv' 37 | >>> cats = spark.read.csv(input_path, inferSchema = True, header = True) 38 | 39 | >>> 40 | >>> cats.show(truncate=False) 41 | +------+---+------+------+ 42 | |name |age|gender|weight| 43 | +------+---+------+------+ 44 | |cuttie|2 |female|6 | 45 | |mono |3 |male |9 | 46 | |pishi |2 |female|4 | 47 | |zazo |1 |male |4 | 48 | |fuzzy |1 |female|4 | 49 | +------+---+------+------+ 50 | 51 | >>> cats.printSchema() 52 | root 53 | |-- name: string (nullable = true) 54 | |-- age: integer (nullable = true) 55 | |-- gender: string (nullable = true) 56 | |-- weight: integer (nullable = true) 57 | 58 | >>> cats.count() 59 | 5 60 | >>> cats.columns 61 | ['name', 'age', 'gender', 'weight'] 62 | >>> cats.describe('weight').show() 63 | +-------+------------------+ 64 | |summary| weight| 65 | +-------+------------------+ 66 | | count| 5| 67 | | mean| 5.4| 68 | | stddev|2.1908902300206643| 69 | | min| 4| 70 | | max| 9| 71 | +-------+------------------+ 72 | 73 | >>> name_age = cats.select("name", "age") 74 | >>> name_age.show(truncate=False) 75 | +------+---+ 76 | |name |age| 77 | +------+---+ 78 | |cuttie|2 | 79 | |mono |3 | 80 | |pishi |2 | 81 | |zazo |1 | 82 | |fuzzy |1 | 83 | +------+---+ 84 | 85 | >>> name_age.printSchema() 86 | root 87 | |-- name: string (nullable = true) 88 | |-- age: integer (nullable = true) 89 | 90 | >>> cats.select('age').distinct().show() 91 | +---+ 92 | |age| 93 | +---+ 94 | | 1| 95 | | 3| 96 | | 2| 97 | +---+ 98 | 99 | >>> cats.select('name', 'age').distinct().show() 100 | +------+---+ 101 | | name|age| 102 | +------+---+ 103 | | zazo| 1| 104 | |cuttie| 2| 105 | | fuzzy| 1| 106 | | mono| 3| 107 | | pishi| 2| 108 | +------+---+ 109 | 110 | >>> cats.filter(cats.age > 1).show() 111 | +------+---+------+------+ 112 | | name|age|gender|weight| 113 | +------+---+------+------+ 114 | |cuttie| 2|female| 6| 115 | | mono| 3| male| 9| 116 | | pishi| 2|female| 4| 117 | +------+---+------+------+ 118 | 119 | 120 | >>> cats.orderBy(cats.age).show() 121 | +------+---+------+------+ 122 | | name|age|gender|weight| 123 | +------+---+------+------+ 124 | | zazo| 1| male| 4| 125 | | fuzzy| 1|female| 4| 126 | |cuttie| 2|female| 6| 127 | | pishi| 2|female| 4| 128 | | mono| 3| male| 9| 129 | +------+---+------+------+ 130 | 131 | >>> age_df = cats.groupby("age").count() 132 | >>> 
age_df.show() 133 | +---+-----+ 134 | |age|count| 135 | +---+-----+ 136 | | 1| 2| 137 | | 3| 1| 138 | | 2| 2| 139 | +---+-----+ 140 | 141 | >>> cats.show() 142 | +------+---+------+------+ 143 | | name|age|gender|weight| 144 | +------+---+------+------+ 145 | |cuttie| 2|female| 6| 146 | | mono| 3| male| 9| 147 | | pishi| 2|female| 4| 148 | | zazo| 1| male| 4| 149 | | fuzzy| 1|female| 4| 150 | +------+---+------+------+ 151 | 152 | >>> cats.registerTempTable('cats_table') 153 | >>> spark.sql("select * from cats_table").show() 154 | +------+---+------+------+ 155 | | name|age|gender|weight| 156 | +------+---+------+------+ 157 | |cuttie| 2|female| 6| 158 | | mono| 3| male| 9| 159 | | pishi| 2|female| 4| 160 | | zazo| 1| male| 4| 161 | | fuzzy| 1|female| 4| 162 | +------+---+------+------+ 163 | 164 | >>> spark.sql("select * from cats_table where age > 1").show() 165 | +------+---+------+------+ 166 | | name|age|gender|weight| 167 | +------+---+------+------+ 168 | |cuttie| 2|female| 6| 169 | | mono| 3| male| 9| 170 | | pishi| 2|female| 4| 171 | +------+---+------+------+ 172 | 173 | >>> spark.sql("select age, count(*) from cats_table group by age").show() 174 | +---+--------+ 175 | |age|count(1)| 176 | +---+--------+ 177 | | 1| 2| 178 | | 3| 1| 179 | | 2| 2| 180 | +---+--------+ 181 | 182 | >>> def exec_sql(query): 183 | ... spark.sql(query).show() 184 | ... 185 | >>> 186 | >>> exec_sql("select age, count(*) from cats_table group by age") 187 | +---+--------+ 188 | |age|count(1)| 189 | +---+--------+ 190 | | 1| 2| 191 | | 3| 1| 192 | | 2| 2| 193 | +---+--------+ 194 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-12.txt: -------------------------------------------------------------------------------- 1 | >>> spark 2 | 3 | 4 | >>> spark.version 5 | '3.2.0' 6 | 7 | >>> # create a Python collection as data 8 | >>> data = 9 | [ 10 | ('alex', 20, 12000), 11 | ('jane', 30, 45000), 12 | ('rafa', 40, 56000), 13 | ('ted', 30, 145000), 14 | ('xo2', 10, 1332000), 15 | ('mary', 44, 555000) 16 | ] 17 | 18 | >>> data 19 | [ 20 | ('alex', 20, 12000), 21 | ('jane', 30, 45000), 22 | ('rafa', 40, 56000), 23 | ('ted', 30, 145000), 24 | ('xo2', 10, 1332000), 25 | ('mary', 44, 555000) 26 | ] 27 | 28 | >>> #define column names 29 | >>> column_names = ['name', 'age', 'salary'] 30 | >>> column_names 31 | ['name', 'age', 'salary'] 32 | 33 | >>> # create a DataFrame as df 34 | >>> df = spark.createDataFrame(data, column_names) 35 | >>> 36 | >>> # inspect created DataFrame 37 | >>> df 38 | DataFrame[name: string, age: bigint, salary: bigint] 39 | 40 | >>> # inspect created DataFrame's Schema 41 | >>> df.printSchema() 42 | root 43 | |-- name: string (nullable = true) 44 | |-- age: long (nullable = true) 45 | |-- salary: long (nullable = true) 46 | 47 | >>> # display the first 20 rows of a DataFrame 48 | >>> df.show() 49 | +----+---+-------+ 50 | |name|age| salary| 51 | +----+---+-------+ 52 | |alex| 20| 12000| 53 | |jane| 30| 45000| 54 | |rafa| 40| 56000| 55 | | ted| 30| 145000| 56 | | xo2| 10|1332000| 57 | |mary| 44| 555000| 58 | +----+---+-------+ 59 | 60 | >>> # count the number of rows 61 | >>> df.count() 62 | 6 63 | 64 | 65 | >>> # Creates or replaces a local temporary view with this DataFrame 66 | >>> df.createOrReplaceTempView("people") 67 | 68 | >>> df2 = spark.sql("select * from people where salary > 67000") 69 | >>> df2.show() 70 | +----+---+-------+ 71 | |name|age| salary| 72 | +----+---+-------+ 73 | | ted| 
30| 145000| 74 | | xo2| 10|1332000| 75 | |mary| 44| 555000| 76 | +----+---+-------+ 77 | 78 | >>> df3 = spark.sql("select * from people where salary > 67000 and age > 11") 79 | >>> df3.show() 80 | +----+---+------+ 81 | |name|age|salary| 82 | +----+---+------+ 83 | | ted| 30|145000| 84 | |mary| 44|555000| 85 | +----+---+------+ 86 | 87 | >>> df.show() 88 | +----+---+-------+ 89 | |name|age| salary| 90 | +----+---+-------+ 91 | |alex| 20| 12000| 92 | |jane| 30| 45000| 93 | |rafa| 40| 56000| 94 | | ted| 30| 145000| 95 | | xo2| 10|1332000| 96 | |mary| 44| 555000| 97 | +----+---+-------+ 98 | 99 | >>> df4 = spark.sql("select * from people") 100 | >>> df4.show() 101 | +----+---+-------+ 102 | |name|age| salary| 103 | +----+---+-------+ 104 | |alex| 20| 12000| 105 | |jane| 30| 45000| 106 | |rafa| 40| 56000| 107 | | ted| 30| 145000| 108 | | xo2| 10|1332000| 109 | |mary| 44| 555000| 110 | +----+---+-------+ 111 | 112 | >>> cart = spark.sql("select * from people p1, people p2") 113 | >>> cart.show() 114 | +----+---+------+----+---+-------+ 115 | |name|age|salary|name|age| salary| 116 | +----+---+------+----+---+-------+ 117 | |alex| 20| 12000|alex| 20| 12000| 118 | |alex| 20| 12000|jane| 30| 45000| 119 | |alex| 20| 12000|rafa| 40| 56000| 120 | |alex| 20| 12000| ted| 30| 145000| 121 | |alex| 20| 12000| xo2| 10|1332000| 122 | |alex| 20| 12000|mary| 44| 555000| 123 | |jane| 30| 45000|alex| 20| 12000| 124 | |jane| 30| 45000|jane| 30| 45000| 125 | |jane| 30| 45000|rafa| 40| 56000| 126 | |jane| 30| 45000| ted| 30| 145000| 127 | |jane| 30| 45000| xo2| 10|1332000| 128 | |jane| 30| 45000|mary| 44| 555000| 129 | |rafa| 40| 56000|alex| 20| 12000| 130 | |rafa| 40| 56000|jane| 30| 45000| 131 | |rafa| 40| 56000|rafa| 40| 56000| 132 | |rafa| 40| 56000| ted| 30| 145000| 133 | |rafa| 40| 56000| xo2| 10|1332000| 134 | |rafa| 40| 56000|mary| 44| 555000| 135 | | ted| 30|145000|alex| 20| 12000| 136 | | ted| 30|145000|jane| 30| 45000| 137 | +----+---+------+----+---+-------+ 138 | only showing top 20 rows 139 | 140 | >>> cart 141 | >>> Frame[name: string, age: bigint, salary: bigint, name: string, age: bigint, salary: bigint] 142 | >>> 143 | 144 | >>> cart2 = spark.sql("select p1.name as name, p2.age as age, p1.salary as salary, p2.name as name2, p2.age as age2, p2.salary as salary2 from people p1, people p2") 145 | >>> cart2.show() 146 | +----+---+------+-----+----+-------+ 147 | |name|age|salary|name2|age2|salary2| 148 | +----+---+------+-----+----+-------+ 149 | |alex| 20| 12000| alex| 20| 12000| 150 | |alex| 30| 12000| jane| 30| 45000| 151 | |alex| 40| 12000| rafa| 40| 56000| 152 | |alex| 30| 12000| ted| 30| 145000| 153 | |alex| 10| 12000| xo2| 10|1332000| 154 | |alex| 44| 12000| mary| 44| 555000| 155 | |jane| 20| 45000| alex| 20| 12000| 156 | |jane| 30| 45000| jane| 30| 45000| 157 | |jane| 40| 45000| rafa| 40| 56000| 158 | |jane| 30| 45000| ted| 30| 145000| 159 | |jane| 10| 45000| xo2| 10|1332000| 160 | |jane| 44| 45000| mary| 44| 555000| 161 | |rafa| 20| 56000| alex| 20| 12000| 162 | |rafa| 30| 56000| jane| 30| 45000| 163 | |rafa| 40| 56000| rafa| 40| 56000| 164 | |rafa| 30| 56000| ted| 30| 145000| 165 | |rafa| 10| 56000| xo2| 10|1332000| 166 | |rafa| 44| 56000| mary| 44| 555000| 167 | | ted| 20|145000| alex| 20| 12000| 168 | | ted| 30|145000| jane| 30| 45000| 169 | +----+---+------+-----+----+-------+ 170 | only showing top 20 rows 171 | 172 | >>> 173 | >>> cart2 174 | DataFrame[name: string, age: bigint, salary: bigint, name2: string, age2: bigint, salary2: bigint] 
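NOTE: the same cartesian product can be built with the DataFrame API instead of SQL.
A minimal sketch using the df defined above; crossJoin() plus the "p1"/"p2" aliases are the only new names introduced here:

    from pyspark.sql.functions import col

    # cross join df with itself; alias the two sides so the columns can be told apart
    cart3 = df.alias("p1").crossJoin(df.alias("p2")).select(
        col("p1.name").alias("name"),
        col("p1.age").alias("age"),
        col("p1.salary").alias("salary"),
        col("p2.name").alias("name2"),
        col("p2.age").alias("age2"),
        col("p2.salary").alias("salary2"))

    # cart3.count() == df.count() * df.count()  (6 * 6 = 36 rows)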
-------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt: -------------------------------------------------------------------------------- 1 | This demo shows how to convert 2 | 1. a DataFrame to an RDD 3 | 2. an RDD to a DataFrame 4 | 5 | 6 | ~ % /Users/mparsian/spark-3.2.1/bin/pyspark 7 | Python 3.8.9 (default, Jul 19 2021, 09:37:32) 8 | Welcome to Spark version 3.2.1 9 | 10 | Spark context Web UI available at http://10.0.0.234:4041 11 | Spark context available as 'sc' (master = local[*], app id = local-1653016254174). 12 | SparkSession available as 'spark'. 13 | >>> data = [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000),('mary', 'HR', 93000)] 14 | >>> data 15 | [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000), ('mary', 'HR', 93000)] 16 | >>> df = spark.createDataFrame(data, ['name', 'dept', 'salary']) 17 | >>> df.show() 18 | +----+-----+------+ 19 | |name| dept|salary| 20 | +----+-----+------+ 21 | |alex|sales| 23000| 22 | |jane| HR| 29000| 23 | | bob|sales| 43000| 24 | |mary| HR| 93000| 25 | +----+-----+------+ 26 | 27 | >>> df.printSchema() 28 | root 29 | |-- name: string (nullable = true) 30 | |-- dept: string (nullable = true) 31 | |-- salary: long (nullable = true) 32 | 33 | >>> rdd5 = df.rdd 34 | >>> rdd5.collect() 35 | [ 36 | Row(name='alex', dept='sales', salary=23000), 37 | Row(name='jane', dept='HR', salary=29000), 38 | Row(name='bob', dept='sales', salary=43000), 39 | Row(name='mary', dept='HR', salary=93000) 40 | ] 41 | >>> 42 | >>> df2 = rdd5.toDF() 43 | >>> df2.show() 44 | +----+-----+------+ 45 | |name| dept|salary| 46 | +----+-----+------+ 47 | |alex|sales| 23000| 48 | |jane| HR| 29000| 49 | | bob|sales| 43000| 50 | |mary| HR| 93000| 51 | +----+-----+------+ 52 | 53 | >>> from pyspark.sql import Row 54 | >>> # NOTE: to convert an RDD into a DataFrame, 55 | >>> # each Row() must have the same column names: 56 | >>> rows = 57 | [ 58 | Row(name='alex', dept='sales', salary=23000), 59 | Row(name='jane', dept='HR', salary=29000, address='123 main street') 60 | ] 61 | >>> rdd = sc.parallelize(rows) 62 | >>> rdd.collect() 63 | [Row(name='alex', dept='sales', salary=23000), Row(name='jane', dept='HR', salary=29000, address='123 main street')] 64 | >>> df44 = rdd.toDF() 65 | >>> df44.show() 66 | 22/05/19 20:21:51 ERROR Executor: Exception in task 10.0 in stage 15.0 (TID 100) 67 | java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 3 fields are required while 4 values are provided. 68 | ... 
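>>> # the IllegalStateException above is expected: toDF() infers the schema from the
>>> # first Row(s), so a Row with a different set of fields cannot fit that schema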
69 | >>> # create Row()'s which have the same columns 70 | >>> rows = 71 | [ 72 | Row(name='alex', dept='sales', salary=23000, address=None), 73 | Row(name='jane', dept='HR', salary=29000, address='123 main street') 74 | ] 75 | >>> rdd = sc.parallelize(rows) 76 | >>> df44 = rdd.toDF() 77 | >>> df44.show() 78 | +----+-----+------+---------------+ 79 | |name| dept|salary| address| 80 | +----+-----+------+---------------+ 81 | |alex|sales| 23000| null| 82 | |jane| HR| 29000|123 main street| 83 | +----+-----+------+---------------+ 84 | 85 | >>> 86 | >>> some_data = [('alex', 10), ('jane', 20)] 87 | >>> rdd3 = sc.parallelize(some_data) 88 | >>> rdd3.collect() 89 | [('alex', 10), ('jane', 20)] 90 | >>> rdd3_with_rows = rdd3.map(lambda x: Row(name=x[0], age=x[1])) 91 | >>> rdd3_with_rows.collect() 92 | [Row(name='alex', age=10), Row(name='jane', age=20)] 93 | >>> df3 = rdd3_with_rows.toDF() 94 | >>> df3.show() 95 | +----+---+ 96 | |name|age| 97 | +----+---+ 98 | |alex| 10| 99 | |jane| 20| 100 | +----+---+ 101 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/combineByKey_example.py: -------------------------------------------------------------------------------- 1 | Problem: Given a set of (K, V) pairs, 2 | find (sum, count, min, max) per key using 3 | the combineByKey() transformation. 4 | 5 | ~/spark-2.4.4 $ ./bin/pyspark 6 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 7 | [Clang 6.0 (clang-600.0.57)] on darwin 8 | Type "help", "copyright", "credits" or "license" for more information. 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | 20 | >>> 21 | >>> spark 22 | 23 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8) ] 24 | >>> data 25 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)] 26 | >>> rdd = spark.sparkContext.parallelize(data) 27 | >>> 28 | >>> 29 | >>> rdd.count() 30 | 7 31 | >>> rdd.collect() 32 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)] 33 | >>> # (K, (sum, count, min, max)) 34 | ... 35 | >>> def single(v): 36 | ... return (v, 1, v, v) 37 | ... 38 | >>> def merge(C, v): 39 | ... return (C[0]+v, C[1]+1, min(C[2],v), max(C[3],v)) 40 | ... 41 | >>> def combine(C1, C2): 42 | ... return (C1[0]+C2[0], C1[1]+C2[1], min(C1[2], C2[2]), max(C1[3], C2[3]) ) 43 | ... 
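>>> # single(v)      is the createCombiner: turns the first value v seen for a key into (sum, count, min, max)
>>> # merge(C, v)    is the mergeValue: folds one more value v for the same key into an existing combiner C
>>> # combine(C1, C2) is the mergeCombiners: merges two combiners built on different partitions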
44 | >>> rdd2 = rdd.combineByKey(single, merge, combine) 45 | >>> rdd2.collect() 46 | [ 47 | ('B', (21, 3, 6, 8)), 48 | ('A', (14, 4, 2, 5)) 49 | ] 50 | 51 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/count_min_max.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | # 8 | print ("This is the name of the script: ", sys.argv[0]) 9 | print ("Number of arguments: ", len(sys.argv)) 10 | print ("The arguments are: " , str(sys.argv)) 11 | # 12 | 13 | # DEFINE your input path 14 | input_path = sys.argv[1] 15 | print("input_path: ", input_path) 16 | 17 | 18 | # CREATE an instance of a SparkSession object 19 | spark = SparkSession\ 20 | .builder\ 21 | .appName("PythonWordCount")\ 22 | .getOrCreate() 23 | 24 | # CREATE a new RDD[String] 25 | #lines = spark.sparkContext.textFile(input_path) 26 | # APPLY a SET of TRANSFORMATIONS... 27 | 28 | #------------------------------------------- 29 | def minmax(partition): 30 | first_time = False 31 | #count 32 | #min2 33 | #max2 34 | for x in partition: 35 | if (first_time == False): 36 | count = 1 37 | min2 = x 38 | max2 = x 39 | first_time = True 40 | else: 41 | count = count + 1 42 | max2 = max(x, max2) 43 | min2 = min(x, min2) 44 | #end-for 45 | # 46 | return [(count, min2, max2)] 47 | #end-def 48 | #--------------------- 49 | def iterate_partition(partition): 50 | elements = [] 51 | for x in partition: 52 | elements.append(x) 53 | print("elements=", elements) 54 | #print ("==================") 55 | #end-def 56 | #------------------------- 57 | def add3(t1, t2): 58 | count = t1[0] + t2[0] 59 | min2 = min(t1[1], t2[1]) 60 | max2 = max(t1[2], t2[2]) 61 | return (count, min2, max2) 62 | #end-def 63 | 64 | data = [10, 20, 30, 44, 55, 3, 4, 60, 50, 5, 2, 2, 20, 20, 10, 30, 70] 65 | print("data=", data) 66 | print("==============") 67 | 68 | # 69 | rdd = spark.sparkContext.parallelize(data, 4) 70 | print("rdd.collect()=", rdd.collect()) 71 | print("==============") 72 | # 73 | rdd.foreachPartition(iterate_partition) 74 | print("==============") 75 | # 76 | 77 | count_min_max_rdd = rdd.mapPartitions(minmax) 78 | print("minmax_rdd.collect()=", count_min_max_rdd.collect()) 79 | 80 | final_triplet = count_min_max_rdd.reduce(add3) 81 | print("final_triplet=", final_triplet) 82 | 83 | spark.stop() 84 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2015-03-13.txt: -------------------------------------------------------------------------------- 1 | pyspark-tutorial- 2 | pyspark-tutorial provides basic algorithms using pyspark 3 | 4 | interactive session: valid and tested: Feb. 23, 2015 5 | 6 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat data.txt 7 | crazy crazy fox jumped 8 | crazy fox jumped 9 | fox is fast 10 | fox is smart 11 | dog is smart 12 | 13 | SPARK_HOME=~/zmp/zs/spark-1.2.0 14 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# ~/zmp/zs/spark-1.2.0/bin/pyspark 15 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 16 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 17 | Type "help", "copyright", "credits" or "license" for more information. 
18 | 19 | Welcome to 20 | ____ __ 21 | / __/__ ___ _____/ /__ 22 | _\ \/ _ \/ _ `/ __/ '_/ 23 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 24 | /_/ 25 | 26 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 27 | SparkContext available as sc. 28 | >>> sc 29 | 30 | >>> lines = sc.textFile("data.txt", 1) 31 | >>> debuglines = lines.collect(); 32 | >>> debuglines 33 | [u'crazy crazy fox jumped', u'crazy fox jumped', u'fox is fast', u'fox is smart', u'dog is smart'] 34 | >>> words = lines.flatMap(lambda x: x.split(' ')) 35 | >>> debugwords = words.collect(); 36 | >>> debugwords 37 | [u'crazy', u'crazy', u'fox', u'jumped', u'crazy', u'fox', u'jumped', u'fox', u'is', u'fast', u'fox', u'is', u'smart', u'dog', u'is', u'smart'] 38 | >>> ones = words.map(lambda x: (x, 1)) 39 | >>> debugones = ones.collect() 40 | >>> debugones 41 | [(u'crazy', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'fox', 1), (u'is', 1), (u'fast', 1), (u'fox', 1), (u'is', 1), (u'smart', 1), (u'dog', 1), (u'is', 1), (u'smart', 1)] 42 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 43 | >>> debugcounts = counts.collect() 44 | >>> debugcounts 45 | [(u'crazy', 3), (u'jumped', 2), (u'is', 3), (u'fox', 4), (u'dog', 1), (u'fast', 1), (u'smart', 2)] 46 | >>> 47 | 48 | >>> grouped = ones.groupByKey(); 49 | >>> debuggrouped = grouped.collect(); 50 | 51 | >>> counts.saveAsTextFile("output.txt") 52 | 53 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat output.txt/part* 54 | (u'crazy', 3) 55 | (u'jumped', 2) 56 | (u'is', 3) 57 | (u'fox', 4) 58 | (u'dog', 1) 59 | (u'fast', 1) 60 | (u'smart', 2) -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2015-04-10.txt: -------------------------------------------------------------------------------- 1 | First session on PySpark 2 | 3 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# cat zfox_data.txt 4 | crazy red fox ran fast 5 | red fox jumped very very high 6 | red fox is very crazy 7 | red fox ran very fast 8 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# 9 | 10 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# ./pyspark 11 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 12 | Welcome to 13 | ____ __ 14 | / __/__ ___ _____/ /__ 15 | _\ \/ _ \/ _ `/ __/ '_/ 16 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0 17 | /_/ 18 | 19 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 20 | SparkContext available as sc, SQLContext available as sqlCtx. 
21 | >>> 22 | >>> sc 23 | 24 | >>> 25 | >>> lines = sc.textFile("zfox_data.txt") 26 | >>> 27 | >>> lines.collect() 28 | [u'crazy red fox ran fast', u'red fox jumped very very high', u'red fox is very crazy', u'red fox ran very fast'] 29 | >>> 30 | >>> lines.count() 31 | 4 32 | >>> 33 | >>> words = lines.flatMap(lambda x: x.split(' ')) 34 | >>> 35 | >>> words.collect() 36 | [u'crazy', u'red', u'fox', u'ran', u'fast', u'red', u'fox', u'jumped', u'very', u'very', u'high', u'red', u'fox', u'is', u'very', u'crazy', u'red', u'fox', u'ran', u'very', u'fast'] 37 | >>> 38 | >>> words.count() 39 | 21 40 | >>> 41 | >>> ones = words.map(lambda x: (x, 1)) 42 | >>> 43 | >>> ones.collect() 44 | [(u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'fast', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1), (u'very', 1), (u'very', 1), (u'high', 1), (u'red', 1), (u'fox', 1), (u'is', 1), (u'very', 1), (u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'very', 1), (u'fast', 1)] 45 | >>> 46 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 47 | >>> 48 | >>> 49 | >>> counts.collect() 50 | [(u'crazy', 2), (u'ran', 2), (u'is', 1), (u'fox', 4), (u'fast', 2), (u'high', 1), (u'very', 4), (u'red', 4), (u'jumped', 1)] 51 | >>> 52 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-01-18.txt: -------------------------------------------------------------------------------- 1 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ source zbin/zenv_setup.sh 2 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ ./bin/pyspark 3 | Python 2.7.10 (default, Feb 7 2017, 00:08:15) 4 | [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.2.1 13 | /_/ 14 | 15 | Using Python version 2.7.10 (default, Feb 7 2017 00:08:15) 16 | SparkSession available as 'spark'. 
17 | >>> spark 18 | 19 | >>> 20 | >>> 21 | >>> 22 | >>> 23 | >>> spark 24 | 25 | >>> 26 | >>> 27 | >>> sc = spark.sparkContext 28 | >>> 29 | >>> sc 30 | 31 | >>> 32 | >>> 33 | >>> rdd = sc.textFile("file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt") 34 | >>> 35 | >>> 36 | >>> rdd 37 | file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 38 | >>> 39 | >>> 40 | >>> rdd.count() 41 | 3 42 | >>> rdd.collect() 43 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 44 | >>> rdd.take(2) 45 | [u'red fox jumped high', u'fox jumped over high fence'] 46 | >>> rdd.take(1) 47 | [u'red fox jumped high'] 48 | >>> rdd.collect() 49 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 50 | >>> 51 | >>> 52 | 53 | >>> rdd2 = rdd.map(lambda x : (x, len(x))) 54 | >>> rdd2.collect() 55 | [(u'red fox jumped high', 19), (u'fox jumped over high fence', 26), (u'red fox jumped', 14)] 56 | >>> rdd2 = rdd.map(lambda x : (x, len(x), len(x)-2)) 57 | >>> 58 | >>> rdd2.collect() 59 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)] 60 | >>> rdd3 = rdd.map(lambda x : (x, len(x), len(x)-2)) 61 | >>> 62 | >>> 63 | >>> rdd3.collect() 64 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)] 65 | >>> 66 | >>> 67 | >>> rdd4 = rdd.map(lambda x : (len(x), x, x)) 68 | >>> rdd4.collect() 69 | [(19, u'red fox jumped high', u'red fox jumped high'), (26, u'fox jumped over high fence', u'fox jumped over high fence'), (14, u'red fox jumped', u'red fox jumped')] 70 | >>> 71 | >>> 72 | >>> 73 | >>> rdd.collect() 74 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 75 | >>> rdd2 = rdd.flatMap(lambda x: x.split(" ")) 76 | >>> rdd2.collect() 77 | [u'red', u'fox', u'jumped', u'high', u'fox', u'jumped', u'over', u'high', u'fence', u'red', u'fox', u'jumped'] 78 | >>> rdd2.count() 79 | 12 80 | >>> 81 | >>> 82 | >>> pairs = rdd2.map(lambda w : (w, 1)) 83 | >>> pairs.count() 84 | 12 85 | >>> pairs.collect() 86 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 87 | >>> 88 | >>> 89 | 90 | >>> 91 | >>> pairs.collect() 92 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 93 | >>> grouped = pairs.groupByKey() 94 | >>> grouped.collect() 95 | [(u'high', ), (u'over', ), (u'fox', ), (u'red', ), (u'fence', ), (u'jumped', )] 96 | >>> grouped.mapValues(lambda iter : list(iter)).collect() 97 | [(u'high', [1, 1]), (u'over', [1]), (u'fox', [1, 1, 1]), (u'red', [1, 1]), (u'fence', [1]), (u'jumped', [1, 1, 1])] 98 | >>> 99 | >>> freq = grouped.mapValues(lambda iter: sum(iter)) 100 | >>> freq.collect() 101 | [(u'high', 2), (u'over', 1), (u'fox', 3), (u'red', 2), (u'fence', 1), (u'jumped', 3)] 102 | >>> freq.collectAsHashMap() 103 | Traceback (most recent call last): 104 | File "", line 1, in 105 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap' 106 | >>> freq.collectAsHashMap 107 | Traceback (most recent call last): 108 | File "", line 1, in 109 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap' 110 | >>> freq.collectAsMap 111 | :1> 112 | >>> freq.collectAsMap() 113 | {u'fence': 1, u'jumped': 3, 
u'over': 1, u'fox': 3, u'high': 2, u'red': 2} 114 | >>> 115 | >>> 116 | >>> 117 | >>> 118 | >>> 119 | >>> pairs.collect() 120 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 121 | >>> freq = pairs.reduceByKey(lambda x, y: x+y) 122 | >>> freq.collectAsMap() 123 | {u'fence': 1, u'jumped': 3, u'over': 1, u'fox': 3, u'high': 2, u'red': 2} 124 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-10-02.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh 2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark 3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07) 4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | 18/10/02 15:50:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0 12 | /_/ 13 | 14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07) 15 | SparkSession available as 'spark'. 16 | >>> 17 | >>> 18 | >>> 19 | >>> 20 | >>> spark 21 | 22 | >>> spark.sparkContext 23 | 24 | >>> 25 | >>> spark.version 26 | u'2.3.0' 27 | >>> 28 | >>> 29 | >>> 30 | >>> 31 | >>> 32 | >>> 33 | >>> input_path = "/Users/mparsian/spark-2.3.0/myinput.txt" 34 | >>> myrdd = spark.sparkContext.textFile(input_path) 35 | >>> rdd.count() 36 | Traceback (most recent call last): 37 | File "", line 1, in 38 | NameError: name 'rdd' is not defined 39 | >>> myrdd.count() 40 | 3 41 | >>> myrdd.collect() 42 | [u'this is record 1', u'this is record 2', u'this is record 3'] 43 | >>> 44 | >>> 45 | >>> def tokenize(rec): 46 | ... tokens = rec.split() 47 | ... return tokens 48 | ... 49 | >>> 50 | >>> rec33 = "this is it" 51 | >>> mytokens = tokenize(rec33) 52 | >>> mytokens 53 | ['this', 'is', 'it'] 54 | >>> 55 | >>> 56 | >>> words = myrdd.flatMap(lambda record: tokenize(record)) 57 | >>> words.collect() 58 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3'] 59 | >>> words.count() 60 | 12 61 | >>> 62 | >>> duplicated = myrdd.map(lambda rec: rec + ";" rec) 63 | File "", line 1 64 | duplicated = myrdd.map(lambda rec: rec + ";" rec) 65 | ^ 66 | SyntaxError: invalid syntax 67 | >>> duplicated = myrdd.map(lambda rec: rec + ";" + rec) 68 | >>> duplicated.count() 69 | 3 70 | >>> duplicated.collect() 71 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3'] 72 | >>> 73 | >>> def myconcat(rec): 74 | ... return rec + ";" + rec 75 | ... 
76 | >>> 77 | >>> z = myconcat("testing") 78 | >>> z 79 | 'testing;testing' 80 | >>> duplicated2 = myrdd.map(myconcat) 81 | >>> duplicated2.count() 82 | 3 83 | >>> duplicated2.collect() 84 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3'] 85 | >>> 86 | >>> 87 | >>> 88 | >>> words.collect() 89 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3'] 90 | >>> words.count() 91 | 12 92 | >>> pairs = words.map(lambda w: (w, 1)) 93 | >>> pairs.collect() 94 | [(u'this', 1), (u'is', 1), (u'record', 1), (u'1', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'2', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'3', 1)] 95 | >>> pairs.count() 96 | 12 97 | >>> freq = pairs.reduceByKey(lambda x, y : x+y) 98 | >>> freq.collect() 99 | [(u'this', 3), (u'1', 1), (u'is', 3), (u'3', 1), (u'record', 3), (u'2', 1)] 100 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-10-09.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh 2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark 3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07) 4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | 18/10/09 18:04:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0 12 | /_/ 13 | 14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07) 15 | SparkSession available as 'spark'. 16 | >>> 17 | >>> 18 | >>> spark 19 | 20 | >>> 21 | >>> 22 | >>> 23 | >>> 24 | >>> data = [1, -3, 4, 2, -5, 2] 25 | >>> data 26 | [1, -3, 4, 2, -5, 2] 27 | >>> rdd = spark.sparkContext.parallalize(data) 28 | Traceback (most recent call last): 29 | File "", line 1, in 30 | AttributeError: 'SparkContext' object has no attribute 'parallalize' 31 | >>> rdd = spark.sparkContext.parallelize(data) 32 | >>> rdd.count() 33 | 6 34 | >>> rdd.collect() 35 | [1, -3, 4, 2, -5, 2] 36 | >>> 37 | >>> def myfun(n): 38 | ... mylist = [] 39 | ... if n > 0: 40 | ... mylist.append(100) 41 | ... mylist.append(200) 42 | ... else: 43 | ... mylist.append(0) 44 | ... # 45 | ... return mylist 46 | ... 47 | >>> 48 | >>> x = myfun(3) 49 | >>> x 50 | [100, 200] 51 | >>> y = myfun(-55) 52 | >>> y 53 | [0] 54 | >>> 55 | >>> rdd2 = rdd.flatMap(myfun) 56 | >>> rdd.collect() 57 | [1, -3, 4, 2, -5, 2] 58 | >>> rdd2.collect() 59 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 60 | >>> rdd2.count() 61 | 10 62 | >>> 63 | >>> 64 | >>> 65 | >>> rdd3 = rdd2.filter(lambda x : x > 100) 66 | >>> rdd3.collect() 67 | [200, 200, 200, 200] 68 | >>> 69 | >>> rdd4 = rdd2.filter(lambda x : x > 10) 70 | >>> rdd4.collect() 71 | [100, 200, 100, 200, 100, 200, 100, 200] 72 | >>> 73 | >>> 74 | >>> def keep100(n): 75 | ... if n > 100: 76 | ... return True 77 | ... else: 78 | ... return False 79 | ... 
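>>> # keep100() is a named predicate: filter() keeps the elements for which it returns True,
>>> # exactly like the lambda-based filters above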
80 | >>> 81 | >>> rdd5 = rdd2.filter(keep100) 82 | >>> rdd5.collect() 83 | [200, 200, 200, 200] 84 | >>> 85 | >>> 86 | >>> rdd2.collect() 87 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 88 | >>> rdd6 = rdd.map(lambda x : x+1000) 89 | >>> rdd6.collect() 90 | [1001, 997, 1004, 1002, 995, 1002] 91 | >>> 92 | >>> def myadder(n): 93 | ... if n > 0: 94 | ... return n+1000 95 | ... else: 96 | ... return n 97 | ... 98 | >>> 99 | >>> rdd2.collect() 100 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 101 | >>> rdd7 = rdd2.map(myadder) 102 | >>> rdd7.collect() 103 | [1100, 1200, 0, 1100, 1200, 1100, 1200, 0, 1100, 1200] 104 | >>> 105 | >>> 106 | >>> 107 | >>> 108 | >>> 109 | >>> 110 | >>> rdd2.collect() 111 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 112 | >>> mysum = rdd2.reduce(lambda x,y: x+y) 113 | >>> mysum 114 | 1200 115 | >>> 116 | >>> 117 | >>> 118 | >>> 119 | >>> 120 | >>> pairs = [("a", 2), ("b", 3), ("a", 3), ("b", 4), ("a", 7), ("b", 10), ("c", 7), ("c", 1)] 121 | >>> 122 | >>> pairs 123 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)] 124 | >>> 125 | >>> pairs_rdd = spark.sparkContext.parallelize(pairs) 126 | >>> pairs_rdd.count() 127 | 8 128 | >>> pairs_rdd.collect() 129 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)] 130 | >>> 131 | >>> 132 | >>> grouped = pairs_rdd.groupByKey() 133 | >>> grouped.collect() 134 | [('a', ), ('c', ), ('b', )] 135 | >>> grouped.mapValues(lambda it: list(it)).collect() 136 | [('a', [2, 3, 7]), ('c', [7, 1]), ('b', [3, 4, 10])] 137 | >>> 138 | >>> incby100 = pairs_rdd.mapValues(lambda x : x+100) 139 | >>> incby100.collect() 140 | [('a', 102), ('b', 103), ('a', 103), ('b', 104), ('a', 107), ('b', 110), ('c', 107), ('c', 101)] 141 | >>> incby1000 = pairs_rdd.map(lambda (k,v) : (k, v+1000)) 142 | >>> incby1000.collect() 143 | [('a', 1002), ('b', 1003), ('a', 1003), ('b', 1004), ('a', 1007), ('b', 1010), ('c', 1007), ('c', 1001)] 144 | >>> 145 | >>> 146 | >>> grouped.collect() 147 | [('a', ), ('c', ), ('b', )] 148 | >>> 149 | >>> average = grouped.mapValues(lambda it: sum(it)/len(it)) 150 | >>> average.collect() 151 | [('a', 4), ('c', 4), ('b', 5)] 152 | >>> average = grouped.mapValues(lambda it: float(sum(it))/float(len(it))) 153 | >>> average.collect() 154 | [('a', 4.0), ('c', 4.0), ('b', 5.666666666666667)] 155 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-01-30.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Setting default log level to "WARN". 5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> spark 17 | 18 | >>> 19 | >>> 20 | >>> pairs = [("alex", 100, 1), ("jane", 200, 3), ("ted", 300, 3)] 21 | >>> pairs 22 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 23 | >>> 24 | >>> rdd = spark.sparkContext.parallelize(pairs) 25 | >>> rdd.collect() 26 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 27 | >>> rdd.count() 28 | 3 29 | >>> def find_average(record): 30 | ... return record[1]/record[2] 31 | ... 
32 | >>> 33 | >>> x = ('jane', 200, 3) 34 | >>> y = find_average(x) 35 | >>> y 36 | 66.66666666666667 37 | >>> x = ('ted', 300, 3) 38 | >>> y = find_average(x) 39 | >>> y 40 | 100.0 41 | >>> rdd.collect() 42 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 43 | >>> rdd2 = rdd.map(find_average) 44 | >>> rdd2.collect() 45 | [100.0, 66.66666666666667, 100.0] 46 | >>> def find_average(record): 47 | ... return (record[0], record[1]/record[2]) 48 | ... 49 | >>> 50 | >>> x = ('jane', 200, 3) 51 | >>> y = find_average(x) 52 | >>> y 53 | ('jane', 66.66666666666667) 54 | >>> rdd2 = rdd.map(find_average) 55 | >>> rdd2.collect() 56 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)] 57 | >>> def find_average22(record): 58 | ... return [(record[0], record[1]/record[2])] 59 | ... 60 | >>> x = ('ted', 300, 3) 61 | >>> y = find_average22(x) 62 | >>> y 63 | [('ted', 100.0)] 64 | >>> 65 | >>> 66 | >>> rdd3 = rdd.flatMap(find_average22) 67 | >>> rdd3.collect() 68 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)] 69 | >>> 70 | >>> 71 | >>> 72 | >>> numbers = [1, 2, 3, 4, 5, 6] 73 | >>> rdd4 = spark.sparkContext.parallelize(numbers) 74 | >>> rdd4.count() 75 | 6 76 | >>> rdd.collect() 77 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 78 | >>> rdd4.collect() 79 | [1, 2, 3, 4, 5, 6] 80 | >>> 81 | >>> 82 | >>> mysum = rdd4.reduce(lambda x, y: x+7) 83 | >>> mysum 84 | 36 85 | >>> rdd5 = rdd4.map(lambda x : x +7) 86 | >>> rdd5.collect() 87 | [8, 9, 10, 11, 12, 13] 88 | >>> rdd5 89 | PythonRDD[8] at collect at :1 90 | >>> rdd4 91 | ParallelCollectionRDD[5] at parallelize at PythonRDD.scala:195 92 | >>> 93 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-16.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ pwd 2 | /Users/mparsian/spark-2.4.0 3 | 4 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l 5 | -rw-r--r--@ 1 mparsian 897801646 21357 Oct 28 23:36 LICENSE 6 | -rw-r--r--@ 1 mparsian 897801646 42919 Oct 28 23:36 NOTICE 7 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 R 8 | -rw-r--r--@ 1 mparsian 897801646 3952 Oct 28 23:36 README.md 9 | -rw-r--r--@ 1 mparsian 897801646 156 Oct 28 23:36 RELEASE 10 | drwxr-xr-x@ 29 mparsian 897801646 928 Oct 28 23:36 bin 11 | drwxr-xr-x@ 9 mparsian 897801646 288 Oct 28 23:36 conf 12 | drwxr-xr-x@ 5 mparsian 897801646 160 Oct 28 23:36 data 13 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 examples 14 | drwxr-xr-x@ 227 mparsian 897801646 7264 Oct 28 23:36 jars 15 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 kubernetes 16 | drwxr-xr-x@ 48 mparsian 897801646 1536 Oct 28 23:36 licenses 17 | drwxr-xr-x 16 mparsian 897801646 512 Mar 25 12:29 logs 18 | drwxr-xr-x@ 19 mparsian 897801646 608 Oct 28 23:36 python 19 | drwxr-xr-x@ 24 mparsian 897801646 768 Oct 28 23:36 sbin 20 | drwxr-xr-x 2 mparsian 897801646 64 Jan 8 03:00 work 21 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 yarn 22 | 23 | 24 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l bin 25 | total 224 26 | -rwxr-xr-x@ 1 mparsian 897801646 1089 Oct 28 23:36 beeline 27 | -rw-r--r--@ 1 mparsian 897801646 1064 Oct 28 23:36 beeline.cmd 28 | -rwxr-xr-x@ 1 mparsian 897801646 5427 Oct 28 23:36 docker-image-tool.sh 29 | -rwxr-xr-x@ 1 mparsian 897801646 1933 Oct 28 23:36 find-spark-home 30 | -rw-r--r--@ 1 mparsian 897801646 2681 Oct 28 23:36 find-spark-home.cmd 31 | -rw-r--r--@ 1 mparsian 897801646 1892 Oct 28 
23:36 load-spark-env.cmd 32 | -rw-r--r--@ 1 mparsian 897801646 2025 Oct 28 23:36 load-spark-env.sh 33 | -rwxr-xr-x@ 1 mparsian 897801646 2987 Oct 28 23:36 pyspark 34 | ... 35 | 36 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ./bin/pyspark 37 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 38 | [Clang 6.0 (clang-600.0.57)] on darwin 39 | Type "help", "copyright", "credits" or "license" for more information. 40 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 41 | Welcome to 42 | ____ __ 43 | / __/__ ___ _____/ /__ 44 | _\ \/ _ \/ _ `/ __/ '_/ 45 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 46 | /_/ 47 | 48 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 49 | SparkSession available as 'spark'. 50 | 51 | >>> spark 52 | 53 | >>> 54 | >>> 55 | >>> 56 | >>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 57 | >>> data 58 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 59 | >>> 60 | >>> rdd = spark.sparkContext.parallelize(data) 61 | >>> rdd.count() 62 | 12 63 | >>> rdd.collect() 64 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 65 | >>> rdd.getNumPartitions() 66 | 8 67 | >>> rdd2 = spark.sparkContext.parallelize(data, 3) 68 | >>> rdd2.collect() 69 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 70 | >>> rdd2.getNumPartitions() 71 | 3 72 | >>> rdd3 = rdd.map(lambda x : x+100) 73 | >>> rdd3.collect() 74 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112] 75 | >>> 76 | >>> def myfun(x): 77 | ... return x+100 78 | ... 79 | >>> 80 | >>> 81 | >>> y = myfun(4) 82 | >>> y 83 | 104 84 | >>> z = myfun(60) 85 | >>> z 86 | 160 87 | >>> rdd4 = rdd.map(myfun) 88 | >>> rdd4.collect() 89 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112] 90 | >>> rdd5 = rdd.map(lambda x: (x, 1)) 91 | >>> rdd5.collect() 92 | [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)] 93 | >>> rdd2.collect() 94 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 95 | >>> N = rdd.reduce(lambda x, y: x+y) 96 | >>> N 97 | 78 98 | >>> exit() 99 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-18.txt: -------------------------------------------------------------------------------- 1 | 2 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ cat > fox.txt 3 | a fox jumped 4 | a red fox jumped and jumped 5 | a blue and red fox jumped 6 | fox is blue red 7 | 8 | $ cat fox.txt 9 | a fox jumped 10 | a red fox jumped and jumped 11 | a blue and red fox jumped 12 | fox is blue red 13 | 14 | ~/spark-2.4.0 $ ./bin/pyspark 15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 16 | [Clang 6.0 (clang-600.0.57)] on darwin 17 | Type "help", "copyright", "credits" or "license" for more information. 18 | 2019-04-18 18:02:14 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 19 | Setting default log level to "WARN". 20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | SparkSession available as 'spark'. 
30 | >>> spark 31 | 32 | 33 | >>> records = spark.sparkContext.textFile("/Users/mparsian/spark-2.4.0/fox.txt") 34 | >>> records.collect() 35 | [ 36 | 'a fox jumped', 37 | 'a red fox jumped and jumped', 38 | 'a blue and red fox jumped', 39 | 'fox is blue red' 40 | ] 41 | >>> records.count() 42 | 4 43 | >>> 44 | >>> def tokenize(record): 45 | ... tokens = record.split(" ") 46 | ... return tokens 47 | ... 48 | >>> 49 | >>> x = "a fox jumped" 50 | >>> x 51 | 'a fox jumped' 52 | >>> tokens = tokenize(x) 53 | >>> tokens 54 | ['a', 'fox', 'jumped'] 55 | >>> 56 | >>> 57 | >>> words = records.flatMap(tokenize) 58 | >>> words.collect() 59 | ['a', 'fox', 'jumped', 'a', 'red', 'fox', 'jumped', 'and', 'jumped', 'a', 'blue', 'and', 'red', 'fox', 'jumped', 'fox', 'is', 'blue', 'red'] 60 | >>> words.count() 61 | 19 62 | >>> pairs = words.map(lambda x : (x,1)) 63 | >>> pairs.collect() 64 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)] 65 | >>> pairs.count() 66 | 19 67 | >>> 68 | >>> frequencies = pairs.reduceByKey(lambda a, b: a+b) 69 | >>> frequencies.collect() 70 | [('is', 1), ('a', 3), ('fox', 4), ('jumped', 4), ('red', 3), ('and', 2), ('blue', 2)] 71 | >>> 72 | >>> 73 | >>> filtered = frequencies.filter(lambda x : x[1] > 2) 74 | >>> filtered.collect() 75 | [('a', 3), ('fox', 4), ('jumped', 4), ('red', 3)] 76 | >>> filtered.count() 77 | 4 78 | >>> a = ("dada", 5) 79 | >>> a[0] 80 | 'dada' 81 | >>> a[1] 82 | 5 83 | >>> 84 | >>> 85 | >>> test = records.map(tokenize) 86 | >>> test.collect() 87 | [['a', 'fox', 'jumped'], ['a', 'red', 'fox', 'jumped', 'and', 'jumped'], ['a', 'blue', 'and', 'red', 'fox', 'jumped'], ['fox', 'is', 'blue', 'red']] 88 | >>> test.count() 89 | 4 90 | >>> 91 | >>> 92 | >>> pairs.collect() 93 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)] 94 | >>> 95 | >>> grouped = pairs.groupByKey() 96 | 97 | >>> grouped.collect() 98 | [ 99 | ('is', ), 100 | ('a', ), 101 | ('fox', ), 102 | ('jumped', ), 103 | ('red', ), 104 | ('and', ), 105 | ('blue', ) 106 | ] 107 | >>> 108 | >>> grouped = pairs.groupByKey().mapValues(lambda it: list(it)) 109 | >>> grouped.collect() 110 | [ 111 | ('is', [1]), 112 | ('a', [1, 1, 1]), 113 | ('fox', [1, 1, 1, 1]), 114 | ('jumped', [1, 1, 1, 1]), 115 | ('red', [1, 1, 1]), 116 | ('and', [1, 1]), 117 | ('blue', [1, 1]) 118 | ] 119 | >>> grouped = pairs.groupByKey() 120 | >>> grouped.collect() 121 | [('is', ), ('a', ), ('fox', ), ('jumped', ), ('red', ), ('and', ), ('blue', )] 122 | >>> freq2 = grouped.mapValues(lambda it: sum(it)) 123 | >>> freq2.collect() 124 | [ 125 | ('is', 1), 126 | ('a', 3), 127 | ('fox', 4), 128 | ('jumped', 4), 129 | ('red', 3), 130 | ('and', 2), 131 | ('blue', 2) 132 | ] 133 | >>> freq2.count() 134 | 7 135 | >>> frequencies = records.flatMap(tokenize).map(lambda x: (x,1)).reduceByKey(lambda a, b: a+b) 136 | >>> frequencies.collect() 137 | [ 138 | ('is', 1), 139 | ('a', 3), 140 | ('fox', 4), 141 | ('jumped', 4), 142 | ('red', 3), 143 | ('and', 2), 144 | ('blue', 2) 145 | ] 146 | >>> 147 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-26.txt: 
-------------------------------------------------------------------------------- 1 | Finding Average by Key using reduceByKey() Transformation 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | [Clang 6.0 (clang-600.0.57)] on darwin 6 | Type "help", "copyright", "credits" or "license" for more information. 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> 19 | >>> 20 | >>> 21 | >>> data = [('k1', 3), ('k1', 4),('k1', 5),('k2', 7),('k2', 7),('k2', 7),('k3', 30),('k3', 30),('k3', 40),('k3', 50)] 22 | >>> data 23 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)] 24 | >>> 25 | >>> pairs = spark.sparkContext.parallelize(data) 26 | >>> pairs.collect() 27 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)] 28 | >>> pairs.count() 29 | 10 30 | >>> pairs2 = pairs.distinct() 31 | >>> pairs2.count() 32 | 7 33 | >>> pairs2.collect() 34 | [('k1', 5), ('k3', 40), ('k1', 3), ('k3', 50), ('k2', 7), ('k1', 4), ('k3', 30)] 35 | >>> 36 | >>> tuples = pairs.map(lambda x: (x[0], (x[1], 1) ) ) 37 | >>> tuples.collect() 38 | [('k1', (3, 1)), ('k1', (4, 1)), ('k1', (5, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k3', (30, 1)), ('k3', (30, 1)), ('k3', (40, 1)), ('k3', (50, 1))] 39 | 40 | >>> 41 | >>> def adder(x, y): 42 | ... sum2 = x[0] + y[0] 43 | ... count = x[1] + y[1] 44 | ... return (sum2, count) 45 | ... 46 | >>> 47 | >>> x = (10, 2) 48 | >>> y = (20, 4) 49 | >>> r = adder(x, y) 50 | >>> r 51 | (30, 6) 52 | >>> 53 | >>> result = tuples.reduceByKey(adder) 54 | >>> result.collect() 55 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))] 56 | >>> result = tuples.reduceByKey(lambda x, y: adder(x, y)) 57 | >>> result.collect() 58 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))] 59 | >>> avg = result.mapValues(lambda pair: float(pair[0])/float(pair[1])) 60 | >>> avg.collect() 61 | [('k1', 4.0), ('k3', 37.5), ('k2', 7.0)] 62 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-05-09.txt: -------------------------------------------------------------------------------- 1 | Learn Partitioning RDDs and using mapPartitions() Transformation 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | [Clang 6.0 (clang-600.0.57)] on darwin 6 | Type "help", "copyright", "credits" or "license" for more information. 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 
18 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 19 | >>> 20 | >>> 21 | >>> numbers 22 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 23 | >>> rdd = spark.sparkContext.parallelize(numbers, 3) 24 | >>> rdd.count() 25 | 10 26 | >>> rdd.collect() 27 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 28 | >>> 29 | 30 | >>> def f(iterator): 31 | ... for x in iterator: 32 | ... print(x) 33 | ... print("===") 34 | ... 35 | >>> 36 | >>> rdd.foreachPartition(f) 37 | 4 38 | 5 39 | 6 40 | === 41 | 7 42 | 8 43 | 9 44 | 10 45 | === 46 | 1 47 | 2 48 | 3 49 | === 50 | >>> 51 | >>> 52 | >>> rdd = spark.sparkContext.parallelize(numbers, 2) 53 | >>> rdd.foreachPartition(f) 54 | 1 55 | 2 56 | 3 57 | 4 58 | 5 59 | === 60 | 6 61 | 7 62 | 8 63 | 9 64 | 10 65 | === 66 | >>> 67 | >>> n = rdd.getNumPartitions() 68 | >>> n 69 | 2 70 | >>> rdd = spark.sparkContext.parallelize(numbers, 4) 71 | >>> n = rdd.getNumPartitions() 72 | >>> n 73 | 4 74 | >>> rdd.foreachPartition(f) 75 | 5 76 | 6 77 | === 78 | 3 79 | 4 80 | === 81 | 7 82 | 8 83 | 9 84 | 10 85 | === 86 | 1 87 | 2 88 | === 89 | >>> rdd = spark.sparkContext.parallelize(numbers, 14) 90 | >>> rdd.foreachPartition(f) 91 | 4 92 | === 93 | === 94 | === 95 | 3 96 | === 97 | 1 98 | === 99 | 5 100 | === 101 | 2 102 | === 103 | === 104 | 6 105 | === 106 | === 107 | 8 108 | === 109 | 7 110 | === 111 | 9 112 | === 113 | 10 114 | === 115 | >>> def min_max_count(iterator): 116 | ... firsttime = 1 117 | ... #minimum 118 | ... #maximum 119 | ... #count 120 | ... for x in iterator: 121 | ... if (firsttime == 1): 122 | ... minimum = x 123 | ... maximum = x 124 | ... count = 1 125 | ... firsttime = 0 126 | ... else: 127 | ... count = count + 1 128 | ... minimum = min(x, minimum) 129 | ... maximum = max(x, maximum) 130 | ... # 131 | ... return (minimum, maximum, count) 132 | ... 133 | >>> 134 | >>> data = [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 135 | >>> data 136 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 137 | >>> rdd = spark.sparkContext.parallelize(numbers, 3) 138 | >>> n = rdd.getNumPartitions() 139 | >>> n 140 | 3 141 | >>> rdd.collect() 142 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 143 | >>> result = rdd.mapPartitions(min_max_count) 144 | >>> result.collect() 145 | [1, 3, 3, 4, 6, 3, 7, 10, 4] 146 | >>> def min_max_count(iterator): 147 | ... firsttime = 1 148 | ... #minimum 149 | ... #maximum 150 | ... #count 151 | ... for x in iterator: 152 | ... if (firsttime == 1): 153 | ... minimum = x 154 | ... maximum = x 155 | ... count = 1 156 | ... firsttime = 0 157 | ... else: 158 | ... count = count + 1 159 | ... minimum = min(x, minimum) 160 | ... maximum = max(x, maximum) 161 | ... # 162 | ... return [minimum, maximum, count] 163 | ... 164 | >>> 165 | >>> result = rdd.mapPartitions(min_max_count) 166 | >>> result.collect() 167 | [1, 3, 3, 4, 6, 3, 7, 10, 4] 168 | >>> 169 | >>> 170 | >>> 171 | >>> 172 | >>> def min_max_count(iterator): 173 | ... firsttime = 1 174 | ... #minimum 175 | ... #maximum 176 | ... #count 177 | ... for x in iterator: 178 | ... if (firsttime == 1): 179 | ... minimum = x 180 | ... maximum = x 181 | ... count = 1 182 | ... firsttime = 0 183 | ... else: 184 | ... count = count + 1 185 | ... minimum = min(x, minimum) 186 | ... maximum = max(x, maximum) 187 | ... # 188 | ... return [[minimum, maximum, count]] 189 | ... 
190 | >>> result = rdd.mapPartitions(min_max_count) 191 | >>> result.collect() 192 | [[1, 3, 3], [4, 6, 3], [7, 10, 4]] 193 | >>> 194 | 195 | >>> data 196 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 197 | >>> rdd = spark.sparkContext.parallelize(data, 3) 198 | >>> 199 | >>> 200 | >>> result = rdd.mapPartitions(min_max_count) 201 | >>> result.collect() 202 | [[3, 34, 4], [7, 91, 4], [12, 16, 5]] 203 | >>> rdd.foreachPartition(f) 204 | 12 205 | 13 206 | 14 207 | 15 208 | 16 209 | === 210 | 7 211 | 9 212 | 91 213 | 77 214 | === 215 | 12 216 | 34 217 | 3 218 | 5 219 | === 220 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-10-09.txt: -------------------------------------------------------------------------------- 1 | /spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 19/10/09 18:57:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | >>> 20 | >>> 21 | >>> 22 | >>> numbers = [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 23 | >>> numbers 24 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 25 | >>> rdd = spark.sparkContext.parallelize(numbers) 26 | >>> rdd.collect() 27 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 28 | >>> rdd.count() 29 | 10 30 | >>> rdd2 = rdd.filter(lambda x : x > 3) 31 | >>> rdd2.collect() 32 | [4, 4, 5, 6] 33 | >>> 34 | >>> 35 | >>> def custom_filter(x): 36 | ... if x > 3: 37 | ... return True 38 | ... else: 39 | ... return False 40 | ... 
^D 41 | >>> 42 | >>> x = custom_filter(10) 43 | >>> x 44 | True 45 | >>> x = custom_filter(2) 46 | >>> x 47 | False 48 | >>> rdd3 = rdd.filter(custom_filter) 49 | >>> rdd3.collect() 50 | [4, 4, 5, 6] 51 | >>> rdd2.collect() 52 | [4, 4, 5, 6] 53 | >>> 54 | >>> 55 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2),('B', 7)] 56 | >>> data 57 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 58 | >>> 59 | >>> rdd = spark.sparkContext.parallelize(data) 60 | >>> rdd.collect() 61 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 62 | >>> 63 | >>> 64 | >>> 65 | >>> 66 | >>> total = rdd.reduceByKey(lambda x, y: x+y) 67 | >>> total.collect() 68 | [('B', 9), ('A', 14)] 69 | >>> 70 | >>> 71 | >>> rdd.collect() 72 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 73 | >>> grouped = rdd.groupByKey() 74 | >>> grouped.collect() 75 | [ 76 | ('B', ), 77 | ('A', ) 78 | ] 79 | >>> grouped.map(lambda x: (x[0], list(x[1])).collect() 80 | [('B', [2, 7]), ('A', [2, 3, 4, 5])] 81 | >>> total2 = grouped.map(lambda x: (x[0], sum(x[1]))) 82 | >>> total2.collect() 83 | [('B', 9), ('A', 14)] 84 | >>> 85 | 86 | >>> 87 | >>> spark 88 | 89 | >>> numbers = [-1, 2, 3, -55, 88, 99, -99, 66, 777] 90 | >>> numbers 91 | [-1, 2, 3, -55, 88, 99, -99, 66, 777] 92 | >>> rdd = spark.sparkContext.parallelize(numbers) 93 | >>> rdd.collect() 94 | [-1, 2, 3, -55, 88, 99, -99, 66, 777] 95 | >>> 96 | >>> positives = rdd.filter(lambda x : x > 0) 97 | >>> positives.collect() 98 | [2, 3, 88, 99, 66, 777] 99 | >>> 100 | >>> negatives = rdd.filter(lambda x : x < 0) 101 | >>> negatives.collect() 102 | [-1, -55, -99] 103 | >>> def keep_positives(n): 104 | ... if (n > 0): 105 | ... return True 106 | ... else: 107 | ... return False 108 | ... ^D 109 | >>> 110 | >>> a = keep_positives(100) 111 | >>> a 112 | True 113 | >>> a = keep_positives(-9) 114 | >>> a 115 | False 116 | >>> pos2 = rdd.filter(keep_positives) 117 | >>> pos2.collect() 118 | [2, 3, 88, 99, 66, 777] 119 | >>> pos2222 = pos2.filter(lambda x : True) 120 | >>> pos2222.collect() 121 | [2, 3, 88, 99, 66, 777] 122 | >>> 123 | >>> 124 | >>> pairs = [('A', 2), ('A', 3), ('A', 4),('A', 5), ('A', 6), ('B', 10), ('B', 2)] 125 | >>> pairs 126 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 127 | >>> 128 | >>> 129 | >>> rdd = spark.sparkContext.parallelize(pairs) 130 | >>> rdd.collect() 131 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 132 | >>> totals = rdd.reduceByKey(lambda a, b : a+b) 133 | >>> result = totals.collect() 134 | >>> result 135 | [('B', 12), ('A', 20)] 136 | >>> 137 | >>> 138 | >>> rdd.collect() 139 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 140 | >>> grouped = rdd.groupByKey() 141 | >>> grouped.collect() 142 | [ 143 | ('B', ), 144 | ('A', ) 145 | ] 146 | >>> 147 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect() 148 | [('B', [10, 2]), ('A', [2, 3, 4, 5, 6])] 149 | >>> 150 | >>> sum2 = grouped.map(lambda x: (x[0], sum(x[1]))) 151 | >>> sum2.collect() 152 | [('B', 12), ('A', 20)] 153 | >>> 154 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-10-16.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 
5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 6 | Setting default log level to "WARN". 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> a =[ 1, 2, 3, 4, 5] 19 | >>> rdd = spark.sparkContext.parallelize(a) 20 | >>> rdd.collect() 21 | [1, 2, 3, 4, 5] 22 | >>> rdd.count() 23 | 5 24 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y) 25 | >>> sumofvalues 26 | 15 27 | >>> 28 | >>> product = rdd.reduce(lambda x, y: x*y) 29 | >>> product 30 | 120 31 | >>> z = [ "1", "2", "3", "4", "5", "6", "7"] 32 | >>> rdd = spark.sparkContext.parallelize(z) 33 | >>> rdd.collect() 34 | ['1', '2', '3', '4', '5', '6', '7'] 35 | >>> concat = rdd.reduce(lambda x, y: x+y) 36 | >>> concat 37 | '1234567' 38 | >>> 39 | >>> [ "1", "2", "3", "4", "5", "6", "7"] 40 | ['1', '2', '3', '4', '5', '6', '7'] 41 | >>> z = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b"] 42 | >>> 43 | >>> 44 | >>> z 45 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b'] 46 | >>> rdd = spark.sparkContext.parallelize(z, 3) 47 | >>> rdd.collect() 48 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b'] 49 | >>> concat = rdd.reduce(lambda x, y: x+y) 50 | >>> concat 51 | '123456789ab' 52 | >>> rdd = spark.sparkContext.parallelize(z, 10) 53 | >>> concat = rdd.reduce(lambda x, y: x+y) 54 | >>> concat 55 | '123456789ab' 56 | 57 | 58 | >>> 59 | >>> nums = [1, 3, 5, 4, 2, 1, 0, 9, 10] 60 | >>> nums 61 | [1, 3, 5, 4, 2, 1, 0, 9, 10] 62 | >>> rdd = spark.sparkContext.parallelize(nums) 63 | >>> rdd.collect() 64 | [1, 3, 5, 4, 2, 1, 0, 9, 10] 65 | >>> rdd.count() 66 | 9 67 | >>> sumvalues = rdd.reduce(lambda a, b: a+b) 68 | >>> sumvalues 69 | 35 70 | 71 | >>> product = rdd.reduce(lambda a, b: a*b) 72 | >>> product 73 | 0 74 | >>> nums = [1, 3, 5, 4, 2, 1, 30, 9, 10] 75 | >>> rdd = spark.sparkContext.parallelize(nums) 76 | >>> sumvalues = rdd.reduce(lambda a, b: a+b) 77 | >>> sumvalues 78 | 65 79 | >>> product = rdd.reduce(lambda a, b: a*b) 80 | >>> product 81 | 324000 82 | >>> rdd.collect() 83 | [1, 3, 5, 4, 2, 1, 30, 9, 10] 84 | 85 | >>> strs = ["1", "3", "5", "4", "2", "1"] 86 | >>> strs 87 | ['1', '3', '5', '4', '2', '1'] 88 | >>> rdd = spark.sparkContext.parallelize(strs) 89 | >>> concat = rdd.reduce(lambda a, b: a+b) 90 | >>> concat 91 | '135421' 92 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-01-22.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 10 | /_/ 11 | 12 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 13 | SparkSession available as 'spark'. 
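A compact recap of the reduce() examples in the preceding (2019-10-16) session: reduce() repeatedly combines elements with a two-argument function, which should be associative and commutative because partial results are combined across partitions. A minimal sketch (assumes the interactive `spark` session; fold() is a closely related method shown here for comparison, not used in the session):

from operator import add, mul

nums = spark.sparkContext.parallelize([1, 3, 5, 4, 2, 1, 30, 9, 10])

total   = nums.reduce(add)     # 65
product = nums.reduce(mul)     # 324000; any 0 in the data would make this 0
# fold() is like reduce() but takes an explicit "zero" value
total2  = nums.fold(0, add)    # 65
print(total, product, total2)
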
14 | >>> 15 | >>> 16 | >>> 17 | >>> spark 18 | 19 | >>> 20 | >>> sc = spark.sparkContext 21 | >>> 22 | >>> sc 23 | 24 | >>> 25 | >>> 26 | >>> numbers = [1, 2, 3, 4, 5, 6, -1, -2] 27 | >>> numbers 28 | [1, 2, 3, 4, 5, 6, -1, -2] 29 | >>> len(numbers) 30 | 8 31 | >>> rdd = sc.parallelize(numbers) 32 | >>> rdd.collect() 33 | [1, 2, 3, 4, 5, 6, -1, -2] 34 | >>> rdd.count() 35 | 8 36 | >>> rdd 37 | ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195 38 | >>> 39 | >>> rdd_pos = rdd.filter(lambda x: x > 0) 40 | >>> rdd_pos.collect() 41 | [1, 2, 3, 4, 5, 6] 42 | 43 | >>> rdd_pos.count() 44 | 6 45 | >>> 46 | >>> rdd_pos.collect() 47 | [1, 2, 3, 4, 5, 6] 48 | >>> 49 | >>> sum_of_all = rdd_pos.reduce(lambda x, y: x+y) 50 | >>> sum_of_all 51 | 21 52 | >>> rdd_pos.take(2) 53 | [1, 2] 54 | >>> 55 | >>> 56 | >>> rdd.collect() 57 | [1, 2, 3, 4, 5, 6, -1, -2] 58 | >>> rdd.count() 59 | 8 60 | >>> rdd4 = rdd.map(lambda x : x+100) 61 | >>> rdd4.collect() 62 | [101, 102, 103, 104, 105, 106, 99, 98] 63 | >>> 64 | >>> 65 | >>> 66 | >>> kv = [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)] 67 | >>> kv 68 | [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)] 69 | >>> len(kv) 70 | 5 71 | >>> key_value_pairs = sc.parallelize(kv) 72 | >>> key_value_pairs.count() 73 | 5 74 | >>> key_value_pairs.collect() 75 | [ 76 | ('alex', 2), 77 | ('alex', 20), 78 | ('alex', 40), 79 | ('jane', 100), 80 | ('jane', 400) 81 | ] 82 | >>> 83 | >>> 84 | >>> grouped = key_value_pairs.groupByKey() 85 | >>> grouped.collect() 86 | [ 87 | ('alex', ), 88 | ('jane', ) 89 | ] 90 | >>> 91 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect() 92 | [ 93 | ('alex', [2, 20, 40]), 94 | ('jane', [100, 400]) 95 | ] 96 | >>> grouped_sum = grouped.map(lambda x: (x[0], sum(x[1]))) 97 | >>> grouped_sum.collect() 98 | [ 99 | ('alex', 62), 100 | ('jane', 500) 101 | ] 102 | >>> 103 | >>> 104 | >>> grouped.collect() 105 | [ 106 | ('alex', ), 107 | ('jane', ) 108 | ] 109 | >>> grouped_sum_2 = grouped.mapValues(lambda x: sum(x)) 110 | >>> grouped_sum_2.collect() 111 | [ 112 | ('alex', 62), 113 | ('jane', 500) 114 | ] 115 | >>> 116 | 117 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-01-24.txt: -------------------------------------------------------------------------------- 1 | How to read a text file and convert into an RDD[String] 2 | 3 | $ cat /tmp/books.txt 4 | ISBN-100,sales,biology 5 | IS-01235,sales,econ 6 | ISBN-101,sales,econ 7 | ISBN-102,sales,biology 8 | ISBN-109,econ,sales 9 | ISBN-103,CS,sales 10 | ISBN-104,CS,biology 11 | ISBN-105,CS,econ 12 | ISBN-200,CS 13 | 14 | $ ./bin/pyspark 15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 16 | [Clang 6.0 (clang-600.0.57)] on darwin 17 | Welcome to 18 | ____ __ 19 | / __/__ ___ _____/ /__ 20 | _\ \/ _ \/ _ `/ __/ '_/ 21 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 22 | /_/ 23 | 24 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 25 | SparkSession available as 'spark'. 
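The session that follows reads /tmp/books.txt into an RDD[String], one element per line. As a script-style sketch of the same idea, extended with a split of each comma-separated line into fields (the path comes from the session; the field split is illustrative):

input_path = "/tmp/books.txt"                         # path used in the session below
records = spark.sparkContext.textFile(input_path)     # RDD[String], one element per line
fields  = records.map(lambda line: line.split(","))   # RDD[[String]]
print(records.count())
print(fields.take(3))
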
26 | >>> 27 | >>> 28 | >>> 29 | >>> spark 30 | 31 | >>> 32 | >>> 33 | >>> 34 | >>> input_path = "/tmp/books.txt" 35 | >>> 36 | >>> records = spark.sparkContext.textFile(input_path) 37 | >>> records.collect() 38 | [ 39 | 'ISBN-100,sales,biology', 40 | 'IS-01235,sales,econ', 41 | 'ISBN-101,sales,econ', 42 | 'ISBN-102,sales,biology', 43 | 'ISBN-109,econ,sales', 44 | 'ISBN-103,CS,sales', 45 | 'ISBN-104,CS,biology', 46 | 'ISBN-105,CS,econ', 47 | 'ISBN-200,CS' 48 | ] 49 | >>> records.count() 50 | 9 -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-02-03.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 8 | /_/ 9 | 10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 11 | SparkSession available as 'spark'. 12 | >>> 13 | >>> 14 | >>> numbers = [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 15 | >>> 16 | >>> 17 | >>> numbers 18 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 19 | >>> len(numbers) 20 | 56 21 | >>> rdd = spark.sparkContext.parallelize(numbers) 22 | >>> rdd.count() 23 | 56 24 | >>> rdd.collect() 25 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 26 | >>> 27 | >>> 28 | >>> def min_max_count(partition): 29 | ... first_time = True 30 | ... count = 0 31 | ... for n in partition: 32 | ... count += 1 33 | ... if first_time == True: 34 | ... min2 = n 35 | ... max2 = n 36 | ... first_time = False 37 | ... else: 38 | ... min2 = min(min2, n) 39 | ... max2 = max(max2, n) 40 | ... return (min2, max2, count) 41 | ... 42 | >>> 43 | >>> target = rdd.mapPartitions(min_max_count) 44 | >>> target.count() 45 | 24 46 | >>> target.collect() 47 | [-2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7, -2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7] 48 | >>> 49 | >>> 50 | >>> def min_max_count(partition): 51 | ... first_time = True 52 | ... count = 0 53 | ... for n in partition: 54 | ... count += 1 55 | ... if first_time == True: 56 | ... min2 = n 57 | ... max2 = n 58 | ... first_time = False 59 | ... else: 60 | ... min2 = min(min2, n) 61 | ... max2 = max(max2, n) 62 | ... return [(min2, max2, count)] 63 | ... 64 | >>> 65 | >>> target = rdd.mapPartitions(min_max_count) 66 | >>> target.collect() 67 | [(-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7), (-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7)] 68 | >>> 69 | >>> rdd.getNumPartitions() 70 | 8 71 | >>> rdd = spark.sparkContext.parallelize(numbers, 4) 72 | >>> rdd.getNumPartitions() 73 | 4 74 | >>> target = rdd.mapPartitions(min_max_count) 75 | >>> target.collect() 76 | [(-2, 3, 14), (-20, 30, 14), (-2, 3, 14), (-20, 30, 14)] 77 | >>> 78 | >>> 79 | >>> 80 | >>> def add_t3(x, y): 81 | ... count = x[2] + y[2] 82 | ... min2 = min(x[0], y[0]) 83 | ... max2 = max(x[1], y[1]) 84 | ... return (min2, max2, count) 85 | ... 
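An aside before add_t3 is put to use below: the same global (min, max, count) can also be computed in a single pass with aggregate(), which takes a zero value, a per-partition seqOp and a cross-partition combOp. A minimal sketch (the sentinel zero value is an assumption, not from the session; `numbers` is the list defined earlier in this session):

zero = (float("inf"), float("-inf"), 0)            # (min, max, count) identity -- an assumption
seq_op  = lambda acc, x: (min(acc[0], x), max(acc[1], x), acc[2] + 1)
comb_op = lambda a, b: (min(a[0], b[0]), max(a[1], b[1]), a[2] + b[2])

rdd4 = spark.sparkContext.parallelize(numbers, 4)  # "numbers" as defined earlier in this session
print(rdd4.aggregate(zero, seq_op, comb_op))       # (-20, 30, 56) for this data
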
86 | >>> 87 | >>> add_t3( (2, 5, 40), (7, 50, 60)) 88 | (2, 50, 100) 89 | >>> final_result = target.reduce(add_t3) 90 | >>> final_result 91 | (-20, 30, 56) 92 | >>> 93 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-04-16.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> spark 17 | 18 | >>> 19 | >>> input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt' 20 | >>> input_path 21 | '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt' 22 | >>> 23 | >>> recs = spark.sparkContext.textFile(input_path) 24 | >>> 25 | >>> recs.collect() 26 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 27 | >>> recs.count() 28 | 3 29 | >>> 30 | >>> 31 | >>> words = recs.map(lambda r: r.split(" ")) 32 | >>> words.collect() 33 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence'], ['red', 'fox', 'jumped']] 34 | >>> 35 | >>> words.take(1) 36 | [['red', 'fox', 'jumped', 'high']] 37 | >>> words.take(2) 38 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence']] 39 | >>> # recs : RDD[String] 40 | ... 41 | >>> # words : RDD[[String]] 42 | ... 43 | >>> x = "fox jumped" 44 | >>> y = x.split(" ") 45 | >>> y 46 | ['fox', 'jumped'] 47 | >>> 48 | >>> 49 | >>> single_words = words.flatMap(lambda x: x) 50 | >>> single_words.collect() 51 | ['red', 'fox', 'jumped', 'high', 'fox', 'jumped', 'over', 'high', 'fence', 'red', 'fox', 'jumped'] 52 | >>> words.count() 53 | 3 54 | >>> single_words.count() 55 | 12 56 | >>> # single_words : RDD[String] 57 | ... 58 | >>> 59 | >>> pairs = single_words.map(lambda x : (x, 1)) 60 | >>> pairs.collect() 61 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)] 62 | >>> 63 | >>> pairs.collect() 64 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)] 65 | >>> freq = pairs.reduceByKey(lambda a, b : a+b) 66 | >>> freq.collect() 67 | [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-04-23.txt: -------------------------------------------------------------------------------- 1 | ~/spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 8 | /_/ 9 | 10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 11 | SparkSession available as 'spark'. 
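A compact recap of the word-count pipeline built step by step in the 2020-04-16 session above: the whole thing also collapses into one chained expression. A sketch (the foxdata.txt path is the one used in that session):

input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt'   # path from the session above
word_freq = (spark.sparkContext.textFile(input_path)
             .flatMap(lambda line: line.split(" "))
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b))
print(word_freq.collect())
# [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)]  (order may vary)
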
12 | >>> data = [ [1, 2, 3], [4, 5], [], [10] ] 13 | >>> data 14 | [[1, 2, 3], [4, 5], [], [10]] 15 | >>> len(data) 16 | 4 17 | >>> rdd = spark.sparkContext.parallelize(data) 18 | >>> rdd.collect() 19 | [[1, 2, 3], [4, 5], [], [10]] 20 | >>> rdd.count() 21 | 4 22 | >>> rdd2 = rdd.map(lambda x: x) 23 | >>> rdd2.count() 24 | 4 25 | >>> rdd2.collect() 26 | [[1, 2, 3], [4, 5], [], [10]] 27 | >>> 28 | >>> rdd3 = rdd.flatMap(lambda x: x) 29 | >>> rdd3.count() 30 | 6 31 | >>> rdd3.collect() 32 | [1, 2, 3, 4, 5, 10] 33 | >>> 34 | >>> data2 = [ [1, 2, 3, [44, 55] ], [4, 5], [], [10] ] 35 | >>> rdd4 = spark.sparkContext.parallelize(data2) 36 | >>> rdd4.collect() 37 | [[1, 2, 3, [44, 55]], [4, 5], [], [10]] 38 | >>> rdd5 = rdd4.flatMap(lambda x: x) 39 | >>> rdd5.collect() 40 | [1, 2, 3, [44, 55], 4, 5, 10] 41 | >>> 42 | >>> 43 | >>> data = [1, 2, 3, 4, 5, 6] 44 | >>> rdd = spark.sparkContext.parallelize(data) 45 | >>> rdd.collect() 46 | [1, 2, 3, 4, 5, 6] 47 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y) 48 | >>> sumofvalues 49 | 21 50 | >>> sumofvalues = rdd.reduce(lambda x, y: x*y) 51 | >>> sumofvalues 52 | 720 53 | >>> tuples2 = [(1,20), (3,40), (5,60)] 54 | >>> rdd = spark.sparkContext.parallelize(tuples2) 55 | >>> rdd.collect() 56 | [(1, 20), (3, 40), (5, 60)] 57 | >>> rdd.count() 58 | 3 59 | >>> sum2 = rdd.rduce(lambda x, y: (x[0]+y[0], x[1]+y[1])) 60 | Traceback (most recent call last): 61 | File "", line 1, in 62 | AttributeError: 'RDD' object has no attribute 'rduce' 63 | >>> sum2 = rdd.reduce(lambda x, y: (x[0]+y[0], x[1]+y[1])) 64 | >>> sum2 65 | (9, 120) 66 | >>> 67 | >>> 68 | >>> kv = [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 69 | >>> kv 70 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 71 | >>> len(kv) 72 | 6 73 | >>> rdd = spark.sparkContext.parallelize(kv) 74 | >>> rdd.collect() 75 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 76 | >>> rdd.count() 77 | 6 78 | >>> sum_by_key = rdd.reduceByKey(lambda x, y: x+y) 79 | >>> sum_by_key.collect() 80 | [('B', 30), ('C', 7), ('A', 9)] 81 | >>> 82 | >>> 83 | >>> 84 | >>> grouped = rdd.groupByKey() 85 | >>> grouped.collect() 86 | [('B', ), ('C', ), ('A', )] 87 | >>> grouped.mapValues(lambda iter: list(iter)).collect() 88 | [('B', [10, 20]), ('C', [7]), ('A', [2, 3, 4])] 89 | >>> 90 | >>> sum_of_values_2 = grouped.mapValues(lambda iter: sum(iter)) 91 | >>> sum_of_values_2.collect() 92 | [('B', 30), ('C', 7), ('A', 9)] 93 | 94 | ... # find average of values per key for a give rdd by groupByKey() 95 | 96 | ... # find average of values per key for a give rdd by reduceByKey() 97 | ... 98 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-07-06-word-count.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxy.txt 2 | a Fox jumped high and high and jumped and jumped 3 | fox of red jumped fox of red jumped fox of red jumped 4 | oh no 5 | fox of blue jumped 6 | oh boy 7 | a Fox is a red fox of hen 8 | a fox is a high fox 9 | orange fox is high and blue and blue 10 | 11 | mparsian@usfc-olw-025011 ~/spark-3.0.0 $ ./bin/pyspark 12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 13 | [Clang 6.0 (clang-600.0.57)] on darwin 14 | Type "help", "copyright", "credits" or "license" for more information. 
15 | 20/07/06 17:59:22 WARN Utils: Your hostname, Mahmouds-MacBook.local resolves to a loopback address: 127.0.0.1; using 10.0.0.93 instead (on interface en0) 16 | 20/07/06 17:59:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address 17 | 20/07/06 17:59:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 18 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 19 | Setting default log level to "WARN". 20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | SparkSession available as 'spark'. 30 | >>> 31 | >>> 32 | >>> spark 33 | 34 | >>> 35 | >>> input_path = '/tmp/foxy.txt' 36 | >>> input_path 37 | '/tmp/foxy.txt' 38 | >>> 39 | >>> recs = spark.sparkContext.textFile(input_path) 40 | >>> recs.count() 41 | 8 42 | >>> recs.collect() 43 | [ 44 | 'a Fox jumped high and high and jumped and jumped', 45 | 'fox of red jumped fox of red jumped fox of red jumped', 46 | 'oh no', 47 | 'fox of blue jumped', 48 | 'oh boy', 49 | 'a Fox is a red fox of hen', 50 | 'a fox is a high fox', 51 | 'orange fox is high and blue and blue' 52 | ] 53 | >>> 54 | >>> 55 | >>> 56 | >>> splitted = recs.map(lambda x: x.split(" ")) 57 | >>> splitted.count() 58 | 8 59 | >>> splitted.collect() 60 | [ 61 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped'], 62 | ['fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped'], 63 | ['oh', 'no'], 64 | ['fox', 'of', 'blue', 'jumped'], 65 | ['oh', 'boy'], 66 | ['a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen'], 67 | ['a', 'fox', 'is', 'a', 'high', 'fox'], 68 | ['orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue'] 69 | ] 70 | >>> 71 | >>> 72 | >>> words = splitted.flatMap(lambda x: x) 73 | >>> words.count() 74 | 52 75 | >>> words.collect() 76 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'oh', 'no', 'fox', 'of', 'blue', 'jumped', 'oh', 'boy', 'a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen', 'a', 'fox', 'is', 'a', 'high', 'fox', 'orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue'] 77 | >>> 78 | >>> 79 | >>> pairs = words.map(lambda x : (x, 1)) 80 | >>> pairs.collect() 81 | [('a', 1), ('Fox', 1), ('jumped', 1), ('high', 1), ('and', 1), ('high', 1), ('and', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('oh', 1), ('no', 1), ('fox', 1), ('of', 1), ('blue', 1), ('jumped', 1), ('oh', 1), ('boy', 1), ('a', 1), ('Fox', 1), ('is', 1), ('a', 1), ('red', 1), ('fox', 1), ('of', 1), ('hen', 1), ('a', 1), ('fox', 1), ('is', 1), ('a', 1), ('high', 1), ('fox', 1), ('orange', 1), ('fox', 1), ('is', 1), ('high', 1), ('and', 1), ('blue', 1), ('and', 1), ('blue', 1)] 82 | >>> 83 | >>> 84 | >>> freq = pairs.reduceByKey(lambda a, b: a+b) 85 | >>> 86 | >>> freq.collect() 87 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)] 88 | >>> 89 | >>> 90 | >>> 
grouped = pairs.groupByKey() 91 | >>> grouped.collect() 92 | [ 93 | ('Fox', ), 94 | ('high', ), 95 | ('of', ), 96 | ('oh', ), 97 | ('no', ), 98 | ('boy', ), 99 | ('is', ), 100 | ('hen', ), 101 | ('orange', ), 102 | ('a', ), 103 | ('jumped', ), 104 | ('and', ), 105 | ('fox', ), 106 | ('red', ), 107 | ('blue', ) 108 | ] 109 | >>> 110 | >>> grouped.mapValues(lambda iter: list(iter)).collect() 111 | [ 112 | ('Fox', [1, 1]), 113 | ('high', [1, 1, 1, 1]), 114 | ('of', [1, 1, 1, 1, 1]), 115 | ('oh', [1, 1]), 116 | ('no', [1]), 117 | ('boy', [1]), 118 | ('is', [1, 1, 1]), 119 | ('hen', [1]), 120 | ('orange', [1]), 121 | ('a', [1, 1, 1, 1, 1]), 122 | ('jumped', [1, 1, 1, 1, 1, 1, 1]), 123 | ('and', [1, 1, 1, 1, 1]), 124 | ('fox', [1, 1, 1, 1, 1, 1, 1, 1]), 125 | ('red', [1, 1, 1, 1]), 126 | ('blue', [1, 1, 1]) 127 | ] 128 | >>> freq2 = grouped.mapValues(lambda iter: sum(iter)) 129 | >>> freq2.collect() 130 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)] 131 | >>> 132 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-05.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxy.txt 2 | a fox jumped and jumped 3 | red fox jumped high 4 | a red high fox jumped and jumped 5 | red fox is red 6 | 7 | $ ./bin/pyspark 8 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 9 | [Clang 6.0 (clang-600.0.57)] on darwin 10 | Type "help", "copyright", "credits" or "license" for more information. 11 | 12 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 13 | Setting default log level to "WARN". 14 | To adjust logging level use sc.setLogLevel(newLevel). 15 | For SparkR, use setLogLevel(newLevel). 16 | Welcome to 17 | ____ __ 18 | / __/__ ___ _____/ /__ 19 | _\ \/ _ \/ _ `/ __/ '_/ 20 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 21 | /_/ 22 | 23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 24 | SparkSession available as 'spark'. 25 | >>> 26 | >>> 27 | >>> numbers = [1, 2, 3, 4, 5, 6, 10] 28 | >>> numbers 29 | [1, 2, 3, 4, 5, 6, 10] 30 | >>> 31 | >>> 32 | >>> spark 33 | 34 | 35 | >>># create a new RDD from a Python collection named numbers 36 | >>> rdd_numbers = spark.sparkContext.parallelize(numbers) 37 | >>> rdd_numbers.count() 38 | 7 39 | 40 | >>> rdd_numbers.collect() 41 | [1, 2, 3, 4, 5, 6, 10] 42 | >>> # rdd_numbers : RDD[Integer] 43 | ... 
44 | >>> total = rdd_numbers.reduce(lambda x, y: x+y) 45 | >>> total 46 | 31 47 | 48 | >>># create a new RDD from rdd_numbers 49 | >>> tuples2 = rdd_numbers.map(lambda x: (x, x+1)) 50 | >>> tuples2.count() 51 | 7 52 | >>> tuples2.collect() 53 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (10, 11)] 54 | >>> 55 | >>> 56 | >>> input_path = '/tmp/foxy.txt' 57 | >>># create a new RDD[String] from a given text file 58 | >>> recs = spark.sparkContext.textFile(input_path) 59 | >>> recs.collect() 60 | [ 61 | 'a fox jumped and jumped', 62 | 'red fox jumped high', 63 | 'a red high fox jumped and jumped', 64 | 'red fox is red' 65 | ] 66 | >>> recs.count() 67 | 4 68 | >>> # recs : RDD[String] 69 | 70 | >>># create a new RDD[(String, Integer)] 71 | >>> recs_length = recs.map(lambda x : (x, len(x))) 72 | >>> recs_length.collect() 73 | [ 74 | ('a fox jumped and jumped', 23), 75 | ('red fox jumped high', 19), 76 | ('a red high fox jumped and jumped', 32), 77 | ('red fox is red', 14) 78 | ] 79 | >>> # recs_length : RDD[(String, Integer)] 80 | 81 | >>># keep only records if their lengt is greater than 20 82 | >>> recs_gt_20 = recs.filter(lambda x: len(x) > 20) 83 | >>> 84 | >>> recs_gt_20.collect() 85 | [ 86 | 'a fox jumped and jumped', 87 | 'a red high fox jumped and jumped' 88 | ] 89 | >>> recs_gt_20.count() 90 | 2 -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-07.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 6 | Setting default log level to "WARN". 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> spark 18 | 19 | >>> 20 | >>> 21 | >>> 22 | >>> # create RDD[(String, Integer)] 23 | ... 
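The commands that follow sum values per key in two ways: reduceByKey(), and groupByKey() followed by mapValues(sum). Both give the same totals, but reduceByKey() combines values inside each partition before shuffling, so it generally moves less data. A small side-by-side sketch (the sample pairs echo the ones used below):

kv = spark.sparkContext.parallelize([('alex', 10), ('alex', 20), ('bob', 100)])

totals_1 = kv.reduceByKey(lambda x, y: x + y)   # combines values inside each partition first
totals_2 = kv.groupByKey().mapValues(sum)       # groups all values per key, then sums them
print(sorted(totals_1.collect()))               # [('alex', 30), ('bob', 100)]
print(sorted(totals_2.collect()))               # [('alex', 30), ('bob', 100)]
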
24 | >>> key_value_pairs = 25 | [ 26 | ('alex', 10), 27 | ('alex', 20), 28 | ('alex', 30), 29 | ('bob', 100), 30 | ('bob', 200), 31 | ('zazo', 7) 32 | ] 33 | 34 | >>> # create an RDD[(String, Integer)] from a python collection 35 | >>> key_value = spark.sparkContext.parallelize(key_value_pairs) 36 | >>> key_value.count() 37 | 6 38 | >>> key_value.collect() 39 | [ 40 | ('alex', 10), 41 | ('alex', 20), 42 | ('alex', 30), 43 | ('bob', 100), 44 | ('bob', 200), 45 | ('zazo', 7) 46 | ] 47 | >>> 48 | >>> 49 | >>># use the reduceByKey() transformation 50 | >>> sum_of_values_per_key = key_value.reduceByKey(lambda x, y: x+y) 51 | >>> 52 | >>> sum_of_values_per_key.count() 53 | 3 54 | >>> sum_of_values_per_key.collect() 55 | [ 56 | ('bob', 300), 57 | ('alex', 60), 58 | ('zazo', 7) 59 | ] 60 | >>> 61 | >>> 62 | >>> 63 | >>> filtered = sum_of_values_per_key.filter(lambda x: x[1] > 10) 64 | >>> filtered.collect() 65 | [('bob', 300), ('alex', 60)] 66 | >>> 67 | >>> 68 | >>> key_value.collect() 69 | [ 70 | ('alex', 10), 71 | ('alex', 20), 72 | ('alex', 30), 73 | ('bob', 100), 74 | ('bob', 200), 75 | ('zazo', 7) 76 | ] 77 | >>> 78 | >>> grouped = key_value.groupByKey() 79 | >>> grouped.collect() 80 | [ 81 | ('bob', ), 82 | ('alex', ), 83 | ('zazo', ) 84 | ] 85 | >>> grouped.mapValues(lambda v : list(v)).collect() 86 | [ 87 | ('bob', [100, 200]), 88 | ('alex', [10, 20, 30]), 89 | ('zazo', [7]) 90 | ] 91 | >>> sum_of_values_per_key_2 = grouped.mapValues(lambda values: sum(values)) 92 | >>> sum_of_values_per_key_2.collect() 93 | [ 94 | ('bob', 300), 95 | ('alex', 60), 96 | ('zazo', 7) 97 | ] 98 | >>> 99 | >>> 100 | >>> pairs = [('a', 10), ('a', 100), ('a', 200), ('b', 10)] 101 | >>> rdd = spark.sparkContext.parallelize(pairs) 102 | >>> 103 | >>> rdd.collect() 104 | [('a', 10), ('a', 100), ('a', 200), ('b', 10)] 105 | >>> rdd2 = rdd.mapValues(lambda v: v+1000) 106 | >>> rdd2.collect() 107 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)] 108 | >>> 109 | >>> rdd3 = rdd.map(lambda x: x[1]+1000) 110 | >>> rdd3.collect() 111 | [1010, 1100, 1200, 1010] 112 | >>> 113 | >>> 114 | >>> rdd3 = rdd.map(lambda x: (x[0], x[1]+1000)) 115 | >>> rdd3.collect() 116 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)] 117 | >>> 118 | >>> 119 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob'] ] 120 | >>> rdd = spark.sparkContext.parallelize(data) 121 | >>> rdd.collect() 122 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']] 123 | >>> rdd.count() 124 | 5 125 | >>> flattened = rdd.flatMap(lambda x: x) 126 | >>> flattened.count() 127 | 6 128 | >>> flattened.collect() 129 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 130 | >>> mapped = rdd.map(lambda x: x) 131 | >>> mapped.count() 132 | 5 133 | >>> mapped.collect() 134 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']] 135 | >>> 136 | >>> 137 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ] 138 | >>> flattened2 = rdd.flatMap(lambda x: x) 139 | >>> flattened2.collect() 140 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 141 | >>> 142 | >>> 143 | >>> 144 | >>> data2 = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ] 145 | >>> data2 146 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')] 147 | >>> rdd2 = spark.sparkContext.parallelize(data2) 148 | >>> 149 | >>> 150 | >>> rdd2.collect() 151 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')] 152 | >>> rdd2.count() 153 | 5 154 | >>> flattened2 = rdd2.flatMap(lambda x: x) 155 | >>> flattened2.collect() 156 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 157 | >>> 158 | >>> 159 | >>> data3 = [ ['a', 'b', 'c'], ['z'], [], [], 
'alex', 'bob' ] 160 | >>> rdd3 = spark.sparkContext.parallelize(data3) 161 | >>> flattened3 = rdd3.flatMap(lambda x: x) 162 | >>> flattened3.collect() 163 | ['a', 'b', 'c', 'z', 'a', 'l', 'e', 'x', 'b', 'o', 'b'] 164 | >>> 165 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-12.txt: -------------------------------------------------------------------------------- 1 | $ ls -l /tmp/data/ 2 | -rw-r--r-- 1 mparsian wheel 72 Oct 12 20:00 file1 3 | -rw-r--r-- 1 mparsian wheel 94 Oct 12 20:01 file2 4 | -rw-r--r-- 1 mparsian wheel 35 Oct 12 20:01 file3 5 | 6 | $ cat /tmp/data/file1 7 | file1: this is record 1 8 | file1: this is record 2 9 | file1: this is record 3 10 | 11 | $ cat /tmp/data/file2 12 | file2: this is record 1 13 | file2: this is record 2 14 | file2: this is fox 3 15 | file2: this is it 4 16 | 17 | $ cat /tmp/data/file3 18 | file3: record 1 19 | file3: ewcord 2222 20 | 21 | 22 | $ ./bin/pyspark 23 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 24 | [Clang 6.0 (clang-600.0.57)] on darwin 25 | Type "help", "copyright", "credits" or "license" for more information. 26 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 27 | Setting default log level to "WARN". 28 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 29 | Welcome to 30 | ____ __ 31 | / __/__ ___ _____/ /__ 32 | _\ \/ _ \/ _ `/ __/ '_/ 33 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 34 | /_/ 35 | 36 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 37 | SparkSession available as 'spark'. 38 | 39 | 40 | >>> input_path = '/tmp/data' 41 | >>> 42 | >>> recs = spark.sparkContext.textFile(input_path) 43 | >>> recs.count() 44 | 9 45 | >>> recs.collect() 46 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3'] 47 | >>> 48 | >>> union2 = recs.union(recs) 49 | >>> union2.count() 50 | 18 51 | >>> union2.collect() 52 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3', 'file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3'] 53 | 54 | 55 | 56 | >>> records = [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 57 | >>> 58 | >>> 59 | >>> records 60 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 61 | >>> 62 | >>> recs_rdd = spark.sparkContext.parallelize(records) 63 | >>> recs_rdd.count() 64 | 6 65 | >>> recs_rdd.collect() 66 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 67 | >>> # recs_rdd: RDD[(String, Integer)] 68 | ... 69 | >>> sum_per_key = recs_rdd.reduceByKey(lambda x, y: x+y) 70 | >>> sum_per_key.count() 71 | 2 72 | >>> sum_per_key.collect() 73 | [('B', 90), ('A', 6)] 74 | >>> # avg_by_key: [('B', 30), ('A', 2)] 75 | ... 
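The rest of this session derives avg_by_key step by step; for reference, here is the whole (sum, count) pattern in one place as a minimal sketch (recs_rdd is the RDD created above):

# 1) pair every value with a count of 1:  (k, v) -> (k, (v, 1))
sum_count = recs_rdd.mapValues(lambda v: (v, 1))
# 2) add sums and counts per key
sum_count_per_key = sum_count.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
# 3) divide the sum by the count
avg_by_key = sum_count_per_key.mapValues(lambda t: t[0] / t[1])
print(avg_by_key.collect())   # [('B', 30.0), ('A', 2.0)]  (order may vary)
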
76 | >>> 77 | 78 | >>> sum_count = recs_rdd.mapValues(lambda v: (v, 1)) 79 | >>> 80 | >>> sum_count.collect() 81 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))] 82 | >>> 83 | >>> 84 | >>> sum_count1 = (10, 1) 85 | >>> sum_count2 = (20, 2) 86 | >>> # (10+20, 1+2) 87 | ... # (30, 3) 88 | ... 89 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1])) 90 | >>> sum_count_per_key.collect() 91 | [('B', (90, 3)), ('A', (6, 3))] 92 | >>> 93 | ])> avg_by_key = sum_count_per_key.mapValues(lambda sum_and_count_tuple : sum_and_count_tuple[0] / sum_and_count_tuple[1 94 | >>> avg_by_key.count() 95 | 2 96 | >>> avg_by_key.collect() 97 | [('B', 30.0), ('A', 2.0)] 98 | >>> 99 | 100 | 101 | >>> sum_count.collect() 102 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))] 103 | 104 | >>> def add_sum_count(x, y): 105 | ... sum2 = x[0] + y[0] 106 | ... count = x[1] + y[1] 107 | ... return (sum2, count) 108 | ... 109 | >>> 110 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: add_sum_count(x, y)) 111 | >>> sum_count_per_key.collect() 112 | [('B', (90, 3)), ('A', (6, 3))] 113 | >>> avg_per_key = sum_count_per_key.mapValues(lambda tuple: tuple[0] / tuple[1]) 114 | >>> avg_per_key.collect() 115 | [('B', 30.0), ('A', 2.0)] 116 | >>> 117 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-19.txt: -------------------------------------------------------------------------------- 1 | mapPartitions() Explained. 2 | 3 | 4 | ./bin/pyspark 5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 6 | [Clang 6.0 (clang-600.0.57)] on darwin 7 | Type "help", "copyright", "credits" or "license" for more information. 8 | 20/10/19 20:19:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 9 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 10 | Setting default log level to "WARN". 11 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 12 | Welcome to 13 | ____ __ 14 | / __/__ ___ _____/ /__ 15 | _\ \/ _ \/ _ `/ __/ '_/ 16 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 17 | /_/ 18 | 19 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 20 | SparkSession available as 'spark'. 21 | >>> input_path = '/Users/mparsian/numbers' 22 | >>> rdd = spark.sparkContext.textFile(input_path) 23 | >>> 24 | >>> rdd.collect() 25 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 26 | >>> num_of_partitions = rdd.numPartitions() 27 | Traceback (most recent call last): 28 | File "", line 1, in 29 | AttributeError: 'RDD' object has no attribute 'numPartitions' 30 | >>> num_of_partitions = rdd.getNumPartitions() 31 | >>> 32 | >>> num_of_partitions 33 | 2 34 | >>> rdd = spark.sparkContext.textFile(input_path, 4) 35 | >>> num_of_partitions = rdd.getNumPartitions() 36 | >>> num_of_partitions 37 | 5 38 | >>> rdd = spark.sparkContext.textFile(input_path, 4) 39 | >>> num_of_partitions = rdd.getNumPartitions() 40 | >>> num_of_partitions 41 | 5 42 | >>> def debug(iterator): 43 | ... elements = [] 44 | ... for x in iterator: 45 | ... elements.append(x) 46 | ... print("elements="+ str(elements)) 47 | ... 
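A side note before debug() is used below: foreachPartition() is an action run only for its side effects, so in local mode the print() output shows up in the shell, while on a cluster it ends up in the executor logs. An alternative way to inspect partition contents is glom(), which turns each partition into a list. A small sketch (not part of this session):

rdd_small = spark.sparkContext.parallelize(range(1, 11), 3)
print(rdd_small.glom().collect())
# e.g. [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]] -- one inner list per partition
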
48 | >>> 49 | >>> rdd.foreachPartition(debug) 50 | elements=['78', '79', '60', '56', '45'] 51 | elements=[] 52 | elements=['11', '14', '4', '3', '8', '9'] 53 | elements=['3', '5', '55', '44', '9', '3', '66'] 54 | elements=['77', '88', '34', '23'] 55 | >>> 56 | >>> 57 | >>> rdd = spark.sparkContext.textFile(input_path) 58 | >>> rdd.colect() 59 | Traceback (most recent call last): 60 | File "", line 1, in 61 | AttributeError: 'RDD' object has no attribute 'colect' 62 | >>> rdd.collect() 63 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 64 | >>> num_of_partitions = rdd.getNumPartitions() 65 | >>> num_of_partitions 66 | 2 67 | >>> rdd.foreachPartition(debug) 68 | elements=['14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 69 | elements=['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11'] 70 | >>> 71 | >>> 72 | >>> 73 | >>> def find_min_max(partition): 74 | ... first_time = False 75 | ... for n in partition: 76 | ... if first_time == False: 77 | ... min2 = n 78 | ... max2 = n 79 | ... first_time == True 80 | ... else: 81 | ... min2 = min(n, min2) 82 | ... max2 = max(n, max2) 83 | ... return [(min2, max2)] 84 | ... 85 | >>> 86 | >>> target = rdd.mapPartitions(find_min_max) 87 | >>> target.collect() 88 | [('11', '11'), ('45', '45')] 89 | >>> 90 | >>> rdd_integer = rdd.map(lambda n : int(n)) 91 | >>> rdd_integer.collect() 92 | [3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11, 14, 4, 3, 8, 9, 78, 79, 60, 56, 45] 93 | >>> target = rdd.mapPartitions(find_min_max) 94 | >>> target.collect() 95 | [('11', '11'), ('45', '45')] 96 | >>> 97 | >>> 98 | >>> target = rdd_integer.mapPartitions(find_min_max) 99 | >>> target.collect() 100 | [(11, 11), (45, 45)] 101 | >>> 102 | >>> 103 | >>> def find_min_max(partition): 104 | ... first_time = False 105 | ... for n in partition: 106 | ... if first_time == False: 107 | ... min2 = n 108 | ... max2 = n 109 | ... first_time = True 110 | ... else: 111 | ... min2 = min(n, min2) 112 | ... max2 = max(n, max2) 113 | ... return [(min2, max2)] 114 | ... 115 | ... 116 | >>> def debug(iterator): 117 | ... elements = [] 118 | ... for x in iterator: 119 | ... elements.append(x) 120 | ... print("elements="+ str(elements)) 121 | ... 122 | >>> 123 | >>> target = rdd_integer.mapPartitions(find_min_max) 124 | 125 | >>> target.collect() 126 | [(3, 88), (3, 79)] 127 | >>> rdd_integer.foreachPartition(debug) 128 | elements=[14, 4, 3, 8, 9, 78, 79, 60, 56, 45] 129 | elements=[3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11] 130 | >>> target 131 | PythonRDD[14] at collect at :1 132 | >>> final_min_max = target.reduce(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) 133 | >>> final_min_max 134 | (3, 88) 135 | >>> 136 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-01-19.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 21/01/19 20:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | >>> 20 | >>> tuples2 = [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 21 | >>> tuples2 22 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 23 | >>> 24 | >>> 25 | >>> pairs_rdd = spark.sparkContext.parallelize(tuples2) 26 | >>> pairs_rdd 27 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262 28 | 29 | >>> pairs_rdd.collect() 30 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 31 | >>> pairs_rdd.count() 32 | 5 33 | >>> tuples33 = [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 34 | >>> tuples33 35 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 36 | >>> rdd = spark.sparkContext.parallelize(tuples33) 37 | >>> 38 | >>> rdd.collect() 39 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 40 | >>> rdd.count() 41 | 3 42 | >>> 43 | >>> 44 | >>> 45 | >>> pairs_rdd.collect() 46 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 47 | 48 | >>> new_rdd = pairs_rdd.map(lambda x: (x[0], x[1], 2*int(x[1]))) 49 | >>> new_rdd.collect() 50 | [('alex', 4, 8), ('alex', 5, 10), ('bob', 40, 80), ('bob', 50, 100), ('bob', 4, 8)] 51 | >>> 52 | >>> columns = ["name", "age", "salary"] 53 | >>> some_tuples = [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)] 54 | >>> df = spark.createDataFrame(some_tuples, columns) 55 | >>> df.show() 56 | +----+---+-------+ 57 | |name|age| salary| 58 | +----+---+-------+ 59 | |alex| 40| 80000| 60 | |alex| 50|1000000| 61 | | bob| 40|8000000| 62 | | bob| 50| 10000| 63 | +----+---+-------+ 64 | 65 | >>> df.printSchema() 66 | root 67 | |-- name: string (nullable = true) 68 | |-- age: long (nullable = true) 69 | |-- salary: long (nullable = true) 70 | 71 | >>> rdd = spark.sparkContext.parallelize(some_tuples) 72 | >>> rdd.collect() 73 | [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)] 74 | >>> rdd.take(2) 75 | [('alex', 40, 80000), ('alex', 50, 1000000)] 76 | >>> 77 | 78 | >>> 79 | >>> data = ["alex,20", "alex,30", "bob,40", "bob,50", "bob,60"] 80 | >>> data 81 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60'] 82 | >>> 83 | >>> 84 | >>> rdd = spark.sparkContext.parallelize(data) 85 | >>> rdd.collect() 86 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60'] 87 | >>> rdd.count() 88 | 5 89 | 90 | >>> def create_pairs(rec): 91 | ... tokens = rec.split(",") 92 | ... key = tokens[0] 93 | ... value = tokens[1] 94 | ... return (key, value) 95 | ... 96 | >>> 97 | >>> pairs = rdd.map(lambda x: create_pairs(x)) 98 | >>> pairs.collect() 99 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')] 100 | >>> pairs.count() 101 | 5 102 | >>> pairs = rdd.map(create_pairs) 103 | >>> pairs.collect() 104 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')] 105 | >>> pairs.count() 106 | 5 107 | >>> 108 | >>> sum_by_key = pairs.reduceByKey(lambda x, y: x+y) 109 | >>> sum_by_key.collect() 110 | [('bob', '405060'), ('alex', '2030')] 111 | >>> 112 | >>> def create_pair(rec): 113 | ... tokens = rec.split(",") 114 | ... key = tokens[0] 115 | ... value = int(tokens[1]) 116 | ... return (key, value) 117 | ... 
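A side note on the DataFrame built earlier in this session: a DataFrame can be converted back to an RDD of Row objects via df.rdd, and Row fields are accessible by column name. A small sketch (not from the session; it reuses some_tuples and columns as defined above):

df = spark.createDataFrame(some_tuples, columns)                      # as built earlier in this session
name_salary = df.rdd.map(lambda row: (row['name'], row['salary']))    # DataFrame -> RDD[Row] -> RDD[tuple]
print(name_salary.collect())
# [('alex', 80000), ('alex', 1000000), ('bob', 8000000), ('bob', 10000)]
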
118 | >>> 119 | 120 | >>> rdd2 = rdd.map(lambda x: create_pair(x)) 121 | >>> rdd2.collect() 122 | [('alex', 20), ('alex', 30), ('bob', 40), ('bob', 50), ('bob', 60)] 123 | >>> sum_by_key = rdd2.reduceByKey(lambda x, y: x+y) 124 | >>> sum_by_key.collect() 125 | [('bob', 150), ('alex', 50)] 126 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-01-26.txt: -------------------------------------------------------------------------------- 1 | Spark's Mapper Transformations: 2 | 3 | # map: 1 -> 1 4 | 5 | # flatMap: 1 -> Many 6 | 7 | # mapPartitions: partition -> 1 (Many to 1) 8 | 9 | Many = 0, 1, 2, 3, 4, ... 10 | partition = many elements 11 | 12 | $ ./bin/pyspark 13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 14 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 15 | Welcome to 16 | ____ __ 17 | / __/__ ___ _____/ /__ 18 | _\ \/ _ \/ _ `/ __/ '_/ 19 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 20 | /_/ 21 | 22 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 23 | SparkSession available as 'spark'. 24 | >>> 25 | >>> 26 | >>> spark 27 | 28 | >>> sc = spark.sparkContext 29 | >>> sc 30 | 31 | >>> 32 | >>> 33 | >>> data = [ [1, 2, 3], [4, 5, 6, 7] ] 34 | >>> data 35 | [[1, 2, 3], [4, 5, 6, 7]] 36 | >>> data[0] 37 | [1, 2, 3] 38 | >>> data[1] 39 | [4, 5, 6, 7] 40 | >>> 41 | >>> rdd = spark.sparkContext.parallelize(data) 42 | >>> rdd.collect() 43 | [[1, 2, 3], [4, 5, 6, 7]] 44 | >>> rdd.count() 45 | 2 46 | >>> 47 | >>> rdd_mapped = rdd.map(lambda x: x) 48 | >>> rdd_mapped.collect() 49 | [[1, 2, 3], [4, 5, 6, 7]] 50 | >>> rdd_mapped.count() 51 | 2 52 | >>> 53 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x) 54 | >>> rdd_flat_mapped.collect() 55 | [1, 2, 3, 4, 5, 6, 7] 56 | >>> rdd_flat_mapped.count() 57 | 7 58 | >>> data = [ [1, 2, 3], [], [4, 5, 6, 7], [], [9] ] 59 | >>> data 60 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 61 | >>> data[0] 62 | [1, 2, 3] 63 | >>> data[1] 64 | [] 65 | >>> data[3] 66 | [] 67 | >>> data[2] 68 | [4, 5, 6, 7] 69 | >>> data[3] 70 | [] 71 | >>> data[4] 72 | [9] 73 | >>> rdd = spark.sparkContext.parallelize(data) 74 | >>> rdd.collect() 75 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 76 | >>> rdd.count() 77 | 5 78 | >>> rdd_mapped = rdd.map(lambda x: x) 79 | >>> rdd_mapped.collect() 80 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 81 | >>> rdd_mapped.count() 82 | 5 83 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x) 84 | >>> rdd_flat_mapped.collect() 85 | [1, 2, 3, 4, 5, 6, 7, 9] 86 | >>> rdd_flat_mapped.count() 87 | 8 88 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-12.txt: -------------------------------------------------------------------------------- 1 | ~/spark-3.1.1 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 21/04/12 20:59:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | Spark context Web UI available at http://10.0.0.93:4040 18 | Spark context available as 'sc' (master = local[*], app id = local-1618286379380). 19 | SparkSession available as 'spark'. 20 | >>> spark 21 | 22 | >>> 23 | >>> 24 | >>> numbers = [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 25 | >>> numbers 26 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 27 | 28 | >>># create an RDD[Integer] from a collection 29 | >>># RDD = Resilient Distributed Dataset 30 | >>> rdd = spark.sparkContext.parallelize(numbers) 31 | >>> rdd.collect() 32 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 33 | >>> rdd.count() 34 | 10 35 | 36 | >>># fund sum of all numbers in rdd as (RDD[Integer]) 37 | >>> total = rdd.reduce(lambda x, y: x+y) 38 | >>> total 39 | 96 40 | 41 | >>>#apply a filter: find all positive numbers 42 | >>> positives = rdd.filter(lambda x : x > 0) 43 | >>> positives.collect() 44 | [1, 2, 3, 6, 7, 8, 99, 10] 45 | >>> 46 | >>># increment every element by 1000 47 | >>> rdd2 = rdd.map(lambda x : x+1000) 48 | >>> rdd2.collect() 49 | [1001, 1002, 1003, 1006, 1007, 1008, 1099, 1010, 990, 970] 50 | >>> 51 | >>># create (key, value) pairs 52 | >>> data = [("m1", 4), ("m1", 5), ("m2", 3), ("m2", 4), ("m2", 5), ("m3", 2), ("m3", 4)] 53 | >>> data 54 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)] 55 | 56 | >>> 57 | >>> pairs = spark.sparkContext.parallelize(data) 58 | >>> pairs.collect() 59 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)] 60 | 61 | >>># keep elements if their associated value is Greater Than 3 62 | >>># x[0] refers to key 63 | >>># x[1] refers to value 64 | >>> rating45 = pairs.filter(lambda x : x[1] > 3) 65 | >>> rating45.collect() 66 | [('m1', 4), ('m1', 5), ('m2', 4), ('m2', 5), ('m3', 4)] 67 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-14.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxdata.txt 2 | a red fox jumped of high 3 | fox jumped over a high fence 4 | red of fox jumped 5 | 6 | 7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 8 | ... 9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1618456720582). 20 | SparkSession available as 'spark'. 21 | >>> 22 | >>> 23 | >>> 24 | >>> spark 25 | 26 | 27 | >>> input_path = "/tmp/foxdata.txt" 28 | >>> input_path 29 | '/tmp/foxdata.txt' 30 | >>> # Read input path and create an RDD[String] 31 | ... 32 | >>> records = spark.sparkContext.textFile(input_path) 33 | >>> records 34 | /tmp/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 35 | >>> 36 | >>> records.collect() 37 | [ 38 | 'a red fox jumped of high', 39 | 'fox jumped over a high fence', 40 | 'red of fox jumped' 41 | ] 42 | >>> records.count() 43 | 3 44 | >>> # tokenize records and create RDD[ [String] ] 45 | ... 
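The commands below tokenize in two steps (map() into lists of words, then flatMap() to flatten them); a single flatMap() with split() does both at once. A minimal sketch using the records RDD read just above:

words = records.flatMap(lambda line: line.split(" "))   # RDD[String], one word per element
print(words.count())    # 16 for the /tmp/foxdata.txt records read above
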
46 | >>> tokenizd = records.map(lambda record: record.split(" ")) 47 | >>> tokenizd.collect() 48 | [ 49 | ['a', 'red', 'fox', 'jumped', 'of', 'high'], 50 | ['fox', 'jumped', 'over', 'a', 'high', 'fence'], 51 | ['red', 'of', 'fox', 'jumped'] 52 | ] 53 | >>> tokenizd.count() 54 | 3 55 | >>> pairs = tokenizd.map(lambda word : (word, 1)) 56 | >>> pairs.collect() 57 | [ 58 | (['a', 'red', 'fox', 'jumped', 'of', 'high'], 1), 59 | (['fox', 'jumped', 'over', 'a', 'high', 'fence'], 1), 60 | (['red', 'of', 'fox', 'jumped'], 1) 61 | ] 62 | >>> 63 | >>> words = tokenizd.flatMap(lambda arr: arr) 64 | >>> words.collect() 65 | ['a', 'red', 'fox', 'jumped', 'of', 'high', 'fox', 'jumped', 'over', 'a', 'high', 'fence', 'red', 'of', 'fox', 'jumped'] 66 | >>> words.count() 67 | 16 68 | >>> # words : RDD[String] 69 | ... 70 | >>> key_value_pairs = words.map(lambda word: (word, 1)) 71 | >>> key_value_pairs.collect() 72 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)] 73 | >>> 74 | >>> # key_value_pairs: RDD[(String, Integer)] 75 | ... 76 | >>> 77 | >>> grouped = key_value_pairs.groupByKey() 78 | >>> grouped.collect() 79 | [ 80 | ('of', ), 81 | ('high', ), 82 | ('fence', ), 83 | ('a', ), 84 | ('red', ), 85 | ('fox', ), 86 | ('jumped', ), 87 | ('over', ) 88 | ] 89 | >>> 90 | >>> debugged = grouped.mapValues(lambda values: list(values)) 91 | >>> debugged.collect() 92 | [ 93 | ('of', [1, 1]), 94 | ('high', [1, 1]), 95 | ('fence', [1]), 96 | ('a', [1, 1]), 97 | ('red', [1, 1]), 98 | ('fox', [1, 1, 1]), 99 | ('jumped', [1, 1, 1]), 100 | ('over', [1]) 101 | ] 102 | >>> 103 | >>> 104 | >>> frequency = grouped.mapValues(lambda values: sum(values)) 105 | >>> frequency.collect() 106 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] 107 | >>> 108 | >>> 109 | >>> 110 | >>> key_value_pairs.collect() 111 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)] 112 | >>> 113 | >>> 114 | >>> 115 | >>> reduced = key_value_pairs.reduceByKey(lambda x, y: x+y) 116 | >>> reduced.collect() 117 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] 118 | >>> 119 | >>> rdd7 = reduced.mapValues(lambda x: x+100) 120 | >>> rdd7.collect() 121 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)] 122 | 123 | >>> rdd77 = reduced.map(lambda x: x[1]+100) 124 | >>> rdd77.collect() 125 | [102, 102, 101, 102, 102, 103, 103, 101] 126 | 127 | >>> rdd77 = reduced.map(lambda x: (x[0], x[1]+100)) 128 | >>> rdd77.collect() 129 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)] 130 | >>> 131 | 132 | >>># get number of partitions for rdd77 133 | >>> rdd77.getNumPartitions() 134 | 2 135 | >>> 136 | >>> 137 | >>> KV = [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 138 | >>> KV 139 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 140 | >>> rdd = spark.sparkContext.parallelize(KV) 141 | >>> 142 | >>> rdd.collect() 143 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 144 | >>> rdd.count() 145 | 7 146 | >>> 147 | >>> filtered1 = rdd.filter(lambda x : x[1] > 
10) 148 | >>> filtered1.collect() 149 | [('y', 50), ('y', 60), ('y', 70)] 150 | >>> filtered2 = rdd.filter(lambda x : x[1] < 10) 151 | >>> filtered2.collect() 152 | [('x', 3), ('x', 5), ('x', 8), ('z', 3)] 153 | >>> 154 | >>> 155 | >>> added = rdd.reduceByKey(lambda a, b: a+b) 156 | >>> added.collect() 157 | [('y', 180), ('z', 3), ('x', 16)] 158 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-19.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 6 | 21/04/19 20:20:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 8 | Setting default log level to "WARN". 9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1618888841845). 20 | SparkSession available as 'spark'. 21 | >>> 22 | >>># Create an RDD[(String, Integer)] as rdd 23 | >>> kv =[('a', 3), ('a', 4), ('a', 5), ('b', 30),('b', 40),('b', 50),('z', 3)] 24 | >>> rdd = spark.sparkContext.parallelize(kv) 25 | >>> 26 | >>> 27 | >>> rdd.count() 28 | 7 29 | >>> rdd.collect() 30 | [('a', 3), ('a', 4), ('a', 5), ('b', 30), ('b', 40), ('b', 50), ('z', 3)] 31 | >>> def mapfun1(e): 32 | ... k = e[0] 33 | ... v = e[1] 34 | ... return (k, (v, v+5)) 35 | ... 36 | >>># Create an RDD[(String, (Integer, Integer))] as rdd2 37 | >>># rdd2 has key type of String and value type of (Integer, Integer) 38 | >>> rdd2 = rdd.map(mapfun1) 39 | >>> 40 | >>> rdd2.collect() 41 | [('a', (3, 8)), ('a', (4, 9)), ('a', (5, 10)), ('b', (30, 35)), ('b', (40, 45)), ('b', (50, 55)), ('z', (3, 8))] 42 | >>> rdd2.count() 43 | 7 44 | >>> # rdd: RDD[(String, Integer)] 45 | ... 
46 | >>> # rdd2: RDD[(String, (Integer, Integer)] 47 | >>> 48 | >>># Create an RDD[(String, Integer)] as rdd3 49 | >>> rdd3 = rdd2.map(lambda x: (x[0], x[1][0]+x[1][1])) 50 | >>> rdd3.count() 51 | 7 52 | >>> rdd3.collect() 53 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)] 54 | >>> 55 | >>> rdd31 = rdd2.mapValues(lambda v: v[0]+v[1]) 56 | >>> rdd31.count() 57 | 7 58 | >>> rdd31.collect() 59 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)] 60 | >>> 61 | >>> 62 | >>> 63 | >>> strings = ["abc", "xyzt", "", "123"] 64 | >>> rdd_strings = spark.sparkContext.parallelize(strings) 65 | >>> 66 | >>> rdd_strings.count() 67 | 4 68 | >>> rdd_strings_2 = rdd_strings.flatMap(lambda v: v) 69 | >>> rdd_strings_2.collect() 70 | ['a', 'b', 'c', 'x', 'y', 'z', 't', '1', '2', '3'] 71 | >>> 72 | >>> lists = [ [1, 2, 3], [], [6,7,8,9,10], [] ] 73 | >>> rdd4 = spark.sparkContext.parallelize(lists) 74 | >>> rdd4.collect() 75 | [[1, 2, 3], [], [6, 7, 8, 9, 10], []] 76 | >>> rdd4.count() 77 | 4 78 | 79 | >>> rdd5 = rdd4.flatMap(lambda v: v) 80 | >>> rdd5.collect() 81 | [1, 2, 3, 6, 7, 8, 9, 10] 82 | >>> rdd5.count() 83 | 8 84 | >>> 85 | >>> lists = [ [7, (1,2), (2,4)], ["abc", 99], [6, (7, 7), (8, 8)], [] ] 86 | >>> rdd9 = spark.sparkContext.parallelize(lists) 87 | >>> rdd9.collect() 88 | [[7, (1, 2), (2, 4)], ['abc', 99], [6, (7, 7), (8, 8)], []] 89 | >>> rdd9.count() 90 | 4 91 | >>> rdd10 = rdd9.flatMap(lambda v: v) 92 | >>> rdd10.collect() 93 | [7, (1, 2), (2, 4), 'abc', 99, 6, (7, 7), (8, 8)] 94 | >>> 95 | >>> 96 | >>> rdd11 = rdd10.flatMap(lambda v: v) 97 | >>> rdd11.collect() 98 | 21/04/19 20:43:44 ERROR Executor: Exception in task 5.0 in stage 17.0 (TID 141) 99 | TypeError: 'int' object is not iterable 100 | 101 | >>> 102 | >>> mylist = [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)] 103 | >>> rdd = spark.sparkContext.parallelize(mylist) 104 | >>> rdd.collect() 105 | [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)] 106 | >>> rdd2 = rdd.flatMap(lambda x: x) 107 | >>> rdd2.collect() 108 | [7, 1, 2, 2, 4, 'abc', 99, 6, 7, 7, 8, 8] 109 | >>> 110 | >>> 111 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-21-mapPartitions.txt: -------------------------------------------------------------------------------- 1 | ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | ... 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 9 | /_/ 10 | 11 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 12 | Spark context Web UI available at http://10.0.0.93:4040 13 | Spark context available as 'sc' (master = local[*], app id = local-1619061713234). 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> 17 | >>> 18 | >>> nums = [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 19 | >>> nums 20 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 21 | >>> 22 | >>> 23 | 24 | >>> rdd = sc.parallelize(nums) 25 | >>> rdd.collect() 26 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 27 | >>># find the default number of partitions 28 | >>> rdd.getNumPartitions() 29 | 8 30 | >>> 31 | >>># set number of partitions explicitly to 3 32 | >>> rdd = sc.parallelize(nums, 3) 33 | >>> rdd.getNumPartitions() 34 | 3 35 | >>> def debug(partition): 36 | ... elements = [] 37 | ... for x in partition: 38 | ... elements.append(x) 39 | ... 
print("elements=", elements) 40 | ... 41 | >>> rdd.foreachPartition(debug) 42 | elements= [4, 5, 6, 7, -3] 43 | elements= [1, 2, 3, 4, -1] 44 | elements= [-1, 2, 3, 9, -1, -2] 45 | >>> 46 | >>>#define a function which handles a single partition 47 | >>> def min_max_count(partition): 48 | ... first_time = False 49 | ... local_count = 0 50 | ... for n in partition: 51 | ... local_count += 1 52 | ... if (first_time == False): 53 | ... local_min = n 54 | ... local_max = n 55 | ... first_time = True 56 | ... else: 57 | ... local_min = min(n, local_min) 58 | ... local_max = max(n, local_max) 59 | ... return [(local_min, local_max, local_count)] 60 | ... 61 | >>># Test your custom function without Spark 62 | >>> x = [1, 2, 3, -3, -6, 9, 10, 4, 5, 6] 63 | >>> result = min_max_count(x) 64 | >>> result 65 | [(-6, 10, 10)] 66 | >>> 67 | >>> rdd.foreachPartition(debug) 68 | elements= [1, 2, 3, 4, -1] 69 | elements= [-1, 2, 3, 9, -1, -2] 70 | elements= [4, 5, 6, 7, -3] 71 | >>> 72 | >>> rdd2 = rdd.mapPartitions(min_max_count) 73 | >>> rdd2.collect() 74 | [(-1, 4, 5), (-3, 7, 5), (-2, 9, 6)] 75 | 76 | >>> final_answer = rdd2.reduce(lambda x, y: ( min(x[0], y[0]), max(x[1], y[1]), x[2]+y[2]) ) 77 | >>> final_answer 78 | (-3, 9, 16) 79 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-29-min-max-avg.txt: -------------------------------------------------------------------------------- 1 | Given billions of numbers, find (minimum, maximum, average) 2 | for all numbers. 3 | 4 | I provide 2 solutions: one using tuple of 4: (minimum, maximum, sum, count) 5 | another solution using tuple of 3: (minimum, maximum, sum) 6 | 7 | 8 | $ ./bin/pyspark 9 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1619727491830). 20 | SparkSession available as 'spark'. 
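Before the interactive session, here is the tuple-of-4 idea packaged as a small standalone sketch (it assumes a SparkSession is already available as 'spark', exactly as in the shell session below, and uses the same numbers; the helper name min_max_avg is just for this sketch):

# sketch: minimum, maximum, average in a single pass over an RDD[Integer]
def min_max_avg(rdd):
    # map each number n to (minimum, maximum, sum, count) = (n, n, n, 1)
    tuple4 = rdd.map(lambda n: (n, n, n, 1))
    # pairwise reduction: keep the smaller minimum, the larger maximum, add sums and counts
    mn, mx, total, count = tuple4.reduce(
        lambda x, y: (min(x[0], y[0]), max(x[1], y[1]), x[2] + y[2], x[3] + y[3]))
    return (mn, mx, total / count)

# usage:
#   rdd = spark.sparkContext.parallelize([1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8])
#   min_max_avg(rdd)   # returns (-3, 8, 2.727272727272727)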
21 | >>> 22 | >>> 23 | >>> nums = [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8] 24 | >>> 25 | >>># Let rdd denote billions of numbers 26 | >>> rdd = spark.sparkContext.parallelize(nums) 27 | >>> rdd.collect() 28 | [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8] 29 | >>> 30 | 31 | >>># Create tuple of 4 elements as: (minimum, maximum, sum, count) 32 | >>> tuple4 = rdd.map(lambda n: (n, n, n, 1)) 33 | >>> tuple4.collect() 34 | [(1, 1, 1, 1), (2, 2, 2, 1), (3, 3, 3, 1), (-1, -1, -1, 1), (-2, -2, -2, 1), (-3, -3, -3, 1), (4, 4, 4, 1), (5, 5, 5, 1), (6, 6, 6, 1), (7, 7, 7, 1), (8, 8, 8, 1)] 35 | 36 | >>># Perform a reduction on tuple4 37 | >>> min_max_sum_count = tuple4.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2], x[3]+y[3]) ) 38 | >>> 39 | >>># Now, min_max_sum_count represents (minimum, maximum, sum, count) 40 | >>> min_max_sum_count 41 | (-3, 8, 30, 11) 42 | >>> final = (min_max_sum_count[0], min_max_sum_count[1], min_max_sum_count[2] / min_max_sum_count[3]) 43 | >>> final 44 | (-3, 8, 2.727272727272727) 45 | >>> 46 | 47 | >>># Solution using tuple of 3 48 | >>> tuple3 = rdd.map(lambda n: (n, n, n)) 49 | >>> min_max_sum = tuple3.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2]) ) 50 | >>> min_max_sum 51 | (-3, 8, 30) 52 | >>> N = rdd.count() 53 | >>> N 54 | 11 55 | >>> final = (min_max_sum[0], min_max_sum[1], min_max_sum[2] / N) 56 | >>> final 57 | (-3, 8, 2.727272727272727) -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-05-05-join.txt: -------------------------------------------------------------------------------- 1 | PySpark Documentation: Join function in PySpark: 2 | http://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.join.html 3 | 4 | $ ./bin/pyspark 5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 6 | 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 12 | /_/ 13 | 14 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 15 | Spark context Web UI available at http://10.0.0.93:4040 16 | Spark context available as 'sc' (master = local[*], app id = local-1620269740798). 17 | SparkSession available as 'spark'.
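A quick reference before the session: rdd.join(other) is an inner join on keys. For every key that appears in both RDDs it emits one (key, (value_from_x, value_from_y)) pair per combination of matching values; keys that appear in only one of the two RDDs are dropped. A minimal sketch (assuming 'sc' is the SparkContext, as in the session below):

x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3), ("d", 8)])
# only key 'a' is present in both RDDs, so 'b' and 'd' are dropped
sorted(x.join(y).collect())   # [('a', (1, 2)), ('a', (1, 3))]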
18 | >>> 19 | >>> x = spark.sparkContext.parallelize([("spark", 1), ("hadoop", 4)]) 20 | >>> x.collect() 21 | [ 22 | ('spark', 1), 23 | ('hadoop', 4) 24 | ] 25 | >>> 26 | >>> y = spark.sparkContext.parallelize([("spark", 2), ("hadoop", 5)]) 27 | >>> y.collect() 28 | [ 29 | ('spark', 2), 30 | ('hadoop', 5) 31 | ] 32 | >>> 33 | >>> joined = x.join(y) 34 | >>> joined.collect() 35 | [ 36 | ('spark', (1, 2)), 37 | ('hadoop', (4, 5)) 38 | ] 39 | >>> 40 | >>> 41 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("c", 4)]) 42 | >>> x.collect() 43 | [('a', 1), ('b', 4), ('c', 4)] 44 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("d", 8)]) 45 | >>> y.collect() 46 | [('a', 2), ('a', 3), ('a', 7), ('d', 8)] 47 | >>> 48 | >>> joined = x.join(y) 49 | >>> joined.collect() 50 | [('a', (1, 2)), ('a', (1, 3)), ('a', (1, 7))] 51 | >>> 52 | >>> 53 | >>> joined.count() 54 | 3 55 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("b", 5), ("c", 4)]); 56 | >>> x.collect() 57 | [('a', 1), ('b', 4), ('b', 5), ('c', 4)] 58 | >>> 59 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("b", 61), ("b", 71), ("d", 8)]) 60 | >>> y.collect() 61 | [('a', 2), ('a', 3), ('a', 7), ('b', 61), ('b', 71), ('d', 8)] 62 | >>> joined = x.join(y) 63 | >>> joined.collect() 64 | [ 65 | ('b', (4, 61)), 66 | ('b', (4, 71)), 67 | ('b', (5, 61)), 68 | ('b', (5, 71)), 69 | ('a', (1, 2)), 70 | ('a', (1, 3)), 71 | ('a', (1, 7)) 72 | ] 73 | >>> 74 | >>>#pyspark.RDD.cartesian 75 | >>>#RDD.cartesian(other) 76 | >>>#Return the Cartesian product of this RDD and another one, 77 | >>>#that is, the RDD of all pairs of elements (a, b) where a is 78 | >>>#in self and b is in other. 79 | >>># Examples 80 | 81 | >>> 82 | >>> rdd = spark.sparkContext.parallelize([1, 2]) 83 | >>> sorted(rdd.cartesian(rdd).collect()) 84 | [(1, 1), (1, 2), (2, 1), (2, 2)] 85 | 86 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-11-filter-map-flatMap.txt: -------------------------------------------------------------------------------- 1 | Understand filter(), map(), and flatMap() 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | Spark context Web UI available at http://10.0.0.94:4040 15 | Spark context available as 'sc' (master = local[*], app id = local-1634007457887). 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> 19 | >>> 20 | >>> records = ["this is fox", "fox", "is", "fox is red", "fox is gone"] 21 | >>> records 22 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 23 | >>> >>> 24 | >>> 25 | >>> rdd = sc.parallelize(records) 26 | >>> 27 | >>> rdd 28 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 29 | >>> rdd.count() 30 | 5 31 | >>> rdd.collect() 32 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 33 | >>> 34 | >>> 35 | >>> filtered = rdd.filter(lambda x: len(x) > 3) 36 | >>> filtered.collect() 37 | ['this is fox', 'fox is red', 'fox is gone'] 38 | >>> 39 | >>> 40 | >>> def apply_filter(x): 41 | ... if len(x) > 3: return True 42 | ... return False 43 | ... 
44 | >>> 45 | >>> b = apply_filter("this is a long one") 46 | >>> b 47 | True 48 | >>> c = apply_filter("one") 49 | >>> c 50 | False 51 | >>> 52 | >>> filtered_recs = rdd.filter(apply_filter) 53 | >>> 54 | >>> filtered_recs.collect() 55 | ['this is fox', 'fox is red', 'fox is gone'] 56 | >>> 57 | >>> 58 | >>> rdd.collect() 59 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 60 | >>> flattened = rdd.flatMap(lambda x: x.split(" ")) 61 | >>> 62 | >>> flattened.collect() 63 | ['this', 'is', 'fox', 'fox', 'is', 'fox', 'is', 'red', 'fox', 'is', 'gone'] 64 | >>> flattened.count() 65 | 11 66 | >>> mapped = rdd.map(lambda x: x.split(" ")) 67 | >>> mapped.collect() 68 | [['this', 'is', 'fox'], ['fox'], ['is'], ['fox', 'is', 'red'], ['fox', 'is', 'gone']] 69 | >>> mapped.count() 70 | 5 71 | >>> 72 | >>> a = [ ["this", "is"], [], [], ["fox", "is", "red", "jumped"] ] 73 | >>> a 74 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']] 75 | >>> rdd_list = sc.parallelize(a) 76 | >>> rdd_list.collect() 77 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']] 78 | >>> rdd_list.count() 79 | 4 80 | >>> flattened22 = rdd_list.flatMap(lambda L : L) 81 | >>> flattened22.collect() 82 | ['this', 'is', 'fox', 'is', 'red', 'jumped'] 83 | >>> 84 | >>> 85 | >>> key_value_pairs = [("a", 10), ("a", 20), ("a", 30), ("a", 40), ("b", 300), ("b", 400)] 86 | >>> key_value_pairs 87 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)] 88 | >>> key_value_rdd = sc.parallelize(key_value_pairs) 89 | >>> 90 | >>> key_value_rdd.collect() 91 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)] 92 | >>> 93 | >>> def custom_func(x): 94 | ... k = x[0] 95 | ... v = x[1] 96 | ... if (v < 30): return [] 97 | ... return [(k, v+1000), ("MYKEY", v+4000)] 98 | ... 99 | >>> 100 | >>> y = custom_func(("x", 25)) 101 | >>> y 102 | [] 103 | >>> y = custom_func(("x", 300)) 104 | >>> y 105 | [('x', 1300), ('MYKEY', 4300)] 106 | >>> flattened = key_value_rdd.flatMap(custom_func) 107 | >>> flattened.collect() 108 | [('a', 1030), ('MYKEY', 4030), ('a', 1040), ('MYKEY', 4040), ('b', 1300), ('MYKEY', 4300), ('b', 1400), ('MYKEY', 4400)] 109 | >>> flattened.count() 110 | 8 111 | >>> 112 | >>> mapped = key_value_rdd.map(custom_func) 113 | >>> mapped.collect() 114 | [[], [], [('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]] 115 | >>> mapped.count() 116 | 6 117 | >>> filtered99 = mapped.filter(lambda x: len(x) > 0) 118 | >>> filtered99.collect() 119 | [[('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]] 120 | >>> 121 | >>> 122 | >>> 123 | >>> x = set() 124 | >>> x.add(1) 125 | >>> x 126 | {1} 127 | >>> x.add(1) 128 | >>> x 129 | {1} 130 | >>> x.add(3) 131 | >>> x.add(4) 132 | >>> x 133 | {1, 3, 4} 134 | >>> x.add(4) 135 | >>> x.add(4) 136 | >>> x.add(4) 137 | >>> x.add(4) 138 | >>> x 139 | {1, 3, 4} 140 | >>> x = [] 141 | >>> x.append(1) 142 | >>> x 143 | [1] 144 | >>> x.append(1) 145 | >>> x 146 | [1, 1] 147 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-20-understanding-partitions.txt: -------------------------------------------------------------------------------- 1 | Understanding Partitions 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | Spark context Web UI available at http://10.0.0.94:4040 15 | Spark context available as 'sc' (master = local[*], app id = local-1634788905125). 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> nums = [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 19 | >>> nums 20 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 21 | >>> # rdd : RDD[Integer] 22 | >>> rdd = sc.parallelize(nums) 23 | >>> rdd.count() 24 | 26 25 | >>> rdd.collect() 26 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 27 | >>> 28 | >>> # get number of partitions (default, set by cluster manager) 29 | >>> rdd.getNumPartitions() 30 | 8 31 | >>> # set number of partitions explicitly to 4 32 | >>> rdd2 = sc.parallelize(nums, 4) 33 | >>> rdd2.getNumPartitions() 34 | 4 35 | >>> # define a debugger to output all elements of a partition 36 | >>> def debug_partition(partition): 37 | ... print("partition=", list(partition)) 38 | ... 39 | >>> rdd.foreachPartition(debug_partition) 40 | partition= [1, 2, 3] 41 | partition= [33, 33, 22] 42 | partition= [22, 11, 123] 43 | partition= [44, 45, 67] 44 | partition= [77, 66, 99] 45 | partition= [44, 55, 99, 80, 90] 46 | partition= [89, 77, 66] 47 | partition= [4, 5, 77] 48 | >>> 49 | >>> rdd2.foreachPartition(debug_partition) 50 | partition= [89, 77, 66, 44, 55, 99, 80, 90] 51 | partition= [1, 2, 3, 4, 5, 77] 52 | partition= [22, 11, 123, 44, 45, 67] 53 | partition= [77, 66, 99, 33, 33, 22] 54 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-25-RDD-join.txt: -------------------------------------------------------------------------------- 1 | Inner Join Example 2 | 3 | $ pyspark 4 | Python 3.7.10 (default, Jun 3 2021, 00:02:01) 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.5-amzn-0 10 | /_/ 11 | 12 | Using Python version 3.7.10 (default, Jun 3 2021 00:02:01) 13 | SparkContext available as 'sc'. 14 | SparkSession available as 'spark'. 
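Both RDDs in the session below contain repeated keys, so the inner join emits one output pair per combination of matching values: the result size is the sum, over the common keys, of count_x(key) * count_y(key). A small sketch of that sanity check (assuming 'sc' as in the session; x and y are the same pairs used below):

x = sc.parallelize([("a", 1), ("a", 4), ("b", 7), ("b", 8), ("c", 89)])
y = sc.parallelize([("a", 100), ("a", 400), ("b", 700), ("b", 800), ("b", 900), ("d", 890)])
cx, cy = x.countByKey(), y.countByKey()              # dicts: key -> number of values
expected = sum(cx[k] * cy[k] for k in cx.keys() & cy.keys())
expected             # 2*2 for 'a' + 2*3 for 'b' = 10
x.join(y).count()    # 10 as well; 'c' and 'd' never appear in the result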
15 | >>> 16 | >>> 17 | >>> x = sc.parallelize([("a", 1), ("a", 4), ("b", 7), ("b", 8), ("c", 89)]) 18 | >>> y = sc.parallelize([("a", 100), ("a", 400), ("b", 700), ("b", 800), ("b", 900), ("d", 890)]) 19 | >>> x.collect() 20 | [ 21 | ('a', 1), ('a', 4), 22 | ('b', 7), ('b', 8), 23 | ('c', 89) 24 | ] 25 | >>> y.collect() 26 | [ 27 | ('a', 100), ('a', 400), 28 | ('b', 700), ('b', 800), ('b', 900), 29 | ('d', 890) 30 | ] 31 | 32 | >>> joined = x.join(y) 33 | >>> joined.collect() 34 | [ 35 | ('b', (7, 800)), 36 | ('b', (7, 900)), 37 | ('b', (7, 700)), 38 | ('b', (8, 800)), 39 | ('b', (8, 900)), 40 | ('b', (8, 700)), 41 | ('a', (1, 100)), 42 | ('a', (1, 400)), 43 | ('a', (4, 100)), 44 | ('a', (4, 400)) 45 | ] 46 | >>> joined2 = y.join(x) 47 | >>> joined2.collect() 48 | [ 49 | ('b', (700, 8)), 50 | ('b', (700, 7)), 51 | ('b', (800, 8)), 52 | ('b', (800, 7)), 53 | ('b', (900, 8)), 54 | ('b', (900, 7)), 55 | ('a', (100, 4)), 56 | ('a', (100, 1)), 57 | ('a', (400, 4)), 58 | ('a', (400, 1)) 59 | ] 60 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-12.txt: -------------------------------------------------------------------------------- 1 | ~ % cd spark-3.2.0 2 | spark-3.2.0 % ls -l 3 | total 192 4 | -rwxrwxrwx@ 1 mparsian staff 22878 Oct 6 2021 LICENSE 5 | -rwxrwxrwx@ 1 mparsian staff 57677 Oct 6 2021 NOTICE 6 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 R 7 | -rwxrwxrwx@ 1 mparsian staff 4512 Oct 6 2021 README.md 8 | -rwxrwxrwx@ 1 mparsian staff 167 Oct 6 2021 RELEASE 9 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 bin 10 | drwxrwxrwx@ 9 mparsian staff 288 Nov 17 18:15 conf 11 | drwxrwxrwx@ 5 mparsian staff 160 Nov 17 18:15 data 12 | drwxrwxrwx@ 4 mparsian staff 128 Oct 6 2021 examples 13 | drwxrwxrwx@ 237 mparsian staff 7584 Nov 17 18:15 jars 14 | drwxrwxrwx@ 4 mparsian staff 128 Nov 17 18:15 kubernetes 15 | drwxrwxrwx@ 60 mparsian staff 1920 Nov 17 18:15 licenses 16 | drwxrwxrwx@ 20 mparsian staff 640 Nov 17 18:15 python 17 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 sbin 18 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 yarn 19 | 20 | spark-3.2.0 % ./bin/pyspark 21 | Python 3.8.9 (default, Mar 30 2022, 13:51:17) 22 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin 23 | Type "help", "copyright", "credits" or "license" for more information. 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 29 | /_/ 30 | 31 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17) 32 | Spark context Web UI available at http://10.0.0.234:4040 33 | Spark context available as 'sc' (master = local[*], app id = local-1649822374103). 34 | SparkSession available as 'spark'. 35 | >>> 36 | >>> 37 | >>> spark.version 38 | '3.2.0' 39 | >>> 40 | >>> 41 | >>> numbers = [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 42 | >>> numbers 43 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 44 | >>> # rdd = Resilient Dist. 
Dataset 45 | >>> rdd = spark.sparkContext.parallelize(numbers) 46 | >>> rdd.collect() 47 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 48 | >>> # rdd is partitioned, read-only, operates in parallel 49 | >>> rdd.count() 50 | 11 51 | >>> total = rdd.reduce(lambda x, y: x+y) 52 | >>> total 53 | 168 54 | >>> rdd_greater_than_20 = rdd.filter(lambda x : x > 20) 55 | >>> rdd_greater_than_20.collect() 56 | [30, 40, 50] 57 | >>> 58 | >>> rdd_greater_than_20.count() 59 | 3 60 | >>> rdd.take(3) 61 | [1, 2, 5] 62 | >>> 63 | >>> ^D -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt: -------------------------------------------------------------------------------- 1 | # spark : SparkSession 2 | 3 | # create a Python collection 4 | numbers = [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30] 5 | 6 | # create an RDD[Integer] from a Python collection 7 | rdd = spark.sparkContext.parallelize(numbers) 8 | 9 | # get all elements (used for debugging -- do not use this for large RDDs) 10 | rdd.collect() 11 | [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30] 12 | 13 | # count the number of elements 14 | rdd.count() 15 | 11 16 | 17 | # apply a map() transformation to rdd and create a new RDD as rdd2 18 | rdd2 = rdd.map(lambda x : 3 *x) 19 | rdd2.collect() 20 | [3, 6, 9, 12, 15, -3, -6, -9, 30, 36, 90] 21 | 22 | # create a new RDD (as rdd3) from rdd2 23 | rdd3 = rdd2.map(lambda x: (x, 2*x)) 24 | rdd3.collect() 25 | [ 26 | (3, 6), 27 | (6, 12), 28 | (9, 18), 29 | (12, 24), 30 | (15, 30), 31 | (-3, -6), 32 | (-6, -12), 33 | (-9, -18), 34 | (30, 60), 35 | (36, 72), 36 | (90, 180) 37 | ] 38 | 39 | # find all positive numbers from a given RDD (as rdd) 40 | # filter() is a transformation 41 | positives = rdd.filter(lambda x : x > 0) 42 | positives.collect() 43 | [1, 2, 3, 4, 5, 10, 12, 30] 44 | 45 | # find all negative numbers from a given RDD (as rdd) 46 | # filter() is a transformation 47 | negatives = rdd.filter(lambda x : x < 0) 48 | negatives.collect() 49 | [-1, -2, -3] 50 | 51 | # find the sum of all numbers for a given RDD[Integer] 52 | # reduce() is an action: it creates a NON-RDD 53 | # reduce() is NOT a Transformation): it does NOT create an RDD 54 | total = rdd.reduce(lambda x, y: x+y) 55 | 56 | 57 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt: -------------------------------------------------------------------------------- 1 | % cat /tmp/movies.txt 2 | user9,m1,5 3 | user8,m2,4 4 | user1,m1,2 5 | user1,9 6 | user1,m1,2 7 | user2,m2,3 8 | user2,m3,5 9 | user3,m3,4 10 | user6,m3,4 11 | user7,m3,3 12 | user3,king 13 | user4,m1,3 14 | user5,m2,5 15 | user6,m4,5 16 | user7,m5,5 17 | user1 18 | user3,m3,5 19 | user4,m4,1 20 | 21 | % ./bin/pyspark 22 | Python 3.8.9 (default, Mar 30 2022, 13:51:17) 23 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin 24 | Type "help", "copyright", "credits" or "license" for more information. 25 | Welcome to 26 | ____ __ 27 | / __/__ ___ _____/ /__ 28 | _\ \/ _ \/ _ `/ __/ '_/ 29 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 30 | /_/ 31 | 32 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17) 33 | Spark context Web UI available at http://10.0.0.234:4041 34 | Spark context available as 'sc' (master = local[*], app id = local-1650425312842). 35 | SparkSession available as 'spark'. 
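Note that /tmp/movies.txt shown above contains a few malformed rows ('user1,9', 'user3,king' and a bare 'user1'). The session below reads the file and then switches to an in-memory list of (key, value) pairs; as a separate sketch (not part of the recorded session), here is one way the raw records could be cleaned and averaged per movie, reusing the flatMap-as-filter and groupByKey()/mapValues() patterns shown in this tutorial (it assumes 'spark' as above; the parse() helper is only for illustration):

def parse(record):
    # keep only well-formed rows of the shape user,movie,numeric_rating
    tokens = record.split(",")
    if len(tokens) != 3 or not tokens[2].isdigit():
        return []                          # dropped by flatMap
    return [(tokens[1], int(tokens[2]))]   # (movie, rating)

records = spark.sparkContext.textFile("/tmp/movies.txt")
movie_ratings = records.flatMap(parse)     # RDD[(movie, rating)]
avg_per_movie = movie_ratings.groupByKey() \
                             .mapValues(lambda ratings: sum(ratings) / len(ratings))
# avg_per_movie.collect() returns the average rating per movie, malformed rows excluded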
36 | >>> 37 | >>> 38 | >>> 39 | >>> input_path = "/tmp/movies.txt" 40 | >>> input_path 41 | '/tmp/movies.txt' 42 | >>> # read input and create RDD[String] 43 | >>> records = spark.sparkContext.textFile(input_path) 44 | >>> records.collect() 45 | [ 46 | 'user9,m1,5', 47 | 'user8,m2,4', 48 | 'user1,m1,2', 49 | 'user1,9', 50 | 'user1,m1,2', 51 | 'user2,m2,3', 52 | 'user2,m3,5', 53 | 'user3,m3,4', 54 | 'user6,m3,4', 55 | 'user7,m3,3', 56 | 'user3,king', 57 | 'user4,m1,3', 58 | 'user5,m2,5', 59 | 'user6,m4,5', 60 | 'user7,m5,5', 61 | 'user1', 62 | 'user3,m3,5', 63 | 'user4,m4,1' 64 | ] 65 | >>> records.count() 66 | 18 67 | >>> 68 | >>> 69 | >>> records.getNumPartitions() 70 | 2 71 | >>> 72 | >>> 73 | >>> 74 | >>> pairs = [("A", 3), ("A", 4), ("A", 5), ("B", 30), ("B", 40), ("B", 50), ("B", 60), ("C", 100)] 75 | >>> pairs 76 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 77 | >>> rdd = spark.sparkContext.parallelize(pairs) 78 | >>> rdd.collect() 79 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 80 | >>> rdd.count() 81 | 8 82 | >>> rdd.getNumPartitions() 83 | 16 84 | >>> # NOTE: since the number of partitions is more than 85 | >>> # the number of elements: this implies that 86 | >>> # some of the partitions can be empty (partition 87 | >>> # is created, but has no elements at all). 88 | >>> 89 | >>> 90 | >>> # find average of values per key: A, B, C 91 | >>> # use groupByKey() transformation 92 | >>> grouped = rdd.groupByKey() 93 | >>> grouped.collect() 94 | [ 95 | ('B', ), 96 | ('C', ), 97 | ('A', ) 98 | ] 99 | 100 | >>> grouped.mapValues(lambda values: list(values)).collect() 101 | [ 102 | ('B', [30, 40, 50, 60]), 103 | ('C', [100]), 104 | ('A', [3, 4, 5]) 105 | ] 106 | >>> # similar to SQL's GROUP BY 107 | >>> # values : ResultIterable 108 | >>> avg_by_key = grouped.mapValues(lambda values: sum(values) / len(values)) 109 | >>> avg_by_key.collect() 110 | [('B', 45.0), ('C', 100.0), ('A', 4.0)] 111 | >>> 112 | >>> 113 | >>> rdd.collect() 114 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 115 | >>> rdd_44 = rdd.mapValues(lambda v : v * 10) 116 | >>> rdd_44.collect() 117 | [('A', 30), ('A', 40), ('A', 50), ('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)] 118 | >>> # v : denotes the value component of (key, value) pair. 119 | >>> 120 | >>> 121 | >>> # apply a filter and keep (key, value) pairs 122 | >>> # if and only if value is greater than 100 123 | >>> 124 | >>> # understand tuple of 2 elements as (key, value) pair: 125 | >>> x = ("K", 2345) 126 | >>> x[0] 127 | 'K' 128 | >>> x[1] 129 | 2345 130 | >>> 131 | >>> 132 | >>> # apply a filter to rdd_44 and keep (key, value) 133 | >>> # pairs if and only if value is greater than 100 134 | >>> # x denotes a single element of source RDD (rdd_44) 135 | >>> rdd5 = rdd_44.filter(lambda x: x[1] > 100) 136 | >>> rdd5.collect() 137 | [('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)] 138 | >>> 139 | >>> 140 | >>> some_lists = [ [1, 2, 3], [7, 8, 9, 10], [], [] ] 141 | >>> len(some_lists) 142 | 4 143 | >>> some_lists[0] 144 | [1, 2, 3] 145 | >>> some_lists[1] 146 | [7, 8, 9, 10] 147 | >>> some_lists[2] 148 | [] 149 | >>> some_lists[3] 150 | [] 151 | >>> rdd = spark.sparkContext.parallelize(some_lists) 152 | >>> rdd.collect() 153 | [[1, 2, 3], [7, 8, 9, 10], [], []] 154 | >>> rdd.count() 155 | 4 156 | >>> # each rdd element is a list denoted by [...]
157 | >>> 158 | >>> rdd2 = rdd.flatMap(lambda x: x) 159 | >>> rdd2.collect() 160 | [1, 2, 3, 7, 8, 9, 10] 161 | >>> rdd2.count() 162 | 7 163 | >>> rdd3 = rdd.map(lambda x: x) 164 | >>> rdd3.collect() 165 | [[1, 2, 3], [7, 8, 9, 10], [], []] 166 | >>> rdd3.collect() 167 | [[1, 2, 3], [7, 8, 9, 10], [], []] 168 | >>> rdd3.count() 169 | 4 170 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session_2020-07-01.txt: -------------------------------------------------------------------------------- 1 | cat /Users/mparsian/spark-3.0.0/zbin/foxdata.txt 2 | red fox jumped high 3 | fox jumped over high fence 4 | red fox jumped 5 | 6 | mparsian@Mahmouds-MacBook ~/spark-3.0.0 $ ./bin/pyspark 7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 8 | [Clang 6.0 (clang-600.0.57)] on darwin 9 | Type "help", "copyright", "credits" or "license" for more information. 10 | 20/07/01 17:51:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 11 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 12 | Setting default log level to "WARN". 13 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 14 | Welcome to 15 | ____ __ 16 | / __/__ ___ _____/ /__ 17 | _\ \/ _ \/ _ `/ __/ '_/ 18 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 19 | /_/ 20 | 21 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 22 | SparkSession available as 'spark'. 23 | >>> 24 | >>> 25 | >>> 26 | >>> 27 | >>> input_path = '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt' 28 | >>> input_path 29 | '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt' 30 | >>> recs = spark.sparkContext.textFile(input_path) 31 | >>> 32 | >>> 33 | >>> 34 | >>> recs 35 | /Users/mparsian/spark-3.0.0/zbin/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 36 | >>> 37 | >>> 38 | >>> recs.collect() 39 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 40 | >>> recs.count() 41 | 3 42 | >>> rdd_with_len = recs.map(lambda x: (x, len(x))) 43 | >>> rdd_with_len.collect() 44 | [('red fox jumped high', 19), ('fox jumped over high fence', 26), ('red fox jumped', 14)] 45 | >>> 46 | >>> 47 | >>> 48 | >>> upper = recs.map(lambda x: x.upper()) 49 | >>> upper.collect() 50 | ['RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED'] 51 | >>> spark 52 | 53 | >>> lower = recs.map(lambda x: x.lower()) 54 | >>> lower.collect() 55 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 56 | >>> 57 | >>> 58 | >>> 59 | >>> lower_and_upper = lower.union(upper) 60 | >>> lower_and_upper.collect() 61 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped', 'RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED'] 62 | >>> lower_and_upper.count() 63 | 6 64 | >>> 65 | >>> 66 | >>> 67 | >>> counts = recs.map(lambda x : (len(x), 3*len(x))) 68 | >>> counts.collect() 69 | [(19, 57), (26, 78), (14, 42)] 70 | >>> 71 | >>> 72 | >>> 73 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 74 | >>> numbers 75 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 76 | 77 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 78 | >>> 79 | >>> 80 | >>> 81 | >>> numbers 82 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 83 | >>> rdd = spark.sparkContext.parallelize(numbers) 84 | >>> rdd.collect() 85 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 
77, 99, -87, -100, 100] 86 | >>> rdd.count() 87 | 14 88 | >>> pos = rdd.filter(lambda x: x > 0) 89 | >>> pos.collect() 90 | [1, 2, 3, 5, 6, 7, 8, 77, 99, 100] 91 | 92 | >>> 93 | >>> squared = rdd.map(lambda x : x*x) 94 | >>> squared.collect() 95 | [1, 4, 9, 25, 36, 49, 64, 1, 16, 5929, 9801, 7569, 10000, 10000] 96 | >>> tuples3 = rdd.map(lambda x : (x, x*x, x*100)) 97 | >>> tuples3.collect() 98 | [(1, 1, 100), (2, 4, 200), (3, 9, 300), (5, 25, 500), (6, 36, 600), (7, 49, 700), (8, 64, 800), (-1, 1, -100), (-4, 16, -400), (77, 5929, 7700), (99, 9801, 9900), (-87, 7569, -8700), (-100, 10000, -10000), (100, 10000, 10000)] 99 | >>> 100 | >>> 101 | >>> 102 | >>> rdd.collect() 103 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 104 | >>> gt4 = rdd.filter(lambda x: x > 4) 105 | >>> gt4.collect() 106 | [5, 6, 7, 8, 77, 99, 100] 107 | >>> 108 | >>> 109 | >>> 110 | >>> rdd.collect() 111 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 112 | >>> total = rdd.reduce(lambda x, y: x+y) 113 | >>> total 114 | 116 115 | 116 | Assume that rdd has 3 partitions: partition-1, partition-2, partition-3 117 | 118 | >>> partition-1: 1, 2, 3, 5, 6, 7, 8 119 | partition-1: will sum up to: 32 120 | 121 | >>> partition-2: -1, -4, 77, 99 122 | partition-2: will sum up to: 171 123 | 124 | >>> partition-3: -87, -100, 100 125 | partition-3: will sum up to: -87 126 | 127 | =============== 128 | partition-1 & partition-2 will result in: 203 129 | 203 & partition-3 will result in: 116 (Final result) 130 | 131 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/understanding_partitions.txt: -------------------------------------------------------------------------------- 1 | understanding_partitions.txt 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 16 | >>> rdd = sc.parallelize(numbers, 3) 17 | >>> rdd.collect() 18 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 19 | >>> rdd.count() 20 | 10 21 | >>> rdd.getNumPartitions() 22 | 3 23 | >>> def f(iterator): 24 | ... for x in iterator: 25 | ... print(x) 26 | ... print("=====") 27 | ... 
28 | >>> 29 | >>> rdd.foreachPartition(f) 30 | 4 31 | 5 32 | 6 33 | ===== 34 | 7 35 | 8 36 | 9 37 | 10 38 | ===== 39 | 1 40 | 2 41 | 3 42 | ===== 43 | >>> rdd_default = sc.parallelize(numbers) 44 | >>> rdd_default.collect() 45 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 46 | >>> rdd.getNumPartitions() 47 | 3 48 | >>> rdd_default.getNumPartitions() 49 | 8 50 | >>> rdd.foreachPartition(f) 51 | 4 52 | 5 53 | 6 54 | ===== 55 | 1 56 | 2 57 | 3 58 | ===== 59 | 7 60 | 8 61 | 9 62 | 10 63 | ===== 64 | >>> rdd_default.foreachPartition(f) 65 | 6 66 | ===== 67 | 7 68 | ===== 69 | 3 70 | ===== 71 | 2 72 | ===== 73 | 8 74 | ===== 75 | 4 76 | 5 77 | ===== 78 | 9 79 | 10 80 | ===== 81 | 1 82 | ===== 83 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 84 | >>> rdd_by_4 = sc.parallelize(numbers, 4) 85 | >>> rdd_by_4.collect() 86 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 87 | >>> rdd_by_4.foreachPartition(f) 88 | 1 89 | 2 90 | 3 91 | ===== 92 | 10 93 | 11 94 | 12 95 | ===== 96 | 4 97 | 5 98 | 6 99 | ===== 100 | 7 101 | 8 102 | 9 103 | ===== 104 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] 105 | >>> rdd_by_6 = sc.parallelize(numbers, 6) 106 | >>> rdd_by_6.foreachPartition(f) 107 | 7 108 | 8 109 | ===== 110 | 1 111 | 2 112 | ===== 113 | 11 114 | 12 115 | 13 116 | 15 117 | ===== 118 | 3 119 | 4 120 | ===== 121 | 9 122 | 10 123 | ===== 124 | 5 125 | 6 126 | ===== 127 | >>> numbers = [1, 2, 3, 4, 5, 6] 128 | >>> rdd_empty = sc.parallelize(numbers, 10) 129 | >>> rdd_empty.foreachPartition(f) 130 | 2 131 | ===== 132 | 3 133 | ===== 134 | ===== 135 | ===== 136 | 4 137 | ===== 138 | ===== 139 | 6 140 | ===== 141 | 1 142 | ===== 143 | 5 144 | ===== 145 | ===== 146 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-udf/pyspark_udf_maptype.txt: -------------------------------------------------------------------------------- 1 | $SPARK_HOME/bin/pyspark 2 | Python 3.8.9 (default, Nov 9 2021, 04:26:29) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 8 | /_/ 9 | 10 | Using Python version 3.8.9 (default, Nov 9 2021 04:26:29) 11 | Spark context Web UI available at http://10.0.0.232:4040 12 | Spark context available as 'sc' (master = local[*], app id = local-1641011178190). 13 | SparkSession available as 'spark'. 14 | 15 | >>> from pyspark.sql import Row 16 | 17 | >>> data = spark.createDataFrame( 18 | ... [Row(zip_code='94087', city='Sunnyvale'), 19 | ... Row(zip_code='94088', city='Cupertino'), 20 | ... Row(zip_code='95055', city='Santa Clara'), 21 | ... Row(zip_code='95054', city='Palo Alto')]) 22 | 23 | >>> 24 | >>> data.show() 25 | +--------+-----------+ 26 | |zip_code| city| 27 | +--------+-----------+ 28 | | 94087| Sunnyvale| 29 | | 94088| Cupertino| 30 | | 95055|Santa Clara| 31 | | 95054| Palo Alto| 32 | +--------+-----------+ 33 | 34 | >>> from pyspark.sql.functions import udf 35 | >>> from pyspark.sql import types as T 36 | >>> 37 | >>> @udf(T.MapType(T.StringType(), T.StringType())) 38 | ... def create_structure(zip_code, city): 39 | ... return {zip_code: city} 40 | ... 
41 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).toJSON().collect() 42 | [ 43 | '{"zip_code":"94087","city":"Sunnyvale","structure":{"94087":"Sunnyvale"}}', 44 | '{"zip_code":"94088","city":"Cupertino","structure":{"94088":"Cupertino"}}', 45 | '{"zip_code":"95055","city":"Santa Clara","structure":{"95055":"Santa Clara"}}', 46 | '{"zip_code":"95054","city":"Palo Alto","structure":{"95054":"Palo Alto"}}' 47 | ] 48 | 49 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).show(truncate=False) 50 | +--------+-----------+----------------------+ 51 | |zip_code|city |structure | 52 | +--------+-----------+----------------------+ 53 | |94087 |Sunnyvale |{94087 -> Sunnyvale} | 54 | |94088 |Cupertino |{94088 -> Cupertino} | 55 | |95055 |Santa Clara|{95055 -> Santa Clara}| 56 | |95054 |Palo Alto |{95054 -> Palo Alto} | 57 | +--------+-----------+----------------------+ 58 | -------------------------------------------------------------------------------- /tutorial/split-function/README.md: -------------------------------------------------------------------------------- 1 | How To Use Split Function 2 | ========================= 3 | 4 | * Example-1: Split ````RDD```` into Tokens 5 | 6 | ```` 7 | # ./bin/pyspark 8 | Python 2.7.10 (default, Oct 23 2015, 19:19:21) 9 | 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 15 | /_/ 16 | 17 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) 18 | SparkContext available as sc, HiveContext available as sqlContext. 19 | 20 | >>> data = ["abc,de", "abc,de,ze", "abc,de,ze,pe"] 21 | >>> data 22 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe'] 23 | 24 | >>> rdd = sc.parallelize(data) 25 | >>> rdd.collect() 26 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe'] 27 | >>> rdd.count() 28 | 3 29 | 30 | >>> rdd2 = rdd.flatMap(lambda x : x.split(",")) 31 | >>> rdd2.collect() 32 | ['abc', 'de', 'abc', 'de', 'ze', 'abc', 'de', 'ze', 'pe'] 33 | >>> rdd2.count() 34 | 9 35 | ```` 36 | 37 | * Example-2: Create Key-Value Pairs 38 | 39 | ```` 40 | >>> data2 = ["abc,de", "xyz,deeee,ze", "abc,de,ze,pe", "xyz,bababa"] 41 | >>> data2 42 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa'] 43 | 44 | >>> rdd4 = sc.parallelize(data2) 45 | >>> rdd4.collect() 46 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa'] 47 | 48 | >>> rdd5 = rdd4.map(lambda x : (x.split(",")[0], x.split(",")[1])) 49 | >>> rdd5.collect() 50 | [('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')] 51 | ```` 52 | -------------------------------------------------------------------------------- /tutorial/top-N/top-N.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 8 | /_/ 9 | 10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 11 | SparkContext available as sc. 
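The session below was captured on an old Spark 1.2 shell, but takeOrdered() behaves the same way in current PySpark: takeOrdered(N) returns the N smallest elements, and a key function such as key=lambda x: -x reverses the ordering so that the N largest come back instead. A minimal sketch (assuming an existing SparkContext 'sc'):

nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
rdd = sc.parallelize(nums)
rdd.takeOrdered(3)                     # bottom-3: [1, 2, 3]
rdd.takeOrdered(3, key=lambda x: -x)   # top-3:    [10, 9, 7]
rdd.top(3)                             # also [10, 9, 7]; top() returns the largest elements in descending order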
12 | >>> 13 | >>> nums = [10, 1, 2, 9, 3, 4, 5, 6, 7] 14 | >>> sc.parallelize(nums).takeOrdered(3) 15 | [1, 2, 3] 16 | >>> sc.parallelize(nums).takeOrdered(3, key=lambda x: -x) 17 | [10, 9, 7] 18 | >>> 19 | >>> kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")] 20 | >>> sc.parallelize(kv).takeOrdered(3) 21 | [(1, 'z2'), (2, 'z3'), (3, 'z5')] 22 | >>> 23 | >>> sc.parallelize(kv).takeOrdered(3, key=lambda x: -x[0]) 24 | [(10, 'z1'), (9, 'z4'), (7, 'z9')] 25 | -------------------------------------------------------------------------------- /tutorial/wordcount/README.md: -------------------------------------------------------------------------------- 1 | * word_count.py 2 | 3 | Word Count solution in PySpark. Note that the input file is 4 | hard-coded, which is not a good practice; the purpose is to 5 | show how to read files in Spark. 6 | 7 | * word_count_ver2.py 8 | 9 | I pass the input file as a command-line parameter. 10 | 11 | 12 | ```` 13 | best regards, 14 | Mahmoud Parsian 15 | ```` 16 | -------------------------------------------------------------------------------- /tutorial/wordcount/run_word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1" 3 | # 4 | # define your input path 5 | #INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG="/Users/mparsian/zmp/pyspark_book_project/programs/word_count.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG 12 | -------------------------------------------------------------------------------- /tutorial/wordcount/run_word_count_ver2.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 6 | # 7 | # define your PySpark program 8 | PROG="/Users/mparsian/zmp/github/pyspark-tutorial/tutorial/wordcount/word_count_ver2.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /tutorial/wordcount/word_count.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | #----------------------------------- 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | # create an instance of a SparkSession as spark 12 | spark = SparkSession\ 13 | .builder\ 14 | .appName("wordcount")\ 15 | .getOrCreate() 16 | 17 | inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 18 | 19 | # create SparkContext as sc 20 | sc = spark.sparkContext 21 | 22 | # create RDD from a text file 23 | textfileRDD = sc.textFile(inputPath) 24 | print(textfileRDD.collect()) 25 | 26 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" ")) 27 | print(wordsRDD.collect()) 28 | 29 | pairsRDD = wordsRDD.map(lambda word: (word, 1)) 30 | print(pairsRDD.collect()) 31 | 32 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b) 33 | print(frequenciesRDD.collect()) 34 | 35 | # done!
36 | spark.stop() 37 | -------------------------------------------------------------------------------- /tutorial/wordcount/word_count_ver2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | #----------------------------------- 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | # create an instance of a SparkSession as spark 12 | spark = SparkSession\ 13 | .builder\ 14 | .appName("wordcount")\ 15 | .getOrCreate() 16 | 17 | # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 18 | # 19 | # sys.argv[0] is the name of the script. 20 | # sys.argv[1] is the first parameter 21 | inputPath = sys.argv[1] # input file 22 | print("inputPath: {}".format(inputPath)) 23 | 24 | 25 | # create SparkContext as sc 26 | sc = spark.sparkContext 27 | 28 | # create RDD from a text file 29 | textfileRDD = sc.textFile(inputPath) 30 | print(textfileRDD.collect()) 31 | 32 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" ")) 33 | print(wordsRDD.collect()) 34 | 35 | pairsRDD = wordsRDD.map(lambda word: (word, 1)) 36 | print(pairsRDD.collect()) 37 | 38 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b) 39 | print(frequenciesRDD.collect()) 40 | 41 | # done! 42 | spark.stop() 43 | -------------------------------------------------------------------------------- /tutorial/wordcount/wordcount-shorthand.txt: -------------------------------------------------------------------------------- 1 | # cat data.txt 2 | crazy crazy fox jumped 3 | crazy fox jumped 4 | fox is fast 5 | fox is smart 6 | dog is smart 7 | 8 | # ./bin/pyspark 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 14 | /_/ 15 | 16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 17 | SparkContext available as sc, SQLContext available as sqlContext. 18 | >>> 19 | >>> lines = sc.textFile('data.txt', 1); 20 | >>> lines.collect() 21 | [ 22 | u'crazy crazy fox jumped', 23 | u'crazy fox jumped', 24 | u'fox is fast', 25 | u'fox is smart', 26 | u'dog is smart' 27 | ] 28 | 29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 30 | >>> frequencies.collect() 31 | [ 32 | (u'crazy', 3), 33 | (u'jumped', 2), 34 | (u'is', 3), 35 | (u'fox', 4), 36 | (u'dog', 1), 37 | (u'fast', 1), 38 | (u'smart', 2) 39 | ] 40 | 41 | >>> frequencies.count() 42 | 7 -------------------------------------------------------------------------------- /tutorial/wordcount/wordcount.txt: -------------------------------------------------------------------------------- 1 | 1. Prepare Input 2 | 3 | # cat data.txt 4 | crazy crazy fox jumped 5 | crazy fox jumped 6 | fox is fast 7 | fox is smart 8 | dog is smart 9 | 10 | 2. Invoke pyspark 11 | 12 | # export SPARK_HOME=... 13 | # SPARK_HOME/bin/pyspark 14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 16 | Type "help", "copyright", "credits" or "license" for more information. 17 | 18 | Welcome to 19 | ____ __ 20 | / __/__ ___ _____/ /__ 21 | _\ \/ _ \/ _ `/ __/ '_/ 22 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 23 | /_/ 24 | 25 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 26 | SparkContext available as sc. 
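The transcript below comes from a Spark 1.2 / Python 2 shell, which is why the collected strings are shown with the u'...' prefix; on Python 3 the same values print without it. A compact version of the same pipeline appears in wordcount-shorthand.txt above; written out for a current PySpark shell it would look roughly like this (a sketch, assuming 'sc' and the same data.txt):

lines = sc.textFile("data.txt", 1)
counts = (lines.flatMap(lambda x: x.split(" "))
               .map(lambda x: (x, 1))
               .reduceByKey(lambda x, y: x + y))
counts.collect()                  # e.g. [('crazy', 3), ('jumped', 2), ('is', 3), ('fox', 4), ...]
counts.saveAsTextFile("output")   # writes one part-* file per partition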
27 | >>> sc 28 | 29 | >>> lines = sc.textFile("data.txt", 1) 30 | >>> debuglines = lines.collect(); 31 | >>> debuglines 32 | [u'crazy crazy fox jumped', 33 | u'crazy fox jumped', 34 | u'fox is fast', 35 | u'fox is smart', 36 | u'dog is smart' 37 | ] 38 | >>> words = lines.flatMap(lambda x: x.split(' ')) 39 | >>> debugwords = words.collect(); 40 | >>> debugwords 41 | [ 42 | u'crazy', 43 | u'crazy', 44 | u'fox', 45 | u'jumped', 46 | u'crazy', 47 | u'fox', 48 | u'jumped', 49 | u'fox', 50 | u'is', 51 | u'fast', 52 | u'fox', 53 | u'is', 54 | u'smart', 55 | u'dog', 56 | u'is', 57 | u'smart' 58 | ] 59 | >>> ones = words.map(lambda x: (x, 1)) 60 | >>> debugones = ones.collect() 61 | >>> debugones 62 | [ 63 | (u'crazy', 1), 64 | (u'crazy', 1), 65 | (u'fox', 1), 66 | (u'jumped', 1), 67 | (u'crazy', 1), 68 | (u'fox', 1), 69 | (u'jumped', 1), 70 | (u'fox', 1), 71 | (u'is', 1), 72 | (u'fast', 1), 73 | (u'fox', 1), 74 | (u'is', 1), 75 | (u'smart', 1), 76 | (u'dog', 1), 77 | (u'is', 1), 78 | (u'smart', 1) 79 | ] 80 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 81 | >>> debugcounts = counts.collect() 82 | >>> debugcounts 83 | [ 84 | (u'crazy', 3), 85 | (u'jumped', 2), 86 | (u'is', 3), 87 | (u'fox', 4), 88 | (u'dog', 1), 89 | (u'fast', 1), 90 | (u'smart', 2) 91 | ] 92 | >>> 93 | >>> counts.saveAsTextFile("output") 94 | 95 | 3. Examine Output 96 | 97 | # cat output/part* 98 | (u'crazy', 3) 99 | (u'jumped', 2) 100 | (u'is', 3) 101 | (u'fox', 4) 102 | (u'dog', 1) 103 | (u'fast', 1) 104 | (u'smart', 2) 105 | --------------------------------------------------------------------------------